Day 3: HBase API

API call

In practice, the more common approach is to perform the same operations as the HBase shell programmatically, through the HBase API.

Environment preparation

IDEA + Maven + HBase

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.sowhat.demo</groupId>
    <artifactId>hbasetest</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Set the JDK version via properties -->
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <!--    Add several dependencies-->
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.3.1</version>
        </dependency>
    </dependencies>



    <build>
        <plugins>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <!-- Customized packaging; see
                 https://www.cnblogs.com/fnlingnzb-learner/p/10537228.html -->
                <version>3.0.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.sowhat.demo.TestHBase</mainClass>
                        </manifest>
                        <!-- Specify the class that contains the main method -->
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                        <!-- Bundle the third-party dependency jars into the jar, which makes it easy to publish an executable fat jar. -->
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <!--Any name -->
                        <phase>package</phase>
                        <!-- Bind to the package lifecycle phase -->
                        <goals>
                            <goal>single</goal>
                            <!-- Run only once -->
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

Common setup

The following fields, static block and close() method are shared by all of the methods below (they live in the test class, e.g. com.sowhat.demo.TestHBase from the pom above).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
import java.util.ArrayList;

	private static Connection connection; // Create connection
	private static Admin admin; // admin object

	static
	{
		try
		{
			//Create configuration information; the ZK quorum should match hbase.zookeeper.quorum in HBase/conf/hbase-site.xml
			Configuration configuration = HBaseConfiguration.create();
			configuration.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104");
		
			//Create connection
			connection = ConnectionFactory.createConnection(configuration);

			//Create Admin object
			admin = connection.getAdmin();

		} catch (IOException e)
		{
			e.printStackTrace();
		}
	}

	public static void close() // Close properly after use
	{
		try
		{
			if (admin != null)
			{
				admin.close();
			}
		} catch (IOException e)
		{
			e.printStackTrace();
		}
		try
		{
			if (connection != null)
			{
				connection.close();
			}
		} catch (IOException e)
		{
			e.printStackTrace();
		}
	}

Operating HBase through the API

1. Check whether a table exists
	//1. Check whether a table exists, old-style API (now deprecated)
	public static boolean isTableExistOld(String tableName) throws IOException
	{

		//Create configuration information
		HBaseConfiguration configuration = new HBaseConfiguration();

		//Add parameters to configuration information
		configuration.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104");

		//Create HBase client
		HBaseAdmin admin = new HBaseAdmin(configuration);

		//Perform the check
		boolean exists = admin.tableExists(tableName);

		//Close connection
		admin.close();

		return exists;
	}

	//1. Check whether a table exists, new-style API
	public static boolean isTableExistNew(String tableName) throws IOException
	{

		//Perform the check
		return admin.tableExists(TableName.valueOf(tableName));
	}
2. Create table

You must have at least one column family when you create a table.

	//2. Create a table with one or more column families
	public static void createTable(String tableName, String... cfs) throws IOException
	{

		//Judge the number of column families
		if (cfs.length < 1)
		{
			System.out.println("Please set the correct column family information!!!");
			return;
		}

		//Check whether the table exists
		if (isTableExistNew(tableName))
		{
			System.out.println("Table " + tableName + " already exists!!!");
			return;
		}

		//Create table descriptor
		HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf(tableName));

		//Loop to add column family information
		for (String cf : cfs)
		{

			//Create column family descriptor
			HColumnDescriptor hColumnDescriptor = new HColumnDescriptor(cf);

			//Add column family information
			hTableDescriptor.addFamily(hColumnDescriptor);
		}

		//Create table
		admin.createTable(hTableDescriptor);
	}
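
By default a column family keeps only one version per cell; if the later get example is expected to return several versions via setMaxVersions, the family has to be created with a higher limit. A minimal sketch, reusing the shared admin and isTableExistNew from above (the helper name createTableWithVersions and the versions parameter are illustrative, not part of the original):

	//Optional: create a table whose column families keep several versions (like VERSIONS => n in the shell)
	public static void createTableWithVersions(String tableName, int versions, String... cfs) throws IOException
	{
		//Need at least one column family, and the table must not exist yet
		if (cfs.length < 1 || isTableExistNew(tableName))
		{
			return;
		}

		HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf(tableName));
		for (String cf : cfs)
		{
			HColumnDescriptor hColumnDescriptor = new HColumnDescriptor(cf);

			//Keep up to 'versions' versions per cell
			hColumnDescriptor.setMaxVersions(versions);
			hTableDescriptor.addFamily(hColumnDescriptor);
		}
		admin.createTable(hTableDescriptor);
	}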
3. Delete table
	//3. Delete table
	public static void dropTable(String tableName) throws IOException
	{

		//Judge whether the table exists
		if (!isTableExistNew(tableName))
		{
			System.out.println("surface" + tableName + "non-existent!!!");
			return;
		}

		//Disable the table first
		admin.disableTable(TableName.valueOf(tableName));

		//Delete table operation
		admin.deleteTable(TableName.valueOf(tableName));
	}
4. Create a namespace
	//4. Create namespace
	public static void createNS(String nameSpace) throws IOException
	{

		//Create a namespace descriptor; addConfiguration is optional
		NamespaceDescriptor namespaceDescriptor = NamespaceDescriptor.create(nameSpace).addConfiguration("createTime", System.currentTimeMillis() + "").build();

		//Create namespace
		try
		{
			admin.createNamespace(namespaceDescriptor);
		} catch (NamespaceExistException e)
		{
			System.out.println("Namespace" + nameSpace + "Already exists!!!");
		} catch (IOException e)
		{
			e.printStackTrace();
		}
	}
5. Delete namespace
	//5. Delete namespace
	public static void deleteNS(String nameSpace)
	{

		try
		{
			//Perform delete operation
			admin.deleteNamespace(nameSpace);
		} catch (IOException e)
		{
			e.printStackTrace();
		}
	}
6. Insert data

Each RowKey corresponds to one Put; to insert multiple rows, create one Put per RowKey.

	//6. Insert data: put 'stu','1001','info:name','qiangquan'
	public static void putData(String tableName, String rowKey, String cf, String cn, String value) throws IOException
	{

		//Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//To create a Put object, a RowKey corresponds to a Put. You can build multiple puts to match multiple rowkeys,
		Put put = new Put(Bytes.toBytes(rowKey));

		//Add data to Put object to insert 1-N pieces of data
		put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cn), Bytes.toBytes(value));
		put.addColumn(Bytes.toBytes(cf), Bytes.toBytes("sex"), Bytes.toBytes("male"));
		put.addColumn(Bytes.toBytes(cf), Bytes.toBytes("addr"), Bytes.toBytes("SZ"));

		//Insert the data; table.put(List<Put>) is also supported for batch inserts (see the sketch after this method)
		table.put(put);

		//Close table connection
		table.close();
	}
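
As the comment above notes, table.put also accepts a List<Put> for batch inserts. A minimal sketch reusing the shared connection (the helper name putBatch and the demo values are illustrative, not part of the original):

	//Batch insert: one Put per RowKey, submitted in a single table.put(List<Put>) call
	public static void putBatch(String tableName, String cf, String cn, String... rowKeys) throws IOException
	{
		//Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//Build one Put per row key
		ArrayList<Put> puts = new ArrayList<>();
		for (String rowKey : rowKeys)
		{
			Put put = new Put(Bytes.toBytes(rowKey));

			//Demo value only; real code would vary the value per row
			put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cn), Bytes.toBytes("value-" + rowKey));
			puts.add(put);
		}

		//Submit the whole batch at once
		table.put(puts);

		//Close table connection
		table.close();
	}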
7. get query
  1. get 'stu','1001'
  2. get 'stu','1001','info'
  3. get 'stu','1001','info:name'
    The three shell queries above all correspond to a single Get object; which optional parameters you set (column family, column, max versions) determines the scope of the query.
	//7. Get: get 'stu','1001','info:name'
	public static void getData(String tableName, String rowKey, String cf, String cn) throws IOException
	{

		//1. Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//2. Create Get object corresponding to Put
		Get get = new Get(Bytes.toBytes(rowKey));

		//2.1 Optionally restrict the query to a column family, like get 'stu','1001','info'
		get.addFamily(Bytes.toBytes(cf));

		//2.2 Optionally restrict the query to a specific column, like get 'stu','1001','info:name'
		get.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cn));

		//2.3 set the maximum number of versions to get data
		get.setMaxVersions();

		//3. Execute query
		Result result = table.get(get);

		//4. Parse the Result (e.g. for get 'stu','1001'), iterating over the cells of the row
		for (Cell cell : result.rawCells())
		{
			System.out.println("RowKey: " + Bytes.toString(CellUtil.cloneRow(cell)) +
					",CF: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
					",CN: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
					",Value: " + Bytes.toString(CellUtil.cloneValue(cell)) +
					",TimeStamp:" + cell.getTimestamp());
		}
		//Close connection
		table.close();
	}

Of course, you can query multiple rowKeys at the same time

	//8. Get data for several rowKeys in one batch
	public static void getData(String tableName, String... rowKeys) throws IOException
	{

		//Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//Create a collection of Get objects
		ArrayList<Get> gets = new ArrayList<>();

		//Loop create Get object
		for (String rowKey : rowKeys)
		{

			//Create Get object
			Get get = new Get(Bytes.toBytes(rowKey));

			//Put Get object in gets
			gets.add(get);
		}

		//Get data results
		Result[] results = table.get(gets);

		//Parse results
		for (Result result : results)
		{

			//Parse result
			for (Cell cell : result.rawCells())
			{
				System.out.println("RowKey:" + Bytes.toString(CellUtil.cloneRow(cell)) +
						",CF:" + Bytes.toString(CellUtil.cloneFamily(cell)) +
						",CN:" + Bytes.toString(CellUtil.cloneQualifier(cell)) +
						",Value:" + Bytes.toString(CellUtil.cloneValue(cell)));
			}
		}
		//Close connection
		table.close();
	}
8. Scan data

Equivalent to scan 'tableName' in the shell.

	//9. Scan table data
	public static void scanTable(String tableName) throws IOException
	{

		//Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//Create a Scan object; you can also pass a startRow/stopRow and add a filter (see the sketch after this method)
		Scan scan = new Scan();
		// Scan scan = new Scan(Bytes.toBytes("1001"),Bytes.toBytes("1002"));

		//Scan table for data
		ResultScanner resultScanner = table.getScanner(scan);

		//Traverse resultScanner
		for (Result result : resultScanner)
		{
			//Parse result
			for (Cell cell : result.rawCells())
			{
				System.out.println("RowKey:" + Bytes.toString(CellUtil.cloneRow(cell)) +
						",CF:" + Bytes.toString(CellUtil.cloneFamily(cell)) +
						",CN:" + Bytes.toString(CellUtil.cloneQualifier(cell)) +
						",Value:" + Bytes.toString(CellUtil.cloneValue(cell)));
			}
		}

		//Close connection
		table.close();
	}
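
As the comment in scanTable mentions, a Scan can be limited to a row-key range and combined with a filter. A minimal sketch reusing the shared connection (the helper name scanTableByRange and the "100" prefix are illustrative; PrefixFilter additionally needs import org.apache.hadoop.hbase.filter.PrefixFilter;):

	//Scan a row-key range with an optional filter
	public static void scanTableByRange(String tableName, String startRow, String stopRow) throws IOException
	{
		//Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//startRow is inclusive, stopRow is exclusive, like scan 'stu',{STARTROW=>'1001',STOPROW=>'1003'}
		Scan scan = new Scan(Bytes.toBytes(startRow), Bytes.toBytes(stopRow));

		//Keep only rows whose key starts with "100" (illustrative prefix filter)
		scan.setFilter(new PrefixFilter(Bytes.toBytes("100")));

		//Scan the table and print the matching cells
		ResultScanner resultScanner = table.getScanner(scan);
		for (Result result : resultScanner)
		{
			for (Cell cell : result.rawCells())
			{
				System.out.println("RowKey:" + Bytes.toString(CellUtil.cloneRow(cell)) +
						",Value:" + Bytes.toString(CellUtil.cloneValue(cell)));
			}
		}

		//Close connection
		resultScanner.close();
		table.close();
	}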
9. Delete data

delete rowkey: deletes all versions in every column family of the row.
delete rowkey cf: deletes all versions in that column family.
delete rowkey cf cn: deletes that column ({column, columns} variants).
Key point: if you trace the source code of an HBase delete, you will find that the delete is ultimately implemented by appending data, just like a put; instead of writing a value, a marker whose type is DeleteColumn or DeleteFamily (a tombstone) is written.

	//10. Delete data
	public static void deleteData(String tableName, String rowKey, String cf, String cn) throws IOException
	{

		//1. Get table object
		Table table = connection.getTable(TableName.valueOf(tableName));

		//2. Create a Delete object; both delete and deleteall in the shell go through this API, e.g. deleteall 'stu','1001'
		Delete delete = new Delete(Bytes.toBytes(rowKey));

		//2.1 addColumn: specify the column family and column; without a timestamp the latest version is deleted, with a timestamp the specified version of that column is deleted
		// If name=old and then name=new are inserted without a flush, you can still query old after deleting new;
		// if a flush happened after inserting old and new, deleting new means old can no longer be queried either. Use this method with caution.
        delete.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cn),123);
        
		//2.2 addColumns: without a timestamp deletes all versions of the column; with a timestamp deletes all versions whose timestamp is less than or equal to it
        delete.addColumns(Bytes.toBytes(cf), Bytes.toBytes(cn));

		//2.3 addFamily: specify a column family to delete, like delete 'stu','1001','info'
        delete.addFamily(Bytes.toBytes(cf));

		//Perform delete data operation
		table.delete(delete);

		//Close connection
		table.close();
	}
	
	// Delete multiple rows
	public static void deleteMultiRow(String tableName, String... rows) throws IOException
	{
		//Get table object from the shared connection
		Table table = connection.getTable(TableName.valueOf(tableName));

		//One Delete per row key
		ArrayList<Delete> deleteList = new ArrayList<>();
		for (String row : rows)
		{
			deleteList.add(new Delete(Bytes.toBytes(row)));
		}

		//Delete in one batch
		table.delete(deleteList);

		//Close table connection
		table.close();
	}
10. main
public static void main(String[] args) throws IOException
	{

		//Judge whether the table exists (old)
		System.out.println(isTableExistOld("dddas"));

		//Judge whether the table exists (New)
		System.out.println(isTableExistNew("bbb"));

		//Create table
		createTable("fruit", "info");

		//Delete table
		dropTable("bigdata");

		//Create namespace
		createNS("aaa");

		//Delete namespace
		deleteNS("bigdata");

		//insert data
		putData("aaa", "1006", "info", "name", "xinxin");

		//Get a row of data
		getData("aaa", "1001", "1002", "1003", "1005", "1006");

		//Scan the whole table
		scanTable("fruit");

		//Delete data
		deleteData("aaa", "1006", "info2", "name");

		close();
	}

After packaging, the program can be run locally or as a jar on any machine that can reach the HBase services.

MapReduce operation HBase

HBase serves as the database of the big data stack, so any engine that can read it, such as MapReduce, Hive or Spark, can analyze HBase data.
Next, the jars that MapReduce needs in order to operate HBase must be added to the MapReduce classpath.

  1. Check which jar packages MapReduce needs in order to operate on HBase data
$ bin/hbase mapredcp
--- lists the jars that MapReduce depends on when it accesses HBase, e.g.
/usr/local/src/hbase/hbase-2.2.2/lib/shaded-clients/hbase-shaded-mapreduce-....
  2. Import the environment variables
  • Temporary (effective only for the current shell; run on the command line):
$ export HBASE_HOME=/opt/module/hbase
$ export HADOOP_HOME=/opt/module/hadoop-2.7.2
$ export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`   # assign the output of hbase mapredcp to the variable
  • Permanent: configure in /etc/profile
 export HBASE_HOME=/opt/module/hbase
 export HADOOP_HOME=/opt/module/hadoop-2.7.2
 Also, in Hadoop's etc/hadoop/hadoop-env.sh add the following (note: place it after the for loop):
 export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/opt/module/hbase/lib/*

Then restart Hadoop and HBase:

bin/stop-hbase.sh
stop all Hadoop daemons as well
After the configuration above, distribute the modified files to all nodes, then restart Hadoop and HBase.

1. Official examples

Exchange data between MapReduce and HBase.

Case 1: read HBase data into MR

Count the number of rows in a table by running the following command from the HBase directory (the sowhat table in this example).

/usr/local/src/hadoop/hadoop-3.1.3/bin/yarn jar /usr/local/src/hbase/hbase-2.2.2/lib/hbase-server-2.2.2.jar rowcounter sowhat
---- equivalent to count 'sowhat' in the hbase shell

Case 2: use MapReduce to import local data into HBase

  1. Create a TSV file locally: fruit.tsv
1001	Apple	Red
1002	Pear		Yellow
1003	Pineapple	Yellow
  2. Create the HBase table
hbase(main):001:0> create 'fruit','info'
  3. Create an input_fruit folder in HDFS and upload the fruit.tsv file
$ /opt/module/hadoop-2.7.2/bin/hdfs dfs -mkdir /input_fruit/
$ /opt/module/hadoop-2.7.2/bin/hdfs dfs -put fruit.tsv /input_fruit/
  4. Run the importtsv MapReduce job to load the data into HBase's fruit table
$ /opt/module/hadoop-2.7.2/bin/yarn jar lib/hbase-server-1.3.1.jar importtsv \
-Dimporttsv.columns=HBASE_ROW_KEY,info:name,info:color fruit \
hdfs://hadoop102:9000/input_fruit
  5. Use the scan command to view the results after the import
hbase(main):001:0> scan 'fruit'

2. Custom implementation

1. Read HDFS to HBase

Goal: write the data in HDFS into an HBase table.
HDFS2HBaseMapper

package com.atguigu.mr2;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class HDFS2HBaseMapper extends Mapper<LongWritable, Text, NullWritable, Put> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        //Get a row of data and slice it
        String[] fields = value.toString().split("\t");

        //Create a Put object
        Put put = new Put(Bytes.toBytes(fields[0]));

        //Assign a value to the Put object
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(fields[1]));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("color"), Bytes.toBytes(fields[2]));

        //Write
        context.write(NullWritable.get(), put);

    }
}

HDFS2HBaseReducer

package com.atguigu.mr2;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;

import java.io.IOException;

public class HDFS2HBaseReducer extends TableReducer<NullWritable, Put, NullWritable> {

    @Override
    protected void reduce(NullWritable key, Iterable<Put> values, Context context) throws IOException, InterruptedException {

        //Ergodic writing
        for (Put value : values) {
            context.write(key, value);
        }

    }
}

HDFS2HBaseDriver

package com.atguigu.mr2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;

import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HDFS2HBaseDriver extends Configuration implements Tool {

    //Declare the configuration information
    private Configuration configuration;

    @Override
    public int run(String[] args) throws Exception {

        //1. Create Job object
        Job job = Job.getInstance(configuration);

        //2. Set main class
        job.setJarByClass(HDFS2HBaseDriver.class);

        //3. Set Mapper
        job.setMapperClass(HDFS2HBaseMapper.class);

        //4. Set the output type of Mapper
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Put.class);

        //5. Set up Reducer
        TableMapReduceUtil.initTableReducerJob(args[1], HDFS2HBaseReducer.class, job);

        //6. Set input path
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        //7. Submit task
        boolean result = job.waitForCompletion(true);

        return result ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }
    public static void main(String[] args) {
        //Create configuration information
        Configuration configuration = new Configuration();
        //Run the tool
        try {
            int run = ToolRunner.run(configuration, new HDFS2HBaseDriver(), args);
            System.exit(run);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

2. HBase to HBase

Goal: use MR to copy part of the data in the fruit table into the fruit_mr table (the code below uses the sowhat and sowhat1412 tables instead).

package com.atguigu.mr1;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
// Read HBase data
public class FruitMapper extends TableMapper<ImmutableBytesWritable, Put> {

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {

        //Create a Put object
        Put put = new Put(key.get());

        //Traversal value (one row of data)
        for (Cell cell : value.rawCells()) {
            if ("name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {
                //Set value for put
                put.add(cell);
            }
        }
        //Write
        context.write(key, put);
    }
}
--- 
package com.atguigu.mr1;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FruitReducer extends TableReducer<ImmutableBytesWritable, Put, ImmutableBytesWritable> {

    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Put> values, Context context) throws IOException, InterruptedException {

        //Ergodic writing
        for (Put value : values) {
            context.write(key, value);
        }
    }
}
--- 
package com.atguigu.mr1;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class FruitDriver extends Configuration implements Tool {

    //Declare the configuration information
    Configuration configuration;

    @Override
    public int run(String[] args) throws Exception {

        //1. Create Job object
        Job job = Job.getInstance(configuration);

        //2. Set main class
        job.setJarByClass(FruitDriver.class);

        //3. Set Mapper class
        TableMapReduceUtil.initTableMapperJob("sowhat",
                new Scan(),
                FruitMapper.class,
                ImmutableBytesWritable.class,
                Put.class,
                job);

        //4. Set Reducer class
			TableMapReduceUtil.initTableReducerJob("sowhat1412",
                FruitReducer.class,
                job);


        //5. Submission
        boolean result = job.waitForCompletion(true);

        return result ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }
    public static void main(String[] args) {

        //Create configuration information
		Configuration configuration = HBaseConfiguration.create();
		configuration.set("hbase.zookeeper.quorum", "host-10-100-34-111,host-10-100-34-120,host-10-100-34-140");

		try {
            int result = ToolRunner.run(configuration, new FruitDriver(), args);

            System.exit(result);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

PS: when running on the cluster we put the HBase jars into Hadoop's environment variables, but this is not needed for local development, because the HBase dependency already contains the Hadoop core code.

For local development, the hbase-site.xml from the HBase cluster should be loaded into the IDEA project's resources so that the local code can find the ZK cluster. Of course, you can also choose to write the ZK configuration in code (see the sketch below). When running on Windows you may additionally hit dependency problems.
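
A minimal sketch of the two options just mentioned (the class name LocalConfigDemo is illustrative, and the host names reuse the hadoop102/103/104 quorum from earlier; adapt them to your own cluster):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

import java.io.IOException;

public class LocalConfigDemo {
    public static void main(String[] args) throws IOException {
        //Option 1: copy the cluster's hbase-site.xml into src/main/resources;
        //HBaseConfiguration.create() then picks up hbase.zookeeper.quorum from the classpath.
        Configuration configuration = HBaseConfiguration.create();

        //Option 2: no config file, set the ZK quorum (and client port) directly in code.
        configuration.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104");
        configuration.set("hbase.zookeeper.property.clientPort", "2181");

        //Open and close a connection just to verify that the cluster is reachable
        try (Connection connection = ConnectionFactory.createConnection(configuration)) {
            System.out.println("Connected: " + !connection.isClosed());
        }
    }
}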

Hive operation HBase

We can associate Hive with HBase: the data of such a Hive table is then no longer stored as plain HDFS files managed by Hive but in HBase. Moreover, after the association, data added in Hive is visible in HBase, and data added in HBase is also visible in Hive.

Hive

  1. Data warehouse
    Hive essentially maintains, in MySQL (its metastore), a mapping onto files already stored in HDFS, which makes it convenient to manage and query them with HQL.
  2. For data analysis and cleaning
    Hive is suitable for offline, high-latency data analysis and cleaning.
  3. Based on HDFS and MapReduce
    The data stored in Hive still sits on DataNodes, and the HQL statements you write are ultimately converted into MapReduce jobs for execution.
  4. Analysis framework; the metadata is kept in MySQL.

HBase

  1. Database
    A non-relational database with column-family storage.
  2. Stores structured and unstructured data (in practice mostly structured; MongoDB leans more towards unstructured data)
    Suitable for storing single-table, non-relational data; not suitable for operations such as SUM, AVG or JOIN-like associated queries.
  3. Based on HDFS
    Data is persisted as HFiles stored on DataNodes and managed by RegionServers in the form of regions.
  4. Low latency, serves online access
    Facing large volumes of enterprise data, HBase can store huge amounts of data in a single table while providing efficient access speed.
  5. HBase keeps its metadata information itself.
  6. If the data volume is not that large, Redis or ES is usually a better fit.

HBase and Hive integrated use

Important tip: the HBase and Hive integration is not compatible between the latest versions of the two, so we can only, with tears in our eyes, recompile hive-hbase-handler-1.2.2.jar ourselves. Once they are compatible, the data of the associated Hive table is effectively stored in HBase. Search online for the compatibility steps.

Environment preparation
Because operating Hive may later also affect HBase, Hive needs to hold the jars for operating HBase, so copy the jars Hive depends on from HBase into Hive's lib directory (or use soft links):

export HBASE_HOME=/opt/module/hbase
export HIVE_HOME=/opt/module/hive

ln -s $HBASE_HOME/lib/hbase-common-1.3.1.jar  $HIVE_HOME/lib/hbase-common-1.3.1.jar
ln -s $HBASE_HOME/lib/hbase-server-1.3.1.jar $HIVE_HOME/lib/hbase-server-1.3.1.jar
ln -s $HBASE_HOME/lib/hbase-client-1.3.1.jar $HIVE_HOME/lib/hbase-client-1.3.1.jar
ln -s $HBASE_HOME/lib/hbase-protocol-1.3.1.jar $HIVE_HOME/lib/hbase-protocol-1.3.1.jar
ln -s $HBASE_HOME/lib/hbase-it-1.3.1.jar $HIVE_HOME/lib/hbase-it-1.3.1.jar
ln -s $HBASE_HOME/lib/htrace-core-3.1.0-incubating.jar $HIVE_HOME/lib/htrace-core-3.1.0-incubating.jar
ln -s $HBASE_HOME/lib/hbase-hadoop2-compat-1.3.1.jar $HIVE_HOME/lib/hbase-hadoop2-compat-1.3.1.jar
ln -s $HBASE_HOME/lib/hbase-hadoop-compat-1.3.1.jar $HIVE_HOME/lib/hbase-hadoop-compat-1.3.1.jar

At the same time, modify the ZooKeeper properties in hive-site.xml as follows:

<property>
  <name>hive.zookeeper.quorum</name>
  <value>hadoop102,hadoop103,hadoop104</value>
  <description>The list of ZooKeeper servers to talk to. This is only needed for read/write locks.</description>
</property>
<property>
  <name>hive.zookeeper.client.port</name>
  <value>2181</value>
  <description>The port of ZooKeeper servers to talk to. This is only needed for read/write locks.</description>
</property>
1. Case 1

Goal: create a Hive table associated with an HBase table, so that inserting data into the Hive table also affects the HBase table.
Step by step: the Hive columns are mapped to the HBase columns by position via hbase.columns.mapping, somewhat like associating Hive with other external stores (e.g. neo4j).

  1. Create a table in Hive that is associated with an HBase table
CREATE TABLE hive_hbase_emp_table(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,info:ename,info:job,info:mgr,info:hiredate,info:sal,info:comm,info:deptno")
TBLPROPERTIES ("hbase.table.name" = "hbase_emp_table");

Tip: after this completes, you can go into Hive and HBase respectively to check that the corresponding tables have been created.
  2. Create a temporary intermediate table in Hive to load the file data into
Note: data cannot be loaded directly into the Hive table that is associated with HBase.

CREATE TABLE emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by '\t';
  3. Load data into the Hive intermediate table (loading directly into the HBase-associated table would fail!)
hive> load data local inpath '/home/admin/softwares/data/emp.txt' into table emp;
  4. Use the insert command to import the data from the intermediate table into the HBase-associated Hive table. Once this succeeds the data also lands in HBase; keep HBase's flush behaviour in mind.
hive> insert into table hive_hbase_emp_table select * from emp;
  5. Check whether Hive and the associated HBase table have both received the inserted data
hive> select * from hive_hbase_emp_table;
hbase> scan 'hbase_emp_table'
2. Case 2 (more common)

Goal: an HBase table named hbase_emp_table already exists; create an external table in Hive that is associated with hbase_emp_table, so that Hive can be used to analyze the data in HBase.
Note: case 2 builds on case 1, so complete case 1 first.

  1. Create an external table in Hive
CREATE EXTERNAL TABLE relevance_hbase_emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
STORED BY 
'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = 
":key,info:ename,info:job,info:mgr,info:hiredate,info:sal,info:comm,info:deptno") 
TBLPROPERTIES ("hbase.table.name" = "hbase_emp_table");
  2. After the association, Hive functions can be used for some analysis operations
hive (default)> select * from relevance_hbase_emp;
