Hive common functions, window functions, UDF, UDTF

1. System built-in functions

  • View the functions provided by the system
    hive> show functions;
    
  • Displays the usage of the built-in function
    //View the usage of the upper function
    hive> desc function upper;
    
  • Displays the usage of the built-in function in detail
    //View the detailed usage of the upper function
    hive> desc function extended upper;
    

2. Common functions

2.1 Relational operators

// Equality comparison 
= == <=>
// Inequality comparison 
!= <>
// Range comparison:
 select * from students where id between 1500100001 and 1500100010;
// NULL / non-NULL checks:
is null, is not null, nvl(), isnull()

//If value is NULL, nvl returns default_value; otherwise it returns value. If both arguments are NULL, it returns NULL.
NVL(value, default_value)
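
For example (a small sketch, with return values shown as comments):
select nvl(null, 0);   // 0
select nvl(1, 0);      // 1
select isnull(null);   // true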
// Pattern matching
like (SQL wildcards), rlike / regexp (Java regular expressions)
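
A minimal sketch of the pattern-matching operators, assuming the students table used elsewhere in this document (clazz values such as 'class1' are an assumption):

select * from students where clazz like 'class%';     // SQL wildcards: % (any sequence) and _ (one character)
select * from students where clazz rlike 'class[12]'; // Java regular expression
select * from students where clazz regexp '^class';   // regexp behaves the same as rlike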

2.2 Numeric functions

Random number between 0 and 1: rand()
Rounding (to the nearest value): round()
Round up: ceil()
Round down: floor()
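
A minimal sketch of these numeric functions, with results shown as comments:

select rand();          // random double between 0 and 1
select round(3.567, 2); // 3.57
select ceil(3.2);       // 4
select floor(3.8);      // 3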

2.3 Conditional functions

//if(condition, value returned if the condition is true, value returned if the condition is false)
select if(1>0,1,0); //1
select if(1>0,if(-1>0,-1,1),0);//1

//coalesce returns the first non-null value
select COALESCE(null,'1','2'); // 1  scans from left to right until a non-null value is found
select COALESCE('1',null,'2'); // 1

//case when
select  score
        ,case when score>120 then 'excellent'
              when score>100 then 'good'
              when score>90 then 'pass'
        else 'fail'
        end as level
from score limit 20;

select  name
        ,case name when "Shi Xiaohuai" then "Sophora japonica ge"
                  when "LV Jinpeng" then "Peng ge"
                  when "Dan Lerui" then "Pistil jie"
        else "Forget it, stop yelling"
        end as nickname
from students limit 10;

2.4 Date functions

//Convert a timestamp into a date string with the specified format
select from_unixtime(1610611142,'yyyy/MM/dd HH:mm:ss');
//unix_timestamp() returns the current timestamp, which is then formatted
select from_unixtime(unix_timestamp(),'yyyy/MM/dd HH:mm:ss');

// Parse a date string in a custom format and reformat it: '2021/01/14' -> '2021-01-14'
select from_unixtime(unix_timestamp('2021/01/14','yyyy/MM/dd'),'yyyy-MM-dd');

//Format the given date into another format: '2021-11-28' -> '2021/11/28'
select date_format('2021-11-28', 'yyyy/MM/dd');
//Add 5 days to the date
select date_add('2021-11-28',5);
//Subtract 5 days from the date
select date_sub('2021-11-28',5);
//Number of days between two dates
select datediff('2021-11-28','2021-11-20');


2.5 String functions

//String concatenation. If any argument is NULL, the result is NULL
concat('123','456'); // 123456
concat('123','456',null); // NULL

//Concatenate with a specified delimiter, ignoring NULLs
select concat_ws('#','a','b','c'); // a#b#c
select concat_ws('#','a','b','c',NULL); // a#b#c  NULL values are automatically ignored
select concat_ws("|",cast(id as string),name,cast(age as string),gender,clazz) from students limit 10;

//Substring extraction. Positions start at 1; a negative start position counts from the end of the string
select substring("abcdefg",1); // abcdefg  positions in HQL start at 1
select substring('helloworld', -5); //world
//Start at position 6 and take 5 characters
select substring('helloworld', 6, 5); //world
// '2021/01/14' -> '2021-01-14'
select concat_ws("-",substring('2021/01/14',1,4),substring('2021/01/14',6,2),substring('2021/01/14',9,2));

//String splitting
select split("abcde,fgh",","); // ["abcde","fgh"]
select split("a,b,c,d,e,f",",")[2]; // c
//Split, then explode the resulting array into rows
select explode(split("abcde,fgh",",")); // abcde
                                        // fgh

// Parse JSON-formatted data 
// Usage: get_json_object(json string, path), where $ denotes the root
select get_json_object('{"name":"zhangsan","age":18,"score":[{"course_name":"math","score":100},{"course_name":"english","score":60}]}',"$.score[0].score"); // 100
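
A couple more paths against a similar JSON string, as a small sketch:

select get_json_object('{"name":"zhangsan","age":18}', '$.name'); // zhangsan
select get_json_object('{"name":"zhangsan","age":18}', '$.age');  // 18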

2.6 WordCount in Hive

create table words(
    words string
)row format delimited fields terminated by '|';

// data
hello,java,hello,java,scala,python
hbase,hadoop,hadoop,hdfs,hive,hive
hbase,hadoop,hadoop,hdfs,hive,hive

select a.word, count(*) as cnt from (select explode(split(words,',')) word from words) a group by a.word;

// result
hadoop	4
hbase	2
hdfs	2
hello	2
hive	4
java	2
python	1
scala	1

3. Column to row

lateral view:
Usage: LATERAL VIEW udtf(expression) tableAlias AS columnAlias
lateral view explode
splits a single column of data into multiple rows, and the exploded rows can then be aggregated.

raw data
name	score (language, mathematics, English)
zs	80,88,90
ls	95,81,81

Target results
zs	80 //language
zs	88 //mathematics
zs	90 //English
ls	95 //language
ls	81 //mathematics
ls	81 //English

Steps

//Create tables and import data
create table stu_score(
        name string,
        score string
)
row format delimited fields terminated by '\t';

//Query statement
//First split the score column on ',' and then explode it into multiple rows
select name, sco from stu_score 
lateral view explode(split(score,',')) t1 as sco;

4. Row to column

Functions used:
concat()
concat_ws()
The two string functions above were introduced earlier
collect_set(col): accepts only primitive types; it deduplicates the values of a column and aggregates them into an array

raw data
name	province	city
Zhang San	Anhui	Hefei
Li Si	Anhui	Bengbu
Wang Wu	Jiangsu	Nanjing
Zhao Liu	Jiangsu	Suzhou
Xiao Ming	Anhui	Hefei
Xiao Hong	Jiangsu	Nanjing

Target results
Anhui,Hefei	Zhang San|Xiao Ming
Anhui,Bengbu	Li Si
Jiangsu,Nanjing	Wang Wu|Xiao Hong
Jiangsu,Suzhou	Zhao Liu

Steps

//Create table statement
create table person(
        name string,
        province string,
        city string
)
row format delimited fields terminated by '\t';
//Load data
load data local inpath '/root/data/person.txt' into table person;

//Processing results
//Step 1: concatenate province and city
select name, concat(province,',',city) as city from person;
//Step 2: group by city and collect the names into a set
select city, collect_set(name) from (
    select name, concat(province,',',city) as city from person
)as r1 group by city;

//Final result: join the collected names with '|'
select city, concat_ws('|',collect_set(name)) as names from (
    select name, concat(province,',',city) as city from person
)as r1 group by city;



5. Window functions

A window function opens a "window" of rows for each row of data, which is why it is called a window function.

  In SQL there is a class of functions called aggregate functions, such as sum(), avg(), and max(). They aggregate multiple rows into one row according to some rule, so the number of rows after aggregation is generally smaller than before. Sometimes, however, we want to display both the original rows and the aggregated result at the same time; this is where window functions come in.

over(): specifies the window of rows over which the analytic function operates; the window may change from row to row

5.1. Common window functions

  • row_number: adds a column of sequential numbers; tied values do not share a rank (see the top-3 sketch after this list)
    Usage: select xxxx, row_number() over(partition by <partition column> order by <sort column> desc) as rn from tb
  • dense_rank: tied values share a rank, and the following ranks increase consecutively (no gaps)
  • rank: tied values share a rank, and the following ranks are skipped (gaps)
  • percent_rank: (rank - 1) / (number of rows in the partition - 1)
  • cume_dist: computes the cumulative distribution of a value within a window or partition.
    Assuming ascending order, the cumulative distribution is: (number of rows whose value is less than or equal to the current value x) / (total number of rows in the window or partition), where x is the value of the order by column in the current row.
  • NTILE(n): divides the rows in the partition into n groups and labels each row with its group number
  • max, min, avg, count, sum: computed over the rows of each window
  • LAG(col,n,default_val): the value of col from the nth preceding row (default_val if no such row exists)
  • LEAD(col,n,default_val): the value of col from the nth following row (default_val if no such row exists)
  • FIRST_VALUE: the first value in the window, after sorting, up to the current row
  • LAST_VALUE: the last value in the window, after sorting, up to the current row; tied values take the last of the ties
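
A typical use of row_number is picking the top N rows per group. A minimal sketch, assuming the new_score table created just below: take the top 3 scores in each class.

select  id
        ,score
        ,clazz
from (
    select  id
            ,score
            ,clazz
            ,row_number() over(partition by clazz order by score desc) as rn
    from new_score
) t
where t.rn <= 3;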

test data

id,score,clazz,department

111,69,class1,department1
112,80,class1,department1
113,74,class1,department1
114,94,class1,department1
115,93,class1,department1
121,74,class2,department1
122,86,class2,department1
123,78,class2,department1
124,70,class2,department1
211,93,class1,department2
212,83,class1,department2
213,94,class1,department2
214,94,class1,department2
215,82,class1,department2
216,74,class1,department2
221,99,class2,department2
222,78,class2,department2
223,74,class2,department2
224,80,class2,department2
225,85,class2,department2

Create table statement

create table new_score(
     id  int
    ,score int
    ,clazz string
    ,department string
) row format delimited fields terminated by ",";

//Load data
load data local inpath '/root/data/new_score.txt' into table new_score;

5.2 Testing row_number(), rank(), etc.

Test row_number(), dense_rank(), rank(), percent_rank()
Partition by class and sort by score in descending order

select 
    id
    ,score
    ,clazz
    ,department
    ,row_number() over(partition by clazz order by score desc) as rn
    ,dense_rank() over(partition by clazz order by score desc) as dr
    ,rank() over (partition by clazz order by score desc) as rk
    ,percent_rank() over (partition by clazz order by score desc) as percent_rk
from new_score;

5.3 Testing lag, lead, and related functions

Test lag(), lead(), first_value(), last_value(), ntile()
Partition by class and sort by score in descending order

select  id
        ,score
        ,clazz
        ,department
        ,lag(id,2) over (partition by clazz order by score desc) as lag_num
        ,lead(id,2) over (partition by clazz order by score desc) as lead_num
        ,first_value(id) over (partition by clazz order by score desc) as first_v_num
        ,last_value(id) over (partition by clazz order by score desc) as last_v_num
        ,ntile(3) over (partition by clazz order by score desc) as ntile_num
from new_score;
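
Note: with the default window frame (see section 6), last_value returns the current row's value (or the last of its ties), not the last value of the entire partition. A minimal sketch of widening the frame to get the partition-wide last value:

select  id
        ,score
        ,clazz
        ,last_value(id) over (partition by clazz order by score desc
                              rows between unbounded preceding and unbounded following) as last_v_all
from new_score;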

6. Window clause (window frame) and WITH clause

6.1 Window clause

Window clause format:

window w as (partition by <partition column> order by <sort column> rows|range between <start position> and <end position>)

Window frame size explanation:

  • CURRENT ROW: the current row
  • n PRECEDING: the n rows before the current row
  • n FOLLOWING: the n rows after the current row
  • UNBOUNDED: no bound;
    UNBOUNDED PRECEDING means the frame starts at the first row of the partition,
    UNBOUNDED FOLLOWING means the frame ends at the last row of the partition

Hive provides two forms of defining window frames: ROWS and RANGE. Both types require upper and lower bounds to be configured.

  • Example: ROWS
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW selects all rows from the first record of the partition up to the current record;

  • Example: RANGE
    sum(close) RANGE BETWEEN 100 PRECEDING AND 200 FOLLOWING selects rows by the difference of the field values.
    If the close value of the current row is 200, this window frame selects the rows in the partition whose close value falls between 100 and 400.

If no window frame is defined, the default is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.

The following are all possible combinations of window frame definitions.

(ROWS | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN [num] FOLLOWING AND (UNBOUNDED | [num]) FOLLOWING
Example: range between 3 PRECEDING and 11 FOLLOWING

Note: a window frame can only be used with the window functions max, min, avg, count, sum, FIRST_VALUE, and LAST_VALUE
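
A small sketch of a RANGE frame, assuming the new_score table from section 5: for each row, sum the scores of rows in the same class whose score lies within 5 points below up to 5 points above the current score.

SELECT id
     ,score
     ,clazz
     ,SUM(score) OVER (PARTITION BY clazz ORDER BY score RANGE BETWEEN 5 PRECEDING AND 5 FOLLOWING) as sum_range
FROM new_score;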

6.2 Using the window clause

Test SQL that specifies the window size. rows between 2 PRECEDING and 2 FOLLOWING: five rows in total, from the two rows before the current row to the two rows after it

SELECT id
     ,score
     ,clazz
     ,SUM(score) OVER w as sum_w
     ,round(avg(score) OVER w,3) as avg_w
     ,count(score) OVER w as cnt_w
FROM new_score
window w AS (PARTITION BY clazz ORDER BY score rows between 2 PRECEDING and 2 FOLLOWING);

At partition boundaries the window shrinks: for the first row of each class, for example, the frame covers only the current row and the two following rows.

6.3 WITH clause

WITH format

with table_name_1 as (query_1),
     table_name_2 as (query_2)
select * from table_name_2;

The WITH clause defines a query result as a temporary table and makes it available to the query that follows.
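
A small sketch, assuming the new_score table from section 5: define the class averages as a temporary table and then query it.

with class_avg as (
    select clazz, round(avg(score), 2) as avg_score
    from new_score
    group by clazz
)
select * from class_avg;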

7. Custom functions

7.1 UDF: one row in, one row out

  • Create a Maven project and add the following dependency
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>1.2.1</version>
</dependency>
  • Write the code: extend org.apache.hadoop.hive.ql.exec.UDF and implement an evaluate method containing your own logic
import org.apache.hadoop.hive.ql.exec.UDF;

public class HiveUDF extends UDF {
    // hadoop => #hadoop$
    public String evaluate(String col1) {
        // Prepend '#' and append '$' to the incoming value
        String result = "#" + col1 + "$";
        return result;
    }
}
  • Make a jar package and upload it to the Linux virtual machine
  • In the hive shell, add the jar as a resource to the Hive environment using add jar <path>
    add jar /usr/local/soft/jars/HiveUDF2-1.0.jar;
    
  • Register a temporary function from the jar resource. fxxx1 is your function name and 'HiveUDF' is the main class name
    create temporary function fxxx1 as 'HiveUDF';
    
  • Process data using function names
    select fxxx1(name) as fxx_name from students limit 10;
    
    //result
    #Shi Xiaohuai$
    #LV Jinpeng$
    #Dan Lerui$
    #Ge Deyao$
    #Xuanguqin$
    #Bian angxiong$
    #Shang Gufeng$
    #Fu banshuang$
    #Shen Dechang$
    #Yi Yanchang$
    

7.2 UDTF: one row in, multiple rows out

Example:

Original data
"key1:value1,key2:value2,key3:value3"

Target results
key1 value1
key2 value2
key3 value3
  1. Method 1: use explode + split
with r1 as (select explode(split('key1:value1,key2:value2,key3:value3',',')) as kv)
select split(kv,':')[0] as k,split(kv,':')[1] as v from r1;

  2. Method 2: custom UDTF

code:

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;

public class HiveUDTF extends GenericUDTF {
    // Specify the output column name and type
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldObjs = new ArrayList<ObjectInspector>();
        fieldNames.add("col1");
        fieldObjs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("col2");
        fieldObjs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldObjs);
    }

    // Processing logic my_udtf(col1,col2,col3)
    // "key1:value1,key2:value2,key3:value3"
    // my_udtf("key1:value1,key2:value2,key3:value3")
    public void process(Object[] objects) throws HiveException {
        // objects represents the incoming N columns
        String col = objects[0].toString();
        // key1:value1  key2:value2  key3:value3
        String[] splits = col.split(",");
        for (String str : splits) {
            String[] cols = str.split(":");
            // Output data
            forward(cols);
        }

    }

    // Called at the end of the UDTF
    public void close() throws HiveException {
    }
}
First, package the code above into a jar and upload it to Linux.

In the hive shell, use add jar <path> to add the jar as a resource to the Hive environment:
add jar /usr/local/soft/jars/HiveUDTF-1.0.jar;

Register a temporary function from the jar resource. my_udtf is your function name and 'HiveUDTF' is the main class name:
create temporary function my_udtf as 'HiveUDTF';

Process the data:
select my_udtf("key1:value1,key2:value2,key3:value3");
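
Note that Hive does not allow a UDTF call to appear in the same SELECT list as other columns. To combine its output with other columns, use it together with lateral view; a sketch, where some_table and json_col are hypothetical names:

select  t.id
        ,kv.col1
        ,kv.col2
from some_table t
lateral view my_udtf(t.json_col) kv as col1, col2;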

8. Example: continuous login problem

  In e-commerce, logistics, and banking, a common requirement is to compute, for each user, the total amount of consecutive transactions, the number of consecutive login days, the start and end dates of each consecutive streak, the number of days between streaks, and so on.

8.1 data:

Note: each user may have multiple records every day

id	datestr	  amount
1,2019-02-08,6214.23 
1,2019-02-08,6247.32 
1,2019-02-09,85.63 
1,2019-02-09,967.36 
1,2019-02-10,85.69 
1,2019-02-12,769.85 
1,2019-02-13,943.86 
1,2019-02-14,538.42
1,2019-02-15,369.76
1,2019-02-16,369.76
1,2019-02-18,795.15
1,2019-02-19,715.65
1,2019-02-21,537.71
2,2019-02-08,6214.23 
2,2019-02-08,6247.32 
2,2019-02-09,85.63 
2,2019-02-09,967.36 
2,2019-02-10,85.69 
2,2019-02-12,769.85 
2,2019-02-13,943.86 
2,2019-02-14,943.18
2,2019-02-15,369.76
2,2019-02-18,795.15
2,2019-02-19,715.65
2,2019-02-21,537.71
3,2019-02-08,6214.23 
3,2019-02-08,6247.32 
3,2019-02-09,85.63 
3,2019-02-09,967.36 
3,2019-02-10,85.69 
3,2019-02-12,769.85 
3,2019-02-13,943.86 
3,2019-02-14,276.81
3,2019-02-15,369.76
3,2019-02-16,369.76
3,2019-02-18,795.15
3,2019-02-19,715.65
3,2019-02-21,537.71

Create table statement:

create table deal_tb(
    id string
    ,datestr string
    ,amount string
)row format delimited fields terminated by ',';

8.2 Calculation approach

  • First, group by user and date and sum the amounts, so that each user has at most one row per day

     select  id
             ,datestr
             ,sum(amount) as sum_amount
     from deal_tb
     group by id,datestr;
    

  • Within each user, sort by date and assign a row number; subtracting the row number (in days) from the date yields a group key, and rows that share the same key belong to one continuous login streak (see the hand-derived illustration after the query below)

    with t1 as (select 
                     id
                    ,datestr
                    ,sum(amount) as sum_amount 
                from deal_tb 
                group by id,datestr),
         t2 as (select 
                    t1.id
                    ,t1.datestr
                    ,t1.sum_amount
                    ,row_number() over(partition by id order by datestr) as rn
                from t1)
    select  
          t2.id
         ,t2.datestr
         ,t2.sum_amount
         ,date_sub(t2.datestr,rn) as grp 
    from t2;
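
    For example, working through user 1 by hand from the sample data, the first few rows returned by this query would be (datestr, rn, grp):
    2019-02-08, 1, 2019-02-07
    2019-02-09, 2, 2019-02-07
    2019-02-10, 3, 2019-02-07
    2019-02-12, 4, 2019-02-08
    2019-02-13, 5, 2019-02-08
    The first three dates share the group key 2019-02-07 and therefore form one continuous streak; the streak starting on 2019-02-12 gets a new key.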
    

  • Finally, for each streak, compute the total transaction amount, the number of consecutive days, the start and end dates, and the number of days since the previous streak

    with t1 as (select 
                     id
                    ,datestr
                    ,sum(amount) as sum_amount 
                from deal_tb 
                group by id,datestr),
         t2 as (select 
                    t1.id
                    ,t1.datestr
                    ,t1.sum_amount
                    ,row_number() over(partition by id order by datestr) as rn
                from t1),
         t3 as (select  
                    t2.id
                    ,t2.datestr
                    ,t2.sum_amount
                    ,date_sub(t2.datestr,rn) as grp 
                from t2)
    select t3.id
           ,t3.grp
           ,round(sum(t3.sum_amount),2) as sc_sum_amount
           ,count(1) as sc_days
           ,min(t3.datestr) as sc_start_date
           ,max(t3.datestr) as sc_end_date
           ,datediff(t3.grp,lag(t3.grp,1) over(partition by t3.id order by t3.grp)) as iv_days
    from t3
    group by t3.id,t3.grp;
    
