Hive函数进阶(hive常用函数总结)
Hive函数进阶
explode(UDTF)
-- NBA championship teams: each team's championship years are kept
-- in a single array<string> column, split by '|' in the data file.
create table the_nba_championship (
    team_name     string,
    champion_year array<string>
)
row format delimited
fields terminated by ','
collection items terminated by '|';
-- Then use explode() to expand champion_year into one row per year.
NBA总冠军球队
lateral View
行列转换 concat
列比较
-- Column-wise comparison: one row with one count column per gender.
-- count(case ... then id end) counts only matching rows because the
-- CASE yields NULL (ignored by count) for non-matching rows.
-- FIX: Hive column aliases must be identifiers (optionally backquoted),
-- not single-quoted string literals as in the original.
select
    count(case when gender = '男' then id end) as `男`,
    count(case when gender = '女' then id end) as `女`
from students;
男 女 30 40 行比较
-- Row-wise comparison: one output row per gender.
select
    gender,
    count(id) as count
from student
group by gender;
-- Expected output of the query above:
--   gender  count
--   男      30
--   女      40

-- Source table for the row-to-column demo.
create table row2col2 (
    col1 string,
    col2 string,
    col3 int
)
row format delimited
fields terminated by '\t';
-- Source table for the column-to-row demo.
create table col2row2 (
    col1 string,
    col2 string,
    col3 string
)
row format delimited
fields terminated by '\t';
列转行
行转列
json数据处理
-- Raw approach: store each JSON record as a single string column and
-- parse fields at query time.
create table tb_json_test1 (
    json string
);
-- Extract one field per call (one JSON parse per call):
--   get_json_object(json, '$.deviceType')
-- SerDe approach: JsonSerDe parses each JSON record into typed
-- columns when the table is read; `time` is backquoted (reserved word).
create table tb_json_test2 (
    device     string,
    deviceType string,
    signal     double,
    `time`     string
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile;
-- Alternative to get_json_object: extract several fields in one parse:
--   json_tuple(json, 'device', 'deviceType')
-- Same SerDe-based table as shown earlier in the article.
-- FIX: tb_json_test2 was already created above, so a plain CREATE TABLE
-- would fail; IF NOT EXISTS keeps the script re-runnable.
create table if not exists tb_json_test2 (
    device     string,
    deviceType string,
    signal     double,
    `time`     string
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile;
-- Extension: XML data handling with the xpath() UDF.
desc function extended xpath;
-- Returns ["b1","b2","b3"]: the text of every <b> element under <a>.
SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/b/text()');
SerDe
多个字段
单个字段
窗口函数
聚合函数、分析函数、排序函数 over(partition by 字段1... order by 字段1... rows 范围)
建表
-- Create the demo tables used by the window-function examples.
create table website_pv_info (
    cookieid   string,
    createtime string,  -- day
    pv         int
)
row format delimited
fields terminated by ',';

-- FIX: the original second statement was missing its terminating semicolon.
create table website_url_info (
    cookieid   string,
    createtime string,  -- visit time
    url        string   -- visited page
)
row format delimited
fields terminated by ',';
求出每个用户总pv数sum+group by普通常规聚合操作
-- Per-user total pv: plain GROUP BY aggregation (one row per user).
select
    cookieid,
    sum(pv)
from website_pv_info
group by cookieid;
求出网站总的pv数 所有用户所有访问加起来
select sum(pv) from website_pv_info;
求出每个用户总pv数
-- Per-user total pv computed as a window: unlike GROUP BY, every
-- detail row is kept and carries the per-user total alongside it.
select
    cookieid,
    sum(pv) over (partition by cookieid)
from website_pv_info;
求出每个用户截止到当天,累积的总pv数
-- Cumulative pv per user up to each day.
-- "rows between unbounded preceding and current row" is equivalent to
-- the shorthand "rows unbounded preceding".
select cookieid,sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row ) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime rows unbounded preceding) from website_pv_info;
-- CAUTION: with ORDER BY but no explicit frame, the default frame is
-- RANGE between unbounded preceding and current row, which aggregates
-- ties in createtime together — NOT identical to the ROWS variants
-- above when createtime has duplicate values.
select cookieid,sum(pv) over(partition by cookieid order by createtime ) from website_pv_info;
自定义窗口范围
例如:rows between unbounded preceding and current row(也可自定义为 rows between 3 preceding and current row 等范围)
找出每个用户访问pv最多的Top3重复并列的不考虑
-- Ranking window functions:
--   rank()       -> leaves gaps after ties        (1, 1, 3)
--   dense_rank() -> no gaps after ties            (1, 1, 2)
--   row_number() -> unique sequence, ties broken arbitrarily
--   ntile(n)     -> splits each partition into n buckets
select *,rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,row_number() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;

-- Top-3 pv days per user (note Hive's FROM-first SELECT syntax).
with tb1 as (
    select *,
           dense_rank() over (partition by cookieid order by pv desc) as rank_num
    from website_pv_info
)
from tb1
select *
where rank_num <= 3;
统计每个用户pv数最多的前3分之1天
-- Top third of days by pv for each user: ntile(3) deals each user's
-- rows (ordered by pv desc) into 3 buckets; bucket 1 is the top third.
with tb1 as (
    select *,
           ntile(3) over (partition by cookieid order by pv desc) as rank_num
    from website_pv_info
)
from tb1
select *
where rank_num = 1;
分析函数
-- lag(): value from a previous row of the partition.
select *,lag(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- lead(): value from a following row of the partition.
select *,lead(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- first_value(): first value of the window frame.
select *,first_value(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- last_value(): the frame must be extended explicitly — with ORDER BY
-- the default frame ends at the current row, so without the ROWS clause
-- last_value would just return the current row's value.
select *,last_value(pv) over (partition by cookieid order by pv desc rows between unbounded preceding and unbounded following) as rank_num from website_pv_info;
-- Total pv plus a window over the aggregated result.
-- FIX: the original was missing the comma between the two expressions.
select sum(pv), sum(sum(pv)) over () from website_pv_info;
数据压缩
压缩算法
map压缩
-- 1) Enable compression of Hive intermediate (inter-stage) data.
set hive.exec.compress.intermediate=true;
-- 2) Enable compression of MapReduce map output.
set mapreduce.map.output.compress=true;
-- 3) Choose the codec for map output.
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
reduce压缩
-- 1) Enable compression of Hive final output data.
set hive.exec.compress.output=true;
-- 2) Enable compression of the MapReduce job's final output.
set mapreduce.output.fileoutputformat.compress=true;
-- 3) Choose the final output codec.
set mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
-- 4) Use block-level (rather than record-level) compression.
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
数据存储格式
行存储和列存储
TextFILE
ORC
PARQUET
格式对比
TEXTfile
-- Access-log table stored as plain text (row-oriented TEXTFILE).
create table log_text (
    track_time  string,
    url         string,
    session_id  string,
    referer     string,
    ip          string,
    end_user_id string,
    city_id     string
)
row format delimited fields terminated by '\t'
stored as textfile;
orc
-- Same schema stored as ORC (columnar, compressed) for comparison.
create table log_orc (
    track_time  string,
    url         string,
    session_id  string,
    referer     string,
    ip          string,
    end_user_id string,
    city_id     string
)
row format delimited fields terminated by '\t'
stored as orc;
PARQUET
-- Same schema stored as Parquet (columnar) for comparison.
create table log_parquet (
    track_time  string,
    url         string,
    session_id  string,
    referer     string,
    ip          string,
    end_user_id string,
    city_id     string
)
row format delimited fields terminated by '\t'
stored as parquet;
作者:life_is_short
链接:https://juejin.cn/post/7029581350181011464