1 groupby(大表分组-局部聚合+全局聚合)
示例1:- select label,sum(cnt) as all from
- (
- select rd,label,sum(1) as cnt from
- (
- select id,label,round(rand(),2) as rd,value from tmp1
- ) as tmp
- group by rd,label
- ) as tmp
- group by label;
复制代码 示例2:- select
- split(new_source,'\\_')[0] as source
- ,sum(cnt) as cnt
- from
- (select
- concat(source,'_', rand()*100) as new_source
- ,count(1) as cnt
- from test_table
- where day ='2022-01-01'
- group by
- concat(source,'_', rand()*100)
- )tt
- group by
- split(new_source,'\\_')[0]
复制代码 2 join(大中表Join - 加salt + 小表膨胀)
示例1:- select label,sum(value) as all from
- (
- select rd,label,sum(value) as cnt from
- (
- select tmp1.rd as rd,tmp1.label as label,tmp1.value*tmp2.value as value
- from
- (
- select id,round(rand(),1) as rd,label,value from tmp1
- ) as tmp1
- join
- (
- select id,rd,label,value from tmp2
- lateral view explode(split('0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9',',')) mytable as rd
- ) as tmp2
- on tmp1.rd = tmp2.rd and tmp1.label = tmp2.label
- ) as tmp1
- group by rd,label
- ) as tmp1
- group by label;
复制代码 示例2:- select
- source
- ,source_name
- ,sum(cnt) as cnt
- from
- (select
- t1.source
- ,new_source
- ,nvl(source_name,'未知') as source_name
- ,count(imei) as cnt
- from
- (select
- imei
- ,source
- ,concat(cast(rand()*10 as int ),'_',source ) as new_source
- from test_table_1
- where day ='2022-01-01'
- ) t1
- inner join
- (
- select
- source_name
- ,concat(preflix,'_',source) as new_source
- from test_table_1
- where day ='2022-01-01'
- lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10',','))b as preflix
- ) t2
- on t1.new_source =t2.new_source
- group by
- t1.source
- ,new_source
- ,nvl(source_name,'未知')
- ) tta
- group by
- source
- ,source_name
复制代码 3 双大表Join - 抽样取倾斜key+BroadJoin
- ##优化前:
- create table test.tmp_table_test_all as
- select
- imei
- ,lable_id
- ,nvl(label_name,'未知')
- from tmp_table_1 t1
- left join
- (select
- lable_id
- ,label_name
- from tmp_table_2
- where day ='2024-01-01') t2
- on t1.lable_id =t2.lable_id
- where t1.day ='2024-01-01'
- ;
-
- ## 优化后 :
- create table test.tmp_table_test_all_new as
-
-
- with tmp_table_test_1 as
- (select
- lable_id
- ,count(1) as cnt
- from tmp_table_1 t1
- tablesample(5 percent) --抽样取5%的数据,减少table scan的量
- group by lable_id
- order by cnt desc
- limit 100
- )
-
-
- select
- imei
- ,lable_id
- ,nvl(label_name,'未知') as label_name
- from tmp_table_1 t1
- left join tmp_table_test_1 t2
- on t1.lable_id =t2.lable_id
- left join
- (select
- lable_id
- ,label_name
- from tmp_table_2
- where day ='2024-01-01') t3
- on t1.lable_id =t3.lable_id
- where t1.day ='2024-01-01' and t2.lable_id is null
-
- union all
-
- select
- imei
- ,lable_id
- ,nvl(label_name,'未知') as label_name
- from tmp_table_1 t1
- inner join
- (select
- lable_id
- from tmp_table_test_1 t1
- left join tmp_table_2 t2
- on t1.lable_id =t2.lable_id
- where t2.day ='2024-01-01') t3
- on t1.lable_id =t3.lable_id
- where t1.day ='2024-01-01'
- ;
复制代码 4 小结
到此这篇关于处理Hive中的数据倾斜的方法的文章就介绍到这了,更多相关处理Hive数据倾斜内容请搜索脚本之家以前的文章或继续浏览下面的相关文章希望大家以后多多支持脚本之家!
来源:https://www.jb51.net/database/329624ibi.htm
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作! |
|