• 设为首页
  • 收藏本站
  • 积分充值
  • VIP赞助
  • 手机版
  • 微博
  • 微信
    微信公众号 添加方式:
    1:搜索微信号(888888
    2:扫描左侧二维码
  • 快捷导航
    福建二哥 门户 查看主题

    处理Hive中的数据倾斜的方法

    发布者: Error | 发布时间: 2025-6-20 09:03| 查看数: 104| 评论数: 0|帖子模式

    1 groupby(大表分组-局部聚合+全局聚合)

    示例1:
    1. select label,sum(cnt) as all from
    2. (
    3.     select rd,label,sum(1) as cnt from
    4.     (
    5.         select id,label,round(rand(),2) as rd,value from tmp1
    6.     ) as tmp
    7.     group by rd,label
    8. ) as tmp
    9. group by label;
    复制代码
    示例2:
    1. select
    2.         split(new_source,'\\_')[0] as source
    3.         ,sum(cnt) as cnt
    4. from  
    5. (select  
    6.         concat(source,'_', rand()*100) as  new_source
    7.         ,count(1) as cnt
    8. from  test_table
    9. where day ='2022-01-01'
    10. group by
    11.         concat(source,'_', rand()*100)
    12. )tt
    13. group by
    14.         split(new_source,'\\_')[0]
    复制代码
    2 join(大中表Join - 加salt + 小表膨胀)

    示例1:
    1. select label,sum(value) as all from
    2. (
    3.     select rd,label,sum(value) as cnt from
    4.     (
    5.         select tmp1.rd as rd,tmp1.label as label,tmp1.value*tmp2.value as value
    6.         from
    7.         (
    8.             select id,round(rand(),1) as rd,label,value from tmp1
    9.         ) as tmp1
    10.         join
    11.         (
    12.             select id,rd,label,value from tmp2
    13.             lateral view explode(split('0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9',',')) mytable as rd
    14.         ) as tmp2
    15.         on tmp1.rd = tmp2.rd and tmp1.label = tmp2.label
    16.     ) as tmp1
    17.     group by rd,label
    18. ) as tmp1
    19. group by label;
    复制代码
    示例2:
    1. select
    2.         source
    3.         ,source_name
    4.         ,sum(cnt) as cnt
    5. from  
    6. (select
    7.         t1.source
    8.         ,new_source
    9.         ,nvl(source_name,'未知') as source_name
    10.         ,count(imei) as cnt
    11. from  
    12. (select  
    13.         imei
    14.         ,source
    15.         ,concat(cast(rand()*10 as int ),'_',source ) as new_source
    16. from  test_table_1
    17. where day ='2022-01-01'
    18. ) t1
    19. inner join
    20. (
    21. select
    22.         source_name
    23.         ,concat(preflix,'_',source) as new_source
    24. from  test_table_1
    25. where day ='2022-01-01'
    26. lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10',','))b as preflix
    27. ) t2
    28. on t1.new_source =t2.new_source
    29. group by
    30. t1.source
    31. ,new_source
    32. ,nvl(source_name,'未知')
    33. ) tta  
    34. group by
    35.         source
    36.         ,source_name
    复制代码
    3 双大表Join - 抽样取倾斜key+BroadJoin
    1. ##优化前:
    2. create table test.tmp_table_test_all as
    3. select  
    4. imei
    5. ,lable_id
    6. ,nvl(label_name,'未知')
    7. from tmp_table_1  t1  
    8. left join
    9. (select  
    10. lable_id
    11. ,label_name
    12. from  tmp_table_2
    13. where day ='2024-01-01') t2
    14. on t1.lable_id =t2.lable_id
    15. where t1.day ='2024-01-01'
    16. ;

    17. ## 优化后 :
    18. create table test.tmp_table_test_all_new  as


    19. with tmp_table_test_1 as
    20. (select  
    21. lable_id
    22. ,count(1) as cnt
    23. from tmp_table_1  t1
    24. tablesample(5 percent) --抽样取5%的数据,减少table scan的量
    25. group by lable_id
    26. order by cnt desc
    27. limit 100
    28. )


    29. select  
    30.         imei
    31.         ,lable_id
    32.         ,nvl(label_name,'未知') as  label_name
    33. from tmp_table_1  t1
    34. left join  tmp_table_test_1  t2
    35. on t1.lable_id =t2.lable_id
    36. left join
    37. (select  
    38.         lable_id
    39.         ,label_name
    40. from  tmp_table_2
    41. where day ='2024-01-01') t3
    42. on t1.lable_id =t3.lable_id
    43. where t1.day ='2024-01-01' and  t2.lable_id is null

    44. union all  

    45. select  
    46.         imei
    47.         ,lable_id
    48.         ,nvl(label_name,'未知') as  label_name
    49. from tmp_table_1  t1
    50. inner  join
    51. (select  
    52.         lable_id
    53. from  tmp_table_test_1  t1
    54. left   join   tmp_table_2  t2
    55. on t1.lable_id =t2.lable_id
    56. where t2.day ='2024-01-01') t3
    57. on t1.lable_id =t3.lable_id
    58. where t1.day ='2024-01-01'
    59. ;
    复制代码
    4 小结

    到此这篇关于处理Hive中的数据倾斜的方法的文章就介绍到这了,更多相关处理Hive数据倾斜内容请搜索脚本之家以前的文章或继续浏览下面的相关文章希望大家以后多多支持脚本之家!

    来源:https://www.jb51.net/database/329624ibi.htm
    免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!

    最新评论

    QQ Archiver 手机版 小黑屋 福建二哥 ( 闽ICP备2022004717号|闽公网安备35052402000345号 )

    Powered by Discuz! X3.5 © 2001-2023

    快速回复 返回顶部 返回列表