hive中路径转换(漏斗模型)

    技术2024-07-17  92

    1 加载数据

    load data local inpath '/root/hivedata/click-part-r-00000' overwrite into table ods_click_pageviews partition(datestr='20181103');

    2 查询每一个步骤的总访问人数

    UNION All将多个SELECT语句的结果集合并为一个独立的结果集

    create table dw_oute_numbs as select 'step1' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20181103' and request like '/item%' union all select 'step2' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20181103' and request like '/category%' union all select 'step3' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20181103' and request like '/order%' union all select 'step4' as step,count(distinct remote_addr) as numbs from ods_click_pageviews where datestr='20181103' and request like '/index%';

    3 结果

    +---------------------+----------------------+--+ | dw_oute_numbs.step | dw_oute_numbs.numbs | +---------------------+----------------------+--+ | step1 | 1029 | | step2 | 1029 | | step3 | 1028 | | step4 | 1018 | +---------------------+----------------------+--+

    4 查询每一步骤相对于路径起点人数的比例级联查询,自己跟自己join

    select * from (select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep, rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr) tmp where rrstep ="step1"; select (rnnumbs/rrnumbs)*100 from (select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep, rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr) tmp where rrstep ="step1";

    5 自join后结果如下所示:

    +---------+----------+---------+----------+--+ | rnstep | rnnumbs | rrstep | rrnumbs | +---------+----------+---------+----------+--+ | step1 | 1029 | step1 | 1029 | | step2 | 1029 | step1 | 1029 | | step3 | 1028 | step1 | 1029 | | step4 | 1018 | step1 | 1029 | | step1 | 1029 | step2 | 1029 | | step2 | 1029 | step2 | 1029 | | step3 | 1028 | step2 | 1029 | | step4 | 1018 | step2 | 1029 | | step1 | 1029 | step3 | 1028 | | step2 | 1029 | step3 | 1028 | | step3 | 1028 | step3 | 1028 | | step4 | 1018 | step3 | 1028 | | step1 | 1029 | step4 | 1018 | | step2 | 1029 | step4 | 1018 | | step3 | 1028 | step4 | 1018 | | step4 | 1018 | step4 | 1018 | +---------+----------+---------+----------+--+

    6 每一步的人数/第一步的人数==每一步相对起点人数比例

    select tmp.rnstep, tmp.rnnumbs/tmp.rrnumbs as ratio from (select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep, rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr) tmp where tmp.rrstep='step1';--tmp

    7 结果

    +---------+----------+---------+----------+--+ | rnstep | rnnumbs | rrstep | rrnumbs | +---------+----------+---------+----------+--+ | step1 | 1029 | step1 | 1029 | | step2 | 1029 | step1 | 1029 | | step3 | 1028 | step1 | 1029 | | step4 | 1018 | step1 | 1029 |

    8 查询每一步骤相对于上一步骤的漏出率首先通过自join表过滤出每一步跟上一步的记录

    select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep, rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr where cast(substr(rn.step,5,1) as int)=cast(substr(rr.step,5,1) as int)-1;

    9 注意:cast为Hive内置函数 类型转换

    select cast(1 as float); --1.0 select cast(‘2016-05-22’ as date); --2016-05-22

    10 结果

    +---------+----------+---------+----------+--+ | rnstep | rnnumbs | rrstep | rrnumbs | +---------+----------+---------+----------+--+ | step1 | 1029 | step2 | 1029 | | step2 | 1029 | step3 | 1028 | | step3 | 1028 | step4 | 1018 | +---------+----------+---------+----------+--+

    11 然后就可以非常简单的计算出每一步相对上一步的漏出率

    select tmp.rrstep as step, tmp.rrnumbs/tmp.rnnumbs as leakage_rate from ( select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep,rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr) tmp where cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1;

    12 汇总以上两种指标

    select abs.step, abs.numbs, abs.rate as abs_ratio, rel.rate as leakage_rate from (select tmp.rnstep as step, tmp.rnnumbs as numbs, tmp.rnnumbs/tmp.rrnumbs as rate from (select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep, rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr) tmp where tmp.rrstep='step1') abs left outer join (select tmp.rrstep as step, tmp.rrnumbs/tmp.rnnumbs as rate from (select rn.step as rnstep, rn.numbs as rnnumbs, rr.step as rrstep, rr.numbs as rrnumbs from dw_oute_numbs rn inner join dw_oute_numbs rr) tmp where cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1) rel on abs.step=rel.step;
    Processed: 0.172, SQL: 9