(从电商项目认识数仓体系六)需求及实现

    技术2022-07-10  101

    需求一

    用户活跃主题

    定义:统计当日、当周、当月活动的每个设备明细

    DWS层表及逻辑设计

    1 每日活跃设备明细分析

    -- 建表语句 DROP TABLE IF EXISTS dws_uv_detail_day; CREATE TABLE dws_uv_detail_day( `mid_id` STRING COMMENT'设备唯一标识', `user_id` STRING COMMENT'用户标识', `version_code` STRING COMMENT'程序版本号', `version_name` STRING COMMENT'程序版本名', `lang` STRING COMMENT'系统语言', `source` STRING COMMENT'渠道号', `os` STRING COMMENT'安卓系统版本', `area` STRING COMMENT'区域', `model` STRING COMMENT'手机型号', `brand` STRING COMMENT'手机品牌', `sdk_version` STRING COMMENT'sdkVersion', `gmail` STRING COMMENT'gmail', `height_width` STRING COMMENT'屏幕宽高', `app_time` STRING COMMENT'客户端日志产生时的时间', `network` STRING COMMENT'网络模式', `lng` STRING COMMENT'经度', `lat` STRING COMMENT'纬度', ) COOMENT'活跃用户天明细' PARTITIONED BY (`dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/dws/des_uv_detail_day' -- 明细 INSERT OVERWRITE TABLE dws_uv_detail_day PARTITION(dt='2019-02-10') SELECT mid_id, collect_set(user_id)[0] user_id, collect_set(version_code)[0] version_code, collect_set(version_name)[0] version_name, collect_set(lang)[0] lang, collect_set(source)[0] source, collect_set(os)[0] os, collect_set(area)[0] area, collect_set(model)[0] model, collect_set(brand)[0] brand, collect_set(sdk_version)[0]sdk_version , collect_set(gmail)[0] gmail, collect_set(height_width)[0] height_width, collect_set(app_time)[0] app_time, collect_set(network)[0] network, collect_set(lng)[0] lng, collect_set(lat)[0] lat FROM dwd_start_log WHERE dt='2019-02-10' GROUP BY mid_id;

    2 每周活跃设备明细分析

    -- 建表语句 DROP TABLE IF EXISTS dws_uv_detail_week; CREATE TABLE dws_uv_detail_week( `mid_id` STRING COMMENT'设备唯一标识', `user_id` STRING COMMENT'用户标识', `version_code` STRING COMMENT'程序版本号', `version_name` STRING COMMENT'程序版本名', `lang` STRING COMMENT'系统语言', `source` STRING COMMENT'渠道号', `os` STRING COMMENT'安卓系统版本', `area` STRING COMMENT'区域', `model` STRING COMMENT'手机型号', `brand` STRING COMMENT'手机品牌', `sdk_version` STRING COMMENT'sdkVersion', `gmail` STRING COMMENT'gmail', `height_width` STRING COMMENT'屏幕宽高', `app_time` STRING COMMENT'客户端日志产生时的时间', `network` STRING COMMENT'网络模式', `lng` STRING COMMENT'经度', `lat` STRING COMMENT'纬度', `monday_date` STRING COMMENT'周一日期', `sunday_date` STRING COMMENT'周日日期', ) COOMENT'活跃用户周明细' PARTITIONED BY (`week_dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/dws/des_uv_detail_week' -- 明细 INSERT OVERWRITE TABLE dws_uv_detail_week PARTITION(week_dt) SELECT mid_id, collect_set(user_id)[0] user_id, collect_set(version_code)[0] version_code, collect_set(version_name)[0] version_name, collect_set(lang)[0] lang, collect_set(source)[0] source, collect_set(os)[0] os, collect_set(area)[0] area, collect_set(model)[0] model, collect_set(brand)[0] brand, collect_set(sdk_version)[0]sdk_version , collect_set(gmail)[0] gmail, collect_set(height_width)[0] height_width, collect_set(app_time)[0] app_time, collect_set(network)[0] network, collect_set(lng)[0] lng, collect_set(lat)[0] lat, DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7), DATE_ADD(NEXT_DAY('2019-02-10','SUNDAY'),-7), CONCAT(DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7),'_',DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-1)) FROM dws_uv_detail_day WHERE dt>=DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7) AND dt<=DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-1) GROUP BY mid_id;

    3 每月活跃设备明细分析

    -- 建表语句 DROP TABLE IF EXISTS dws_uv_detail_month; CREATE TABLE dws_uv_detail_week( `mid_id` STRING COMMENT'设备唯一标识', `user_id` STRING COMMENT'用户标识', `version_code` STRING COMMENT'程序版本号', `version_name` STRING COMMENT'程序版本名', `lang` STRING COMMENT'系统语言', `source` STRING COMMENT'渠道号', `os` STRING COMMENT'安卓系统版本', `area` STRING COMMENT'区域', `model` STRING COMMENT'手机型号', `brand` STRING COMMENT'手机品牌', `sdk_version` STRING COMMENT'sdkVersion', `gmail` STRING COMMENT'gmail', `height_width` STRING COMMENT'屏幕宽高', `app_time` STRING COMMENT'客户端日志产生时的时间', `network` STRING COMMENT'网络模式', `lng` STRING COMMENT'经度', `lat` STRING COMMENT'纬度' ) COOMENT'活跃用户月明细' PARTITIONED BY (`month_dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/dws/des_uv_detail_month' -- 明细 INSERT OVERWRITE TABLE dws_uv_detail_month PARTITION(month_dt) SELECT mid_id, collect_set(user_id)[0] user_id, collect_set(version_code)[0] version_code, collect_set(version_name)[0] version_name, collect_set(lang)[0] lang, collect_set(source)[0] source, collect_set(os)[0] os, collect_set(area)[0] area, collect_set(model)[0] model, collect_set(brand)[0] brand, collect_set(sdk_version)[0]sdk_version , collect_set(gmail)[0] gmail, collect_set(height_width)[0] height_width, collect_set(app_time)[0] app_time, collect_set(network)[0] network, collect_set(lng)[0] lng, collect_set(lat)[0] lat, DATE_FORMAT('2019-02-10','yyyy-MM') FROM dws_uv_detail_day WHERE DATE_FORMAT(dt,'yyyy-MM')=DATE_FORMAT('2019-02-10','yyyy-MM') GROUP BY mid_id;

    4 形成脚本

    #!/bin/bash DB=gmail if [ -n $1 ] ;then log_date=$1 else log_date=`date -d "-1 day" +%F` fi hql=" INSERT OVERWRITE TABLE "${DB}".dws_uv_detail_day PARTITION(dt='$log_date') ... " hive -e "${hql}"

    ADS层表设计及逻辑

    1 结果表

    -- 建表语句 DROP TABLE IF EXISTS ads_uv_count; CREATE EXTERNAL TABLE ads_uv_count( `dt` STRING COMMENT '统计日期', `day_count` BIGINT COMMENT '当日用户数量', `week_count` BIGINT COMMENT '当周用户数量', `month_count` BIGINT COMMENT '当月用户数量', `is_weekend` BIGINT COMMENT 'Y,N是否是周末,用于得到本周最终结果', `is_monthend` BIGINT COMMENT 'Y,N是否是月末,用于得到本月最终结果', ) COMMENT '每日活跃用户数量' STORED AS PARQUET LOCATION '/warehouse/gmall/ads/ads_uv_count_day/' -- 实现逻辑 INSERT OVERWRITE TABLE ads_uv_count SELECT `2019-02-10` dt, daycount.ct, weekcount.ct, monthcount.ct, IF(DATE_ADD(NEXT_DAY('2019-02-10','MONTH'),-1)='2019-02-10','Y','N'), IF(LAST_DAY('2019-02-10')='2019-02-10','Y','N') FROM ( SELECT '2019-02-10' dt, COUNT(1) ct FROM dws_uv_detail_day WHERE dt='2019-02-10' )daycount JOIN ( SELECT '2019-02-10' dt, COUNT(1) ct FROM dws_uv_detail_week WHERE week_dt=CONCAT(DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7),'_',DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-1)) ) weekcount ON daycount.dt=weekcount.dt JOIN ( SELECT '2019-02-10' dt, COUNT(1) ct FROM dws_uv_detail_month WHERE month_dt=DATE_FORMAT('2019-02-10','yyyy-MM') )monthcount ON daycount.dt=monthcount.dt;

    2 形成脚本(与之前脚本类似,不再重复示例)

    需求二

    用户新增主题

    定义:首次联网使用应用的用户。如果一个用户首次打开某app,那这个用户定义为新增用户;卸载再安装的设备,不会被算作一次新增。新增用户包括日新增用户、周新增用户、月新增用户

    DWS层表及逻辑设计

    1 每日新增设备明细表(总设备表)

    -- 建表语句 DROP TABLE IF EXISTS `dws_new_mid_day`; CREATE TABLE `dws_new_mid_day` ( `mid_id` STRING COMMENT'设备唯一标识', `user_id` STRING COMMENT'用户标识', `version_code` STRING COMMENT'程序版本号', `version_name` STRING COMMENT'程序版本名', `lang` STRING COMMENT'系统语言', `source` STRING COMMENT'渠道号', `os` STRING COMMENT'安卓系统版本', `area` STRING COMMENT'区域', `model` STRING COMMENT'手机型号', `brand` STRING COMMENT'手机品牌', `sdk_version` STRING COMMENT'sdkVersion', `gmail` STRING COMMENT'gmail', `height_width` STRING COMMENT'屏幕宽高', `app_time` STRING COMMENT'客户端日志产生时的时间', `network` STRING COMMENT'网络模式', `lng` STRING COMMENT'经度', `lat` STRING COMMENT'纬度', `create_date` STRING COMMENT'创建时间' ) COOMENT'每日新增设备信息' PARTITIONED BY (`dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/dws/dws_new_mid_day' -- 逻辑实现 INSERT OVERWRITE TABLE dws_new_mid_day PARTITION(dt='2019-02-10') SELECT ud.mid_id, ud.user_id, ud.version_code, ud.version_name, ud.lang, ud.source, ud.os, ud.area, ud.model, ud.brand, ud.sdk_version, ud.gmail, ud.height_width, ud.app_time, ud.network, ud.lng, ud.lat, '2019-02-10' FROM dwd_start_log ud LEFT JOIN dws_new_mid_day nm ON ud.mid_id=nm.mid_id WHERE ud.dt='2019-02-10' AND nm.mid_id IS NULL; -- 个人认为,这个逻辑,虽然计算正确,但是并不是很合理

    ADS层表及逻辑设计

    1 每日新增设备报表

    -- 建表语句 DROP TABLE IF EXISTS `ads_new_mid_count`; CREATE TABLE `ads_new_mid_count` ( `create_date` STRING COMMENT'创建时间', `new_mid_count` BIGINT COMMENT'新增设备数量' ) COOMENT'每日新增设备量' PARTITIONED BY (`dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/ads/ads_new_mid_count' -- 逻辑实现 INSERT OVERWRITE TABLE ads_new_mid_count SELECT create_date, count(1) FROM dws_new_mid_day WHERE create_date='2019-02-10' GROUP BY create_Date;

    需求三

    用户留存

    定义:

    留存用户:某段时间内的新增用户(活跃用户),经过一段时间后,又继续使用应用的被认作是留存用户留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率

    例:2月10日新增用户100,这100人在2月11日启动过应用的有30人,2月12日启动过应用的有25人,2月13日启动过应用的有32人,则2月10日新增用户次日留存率是30/100=30%,两日留存率是25/100=25%,三日留存率是32/100=32%

    需求:每天计算前1、2、3、4、7、14天的留存率

    需求分析:假设今天是11日,要统计前1天也就是10日新增设备的留存率,则计算方式为10日的新增设备且11日活跃的/10日的新增设备,10日的新增设备且11日活跃可使用10日新增设备明细与11日活跃设备关联,得到留存用户,10日新增设备可有每日新增设备明细表直接取出(根据分区)

    DWS层表及逻辑设计

    1 每日用户留存明细

    -- 建表语句 DROP TABLE IF EXISTS `dws_user_retention_day`; CREATE TABLE `dws_user_retention_day` ( `mid_id` STRING COMMENT'设备唯一标识', `user_id` STRING COMMENT'用户标识', `version_code` STRING COMMENT'程序版本号', `version_name` STRING COMMENT'程序版本名', `lang` STRING COMMENT'系统语言', `source` STRING COMMENT'渠道号', `os` STRING COMMENT'安卓系统版本', `area` STRING COMMENT'区域', `model` STRING COMMENT'手机型号', `brand` STRING COMMENT'手机品牌', `sdk_version` STRING COMMENT'sdkVersion', `gmail` STRING COMMENT'gmail', `height_width` STRING COMMENT'屏幕宽高', `app_time` STRING COMMENT'客户端日志产生时的时间', `network` STRING COMMENT'网络模式', `lng` STRING COMMENT'经度', `lat` STRING COMMENT'纬度', `create_date` STRING COMMENT'设备新增时间', `retention_day` INT COMMENT'截止当前日期留存天数' ) COOMENT'每日用户留存情况' PARTITIONED BY (`dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/dws/dws_user_retention_day' -- 逻辑实现 INSERT OVERWRITE TABLE dws_user_retention_day PARTITION(dt='2019-02-11') SELECT nm.mid_id, nm.user_id, nm.version_code, nm.version_name, nm.lang, nm.source, nm.os, nm.area, nm.model, nm.brand, nm.sdk_version, nm.gmail, nm.height_width, nm.app_time, nm.network, nm.lng, nm.lat, nm.create_date, 1 retention_day FROM dws_uv_detail_day ud LEFT JOIN dws_new_mid_day nm ON ud.mid_id=nm.mid_id WHERE ud.dt='2019-02-11' AND nm.create_date=DATE_ADD('2019-02-11',-1); UNION ALL SELECT nm.mid_id, nm.user_id, nm.version_code, nm.version_name, nm.lang, nm.source, nm.os, nm.area, nm.model, nm.brand, nm.sdk_version, nm.gmail, nm.height_width, nm.app_time, nm.network, nm.lng, nm.lat, nm.create_date, 2 retention_day FROM dws_uv_detail_day ud LEFT JOIN dws_new_mid_day nm ON ud.mid_id=nm.mid_id WHERE ud.dt='2019-02-11' AND nm.create_date=DATE_ADD('2019-02-11',-2) ...;

    ADS层表及逻辑设计

    1 留存用户数

    -- 建表语句 DROP TABLE IF EXISTS `ads_user_retention_day_count`; CREATE TABLE `ads_user_retention_day_count` ( `create_date` STRING COMMENT'设备新增日期', `retention_day` INT COMMENT'截止当前日期留存天数' `retention_count` BIGINT COMMENT'留存数量' ) COOMENT'每日新增设备量' PARTITIONED BY (`dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/ads/ads_user_retention_day_count' -- 逻辑实现 INSERT INTO TABLE ads_user_retention_day_count SELECT create_date, retention_day, count(1) retention_count FROM dws_new_mid_day WHERE create_date='2019-02-11' GROUP BY create_date, retention_day;

    2 留存用户比率

    -- 建表语句 DROP TABLE IF EXISTS `ads_user_retention_day_rate`; CREATE TABLE `ads_user_retention_day_rate` ( `stat_date` STRING COMMENT'统计日期', `create_date` STRING COMMENT'设备新增日期', `retention_day` INT COMMENT'截止当前日期留存天数', `retention_count` BIGINT COMMENT'留存数量', `new_mid_count` INT COMMENT'当日设备新增数量', `retention_ratio` INT COMMENT'留存率', ) COOMENT'每日用户留存情况' PARTITIONED BY (`dt` STRING) STORED AS PARQUET LOCATION '/warehouse/gmall/ads/ads_user_retention_day_rate' -- 逻辑实现 INSERT INTO TABLE ads_user_retention_day_rate SELECT '2019-02-11', ur.create_date, ur.retention_day, ur.retention_count, nc.new_mid_count, ur.retention_count/nv.new_mid_count*100 FROM ( SELECT create_date, retention_day, count(1) retention_count FROM `dws_user_retention_day` WHERE dt='2019-02-11' GROUP BY create_Date, retention_day ) ur JOIN ads_new_mid_count nc ON nv.create_date=ur.create_date

     

    Processed: 0.010, SQL: 12