Hive拉链表实现

拉链表测试:

有如下测试数据

--2019/12/1号订单的全量数据
id    status    create_time    operation_time
1    待支付    2019-12-01    
2    待支付    2019-12-01    
3    已支付    2019-12-01    

--2019/12/2号订单的全量数据
id    status    create_time    operation_time
1    待支付    2019-12-01    
2    已支付    2019-12-01    2019-12-02
3    已支付    2019-12-01    
4    待支付    2019-12-02    
5    已支付    2019-12-02
--创建订单表
drop table if exists order_info;
create table order_info(
id int, 
status string,
create_time string,
operate_time string
)
partitioned by(dt string)
row format
delimited fields terminated by ‘\t‘;
--加载12/1的数据
load data local inpath "/opt/data/order_info1" into table chain.order_info partition(dt=‘2019-12-01‘);

--加载12/2的数据
load data local inpath "/opt/data/order_info2" into table chain.order_info partition(dt=‘2019-12-02‘);
--创建拉链表
drop table if exists order_info_chain;
create table order_info_chain(
id int, 
status string,
create_time string,
operate_time string,
start_time string,
end_time string
)
row format 
delimited fields terminated by ‘\t‘;

--初始化拉链表,加载12/1号的数据
insert overwrite table order_info_chain
select 
id,
status,
create_time,
operate_time,
‘2019-12-01‘,
‘9999-99-99‘
from order_info
where dt=‘2019-12-01‘;
--订单变化表
create table order_change(
id string,
status string,
create_time string,
operate_time string
) partitioned by (dt string);

--灌入12/2变化和新增的数据,通过create_time,create_time判断
insert overwrite table order_change partition(dt=‘2019-12-02‘)
select 
id,
status,
create_time,
operate_time
from order_info where create_time=‘2019-12-02‘ or operate_time=‘2019-12-02‘;
--创建临时拉链表
create table tmp_chain(
id string,
status string,
create_time string COMMENT ‘创建时间‘,
operate_time string COMMENT ‘修改时间‘,
start_time string COMMENT ‘有效开始时间‘,
end_time string COMMENT ‘有效结束时间‘
);
--向临时拉链表导入数据
insert overwrite table tmp_chain
select * from
(
    select
        id,
        status,
        create_time,
        operate_time,
        ‘2019-12-02‘ as start_time,
        ‘9999-99-99‘ as end_time
    from order_change where dt=‘2019-12-02‘
    
    union all
    
    --修改发生过更新的记录的end_time为前一天
    select 
        orch.id,
        orch.status,
        orch.create_time,
        orch.operate_time,
        orch.start_time,
        if(ch.id is null, orch.end_time, date_add(ch.dt,-1)) as end_time
    from order_info_chain orch
    left join
        (select * from order_change where dt=‘2019-12-02‘) ch
    on orch.id = ch.id and orch.end_time=‘9999-99-99‘
) t 
order by t.id,t.start_time;
--把临时表覆盖给拉链表
insert overwrite table order_info_chain
select * from tmp_chain;

相关推荐