【项目淘宝用户行为数据分析（人货场模型/SQL+Tableau）

数据运维 2023-12-30 张二河手机阅读

数据来源

淘宝用户购物行为数据集_数据集-阿里云天池 (aliyun.com)

本数据集包含了2017年11月25日至2017年12月3日之间，有行为的约一百万随机用户的所有行为（行为包括点击、购买、加购、喜欢），即数据集的每一行表示一条用户行为，由用户ID、商品ID、商品类目ID、行为类型和时间戳组成，并以逗号分隔。

背景与目的

互联网的增长时代已经过去，进入后流量时代，电商企业的首要目标已经不是拓宽用户池，而是对现有用户做更加精细化的运营，对存量用户做深挖，用以维持企业的生命力。
而精细化运营不仅仅是对某一个点做细化，而是围绕着整个业务场景，从用户到商品到平台，逐步改善用户的使用场景和使用体验，用细节决定成败。

数据分析

1. 分析思路

2. 数据清洗

2.1 去除null

SELECT COUNT(user_id),COUNT(item_id),COUNT(category_id),COUNT(behavior_type),COUNT(timestamp)
FROM userbehavior

【项目淘宝用户行为数据分析（人货场模型/SQL+Tableau）-每日运维
由结果看到没有null值

2.2 去除重复值

根据数据来源可知这是100万用户产生的1亿条行为数据，所以单看user_id或者item_id或者category_id时是一定存在重复值的，所以要结合着看

select user_id,item_id,timestamps
from userbehavior
group by user_id,item_id,timestamps
having count(*) > 1;

# 根据user_id,item_id,timestamps分组，若组内由数据个数大于1，则说明有重复值

查询结果性显示 53 条重复值，需要去重

alter table userbehavior add id int first;
alter table userbehavior MODIFY id int primary key auto_increment; # 将id 设置为自增主键

delete userbehavior from userbehavior,
(select user_id,item_id,timestamps,min(id) as id
from userbehavior
group by user_id,item_id,timestamps
HAVING count(*) > 1
) b
where userbehavior.user_id = b.user_id
and userbehavior.item_id = b.item_id
and userbehavior.timestamps = b.timestamps
and userbehavior.id > b.id;

2.3 转换时间戳

# 将时间戳转为日期并新增date,time,hour字段

# 新增datetime列，将格式设置为timestamp（0）0代表就到秒，不要毫秒那些
alter table userbehavior add datetime timestamp(0);
update userbehavior set datetime = FROM_UNIXTIME(timestamps)

# 增加date,time,hour列
alter table userbehavior 
add date char(10),
add time char(8),
add hour char(2);

# 更新数据(空格也算一个字符)
update userbehavior set 
date = SUBSTRING(datetime,1,10),
time = SUBSTRING(datetime,12,8), 
hour = SUBSTRING(datetime,12,2);

# 注意添加多个列的写法是 XX add XX ，add XX ，add XX（每个都要写关键字）
# 更新表数据多个列的写法是 update XX set XX = XX ，XX = XX，XX = XX（只需写一个关键字）

2.4 去除异常值

# 本数据中，异常值几乎只出现在时间戳字段内，所以对date字段操作
select max(datetime),min(datetime) from userbehavior;

在数据说明中指出，数据时间范围在2017-11-25 到 2017-12-3，明显看出有异常数据，需要过滤

delete from userbehavior 
where datetime < "2017-11-25 00:00:00"
or datetime > "2017-12-04 00:00:00";

3. 人货场模型指标构建

3.1 人（用户）

3.1.1 流量分析（PV、UV、PV/UV）

# 创建一个pv、uv的数据表
create table pv_uv_pu(
date char(10),
pv int,
uv int,
pu decimal(10,1)
);

# 插入数据
insert into pv_uv_pu 
select date,count(behavior_type) as pv,
count( distinct user_id) as uv,
round(count(behavior_type) / count(distinct user_id),1) as "pv/uv"
from userbehavior
where behavior_type = "pv"
group by date;

3.1.2 留存分析（次日、三日、七日）

create table retention(
date date,
uv int,
remain1 decimal(10,2),
remain3 decimal(10,2),
remain7 decimal(10,2)
);

insert into retention 
select t1.date,
count(distinct t1.user_id) as "当日活跃用户数",
round(count(case when datediff(t2.date,t1.date) = 1 then t2.user_id else null end ) / count(distinct t1.user_id),2) as "次留率",
round(count(case when datediff(t2.date,t1.date) = 3 then t2.user_id else null end )/ count(distinct t1.user_id),2) as "3日留存率",
round(count(case when datediff(t2.date,t1.date) = 7 then t2.user_id else null end ) / count(distinct t1.user_id),2) as "7日留存率"
from
(-- 1. 每个用户每日的访问数据表
select user_id,date
from userbehavior
group by user_id,date) t1 

left join 

(select user_id,date
from userbehavior
group by user_id,date) t2

on t1.user_id = t2.user_id
where t2.date > t1.date
group by t1.date;

3.1.3 行为路径分析

# 第一步，统计每个用户对每个商品的行为，并量化
create view user_behavior_step1_splite as
select user_id,item_id,
count(if(behavior_type = 'pv',user_id,null)) as pv,
count(if(behavior_type = 'fav',user_id,null)) as fav,
count(if(behavior_type = 'cart',user_id,null)) as cart,
count(if(behavior_type = 'buy',user_id,null)) as buy
from userbehavior
group by user_id,item_id
order by user_id,item_id

# 第二步，对各个行为标准化，比如某人对某商品的浏览很多次，pv大于1，在这里转换成1，因为本次分析并不需要具体pv的数量，只要为1就算浏览过
create view user_behavior_step2_standerd as
select user_id,item_id,
if(pv >= 1 ,1,0) as pv,
if(fav >= 1 ,1,0) as fav,
if(cart >= 1 ,1,0) as cart,
if(buy >= 1 ,1,0) as buy
from user_behavior_step1_splite;

# 第三步，合并行为路径
create view user_behavior_step3_add as 
select user_id,item_id,
concat(pv,fav,cart,buy) as "行为路径"
from user_behavior_step2_standerd;

# 第四步，统计各行为路径的数量
select 行为路径,count(行为路径) as "数量"
from user_behavior_step3_add
group by 行为路径

最终结果为：四位数按顺序为浏览-收藏-加购-购买

为了更清晰的理解表中信息，对这个表进行描述扩充

# 创建行为路径描述表并插入数据
create table behavior_path_description(
path_type char(4),
description varchar(50)
);
insert into behavior_path_description values 
('0001','仅购买'),
('0010','仅加购'),
('0100','仅收藏'),
('1000','仅浏览'),
('1100','浏览收藏'),
('1010','浏览加购'),
('1001','浏览购买'),
('0110','收藏加购'),
('0101','收藏购买'),
('0011','加购购买'),
('1110','浏览收藏加购'),
('1101','浏览收藏购买'),
('1011','浏览加购购买'),
('0111','收藏加购购买'),
('1111','浏览收藏加购购买');

# 创建行为路径表并插入数据
create table behavior_path_num (
path_type char(4),
num int
);
insert into behavior_path_num select `行为路径`,数量 from user_brhavior_path;
-- user_brhavior_path是之前行为路径数量生成的视图，就是上面的那张图

# 将两表合并并创建一个新表
create table behavior_path_num (
path_type char(4),
num int
);
insert into behavior_path_num select `行为路径`,数量 from user_brhavior_path;

create table behavior_path(
path_type char(4),
description varchar(50),
num int
);
insert into behavior_path (select behavior_path_num.path_type,description,num from behavior_path_num left join behavior_path_description on behavior_path_num.path_type =behavior_path_description.path_type
);

--

behavior_path如下图所示：

3.1.4 用户转化率分析

根据之前得到的behavior_path表，计算每个环节的转化率

收藏加购转化率过低，猜测是否热销品牌与平台推荐平台不符

3.1.5 用户价值分析（RFM）

R：最近一次消费
F：消费频率
M：消费金额

数据中没有金额数据，只能用R值和F值给用户分类

# 
create table rfm_model(
user_id varchar(20),
R_value int,
F_value int
);

# step 2：根据得到的最近消费时间和消费次数进行打分
insert into rfm_model select user_id,
(case when datediff('2017-12-03',最近消费时间) < 1 then 5
			 when datediff('2017-12-03',最近消费时间) <= 3 then 4
			 when datediff('2017-12-03',最近消费时间) <= 5 then 3
			 when datediff('2017-12-03',最近消费时间) <= 7 then 2
			 when datediff('2017-12-03',最近消费时间) > 7 then 1 
			 end) as R_value,
(case when 消费次数 <= 5 then 1 
			when 消费次数 <= 10 then 2
			when 消费次数 <= 15 then 3
			when 消费次数 <= 20 then 4
			else 5
			end) as F_value
from
# step 1： 得到每个用户的最近消费时间和消费次数
(select user_id,max(date)as '最近消费时间',count(user_id) as '消费次数'
from userbehavior
where behavior_type = 'buy'
group by user_id) a;

通过观察step1得到的数据，时间跨度仅为8天，所以r值的打分规则跨度需要小一点，且观察到只有少部分用户的消费频次超过20，大部分集中在小于5的区间内，所以打分规则指定的区间也比较小,得到rf的分数表之后，需要计算r和f的均值，进行用户分类

# 定义r和f的均值并计算出来
set @r_avg = null
set @f_avg = null
select avg(R_value) into @r_avg from rfm_model;
select avg(F_value) into @f_avg from rfm_model;

# 新增一列并插入数据
alter table rfm_model add user_type varchar(10);
update rfm_model set user_type = 
case when R_value >= @r_avg and F_value >= @f_avg then "价值用户"
		 when R_value >= @r_avg and F_value < @f_avg then "发展用户"
		 when R_value < @r_avg and F_value >= @f_avg then "保持用户"
		 when R_value < @r_avg and F_value < @f_avg then "挽留用户"
		 end ;

rfm_model表的部分截图如下

对各类用户进行计数统计

select user_type,count(*) as num
from rfm_model
group by user_type
order by num

结果如下

3.2 货（商品）

3.2.1 帕累托分析

create view pareto_item_buy as 
-- 商品购买量（帕累托分析）
select 下单次数,count(item_id) as 商品种类数
from
# 按照下单次数进行分组
(select item_id,count(behavior_type) as 下单次数
from userbehavior
where behavior_type = 'buy'
group by item_id 
order by 下单次数 desc) a
group by 下单次数
order by 下单次数 desc

【项目淘宝用户行为数据分析（人货场模型/SQL+Tableau）-每日运维
结果共314条记录

根据图像可知，1%的产品贡献了80%的购买量，头部效应明显。

3.2.2 转化率分析

select item_id,
count(if(behavior_type = 'pv',user_id,null)) as pv,
count(if(behavior_type = 'fav',user_id,null)) as fav,
count(if(behavior_type = 'cart',user_id,null)) as cart,
count(if(behavior_type = 'buy',user_id,null)) as buy,
count(distinct if(behavior_type = 'buy',user_id,null)) / count(distinct user_id) as 商品转化率
from userbehavior
group by item_id
order by 商品转化率

3.2.3 四象限分析

create view quadrant as 
select item_id,
sum(case when behavior_type = 'pv' then 1 else 0 end) as 浏览量,
sum(case when behavior_type = 'buy' then 1 else 0 end) as 购买量
from userbehavior
group by item_id
order by 购买量 desc

3.3 场（平台）

3.3.1 跳出率

跳出率:用户从入口进入且只访问了单个页面的访问量/ 网站全体页面总访问量

用于衡量网站性能，跳出率高说明用户体验做的不好，用户刚进来就出去了，网站不能提供用户一眼即达的清晰度，衡量用户体验的指标。

select CONCAT(round(count(user_id) / (select count(user_id) from temp_behavior where behavior_type = 'pv'),2) * 100,'%') as 网站跳出率
from 
(select user_id,count(user_id) as 访问次数
from userbehavior
where behavior_type = 'pv' 
group by user_id) a

4.问题驱动分析导向

根据上文进行的分析，提出如下业务问题：

网站的每日流量趋势如何？

用户的流失率如何，如何减小每环节的流失率？

热销商品和推送产品是否匹配？
热销商品前十：

将用户按照模型进行分类，如何精细化运营？

结论：

双十一活动期间，客流量一直维持在较高水准，尤其12月的2、3两天，用户的留存率也较高。

根据分析用户行为路径，得出各环节转化率，可以看出，浏览-收藏/加购环节转化率仅为5%，用户流失严重。据此产生两个猜想：

1.平台体验不好

2.商品推送不精准

根据猜想1，计算出跳出率，为1%，说明不是平台问题

根据猜想2，分别分析热销前十的商品和浏览量较高的产品，发现它们之间大部分并不重合，说明平台推送的商品并不是用户想要购买的产品。

后续平台可以适当调整推送产品的种类，提高转化率。
例如可以用发放优惠卷或红包等福利提高浏览-收藏/加购环节的转化率

根据RFM模型对用户进行分类研究，发现发展用户和挽留用户占比较高，说明大部分用户的消费频率较低，后续应该想办法提高用户消费频率，调整推荐策略，以及多联系挽留用户，防止其流失。