1. 数据来源:瓜子二手车爬虫(数据部分截图如下)
数据集字段含义:
Brand:品牌Name:型号Boarding_time:上牌时间Km:里程数Discharge:排放标准Sec_price:二手车价格New_price:新车价格2、缺失值的查找和处理
SELECT count(*) FROM used_car_analysis WHERE Brand IS NULL OR Name IS NULL OR Boarding_time IS NULL OR Km IS NULL OR Discharge IS NULL OR Sec_price IS NULL OR New_price IS NULL;3、数据类型不一致的处理
3、1 Brand
查看“品牌”列,无异常值。合计104种品牌。
SELECT Brand, count(*) from used_car_analysis group by Brand; SELECT count(*) from (SELECT Brand, count(*) from used_car_analysis group by Brand) c;3、2 Name
“型号”列比较复杂,等下会进行特征提取。
3、3 Bording_time
“上牌时间”中有93个“暂无”数据。全部修改为当前年份和月份。
SELECT Boarding_time, count(*) FROM used_car_analysis WHERE Boarding_time NOT LIKE "%年%月"; UPDATE used_car_analysis SET Boarding_time = "2020年6月" WHERE Boarding_time = "未上牌";3、4 Km
对于“里程数”列,存在部分数据为“百公里内”。将这些数据全部替换为0.01(即0.01万公里);其他数据删除单位“万公里”
SELECT * FROM used_car_analysis WHERE Km NOT LIKE "%万%" GROUP BY Km; UPDATE used_car_analysis a INNER JOIN ( SELECT NAME, ( CASE WHEN Km LIKE "%万公里" THEN LEFT (Km, LOCATE("万", Km) - 1) ELSE 0.01 END ) Km FROM used_car_analysis ) AS b ON a. NAME = b. NAME SET a.Km = b.Km;
3、5 Discharge
对此列不做处理。本次分析不会用到这一列。
3、6 Sec_price
查看是否有0。结果:无。
3、7 New_price
查看是否有不同于其他数据的数据。
SELECT * FROM used_car_analysis WHERE New_price NOT LIKE "%万"part 2. 特征处理
1、车龄
根据Boarding_time,抽取上牌年份和月份,计算出距离当前时间的月份数,即车龄。
ALTER TABLE used_car_analysis ADD COLUMN Boarding_year INT UPDATE used_car_analysis a INNER JOIN ( SELECT NAME, LEFT ( Boarding_time, LOCATE("年", Boarding_time) - 1 ) Boarding_year FROM used_car_analysis ) AS b ON a. NAME = b. NAME SET a.Boarding_year = b.Boarding_year #提取上牌年份 ALTER TABLE used_car_analysis ADD COLUMN Boarding_month INT UPDATE used_car_analysis a INNER JOIN ( SELECT NAME, SUBSTRING( Boarding_time FROM LOCATE("年", Boarding_time) + 1 FOR ( LOCATE("月", Boarding_time) - 1 - LOCATE("年", Boarding_time) ) ) Boarding_month FROM used_car_analysis ) AS b ON a. NAME = b. NAME SET a.Boarding_month = b.Boarding_month #提取上牌月份 ALTER TABLE used_car_analysis ADD COLUMN time INT UPDATE used_car_analysis a INNER JOIN ( SELECT NAME, ( ( YEAR (CURDATE()) - Boarding_year ) * 12 + MONTH (CURDATE()) - Boarding_month ) time FROM used_car_analysis ) AS b ON a. NAME = b. NAME SET a.time = b.time #计算车龄 ALTER TABLE used_car_analysis DROP COLUMN Boarding_month ALTER TABLE used_car_analysis DROP COLUMN Boarding_year #删除无用的列2、折旧率(年限平均折旧法)
ALTER TABLE used_car_analysis ADD COLUMN Depreciation FLOAT; UPDATE used_car_analysis a INNER JOIN ( SELECT Name, (case WHEN time !=0 then ((New_price - Sec_price)*12/time)/New_price when time = 0 then ((New_price - Sec_price)/(time+1))/New_price end) Depreciation from used_car_analysis ) AS b ON a. NAME = b. NAME SET a.Depreciation= b.Depreciation ; 3、折旧率(按照行驶里程数折旧) ALTER TABLE used_car_analysis ADD COLUMN Dep_Distance FLOAT UPDATE used_car_analysis a INNER JOIN ( SELECT Name, (case WHEN Km !=0 then ((New_price - Sec_price)*10000/Km)/New_price ELSE 0 end) Dep_Distance from used_car_analysis ) AS b ON a. NAME = b. NAME SET a.Dep_Distance = b.Dep_Distance4、款式
从Name中提取二手车的款式。
select count(*) from used_car_analysis WHERE Name like "%款%"; #查看是否每个Name中是否都含有“款”字,为之后的款式字符抽取做准备。(结果表明每列都包含有“款”字) ALTER TABLE used_car_analysis ADD COLUMN Style INT; UPDATE used_car_analysis a INNER JOIN ( SELECT Name,right(style,4) Style from (SELECT Name,SUBSTRING_INDEX(Name,'款',1) style FROM used_car_analysis) c ) AS b ON a. NAME = b. NAME SET a.Style = b.Style ; #抽取每款车的款式5、型号
从Name中提取二手车的型号。
ALTER TABLE used_car_analysis ADD COLUMN model CHAR(255); UPDATE used_car_analysis a INNER JOIN ( SELECT NAME, SUBSTRING_INDEX(model, "20", 1) model FROM used_car_analysis ) AS b ON a. NAME = b. NAME SET a.model = b.model; #提取每款车的型号。 UPDATE used_car_analysis a INNER JOIN ( SELECT NAME, SUBSTRING_INDEX(model, " ", 1) model FROM ( SELECT NAME, Style, model FROM used_car_analysis WHERE Style NOT LIKE "20%" ) c ) AS b ON a. NAME = b. NAME SET a.model = b.model; #对于2000年以前的型号,另做更新。 (为方便“二八”法则的分析,特提取下列数据信息) 6、各品牌销售量累计求和 #对各品牌的销量累计求和 SET @csum := 0; SELECT Brand, sale, (@csum := @csum + sale) AS cum_sale FROM ( SELECT Brand, count(*) sale FROM used_car_analysis GROUP BY Brand ORDER BY sale DESC ) c;7、各品牌销售额累计求和
SET @csum := 0; SELECT Brand, sale_money, (@csum := @csum + sale_money) cum_sale_money FROM ( select Brand, sum(if(Brand = Brand,Sec_price,0)) sale_money from used_car_analysis group by Brand ORDER by sale_money DESC ) c;