import pandas
as pd
import numpy
as np
import time
from datetime
import datetime
from geopy
.distance
import geodesic
from math
import radians
, cos
, sin
, asin
, sqrt
一、端午节的淘宝粽子交易
问题
(1) 请删除最后一列为缺失值的行,并求所有在杭州发货的商品单价均值。
df_1
= pd
.read_csv
('data/rice_pudding.csv')
df_1
.head
()
标题价格付款人数店铺发货地址
0五芳斋粽子礼盒 心悦+18只装咸鸭蛋组合端午节礼品团购嘉兴肉粽子1296人付款五芳斋官方旗舰店浙江 嘉兴1北京稻香村端午粽子手工豆沙粽220g*2袋散装豆沙粽香甜软糯豆沙粽448人付款天猫超市上海2五芳斋粽子礼盒装鲜肉咸蛋黄大肉粽嘉兴豆沙甜粽端午团购散装礼品89.9100万+人付款五芳斋官方旗舰店浙江 嘉兴3稻香私房鲜肉粽蛋黄肉粽嘉兴粽子咸鸭蛋礼盒装端午节送礼特产团购1381936人付款稻香村食品旗舰店北京4嘉兴粽子 蛋黄鲜肉粽新鲜大肉粽早餐散装团购浙江特产蜜枣多口味3.89500+人付款城城喂食猫浙江 嘉兴
df_1
.info
()
<class 'pandas.core.frame.DataFrame'>
RangeIndex
: 4403 entries
, 0 to
4402
Data columns
(total
5 columns
):
--- ------ -------------- -----
0 标题
4403 non
-null
object
1 价格
4403 non
-null
object
2 付款人数
4332 non
-null
object
3 店铺
4403 non
-null
object
4 发货地址
4400 non
-null
object
dtypes
: object(5)
memory usage
: 172.1+ KB
通过info看到columns中存在空格
df_1
.columns
= df_1
.columns
.str.strip
()
df_1
.columns
删除最后一列为缺失值的行
df_result_1
= df_1
.copy
()
df_result_1
.dropna
(axis
=0, subset
=['发货地址'], inplace
=True)
df_result_1
标题价格付款人数店铺发货地址
0五芳斋粽子礼盒 心悦+18只装咸鸭蛋组合端午节礼品团购嘉兴肉粽子1296人付款五芳斋官方旗舰店浙江 嘉兴1北京稻香村端午粽子手工豆沙粽220g*2袋散装豆沙粽香甜软糯豆沙粽448人付款天猫超市上海2五芳斋粽子礼盒装鲜肉咸蛋黄大肉粽嘉兴豆沙甜粽端午团购散装礼品89.9100万+人付款五芳斋官方旗舰店浙江 嘉兴3稻香私房鲜肉粽蛋黄肉粽嘉兴粽子咸鸭蛋礼盒装端午节送礼特产团购1381936人付款稻香村食品旗舰店北京4嘉兴粽子 蛋黄鲜肉粽新鲜大肉粽早餐散装团购浙江特产蜜枣多口味3.89500+人付款城城喂食猫浙江 嘉兴..................4398红船100克*2只五谷杂粮粽 浙江特产嘉兴粽子 团购粽子嘉兴端午节7.9347人付款红船旗舰店浙江 嘉兴4399隐栗古法烧肉粽/咸蛋黄/豆沙板栗端午粽子三种口味6只家庭礼盒装9380人付款峨眉山隐栗粑粑罗栗四川 乐山4400五芳斋 佛系粽子全素口味礼盒装桂花八宝粽豆沙甜粽端午礼品团购98.25人付款北京美程嘉译北京4401【粉丝专享】端午节特产新鲜蛋黄鲜肉粽豆沙嘉兴粽子800g/箱65.9149人付款chaoge1931安徽 合肥4402粽子端午节送礼盒包装1200g双蛋黄广东东莞大个道滘裹蒸棕鲜肉粽9920人付款hezhiqi广东 东莞
4400 rows × 5 columns
df_result_1
['发货地址'].str.contains
('杭州')
0 False
1 False
2 False
3 False
4 False
...
4398 False
4399 False
4400 False
4401 False
4402 False
Name
: 发货地址
, Length
: 4400, dtype
: bool
df_result_1
['价格'][~(df_result_1
['价格'].str.replace
(r
'-?\d+\.?\d+','True')=='True')]
538 45.9.
1353 5
2183 5
2537 4
2813 9
3127 4
3166 9
3174 6
3284 4
3390 5
3506 5
3637 4
3697 5
3841 4
3989 4
4000 4
4062 4
4124 5
4146 9
4329 5
4376 45_
Name
: 价格
, dtype
: object
df_result_1
.loc
[[538, 4376], '价格'] = [45.9, 45]
df_result_1
['价格'][df_result_1
['发货地址'].str.contains
('杭州')].astype
('float').mean
()
均值
80.90088888888877
(2) 商品标题带有“嘉兴”但发货地却不在嘉兴的商品有多少条记录?
df_result_2
= df_1
.copy
()
df_result_2
.dropna
(axis
=0, subset
=['发货地址'], inplace
=True)
df_result_2
[(df_result_2
['标题'].str.contains
('嘉兴'))&(~df_result_2
['发货地址'].str.contains
('嘉兴'))].shape
[0]
记录数目
1032
(3) 请按照分位数将价格分为“高、较高、中、较低、低”5 个类别,再将类别结果插入到标题一列之后,最后对类别列进行降序排序。
df_result_3
= df_1
.copy
()
df_result_3
.loc
[[538, 4376], '价格'] = [45.9, 45]
df_result_3
['价格'] = df_result_3
['价格'].astype
('float')
df_result_3
['价格'].quantile
([0, 0.2, 0.4, 0.6, 0.8, 1.0])
0.0 2.80
0.2 22.80
0.4 39.80
0.6 60.00
0.8 106.12
1.0 1780.00
Name
: 价格
, dtype
: float64
df_result_3
['价格'].describe
([0, 0.2, 0.4, 0.6, 0.8, 1.0])
分位数情况
count
4403.000000
mean
73.635199
std
79.831288
min 2.800000
0% 2.800000
20% 22.800000
40% 39.800000
50% 49.900000
60% 60.000000
80% 106.120000
100% 1780.000000
max 1780.000000
Name
: 价格
, dtype
: float64
根据分位数划分cut方法
df_result_3
['类别'] = pd
.cut
(df_result_3
['价格'], [-1e-10, 22.8, 39.8, 60.0, 106.12, 1780.0], labels
=['低','较低','中','较高','高'])
pd
.value_counts
(df_result_3
['类别'])
低
894
较低
890
高
881
较高
876
中
862
Name
: 类别
, dtype
: int64
根据分位数划分qcut方法(百度)
df_result_3
['类别'] = pd
.qcut
(df_result_3
['价格'], [0, 0.2, 0.4, 0.6, 0.8, 1.0], labels
=['低','较低','中','较高','高'])
pd
.value_counts
(df_result_3
['类别'])
低
894
较低
890
高
881
较高
876
中
862
Name
: 类别
, dtype
: int64
排序
df_result_3
= df_result_3
.reindex
(columns
=['标题', '类别','价格','付款人数','店铺','发货地址']).sort_values
('类别', ascending
=False)
df_result_3
.head
()
标题类别价格付款人数店铺发货地址
0五芳斋粽子礼盒 心悦+18只装咸鸭蛋组合端午节礼品团购嘉兴肉粽子高129.06人付款五芳斋官方旗舰店浙江 嘉兴847真真老老蛋黄肉粽140g*60个新鲜大肉粽冷冻方便早餐嘉兴粽子端午高168.0139人付款tb6182109_2012上海2773山西双合成粽子端午节送礼御粽膳坊礼盒粽叶板栗豆沙黄米杂粮粽高129.033人付款双合成旗舰店山西 晋中2768采芝斋粽子甜粽新鲜蛋黄大肉粽嘉兴口味鲜肉粽子食品端午礼盒定制高216.047人付款采芝斋杭州专卖店浙江 杭州2763真真老老臻瑞礼盒1.12kg/盒端午节粽子礼盒装送130g经典鲜肉粽高126.02人付款天猫超市上海
(4) 付款人数一栏有缺失值吗?若有则请利用上一问的分类结果对这些缺失值进行合理估计并填充。
df_result_4
= df_result_3
.copy
()
df_result_4
[df_result_4
['付款人数'].isna
()]
标题类别价格付款人数店铺发货地址
595稻香村粽子礼盒京彩缤纷1670g粽子松仁肉肚鸭蛋 甜粽肉粽多口味高189.00NaN天猫会员店上海1074三全龙舟粽礼盒五月尚品礼盒1800g北方端午粽子礼盒高129.00NaN天猫会员店上海386五芳斋华礼竹篮礼盒1360g蛋粽组合端午礼品嘉兴粽子礼盒高129.00NaN天猫会员店上海491久知味双层竹篮粽子酱鸭礼盒1560克 蛋黄粽粽子礼盒装 端午节送礼较高89.00NaN天猫会员店上海438真真老老嘉情礼盒10粽6蛋1.52kg/盒嘉兴粽子端午粽子礼盒较高69.90NaN天猫会员店上海.....................756五芳斋嘉兴粽子真空散装大肉粽140g*2只端午特产早餐速食低18.60NaN天猫会员店上海1813真真老老 嘉兴 栗子肉粽 130g 粽子 真空 端午节低4.50NaN天猫会员店上海1812真真老老 嘉兴 豆沙粽子 130g 真空 粽子 端午节低3.90NaN天猫会员店上海2130三珍斋蛋黄粽200g真空100g*2只嘉兴特产粽子速食早餐低7.88NaN天猫会员店上海2131俏香阁经典鲜肉粽子120g*2只端午节嘉兴特产速食早餐点心低5.90NaN天猫会员店上海
71 rows × 6 columns
替换过程
df_result_4
['付款人数'] = df_result_4
['付款人数'].str.replace
(r
'(?P<num>\d+\.?\d?)(?P<s>\w)?\+?\w*', lambda x
:str(float(x
.group
('num'))*10000) if x
.group
('s')=='万' else x
.group
('num'))
df_result_4
['付款人数'] = df_result_4
['付款人数'].astype
('float')
df_result_4
.head
()
标题类别价格付款人数店铺发货地址
0五芳斋粽子礼盒 心悦+18只装咸鸭蛋组合端午节礼品团购嘉兴肉粽子高129.06.0五芳斋官方旗舰店浙江 嘉兴847真真老老蛋黄肉粽140g*60个新鲜大肉粽冷冻方便早餐嘉兴粽子端午高168.0139.0tb6182109_2012上海2773山西双合成粽子端午节送礼御粽膳坊礼盒粽叶板栗豆沙黄米杂粮粽高129.033.0双合成旗舰店山西 晋中2768采芝斋粽子甜粽新鲜蛋黄大肉粽嘉兴口味鲜肉粽子食品端午礼盒定制高216.047.0采芝斋杭州专卖店浙江 杭州2763真真老老臻瑞礼盒1.12kg/盒端午节粽子礼盒装送130g经典鲜肉粽高126.02.0天猫超市上海
填值
df_result
= df_result_4
.groupby
('类别').apply(lambda x
:x
['付款人数'].fillna
(x
['付款人数'].mean
())).reset_index
(level
=0)
df_result
类别付款人数
3956低292.02705低91.02649低423.01064低368.0881低847.0.........88高746.04296高9.03603高17.061高3817.04294高18.0
4403 rows × 2 columns
检查是否存在NaN
df_result_4
= df_result
.join
(df_result_4
, rsuffix
='_').reindex
(columns
=['标题','类别','价格','付款人数','店铺','发货地址'])
df_result_4
[df_result_4
['付款人数'].isna
()==True]
标题类别价格付款人数店铺发货地址
df_result_4
.info
()
<class 'pandas.core.frame.DataFrame'>
Int64Index
: 4403 entries
, 3956 to
4294
Data columns
(total
6 columns
):
--- ------ -------------- -----
0 标题
4403 non
-null
object
1 类别
4403 non
-null category
2 价格
4403 non
-null float64
3 付款人数
4403 non
-null float64
4 店铺
4403 non
-null
object
5 发货地址
4400 non
-null
object
dtypes
: category
(1), float64
(2), object(3)
memory usage
: 370.9+ KB
(5) 请将数据后四列合并为如下格式的Series:商品发货地为××,店铺为××,共计×× 人付款,单价为××。
df_result_5
= df_result_4
.copy
()
df_result_5
['发货地址'].fillna
(df_result_5
['发货地址'].value_counts
().index
[0], inplace
=True)
df_result_5
['付款人数'] = df_result_5
['付款人数'].astype
('int')
df_result_5
.head
()
标题类别价格付款人数店铺发货地址
3956新鲜粽叶现摘大号粽子叶江西省农家野生干箬叶5斤特级广西大棕叶低11.36292毛丽兰87江西 吉安2705嘉兴特产粽子咸鸭蛋礼盒装新鲜大肉粽端午节礼品粽子礼盒团购送礼低18.9091qixiaoxia8811江苏 无锡2649粽叶干大号免邮广东省广西野生粑棕子叶新鲜包粽子叶芦苇叶大柊叶低9.90423秋与页广东 广州1064采芝斋 蛋黄肉粽200g真空包装嘉兴粽子百年工艺送礼佳品速食早餐低16.90368天猫超市上海881傅太粽子嘉兴风味新鲜杂粮散装甜糯米手工八宝粽礼袋包装素粽批发低22.80847傅太旗舰店江西 萍乡
df_result_5
[['价格', '付款人数']] = df_result_5
[['价格', '付款人数']].astype
('str')
df_result_5
.info
()
<class 'pandas.core.frame.DataFrame'>
Int64Index
: 4403 entries
, 3956 to
4294
Data columns
(total
6 columns
):
--- ------ -------------- -----
0 标题
4403 non
-null
object
1 类别
4403 non
-null category
2 价格
4403 non
-null
object
3 付款人数
4403 non
-null
object
4 店铺
4403 non
-null
object
5 发货地址
4403 non
-null
object
dtypes
: category
(1), object(5)
memory usage
: 210.9+ KB
df_new
= ('商品发货地为:'+ df_result_5
['发货地址'] + ',店铺为:'+df_result_5
['店铺'] +',共计'+ df_result_5
['付款人数'] + '人付款' + ',单价为:' + df_result_5
['价格']).to_frame
().rename
(columns
={0:'详情'})
df_new
= df_new
.reset_index
(drop
=True)
df_new
.head
()
详情
0商品发货地为:江西 吉安,店铺为:毛丽兰87,共计292人付款,单价为:11.361商品发货地为:江苏 无锡,店铺为:qixiaoxia8811,共计91人付款,单价为:18.92商品发货地为:广东 广州,店铺为:秋与页,共计423人付款,单价为:9.93商品发货地为:上海,店铺为:天猫超市,共计368人付款,单价为:16.94商品发货地为:江西 萍乡,店铺为:傅太旗舰店,共计847人付款,单价为:22.8
(6) 请将上一问中的结果恢复成原来的四列。
正则表达式
import re
pattern
= r
'商品发货地为:(?P<发货地址>[\w]+.*?[\w]+),店铺为:(?P<店铺>[\w]+),共计(?P<付款人数>[\S]+),单价为:(?P<价格>[\d]+\.*[\d]+)'
str_test
= '商品发货地为:安徽 芜湖,店铺为:皖南小灶,共计35人付款,单价为:19.8'
re
.search
(pattern
, str_test
).group
(0)
'商品发货地为:安徽 芜湖,店铺为:皖南小灶,共计35人付款,单价为:19.8'
df_result_6
= df_new
['详情'].str.extract
(r
'商品发货地为:(?P<发货地址>[\w]+.*?[\w]+),店铺为:(?P<店铺>[\w]+),共计(?P<付款人数>[\S]+),单价为:(?P<价格>[\d]+\.*[\d]+)')
df_result_6
.reindex
(columns
=['价格', '付款人数', '店铺', '发货地址']).head
()
价格付款人数店铺发货地址
011.36292人付款毛丽兰87江西 吉安118.991人付款qixiaoxia8811江苏 无锡29.9423人付款秋与页广东 广州316.9368人付款天猫超市上海422.8847人付款傅太旗舰店江西 萍乡
二、墨尔本每日最低温度
问题
(1) 剔除国庆节、五一劳动节和每月第一个周一,求每月的平均最低气温。
和第9章练习一类似
df_2
= pd
.read_csv
('data/temperature.csv')
df_2
['Date'] = pd
.to_datetime
(df_2
['Date'])
df_2
.head
()
DateTemp
01981-01-0120.711981-01-0217.921981-01-0318.831981-01-0414.641981-01-0515.8
df_2
.info
()
<class 'pandas.core.frame.DataFrame'>
RangeIndex
: 3650 entries
, 0 to
3649
Data columns
(total
2 columns
):
--- ------ -------------- -----
0 Date
3650 non
-null datetime64
[ns
]
1 Temp
3650 non
-null float64
dtypes
: datetime64
[ns
](1), float64
(1)
memory usage
: 57.2 KB
df_result_1
= df_2
.copy
()
df_result_1
= df_result_1
.set_index
('Date', drop
=True)
df_result_1
.index
DatetimeIndex
(['1981-01-01', '1981-01-02', '1981-01-03', '1981-01-04',
'1981-01-05', '1981-01-06', '1981-01-07', '1981-01-08',
'1981-01-09', '1981-01-10',
...
'1990-12-22', '1990-12-23', '1990-12-24', '1990-12-25',
'1990-12-26', '1990-12-27', '1990-12-28', '1990-12-29',
'1990-12-30', '1990-12-31'],
dtype
='datetime64[ns]', name
='Date', length
=3650, freq
=None)
day
= pd
.date_range
(start
='1981-1-1', end
='1990-12-31', freq
=pd
.offsets
.WeekOfMonth
(1,1))
day
DatetimeIndex
(['1981-01-05', '1981-02-02', '1981-03-02', '1981-04-06',
'1981-05-04', '1981-06-01', '1981-07-06', '1981-08-03',
'1981-09-07', '1981-10-05',
...
'1990-03-05', '1990-04-02', '1990-05-07', '1990-06-04',
'1990-07-02', '1990-08-06', '1990-09-03', '1990-10-01',
'1990-11-05', '1990-12-03'],
dtype
='datetime64[ns]', length
=120, freq
='WOM-1MON')
hol
= pd
.date_range
(start
='1981-5-1', end
='1981-5-3').append
(pd
.date_range
(start
='1981-10-1', end
='1981-10-7'))
day
.append
(hol
)
for i
in range(1, 10):
day
= day
.append
(hol
+pd
.DateOffset
(years
=i
))
day
DatetimeIndex
(['1981-01-05', '1981-02-02', '1981-03-02', '1981-04-06',
'1981-05-04', '1981-06-01', '1981-07-06', '1981-08-03',
'1981-09-07', '1981-10-05',
...
'1990-05-01', '1990-05-02', '1990-05-03', '1990-10-01',
'1990-10-02', '1990-10-03', '1990-10-04', '1990-10-05',
'1990-10-06', '1990-10-07'],
dtype
='datetime64[ns]', length
=210, freq
=None)
day
.value_counts
()
1982-05-03 2
1987-10-05 2
1989-05-01 2
1984-10-01 2
1985-10-07 2
..
1982-05-01 1
1982-05-02 1
1987-10-07 1
1982-08-02 1
1981-08-03 1
Length
: 197, dtype
: int64
day
DatetimeIndex
(['1981-01-05', '1981-02-02', '1981-03-02', '1981-04-06',
'1981-05-04', '1981-06-01', '1981-07-06', '1981-08-03',
'1981-09-07', '1981-10-05',
...
'1990-05-01', '1990-05-02', '1990-05-03', '1990-10-01',
'1990-10-02', '1990-10-03', '1990-10-04', '1990-10-05',
'1990-10-06', '1990-10-07'],
dtype
='datetime64[ns]', length
=210, freq
=None)
df_2
[~df_2
['Date'].isin
(day
)].set_index
('Date').resample
('M').mean
().head
()
TempDate
1981-01-3117.7766671981-02-2817.6370371981-03-3113.3300001981-04-3012.2103451981-05-319.520000
(2) 季节指数是一种对于周期性变化序列的特征刻画。记数据集中第
k
k
k年平均最低气温为
T
Y
k
(
k
=
1
,
…
,
10
)
\mathrm{TY}_{\mathrm{k}}(\mathrm{k}=1, \ldots, 10)
TYk(k=1,…,10),第
k
k
k年第
j
j
j个月平均最低气温为
T
M
k
j
(
j
=
1
,
…
,
12
)
\mathrm{TM}_{\mathrm{kj}}(\mathrm{j}=1, \ldots, 12)
TMkj(j=1,…,12),定义
S
j
=
∑
k
T
M
k
j
∑
k
T
Y
k
\mathrm{S}_{\mathrm{j}}=\frac{\sum_{\mathrm{k}} \mathrm{TM}_{\mathrm{kj}}}{\sum_{\mathrm{k}} \mathrm{TY}_{\mathrm{k}}}
Sj=∑kTYk∑kTMkj。请按照如上定义,计算12个月的季节指数
S
j
S_j
Sj
df_result_2
= df_2
.copy
()
df_result_2
= df_result_2
.set_index
('Date', drop
=True)
TY
= df_result_2
.resample
('Y').mean
().sum()
TMk
= df_result_2
.resample
('M').mean
()
print(TY
)
TMk
Temp
111.777534
dtype
: float64
TempDate
1981-01-3117.7129031981-02-2817.6785711981-03-3113.5000001981-04-3012.3566671981-05-319.490323......1990-08-317.8258061990-09-309.1666671990-10-3111.3451611990-11-3012.6566671990-12-3114.367742
120 rows × 1 columns
for i
in range(1, 13):
print('S{}={}\n'.format(i
, round((TMk
[TMk
.index
.month
==i
].sum()/TY
).values
[0], 4)))
S1
=1.3447
S2
=1.3758
S3
=1.3031
S4
=1.0815
S5
=0.8827
S6
=0.6511
S7
=0.5987
S8
=0.706
S9
=0.8031
S10
=0.9223
S11
=1.1165
S12
=1.2394
(3)移动平均法是一种时间序列的常见平滑方式,可分为k 期移动平均和k期中心移动平均,都使用了某一时刻及其周围的数据对该时刻的数据进行平滑修正。设原序列为
x
1
,
…
,
x
n
x_{1}, \ldots, x_{n}
x1,…,xn,对于
x
t
x_t
xt的
k
k
k期移动平均修正
x
~
t
\widetilde{x}_{t}
x
t为
∑
i
=
0
k
−
1
x
t
−
i
k
\frac{\sum_{i=0}^{k-1} x_{t-i}}{k}
k∑i=0k−1xt−i,对于
k
k
k期中心移动平均修正为:
x
~
t
=
{
1
k
(
1
2
x
t
−
k
2
+
x
t
−
k
2
+
1
+
…
+
x
t
+
…
+
x
t
+
k
2
−
1
+
1
2
x
t
+
k
2
)
,
k
is even
1
k
(
x
t
−
k
−
1
2
+
x
t
−
k
−
1
2
+
1
+
…
+
x
t
+
…
+
x
t
+
k
−
1
2
−
1
+
x
t
+
k
−
1
2
)
,
else
\widetilde{x}_{t}=\left\{\begin{array}{l}\frac{1}{k}\left(\frac{1}{2} x_{t-\frac{k}{2}}+x_{t-\frac{k}{2}+1}+\ldots+x_{t}+\ldots+x_{t+\frac{k}{2}-1}+\frac{1}{2} x_{t+\frac{k}{2}}\right), k \text { is even } \\\frac{1}{k}\left(x_{t-\frac{k-1}{2}}+x_{t-\frac{k-1}{2}+1}+\ldots+x_{t}+\ldots+x_{t+\frac{k-1}{2}-1}+x_{t+\frac{k-1}{2}}\right), \text { else }\end{array}\right.
x
t=⎩⎨⎧k1(21xt−2k+xt−2k+1+…+xt+…+xt+2k−1+21xt+2k),k is even k1(xt−2k−1+xt−2k−1+1+…+xt+…+xt+2k−1−1+xt+2k−1), else
(a) 求原序列的5 期移动平均序列。
df_result_3
= df_2
.copy
()
df_result_3
= df_result_3
.set_index
('Date', drop
=True)
df_result_3
.rolling
(window
=5, min_periods
=0).sum()/5
TempDate
1981-01-014.141981-01-027.721981-01-0311.481981-01-0414.401981-01-0517.56......1990-12-2713.081990-12-2813.021990-12-2913.721990-12-3014.281990-12-3113.96
3650 rows × 1 columns
(b) 求原序列的5 期与6 期中心移动平均序列。
TempDate
1981-01-0111.481981-01-0214.401981-01-0317.561981-01-0416.581981-01-0516.16......1990-12-2713.721990-12-2814.281990-12-2913.961990-12-30NaN1990-12-31NaN
3650 rows × 1 columns
三、2016 年8 月上海市摩拜单车骑行记录
问题
(1) 平均而言,周末单天用车量比工作日单天用车量更大吗?
df_3
= pd
.read_csv
('data/Bicycle.csv')
df_3
.head
()
orderidbikeiduseridstart_timestart_location_xstart_location_yend_timeend_location_xend_location_ytrack
078387158357100802016-08-20 06:57121.34831.3892016-08-20 07:04121.35731.388121.347,31.392#121.348,31.389#121.349,31.390#1...18913339277666052016-08-29 19:09121.50831.2792016-08-29 19:31121.48931.271121.489,31.270#121.489,31.271#121.490,31.270#1...2110662315204588762016-08-13 16:17121.38331.2542016-08-13 16:36121.40531.248121.381,31.251#121.382,31.251#121.382,31.252#1...31389484196259106482016-08-23 21:34121.48431.3202016-08-23 21:43121.47131.325121.471,31.325#121.472,31.325#121.473,31.324#1...418853778208117352016-08-16 07:32121.40731.2922016-08-16 07:41121.41831.288121.407,31.291#121.407,31.292#121.408,31.291#1...
df_3
.info
()
<class 'pandas.core.frame.DataFrame'>
RangeIndex
: 102361 entries
, 0 to
102360
Data columns
(total
10 columns
):
--- ------ -------------- -----
0 orderid
102361 non
-null int64
1 bikeid
102361 non
-null int64
2 userid
102361 non
-null int64
3 start_time
102361 non
-null
object
4 start_location_x
102361 non
-null float64
5 start_location_y
102361 non
-null float64
6 end_time
102361 non
-null
object
7 end_location_x
102361 non
-null float64
8 end_location_y
102361 non
-null float64
9 track
102361 non
-null
object
dtypes
: float64
(4), int64
(3), object(3)
memory usage
: 7.8+ MB
df_result_1
= df_3
.copy
()
df_result_1
.info
()
<class 'pandas.core.frame.DataFrame'>
RangeIndex
: 102361 entries
, 0 to
102360
Data columns
(total
10 columns
):
--- ------ -------------- -----
0 orderid
102361 non
-null int64
1 bikeid
102361 non
-null int64
2 userid
102361 non
-null int64
3 start_time
102361 non
-null
object
4 start_location_x
102361 non
-null float64
5 start_location_y
102361 non
-null float64
6 end_time
102361 non
-null
object
7 end_location_x
102361 non
-null float64
8 end_location_y
102361 non
-null float64
9 track
102361 non
-null
object
dtypes
: float64
(4), int64
(3), object(3)
memory usage
: 7.8+ MB
df_result_1
['start_time'] = pd
.to_datetime
(df_result_1
['start_time'])
df_weekend
= df_result_1
.loc
[df_result_1
['start_time'].dt
.dayofweek
.isin
([5,6])]
weekend
= df_weekend
.shape
[0]/(2*df_weekend
['start_time'].nunique
())
df_weekdays
= df_result_1
.loc
[~df_result_1
['start_time'].dt
.dayofweek
.isin
([5,6])]
weekdays
= df_weekdays
.shape
[0]/(5*df_weekend
['start_time'].nunique
())
weekend
> weekdays
False
df_result_1
['start_time'].apply(lambda x
:x
.weekday
()).value_counts
().sort_index
()
0 16380
1 16259
2 17829
3 12484
4 13208
5 13359
6 12842
Name
: start_time
, dtype
: int64
(2) 工作日每天的高峰时间段大致为上午7:30 至9:30、下午17:00 至19:00,请问8 月里早高峰骑行记录量(以start_time 为准)高于晚高峰的有几天?
df_result_2
= df_3
.copy
()
df_result_2
['start_time'] = pd
.to_datetime
(df_result_2
['start_time'])
df_new_2
= df_result_2
['start_time'].apply(lambda x
:x
.date
()).to_frame
()
df_new_2
['time'] = df_result_2
['start_time'].apply(lambda x
:x
.time
())
df_new_2
.head
()
start_timetime
02016-08-2006:57:0012016-08-2919:09:0022016-08-1316:17:0032016-08-2321:34:0042016-08-1607:32:00
df_new_2
.info
()
<class 'pandas.core.frame.DataFrame'>
RangeIndex
: 102361 entries
, 0 to
102360
Data columns
(total
2 columns
):
--- ------ -------------- -----
0 start_time
102361 non
-null
object
1 time
102361 non
-null
object
dtypes
: object(2)
memory usage
: 1.6+ MB
df_new_2
['start_time'] = pd
.to_datetime
(df_new_2
['start_time'])
df_weekdays
= df_new_2
[~df_new_2
['start_time'].dt
.dayofweek
.isin
([5,6])]
df_weekdays
start_timetime
12016-08-2919:09:0032016-08-2321:34:0042016-08-1607:32:0062016-08-2913:39:0072016-08-2909:21:00.........1023522016-08-1107:42:001023532016-08-2609:03:001023542016-08-0310:05:001023552016-08-2915:39:001023602016-08-0910:51:00
76160 rows × 2 columns
mor_start
= datetime
.strptime
('7:30', '%H:%M').time
()
mor_end
= datetime
.strptime
('9:30', '%H:%M').time
()
df_morning
= df_weekdays
.loc
[(df_weekdays
['time'] >= mor_start
)&(df_weekdays
['time'] <= mor_end
)].set_index
('start_time', drop
=True)
df_morning_count
= df_morning
.resample
('D').count
()['time']
df_morning_count
start_time
2016-08-01 307
2016-08-02 248
2016-08-03 361
2016-08-04 416
2016-08-05 6
2016-08-06 0
2016-08-07 0
2016-08-08 405
2016-08-09 491
2016-08-10 529
2016-08-11 488
2016-08-12 541
2016-08-13 0
2016-08-14 0
2016-08-15 611
2016-08-16 629
2016-08-17 608
2016-08-18 618
2016-08-19 633
2016-08-20 0
2016-08-21 0
2016-08-22 618
2016-08-23 654
2016-08-24 647
2016-08-25 763
2016-08-26 688
2016-08-27 0
2016-08-28 0
2016-08-29 901
2016-08-30 854
2016-08-31 828
Freq
: D
, Name
: time
, dtype
: int64
even_start
= datetime
.strptime
('17:00', '%H:%M').time
()
even_end
= datetime
.strptime
('19:00', '%H:%M').time
()
df_evening
= df_weekdays
.loc
[(df_weekdays
['time'] >= even_start
)&(df_weekdays
['time'] <= even_end
)].set_index
('start_time', drop
=True)
df_evening_count
= df_evening
.resample
('D').count
()['time']
df_evening_count
start_time
2016-08-01 377
2016-08-02 350
2016-08-03 423
2016-08-04 217
2016-08-05 439
2016-08-06 0
2016-08-07 0
2016-08-08 597
2016-08-09 553
2016-08-10 656
2016-08-11 612
2016-08-12 683
2016-08-13 0
2016-08-14 0
2016-08-15 766
2016-08-16 735
2016-08-17 775
2016-08-18 782
2016-08-19 822
2016-08-20 0
2016-08-21 0
2016-08-22 839
2016-08-23 872
2016-08-24 825
2016-08-25 958
2016-08-26 1001
2016-08-27 0
2016-08-28 0
2016-08-29 399
2016-08-30 573
2016-08-31 1111
Freq
: D
, Name
: time
, dtype
: int64
df_result
= df_morning_count
.index
[(df_morning_count
- df_evening_count
)>0]
print(df_result
)
df_result
.shape
[0]
DatetimeIndex
(['2016-08-04', '2016-08-29', '2016-08-30'], dtype
='datetime64[ns]', name
='start_time', freq
=None)
3
(3) 请给出在所有周五中(以start_time 为准),记录条数最多的那个周五所在的日期,并在该天内分别按30 分钟、2 小时、6 小时统计摩拜单车使用时间的均值。
df_result_3
= df_3
.copy
()
df_result_3
['start_time'] = pd
.to_datetime
(df_result_3
['start_time'])
df_result_3
['end_time'] = pd
.to_datetime
(df_result_3
['end_time'])
df_result_3
= df_result_3
.set_index
('start_time', drop
=True)
Fri
= df_result_3
[df_result_3
.index
.dayofweek
== 4]['orderid'].resample
('D').count
().idxmax
()
Fri
.date
()
datetime
.date
(2016, 8, 26)
df_bycycle_friday
= df_result_3
[str(Fri
.date
())].copy
()
df_bycycle_friday
['usetime'] = (df_bycycle_friday
['end_time'] - df_bycycle_friday
.index
).dt
.total_seconds
()
df_bycycle_friday
['usetime'].resample
('30min').mean
().apply(lambda x
:pd
.Timedelta
(seconds
= int(x
))).head
()
start_time
2016-08-26 00:00:00 00:18:56
2016-08-26 00:30:00 00:26:42
2016-08-26 01:00:00 00:13:27
2016-08-26 01:30:00 00:15:30
2016-08-26 02:00:00 00:17:17
Freq
: 30T
, Name
: usetime
, dtype
: timedelta64
[ns
]
df_bycycle_friday
['usetime'].resample
('2h').mean
().apply(lambda x
:pd
.Timedelta
(seconds
= int(x
))).head
()
start_time
2016-08-26 00:00:00 00:19:57
2016-08-26 02:00:00 00:19:18
2016-08-26 04:00:00 00:13:33
2016-08-26 06:00:00 00:11:31
2016-08-26 08:00:00 00:11:35
Freq
: 2H
, Name
: usetime
, dtype
: timedelta64
[ns
]
df_bycycle_friday
['usetime'].resample
('6h').mean
().apply(lambda x
:pd
.Timedelta
(seconds
= int(x
))).head
()
start_time
2016-08-26 00:00:00 00:17:47
2016-08-26 06:00:00 00:11:39
2016-08-26 12:00:00 00:16:00
2016-08-26 18:00:00 00:21:39
Freq
: 6H
, Name
: usetime
, dtype
: timedelta64
[ns
]
(4) 请自行搜索相关代码或调用库,计算每条记录起点到终点的球面距离。
计算方法(百度)
df_result_4
= df_3
.copy
()
df_result_4
.head
()
orderidbikeiduseridstart_timestart_location_xstart_location_yend_timeend_location_xend_location_ytrack
078387158357100802016-08-20 06:57121.34831.3892016-08-20 07:04121.35731.388121.347,31.392#121.348,31.389#121.349,31.390#1...18913339277666052016-08-29 19:09121.50831.2792016-08-29 19:31121.48931.271121.489,31.270#121.489,31.271#121.490,31.270#1...2110662315204588762016-08-13 16:17121.38331.2542016-08-13 16:36121.40531.248121.381,31.251#121.382,31.251#121.382,31.252#1...31389484196259106482016-08-23 21:34121.48431.3202016-08-23 21:43121.47131.325121.471,31.325#121.472,31.325#121.473,31.324#1...418853778208117352016-08-16 07:32121.40731.2922016-08-16 07:41121.41831.288121.407,31.291#121.407,31.292#121.408,31.291#1...
def geodistance(lng1
,lat1
,lng2
,lat2
):
lng1
, lat1
, lng2
, lat2
= map(radians
, [float(lng1
), float(lat1
), float(lng2
), float(lat2
)])
dlon
=lng2
-lng1
dlat
=lat2
-lat1
a
=sin
(dlat
/2)**2 + cos
(lat1
) * cos
(lat2
) * sin
(dlon
/2)**2
distance
=2*asin
(sqrt
(a
))*6371*1000
distance
=round(distance
/1000,3)
return distance
df_result_4
['spg_dist'] = df_result_4
.apply(lambda x
: str(geodistance
(x
['start_location_x'], x
['start_location_y'], x
['end_location_x'], x
['end_location_y'])) + 'm', axis
=1)
df_result_4
.head
()
orderidbikeiduseridstart_timestart_location_xstart_location_yend_timeend_location_xend_location_ytrackspg_dist
078387158357100802016-08-20 06:57121.34831.3892016-08-20 07:04121.35731.388121.347,31.392#121.348,31.389#121.349,31.390#1...0.862m18913339277666052016-08-29 19:09121.50831.2792016-08-29 19:31121.48931.271121.489,31.270#121.489,31.271#121.490,31.270#1...2.013m2110662315204588762016-08-13 16:17121.38331.2542016-08-13 16:36121.40531.248121.381,31.251#121.382,31.251#121.382,31.252#1...2.195m31389484196259106482016-08-23 21:34121.48431.3202016-08-23 21:43121.47131.325121.471,31.325#121.472,31.325#121.473,31.324#1...1.354m418853778208117352016-08-16 07:32121.40731.2922016-08-16 07:41121.41831.288121.407,31.291#121.407,31.292#121.408,31.291#1...1.136m
df_result_4
['spg_dist'] = df_result_4
.apply(lambda x
: geodesic
((x
['start_location_y'], x
['start_location_x']), (x
['end_location_y'], x
['end_location_x'])).m
, axis
=1)
df_result_4
.head
()
orderidbikeiduseridstart_timestart_location_xstart_location_yend_timeend_location_xend_location_ytrackspg_dist
078387158357100802016-08-20 06:57121.34831.3892016-08-20 07:04121.35731.388121.347,31.392#121.348,31.389#121.349,31.390#1...863.18459218913339277666052016-08-29 19:09121.50831.2792016-08-29 19:31121.48931.271121.489,31.270#121.489,31.271#121.490,31.270#1...2015.0719852110662315204588762016-08-13 16:17121.38331.2542016-08-13 16:36121.40531.248121.381,31.251#121.382,31.251#121.382,31.252#1...2198.62746631389484196259106482016-08-23 21:34121.48431.3202016-08-23 21:43121.47131.325121.471,31.325#121.472,31.325#121.473,31.324#1...1355.870641418853778208117352016-08-16 07:32121.40731.2922016-08-16 07:41121.41831.288121.407,31.291#121.407,31.292#121.408,31.291#1...1137.383495
(5) 摩拜单车的骑行结束时间是以电子锁关闭的记录时间为准,但有时候用户会忘记关锁,导致骑行时间出现异常。同时,正常人的骑行速度往往大致落在一个合理的区间,请结合上一问中的球面距离和骑行起始、结束时间,找出潜在的异常骑行记录。
(6) 由于路线的曲折性,起点到终点的球面距离往往不能充分反应行程长度,请利用track 列的路线坐标数据,计算估计实际骑行距离,并重新仿照上一问的方法找出可能的异常记录。