pandas练习

    技术2022-07-11  71

    import numpy as np import pandas as pd df=pd.read_csv('端午粽子数据.csv') df.head()

    df.shape

    (4403, 5)

    df.columns

    Index([‘标题’, ’ 价格’, ‘付款人数’, ‘店铺’, '发货地址 '], dtype=‘object’)

    # 去掉空格 df.columns = df.columns.str.strip() # 去掉发货地址的缺失值 df.dropna(axis=0, subset=['发货地址'], inplace=True) # 寻找脏数据 df_hz = df.loc[df['发货地址'].str.contains(r'杭州')] df_hz.loc[~df_hz['价格'].str.match(r'^\d+\.?\d+$')]

    df_hz.loc[4376, '价格'] = 45 df_hz['价格'].astype('float').mean()

    80.90088888888877

    df_jx = df.loc[df['标题'].str.contains(r'嘉兴')] df_jx.loc[~df_jx['发货地址'].str.contains(r'嘉兴')]

    # 找出不正常的数字 df.loc[~df['价格'].str.match(r'^\d+\.?\d+$')]

    df.loc[[538, 4376], '价格'] = [45.9, 45] df['价格'] = df['价格'].astype('float') # 按照百分位数分组 df['价位'] = pd.cut(df['价格'], bins=np.percentile(df['价格'], [0, 20, 40, 60, 80, 100]), labels=['低', '较低', '中', '较高', '高']) df

    cols = list(df.columns) cols.remove('价位') cols.insert(1, '价位') df = df[cols] df.sort_values('价位', ascending=False)

    df.loc[df['价位'].isna(), '价位'] = ['低', '低'] df.loc[df['付款人数'].isna()]

    df = df.convert_dtypes() df['人数'] = df['付款人数'].str.replace(r'人付款', '') df['人数'] = df['人数'].str.replace(r'\+', '') df_dot = df.loc[df['人数'].str.contains(r'\.'), '人数'].str.replace('.', '') df_dot = df_dot.str.replace('万', '000') df.loc[list(df_dot.index), '人数'] = list(df_dot.values) df_wan = df.loc[df['人数'].str.contains('万'), '人数'].str.replace('万', '0000') df.loc[list(df_wan.index), '人数'] = list(df_wan.values) df.loc[df['人数'].isna()]

    # 转回float类型 df['人数'] = pd.to_numeric(df['人数'], errors='coerce') # 进行线性插值 for _, group in df.groupby('价位'): df.loc[group.index, '人数'] = group[['价格', '人数']].sort_values(by='价格')['人数'].interpolate() df.loc[df['人数'].isna()]

    df.loc[df['人数'].isna(), '人数'] = [900, 900, 900] amount = list(df.loc[df['付款人数'].isna(), '人数']) for i in range(len(amount)): amount[i] = str(round(amount[i])) person = '人付款' for i in range(len(amount)): amount[i] = amount[i] + person df.loc[df['付款人数'].isna(), '付款人数'] = amount df.loc[[183, 2131]] df['价格'] = df['价格'].astype('str') df_zongzi = ('商品发货地为' + df['发货地址'] + ',店铺为' + df['店铺'] + ',共计'+ df['付款人数'] + ',单价为' + df['价格']) df_zongzi

    df_zongzi.str.extract(r'商品发货地为(?P<发货地址>\w+\s?\w+?),店铺为(?P<店铺>[\w]+),共计(?P<付款人数>\d+[万\+]*?人付款),单价为(?P<价格>\d+\.?\d+)')

    Processed: 0.011, SQL: 9