数据清洗与整理

    技术2022-07-16  73

    from pandas import Series, DataFrame
    import pandas as pd
    import numpy as np

    df1.isnull()
    df1.notnull()
    df1.isnull().sum()
    df1.isnull().sum().sum()
    df1.info()  # 按照字段统计
    df1.dropna()

    df2.iloc[2, :] = np.nan
    df2[3] = np.nan
    df2
    df2.dropna(how='all')
    df2.fillna({1: 6, 3: 0}, inplace=True)
    df2.fillna(method='ffill')  # 向前填充
    df2[0] = df2[0].fillna(df2[0].mean())
    df2.fillna?  # 在 IPython/Jupyter 中查看 fillna 的帮助文档

    df1.replace(['', 2001], ['不详', 2002])
    df1.replace({'': '不详', 2001: 2002})
    df1.duplicated()  # 查看重复
    df1.drop_duplicates(['sex', 'year'], keep='last')  # 删除重复项

    data = {
        'name': ['张三', '李四', '王五', '小明'],
        'math': [79, 52, 63, 92]
    }
    df2 = DataFrame(data)
    df2

    def f(x):
        if x >= 90:
            return '优秀'
        elif 70 <= x < 90:
            return '良好'
        elif 60 <= x < 70:
            return '合格'
        else:
            return '不合格'

    df2['class'] = df2['math'].map(f)
    df2

    创建一个函数,然后用map方法逐个处理

    df3 = DataFrame(np.arange(10), columns=['X'])
    df3['Y'] = 2 * df3['X'] + 0.5
    df3.iloc[9, 1] = 185
    df3
    df3.plot(kind='scatter', x='X', y='Y')  # 检测异常值

    lambda表达式,通常是在需要一个函数,但是又不想费神去命名一个函数的场合下使用,也就是指匿名函数。

    dummies = df2['朝向'].apply(lambda x: Series(x.split('/')).value_counts())
    dummies
    dummies = dummies.fillna(0).astype(int)
    dummies

    price = DataFrame({
        'fruit': ['apple', 'banana', 'orange'],
        'price': [23, 32, 45]
    })
    amount = DataFrame({
        'fruit': ['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
        'amount': [5, 3, 6, 3, 5, 7]
    })
    pd.merge(amount, price)  # 按列合并 交集
    pd.merge(amount, price, on='fruit')
    pd.merge(amount, price, left_on='fruit', right_on='fruit')
    pd.merge(amount, price, how='outer')  # 并集
    pd.merge(amount, price, how='left')

    left = DataFrame({
        'key1': ['one', 'one', 'two'],
        'key2': ['a', 'b', 'a'],
        'val1': ['2', '3', '4']
    })
    right = DataFrame({
        'key1': ['one', 'one', 'two', 'two'],
        'key2': ['a', 'a', 'a', 'b'],
        'val2': ['5', '6', '7', '8']
    })
    pd.merge(left, right, on=['key1', 'key2'], how='outer')  # 多键连接,传入list
    pd.merge(left, right, on='key1')
    pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
    pd.merge(left2, right2, left_on='key', right_index=True)  # 索引作为连接键
    left3.join(right3, how='outer')  # join方法快速连接
    Processed: 0.012, SQL: 9