数据挖掘之房价预测任务

    技术2024-11-04  23

    目录:

    一. 查看数据: 1.1 查看标签; 1.2 特征与标签; 1.3 相关性; 1.4 缺失值
    二. 数据处理: 2.1 去掉离群点; 2.2 正态分布变换: 对数变换log(1+x); 2.3 缺失值填充; 2.4 Encoder转换; 2.5 正态分布变换: Box-Cox变换
    三. 集成算法建模: 3.1 单模型; 3.2 平均模型; 3.3 堆叠模型

    一. 查看数据

    1.1 查看标签

    import pandas as pd
    import numpy as np
    import warnings

    # Silence library warnings so the notebook output stays readable.
    warnings.filterwarnings('ignore')

    # Forward slashes resolve on Windows as well as Linux/macOS; the previous
    # backslash-separated path only worked on Windows.
    df_train = pd.read_csv('data/train.csv')
    print(df_train.shape)
    df_train['SalePrice'].describe()

    (1460, 81)

    观察一下它的偏度值和峰度值

    print('Skewness: %f' % df_train['SalePrice'].skew()) print('Kurtosis: %f' % df_train['SalePrice'].kurt()) import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline sns.distplot(df_train['SalePrice'])

    Skewness: 1.882876 Kurtosis: 6.536282

    1.2 特征与标签

    # Living area (square feet) plotted against the sale price.
    data = df_train[['SalePrice', 'GrLivArea']]
    data.plot.scatter(x='GrLivArea', y='SalePrice')

    # Basement area (square feet) plotted against the sale price.
    data = df_train[['SalePrice', 'TotalBsmtSF']]
    data.plot.scatter(x='TotalBsmtSF', y='SalePrice')

    # Overall material and finish quality: one box of sale prices per quality level.
    data = pd.concat([df_train['SalePrice'], df_train['OverallQual']], axis=1)
    plt.subplots(figsize=(8, 6))
    sns.boxplot(data=data, x='OverallQual', y='SalePrice')

    # One box of sale prices per neighborhood, with the x tick labels rotated by 60 degrees.
    data = pd.concat([df_train['Neighborhood'], df_train['SalePrice']], axis=1)
    plt.subplots(figsize=(10, 6))
    sns.boxplot(data=data, x='Neighborhood', y='SalePrice')
    plt.xticks(rotation=60);

    1.3 相关性

    # Correlate the numeric columns only. Since pandas 2.0, DataFrame.corr()
    # no longer drops string columns silently and raises on them instead.
    corrmat = df_train.select_dtypes(include='number').corr()
    plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, square=True, cmap='Greens');

    # The ten columns with the largest correlation to SalePrice.
    cols = corrmat['SalePrice'].nlargest(10).index
    # Correlation matrix of those columns (columns treated as variables).
    cm = np.corrcoef(df_train[cols].values, rowvar=False)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        cmap='Blues',
        annot_kws={'size': 10},
        yticklabels=cols.values,
        xticklabels=cols.values,
    )

    # Pairwise scatter plots of the target and a handful of selected features.
    cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
    # seaborn renamed the `size` argument to `height` (0.9); the old name is deprecated.
    sns.pairplot(df_train[cols], height=3);

    1.4 缺失值

    # Missing-value count and ratio per column, largest first.
    null_counts = df_train.isnull().sum()
    total_missing = null_counts.sort_values(ascending=False)
    percent = (null_counts / len(df_train)).sort_values(ascending=False).round(3)
    missing_data = pd.concat(
        [total_missing, percent],
        axis=1,
        keys=['Total', 'Percent'],
    )
    missing_data.head()

    二. 数据处理

    2.1 去掉离群点

    import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline train = pd.read_csv(r'data\train.csv') test = pd.read_csv(r'data\test.csv') print('The train data size before dropping Id feature is: {}'.format(train.shape)) print('The test data size before dropping Id feature is: {}'.format(test.shape)) # ID先留着,暂时不用 train_ID = train['Id'] test_ID = test['Id'] train.drop('Id', axis = 1, inplace = True) test.drop('Id', axis = 1, inplace = True) # 发现离群点 plt.figure(figsize = (8, 6)) plt.scatter(x = train['GrLivArea'], y = train['SalePrice']) plt.xlabel('GrLivArea', fontsize = 12) plt.ylabel('SalePrice', fontsize = 12);

    # Remove the rows with GrLivArea above 4000 but SalePrice below 300000.
    is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
    train = train.drop(train.index[is_outlier])

    # Re-plot to check the result.
    plt.figure(figsize=(8, 6))
    plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
    plt.ylabel('SalePrice', fontsize=12)
    plt.xlabel('GrLivArea', fontsize=12);

    2.2 正态分布变换: 对数变换log(1+x)

    from scipy.stats import norm
    from scipy import stats

    # Histogram of SalePrice with a fitted normal curve overlaid.
    plt.figure(figsize=(8, 6))
    sns.distplot(train['SalePrice'], fit=norm)
    (mu, sigma) = norm.fit(train['SalePrice'])
    print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))
    # Raw string: '\m' and '\s' are invalid escape sequences in a regular
    # literal (a warning today, an error in a future Python). The text passed
    # to matplotlib is unchanged.
    plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma = $ {:.2f})'.format(mu, sigma)], loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')

    # Probability plot of SalePrice, drawn on a new figure.
    fig = plt.figure(figsize=(8, 6))
    stats.probplot(train['SalePrice'], plot=plt);

    # Log transform log(1 + x) of the target.
    train['SalePrice'] = np.log1p(train['SalePrice'])
    (mu, sigma) = norm.fit(train['SalePrice'])
    print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

    # Histogram of the transformed target with a fitted normal curve overlaid.
    plt.figure(figsize=(8, 6))
    sns.distplot(train['SalePrice'], fit=norm)
    # Raw string: '\m' and '\s' are invalid escape sequences in a regular
    # literal (a warning today, an error in a future Python). The text passed
    # to matplotlib is unchanged.
    plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma$ = {:.2f})'.format(mu, sigma)], loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')

    # Probability plot of the transformed target, drawn on a new figure.
    plt.figure(figsize=(8, 6))
    stats.probplot(train['SalePrice'], plot=plt);

    2.3 缺失值填充

    # Remember the split sizes and the target before stacking train and test.
    ntrain, ntest = train.shape[0], test.shape[0]
    y_train = train['SalePrice'].values

    # Stack train on top of test with a fresh 0..n-1 index, then remove the target column.
    all_data = pd.concat((train, test), ignore_index=True)
    all_data.drop(['SalePrice'], axis=1, inplace=True)
    print('all_data size is: {}'.format(all_data.shape))

    # Percentage of missing values per column; keep the 20 largest non-zero ones.
    all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
    all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).head(20)
    missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
    missing_data

    # Bar chart of the missing-value percentage per feature.
    plt.figure(figsize=(12, 8))
    sns.barplot(x=all_data_na.index, y=all_data_na)
    plt.xticks(rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Percent of missing values')
    plt.title('Percent minssing data by feature');

    # Categorical columns where a missing value is filled with the literal 'None'.
    none_cols = [
        'PoolQC',        # pool
        'MiscFeature',   # miscellaneous feature
        'Alley',         # alley access
        'Fence',         # fence
        'FireplaceQu',   # fireplace
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',                 # garage
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',   # basement
        'MasVnrType',    # masonry veneer
        'MSSubClass',    # building class
    ]
    for col in none_cols:
        all_data[col] = all_data[col].fillna('None')

    # Numeric columns where a missing value is filled with 0.
    zero_cols = [
        'GarageYrBlt', 'GarageArea', 'GarageCars',                                             # garage
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',  # basement
        'MasVnrArea',                                                                          # masonry veneer
    ]
    for col in zero_cols:
        all_data[col] = all_data[col].fillna(0)

    # Columns filled with their most frequent value (the mode).
    mode_cols = [
        'MSZoning',      # general zoning classification
        'Functional',    # home functionality rating
        'Electrical',    # electrical system
        'KitchenQual',   # kitchen quality
        'Exterior1st',   # exterior covering
        'Exterior2nd',   # exterior covering
        'SaleType',      # sale type
    ]
    for col in mode_cols:
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

    # Lot frontage (distance to the street): use the median of the same neighborhood.
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median())
    )

    all_data = all_data.drop('Utilities', axis=1)

    # Recompute the missing ratio to see which columns still contain gaps.
    all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
    all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
    missing_data.head()

    2.4 Encoder转换

    from sklearn.preprocessing import LabelEncoder

    # Columns that are replaced by integer labels.
    cols = [
        'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
        'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
        'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
        'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
    ]
    for c in cols:
        encode = LabelEncoder()
        # Fit and encode in one step on the column's values.
        all_data[c] = encode.fit_transform(list(all_data[c].values))

    # New feature: total area of basement, first floor and second floor.
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

    2.5 正态分布变换: Box-Cox变换

    from scipy.stats import norm, skew


    def _column_skew(column):
        # Skewness of one column, ignoring missing values.
        return skew(column.dropna())


    # Every column whose dtype is not 'object' is treated as numeric.
    column_types = all_data.dtypes
    numeric_feats = column_types[column_types != 'object'].index
    skewed_feats = all_data[numeric_feats].apply(_column_skew).sort_values(ascending=False)
    print('Skew in numerical features:')
    skewness = pd.DataFrame({'Skew': skewed_feats})
    skewness.head(10)

    from scipy.special import boxcox1p

    # Keep only the features whose absolute skew exceeds 0.75. The mask must be
    # a boolean Series: indexing with a boolean DataFrame (the former
    # `skewness[abs(skewness) > 0.75]`) only replaces non-matching cells with
    # NaN and keeps every row, so every numeric feature ended up transformed.
    skewness = skewness[abs(skewness['Skew']) > 0.75]
    print('There are {} skewed numerical features to Box Cox transform'.format(skewness.shape[0]))

    skewed_features = skewness.index
    # The key is choosing a suitable lambda; 0.15 is used here as an empirical value.
    lam = 0.15
    for feat in skewed_features:
        all_data[feat] = boxcox1p(all_data[feat], lam)
    all_data.head()

    三. 集成算法建模

    # One-hot encode the remaining categorical columns.
    all_data = pd.get_dummies(all_data)
    print(all_data.shape)

    # Split back by position: the first ntrain rows are the training set, the rest the test set.
    train = all_data.iloc[:ntrain]
    test = all_data.iloc[ntrain:]

    (2917, 220)

    3.1 单模型

    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler
    from sklearn.linear_model import Lasso, ElasticNet
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.ensemble import GradientBoostingRegressor
    import xgboost as xgb


    def rmse_cv(n_folds, model):
        """Return the per-fold RMSE of `model` under shuffled K-fold cross-validation.

        Args:
            n_folds: Number of folds.
            model: Unfitted scikit-learn compatible regressor.

        Returns:
            Array with one RMSE value per fold, evaluated on the module-level
            `train` features and `y_train` targets.
        """
        # Pass the KFold object itself. The previous `.get_n_splits(...)` call
        # returns only the fold count, so cross_val_score received a plain
        # integer and ignored the shuffle and random_state.
        kf = KFold(n_folds, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                        scoring='neg_mean_squared_error', cv=kf))
        return rmse


    # make_pipeline chains the steps; RobustScaler is better suited to data with outliers.
    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=42))

    # ElasticNet uses both the L1 and the L2 penalty.
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=42))

    # KernelRidge: ridge regression with a kernel function.
    KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

    GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                       max_depth=4, max_features='sqrt',
                                       min_samples_leaf=15, loss='huber',
                                       min_samples_split=10, random_state=42)

    # NOTE(review): `silent` and `nthread` look like legacy XGBoost parameter
    # names (newer releases use `verbosity` / `n_jobs`) - confirm against the
    # installed xgboost version.
    model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                                 learning_rate=0.05, max_depth=3,
                                 min_child_weight=1.7817, n_estimators=2200,
                                 reg_alpha=0.4640, reg_lambda=0.8571,
                                 subsample=0.5213, silent=1, nthread=-1)

    # Cross-validated score (mean and standard deviation over the folds) of every single model.
    score = rmse_cv(5, lasso)
    print('Lasso score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
    score = rmse_cv(5, ENet)
    print('ElasticNet score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
    score = rmse_cv(5, KRR)
    print('Kernel Ridge score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
    score = rmse_cv(5, GBoost)
    print('Gradient Boosting score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
    score = rmse_cv(5, model_xgb)
    print('Xgboost score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))

    Lasso score: 0.1116 (0.0072) ElasticNet score: 0.1116 (0.0072) Kernel Ridge score: 0.1153 (0.0071) Gradient Boosting score: 0.1161 (0.0067) Xgboost score: 0.1168 (0.0072)

    3.2 平均模型

    from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


    class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
        """Regressor whose prediction is the mean of several base models' predictions."""

        def __init__(self, models):
            self.models = models

        def fit(self, X, y):
            """Clone every base model, fit each clone on (X, y) and return self."""
            # Work on clones so the estimators passed in stay untouched.
            self.models_ = list(map(clone, self.models))
            for fitted in self.models_:
                fitted.fit(X, y)
            return self

        def predict(self, X):
            """Return the row-wise mean of the fitted models' predictions for X."""
            # One column per fitted model.
            per_model = np.column_stack([fitted.predict(X) for fitted in self.models_])
            return per_model.mean(axis=1)


    averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))
    score = rmse_cv(5, averaged_models)
    print('Averaged base models score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))

    Averaged base models score: 0.1083 (0.0069)

    3.3 堆叠模型

    class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
        """Stacked regressor: a meta model trained on out-of-fold predictions of the base models."""

        def __init__(self, base_models, meta_model, n_folds=5):
            self.base_models = base_models
            self.meta_model = meta_model
            self.n_folds = n_folds

        def fit(self, X, y):
            """Fit per-fold clones of each base model, then the meta model, and return self.

            X and y are indexed with integer index arrays, so they must support
            that kind of indexing (NumPy arrays do).
            """
            # One list of fitted per-fold clones for every base model.
            self.base_models_ = [[] for _ in self.base_models]
            self.meta_model_ = clone(self.meta_model)
            kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

            # Column j holds base model j's predictions for rows it was not trained on.
            out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
            for column, (template, fold_clones) in enumerate(zip(self.base_models, self.base_models_)):
                for train_index, holdout_index in kfold.split(X, y):
                    instance = clone(template)
                    fold_clones.append(instance)
                    instance.fit(X[train_index], y[train_index])
                    out_of_fold_predictions[holdout_index, column] = instance.predict(X[holdout_index])

            # The meta model learns from the out-of-fold predictions.
            self.meta_model_.fit(out_of_fold_predictions, y)
            return self

        def predict(self, X):
            """Average each base model's per-fold clones and feed the averages to the meta model."""
            averaged_columns = []
            for fold_clones in self.base_models_:
                fold_predictions = np.column_stack([fitted.predict(X) for fitted in fold_clones])
                averaged_columns.append(fold_predictions.mean(axis=1))
            meta_features = np.column_stack(averaged_columns)
            return self.meta_model_.predict(meta_features)


    stacked_averaged_models = StackingAverageModels(base_models=(ENet, GBoost, KRR), meta_model=lasso)
    score = rmse_cv(5, stacked_averaged_models)
    print('Stacking Averaged models score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))

    Stacking Averaged models score: 0.1079 (0.0072)

    Processed: 0.014, SQL: 9