Contents:
1. Exploring the Data
   1.1 Examining the Label
   1.2 Features vs. the Label
   1.3 Correlations
   1.4 Missing Values
2. Data Processing
   2.1 Removing Outliers
   2.2 Normality Transform: the Log Transform log(1+x)
   2.3 Filling Missing Values
   2.4 Label Encoding
   2.5 Normality Transform: the Box-Cox Transform
3. Ensemble Modeling
   3.1 Single Models
   3.2 Averaging Model
   3.3 Stacking Model
1. Exploring the Data
1.1 Examining the Label
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

df_train = pd.read_csv(r'data\train.csv')
print(df_train.shape)
df_train['SalePrice'].describe()
(1460, 81)

The training set has 1460 rows and 81 columns. Next, check the label's skewness and kurtosis:
print('Skewness: %f' % df_train['SalePrice'].skew())
print('Kurtosis: %f' % df_train['SalePrice'].kurt())

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.distplot(df_train['SalePrice'])
Skewness: 1.882876
Kurtosis: 6.536282
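For orientation: skewness measures asymmetry and excess kurtosis measures tail weight, and both are near 0 for a normal distribution, so these values signal a long, heavy right tail. A quick sketch recomputing them with scipy (assuming df_train is loaded; scipy's default estimators differ from pandas' bias-corrected ones by a negligible amount at this sample size):

from scipy.stats import skew, kurtosis
print(skew(df_train['SalePrice']))      # ~1.88: strong right skew
print(kurtosis(df_train['SalePrice']))  # ~6.5: excess kurtosis, heavy tails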
1.2 Features vs. the Label
data = pd.concat([df_train['SalePrice'], df_train['GrLivArea']], axis=1)
data.plot.scatter(x='GrLivArea', y='SalePrice')

data = pd.concat([df_train['SalePrice'], df_train['TotalBsmtSF']], axis=1)
data.plot.scatter(x='TotalBsmtSF', y='SalePrice')

data = df_train[['SalePrice', 'OverallQual']]
plt.subplots(figsize=(8, 6))
sns.boxplot(x='OverallQual', y='SalePrice', data=data)

data = df_train[['Neighborhood', 'SalePrice']]
plt.subplots(figsize=(10, 6))
sns.boxplot(x='Neighborhood', y='SalePrice', data=data)
plt.xticks(rotation=60);
1.3 Correlations
corrmat = df_train.corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, cmap='Greens');

cols = corrmat.nlargest(10, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', cmap='Blues',
            annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)

cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(df_train[cols], size=3);
1.4 Missing Values
total_missing = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / len(df_train)).sort_values(ascending=False).round(3)
missing_data = pd.concat([total_missing, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()
2. Data Processing
2.1 Removing Outliers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')
print('The train data size before dropping Id feature is: {}'.format(train.shape))
print('The test data size before dropping Id feature is: {}'.format(test.shape))

# Keep the Id columns for later, then drop them as features
train_ID = train['Id']
test_ID = test['Id']
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

plt.figure(figsize=(8, 6))
plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.xlabel('GrLivArea', fontsize=12)
plt.ylabel('SalePrice', fontsize=12);

# Drop the two points with very large living area but abnormally low price
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

plt.figure(figsize=(8, 6))
plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.xlabel('GrLivArea', fontsize=12)
plt.ylabel('SalePrice', fontsize=12);
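As an aside, the drop-by-index idiom above generalizes: df.drop(df[mask].index) removes exactly the rows where the mask is True. A toy sketch with made-up numbers:

toy = pd.DataFrame({'area': [1500, 5000, 2000], 'price': [200000, 150000, 400000]})
# Row 1 mimics the outliers: huge area, suspiciously low price
toy = toy.drop(toy[(toy['area'] > 4000) & (toy['price'] < 300000)].index)
print(toy)  # rows 0 and 2 remain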
2.2 Normality Transform: the Log Transform log(1+x)
from scipy.stats import norm
from scipy import stats

plt.figure(figsize=(8, 6))
sns.distplot(train['SalePrice'], fit=norm)
(mu, sigma) = norm.fit(train['SalePrice'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure(figsize=(8, 6))
stats.probplot(train['SalePrice'], plot=plt);

# Apply log(1 + x) to pull in the long right tail
train['SalePrice'] = np.log1p(train['SalePrice'])
(mu, sigma) = norm.fit(train['SalePrice'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

plt.figure(figsize=(8, 6))
sns.distplot(train['SalePrice'], fit=norm)
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

plt.figure(figsize=(8, 6))
stats.probplot(train['SalePrice'], plot=plt);
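One consequence to keep in mind for the modeling sections: the models now learn and predict on the log scale, and np.expm1 is the exact inverse of np.log1p. A minimal sketch of the round trip:

prices = np.array([100000.0, 250000.0])
assert np.allclose(np.expm1(np.log1p(prices)), prices)  # exact inverse, back to dollars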
2.3 Filling Missing Values
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values

# Stack train and test so all transformations are applied consistently
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print('all_data size is: {}'.format(all_data.shape))

all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:20]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data

plt.figure(figsize=(12, 8))
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features')
plt.ylabel('Percent of missing values')
plt.xticks(rotation=90)
plt.title('Percent missing data by feature');
# For these features, NaN means the house simply lacks the amenity
all_data['PoolQC'] = all_data['PoolQC'].fillna('None')
all_data['MiscFeature'] = all_data['MiscFeature'].fillna('None')
all_data['Alley'] = all_data['Alley'].fillna('None')
all_data['Fence'] = all_data['Fence'].fillna('None')
all_data['FireplaceQu'] = all_data['FireplaceQu'].fillna('None')

# Fill LotFrontage with the median frontage of the house's neighborhood
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')
all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)

# Near-complete categorical columns get their most frequent value
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
all_data['Functional'] = all_data['Functional'].fillna(all_data['Functional'].mode()[0])
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna('None')

# Utilities is nearly constant across records, so it carries no signal
all_data = all_data.drop('Utilities', axis=1)

all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head()
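The LotFrontage fill is the only non-trivial one: groupby plus transform replaces each missing value with the median of its own neighborhood rather than a global median. A toy sketch of the same idiom (column names invented for illustration):

toy = pd.DataFrame({'hood': ['A', 'A', 'B', 'B'],
                    'front': [60.0, np.nan, 80.0, np.nan]})
toy['front'] = toy.groupby('hood')['front'].transform(lambda x: x.fillna(x.median()))
print(toy)  # group A's NaN becomes 60.0, group B's becomes 80.0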
2.4 Label Encoding
from sklearn.preprocessing import LabelEncoder

# Convert ordinal-looking categorical columns to integer codes
cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual',
        'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2',
        'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape',
        'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for c in cols:
    encode = LabelEncoder()
    encode.fit(list(all_data[c].values))
    all_data[c] = encode.transform(list(all_data[c].values))

# Combined square footage: basement plus first and second floors
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
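For reference, LabelEncoder maps each distinct value to an integer code in sorted order of the classes; a tiny sketch:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
print(le.fit_transform(['Gd', 'Ex', 'Fa', 'Gd']))  # [2 0 1 2]; le.classes_ is ['Ex' 'Fa' 'Gd']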
2.5 Normality Transform: the Box-Cox Transform
from scipy.stats import norm, skew

numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print('Skew in numerical features:')
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)

# Select the rows whose skew exceeds the threshold (masking the whole frame
# with abs(skewness) > 0.75 would keep every row and just fill NaNs instead)
skewness = skewness[abs(skewness['Skew']) > 0.75]
print('There are {} skewed numerical features to Box Cox transform'.format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
all_data.head()
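As a check on what boxcox1p computes: for lam != 0 it is ((1 + x)**lam - 1) / lam, and in the lam -> 0 limit it reduces to log1p, so lam = 0.15 is a mildly log-like transform. A short sketch:

x = np.array([0.0, 1.0, 10.0])
assert np.allclose(boxcox1p(x, 0.15), ((1 + x)**0.15 - 1) / 0.15)
assert np.allclose(boxcox1p(x, 0.0), np.log1p(x))  # the lam = 0 limit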
3. Ensemble Modeling
all_data = pd.get_dummies(all_data)
print(all_data.shape)

train = all_data[:ntrain]
test = all_data[ntrain:]

(2917, 220)
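The jump in width comes from pd.get_dummies expanding every remaining object column into one indicator column per category; a toy sketch:

print(pd.get_dummies(pd.DataFrame({'Street': ['Pave', 'Grvl', 'Pave']})))
# -> columns Street_Grvl and Street_Pave, one indicator per category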
3.1 Single Models
from sklearn.model_selection import KFold, cross_val_score

def rmse_cv(n_folds, model):
    # Pass the KFold object itself rather than get_n_splits() (which just
    # returns the integer n_folds), so shuffle and random_state take effect
    kf = KFold(n_folds, shuffle=True, random_state=42)
    # The scorer returns negative MSE, so negate before the square root
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring='neg_mean_squared_error', cv=kf))
    return rmse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, ElasticNet

# Scale the linear models with RobustScaler to soften any remaining outliers
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=42))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42))

from sklearn.kernel_ridge import KernelRidge
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

from sklearn.ensemble import GradientBoostingRegressor
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4,
                                   max_features='sqrt', min_samples_leaf=15, loss='huber',
                                   min_samples_split=10, random_state=42)

import xgboost as xgb
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
                             max_depth=3, min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213,
                             silent=1, nthread=-1)
score = rmse_cv(5, lasso)
print('Lasso score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
score = rmse_cv(5, ENet)
print('ElasticNet score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
score = rmse_cv(5, KRR)
print('Kernel Ridge score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
score = rmse_cv(5, GBoost)
print('Gradient Boosting score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
score = rmse_cv(5, model_xgb)
print('Xgboost score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
Lasso score: 0.1116 (0.0072)
ElasticNet score: 0.1116 (0.0072)
Kernel Ridge score: 0.1153 (0.0071)
Gradient Boosting score: 0.1161 (0.0067)
Xgboost score: 0.1168 (0.0072)
3.2 Averaging Model
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        # Fit clones so the estimators passed in stay untouched
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        # Average the base models' predictions column-wise
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)

averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))
score = rmse_cv(5, averaged_models)
print('Averaged base models score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
Averaged base models score: 0.1083 (0.0069)
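Why does a plain average beat every single model? As long as the base models' errors are not perfectly correlated, the average's error variance is smaller than each member's. A small synthetic sketch (illustrative numbers only):

rng = np.random.default_rng(0)
e1 = rng.normal(size=100000)                              # model 1 errors, std 1
e2 = 0.5 * e1 + np.sqrt(0.75) * rng.normal(size=100000)   # model 2 errors, std 1, rho = 0.5
print(np.std((e1 + e2) / 2))  # ~0.87 < 1.0: averaging shrinks the error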
3.3 Stacking Model
class StackingAverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)

        # Train cloned base models fold by fold and collect their
        # out-of-fold predictions as training features for the meta model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # The meta model learns how to combine the base models' predictions
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        # For each base model, average its per-fold clones' predictions,
        # then feed those columns into the trained meta model
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

stacked_averaged_models = StackingAverageModels(base_models=(ENet, GBoost, KRR), meta_model=lasso)
score = rmse_cv(5, stacked_averaged_models)
print('Stacking Averaged models score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
Stacking Averaged models score: 0.1079 (0.0072)
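To put the winner to use (not shown in the original run above), a minimal sketch: refit on the full training set and undo the log1p transform from section 2.2 with np.expm1 before reading the predictions as prices:

stacked_averaged_models.fit(train.values, y_train)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))  # back to the dollar scale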