# 探索性分析 (Exploratory Data Analysis)
# ---------------------------------------------------------------
# Exploratory analysis of the red-wine dataset.
# Expects names defined earlier in the file: df (pandas DataFrame
# with 'total acid', 'fixed acidity', 'pH', 'quality' columns —
# assumed, confirm against the loading code), plt, sns, np.
# ---------------------------------------------------------------

# Distribution of fixed acidity as a share of total acidity.
plt.figure(figsize=(8, 5))
plt.suptitle('固定酸占总酸比分布情况', y=1.02, fontsize=16)  # figure title
# Index with a LIST, not a set: sets are unordered and are rejected as
# DataFrame indexers by modern pandas. .copy() avoids the
# SettingWithCopyWarning when the new column is assigned below.
temp = df[['total acid', 'fixed acidity']].copy()
# Vectorized division replaces the row-wise apply() — same values, faster.
temp['precent'] = temp['fixed acidity'] / temp['total acid']
temp['precent'].hist(bins=100)
plt.xlabel('红葡萄酒固定酸占比', fontsize=12)
plt.ylabel('频数', fontsize=12)

# Effect of the fixed-acid share on the quality score.
plt.figure(figsize=(8, 4))
plt.suptitle('固定酸占总酸比对评分的影响', y=1.02, fontsize=16)  # figure title
temp = df[['total acid', 'fixed acidity', 'quality']].copy()
temp['precent'] = temp['fixed acidity'] / temp['total acid']
sns.boxplot(x=temp['quality'], y=temp['precent'])
plt.xlabel('红葡萄酒评分', fontsize=12)
plt.ylabel('固定酸占比', fontsize=12)

# Effect of pH on the quality score.
plt.figure(figsize=(7, 5))
plt.suptitle('pH值对评分的影响', y=1.02, fontsize=16)  # figure title
temp = df[['pH', 'quality']]
sns.boxplot(x=temp['quality'], y=temp['pH'])
plt.xlabel('红葡萄酒评分', fontsize=12)
plt.ylabel('pH值', fontsize=12)

# Correlation heatmap of all variables (lower triangle only).
plt.figure(figsize=(10, 8))
mcorr = df.corr()  # df[df.columns.tolist()] is just df
# np.bool was removed in NumPy >= 1.24; the builtin bool dtype is correct.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.title('红葡萄酒各变量间热力相关图')

# Distribution of wine quality scores.
# groupby(...).size() counts rows per quality level — same result as the
# original agg({'fixed acidity': len}), but intent is explicit.
score = df.groupby('quality').size().reset_index(name='count')
sns.barplot(x='quality', y='count', data=score,
            palette='rocket').set_title('葡萄酒质量分布')
plt.show()

# ---- 建模 (Modeling) ----
# ---------------------------------------------------------------
# Modeling: binary "good wine" classification with KNN.
# Expects df (the wine DataFrame, first 11 columns are features)
# and np to be defined earlier in the file.
# ---------------------------------------------------------------
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Label: a wine is "good" (1) iff its quality score is >= 6.
df["GoodWine"] = df.quality.apply(lambda x: 1 if x >= 6 else 0)

# Feature matrix (first 11 physico-chemical columns) and target vector.
X = np.array(df[df.columns[:11]])
y = np.array(df.GoodWine)

# Shuffle and split. random_state makes the run reproducible;
# stratify keeps the class ratio identical in train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize using statistics fit on the TRAINING set only, so no
# test-set information leaks into the model.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline model with k = 3.
clf = KNeighborsClassifier(3).fit(X_train, y_train)
print("训练集正确率:{}%".format(round(clf.score(X_train, y_train) * 100, 2)))

cv_mean = cross_val_score(clf, X_train, y_train, cv=5).mean()
print("交叉验证正确率:{}%".format(round(cv_mean * 100, 2)))

# Search k in [1, 99], scoring each with 5-fold CV on the training set.
ks = range(1, 100)
in_sample_scores = []
cross_validation_scores = []
d = {}  # k -> mean cross-validation accuracy
for k in ks:
    clf = KNeighborsClassifier(k).fit(X_train, y_train)
    in_sample_scores.append(clf.score(X_train, y_train))
    mean_cv = cross_val_score(clf, X_train, y_train, cv=5).mean()
    cross_validation_scores.append(mean_cv)
    d[k] = mean_cv

# Train accuracy vs. CV accuracy across k (classic bias/variance view).
plt.plot(ks, in_sample_scores)
plt.plot(ks, cross_validation_scores)
plt.legend(["train正确率", "cv正确率"])
plt.show()

# Pick the k with the highest CV accuracy. max(d, key=d.get) returns the
# first key attaining the maximum — same tie-breaking as the original
# stable descending sort.
best_k = max(d, key=d.get)
print("最优的k值:{}".format(best_k))

# Refit with the best k and evaluate once on the held-out test set.
clf = KNeighborsClassifier(best_k).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print("测试集正确率:{}%".format(round(clf.score(X_test, y_test) * 100, 2)))

# Per-class precision / recall / F1 report.
print(classification_report(y_test, y_test_pred))