Automatic hyperparameter tuning with Bayesian optimization


    Quickly tuning XGBoost with Bayesian optimization. The helpers below print train/test classification metrics, plot the ROC curve, plot the K-S curve, and run the Bayesian search over the XGBoost hyperparameters.
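    For orientation: the bayes_opt package's BayesianOptimization maximizes an arbitrary black-box function over bounded parameters, which is all the tuner below relies on. A minimal sketch, with a made-up quadratic objective (black_box is illustrative, not from this post):

    from bayes_opt import BayesianOptimization

    # Toy objective: bayes_opt MAXIMIZES whatever this returns
    def black_box(x, y):
        return -x ** 2 - (y - 1) ** 2 + 1

    opt = BayesianOptimization(f=black_box,
                               pbounds={'x': (-2, 2), 'y': (-3, 3)},
                               random_state=1)
    opt.maximize(init_points=5, n_iter=10)  # 5 random probes, then 10 guided steps
    print(opt.max)  # best target value and the params that produced it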

    from sklearn.metrics import roc_curve, auc, recall_score, accuracy_score, roc_auc_score, precision_score, f1_score
    import matplotlib.pyplot as plt

    def model_metrics(clf, X_train, X_test, y_train, y_test):
        # Predictions
        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)
        y_train_proba = clf.predict_proba(X_train)[:, 1]
        y_test_proba = clf.predict_proba(X_test)[:, 1]
        # Accuracy
        print('[accuracy]', end=' ')
        print('train:', '%.4f' % accuracy_score(y_train, y_train_pred), end=' ')
        print('test:', '%.4f' % accuracy_score(y_test, y_test_pred))
        # Precision
        print('[precision]', end=' ')
        print('train:', '%.4f' % precision_score(y_train, y_train_pred), end=' ')
        print('test:', '%.4f' % precision_score(y_test, y_test_pred))
        # Recall
        print('[recall]', end=' ')
        print('train:', '%.4f' % recall_score(y_train, y_train_pred), end=' ')
        print('test:', '%.4f' % recall_score(y_test, y_test_pred))
        # F1-score
        print('[f1-score]', end=' ')
        print('train:', '%.4f' % f1_score(y_train, y_train_pred), end=' ')
        print('test:', '%.4f' % f1_score(y_test, y_test_pred))
        # AUC: either roc_auc_score or auc works
        print('[auc]', end=' ')
        print('train:', '%.4f' % roc_auc_score(y_train, y_train_proba), end=' ')
        print('test:', '%.4f' % roc_auc_score(y_test, y_test_proba))
        # ROC curve
        fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_proba, pos_label=1)
        fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_proba, pos_label=1)
        label = ["Train - AUC:{:.4f}".format(auc(fpr_train, tpr_train)),
                 "Test - AUC:{:.4f}".format(auc(fpr_test, tpr_test))]
        plt.plot(fpr_train, tpr_train)
        plt.plot(fpr_test, tpr_test)
        plt.plot([0, 1], [0, 1], 'd--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(label, loc=4)
        plt.title("ROC curve")
        plt.show()

    import numpy as np
    import pandas as pd

    def plot_ks(y_test, y_score, positive_flag):
        # Re-index y_test (and y_score if needed)
        y_test.index = np.arange(len(y_test))
        # y_score.index = np.arange(len(y_score))
        # Build the working data set
        target_data = pd.DataFrame({'y_test': y_test, 'y_score': y_score})
        # Sort by y_score in descending order
        target_data.sort_values(by='y_score', ascending=False, inplace=True)
        # Custom quantile points
        cuts = np.arange(0.1, 1, 0.1)
        # Score value at each quantile point
        index = len(target_data.y_score) * cuts
        scores = target_data.y_score.iloc[index.astype('int')]
        # For each score cutoff, compute sensitivity and specificity
        Sensitivity = []
        Specificity = []
        for score in scores:
            # Positives captured above the cutoff vs. all actual positives
            positive_recall = target_data.loc[(target_data.y_test == positive_flag) & (target_data.y_score > score), :].shape[0]
            positive = sum(target_data.y_test == positive_flag)
            # Negatives captured at or below the cutoff vs. all actual negatives
            negative_recall = target_data.loc[(target_data.y_test != positive_flag) & (target_data.y_score <= score), :].shape[0]
            negative = sum(target_data.y_test != positive_flag)
            Sensitivity.append(positive_recall / positive)
            Specificity.append(negative_recall / negative)
        # Build the plotting data
        plot_data = pd.DataFrame({'cuts': cuts,
                                  'y1': 1 - np.array(Specificity),
                                  'y2': np.array(Sensitivity),
                                  'ks': np.array(Sensitivity) - (1 - np.array(Specificity))})
        # Index of the largest gap between Sensitivity and 1-Specificity
        max_ks_index = np.argmax(plot_data.ks)
        # 1-Specificity is the FPR; Sensitivity is the TPR
        plt.plot([0] + cuts.tolist() + [1], [0] + plot_data.y1.tolist() + [1], label='1-Specificity (FPR)')
        plt.plot([0] + cuts.tolist() + [1], [0] + plot_data.y2.tolist() + [1], label='Sensitivity (TPR)')
        # Reference line at the maximum KS
        plt.vlines(plot_data.cuts[max_ks_index], ymin=plot_data.y1[max_ks_index],
                   ymax=plot_data.y2[max_ks_index], linestyles='--')
        # Annotate the KS value
        plt.text(x=plot_data.cuts[max_ks_index] + 0.01,
                 y=plot_data.y1[max_ks_index] + plot_data.ks[max_ks_index] / 2,
                 s='KS= %.2f' % plot_data.ks[max_ks_index])
        # Show legend and figure
        plt.legend()
        plt.show()

    # # Example: draw the K-S curve
    # plot_ks(y_test=vali_y, y_score=y_pred, positive_flag=1)
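    Both helpers can be smoke-tested without XGBoost. A minimal sketch, assuming a synthetic make_classification dataset and a LogisticRegression stand-in (neither appears in the original code):

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    import pandas as pd

    X, y = make_classification(n_samples=2000, n_features=10, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    model_metrics(clf, X_train, X_test, y_train, y_test)

    # plot_ks re-indexes y_test, so pass pandas Series rather than bare arrays
    plot_ks(pd.Series(y_test), pd.Series(clf.predict_proba(X_test)[:, 1]), positive_flag=1)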
    import pandas as pd
    import xgboost as xgb
    from sklearn.model_selection import cross_val_score
    from bayes_opt import BayesianOptimization

    def XGB_predict(X_train, y_train, X_test, y_test):
        # Objective for the Bayesian search: mean cross-validated AUC.
        # The signature must name every key of the search space below.
        # Note: cross_val_score cannot forward early-stopping fit parameters,
        # so the AUC metric is handled via scoring='roc_auc' instead.
        def xgb_cv(eta, gamma, max_depth, n_estimators):
            val = cross_val_score(xgb.XGBClassifier(objective='binary:logistic',
                                                    learning_rate=max(eta, 0),
                                                    gamma=max(gamma, 0),
                                                    max_depth=int(max_depth),
                                                    n_estimators=int(n_estimators),
                                                    n_jobs=-1),
                                  X=X_train, y=y_train, scoring='roc_auc', cv=5).mean()
            return val

        opt = BayesianOptimization(xgb_cv, {'eta': (0.001, 0.1),
                                            'gamma': (0, 1),
                                            'max_depth': (1, 15),
                                            'n_estimators': (40, 80)})
        opt.maximize()
        params = opt.max['params']
        params.update({'max_depth': int(params['max_depth']),
                       'objective': 'binary:logistic'})
        # n_estimators maps to num_boost_round in the native API
        num_boost_round = int(params.pop('n_estimators'))
        # xgb_val = xgb.DMatrix(X_test, label=y_test)
        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_model = xgb.train(params, xgb_train, num_boost_round=num_boost_round)
        # Plots: the native Booster has no predict_proba, so refit a
        # sklearn-style classifier with the tuned parameters for the charts
        clf = xgb.XGBClassifier(objective='binary:logistic',
                                learning_rate=params['eta'],
                                gamma=params['gamma'],
                                max_depth=params['max_depth'],
                                n_estimators=num_boost_round,
                                n_jobs=-1)
        clf.fit(X_train, y_train)
        model_metrics(clf, X_train, X_test, y_train, y_test)
        y_score = pd.Series(clf.predict_proba(X_test)[:, 1])
        plot_ks(y_test, y_score, 1)
        return xgb_model, params
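    Putting it together, a sketch of an end-to-end run; the synthetic data and variable names here are placeholders rather than the post's original dataset. y_test is passed as a pandas Series because plot_ks re-indexes it inside XGB_predict:

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    import pandas as pd

    X, y = make_classification(n_samples=2000, n_features=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    booster, best_params = XGB_predict(X_train, y_train, X_test, pd.Series(y_test))
    print('best params:', best_params)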