GridSearchCV and cross validation
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier  # the classifier
from sklearn.preprocessing import StandardScaler    # standardization
from sklearn.model_selection import train_test_split, GridSearchCV  # data splitting and grid search

# read data
data = pd.read_csv('data/---.csv')
data.head()

# features and target
x = data[['--', '---', '---']]  # x must be two-dimensional
y = data['target--']            # target as a 1-D Series

# split data
# test_size: proportion of data held out for testing
# random_state: any fixed value makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=24)

# standardize (fit on the training set only, then apply the same scaling to the test set)
sc = StandardScaler()
new_x_train = sc.fit_transform(x_train)
new_x_test = sc.transform(x_test)

# machine learning
knn = KNeighborsClassifier()

# GridSearchCV
param_dict = {'n_neighbors': [1, 3, 5]}              # parameters to tune
es = GridSearchCV(knn, param_grid=param_dict, cv=3)  # grid search with 3-fold cross validation
es.fit(new_x_train, y_train)                         # model training

# model assessment (use the standardized test set, matching what the model was trained on)
y_predict = es.predict(new_x_test)    # predicted labels
score = es.score(new_x_test, y_test)  # accuracy
es.best_score_   # best cross-validation score
es.best_params_  # best parameters
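The CSV path and column names above are placeholders, so here is a minimal runnable sketch of the same KNN + GridSearchCV workflow; the built-in iris dataset is assumed purely for illustration.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# load a small labelled dataset (assumption: iris stands in for the real CSV)
iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=24)

# standardize using training statistics only
sc = StandardScaler()
new_x_train = sc.fit_transform(x_train)
new_x_test = sc.transform(x_test)

# grid search over k with 3-fold cross validation
es = GridSearchCV(KNeighborsClassifier(),
                  param_grid={'n_neighbors': [1, 3, 5]}, cv=3)
es.fit(new_x_train, y_train)
print(es.best_params_, es.score(new_x_test, y_test))  # winning k and test accuracy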
sklearn.linear_model.Ridge(alpha=1.0, fit_intercept=True, solver='auto', normalize=False)

from sklearn.linear_model import Ridge, RidgeCV

# ridge = Ridge(alpha=0.1)
ridge = RidgeCV(alphas=(0.001, 0.005, 0.1, 0.5, 1, 10, 20, 50, 100))  # searches the alphas by cross validation
ridge.fit(new_x_train, y_train)

# predict
y_pred = ridge.predict(new_x_test)
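After fitting, the regularization strength that cross validation selected can be read back from the estimator:

ridge.alpha_  # the alpha chosen by RidgeCV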
sklearn.linear_model.LogisticRegression(solver='liblinear', penalty='l2', C=1.0)

from sklearn.linear_model import LogisticRegression

# model training
lr = LogisticRegression()
lr.fit(new_x_train, y_train)

# model assessment
pred = lr.predict(new_x_test)
pred

# output score (accuracy)
lr.score(new_x_test, y_test)

# the class distribution of the test data
y_test.value_counts()

# assessment via the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
tn, fp, fn, tp

pr = tp / (tp + fp)      # precision
recall = tp / (tp + fn)  # recall

# APIs for calculating precision and recall
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, pred)  # labels must be binary (0/1) by default
recall_score(y_test, pred)

# calculate F1 by hand; the two formulas below are algebraically identical
f1 = 2 * tp / (2 * tp + fn + fp)
f1
f12 = 2 * pr * recall / (pr + recall)
f12

# f1_score
from sklearn.metrics import f1_score
f1_score(y_test, pred)

# classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test, pred, labels=(1, 0), target_names=("good", "bad")))
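precision_score and recall_score default to binary averaging, which is why the labels must be 0/1 above; for a multiclass target an averaging strategy has to be passed explicitly, as in this sketch:

precision_score(y_test, pred, average='macro')  # unweighted mean over classes
recall_score(y_test, pred, average='weighted')  # mean weighted by class support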
RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# init the estimator
rf = RandomForestClassifier()

# parameter grid
param = {"n_estimators": [120, 200, 300, 500, 800, 1200],
         "max_depth": [5, 8, 15, 25, 30]}

# define the grid search with 3-fold cross validation
gs = GridSearchCV(rf, param_grid=param, cv=3)
gs.fit(new_x_train, y_train)
gs.score(new_x_test, y_test)
gs.best_params_
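The tuned forest behind the grid search can be inspected afterwards; with the default refit=True, best_estimator_ is the model refit on the full training set with the winning parameters:

best_rf = gs.best_estimator_  # the refitted RandomForestClassifier
best_rf.feature_importances_  # impurity-based importance of each feature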
AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=1000, learning_rate=0.01)
ada.fit(new_x_train, y_train)
ada.score(new_x_test, y_test)

KMeans

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# model training
km = KMeans(n_clusters=8, random_state=24)
km.fit(data)
y_predict = km.predict(data)
km.cluster_centers_  # coordinates of the cluster centers

# silhouette coefficient: clustering quality in [-1, 1], higher is better
silhouette_score(data, y_predict)
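n_clusters=8 above is a fixed guess; a common way to choose it is to compare silhouette scores over a range of k, as in this sketch (assuming `data` holds only numeric features):

for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=24)
    labels = km.fit_predict(data)             # cluster assignment for each sample
    print(k, silhouette_score(data, labels))  # higher silhouette = better-separated clusters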
VarianceThreshold

Feature selector that removes all low-variance features.
from sklearn.feature_selection import VarianceThreshold

# drop every feature whose variance is below the threshold
vt = VarianceThreshold(threshold=1)
new_x = vt.fit_transform(x)
new_x.shape
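To see which columns survived the threshold, the fitted selector exposes a boolean mask:

vt.get_support()  # True for each feature whose variance exceeds the threshold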
Principal component analysis (PCA)

from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # keep 2 components; a float in (0, 1) would instead keep that fraction of the variance
pca_x = pca.fit_transform(new_x)
pca_x.shape
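How much information the two components retain can be checked with the standard explained-variance attribute:

pca.explained_variance_ratio_        # fraction of variance carried by each component
pca.explained_variance_ratio_.sum()  # total fraction of variance retained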