# -*- coding: utf-8 -*-
"""
Created on Tue Jun 30 18:22:31 2020
@author: Administrator
"""
# Parallel grid search
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 30 14:59:19 2020
@author: Administrator
"""
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Load the 20 Newsgroups corpus; subset='all' requests the whole collection
# (close to 20,000 texts according to the original note) and keeps it in `news`.
news = fetch_20newsgroups(subset='all')

# Split the data: only the first 3000 documents are used, 25% of them are
# held out for testing, and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news.data[:3000],
    news.target[:3000],
    test_size=0.25,
    random_state=33,
)
# Building blocks of the model: the support vector classifier, the TF-IDF
# text feature extractor, and the Pipeline that chains them together.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Two-step pipeline: word-level TF-IDF features with English stop words
# removed ('vect'), followed by an SVC with default settings ('svc').
# The step names are what the 'svc__...' grid-search keys refer to.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(analyzer='word', stop_words='english')),
    ('svc', SVC()),
])
# Search grid over two hyperparameters of the 'svc' pipeline step:
#   svc__gamma: 4 values, 10^-2 .. 10^1
#   svc__C:     3 values, 10^-1 .. 10^1
# which gives 4 * 3 = 12 parameter combinations in total.
parameters = dict(
    svc__gamma=np.logspace(-2, 1, num=4),
    svc__C=np.logspace(-1, 1, num=3),
)
# Alternative grids that additionally search over the kernel type:
# parameters={'svc__gamma':np.logspace(-2,1,4),'svc__C':np.logspace(-1,1,3),'svc__kernel':('linear','rbf')}
# parameters={'svc__gamma':np.linspace(0.9,1.1,3),'svc__C':np.linspace(0.9,1.1,3),'svc__kernel':('linear','rbf')}
# Import the grid-search helper GridSearchCV from sklearn.model_selection.
# Grid helpers for reference: np.linspace(start, stop, num) and
# np.logspace(start, stop, num, base=...), where base defaults to 10.
from sklearn.model_selection import GridSearchCV

# The search below runs in parallel worker processes (n_jobs=-1). Keeping it
# behind the main guard ensures that a worker process which re-imports this
# file (spawn-based multiprocessing, e.g. on Windows) does not start a new
# search of its own. When the file is run as a script, `gs` is still a
# module-level name because an `if` block does not open a new scope.
if __name__ == "__main__":
    # Hand the parameter grid (12 combinations), the pipeline and the 3-fold
    # cross-validation requirement to GridSearchCV.
    #   cv=3       -> 3-fold cross-validation
    #   verbose=2  -> print progress for every individual fit
    #   refit=True -> retrain the best configuration on the full training set
    #   n_jobs=-1  -> use all CPUs of this machine
    gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)

    # Run the parallel grid search. fit() returns the fitted search object
    # itself (not a timing), so the return value is not bound to a name.
    gs.fit(X_train, y_train)

    # Best parameter combination and its mean cross-validated score.
    print(gs.best_params_, gs.best_score_)

    # Accuracy of the refitted best model on the held-out test set.
    print(gs.score(X_test, y_test))