Data Preprocessing
# Fill missing values
import re
import jieba

data = data.fillna('Unknown')
print(data.isnull().sum())

# Drop duplicate records
data.drop_duplicates(inplace=True)

# Clean the comment text: remove special symbols with a regular expression
pattern = r"[!\"#$%&'()*+,-./:;<=>?@[\\\]^_^{|}~—!,。?、¥…():【】《》‘’“”\s]+"
re_obj = re.compile(pattern)

def clear(text):
    return re_obj.sub("", text)

data['comment'] = data['comment'].apply(clear)

# Word segmentation with jieba
def cut_word(text):
    return jieba.lcut(text)

data["comment"] = data["comment"].apply(cut_word)

# Stopword removal using the HIT stopword list
def get_stopword():
    s = set()
    with open("hit_stopwords.txt", encoding="UTF-8") as f:
        for line in f:
            s.add(line.strip())
    return s

def remove_stopword(words):
    return [word for word in words if word not in stopword]

stopword = get_stopword()
data["comment"] = data["comment"].apply(remove_stopword)
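Before applying the cleaning, segmentation and stopword steps to the whole DataFrame, it can help to check them on a single string. The sketch below is self-contained: the sample sentence and the two-word stopword set are invented for illustration and only stand in for hit_stopwords.txt.

import re
import jieba

pattern = r"[!\"#$%&'()*+,-./:;<=>?@[\\\]^_^{|}~—!,。?、¥…():【】《》‘’“”\s]+"
re_obj = re.compile(pattern)

sample = "这部电影真的太好看了!!!强烈推荐~~~"    # made-up comment, not from the dataset
cleaned = re_obj.sub("", sample)                    # strip punctuation and whitespace
tokens = jieba.lcut(cleaned)                        # segment into a list of words
demo_stopwords = {"的", "了"}                       # tiny stand-in for hit_stopwords.txt
tokens = [w for w in tokens if w not in demo_stopwords]
print(tokens)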
Exploratory Analysis

# City analysis
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']   # Chinese-capable font for city names and words

crank = data.groupby("city")["name"].count().reset_index().sort_values(by='name', ascending=False)[:10]

fig, ax = plt.subplots(1, 2)     # one row, two columns
fig.set_size_inches(15, 5)       # figure size

count = data["city"].value_counts()
top = count.iloc[:10]
bottom = count.iloc[-10:]
for index, d, title in zip(range(2), [top, bottom], ["Top 10 cities", "Bottom 10 cities"]):
    a = sns.barplot(x=d.index, y=d.values, ax=ax[index])
    a.set_xticklabels(a.get_xticklabels(), rotation=45)   # rotate tick labels 45 degrees
    a.set_title(title)

# Score analysis
score = data.groupby("score")["name"].count().reset_index()
score.columns = ["score", "count"]
sns.countplot(x="score", data=data, log=True)

# The 15 most frequent words
from itertools import chain
from collections import Counter

li_2d = data['comment'].tolist()
# Flatten the list of token lists into a single list of tokens
li_1d = list(chain.from_iterable(li_2d))
print(f'Total tokens: {len(li_1d)}')
c = Counter(li_1d)
print(f'Unique tokens: {len(c)}')
common = c.most_common(15)

d = dict(common)
plt.figure(figsize=(15, 5))
sns.barplot(x=list(d.keys()), y=list(d.values()))
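The flatten-and-count step above is the core of the word-frequency analysis; a toy example with made-up token lists shows exactly what chain.from_iterable and Counter.most_common return.

from itertools import chain
from collections import Counter

# Made-up token lists standing in for data['comment']
li_2d = [["好看", "推荐"], ["好看", "一般"], ["推荐", "好看"]]

li_1d = list(chain.from_iterable(li_2d))   # ['好看', '推荐', '好看', '一般', '推荐', '好看']
c = Counter(li_1d)
print(c.most_common(2))                    # [('好看', 3), ('推荐', 2)]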
Modeling Analysis

# Join the token lists back into space-separated strings so they can be vectorized
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def join(text_list):
    return " ".join(text_list)

data['comment'] = data['comment'].apply(join)

# Construct the target column: 2 = positive (score >= 4.5), 1 = neutral (3 <= score < 4.5), 0 = negative
data['target'] = np.where(data['score'] >= 4.5, 2, np.where(data['score'] >= 3, 1, 0))
data['target'].value_counts()

# Downsample: shrink the positive class to the size of the neutral class
p = data[data['target'] == 2]
m = data[data['target'] == 1]
n = data[data['target'] == 0]
p = p.sample(len(m))
m = m.sample(len(m))
data2 = pd.concat([p, m, n], axis=0)
data2['target'].value_counts()

# Build the training and test sets
X = data2['comment']
y = data2['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print('Training samples:', X_train.shape[0], 'Test samples:', X_test.shape[0])

# Feature extraction: TF-IDF over unigrams and bigrams
vec = TfidfVectorizer(ngram_range=(1, 2), max_df=0.5, min_df=1)
X_train_trans = vec.fit_transform(X_train)
# Fit the vectorizer on the training data only, then transform the test data with the same vocabulary
X_test_trans = vec.transform(X_test)
display(X_train_trans, X_test_trans)
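To get a feel for what the ngram_range=(1, 2) TF-IDF features look like, here is a tiny sketch on two invented, already-segmented comments; the sentences are made up and only the vectorizer setting mirrors the code above.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["电影 好看 推荐", "电影 一般 无聊"]        # two invented, space-joined comments

vec = TfidfVectorizer(ngram_range=(1, 2))          # unigrams plus adjacent word pairs
X = vec.fit_transform(docs)

print(vec.get_feature_names_out())   # 5 unigrams and 4 bigrams such as '电影 好看'
print(X.shape)                       # (2, 9)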
Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# One-vs-rest logistic regression with balanced class weights and the SAG solver
lr = LogisticRegression(class_weight='balanced', multi_class='ovr', solver='sag')
lr.fit(X_train_trans, y_train)
y_hat = lr.predict(X_test_trans)
print(classification_report(y_test, y_hat))
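The per-class report can be complemented with a confusion matrix, which shows which of the three classes get mixed up with each other. A minimal sketch, assuming the fitted lr model and the y_test / y_hat arrays from above:

from sklearn.metrics import confusion_matrix

# Rows are true labels (0, 1, 2), columns are predicted labels
cm = confusion_matrix(y_test, y_hat, labels=[0, 1, 2])
print(cm)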
Naive Bayes

from sklearn.naive_bayes import ComplementNB

# Complement Naive Bayes, a multinomial NB variant that is more robust to class imbalance
cnb = ComplementNB()
cnb.fit(X_train_trans, y_train)
y_hat = cnb.predict(X_test_trans)
print(classification_report(y_test, y_hat))
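Finally, either fitted model can score a brand-new comment by pushing it through the same pipeline: clean, segment, remove stopwords, join, vectorize, predict. A sketch assuming the clear, remove_stopword, vec and lr objects defined earlier are still in scope; the example sentence is invented.

import jieba

new_comment = "剧情紧凑,演员演技在线,值得一看"     # invented example comment
tokens = remove_stopword(jieba.lcut(clear(new_comment)))
features = vec.transform([" ".join(tokens)])        # reuse the TF-IDF vectorizer fitted on X_train
print(lr.predict(features))                         # predicted class: 0, 1 or 2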