Use jieba to segment the articles (here we iterate over all the files in a folder)
def segment():
    """word segment"""
    for txt in os.listdir(base_path):
        whole_base = os.path.join(base_path, txt)
        whole_seg = os.path.join(seg_path, txt)
        with codecs.open(whole_base, 'r', 'utf-8') as fr:
            fw = codecs.open(whole_seg, 'w', 'utf-8')
            for line in fr.readlines():
                # seg_list = jieba.cut(line.strip())
                seg_list = jieba.analyse.extract_tags(line.strip(), topK=20, withWeight=False, allowPOS=())
                # for item in seg_list:
                #     # item[0] is the keyword and item[1] its weight
                #     # (set withWeight=True to get the weights back)
                #     print(item[0], item[1])
                fw.write(" ".join(seg_list))
            fw.close()
jieba.analyse.extract_tags(line.strip(), topK=20, withWeight=False, allowPOS=())
The first argument, line.strip(), is the text to extract keywords from.
The second argument, topK, is the number of keywords to return, sorted by importance from high to low.
The third argument, withWeight, controls whether each keyword's weight is returned along with it.
The fourth argument, allowPOS, filters by part of speech; an empty value means no filtering, otherwise only keywords whose part of speech matches are returned.
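As a quick illustration of withWeight, here is a minimal, self-contained sketch (the sample sentence is made up) that returns each keyword together with its weight:

import jieba.analyse

text = "自然语言处理是人工智能领域的一个重要方向"  # made-up sample sentence
# With withWeight=True, extract_tags returns (keyword, weight) pairs
for keyword, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True, allowPOS=()):
    print(keyword, weight)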
Read the segmented articles and store them in lists
def read_doc_list():
    trade_list = []
    doc_list = []
    for txt in os.listdir(seg_path):
        trade_list.append(txt.split(".")[0])
        with codecs.open(os.path.join(seg_path, txt), "r", "utf-8") as fr:
            doc_list.append(fr.read().replace('\n', ''))
    return trade_list, doc_list
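A quick way to sanity-check the two lists (this assumes segment() has already been run so that seg_path is populated):

trade_list, doc_list = read_doc_list()
print(trade_list[:3])    # first few file names with the extension stripped
print(doc_list[0][:50])  # first 50 characters of the first segmented document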
Compute the TF-IDF values of the keywords
def tfidf_top(trade_list, doc_list, max_df, topn):
    vectorizer = TfidfVectorizer(max_df=max_df)
    matrix = vectorizer.fit_transform(doc_list)
    feature_dict = {v: k for k, v in vectorizer.vocabulary_.items()}  # index -> feature_name
    top_n_matrix = np.argsort(-matrix.todense())[:, :topn]  # top tf-idf words for each row
    df = pd.DataFrame(np.vectorize(feature_dict.get)(top_n_matrix), index=trade_list)  # convert matrix to df
    return df
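To see what the argsort trick in tfidf_top does, here is a small self-contained sketch on a made-up English toy corpus (the real code runs on the segmented Chinese documents):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apple banana apple", "banana cherry", "cherry apple cherry cherry"]  # toy corpus

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(docs)

# vocabulary_ maps feature_name -> column index; invert it to map column index -> feature_name
feature_dict = {v: k for k, v in vectorizer.vocabulary_.items()}

# Negating the dense matrix before argsort orders each row's columns
# from highest to lowest tf-idf score; keep only the top 2 per row
top_n = np.argsort(-matrix.todense())[:, :2]
print(np.vectorize(feature_dict.get)(top_n))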
max_df is used to drop terms that appear too frequently, also known as "corpus-specific stop words". For example:
max_df = 0.50 means "ignore terms that appear in more than 50% of the documents"; max_df = 25 means "ignore terms that appear in more than 25 documents".
The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents", so the default setting does not ignore any terms.
min_df is used to drop terms that appear too rarely. For example:
min_df = 0.01 means "ignore terms that appear in less than 1% of the documents"; min_df = 5 means "ignore terms that appear in fewer than 5 documents".
The default min_df is 1, which means "ignore terms that appear in fewer than 1 document", so the default setting does not ignore any terms.
topn is the number of keywords to extract.
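A minimal sketch of how these thresholds are passed to TfidfVectorizer (the values are arbitrary examples; the function above only exposes max_df, while min_df is shown here for completeness):

from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore terms that appear in more than 50% of the documents,
# and terms that appear in fewer than 2 documents (arbitrary example values)
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2)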
Complete code
import codecs
import os
import jieba.analyse
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Folder containing the texts to extract keywords from
base_path = "data"
# Folder where the segmented texts are saved
seg_path = "segmented/"


def segment():
    """word segment"""
    for txt in os.listdir(base_path):
        whole_base = os.path.join(base_path, txt)
        whole_seg = os.path.join(seg_path, txt)
        with codecs.open(whole_base, 'r', 'utf-8') as fr:
            fw = codecs.open(whole_seg, 'w', 'utf-8')
            for line in fr.readlines():
                # seg_list = jieba.cut(line.strip())
                seg_list = jieba.analyse.extract_tags(line.strip(), topK=20, withWeight=False, allowPOS=())
                # First argument: the text to extract keywords from
                # Second argument: the number of keywords to return, sorted by importance from high to low
                # Third argument: whether to also return each keyword's weight
                # Fourth argument: part-of-speech filter; empty means no filtering,
                #                  otherwise only keywords matching the given POS tags are returned
                # for item in seg_list:
                #     # item[0] is the keyword and item[1] its weight
                #     # (set withWeight=True to get the weights back)
                #     print(item[0], item[1])
                fw.write(" ".join(seg_list))
            fw.close()


def read_doc_list():
    trade_list = []
    doc_list = []
    for txt in os.listdir(seg_path):
        trade_list.append(txt.split(".")[0])
        with codecs.open(os.path.join(seg_path, txt), "r", "utf-8") as fr:
            doc_list.append(fr.read().replace('\n', ''))
    return trade_list, doc_list


def tfidf_top(trade_list, doc_list, max_df, topn):
    vectorizer = TfidfVectorizer(max_df=max_df)
    matrix = vectorizer.fit_transform(doc_list)
    feature_dict = {v: k for k, v in vectorizer.vocabulary_.items()}  # index -> feature_name
    top_n_matrix = np.argsort(-matrix.todense())[:, :topn]  # top tf-idf words for each row
    df = pd.DataFrame(np.vectorize(feature_dict.get)(top_n_matrix), index=trade_list)  # convert matrix to df
    return df


segment()
tl, dl = read_doc_list()
# Ignore terms that appear in more than 50% of the documents, and take the top 10 keywords
tdf = tfidf_top(tl, dl, max_df=0.5, topn=10)
tdf.to_csv("keywords.txt", header=False, encoding='utf-8')
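If you later want to load the saved keywords back into pandas, a minimal sketch (the file name matches the to_csv call above):

import pandas as pd

# Each row is a document name followed by its top-10 keywords
keywords = pd.read_csv("keywords.txt", header=None, index_col=0, encoding='utf-8')
print(keywords.head())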