The previous post drew a simple word cloud; below we work through a few improvements.
from wordcloud import WordCloud
import jieba
import imageio

with open('threekingdom.txt', 'r', encoding='utf-8') as f1:
    list1 = f1.read()
list2 = jieba.lcut(list1)

# high-frequency words we do not want in the cloud
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此", "如何",
            "东吴", "商议", "军马", "引兵", "次日", "主公", "左右", "大喜",
            "军士", "天下", "今日", "于是", "不敢", "魏兵", "都督", "陛下",
            "玄德曰", "孔明曰"}

count = {}  # dict: word -> frequency
for word in list2:
    if len(word) <= 1:
        continue
    count[word] = count.get(word, 0) + 1

# merge synonyms: different names for the same character
count["孔明"] = count["孔明"] + count["孔明曰"]
count["玄德"] = count["玄德"] + count["玄德曰"] + count["刘备"]
count["关公"] = count["关公"] + count["云长"]
# drop the merged aliases so they don't also appear on their own
# ("孔明曰" and "玄德曰" are removed below via excludes)
del count["刘备"]
del count["云长"]

for word in excludes:
    del count[word]

# convert the dict into a list of (word, frequency) tuples, sorted by frequency
new_word_list = list(count.items())
new_word_list.sort(key=lambda x: x[1], reverse=True)

text_list = []
for x in range(10):  # keep the top 10
    role, freq = new_word_list[x]  # tuple unpacking
    print(role, freq)
    for j in range(freq):
        text_list.append(role)
print(text_list)
text = " ".join(text_list)

mask = imageio.imread("china.jpg")
wc = WordCloud(
    width=1000, height=800,
    background_color='white',
    font_path="MSYH.TTC",
    mask=mask,
    collocations=False
).generate(text).to_file("三国词云.png")

The main improvements in the code above: words I don't need but that appear with high frequency are deleted, duplicated words are kept out of the cloud, and synonyms (different names for the same character) are merged. Note the use of WordCloud's collocations parameter, the conversion of the dict into a list of tuples, and the join function; all three are worth studying, and the sketches below look at each in turn.
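Why collocations=False matters here: the text passed to generate() is built by repeating each name freq times, so with the default collocations=True WordCloud also counts bigrams, and a repeated pair like "孔明 孔明" may show up in the cloud as a single token, i.e. the same word drawn twice. A minimal sketch of the difference, using hypothetical English tokens so it runs with the library's bundled default font:

from wordcloud import WordCloud

# Toy text built the same way as above: each word repeated, then joined.
text = " ".join(["kongming"] * 3 + ["xuande"] * 2)

# Default (collocations=True): bigrams such as "kongming kongming"
# may appear in the frequency table as one token.
print(WordCloud().generate(text).words_)

# collocations=False: single words only, no duplicated tokens.
print(WordCloud(collocations=False).generate(text).words_)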
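On the tuple conversion: list(count.items()) turns the dict into a list of (word, frequency) tuples so it can be sorted by the lambda key, and role, freq = ... then unpacks each tuple. The standard library's collections.Counter bundles these steps; a sketch with made-up counts standing in for the real ones:

from collections import Counter

# Hypothetical counts standing in for the real dict built from the novel.
count = Counter({"孔明": 1800, "玄德": 1200, "关公": 900, "曹操": 800})

# most_common(n) replaces list(count.items()) + sort(key=..., reverse=True).
for role, freq in count.most_common(3):  # tuple unpacking of (word, freq)
    print(role, freq)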
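A design note: the text_list loop, the join call, and collocations=False are all only needed because generate() takes raw text. WordCloud also provides generate_from_frequencies(), which accepts a {word: frequency} dict directly. A sketch of that approach, assuming the same MSYH.TTC font file as above and hypothetical counts in place of dict(new_word_list[:10]):

from wordcloud import WordCloud

# Hypothetical top frequencies; the real code would pass dict(new_word_list[:10]).
freqs = {"孔明": 1800, "玄德": 1200, "关公": 900}

wc = WordCloud(
    width=1000, height=800,
    background_color='white',
    font_path="MSYH.TTC",  # assumed present, as in the code above
).generate_from_frequencies(freqs)
wc.to_file("三国词云_freq.png")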