jieba:优秀的中文分词第三方库。
ThreeKingdoms.txt(《三国演义》全文)下载地址:https://python123.io/resources/pye/threekingdoms.txt
代码1:
# CalThreeKingdomsV1.py
# Print the 20 most frequent multi-character words found in ThreeKingdoms.txt.
from collections import Counter

import jieba

# Read the whole text; the context manager closes the file handle as soon as
# the read is done instead of leaving it open until interpreter exit.
with open("ThreeKingdoms.txt", encoding="utf-8") as f:
    txt = f.read()

words = jieba.lcut(txt)  # segment the text into a list of words

# Tally every segmented word except those that are exactly one character long.
# A missing key starts at 0, so a word's first occurrence yields a count of 1.
counts = Counter(word for word in words if len(word) != 1)

# most_common() orders by count, highest first; words with equal counts keep
# first-seen order, matching a stable sort with reverse=True. It returns at
# most 20 entries, so a text with fewer distinct words no longer raises
# IndexError.
for word, count in counts.most_common(20):
    print("{0:<10}{1:>5}".format(word, count))
结果:
曹操          953
孔明          836
将军          772
却说          656
玄德          585
关公          510
丞相          491
二人          469
不可          440
荆州          425
玄德曰         390
孔明曰         390
不能          384
如此          378
张飞          358
商议          344
如何          338
主公          331
军士          317
吕布          300

Process finished with exit code 0
很明显,输出结果中有些词并不是人名,需要去掉;还有一些名字指的是同一个人,需要合并。于是有了改进版:
代码2:
# CalThreeKingdomsV2.py
# Print the 8 most frequently mentioned characters in ThreeKingdoms.txt,
# merging alternative names of the same person and dropping non-name words.
from collections import Counter

import jieba

# Read the whole text; the context manager closes the file handle as soon as
# the read is done instead of leaving it open until interpreter exit.
with open("ThreeKingdoms.txt", encoding="utf-8") as f:
    txt = f.read()

# High-frequency words that are not character names; the list was collected
# by running the script repeatedly and inspecting the output.
excludes = {
    "将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议",
    "如何", "军士", "主公", "左右", "军马", "次日", "引兵", "大喜",
}

# Alternative spellings / titles mapped to the single name they are counted under.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",  # for now, every "丞相" is assumed to refer to 曹操
}

words = jieba.lcut(txt)  # segment the text into a list of words

counts = Counter()
for word in words:
    if len(word) == 1:
        continue  # words that are exactly one character long are not counted
    name = aliases.get(word, word)  # fold aliases into one entry per person
    # Skipping excluded words here, rather than deleting them afterwards,
    # avoids a KeyError when an excluded word never occurs in the text.
    if name not in excludes:
        counts[name] += 1

# most_common() orders by count, highest first; words with equal counts keep
# first-seen order, matching a stable sort with reverse=True. It returns at
# most 8 entries, so a short text no longer raises IndexError.
for word, count in counts.most_common(8):
    print("{0:<10}{1:>5}".format(word, count))
结果:
曹操         1451
孔明         1383
刘备         1252
关羽          784
张飞          358
吕布          300
赵云          278
孙权          264

Process finished with exit code 0