文本词频统计
英文词频统计
def getText(path="hamlet.txt"):
    """Read a text file and return it lower-cased with punctuation blanked out.

    Args:
        path: File to read, decoded as UTF-8. Defaults to "hamlet.txt" so
            existing ``getText()`` calls behave as before.

    Returns:
        The file's text in lower case, with every character from the
        punctuation set below replaced by a single space.
    """
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|},《》~’'
    # One translation pass instead of one str.replace() call per character.
    to_space = str.maketrans(dict.fromkeys(punctuation, " "))
    # The context manager closes the file handle even if read() raises.
    with open(path, "r", encoding="UTF-8") as f:
        txt = f.read()
    return txt.lower().translate(to_space)
# Count how often each whitespace-separated word occurs in the cleaned text.
hamleTxt = getText()
words = hamleTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs by descending count.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slice rather than index over range(10): a text with fewer than 10 distinct
# words would otherwise raise IndexError.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
中文文本词频统计
以三国演义为例
import jieba
# Read the whole novel; the context manager closes the file afterwards.
with open("三国演义.txt", "r", encoding="gb18030") as f:
    txt = f.read()

# Segment the text into tokens with jieba, then count every token that is
# longer than one character.
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs by descending count.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slice rather than index over range(15): fewer than 15 distinct tokens would
# otherwise raise IndexError.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
面向问题分析(人名统计)
代码升级版
增加排除词库,并统一人名的别名。注意:排除词库需要通过反复运行、测试来不断补充,此处给出的并不完整。
import jieba
# Read the whole novel; the context manager closes the file afterwards.
with open("三国演义.txt", "r", encoding="gb18030") as f:
    txt = f.read()

# Tokens that are removed from the tally before ranking.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此", "商议", "如何", "左右", "军马", "次日"}

# Alternative tokens that are counted under one canonical name.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:
        continue
    # Fall back to the token itself when it has no alias entry.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default instead of del: an excluded word that never occurs in
# the text must not raise KeyError.
for word in excludes:
    counts.pop(word, None)

# Order the (word, count) pairs by descending count.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slice rather than index over range(15): fewer than 15 remaining entries
# would otherwise raise IndexError.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
通过不断添加排除词库并反复运行程序,可得到《三国演义》人物出场次数排名前20。
转载请注明原文地址:https://ipadbbs.8miu.com/read-13278.html