Table of Contents
1. Python Code
2. Code Explanation
3. Output
1. Python Code
```python
'''
@file: keras_data_prepare.py
@time: 2020/6/30 14:05
@author: Jack
@contact: jack18588951684@163.com
'''
import string
import re
from os import listdir
from numpy import array
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer


def load_doc(filename):
    """Load a document into memory as a single string."""
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text


def clean_doc(doc):
    """Split a document into tokens and clean them."""
    tokens = doc.split()
    # strip punctuation from each token
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    # keep purely alphabetic tokens
    tokens = [w for w in tokens if w.isalpha()]
    # drop English stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # drop single-character tokens
    tokens = [w for w in tokens if len(w) > 1]
    return tokens


def doc_to_line(filename, vocab):
    """Load a document, clean it, and keep only in-vocabulary tokens."""
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)


def process_docs(directory, vocab, is_train):
    """Convert every document in a directory into a line of tokens."""
    lines = list()
    for filename in listdir(directory):
        # reviews named cv9* form the test set, all others the training set
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines


def load_clean_dataset(vocab, is_train):
    """Load the negative and positive reviews with 0/1 labels."""
    neg = process_docs('txt_sentoken/neg', vocab, is_train)
    pos = process_docs('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels


def create_tokenizer(lines):
    """Fit a Keras Tokenizer on the training documents."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# load and clean the training and test splits
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# encode the documents as bag-of-words frequency vectors
tokenizer = create_tokenizer(train_docs)
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')
print(Xtrain.shape, Xtest.shape)
```
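The script assumes the movie review dataset is unpacked under txt_sentoken/ (with neg and pos subdirectories), that a vocab.txt file listing the allowed tokens is present, and that the NLTK stopword corpus is available locally. If the stopword corpus is missing, a one-time download fixes it; a minimal sketch:

```python
# One-time setup: clean_doc() needs the NLTK English stopword list.
import nltk
nltk.download('stopwords')
```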
2. Code Explanation
This part converts each review document into a data representation that a multilayer perceptron model can consume. The bag-of-words model is a method for extracting features from text, so text that has passed through it can be fed to machine learning algorithms such as neural networks. Each document becomes a vector whose number of elements equals the number of words in the vocabulary: the larger the vocabulary, the higher the vector dimension. Vocabulary size therefore drives the size of the input data, so we need to strike a balance when choosing it; as a general rule, prefer the smallest vocabulary that still solves the problem.

1. Converting reviews into vocabulary-restricted lines

The doc_to_line() function loads a document, cleans it, filters out tokens that are not in the vocabulary, and joins what remains into a single whitespace-separated token string. The process_docs() function applies this to every document in a directory (e.g. pos and neg), converting each document into one line. The load_clean_dataset() function calls process_docs() on the positive and negative reviews to build the dataset of review texts and their output labels, with 0 meaning negative and 1 meaning positive. (A toy walk-through of this step appears in the first sketch below.)

2. From movie reviews to bag-of-words vectors

A Keras Tokenizer is created and fit on the text documents of the training set; the fitted tokenizer then encodes the reviews into fixed-length document vectors via texts_to_matrix(). The mode argument specifies how words are scored. (The second sketch below compares the available modes.)
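As a quick illustration of step 1, here is a minimal sketch; the sample sentence and the tiny vocabulary are invented for the example, and it assumes the functions defined above are in scope and the NLTK stopword corpus is downloaded:

```python
# Toy walk-through of the cleaning and vocabulary filter (invented inputs).
sample = "the movie was not good, the plot made no sense at all!"
tokens = clean_doc(sample)
print(tokens)  # ['movie', 'good', 'plot', 'made', 'sense']
vocab = {'movie', 'plot', 'good'}  # hypothetical mini-vocabulary
print(' '.join(w for w in tokens if w in vocab))  # movie good plot
```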
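For step 2, this small sketch (with an invented toy corpus) shows how mode changes the scoring applied by texts_to_matrix(); 'freq' is what the script above uses:

```python
from keras.preprocessing.text import Tokenizer

docs = ['great movie great cast', 'terrible movie']  # invented toy corpus
t = Tokenizer()
t.fit_on_texts(docs)
for mode in ['binary', 'count', 'tfidf', 'freq']:
    # 'binary': word presence/absence, 'count': raw counts,
    # 'freq': counts normalized by document length, 'tfidf': TF-IDF weights
    print(mode)
    print(t.texts_to_matrix(docs, mode=mode))
```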
3. Output

```
(1800, 25778) (200, 25778)
```

The training matrix covers 1800 reviews (900 negative plus 900 positive; files whose names start with cv9 are held out) and the test matrix the remaining 200. Each review is encoded as a 25778-element vector: one column for every word in the tokenizer's word index, plus the unused index 0.