文章目录
一.Python代码
二.代码说明
三.结果输出
一.Python代码
'''
@file: keras_emotional_analysis_mlp.py
@time: 2020/7/4 0004 12:06
@author: Jack
@contact: jack18588951684@163.com
'''
import re
import string
from os import listdir

from numpy import array
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense


def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises, which
    # the manual open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()
def clean_doc(doc):
    """Turn raw document text into a list of cleaned word tokens.

    The text is split on whitespace, ASCII punctuation characters are
    removed from every token, and a token is kept only if it is purely
    alphabetic, is not an English stop word, and is longer than one
    character.

    Args:
        doc: The document text as a single string.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    tokens = doc.split()
    # Character class covering every punctuation character. The previous
    # pattern was '[%s]]': the stray trailing ']' made it match only a
    # punctuation character that is followed by a literal ']', so
    # punctuation was effectively never stripped.
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    stop_words = set(stopwords.words('english'))
    # Apply the three filters in one pass; the conditions are independent,
    # so the result is the same as filtering three times in sequence.
    return [
        w for w in tokens
        if w.isalpha() and w not in stop_words and len(w) > 1
    ]
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single space-separated line.

    The file is read with ``load_doc``, tokenised with ``clean_doc``, and
    only the tokens that are members of ``vocab`` are kept.

    Args:
        filename: Path of the document to load.
        vocab: Container of allowed tokens (membership is tested with ``in``).

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    in_vocab = (word for word in cleaned if word in vocab)
    return ' '.join(in_vocab)
def process_docs(directory, vocab, is_train):
    """Convert the documents of one directory into a list of text lines.

    File names starting with ``'cv9'`` form the held-out split: they are
    skipped when ``is_train`` is truthy and are the only files kept
    otherwise.

    Args:
        directory: Directory whose entries are listed with ``listdir``.
        vocab: Allowed tokens, passed through to ``doc_to_line``.
        is_train: Selects which split of the directory to process.

    Returns:
        A list with one ``doc_to_line`` result per selected file.
    """
    def _selected(name):
        held_out = name.startswith('cv9')
        return (not held_out) if is_train else held_out

    return [
        doc_to_line(directory + '/' + name, vocab)
        for name in listdir(directory)
        if _selected(name)
    ]
def load_clean_dataset(vocab, is_train):
    """Load the negative and positive reviews together with their labels.

    Documents come from ``txt_sentoken/neg`` and ``txt_sentoken/pos`` via
    ``process_docs``. Negative documents come first and are labelled 0;
    positive documents follow and are labelled 1.

    Args:
        vocab: Allowed tokens, passed through to ``process_docs``.
        is_train: Selects the split, passed through to ``process_docs``.

    Returns:
        A ``(docs, labels)`` tuple: the list of document lines and a numpy
        array of the matching 0/1 labels.
    """
    negative = process_docs('txt_sentoken/neg', vocab, is_train)
    positive = process_docs('txt_sentoken/pos', vocab, is_train)
    labels = array([0] * len(negative) + [1] * len(positive))
    return negative + positive, labels
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
def define_model(n_words):
    """Create, compile and summarise the sentiment classifier.

    The network has a 50-unit ReLU hidden layer that takes ``n_words``
    inputs, followed by a single sigmoid output unit. It is compiled with
    binary cross-entropy loss, the Adam optimizer and the accuracy metric,
    and its summary is printed.

    Args:
        n_words: Number of input features per document.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden = Dense(50, input_shape=(n_words,), activation='relu')
    output = Dense(1, activation='sigmoid')
    # Passing the layers to the constructor adds them in the same order as
    # successive add() calls would.
    network = Sequential([hidden, output])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()
    return network
if __name__ == "__main__":
    # Vocabulary: the whitespace-separated tokens stored in vocab.txt.
    vocab_filename = 'vocab.txt'
    vocab = set(load_doc(vocab_filename).split())

    # Training split and held-out split, each with its 0/1 labels.
    train_docs, ytrain = load_clean_dataset(vocab, True)
    test_docs, ytest = load_clean_dataset(vocab, False)

    # The tokenizer is fitted on the training documents only; both splits
    # are then encoded as word-frequency matrices.
    tokenizer = create_tokenizer(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode='freq')
    Xtest = tokenizer.texts_to_matrix(test_docs, mode='freq')

    # The number of matrix columns is the model's input size.
    n_words = Xtest.shape[1]
    model = define_model(n_words)

    # Train, then report accuracy on the held-out documents.
    model.fit(Xtrain, ytrain, epochs=10, verbose=2)
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy:{}'.format(acc * 100))
二.代码说明
代码使用Keras中的Dense全连接层构建一个简单的多层感知器(MLP),将编码后的文档分类为正面或负面。模型输入层的大小等于词汇表中的单词数,也就是每篇编码后文档向量的长度,代码将其保存在名为n_words的变量中。模型包含一个具有50个神经元、使用relu激活函数的隐藏层;输出层是一个使用sigmoid激活函数的神经元,用于预测评论的类别:0为负面评论,1为正面评论。模型训练时使用Adam优化算法和二元交叉熵损失函数,并在训练和评估过程中跟踪模型的准确率(accuracy)。
三.结果输出
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense (Dense)                (None, 50)                1181350
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51
=================================================================
Total params: 1,181,401
Trainable params: 1,181,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
57/57 - 0s - loss: 0.6912 - accuracy: 0.6300
Epoch 2/10
57/57 - 0s - loss: 0.6809 - accuracy: 0.8617
Epoch 3/10
57/57 - 0s - loss: 0.6613 - accuracy: 0.9172
Epoch 4/10
57/57 - 0s - loss: 0.6317 - accuracy: 0.9128
Epoch 5/10
57/57 - 0s - loss: 0.5948 - accuracy: 0.9383
Epoch 6/10
57/57 - 0s - loss: 0.5519 - accuracy: 0.9456
Epoch 7/10
57/57 - 0s - loss: 0.5067 - accuracy: 0.9456
Epoch 8/10
57/57 - 0s - loss: 0.4610 - accuracy: 0.9589
Epoch 9/10
57/57 - 0s - loss: 0.4187 - accuracy: 0.9561
Epoch 10/10
57/57 - 0s - loss: 0.3782 - accuracy: 0.9667
Test Accuracy:88.99999856948853