Getting Started with NLP: Text Classification | paddle


Text classification is a classic problem in natural language processing: using a computer to automatically assign labels to text according to a predefined classification scheme.

Data source: 56,821 Chinese news-summary records crawled from the web. The data covers 10 categories: international, culture, entertainment, sports, finance, automotive, education, technology, real estate, and securities.

Strictly speaking, this news dataset is not great: the number of articles per category is uneven, whereas in a good dataset the classes are distributed fairly evenly.
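Before training, it is worth checking the label distribution yourself. Here is a minimal sketch, assuming the raw file uses the '_!_' field separator with the category label in the second field (the same layout the preprocessing code below relies on):

from collections import Counter

counts = Counter()
with open('/home/aistudio/data/news_classify_data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        label = line.split('_!_')[1]   # category ID of this sample
        counts[label] += 1

# Print the number of samples per category to spot any imbalance
for label, n in sorted(counts.items()):
    print(label, n)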

1. Prepare the data:

Preprocess the data; create the dataset and the data dictionary; create the data readers train_reader and test_reader.

2. Configure the network

Define the network.

Define the loss function: cross-entropy (a short numeric illustration follows this outline).

Define the optimization algorithm: choose an optimizer, e.g. Adam or SGD.

3. Train the network

Feed the training set into the network to train our model.

4. Evaluate the model

5. Predict with the model
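As promised above, here is what the cross-entropy loss computes for a single sample. With a one-hot label it reduces to the negative log of the probability the model assigns to the true class. This toy numpy snippet is illustrative only (the probabilities are made up) and is not part of the pipeline:

import numpy as np

# Softmax output of a model for one sample over 10 classes (made-up numbers)
probs = np.array([0.05, 0.70, 0.05, 0.02, 0.03, 0.05, 0.03, 0.03, 0.02, 0.02])
true_label = 1

# Cross-entropy against a one-hot target is simply -log(p_true)
loss = -np.log(probs[true_label])
print(loss)  # ~0.357; it shrinks toward 0 as p_true approaches 1

The avg_cost fetched during training below is just this quantity averaged over a batch.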

# Inspect the currently mounted dataset directory
!ls /home/aistudio/data/
# Move the data into the /home/aistudio/data/ directory
!cp data/data6825/news_classify_data.txt data/

data6825

# Import the required packages
import os                                # OS utilities
from multiprocessing import cpu_count
import numpy as np                       # numerical computing
import shutil
import paddle                            # Paddle toolkit
import paddle.fluid as fluid

# Create the dataset and the data dictionary
data_root_path = '/home/aistudio/data/'  # data directory

# Generate the train/test data lists from the raw file and the dictionary
def create_data_list(data_root_path):
    # Truncate any existing list files
    with open(data_root_path + 'test_list.txt', 'w') as f:
        pass
    with open(data_root_path + 'train_list.txt', 'w') as f:
        pass
    with open(os.path.join(data_root_path, 'dict_txt.txt'), 'r', encoding='utf-8') as f_data:
        dict_txt = eval(f_data.readlines()[0])
    with open(os.path.join(data_root_path, 'news_classify_data.txt'), 'r', encoding='utf-8') as f_data:
        lines = f_data.readlines()
    i = 0
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        l = line.split('_!_')[1]
        labs = ""
        # Every 10th sample goes to the test list, the rest to the train list
        if i % 10 == 0:
            with open(os.path.join(data_root_path, 'test_list.txt'), 'a', encoding='utf-8') as f_test:
                for s in title:
                    lab = str(dict_txt[s])
                    labs = labs + lab + ','
                labs = labs[:-1]
                labs = labs + '\t' + l + '\n'
                f_test.write(labs)
        else:
            with open(os.path.join(data_root_path, 'train_list.txt'), 'a', encoding='utf-8') as f_train:
                for s in title:
                    lab = str(dict_txt[s])
                    labs = labs + lab + ','
                labs = labs[:-1]
                labs = labs + '\t' + l + '\n'
                f_train.write(labs)
        i += 1
    print("Data lists generated!")

# Build a dictionary from the downloaded data:
# map every character to a numeric ID, because the model consumes IDs, not characters
def create_dict(data_path, dict_path):
    dict_set = set()
    # Read the downloaded data
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Collect the set of characters
    for line in lines:
        title = line.split('_!_')[-1].replace('\n', '')
        for s in title:
            dict_set.add(s)
    # Turn the set into a dictionary: one numeric ID per character
    dict_list = []
    i = 0
    for s in dict_set:
        dict_list.append([s, i])
        i += 1
    dict_txt = dict(dict_list)
    # Add a token for unknown characters
    end_dict = {"<unk>": i}
    dict_txt.update(end_dict)
    # Save the dictionary to disk
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))
    print("Data dictionary generated!")

# Get the size of the dictionary
def get_dict_len(dict_path):
    with open(dict_path, 'r', encoding='utf-8') as f:
        line = eval(f.readlines()[0])
    return len(line.keys())

if __name__ == '__main__':
    # Put the generated data lists in the top-level data directory
    data_root_path = "/home/aistudio/data/"
    data_path = os.path.join(data_root_path, 'news_classify_data.txt')
    dict_path = os.path.join(data_root_path, "dict_txt.txt")
    # Create the data dictionary
    create_dict(data_path, dict_path)
    # Create the data lists
    create_data_list(data_root_path)

Data dictionary generated!
Data lists generated!

The generated dictionary: each character corresponds to a numeric ID.

The generated data lists: each text is converted into a serialized ID representation.

Each line represents one news item, i.e. one sample.
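As a purely illustrative example (the real IDs depend on the generated dictionary), a three-character headline whose characters map to IDs 401, 2094 and 87, with category label 3, would be stored in train_list.txt as:

401,2094,87\t3

that is, comma-separated character IDs, a tab character (\t), then the label. This is exactly the format that data_mapper parses below.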

paddle.reader.xmap_readers(): uses multiple threads and a user-defined mapper to transform the samples returned by a reader and push them onto an output queue.
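To see what xmap_readers does in isolation, here is a toy sketch, assuming the Paddle 1.x reader API used throughout this post (mapper, reader, thread count, buffer size; note that output order is not preserved by default):

import paddle

def square(sample):
    # Mapper executed by the worker threads
    return sample * sample

def make_reader():
    def reader():
        for i in range(5):
            yield i
    return reader

mapped = paddle.reader.xmap_readers(square, make_reader(), process_num=2, buffer_size=16)
print(sorted(mapped()))  # [0, 1, 4, 9, 16]; sorted because output order is not guaranteed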

# Create the data readers train_reader and test_reader
# Preprocessing shared by the training and test data
def data_mapper(sample):
    data, label = sample
    data = [int(w) for w in data.split(',')]
    return data, int(label)

# Create the training data reader
def train_reader(train_list_path):
    def reader():
        with open(train_list_path, 'r') as f:
            lines = f.readlines()
            # Shuffle the data
            np.random.shuffle(lines)
            # Yield each sample and its label
            for line in lines:
                data, label = line.split('\t')
                yield data, label
    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)

# Create the test data reader
def test_reader(test_list_path):
    def reader():
        with open(test_list_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                data, label = line.split('\t')
                yield data, label
    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)

At this point, the data preparation work is done.

Convolutional Neural Network (CNN)

The model takes a sequence of word vectors as input and produces a feature map. Applying max pooling over time to the feature map yields, for each convolution filter, a single feature that summarizes the whole sentence. Finally, the features from all filters are concatenated to form a fixed-length vector representation of the text. For text classification, connecting this vector to a softmax layer completes the model.

In practice we apply multiple convolution filters to the sentence; filters with the same window size are stacked into a matrix so the computation can be carried out more efficiently.

We can also apply filters with different window sizes to the sentence; the network below uses filter sizes 3 and 4, illustrated by the sketch that follows.
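To make "max pooling over time" concrete, here is a minimal numpy sketch, independent of the Paddle code below and with made-up dimensions: one filter of window size 3 slides over a sentence of 7 embedded characters, and the strongest response over all positions becomes that filter's single sentence-level feature.

import numpy as np

T, emb_dim, k = 7, 4, 3             # sentence length, embedding size, window size
np.random.seed(0)
sent = np.random.randn(T, emb_dim)  # embedded sentence, one row per character
w = np.random.randn(k, emb_dim)     # one convolution filter

# Convolve over time: one response per window position
responses = np.array([np.sum(sent[t:t + k] * w) for t in range(T - k + 1)])

# Max pooling over time: keep the strongest response as the sentence feature
feature = responses.max()
print(responses.shape, feature)     # (5,) positions collapse to a single scalar

Note that the network below actually uses pool_type="sqrt" rather than max pooling, but the idea is the same: the time dimension is collapsed so each filter contributes one feature regardless of sentence length.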

# Create the CNN network
def CNN_net(data, dict_dim, class_dim=10, emb_dim=128, hid_dim=128, hid_dim2=98):
    # Embed the input IDs before the model proper: each ID becomes a dense vector
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=3,      # convolution window size
        act="tanh",
        pool_type="sqrt")
    conv_4 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim2,
        filter_size=4,
        act="tanh",
        pool_type="sqrt")
    # Fully connected layer that joins the two conv branches
    output = fluid.layers.fc(input=[conv_3, conv_4], size=class_dim, act='softmax')
    # A 1x10 probability distribution; the class with the largest probability is the prediction
    return output

# Define the input data; lod_level=1 marks the input as variable-length sequence data
# (see LoDTensor in the Paddle docs; fixed-length data does not need this)
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Get the size of the data dictionary
dict_dim = get_dict_len('/home/aistudio/data/dict_txt.txt')
# Build the classifier
# model = CNN_net(words, dict_dim, 15)
model = CNN_net(words, dict_dim)
# Loss function and accuracy
cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)   # each step trains one batch, so average over it
acc = fluid.layers.accuracy(input=model, label=label)
# Clone a program for evaluation
test_program = fluid.default_main_program().clone(for_test=True)
# Define the optimization method
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.002)
opt = optimizer.minimize(avg_cost)
# Create an executor; training on the CPU is slow
# place = fluid.CPUPlace()
place = fluid.CUDAPlace(0)   # run on the GPU
exe = fluid.Executor(place)
# Initialize the parameters
exe.run(fluid.default_startup_program())

# Batch the training and test readers
train_reader = paddle.batch(reader=train_reader('/home/aistudio/data/train_list.txt'), batch_size=128)
test_reader = paddle.batch(reader=test_reader('/home/aistudio/data/test_list.txt'), batch_size=128)
# Data feeder that maps each batch onto the input variables
feeder = fluid.DataFeeder(place=place, feed_list=[words, label])

EPOCH_NUM = 20   # number of epochs
model_save_dir = '/home/aistudio/work/infer_model/'

# Start training
for pass_id in range(EPOCH_NUM):
    # Train
    for batch_id, data in enumerate(train_reader()):
        train_cost, train_acc = exe.run(program=fluid.default_main_program(),
                                        feed=feeder.feed(data),
                                        fetch_list=[avg_cost, acc])
        if batch_id % 100 == 0:   # print once every 100 batches
            print('Pass:%d, Batch:%d, Cost:%0.5f, Acc:%0.5f' %
                  (pass_id, batch_id, train_cost[0], train_acc[0]))
    # Evaluate on unseen data the model was not trained on
    test_costs = []
    test_accs = []
    for batch_id, data in enumerate(test_reader()):
        test_cost, test_acc = exe.run(program=test_program,
                                      feed=feeder.feed(data),
                                      fetch_list=[avg_cost, acc])
        test_costs.append(test_cost[0])
        test_accs.append(test_acc[0])
    # Average test loss and accuracy
    test_cost = sum(test_costs) / len(test_costs)
    test_acc = sum(test_accs) / len(test_accs)
    print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, test_cost, test_acc))

# Save the inference model (this block could be moved inside the loop to keep every epoch's model)
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)
fluid.io.save_inference_model(model_save_dir,
                              feeded_var_names=[words.name],
                              target_vars=[model],
                              executor=exe)
print('Training model saved!')

Pass:0, Batch:0, Cost:2.30681, Acc:0.09375
Pass:0, Batch:100, Cost:0.99743, Acc:0.68750
Pass:0, Batch:200, Cost:0.89360, Acc:0.76562
Pass:0, Batch:300, Cost:0.92248, Acc:0.70312
Test:0, Cost:0.81883, ACC:0.73921
Pass:1, Batch:0, Cost:0.90457, Acc:0.67969
Pass:1, Batch:100, Cost:0.67305, Acc:0.83594
Pass:1, Batch:200, Cost:0.63098, Acc:0.80469
Pass:1, Batch:300, Cost:0.76019, Acc:0.77344
Test:1, Cost:0.75819, ACC:0.75909
Pass:2, Batch:0, Cost:0.73232, Acc:0.76562
Pass:2, Batch:100, Cost:0.70476, Acc:0.77344
Pass:2, Batch:200, Cost:0.71542, Acc:0.75781
Pass:2, Batch:300, Cost:0.63258, Acc:0.78125
Test:2, Cost:0.73717, ACC:0.76160
Pass:3, Batch:0, Cost:0.56025, Acc:0.82812
Pass:3, Batch:100, Cost:0.48580, Acc:0.86719
Pass:3, Batch:200, Cost:0.54991, Acc:0.84375
Pass:3, Batch:300, Cost:0.67272, Acc:0.78906
Test:3, Cost:0.72726, ACC:0.76317
Pass:4, Batch:0, Cost:0.53660, Acc:0.82812
Pass:4, Batch:100, Cost:0.73550, Acc:0.78906
Pass:4, Batch:200, Cost:0.53774, Acc:0.80469
Pass:4, Batch:300, Cost:0.46155, Acc:0.85156
Test:4, Cost:0.72185, ACC:0.76169
Pass:5, Batch:0, Cost:0.65421, Acc:0.78906
Pass:5, Batch:100, Cost:0.59889, Acc:0.80469
Pass:5, Batch:200, Cost:0.71301, Acc:0.79688
Pass:5, Batch:300, Cost:0.69682, Acc:0.81250
Test:5, Cost:0.71626, ACC:0.76525
Pass:6, Batch:0, Cost:0.72434, Acc:0.75000
Pass:6, Batch:100, Cost:0.59109, Acc:0.77344
Pass:6, Batch:200, Cost:0.48783, Acc:0.81250
Pass:6, Batch:300, Cost:0.57463, Acc:0.81250
Test:6, Cost:0.71520, ACC:0.76447
Pass:7, Batch:0, Cost:0.50502, Acc:0.84375
Pass:7, Batch:100, Cost:0.62133, Acc:0.79688
Pass:7, Batch:200, Cost:0.68593, Acc:0.76562
Pass:7, Batch:300, Cost:0.55528, Acc:0.80469
Test:7, Cost:0.71300, ACC:0.76769
Pass:8, Batch:0, Cost:0.60046, Acc:0.76562
Pass:8, Batch:100, Cost:0.47617, Acc:0.82812
Pass:8, Batch:200, Cost:0.59591, Acc:0.79688
Pass:8, Batch:300, Cost:0.66050, Acc:0.76562
Test:8, Cost:0.71475, ACC:0.76594
Pass:9, Batch:0, Cost:0.40968, Acc:0.84375
Pass:9, Batch:100, Cost:0.50980, Acc:0.81250
Pass:9, Batch:200, Cost:0.55923, Acc:0.85156
Pass:9, Batch:300, Cost:0.42255, Acc:0.87500
Test:9, Cost:0.71282, ACC:0.76717
Pass:10, Batch:0, Cost:0.44147, Acc:0.88281
Pass:10, Batch:100, Cost:0.55140, Acc:0.85938
Pass:10, Batch:200, Cost:0.50935, Acc:0.84375
Pass:10, Batch:300, Cost:0.56366, Acc:0.83594
Test:10, Cost:0.71520, ACC:0.76586
Pass:11, Batch:0, Cost:0.55133, Acc:0.79688
Pass:11, Batch:100, Cost:0.45308, Acc:0.80469
Pass:11, Batch:200, Cost:0.63471, Acc:0.78125
Pass:11, Batch:300, Cost:0.52810, Acc:0.80469
Test:11, Cost:0.71511, ACC:0.76673
Pass:12, Batch:0, Cost:0.51947, Acc:0.83594
Pass:12, Batch:100, Cost:0.63086, Acc:0.80469
Pass:12, Batch:200, Cost:0.57166, Acc:0.82812
Pass:12, Batch:300, Cost:0.59658, Acc:0.75781
Test:12, Cost:0.71533, ACC:0.76673
Pass:13, Batch:0, Cost:0.34512, Acc:0.89062
Pass:13, Batch:100, Cost:0.47249, Acc:0.82812
Pass:13, Batch:200, Cost:0.51224, Acc:0.85156
Pass:13, Batch:300, Cost:0.45350, Acc:0.84375
Test:13, Cost:0.71736, ACC:0.76647
Pass:14, Batch:0, Cost:0.45494, Acc:0.85156
Pass:14, Batch:100, Cost:0.68085, Acc:0.78125
Pass:14, Batch:200, Cost:0.48124, Acc:0.83594
Pass:14, Batch:300, Cost:0.47296, Acc:0.85938
Test:14, Cost:0.71745, ACC:0.76760
Pass:15, Batch:0, Cost:0.73750, Acc:0.77344
Pass:15, Batch:100, Cost:0.55038, Acc:0.83594
Pass:15, Batch:200, Cost:0.59775, Acc:0.74219
Pass:15, Batch:300, Cost:0.47932, Acc:0.82812
Test:15, Cost:0.72163, ACC:0.76673
Pass:16, Batch:0, Cost:0.31890, Acc:0.90625
Pass:16, Batch:100, Cost:0.38017, Acc:0.85156
Pass:16, Batch:200, Cost:0.57517, Acc:0.79688
Pass:16, Batch:300, Cost:0.44878, Acc:0.87500
Test:16, Cost:0.72158, ACC:0.76786
Pass:17, Batch:0, Cost:0.43048, Acc:0.88281
Pass:17, Batch:100, Cost:0.47145, Acc:0.82031
Pass:17, Batch:200, Cost:0.47934, Acc:0.82812
Pass:17, Batch:300, Cost:0.36709, Acc:0.89062
Test:17, Cost:0.72381, ACC:0.76647
Pass:18, Batch:0, Cost:0.35568, Acc:0.88281
Pass:18, Batch:100, Cost:0.61057, Acc:0.82031
Pass:18, Batch:200, Cost:0.40052, Acc:0.88281
Pass:18, Batch:300, Cost:0.45469, Acc:0.83594
Test:18, Cost:0.72549, ACC:0.76743
Pass:19, Batch:0, Cost:0.41658, Acc:0.86719
Pass:19, Batch:100, Cost:0.48703, Acc:0.86719
Pass:19, Batch:200, Cost:0.47010, Acc:0.83594
Pass:19, Batch:300, Cost:0.35333, Acc:0.84375
Test:19, Cost:0.72887, ACC:0.76690
Training model saved!

# Use the trained model to run predictions and print the results
# Create an executor
# place = fluid.CPUPlace()
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

save_path = '/home/aistudio/work/infer_model/'
# Load the inference program, the input variable names, and the classifier from the saved model
[infer_program, feeded_var_names, target_var] = fluid.io.load_inference_model(dirname=save_path, executor=exe)

# Convert a sentence into its character-ID sequence
def get_data(sentence):
    # Read the data dictionary
    with open('/home/aistudio/data/dict_txt.txt', 'r', encoding='utf-8') as f_data:
        dict_txt = eval(f_data.readlines()[0])
    dict_txt = dict(dict_txt)
    # Turn the string into a list of IDs
    keys = dict_txt.keys()
    data = []
    for s in sentence:
        # Map unknown characters to <unk>
        if s not in keys:
            s = '<unk>'
        data.append(int(dict_txt[s]))
    return data

data = []
# Build the input data
data1 = get_data('在获得诺贝尔文学奖7年之后,莫言15日晚间在山西汾阳贾家庄如是说')
data2 = get_data('综合“今日美国”、《世界日报》等当地媒体报道,芝加哥河滨警察局表示,')
data.append(data1)
data.append(data2)

# Get the number of characters in each sentence
base_shape = [[len(c) for c in data]]

# Build the prediction input as a LoDTensor
tensor_words = fluid.create_lod_tensor(data, base_shape, place)

# Run the prediction
result = exe.run(program=infer_program,
                 feed={feeded_var_names[0]: tensor_words},
                 fetch_list=target_var)

# Category names
names = ['culture', 'entertainment', 'sports', 'finance', 'real estate',
         'automotive', 'education', 'technology', 'international', 'securities']

# Pick the label with the highest probability
for i in range(len(data)):
    # Sort the 10 probabilities and take the index of the largest one (position -1)
    lab = np.argsort(result)[0][i][-1]
    print('Predicted label: %d, name: %s, probability: %f' % (lab, names[lab], result[0][i][lab]))

Predicted label: 0, name: culture, probability: 0.949490
Predicted label: 8, name: international, probability: 0.472569
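A small aside on reading the result: np.argsort(result)[0][i][-1] sorts the 10 class probabilities and returns the index of the largest. The same value can be obtained more directly with argmax; this one-liner operates on the result array above and is equivalent, not a change to the author's code:

lab = np.argmax(result[0][i])  # index of the most probable class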