Table of Contents

I. Introduction
II. Data processing approach
  - Reading the data
  - Extracting char and word and writing them to txt
  - Training a word2vec model on the txt files
  - Getting a vector for each sentence from the trained model
  - Using regex to turn the word and char columns of the csv into lists
  - Converting the lists to integer sequences with Keras preprocessing
III. Models
  - Two dense layers: the simplest possible network
  - Two convolutional layers
  - The final model, with F1 as the training metric (the competition scores by F1)
  - Post-processing the model output into the final result
I. Introduction

The task comes from the 2020中国大学生保险数字科技竞赛 (2020 China College Student Insurance Digital Technology Competition). In the training set, each sample provides a char column and a word column, both numeric encodings of the same sentence, and the goal is to predict the label from them.
II. Data processing approach

There are two ways to handle the data. The first is the usual NLP text-processing route: train word2vec to get a vector for every token, which is convenient because the gensim API can be called directly. (A second approach, which feeds the token sequences to Keras directly, is described further below.)

Reading the data
```python
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 5)
file = pd.read_csv(r"data/train.csv")
print(len(file))
print(file.columns.values)
```

```
104344
['id' 'category' 'char' 'word' 'label']
```
Extracting char and word and writing them to txt
```python
def csv_2_txt(words, save_file):
    import re
    print(len(words))
    with open(save_file, "w") as fd:
        for i, c in enumerate(words):
            all_numb_char_ = re.findall(r'\d+-\d+|\d+|[1-9]\d*', str(c))
            for j in all_numb_char_:
                fd.write(j + " ")
            if i % 10000 == 0:
                print(i)
                print(all_numb_char_)

csv_2_txt(file["char"].values, "swap/char_all.txt")
csv_2_txt(file["word"].values, "swap/word_all.txt")
```
Training a word2vec model on the txt files
```python
from gensim.models import word2vec

def get_vector(txt_file, save_file):
    sentences = word2vec.Text8Corpus(txt_file)
    model = word2vec.Word2Vec(sentences,
                              sg=1,        # skip-gram
                              size=128,    # embedding dimension ("vector_size" in gensim >= 4.0)
                              window=5,
                              min_count=1,
                              negative=3,
                              sample=0.001,
                              hs=1,
                              workers=4)
    model.save(save_file)

get_vector("swap/word_all.txt", 'swap/word_all.pkl')
# The char model is loaded below as well, so it has to be trained too:
get_vector("swap/char_all.txt", 'swap/char_all.pkl')
```
Getting a vector for each sentence from the trained model
```python
def numextract(lists):
    # Flatten a list of (n_tokens, 128) embedding matrices into one flat list of floats.
    # (The original .item(t) loop only covered the first len(arr) elements of each
    # matrix; .flatten() walks every element.)
    vector = []
    for arr in lists:
        vector.extend(arr.flatten().tolist())
    return vector

from gensim.models.word2vec import Word2Vec

model_char = Word2Vec.load('swap/char_all.pkl')
model_word = Word2Vec.load('swap/word_all.pkl')

def get_generate_vector_from_word2vec(model, ind="word"):
    import re
    import numpy as np
    vectors = []
    numbers = []
    num = []
    for sentence_char in file[ind].values:
        vector = []
        a = re.findall(r'\d+-\d+|\d+|[1-9]\d*', sentence_char)
        num.append(len(a))
        vector.append(model.wv[a])   # model[a] is deprecated; model.wv[a] returns the token matrix
        numbers.append(a)
        vectors.append(numextract(vector))
    np.save(f"swap/{ind}_vectors.npy", vectors)
    np.save(f"swap/{ind}_numbers.npy", numbers)
```
This way of generating vectors can be reused in a lot of NLP work, since whatever the model, its training input has to be in numeric form.
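Note that the flattened vectors above differ in length from sentence to sentence. A common way to get a fixed-length input instead, not used in this post and shown only as a sketch, is to average the token embeddings:

```python
import numpy as np

def sentence_vector(model, tokens):
    """Average the word2vec embeddings of the known tokens into one fixed-length vector."""
    mat = np.asarray([model.wv[t] for t in tokens if t in model.wv])
    if mat.size == 0:
        return np.zeros(model.wv.vector_size)
    return mat.mean(axis=0)
```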
The second approach skips the word vectors and works with the token sequences directly, then relies on Keras preprocessing.

Using regex to turn the word and char columns of the csv into lists
```python
import pandas as pd
import numpy as np   # needed for the np.save calls below
import re

csv_data = pd.read_csv("data/public_test.csv")
csv_data.head(2)

char_line = []
word_line = []
for i in range(len(csv_data)):
    csv_data_char = " ".join(re.findall(r'\d+-\d+|\d+|[1-9]\d*', csv_data["char"].values[i]))
    csv_data_word = " ".join(re.findall(r'\d+-\d+|\d+|[1-9]\d*', csv_data["word"].values[i]))
    char_line.append(csv_data_char)
    word_line.append(csv_data_word)

np.save("char_line_list_public.npy", char_line)
np.save("word_line_list_public.npy", word_line)
```
```
array(['109 57 56 52', '109',
       '54 55 56 52 57 58 59 60 ....'],
      dtype='<U1143')
```
Converting the lists to integer sequences with Keras preprocessing
```python
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.utils.np_utils import to_categorical
from keras import initializers
from keras import backend as K
from keras.engine.topology import Layer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
```
```python
def encode_1hot(label) -> np.array:
    from sklearn.preprocessing import OneHotEncoder
    listUniq = list(set(label))
    print(listUniq, len(listUniq))
    label_onehot_ = []
    for i in label:
        label_onehot_.append(listUniq.index(i))
    labels = np.array(label_onehot_).reshape(len(label_onehot_), -1)
    enc = OneHotEncoder()
    enc.fit(labels)
    tempdata = np.array(enc.transform(labels).toarray())
    return tempdata, len(listUniq)
```
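A quick toy call (the labels 'a'/'b'/'c' are made up) shows what encode_1hot returns: a one-hot matrix plus the class count.

```python
y_demo, n_classes = encode_1hot(["a", "b", "a", "c"])
print(y_demo.shape, n_classes)   # (4, 3) 3: four samples, three classes
```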
```python
char_line = np.load("swap/char_line_list.npy")
word_line = np.load("swap/word_line_list.npy")
csv_data = pd.read_csv("swap/new_train.csv")

tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer.fit_on_texts(char_line)
vocab = tokenizer.word_index
print("vocabulary size:", len(vocab))

word_ids = tokenizer.texts_to_sequences(char_line)
print("number of word_id sequences:", len(word_ids))
sequences = pad_sequences(word_ids, maxlen=100)
print(sequences[2])

y = csv_data["label"].values
y_1hot, specise = encode_1hot(y)
```
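To make concrete what Tokenizer and texts_to_sequences/pad_sequences do, here is a tiny demo on two invented token strings:

```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

demo = ["109 57 56", "57 52"]        # two made-up "sentences" of numeric tokens
tok = Tokenizer(split=" ")
tok.fit_on_texts(demo)
ids = tok.texts_to_sequences(demo)   # [[2, 1, 3], [1, 4]] (index 1 = most frequent token)
print(pad_sequences(ids, maxlen=4))  # left-pads with zeros: [[0 2 1 3], [0 0 1 4]]
```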
III. Models

Two dense layers: the simplest possible network
```python
seed = 236
from keras import optimizers

model = Sequential()
model.add(Dense(256, activation='relu', input_dim=100))
model.add(Dropout(0.5))
model.add(Dense(specise, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

sgd = optimizers.SGD(lr=0.01,
                     decay=1e-6,
                     momentum=0.9,
                     nesterov=True)
# Note: this second compile overrides the first, so the model is actually
# trained with SGD and mean squared error.
model.compile(loss='mean_squared_error',
              optimizer=sgd,
              metrics=['accuracy'])
```
```python
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for j, (train_index, test_index) in enumerate(kf.split(y, y)):
    # Note: the model is not re-initialized between folds, so from the second
    # fold on it has already seen part of the current test split.
    model.fit(sequences[train_index], y_1hot[train_index],
              epochs=50, batch_size=64, verbose=1)
    test_loss, test_acc = model.evaluate(sequences[test_index], y_1hot[test_index])
    print("test_loss:", test_loss, "accuracy:", test_acc)
```
Two convolutional layers
```python
seed = 236
from keras import optimizers
from keras.layers.normalization import BatchNormalization

model = Sequential()
model.add(Embedding(len(vocab) + 1, 100, input_length=100))
model.add(Convolution1D(256, 3, padding="same"))
model.add(MaxPool1D(3, 3, padding="same"))
model.add(Convolution1D(128, 3, padding="same"))
model.add(MaxPool1D(3, 3, padding="same"))
model.add(Convolution1D(64, 3, padding="same"))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(specise, activation="softmax"))
model.summary()

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['accuracy'])
```
```python
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for j, (train_index, test_index) in enumerate(kf.split(y, y)):
    model.fit(sequences[train_index], y_1hot[train_index],
              epochs=50, batch_size=64, verbose=0)
    test_loss, test_acc = model.evaluate(sequences[test_index], y_1hot[test_index])
    print("test_loss:", test_loss, "accuracy:", test_acc)
```
The final model. Since the competition scores submissions by F1, F1 is defined as the metric reported during each training epoch.
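For reference, the quantities implemented below are the standard definitions (TP, FP, FN are true positives, false positives, and false negatives):

$$\text{precision}=\frac{TP}{TP+FP},\qquad \text{recall}=\frac{TP}{TP+FN},\qquad F_1=\frac{2\cdot\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}}$$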
```python
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.utils.np_utils import to_categorical
from keras import initializers
from keras import backend as K
from keras.engine.topology import Layer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
```
```python
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
```
```python
def encode_1hot(label) -> np.array:
    from sklearn.preprocessing import OneHotEncoder
    listUniq = list(set(label))
    # Persist the class order so predictions can be mapped back to labels later.
    np.save("data/listUniq.npy", listUniq)
    print(listUniq, len(listUniq))
    label_onehot_ = []
    for i in label:
        label_onehot_.append(listUniq.index(i))
    labels = np.array(label_onehot_).reshape(len(label_onehot_), -1)
    enc = OneHotEncoder()
    enc.fit(labels)
    tempdata = np.array(enc.transform(labels).toarray())
    return tempdata, len(listUniq)
```
```python
char_line = np.load("swap/char_line_list.npy")
word_line = np.load("swap/word_line_list.npy")
char_line_public = np.load("cnn/char_line_list_public.npy")
word_line_public = np.load("cnn/word_line_list_public.npy")
csv_data = pd.read_csv("swap/new_train.csv")
csv_data_test = pd.read_csv("data/public_test.csv")   # loaded but not used below

# Concatenate each sentence's char tokens with its word tokens. (The original
# `list(char_line[i]).extend(word_line[i])` built a throwaway list and discarded
# it, so the concatenation never actually happened.)
train_and_test = np.array([c + " " + w for c, w in zip(char_line, word_line)])
print("train_and_test len = :", len(train_and_test))

need_to_predict = np.array([c + " " + w for c, w in zip(char_line_public, word_line_public)])
print("need_to_predict len = :", len(need_to_predict))

# Fit the tokenizer on train and test together so the vocabulary covers both.
all_data = np.concatenate([train_and_test, need_to_predict])
print("all_data len = :", len(all_data))

tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer.fit_on_texts(all_data)
vocab = tokenizer.word_index
print("vocabulary size:", len(vocab))

word_ids = tokenizer.texts_to_sequences(train_and_test)
print("word_id len:", len(word_ids))
word_ids_predict = tokenizer.texts_to_sequences(need_to_predict)
print("word_ids_predict len:", len(word_ids_predict))

sequences = pad_sequences(word_ids, maxlen=128)
sequences_predict = pad_sequences(word_ids_predict, maxlen=128)

y = csv_data["label"].values
y_1hot, specise = encode_1hot(y)
```
```python
seed = 256
from keras.models import load_model
from keras import optimizers
from keras.layers.normalization import BatchNormalization

model = Sequential()
model.add(Embedding(len(vocab) + 1, 128, input_length=128))
model.add(Convolution1D(128, 3, padding="same"))
model.add(MaxPool1D(3, 3, padding="same"))
model.add(Convolution1D(128, 3, padding="same"))
model.add(MaxPool1D(3, 3, padding="same"))
model.add(Convolution1D(64, 3, padding="same"))
model.add(MaxPool1D(3, 3, padding="same"))
model.add(Convolution1D(64, 3, padding="same"))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(specise, activation="softmax"))
model.summary()

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)  # defined but unused: adam is passed below
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=[f1_m])

from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
```
```python
for j, (train_index, test_index) in enumerate(kf.split(y, y)):
    print(test_index[0:20])
    model.fit(sequences[train_index], y_1hot[train_index],
              epochs=20, batch_size=64, verbose=1)
    one = model.evaluate(sequences[test_index], y_1hot[test_index])
    print("result == === ==:", one)
    predict = model.predict(sequences_predict)
    predict = np.argmax(predict, axis=1)
    pd.DataFrame({"predict": predict}).to_csv(f"data/result_{j}.csv")
    print(f"model save model/model_{j}.h5")
    model.save(f"model/model_{j}.h5")
```
In the end, each fold's trained model predicts labels for the test set and the predictions are saved to csv. Because the labels were one-hot encoded, the predictions have to be mapped back to the original label names (inverse one-hot encoding); the code below converts the csv files into the submission format the competition requires.
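Before the full script, note that the inverse mapping itself is just an argmax followed by a lookup into the saved class list. A toy sketch (the class list and probability row are made up):

```python
import numpy as np

listUniq = ["A", "B", "C"]              # hypothetical class order saved by encode_1hot
probs = np.array([0.1, 0.7, 0.2])       # one invented softmax output row
print(listUniq[int(np.argmax(probs))])  # -> "B"
```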
Post-processing the model output into the final result
```python
import numpy as np
import pandas as pd
import sys

csv_file = sys.argv[1]
listUniq = np.load("listUniq.npy")
print(listUniq)
print(len(listUniq))

data = pd.read_csv("public_test.csv")
predict_result = pd.read_csv(csv_file)["predict"].values
data.head(2)
print(data["id"].values[1])

id_ = []
predict = []
# The column is spelled 'category' in the csv (see the columns printed at the top).
for i, num in enumerate(data["category"].values):
    if num == 0:
        id_.append(data["id"].values[i])
        predict.append(listUniq[predict_result[i]])

pd.DataFrame({"id": id_, "predict": predict}).to_csv(
    "result_xxyl.csv", index=False, sep="\t", header=False)
```
The final submission file is result_xxyl.csv.
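If the snippet above is saved as a standalone script, say postprocess.py (a name of my choosing), it would be run once per fold result:

```
python postprocess.py data/result_0.csv
```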