import faiss
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
d = 512
n_data = 2000
np.random.seed(0)
# Generate 2000 Gaussian vectors (mean 3, std 0.1) as the database.
data = []
mu = 3
sigma = 0.1
for i in range(n_data):
    data.append(np.random.normal(mu, sigma, d))
data = np.array(data).astype('float32')  # faiss expects float32
print(data.shape)
# Sanity-check the distribution of one vector's components.
plt.hist(data[5])
plt.show()
# Generate 10 query vectors from the same distribution.
n_query = 10
mu = 3
sigma = 0.1
np.random.seed(12)
query = []
for i in range(n_query):
    query.append(np.random.normal(mu, sigma, d))
query = np.array(query).astype('float32')
# Exact (brute-force) L2 index.
index = faiss.IndexFlatL2(d)
print(index.is_trained)  # True: flat indexes need no training
index.add(data)
print(index.ntotal)      # number of stored vectors
k = 10
query_self = data[:5]  # use the first 5 database vectors as queries
dis, ind = index.search(query_self, k)
print(dis.shape)
print(ind.shape)
print(dis)
print(ind)
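# Sanity check (a small added sketch, not in the original): with an exact
# index, each database vector used as a query should find itself first,
# at (numerically) zero distance.
assert (ind[:, 0] == np.arange(5)).all()
print(dis[:, 0])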
# IVF index: vectors are partitioned into nlist clusters by the quantizer;
# training learns the cluster centroids.
nlist = 50
k = 10
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
print(index.is_trained)  # False: IVF indexes must be trained first
index.train(data)
print(index.is_trained)  # True after training
index.add(data)
index.nprobe = 1  # search only the single closest cluster
dis, ind = index.search(query_self, k)
print(dis)
print(ind)
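# Illustrative sketch (not in the original): nprobe sets how many of the
# nlist inverted lists are scanned per query. nprobe=1 is fastest but can
# miss true neighbours; nprobe=nlist degenerates to exhaustive search.
for nprobe in (1, 10, 50):
    index.nprobe = nprobe
    dis, ind = index.search(query_self, k)
    print('nprobe=%d, top hits: %s' % (nprobe, ind[:, 0]))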
# IVF + product quantization: each vector is compressed to m sub-codes
# of 8 bits each, so the reported distances are approximations.
nlist = 50
m = 8   # number of sub-quantizers; d must be divisible by m
k = 10
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)  # 8 bits per sub-code
index.train(data)
index.add(data)
index.nprobe = 50  # scan all 50 clusters: exhaustive but still approximate
dis, ind = index.search(query_self, k)
print(dis)
print(ind)
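# Rough recall check (a sketch, not in the original): compare the PQ
# results against an exact index to see how often the true nearest
# neighbour survives the compression.
exact = faiss.IndexFlatL2(d)
exact.add(data)
_, exact_ind = exact.search(query_self, k)
recall = float((ind == exact_ind[:, :1]).sum()) / len(query_self)
print('fraction of queries whose true NN appears in the PQ top-%d: %.2f' % (k, recall))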
"""
dis, ind = index.search(query, k) # 真实查询
print(dis)
print(ind)
"""
import re
import numpy as np
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from tqdm import tqdm
from pprint import pprint
import os
# Load the Chinese stop-word list, one word per line.
with open('chinese_stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file.readlines()]
news = pd.read_csv('sqlResult.csv', encoding='gb18030')
print(news.shape)
print(news.head(5))
# Some articles have no content; inspect and drop those rows.
print(news[news.content.isna()].head(5))
news = news.dropna(subset=['content'])
print(news.shape)
def split_text(text):
    # Remove whitespace, tokenize with jieba, and drop stop words.
    text = text.replace(' ', '')
    text = text.replace('\n', '')
    text2 = jieba.cut(text.strip())
    result = ' '.join([w for w in text2 if w not in stopwords])
    return result
print(news.iloc[0].content)
print(split_text(news.iloc[0].content))
if not os.path.exists("corpus.pkl"):
corpus=list(map(split_text,[str(i) for i in news.content]))
print(corpus[0])
print(len(corpus))
print(corpus[1])
with open('corpus.pkl','wb') as file:
pickle.dump(corpus, file)
else:
with open('corpus.pkl','rb') as file:
corpus = pickle.load(file)
# Bag-of-words counts, then tf-idf weighting; min_df=0.015 prunes rare terms.
countvectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
tfidftransformer = TfidfTransformer()
countvector = countvectorizer.fit_transform(corpus)
print(countvector.shape)
tfidf = tfidftransformer.fit_transform(countvector)
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf, file)
print(tfidf.shape)
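"""
# Equivalent one-step variant (a sketch, assuming the same min_df):
# CountVectorizer + TfidfTransformer can be collapsed into a single TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.015).fit_transform(corpus)
"""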
# Label each article 1 if its source is Xinhua ('新华'), else 0.
label = list(map(lambda source: 1 if '新华' in str(source) else 0, news.source))
X_train, X_test, y_train, y_test = train_test_split(tfidf.toarray(), label, test_size=0.3, random_state=42)
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)
"""
# 进行CV=3折交叉验证
scores=cross_validate(clf, X_train, y_train, scoring=('accuracy','precision','recall','f1'), cv=3, return_train_score=True)
pprint(scores)
"""
y_predict = clf.predict(X_test)
def show_test_result(y_true, y_pred):
    print('accuracy:', accuracy_score(y_true, y_pred))
    print('precision:', precision_score(y_true, y_pred))
    print('recall:', recall_score(y_true, y_pred))
    print('f1_score:', f1_score(y_true, y_pred))
show_test_result(y_test, y_predict)
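# Quick error breakdown (a sketch, not in the original): the off-diagonal
# cells of the confusion matrix are the misclassified articles.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predict))  # rows: true label, cols: predicted label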
# Classify the full corpus: articles written in Xinhua style (predicted 1)
# but not actually sourced from Xinhua (label 0) are plagiarism candidates.
prediction = clf.predict(tfidf.toarray())
labels = np.array(label)
compare_news_index = pd.DataFrame({'prediction': prediction, 'labels': labels})
copy_news_index = compare_news_index[(compare_news_index['prediction'] == 1) & (compare_news_index['labels'] == 0)].index
xinhuashe_news_index = compare_news_index[(compare_news_index['labels'] == 1)].index
print('Number of suspected copies:', len(copy_news_index))
if not os.path.exists("label.pkl"):
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
normalizer = Normalizer()
scaled_array = normalizer.fit_transform(tfidf.toarray())
kmeans = KMeans(n_clusters=25,random_state=42,n_jobs=-1)
k_labels = kmeans.fit_predict(scaled_array)
with open('label.pkl','wb') as file:
pickle.dump(k_labels, file)
print(k_labels.shape)
print(k_labels[0])
else:
with open('label.pkl','rb') as file:
k_labels = pickle.load(file)
if not os.path.exists("id_class.pkl"):
id_class = {index:class_ for index, class_ in enumerate(k_labels)}
with open('id_class.pkl','wb') as file:
pickle.dump(id_class, file)
else:
with open('id_class.pkl','rb') as file:
id_class = pickle.load(file)
if not os.path.exists("class_id.pkl"):
from collections import defaultdict
class_id = defaultdict(set)
for index,class_ in id_class.items():
if index in xinhuashe_news_index.tolist():
class_id[class_].add(index)
with open('class_id.pkl','wb') as file:
pickle.dump(class_id, file)
else:
with open('class_id.pkl','rb') as file:
class_id = pickle.load(file)
# Show how many Xinhua articles fall into each cluster.
for count, k in enumerate(class_id):
    print(count, len(class_id[k]))
def find_similar_text(cpindex, top=10):
    # Only compare against Xinhua articles in the same cluster as the candidate.
    dist_dict = {i: cosine_similarity(tfidf[cpindex], tfidf[i]) for i in class_id[id_class[cpindex]]}
    return sorted(dist_dict.items(), key=lambda x: x[1][0], reverse=True)[:top]
import editdistance
cpindex = 3352  # one of the suspected copies found above
similar_list = find_similar_text(cpindex)
print(similar_list)
print('Suspected copy:\n', news.iloc[cpindex].content)
similar2 = similar_list[0][0]  # index of the most similar Xinhua article
print('Most similar original:\n', news.iloc[similar2].content)
print('Edit distance:', editdistance.eval(corpus[cpindex], corpus[similar2]))
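# Raw edit distance grows with text length; a length-normalized similarity
# in [0, 1] is easier to compare across pairs (a sketch, not in the original):
max_len = max(len(corpus[cpindex]), len(corpus[similar2]))
print('normalized similarity:', 1 - editdistance.eval(corpus[cpindex], corpus[similar2]) / max_len)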
def find_similar_sentence(candidate, raw):
    # Compare every candidate sentence against every original sentence and
    # print the five pairs with the smallest edit distance.
    similist = []
    cl = candidate.strip().split('。')
    ra = raw.strip().split('。')
    for c in cl:
        for r in ra:
            similist.append([c, r, editdistance.eval(c, r)])
    sort = sorted(similist, key=lambda x: x[2])[:5]
    for c, r, ed in sort:
        if c != '' and r != '':
            print('Suspected copied sentence: {0}\nSimilar original sentence: {1}\nEdit distance: {2}\n'.format(c, r, ed))
find_similar_sentence(news.iloc[cpindex].content, news.iloc[similar2].content)
# Standalone variant: find similar articles with faiss instead of pairwise
# cosine similarity.
import numpy as np
import pandas as pd
import faiss
import pickle
news = pd.read_csv('sqlResult.csv', encoding='gb18030')
news = news.dropna(subset=['content'])
with open('./tfidf.pkl', 'rb') as file:
    tfidf = pickle.load(file)
d = tfidf.shape[1]
tfidf = tfidf.toarray().astype(np.float32)  # faiss requires dense float32
index = faiss.IndexFlatL2(d)
index.add(tfidf)
k = 10
cpindex = 3352
query_self = tfidf[cpindex:cpindex+1]  # keep 2-D: faiss expects a batch of queries
dis, ind = index.search(query_self, k)
print(ind.shape)
print('Suspected copy:\n', news.iloc[cpindex].content)
similar2 = ind[0][1]  # ind[0][0] is the query itself; take the second hit
print('Most similar original:\n', news.iloc[similar2].content)
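# Variant sketch (not in the original): for cosine similarity rather than
# L2 distance, L2-normalize the rows and use an inner-product index.
faiss.normalize_L2(tfidf)        # in-place row normalization
index_ip = faiss.IndexFlatIP(d)
index_ip.add(tfidf)
sim, ind_ip = index_ip.search(tfidf[cpindex:cpindex+1], k)
print(sim)     # inner products of unit vectors = cosine similarities
print(ind_ip)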