设计思路
# -*- coding:utf-8 -*-
"""BM25 retrieval script for the 2020 Information Retrieval final exam.

Author: 凌珑

Design notes:
  * Tokenize with jieba and remove stop words.
  * Stemming / lemmatization with nltk was tried as preprocessing, but the
    results were too poor, so it is not used.
  * Build the index with gensim's BM25 model.
  * Give every query a confidence window of document positions and search
    the queries one by one.
  * Collect the hits in a dict and write the dict out in the required
    format.
  * (Elasticsearch would work just as well.)
"""
import time

# jieba and gensim are imported inside the functions that use them
# (cutsentence / makeindex) so the pure-Python helpers in this module stay
# importable without the NLP stack installed.
try:
    # Only the disabled word_tokenize experiment mentioned in cutsentence()
    # needs nltk; keep the name available when the package is installed, but
    # do not make it a hard requirement.
    import nltk.tokenize  # noqa: F401
except ImportError:
    nltk = None

# Confidence windows. Each row is
#   (first query line, one-past-last query line,
#    first document position, last document position)
# Query lines are 0-based positions in the query file; document positions are
# 0-based positions in the corpus and are inclusive on BOTH ends.
QUERY_WINDOWS = (
    (0, 200, 6000, 16000),
    (200, 400, 16000, 25000),
    (400, 600, 25000, 35000),
    (600, 800, 35000, 41000),
    (800, 1100, 41000, 46000),
)


# ---------------------------------------------------------------------------
# Index building
# ---------------------------------------------------------------------------
def readfile(filename):
    """Open *filename* for reading as UTF-8 text and return the file object.

    The caller owns the returned handle and is responsible for closing it
    (for example with ``with readfile(name) as f:``).
    """
    file = open(filename, 'r', encoding='UTF-8')
    print("获取文件成功")
    return file


def makestops(path='stopwords.txt'):
    """Return the set of stop words stored one per line in *path*.

    The file is read as UTF-8, like every other file this script touches,
    instead of the platform default encoding.
    NOTE(review): this assumes the stop-word list is UTF-8 encoded; confirm
    for lists that were produced on a non-UTF-8 system.
    """
    with open(path, 'r', encoding='UTF-8') as f:
        return {line.strip('\n') for line in f}


def cutsentence(sen, stops):
    """Tokenize *sen* with jieba's search-mode cut and drop stop words.

    Whitespace splitting and ``nltk.tokenize.word_tokenize`` were tried as
    alternatives to jieba and are not used.
    """
    # Imported here so importing this module does not require jieba; after
    # the first call this is only a cheap module-cache lookup.
    import jieba

    words = jieba.lcut_for_search(sen.strip(), HMM=True)
    return [word for word in words if word not in stops]


def pretreatment(datafile, corpus, dataid):
    """Tokenize every document in *datafile*, filling *corpus* and *dataid*.

    Each line is expected to be ``<doc id>\\t<text>``. The token list is
    appended to *corpus* and the id to *dataid*, so both lists share the same
    positions. Blank lines are skipped.
    """
    begin = time.time()
    stops = makestops()
    for sentence in datafile:
        if not sentence.strip():
            # A blank line (typically at the end of the file) has no tab and
            # would otherwise fail on sen[1] below.
            continue
        sen = sentence.split("\t", 1)
        corpus.append(cutsentence(sen[1], stops))
        dataid.append(sen[0])
    end = time.time()
    print("预处理用时:"+str(end-begin)+"秒")


# ---------------------------------------------------------------------------
# Searching
# ---------------------------------------------------------------------------
def selecthits(scores, coden, topk=100, want=10):
    """Pick the best in-window document positions for one query.

    Args:
        scores: one score per document, indexed by document position.
        coden: 0-based line number of the query; selects the row of
            QUERY_WINDOWS to apply.
        topk: how many of the best-scoring documents are inspected.
        want: maximum number of positions to return.

    Returns:
        Up to *want* document positions, best score first, taken from the
        *topk* best-scoring documents that lie inside the query's window.
        The list is empty when *coden* falls in no window.
    """
    window = None
    for query_lo, query_hi, doc_lo, doc_hi in QUERY_WINDOWS:
        if query_lo <= coden < query_hi:
            window = (doc_lo, doc_hi)
            break
    if window is None:
        return []
    doc_lo, doc_hi = window

    # Rank positions rather than score values: looking a value up again with
    # list.index() returns the same first position for every tied score, so
    # one document would be reported several times. sorted() is stable, so
    # tied documents keep their corpus order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos],
                    reverse=True)

    hits = []
    # Slicing copes with corpora that hold fewer than *topk* documents.
    for pos in ranked[:topk]:
        if doc_lo <= pos <= doc_hi:
            hits.append(pos)
            if len(hits) == want:
                break
    return hits


def search(queryfile, bm25model, answerdict):
    """Run every query in *queryfile* and store its hits in *answerdict*.

    Each line is expected to be ``<query id>\\t<text>``. For every query,
    ``answerdict[query id]`` is set to a list of exactly 10 document
    positions, padded with -1 when fewer hits were found. *bm25model* must
    provide ``get_scores(tokens)`` returning one score per document.
    """
    begin = time.time()
    stops = makestops()
    # The line number keeps counting over blank lines so that the mapping
    # from query line to confidence window is not shifted by them.
    for coden, line in enumerate(queryfile):
        query = line.strip()
        if not query:
            continue
        sen = query.split("\t", 1)
        words = cutsentence(sen[1], stops)
        scores = bm25model.get_scores(words)
        idx = selecthits(scores, coden)
        idx.extend([-1] * (10 - len(idx)))
        answerdict[sen[0]] = idx
    end = time.time()
    print("搜索用时:"+str(end-begin)+"秒")


def writeindex(answerdict, file, dataid):
    """Write the search results to *file*, one query per line.

    Line format: ``<query id>\\t<id> <id> ... <id>`` with exactly 10 ids.
    A position of -1 (no hit) is written as ``D-1``; any other position is
    translated through *dataid*. Hit lists shorter than 10 are padded with
    ``D-1``.
    """
    with open(file, 'w', encoding='UTF-8') as indexdoc:
        for key, hits in answerdict.items():
            cells = ['D-1' if pos == -1 else dataid[pos] for pos in hits[:10]]
            cells.extend(['D-1'] * (10 - len(cells)))
            indexdoc.write(key + '\t' + ' '.join(cells) + '\n')
    print("写index文件成功")


def makeindex(dataname, queryname, resultname):
    """Build a BM25 index over *dataname*, run *queryname*, write *resultname*.

    Progress and timing information is printed along the way.
    """
    # gensim.summarization was removed in gensim 4.0, so this needs
    # gensim < 4. Importing first makes a missing or incompatible gensim fail
    # before any expensive preprocessing starts.
    from gensim.summarization import bm25

    begin0 = time.time()

    # -------------------------------------------------------------- index
    begin1 = time.time()
    print("开始建立索引")
    # Read the documents
    print("开始读取data文件")
    corpus = []
    dataid = []
    with readfile(dataname) as datafile:
        # Preprocess
        print("开始预处理文本")
        pretreatment(datafile, corpus, dataid)
    # Build the BM25 model with gensim
    print("开始构建语料库")
    bm25model = bm25.BM25(corpus)
    print("建立完成")
    # Index build time
    end1 = time.time()
    print("建索引成功,用时:"+str(end1-begin1)+"秒\n")

    # ------------------------------------------------------------- search
    begin2 = time.time()
    print("开始检索")
    # Read the queries
    print("开始读取query文件")
    answerdict = dict()
    with readfile(queryname) as queryfile:
        # Search query by query, collecting the hits in the dict
        print("开始搜索")
        search(queryfile, bm25model, answerdict)
    # Write the result file
    print("开始写入index文件")
    writeindex(answerdict, resultname, dataid)
    end2 = time.time()
    print("检索成功,用时:" + str(end2 - begin2) + "秒")

    # -------------------------------------------------------------- total
    end0 = time.time()
    print("总用时:" + str(end0 - begin0) + "秒")


# Entry point
if __name__ == "__main__":
    makeindex('documents.txt', 'q1000.txt', '凌珑.txt')