在利用字典树判断文章相似度的基础上，能否使用ac自动机来优化时间

技术2022-07-10 173

利用字典树判断两篇文章相似度时，大致思路是这样的。文章中的某个单词（注意，它是一个整体。这个单词中的任意一连续的部分如果在字典树中存在，都不计数）如果在字典树中出现，就令该单词在这篇文章中的出现次数+1。这样可得到每篇文章里各个单词出现的数量，然后判断高频单词是否类似即可。

ac自动机是什么

那么如果使用ac自动机的话，首先什么是ac自动机。ac自动机的本质就是，在字典树的基础上，对其每个节点都添加fail指针。那么当在某个节点之后查询不到待匹配的字母时，就跳到fail指针指向的节点继续匹配，这样这棵字典树只要匹配一遍字符串文本就可以得到答案，消除了原本只使用字典树时不可避免的回溯。

如果不了解字典树和ac自动机，可以看下面这个b站视频，讲的非常好，听一遍就能学肥 ac自动机—b站

使用了ac自动机后会如何匹配

举个例子，P={“is” ,“this”,“his”}，T=“this is me”。对P建立字典树并添加fail指针，如下图所示。

希望的结果是is出现1次，this出现1次，但如果用ac自动机来判断的话，is会出现两次，his也出现了一次。因为T中的this里也包含了is和his，就导致结果出现偏差。

因此，我觉得，给字典树加上fail指针后，优化不了时间，并且会增加时间消耗。原因有两点：

字典树的节点添加fail指针和vector向量后，变得更复杂。本来只使用字典树时，带匹配的文本可以被认为是由一个个单词组成，此时单词是作为一个整体去字典树里匹配，单词不可拆散，某个单词的字母匹配失败时，就认为该单词不存在，匹配下一个单词。而加上ac自动机的话，单词没法作为一个整体去匹配，带匹配的文本相当于是由一个个字符组成，如果某个单词的字母匹配失败，就会跳到fail指针指向的节点继续匹配，这样的话单词已被拆散

添加了ac自动机的代码

如果硬要加上ac自动机查看最后运行效果的话，代码如下

#include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <queue> #include <vector> using namespace std; #define MAX 26 #define swapnum(a,b) {int temp=a;a=b;b=temp;} #define TIMEA #define _CRT_SECURE_NO_WARNINGS； #pragma warning(disable:4996)； //字典树的结构体 typedef struct _TrieNode { int cnt1;//以该节点为底的单词在article1中出现的次数 int cnt2;//以该节点为底的单词在article2中出现的次数 vector<int> exist;//记录以该节点为底的单词的长度 struct _TrieNode *fail;//失配指针 struct _TrieNode *next[MAX];//指针数组 _TrieNode()//构造函数 { cnt1 = 0; cnt2 = 0; fail = NULL; memset(next, NULL, MAX * sizeof(struct _TrieNode *)); } }TrieNode, *TrieNodeptr; //保存单词的结构体 typedef struct Node { int flag;//该单词在另一篇文章中是否出现 int num; char word[50]; }Node, *Nodeptr; Node words1[400000];//结构体数组存所有词 Node words2[400000]; TrieNodeptr createTrieNode(); void insertTrie(TrieNodeptr pRoot, char *str); int searchTrie1(TrieNodeptr root, char *str); int searchTrie2(TrieNodeptr root, char *str); void deleteTrie(TrieNodeptr root, char *str); int getword1(); int getword2(); void countSort1(int exp); void countSort2(int exp); void radixSort1(); void radixSort2(); void swapchar(char *s1, char *s2); void TraverseTrie1(TrieNodeptr root); void TraverseTrie2(TrieNodeptr root); void TraverseTrie(TrieNodeptr root);//用于检查字典树的遍历函数 void switchstru(Nodeptr a, Nodeptr b); int ReadFile(char *path, char data[]); void ac_build(TrieNodeptr root); void ac_query1(TrieNodeptr root, const char *T); void ac_query2(TrieNodeptr root, const char *T); int sum1;//article1里所有不重复单词的数量 int sum2; Node output1[400000]; Node output2[400000]; char word[50];//某个单词 FILE *indic, *instp, *in1, *in2; FILE *out; char dic[7000000];//保存字典里所有的单词 char stp[7000000];// char art1[7000000];//文章1 char art2[7000000];//文章2 clock_t s4, e4, s5, e5; float ms4 = 0.0; float ms5 = 0.0; int main() { int n; int i = 0; int j = 0; int flag; #ifdef TIMEA clock_t start, end; start = clock(); end = clock(); printf("time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入字典单词 indic = fopen("dictionary.txt", "rb"); fseek(indic, 0, SEEK_END);//移动文件流的读写位置 int lengthdic = ftell(indic);//记录当前的读写位置，指向最后一个字符的下一个位置。在文本文件中，lendthdic直接输出可能没有实际意义 fseek(indic, 0, SEEK_SET);//？ fread(dic, lengthdic, 1, indic);//把indic文本中的字符全部读如dic数组中 dic[lengthdic++] = 0;// \0的ascii码是0 //printf("%s",dic); #ifdef TIMEA end = clock(); printf("111 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入停止单词 instp = fopen("stopwords.txt", "rb"); fseek(instp, 0, SEEK_END); int lengthstp = ftell(instp); fseek(instp, 0, SEEK_SET); fread(stp, lengthstp, 1, instp); stp[lengthstp++] = 0; //printf("%d",lengthstp); #ifdef TIMEA end = clock(); printf("222 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入第一篇文章 in1 = fopen("article1.txt", "rb"); fseek(in1, 0, SEEK_END); int length1 = ftell(in1); fseek(in1, 0, SEEK_SET); fread(art1, length1, 1, in1); art1[length1++] = 0; //printf("%s",art1); //printf("%d",length1); #ifdef TIMEA end = clock(); printf("333 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入第二篇文章 in2 = fopen("article2.txt", "rb"); fseek(in2, 0, SEEK_END); int length2 = ftell(in2); fseek(in2, 0, SEEK_SET); fread(art2, length2, 1, in2); art2[length2++] = 0; //printf("%d",length2); #ifdef TIMEA end = clock(); printf("444 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); clock_t start1, end1, start2, end2, start3, end3; //m1：记录跳过所有dic中的换行符的时间 //m2：记录把所有dic中的字母依次写到word单词中的时间 //m3：记录把所有word中的单词依次写入字典树中的时间 float ms1, ms2, ms3; ms1 = ms2 = ms3 = 0.0; #endif // TIMEA // 一建立关于dictionary的字典树分三步 out = fopen("results.txt", "w"); TrieNodeptr rootdic = new TrieNode(); for (i = 0; i < lengthdic; i++) { #ifdef TIMEA start1 = clock(); #endif // TIMEA flag = 0; // 1 如果dic数组中的字符不是字母，就跳过；如果是\0，则结束for循环 while ((dic[i] | 32) > 'z' || 'a' > (dic[i] | 32))//dic[i]|32表示把字母都转化成小写的 { //当dic到了末尾，就记flag为1 if (dic[i] == '\0') { flag = 1; break; } i++; } #ifdef TIMEA end1 = clock(); ms1 += (double)(end1 - start1) * 1000 / CLK_TCK; #endif // TIMEA if (flag == 1) break; j = 0; #ifdef TIMEA start2 = clock(); #endif // TIMEA // 2 如果字符在a到z之间，就逐记录整个单词到word中 while ('a' <= (dic[i] | 32) && (dic[i] | 32) <= 'z') { word[j++] = dic[i] | 32; i++; } word[j] = '\0'; #ifdef TIMEA end2 = clock(); ms2 += (double)(end2 - start2) * 1000 / CLK_TCK; #endif // TIMEA #ifdef TIMEA start3 = clock(); #endif // TIMEA //printf("%s\n",word); // 3 把刚找到的单词word，计入dictionary字典树中 insertTrie(rootdic, word); #ifdef TIMEA end3 = clock(); ms3 += (double)(end3 - start3) * 1000 / CLK_TCK; #endif // TIMEA //if(dic[i]=='\0') break; //printf("%s\n",word); } //检查dic字典树 //TraverseTrie(rootdic); #ifdef TIMEA //建立关于dictionary的字典树的每步的时间 printf("444---inner time=%f,%f,%f\n", ms1, ms2, ms3); #endif // TIMEA #ifdef TIMEA end = clock(); //构建dic字典树的整体时间 printf("555 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //根据stopwords文本，删除字典树rootdic中对应的单词 for (i = 0; i < lengthstp; i++) { flag = 0; while ((stp[i] | 32) > 'z' || 'a' > (stp[i] | 32)) { if (stp[i] == '\0') { flag = 1; break; } i++; } if (flag == 1) break; j = 0; while ('a' <= (stp[i] | 32) && (stp[i] | 32) <= 'z') { word[j++] = stp[i] | 32; i++; } word[j] = '\0'; deleteTrie(rootdic, word); //if(stp[i]=='\0') break; } //检查stp字典树 //TraverseTrie(rootdic); /* 二对rootdic字典树的每个节点构建fail指针 */ ac_build(rootdic); //检查 //printf("\n%d\n",rootdic->next[1]->fail!=NULL?1:0); #ifdef TIMEA end = clock(); //删除字典树rootdic中stopwords单词，以及构建fail指针的时间【占主要部分】 printf("666---构建fail指针 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //rootdic字典树与article1匹配 ac_query1(rootdic, art1); #ifdef TIMEA end = clock(); //rootdic字典树与article1匹配的时间 printf("777---rootdic与article1匹配 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //rootdic字典树与article2匹配 ac_query2(rootdic, art2); //检查 //TraverseTrie(rootdic); #ifdef TIMEA end = clock(); //rootdic字典树与article2匹配的时间 printf("777---rootdic与article2匹配 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //把article1、article2出现的所有单词和出现次数保存到words1[]、words2[]中的时间 TraverseTrie1(rootdic); //for (i = 0; i < sum1; i++) printf("%s %d %d\n", words1[i].word, words1[i].num, words1[i].flag); #ifdef TIMEA end = clock(); //把article1、article2出现的所有单词和出现次数保存到words1[]、words2[]中的时间 printf("777---把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //排words1.num radixSort1(); #ifdef TIMEA end = clock(); //word1按单词出现频率排序的时间 printf("888 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //排words2.num radixSort2(); #ifdef TIMEA end = clock(); printf("999 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //检查 //for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag); scanf("%d", &n); #ifdef TIMEA start = clock(); #endif // TIMEA //这里进行比较操作得出m个值 //root1 第一篇文章中的前n个高频词的字典树 TrieNodeptr root1 = new TrieNode(); for (i = 0; i < n; i++) { insertTrie(root1, words1[i].word); } //TraverseTrie(root1); //puts(""); #ifdef TIMEA end = clock(); printf("aaa time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //root2 第二篇文章中的前n个高频词的字典树 TrieNodeptr root2 = new TrieNode(); for (i = 0; i < n; i++) { insertTrie(root2, words2[i].word); } //TraverseTrie(root2); //puts(""); #ifdef TIMEA end = clock(); printf("bbb time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA /* 判断单词的cnt1和cnt2是否同时大于0即可 */ //判断文章一的前n个高频单词在文章二的字典树中是否出现 for (i = 0; i < n; i++) { if (searchTrie1(root2, words1[i].word)) { words1[i].flag = 1; } } #ifdef TIMEA end = clock(); printf("ccc time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //判断文章二的前n个高频单词在文章一的字典树中是否出现 for (i = 0; i < n; i++) { if (searchTrie2(root1, words2[i].word)) { words2[i].flag = 1; } } #ifdef TIMEA end = clock(); printf("ddd time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //检查words1数组 //for(i=0;i<n;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag); //for(i=0;i<n;i++) printf("%s %d %d\n",words2[i].word,words2[i].num,words2[i].flag); //计算pro1和pro2 int freqm1 = 0;//目标出现次数 int freqn1 = 0; int freqm2 = 0; int freqn2 = 0; //检查 //for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag); for (i = 0; i < n; i++) { if (words1[i].flag == 1) freqm1 += words1[i].num; if (words2[i].flag == 1) freqm2 += words2[i].num; freqn1 += words1[i].num; freqn2 += words2[i].num; } double pro1 = (1.0)*freqm1 / freqn1; double pro2 = (1.0)*freqm2 / freqn2; double sim = (pro1 > pro2) ? (pro2 / pro1) : (pro1 / pro2); printf("%.5lf", sim); fprintf(out, "%.5lf\n\n", sim); #ifdef TIMEA end = clock(); //计算pro的时间 printf("\neee time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA fputc('\n', out); for (i = 0; i < n; i++) { fprintf(out, "%s %d\n", words1[i].word, words1[i].num); } #ifdef TIMEA end = clock(); //文章一的高频词写入result文本的时间 printf("fff time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA for (i = 0; i < n; i++) { fprintf(out, "%s %d\n", words2[i].word, words2[i].num); } #ifdef TIMEA end = clock(); //文章二的高频词写入result文本的时间 printf("ggg time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA fclose(in1); fclose(in2); //fclose(indic); //fclose(instp); fclose(out); } /* 创建字典树的节点 */ //TrieNodeptr createTrieNode() //{ // TrieNodeptr tmp = (TrieNodeptr)malloc(sizeof(TrieNode)); // tmp->nCount = 0; // int i; // for (i = 0; i < MAX; i++) // { // tmp->next[i] = NULL; // } // return tmp; //} /* 把word单词计入rootdic字典树中 */ void insertTrie(TrieNodeptr pRoot, char *str) { TrieNodeptr tmp = pRoot; int i = 0, k; while (str[i] && str[i] != '\n'&&str[i] != '\r') { k = str[i] - 'a'; if (tmp->next[k] == NULL) { tmp->next[k] = new TrieNode(); } tmp = tmp->next[k]; i++; } tmp->exist.push_back(strlen(word));//在单词的最后一个字母上，记录该单词的长度 } /* 通过宽度遍历，构建fail指针 */ void ac_build(TrieNodeptr root) { //构建一个存放字典树节点地址的队列 queue<TrieNodeptr> q; //先把第一层节点的都指向root for (int i = 0; i < MAX; i++) { if (root->next[i]) { root->next[i]->fail = root; q.push(root->next[i]); } } //!!!构建第二层及以下的节点的fail指针 while (!q.empty()) { TrieNodeptr x = q.front(); q.pop(); for (int i = 0; i < MAX; i++) { //如果某个节点存在，表示有字母i+'a' if (x->next[i]) { /* 寻找y的fail指针 */ TrieNodeptr y = x->next[i], fafail = x->fail; //当节点y的父亲节点x的fail指针指向的【节点】存在 //且【该节点】不存在和节点y存放的字母相同的节点 //就不断重复地去寻找【该节点】的fail指针 while (fafail && NULL == fafail->next[i]) fafail = fafail->fail; //如果最终找到的【节点】为空，则令y节点指向root if (NULL == fafail) y->fail = root; else//否则，令y的fail指针指向父亲节点x的fail指针指向的节点的内容为i+'a'的儿子节点 y->fail = fafail->next[i]; //如果那个节点是某个【或某些】单词的末尾，把它们全都计入y的exist[]中 if (y->fail->exist.size()) for (int j = 0; j < y->fail->exist.size(); j++) y->exist.push_back(y->fail->exist[j]); q.push(y); } } } } /* 字典树与字符串匹配 */ void ac_query1(TrieNodeptr root, const char *T) { TrieNodeptr tmp = root; for (int i = 0; i < strlen(T); i++) { int c = T[i] | 32; if (c > 'z' || 'a' > c) continue;//如果不是字母，就跳过 c = c - 'a'; //如果儿子节点不存在该字母，但该节点存在fail指针，则【不断地】去匹配fail指向的节点【找最长后缀】 //直到找到某个节点它的儿子节点具有该字母，或tmp指向了root节点（root的fail为NULL），就不再继续 while (NULL == tmp->next[c] && tmp->fail) { tmp = tmp->fail; //如果该节点可以构成单词，以该节点为底的单词数++ if (tmp->exist.size()) tmp->cnt1++; } //如果此时tmp指向的节点有【儿子节点】保存着这个字母，进入这个儿子节点 //否则，匹配下一个字符 if (tmp->next[c]) tmp = tmp->next[c]; else continue; //如果该节点可以构成单词，以该节点为底的单词数++ if (tmp->exist.size()) tmp->cnt1++; //这段代码得按以上的逻辑来写，虽然看起来有些别扭，但这样写，代码会简单很多 } } void ac_query2(TrieNodeptr root, const char *T) { TrieNodeptr tmp = root; for (int i = 0; i < strlen(T); i++) { int c = T[i] | 32; if (c > 'z' || 'a' > c) continue;//如果不是字母，就跳过 c = c - 'a'; //如果儿子节点不存在该字母，但该节点存在fail指针，则【不断地】去匹配fail指向的节点【找最长后缀】 //直到找到某个节点它的儿子节点具有该字母，或tmp指向了root节点（root的fail为NULL），就不再继续 while (NULL == tmp->next[c] && tmp->fail) { tmp = tmp->fail; if (tmp->exist.size()) tmp->cnt2++; } //如果此时tmp指向的节点有【儿子节点】保存着这个字母，进入这个儿子节点 //否则，匹配下一个字符 if (tmp->next[c]) tmp = tmp->next[c]; else continue; //如果该节点可以构成单词，以该节点为底的单词数++ if (tmp->exist.size()) tmp->cnt2++; //这段代码得按以上的逻辑来写，虽然看起来有些别扭，但这样写，代码会简单很多 } } /* 返回某个单词word在某棵字典树中的出现次数 */ int searchTrie1(TrieNode *root, char *str) { if (root == NULL) return 0; TrieNode *tmp = root; int i = 0, k; while (str[i])//'\0'的ASCII码为0 { k = str[i] - 'a'; if (tmp->next[k]) tmp = tmp->next[k]; else return 0; i++; } return tmp->cnt2; } /* 返回某个单词word在某棵字典树中的出现次数 */ int searchTrie2(TrieNode *root, char *str) { if (root == NULL) return 0; TrieNode *tmp = root; int i = 0, k; while (str[i])//'\0'的ASCII码为0 { k = str[i] - 'a'; if (tmp->next[k]) tmp = tmp->next[k]; else return 0; i++; } return tmp->cnt1; } //根据stopwords文本，删除字典树rootdic中对应的单词 void deleteTrie(TrieNodeptr root, char *str) { int i, index; TrieNodeptr temp = root; for (i = 0; i < strlen(str); i++) { index = str[i] - 'a'; if (temp->next[index] == NULL) return; temp = temp->next[index]; } //temp->exist.pop_back();//？为什么是--，不是=0 temp->exist.clear(); return; } void swapchar(char *s1, char *s2) { char *temp; temp = (char *)malloc(sizeof(char) * 50); strcpy(temp, s1); strcpy(s1, s2); strcpy(s2, temp); free(temp); } //把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中 void TraverseTrie1(TrieNodeptr root) { int i; static char word[40] = { 0 };//全部置为'\0' static int j = 0;//因为是递归，所以这里设为静态 if (root == NULL) return; //for循环里面递归,先j++,后j-- ————>回溯 for (i = 0; i < 26; i++) { if (root->next[i] == NULL) continue; word[j++] = i + 'a';//这里j++ if (root->next[i]->cnt1 > 0)//如果它已经构成了一个单词 { word[j] = '\0'; strcpy(words1[sum1].word, word);//把这个单词写入words1[]中 words1[sum1++].num = root->next[i]->cnt1;//words1[]里的这个单词的数量也更新 } if (root->next[i]->cnt2 > 0) { word[j] = '\0'; strcpy(words2[sum2].word, word); words2[sum2++].num = root->next[i]->cnt2; } TraverseTrie1(root->next[i]); j--;//撤销操作 } } //把article2出现的所有单词和出现次数保存到words2[]中 /*void TraverseTrie2(TrieNodeptr root) { int i; static char word2[40] = { 0 }; static int j2 = 0; if (root == NULL) return; for (i = 0; i < 26; i++) { if (root->next[i] == NULL) continue; word2[j2++] = i + 'a'; if (root->next[i]->cnt2 > 0) { word2[j2] = '\0'; strcpy(words2[sum2].word, word2); words2[sum2++].num = root->next[i]->cnt2; } TraverseTrie2(root->next[i]); j2--; } }*/ //用于检查字典树的遍历函数 void TraverseTrie(TrieNodeptr root) { int i; static char word[40] = { 0 }; static int j = 0; if (root == NULL) return; for (i = 0; i < 26; i++) { if (root->next[i] == NULL) continue; word[j++] = i + 'a'; if (root->next[i]->exist.size() > 0) { word[j] = '\0'; printf("%s %d %d %d\n", word, root->next[i]->exist.size(),root->next[i]->cnt1, root->next[i]->cnt2); } TraverseTrie(root->next[i]); j--; } } void countSort1(int exp) { int i, buckets[10] = { 0 }; for (i = 0; i < sum1; i++) { buckets[(words1[i].num / exp) % 10]++; }//每个桶的的值 swapnum(buckets[0], buckets[9]); swapnum(buckets[1], buckets[8]); swapnum(buckets[2], buckets[7]); swapnum(buckets[3], buckets[6]); swapnum(buckets[4], buckets[5]); for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序 for (i = sum1 - 1; i >= 0; i--) { switchstru(&output1[buckets[9 - (words1[i].num / exp) % 10] - 1], &words1[i]); buckets[9 - (words1[i].num / exp) % 10]--; } //检查 //for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num); for (i = 0; i < sum1; i++) { switchstru(&words1[i], &output1[i]); } } void countSort2(int exp) { int i, buckets[10] = { 0 }; for (i = 0; i < sum2; i++) { buckets[(words2[i].num / exp) % 10]++; }//每个桶的的值 swapnum(buckets[0], buckets[9]); swapnum(buckets[1], buckets[8]); swapnum(buckets[2], buckets[7]); swapnum(buckets[3], buckets[6]); swapnum(buckets[4], buckets[5]); for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序 for (i = sum2 - 1; i >= 0; i--) { switchstru(&output2[buckets[9 - (words2[i].num / exp) % 10] - 1], &words2[i]); buckets[9 - (words2[i].num / exp) % 10]--; } //检查 //for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num); for (i = 0; i < sum2; i++) { switchstru(&words2[i], &output2[i]); } } void radixSort1() { int exp; int max = 50000; for (exp = 1; max / exp > 0; exp *= 10) countSort1(exp); } void radixSort2() { int exp; int max = 50000; for (exp = 1; max / exp > 0; exp *= 10) countSort2(exp); } void switchstru(Nodeptr a, Nodeptr b) { *a = *b; }

对比时间

由于我本人比较菜，写出的代码质量不高，改了以后导致花费时间上升了好几个数量级。最后的运行时间对比如下：第一张是加上ac自动机后的运行时间；第二张是原本只是用字典树的运行时间

Processed: 0.029, SQL: 10

在利用字典树判断文章相似度的基础上，能否使用ac自动机来优化时间

目录

字典树是怎么判断文章相似度的

ac自动机是什么

使用了ac自动机后会如何匹配

添加了ac自动机的代码

对比时间