如何用字典树来判断两篇文章的相似性

技术2022-07-10 152

问题描述

简单讲就是，给你两篇文章，问你这两篇文章是不是同一个人写的。¹ 具体内容如下图：

解题思路

把dictionary文本里的所有单词读入dic[]中，输出读入时间111 把stop words文本里的所有单词读入stp[]中，输出读入时间222 把artical1文本里的所有单词读入art1[]中，输出读入时间333 把artical2文本里的所有单词读入art2[]中，输出读入时间444

利用dic[]，建立关于dictionary的字典树rootdic，（分三步）输出建立时间55 m1：记录跳过所有dic[]中的换行符的时间，并输出 m2：记录把所有dic[]中的字母依次写到word[]中的时间，并输出 m3：记录把所有word[]中的单词依次写入字典树rootdic中的时间，并输出利用stp[]，删除dictionary的字典树rootdic中对应的单词

构建第一篇文章的字典树rootwords1，并把其中所有的单词写入结构体数组words1[]中。输出删除、构建、写入的时间666 【在构建字典树的同时，记录单词出现的次数】构建第二篇文章的字典树rootwords2，并把其中所有的单词写入结构体数组words2[]中。输出构建、写入的时间777

给words1中所有的单词按出现频率排序，并输出排序时间888 给words2中所有的单词按出现频率排序，并输出排序时间999

构建article1的前n个高频词的字典树root1，并输出构建时间aaa 构建article2的前n个高频词的字典树root2，并输出构建时间bbb

判断article1的前n个高频单词在article2的字典树中是否出现，并输出判断时间ccc 判断article2的前n个高频单词在article1的字典树中是否出现，并输出判断时间ddd

计算两篇文章的pro，并输出计算时间eee article1的前n个高频词写入result文本的时间，并输出写入时间fff article2的前n个高频词写入result文本的时间，并输出写入时间ggg

源代码

#include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #define MAX 26 #define swapnum(a,b) {int temp=a;a=b;b=temp;} #define TIMEA typedef struct TrieNode { int nCount; struct TrieNode *next[MAX]; }TrieNode, *TrieNodeptr;//字典树的结构体 typedef struct Node { int flag;//该单词在另一篇文章中是否出现 int num; char word[50]; }Node, *Nodeptr; Node words1[400000];//结构体数组存article1的所有词 Node words2[400000]; TrieNodeptr createTrieNode(); void insertTrie(TrieNodeptr pRoot, char *str); int searchTrie(TrieNodeptr root, char *str); void deleteTrie(TrieNodeptr root, char *str); int getword1(); int getword2(); void countSort1(int exp); void countSort2(int exp); void radixSort1(); void radixSort2(); void swapchar(char *s1, char *s2); void TraverseTrie1(TrieNodeptr root); void TraverseTrie2(TrieNodeptr root); void TraverseTrie(TrieNodeptr root);//用于检查字典树的遍历函数 void switchstru(Nodeptr a, Nodeptr b); int ReadFile(char *path, char data[]); int sum1;//article1里所有不重复单词的数量 int sum2; Node output1[400000]; Node output2[400000]; char word[50];//某个单词 FILE *indic, *instp, *in1, *in2; FILE *out; char dic[7000000];//保存字典里所有的单词 char stp[7000000];// char art1[7000000];//文章1 char art2[7000000];//文章2 clock_t s4, e4, s5, e5; float ms4 = 0.0; float ms5 = 0.0; int main() { int n; int i = 0; int j = 0; int flag; #ifdef TIMEA clock_t start, end; start = clock(); end = clock(); printf("time=%f\n", (double)(end - start) * 1000 / CLK_TCK); #endif // TIMEA //读入字典单词 indic = fopen("dictionary.txt", "rb"); fseek(indic, 0, SEEK_END);//移动文件流的读写位置 int lengthdic = ftell(indic);//记录当前的读写位置，指向最后一个字符的下一个位置。在文本文件中，lendthdic直接输出可能没有实际意义 fseek(indic, 0, SEEK_SET);//？ fread(dic, lengthdic, 1, indic);//把indic文本中的字符全部读如dic数组中 dic[lengthdic++] = 0;// \0的ascii码是0 //printf("%s",dic); #ifdef TIMEA end = clock(); printf("111 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入停止单词 instp = fopen("stopwords.txt", "rb"); fseek(instp, 0, SEEK_END); int lengthstp = ftell(instp); fseek(instp, 0, SEEK_SET); fread(stp, lengthstp, 1, instp); stp[lengthstp++] = 0; //printf("%d",lengthstp); #ifdef TIMEA end = clock(); printf("222 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入第一篇文章 in1 = fopen("article1.txt", "rb"); fseek(in1, 0, SEEK_END); int length1 = ftell(in1); fseek(in1, 0, SEEK_SET); fread(art1, length1, 1, in1); art1[length1++] = 0; //printf("%s",art1); //printf("%d",length1); #ifdef TIMEA end = clock(); printf("333 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //读入第二篇文章 in2 = fopen("article2.txt", "rb"); fseek(in2, 0, SEEK_END); int length2 = ftell(in2); fseek(in2, 0, SEEK_SET); fread(art2, length2, 1, in2); art2[length2++] = 0; //printf("%d",length2); #ifdef TIMEA end = clock(); printf("444 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); clock_t start1, end1, start2, end2, start3, end3; //m1：记录跳过所有dic中的换行符的时间 //m2：记录把所有dic中的字母依次写到word单词中的时间 //m3：记录把所有word中的单词依次写入字典树中的时间 float ms1, ms2, ms3; ms1 = ms2 = ms3 = 0.0; #endif // TIMEA //建立关于dictionary的字典树分三步 out = fopen("results.txt", "w"); TrieNodeptr rootdic = createTrieNode(); for (i = 0; i < lengthdic; i++) { #ifdef TIMEA start1 = clock(); #endif // TIMEA flag = 0; // 1 如果dic[]中的字符不是字母，就跳过；如果是\0，则结束for循环 while ((dic[i] | 32) > 'z' || 'a' > (dic[i] | 32))//dic[i]|32表示把字母都转化成小写的 { if (dic[i] == '\0') { flag = 1; break; } i++; } #ifdef TIMEA end1 = clock(); ms1 += (double)(end1 - start1) * 1000 / CLK_TCK; #endif // TIMEA if (flag == 1) break; j = 0; #ifdef TIMEA start2 = clock(); #endif // TIMEA // 2 如果字符在a到z之间，就逐记录整个单词到word中 while ('a' <= (dic[i] | 32) && (dic[i] | 32) <= 'z') { word[j++] = dic[i] | 32; i++; } word[j] = '\0'; #ifdef TIMEA end2 = clock(); ms2 += (double)(end2 - start2) * 1000 / CLK_TCK; #endif // TIMEA #ifdef TIMEA start3 = clock(); #endif // TIMEA //printf("%s\n",word); // 3 把刚找到的单词word，计入dictionary字典树中 insertTrie(rootdic, word); #ifdef TIMEA end3 = clock(); ms3 += (double)(end3 - start3) * 1000 / CLK_TCK; #endif // TIMEA //if(dic[i]=='\0') break; //printf("%s\n",word); } //检查dic字典树 //TraverseTrie(rootdic); #ifdef TIMEA //建立关于dictionary的字典树的每步的时间 printf("444---inner time=%f,%f,%f,%f,%f\n", ms1, ms2, ms3, ms4, ms5); #endif // TIMEA #ifdef TIMEA end = clock(); //构建dic字典树的整体时间 printf("555 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //根据stopwords文本，删除字典树rootdic中对应的单词 for (i = 0; i < lengthstp; i++) { flag = 0; while ((stp[i] | 32) > 'z' || 'a' > (stp[i] | 32)) { if (stp[i] == '\0') { flag = 1; break; } i++; } if (flag == 1) break; j = 0; while ('a' <= (stp[i] | 32) && (stp[i] | 32) <= 'z') { word[j++] = stp[i] | 32; i++; } word[j] = '\0'; deleteTrie(rootdic, word); //if(stp[i]=='\0') break; } //检查stp字典树 //TraverseTrie(rootstp); TrieNodeptr rootwords1 = createTrieNode();//建立第一篇文章的字典树 TrieNodeptr rootwords2 = createTrieNode();//建立第二篇文章的字典树 //通过j来判断是否为单词末尾，是（j!=-1），不是(j==-1)，如这样。 j = -1;//（这行我添加的，不然第一个单词无法正确读入word中） //构建第一篇文章的字典树的过程 for (i = 0; i <= length1; i++) { //如果是小写字母，j++ if (art1[i] >= 'a'&&art1[i] <= 'z') j++; //如果是大写字母，转化成小写字母 else if (art1[i] >= 'A'&&art1[i] <= 'Z') { art1[i] |= 32; j++; } //如果不是字母，且为单词末尾，就把该位置的字符转化成\0，然后判断这个单词 else if (j != -1) { j++; art1[i] = 0; strcpy(word, art1 + i - j);//strcpy当碰到str2的\0标识时就会停止复制 if (searchTrie(rootdic, word))//判断单词word是否在rootdic中存在 { insertTrie(rootwords1, word); } j = -1; //printf("%s\n",word); } //printf("%s\n",word); } //TraverseTrie(rootwords1); TraverseTrie1(rootwords1); #ifdef TIMEA end = clock(); //删除rootdic中stopwords出现的单词和建立第一篇文章字典树的整体时间 printf("666 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //构建第二篇文章的字典树的过程 for (i = 0; i <= length2; i++) { if (art2[i] >= 'a'&&art2[i] <= 'z') j++; else if (art2[i] >= 'A'&&art2[i] <= 'Z') { art2[i] |= 32; j++; } else if (j != -1) { j++; art2[i] = 0; strcpy(word, art2 + i - j); if (searchTrie(rootdic, word))//在dic中出现在stp中不出现 { insertTrie(rootwords2, word); } j = -1; //printf("%s\n",word); } //printf("%s\n",word); } //TraverseTrie(rootwords2); TraverseTrie2(rootwords2); #ifdef TIMEA end = clock(); //建立第二篇文章的字典树的时间 printf("777 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //排words1.num radixSort1(); #ifdef TIMEA end = clock(); //word1按单词出现频率排序的时间 printf("888 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //排words2.num radixSort2(); #ifdef TIMEA end = clock(); printf("999 time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //检查 //for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag); scanf("%d", &n); #ifdef TIMEA start = clock(); #endif // TIMEA //这里进行比较操作得出m个值 //root1 第一篇文章中的前n个高频词的字典树 TrieNodeptr root1 = createTrieNode(); for (i = 0; i < n; i++) { insertTrie(root1, words1[i].word); } //TraverseTrie(root1); //puts(""); #ifdef TIMEA end = clock(); printf("aaa time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //root2 第二篇文章中的前n个高频词的字典树 TrieNodeptr root2 = createTrieNode(); for (i = 0; i < n; i++) { insertTrie(root2, words2[i].word); } //TraverseTrie(root2); //puts(""); #ifdef TIMEA end = clock(); printf("bbb time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //判断文章一的前n个高频单词在文章二的字典树中是否出现 for (i = 0; i < n; i++) { if (searchTrie(root2, words1[i].word)) { words1[i].flag = 1; } } #ifdef TIMEA end = clock(); printf("ccc time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //判断文章二的前n个高频单词在文章一的字典树中是否出现 for (i = 0; i < n; i++) { if (searchTrie(root1, words2[i].word)) { words2[i].flag = 1; } } #ifdef TIMEA end = clock(); printf("ddd time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA //检查words1数组 //for(i=0;i<n;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag); //for(i=0;i<n;i++) printf("%s %d %d %d\n",words2[i].word,words2[i].num,words2[i].flag,i); //计算pro1和pro2 int freqm1 = 0;//目标出现次数 int freqn1 = 0; int freqm2 = 0; int freqn2 = 0; //检查 //for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag); for (i = 0; i < n; i++) { if (words1[i].flag == 1) freqm1 += words1[i].num; if (words2[i].flag == 1) freqm2 += words2[i].num; freqn1 += words1[i].num; freqn2 += words2[i].num; } double pro1 = (1.0)*freqm1 / freqn1; double pro2 = (1.0)*freqm2 / freqn2; double sim = (pro1 > pro2) ? (pro2 / pro1) : (pro1 / pro2); printf("%.5lf", sim); fprintf(out, "%.5lf\n\n", sim); #ifdef TIMEA end = clock(); //计算pro的时间 printf("\neee time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA fputc('\n', out); for (i = 0; i < n; i++) { fprintf(out, "%s %d\n", words1[i].word, words1[i].num); } #ifdef TIMEA end = clock(); //文章一的高频词写入result文本的时间 printf("fff time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA for (i = 0; i < n; i++) { fprintf(out, "%s %d\n", words2[i].word, words2[i].num); } #ifdef TIMEA end = clock(); //文章二的高频词写入result文本的时间 printf("ggg time=%f\n", (double)(end - start) * 1000 / CLK_TCK); start = clock(); #endif // TIMEA fclose(in1); fclose(in2); //fclose(indic); //fclose(instp); fclose(out); } /* 创建字典树的节点 */ TrieNodeptr createTrieNode() { TrieNodeptr tmp = (TrieNodeptr)malloc(sizeof(TrieNode)); tmp->nCount = 0; int i; for (i = 0; i < MAX; i++) { tmp->next[i] = NULL; } return tmp; } /* 把word单词计入rootdic字典树中 */ void insertTrie(TrieNodeptr pRoot, char *str) { TrieNodeptr tmp = pRoot; int i = 0, k; while (str[i] && str[i] != '\n'&&str[i] != '\r') { k = str[i] - 'a'; if (tmp->next[k] == NULL) { #ifdef TIMEA s4 = clock(); #endif // TIMEA tmp->next[k] = createTrieNode(); #ifdef TIMEA e4 = clock(); ms4 += (double)(e4 - s4) * 1000 / CLK_TCK; #endif // TIMEA } tmp = tmp->next[k]; i++; } #ifdef TIMEA s5 = clock(); #endif // TIMEA tmp->nCount++;//nCount 在单词的最后一个字母上+1，记录该单词出现的次数 #ifdef TIMEA e5 = clock(); ms5 += (double)(e5 - s5) * 1000 / CLK_TCK; #endif // TIMEA } /* 返回某个单词word在某棵字典树中的出现次数 */ int searchTrie(TrieNode *root, char *str) { if (root == NULL) return 0; TrieNode *tmp = root; int i = 0, k; while (str[i])//'\0'的ASCII码为0 { k = str[i] - 'a'; if (tmp->next[k]) tmp = tmp->next[k]; else return 0; i++; } return tmp->nCount; } void deleteTrie(TrieNodeptr root, char *str) { int i, index; TrieNodeptr temp = root; for (i = 0; i < strlen(str); i++) { index = str[i] - 'a'; if (temp->next[index] == NULL) return; temp = temp->next[index]; } temp->nCount--;//？为什么是--，不是=0 return; } void swapchar(char *s1, char *s2) { char *temp; temp = (char *)malloc(sizeof(char) * 50); strcpy(temp, s1); strcpy(s1, s2); strcpy(s2, temp); free(temp); } //把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中 void TraverseTrie1(TrieNodeptr root) { int i; static char word[40] = { 0 };//全部置为'\0' static int j = 0;//因为是递归，所以这里设为静态 if (root == NULL) return; //for循环里面递归,先j++,后j-- ————>回溯 for (i = 0; i < 26; i++) { if (root->next[i] == NULL) continue; word[j++] = i + 'a';//这里j++ if (root->next[i]->cnt1 > 0)//如果它已经构成了一个单词 { word[j] = '\0'; strcpy(words1[sum1].word, word);//把这个单词写入words1[]中 words1[sum1++].num = root->next[i]->cnt1;//words1[]里的这个单词的数量也更新 } if (root->next[i]->cnt2 > 0) { word[j] = '\0'; strcpy(words2[sum2].word, word); words2[sum2++].num = root->next[i]->cnt2; } TraverseTrie1(root->next[i]); j--;//撤销操作 } } void TraverseTrie2(TrieNodeptr root) { int i; static char word[40] = { 0 }; static int j = 0; if (root == NULL) return; for (i = 0; i < 26; i++) { if (root->next[i] == NULL) continue; word[j++] = i + 'a'; if (root->next[i]->nCount > 0) { word[j] = '\0'; strcpy(words2[sum2].word, word); words2[sum2++].num = root->next[i]->nCount; } TraverseTrie2(root->next[i]); j--; } } //用于检查字典树的遍历函数 void TraverseTrie(TrieNodeptr root) { int i; static char word[40] = { 0 }; static int j = 0; if (root == NULL) return; for (i = 0; i < 26; i++) { if (root->next[i] == NULL) continue; word[j++] = i + 'a'; if (root->next[i]->nCount > 0) { word[j] = '\0'; printf("%s %d\n", word, root->next[i]->nCount); } TraverseTrie(root->next[i]); j--; } } void countSort1(int exp) { int i, buckets[10] = { 0 }; for (i = 0; i < sum1; i++) { buckets[(words1[i].num / exp) % 10]++; }//每个桶的的值 swapnum(buckets[0], buckets[9]); swapnum(buckets[1], buckets[8]); swapnum(buckets[2], buckets[7]); swapnum(buckets[3], buckets[6]); swapnum(buckets[4], buckets[5]); for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序 for (i = sum1 - 1; i >= 0; i--) { switchstru(&output1[buckets[9 - (words1[i].num / exp) % 10] - 1], &words1[i]); buckets[9 - (words1[i].num / exp) % 10]--; } //检查 //for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num); for (i = 0; i < sum1; i++) { switchstru(&words1[i], &output1[i]); } } void countSort2(int exp) { int i, buckets[10] = { 0 }; for (i = 0; i < sum2; i++) { buckets[(words2[i].num / exp) % 10]++; }//每个桶的的值 swapnum(buckets[0], buckets[9]); swapnum(buckets[1], buckets[8]); swapnum(buckets[2], buckets[7]); swapnum(buckets[3], buckets[6]); swapnum(buckets[4], buckets[5]); for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序 for (i = sum2 - 1; i >= 0; i--) { switchstru(&output2[buckets[9 - (words2[i].num / exp) % 10] - 1], &words2[i]); buckets[9 - (words2[i].num / exp) % 10]--; } //检查 //for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num); for (i = 0; i < sum2; i++) { switchstru(&words2[i], &output2[i]); } } void radixSort1() { int exp; int max = 50000; for (exp = 1; max / exp > 0; exp *= 10) countSort1(exp); } void radixSort2() { int exp; int max = 50000; for (exp = 1; max / exp > 0; exp *= 10) countSort2(exp); } void switchstru(Nodeptr a, Nodeptr b) { *a = *b; }

这是我研究生阶段导师布置的第一个任务，所以写的比较认真 😃。其实当时老师给了源代码，她的本意让我在字典树的基础上能否用ac自动机的方法做优化，我思忖了一番觉得不太行，于是认认真真对源码进行了解读。 ↩︎

Processed: 0.011, SQL: 9

如何用字典树来判断两篇文章的相似性

目录

问题描述

解题思路

源代码