目录
问题描述解题思路源代码
问题描述
简单讲就是,给你两篇文章,问你这两篇文章是不是同一个人写的。 具体内容如下图:
解题思路
把dictionary文本里的所有单词读入dic[]中,输出读入时间111 把stop words文本里的所有单词读入stp[]中,输出读入时间222 把artical1文本里的所有单词读入art1[]中,输出读入时间333 把artical2文本里的所有单词读入art2[]中,输出读入时间444
利用dic[],建立关于dictionary的字典树rootdic,(分三步)输出建立时间55 m1:记录跳过所有dic[]中的换行符的时间,并输出 m2:记录把所有dic[]中的字母依次写到word[]中的时间,并输出 m3:记录把所有word[]中的单词依次写入字典树rootdic中的时间,并输出 利用stp[],删除dictionary的字典树rootdic中对应的单词
构建第一篇文章的字典树rootwords1,并把其中所有的单词写入结构体数组words1[]中。输出删除、构建、写入的时间666 【在构建字典树的同时,记录单词出现的次数】 构建第二篇文章的字典树rootwords2,并把其中所有的单词写入结构体数组words2[]中。输出构建、写入的时间777
给words1中所有的单词按出现频率排序,并输出排序时间888 给words2中所有的单词按出现频率排序,并输出排序时间999
构建article1的前n个高频词的字典树root1,并输出构建时间aaa 构建article2的前n个高频词的字典树root2,并输出构建时间bbb
判断article1的前n个高频单词在article2的字典树中是否出现,并输出判断时间ccc 判断article2的前n个高频单词在article1的字典树中是否出现,并输出判断时间ddd
计算两篇文章的pro,并输出计算时间eee article1的前n个高频词写入result文本的时间,并输出写入时间fff article2的前n个高频词写入result文本的时间,并输出写入时间ggg
源代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX 26
#define swapnum(a,b) {int temp=a;a=b;b=temp;}
#define TIMEA
typedef struct TrieNode
{
int nCount
;
struct TrieNode
*next
[MAX
];
}TrieNode
, *TrieNodeptr
;
typedef struct Node
{
int flag
;
int num
;
char word
[50];
}Node
, *Nodeptr
;
Node words1
[400000];
Node words2
[400000];
TrieNodeptr
createTrieNode();
void insertTrie(TrieNodeptr pRoot
, char *str
);
int searchTrie(TrieNodeptr root
, char *str
);
void deleteTrie(TrieNodeptr root
, char *str
);
int getword1();
int getword2();
void countSort1(int exp
);
void countSort2(int exp
);
void radixSort1();
void radixSort2();
void swapchar(char *s1
, char *s2
);
void TraverseTrie1(TrieNodeptr root
);
void TraverseTrie2(TrieNodeptr root
);
void TraverseTrie(TrieNodeptr root
);
void switchstru(Nodeptr a
, Nodeptr b
);
int ReadFile(char *path
, char data
[]);
int sum1
;
int sum2
;
Node output1
[400000];
Node output2
[400000];
char word
[50];
FILE
*indic
, *instp
, *in1
, *in2
;
FILE
*out
;
char dic
[7000000];
char stp
[7000000];
char art1
[7000000];
char art2
[7000000];
clock_t s4
, e4
, s5
, e5
;
float ms4
= 0.0;
float ms5
= 0.0;
int main()
{
int n
;
int i
= 0;
int j
= 0;
int flag
;
#ifdef TIMEA
clock_t start
, end
;
start
= clock();
end
= clock();
printf("time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
#endif
indic
= fopen("dictionary.txt", "rb");
fseek(indic
, 0, SEEK_END);
int lengthdic
= ftell(indic
);
fseek(indic
, 0, SEEK_SET);
fread(dic
, lengthdic
, 1, indic
);
dic
[lengthdic
++] = 0;
#ifdef TIMEA
end
= clock();
printf("111 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
instp
= fopen("stopwords.txt", "rb");
fseek(instp
, 0, SEEK_END);
int lengthstp
= ftell(instp
);
fseek(instp
, 0, SEEK_SET);
fread(stp
, lengthstp
, 1, instp
);
stp
[lengthstp
++] = 0;
#ifdef TIMEA
end
= clock();
printf("222 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
in1
= fopen("article1.txt", "rb");
fseek(in1
, 0, SEEK_END);
int length1
= ftell(in1
);
fseek(in1
, 0, SEEK_SET);
fread(art1
, length1
, 1, in1
);
art1
[length1
++] = 0;
#ifdef TIMEA
end
= clock();
printf("333 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
in2
= fopen("article2.txt", "rb");
fseek(in2
, 0, SEEK_END);
int length2
= ftell(in2
);
fseek(in2
, 0, SEEK_SET);
fread(art2
, length2
, 1, in2
);
art2
[length2
++] = 0;
#ifdef TIMEA
end
= clock();
printf("444 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
clock_t start1
, end1
, start2
, end2
, start3
, end3
;
float ms1
, ms2
, ms3
;
ms1
= ms2
= ms3
= 0.0;
#endif
out
= fopen("results.txt", "w");
TrieNodeptr rootdic
= createTrieNode();
for (i
= 0; i
< lengthdic
; i
++)
{
#ifdef TIMEA
start1
= clock();
#endif
flag
= 0;
while ((dic
[i
] | 32) > 'z' || 'a' > (dic
[i
] | 32))
{
if (dic
[i
] == '\0')
{
flag
= 1;
break;
}
i
++;
}
#ifdef TIMEA
end1
= clock();
ms1
+= (double)(end1
- start1
) * 1000 / CLK_TCK
;
#endif
if (flag
== 1) break;
j
= 0;
#ifdef TIMEA
start2
= clock();
#endif
while ('a' <= (dic
[i
] | 32) && (dic
[i
] | 32) <= 'z')
{
word
[j
++] = dic
[i
] | 32;
i
++;
}
word
[j
] = '\0';
#ifdef TIMEA
end2
= clock();
ms2
+= (double)(end2
- start2
) * 1000 / CLK_TCK
;
#endif
#ifdef TIMEA
start3
= clock();
#endif
insertTrie(rootdic
, word
);
#ifdef TIMEA
end3
= clock();
ms3
+= (double)(end3
- start3
) * 1000 / CLK_TCK
;
#endif
}
#ifdef TIMEA
printf("444---inner time=%f,%f,%f,%f,%f\n", ms1
, ms2
, ms3
, ms4
, ms5
);
#endif
#ifdef TIMEA
end
= clock();
printf("555 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
for (i
= 0; i
< lengthstp
; i
++)
{
flag
= 0;
while ((stp
[i
] | 32) > 'z' || 'a' > (stp
[i
] | 32))
{
if (stp
[i
] == '\0')
{
flag
= 1;
break;
}
i
++;
}
if (flag
== 1) break;
j
= 0;
while ('a' <= (stp
[i
] | 32) && (stp
[i
] | 32) <= 'z')
{
word
[j
++] = stp
[i
] | 32;
i
++;
}
word
[j
] = '\0';
deleteTrie(rootdic
, word
);
}
TrieNodeptr rootwords1
= createTrieNode();
TrieNodeptr rootwords2
= createTrieNode();
j
= -1;
for (i
= 0; i
<= length1
; i
++)
{
if (art1
[i
] >= 'a'&&art1
[i
] <= 'z') j
++;
else if (art1
[i
] >= 'A'&&art1
[i
] <= 'Z')
{
art1
[i
] |= 32;
j
++;
}
else if (j
!= -1)
{
j
++;
art1
[i
] = 0;
strcpy(word
, art1
+ i
- j
);
if (searchTrie(rootdic
, word
))
{
insertTrie(rootwords1
, word
);
}
j
= -1;
}
}
TraverseTrie1(rootwords1
);
#ifdef TIMEA
end
= clock();
printf("666 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
for (i
= 0; i
<= length2
; i
++)
{
if (art2
[i
] >= 'a'&&art2
[i
] <= 'z') j
++;
else if (art2
[i
] >= 'A'&&art2
[i
] <= 'Z')
{
art2
[i
] |= 32;
j
++;
}
else if (j
!= -1)
{
j
++;
art2
[i
] = 0;
strcpy(word
, art2
+ i
- j
);
if (searchTrie(rootdic
, word
))
{
insertTrie(rootwords2
, word
);
}
j
= -1;
}
}
TraverseTrie2(rootwords2
);
#ifdef TIMEA
end
= clock();
printf("777 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
radixSort1();
#ifdef TIMEA
end
= clock();
printf("888 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
radixSort2();
#ifdef TIMEA
end
= clock();
printf("999 time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
scanf("%d", &n
);
#ifdef TIMEA
start
= clock();
#endif
TrieNodeptr root1
= createTrieNode();
for (i
= 0; i
< n
; i
++)
{
insertTrie(root1
, words1
[i
].word
);
}
#ifdef TIMEA
end
= clock();
printf("aaa time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
TrieNodeptr root2
= createTrieNode();
for (i
= 0; i
< n
; i
++)
{
insertTrie(root2
, words2
[i
].word
);
}
#ifdef TIMEA
end
= clock();
printf("bbb time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
for (i
= 0; i
< n
; i
++)
{
if (searchTrie(root2
, words1
[i
].word
))
{
words1
[i
].flag
= 1;
}
}
#ifdef TIMEA
end
= clock();
printf("ccc time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
for (i
= 0; i
< n
; i
++)
{
if (searchTrie(root1
, words2
[i
].word
))
{
words2
[i
].flag
= 1;
}
}
#ifdef TIMEA
end
= clock();
printf("ddd time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
int freqm1
= 0;
int freqn1
= 0;
int freqm2
= 0;
int freqn2
= 0;
for (i
= 0; i
< n
; i
++)
{
if (words1
[i
].flag
== 1) freqm1
+= words1
[i
].num
;
if (words2
[i
].flag
== 1) freqm2
+= words2
[i
].num
;
freqn1
+= words1
[i
].num
;
freqn2
+= words2
[i
].num
;
}
double pro1
= (1.0)*freqm1
/ freqn1
;
double pro2
= (1.0)*freqm2
/ freqn2
;
double sim
= (pro1
> pro2
) ? (pro2
/ pro1
) : (pro1
/ pro2
);
printf("%.5lf", sim
);
fprintf(out
, "%.5lf\n\n", sim
);
#ifdef TIMEA
end
= clock();
printf("\neee time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
fputc('\n', out
);
for (i
= 0; i
< n
; i
++)
{
fprintf(out
, "%s %d\n", words1
[i
].word
, words1
[i
].num
);
}
#ifdef TIMEA
end
= clock();
printf("fff time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
for (i
= 0; i
< n
; i
++)
{
fprintf(out
, "%s %d\n", words2
[i
].word
, words2
[i
].num
);
}
#ifdef TIMEA
end
= clock();
printf("ggg time=%f\n", (double)(end
- start
) * 1000 / CLK_TCK
);
start
= clock();
#endif
fclose(in1
);
fclose(in2
);
fclose(out
);
}
TrieNodeptr
createTrieNode()
{
TrieNodeptr tmp
= (TrieNodeptr
)malloc(sizeof(TrieNode
));
tmp
->nCount
= 0;
int i
;
for (i
= 0; i
< MAX
; i
++)
{
tmp
->next
[i
] = NULL;
}
return tmp
;
}
void insertTrie(TrieNodeptr pRoot
, char *str
)
{
TrieNodeptr tmp
= pRoot
;
int i
= 0, k
;
while (str
[i
] && str
[i
] != '\n'&&str
[i
] != '\r')
{
k
= str
[i
] - 'a';
if (tmp
->next
[k
] == NULL)
{
#ifdef TIMEA
s4
= clock();
#endif
tmp
->next
[k
] = createTrieNode();
#ifdef TIMEA
e4
= clock();
ms4
+= (double)(e4
- s4
) * 1000 / CLK_TCK
;
#endif
}
tmp
= tmp
->next
[k
];
i
++;
}
#ifdef TIMEA
s5
= clock();
#endif
tmp
->nCount
++;
#ifdef TIMEA
e5
= clock();
ms5
+= (double)(e5
- s5
) * 1000 / CLK_TCK
;
#endif
}
int searchTrie(TrieNode
*root
, char *str
)
{
if (root
== NULL) return 0;
TrieNode
*tmp
= root
;
int i
= 0, k
;
while (str
[i
])
{
k
= str
[i
] - 'a';
if (tmp
->next
[k
]) tmp
= tmp
->next
[k
];
else return 0;
i
++;
}
return tmp
->nCount
;
}
void deleteTrie(TrieNodeptr root
, char *str
)
{
int i
, index
;
TrieNodeptr temp
= root
;
for (i
= 0; i
< strlen(str
); i
++)
{
index
= str
[i
] - 'a';
if (temp
->next
[index
] == NULL) return;
temp
= temp
->next
[index
];
}
temp
->nCount
--;
return;
}
void swapchar(char *s1
, char *s2
)
{
char *temp
;
temp
= (char *)malloc(sizeof(char) * 50);
strcpy(temp
, s1
);
strcpy(s1
, s2
);
strcpy(s2
, temp
);
free(temp
);
}
void TraverseTrie1(TrieNodeptr root
)
{
int i
;
static char word
[40] = { 0 };
static int j
= 0;
if (root
== NULL) return;
for (i
= 0; i
< 26; i
++)
{
if (root
->next
[i
] == NULL) continue;
word
[j
++] = i
+ 'a';
if (root
->next
[i
]->cnt1
> 0)
{
word
[j
] = '\0';
strcpy(words1
[sum1
].word
, word
);
words1
[sum1
++].num
= root
->next
[i
]->cnt1
;
}
if (root
->next
[i
]->cnt2
> 0)
{
word
[j
] = '\0';
strcpy(words2
[sum2
].word
, word
);
words2
[sum2
++].num
= root
->next
[i
]->cnt2
;
}
TraverseTrie1(root
->next
[i
]);
j
--;
}
}
void TraverseTrie2(TrieNodeptr root
)
{
int i
;
static char word
[40] = { 0 };
static int j
= 0;
if (root
== NULL) return;
for (i
= 0; i
< 26; i
++)
{
if (root
->next
[i
] == NULL) continue;
word
[j
++] = i
+ 'a';
if (root
->next
[i
]->nCount
> 0)
{
word
[j
] = '\0';
strcpy(words2
[sum2
].word
, word
);
words2
[sum2
++].num
= root
->next
[i
]->nCount
;
}
TraverseTrie2(root
->next
[i
]);
j
--;
}
}
void TraverseTrie(TrieNodeptr root
)
{
int i
;
static char word
[40] = { 0 };
static int j
= 0;
if (root
== NULL) return;
for (i
= 0; i
< 26; i
++)
{
if (root
->next
[i
] == NULL) continue;
word
[j
++] = i
+ 'a';
if (root
->next
[i
]->nCount
> 0)
{
word
[j
] = '\0';
printf("%s %d\n", word
, root
->next
[i
]->nCount
);
}
TraverseTrie(root
->next
[i
]);
j
--;
}
}
void countSort1(int exp
)
{
int i
, buckets
[10] = { 0 };
for (i
= 0; i
< sum1
; i
++)
{
buckets
[(words1
[i
].num
/ exp
) % 10]++;
}
swapnum(buckets
[0], buckets
[9]);
swapnum(buckets
[1], buckets
[8]);
swapnum(buckets
[2], buckets
[7]);
swapnum(buckets
[3], buckets
[6]);
swapnum(buckets
[4], buckets
[5]);
for (i
= 1; i
< 10; i
++) buckets
[i
] += buckets
[i
- 1];
for (i
= sum1
- 1; i
>= 0; i
--)
{
switchstru(&output1
[buckets
[9 - (words1
[i
].num
/ exp
) % 10] - 1], &words1
[i
]);
buckets
[9 - (words1
[i
].num
/ exp
) % 10]--;
}
for (i
= 0; i
< sum1
; i
++)
{
switchstru(&words1
[i
], &output1
[i
]);
}
}
void countSort2(int exp
)
{
int i
, buckets
[10] = { 0 };
for (i
= 0; i
< sum2
; i
++)
{
buckets
[(words2
[i
].num
/ exp
) % 10]++;
}
swapnum(buckets
[0], buckets
[9]);
swapnum(buckets
[1], buckets
[8]);
swapnum(buckets
[2], buckets
[7]);
swapnum(buckets
[3], buckets
[6]);
swapnum(buckets
[4], buckets
[5]);
for (i
= 1; i
< 10; i
++) buckets
[i
] += buckets
[i
- 1];
for (i
= sum2
- 1; i
>= 0; i
--)
{
switchstru(&output2
[buckets
[9 - (words2
[i
].num
/ exp
) % 10] - 1], &words2
[i
]);
buckets
[9 - (words2
[i
].num
/ exp
) % 10]--;
}
for (i
= 0; i
< sum2
; i
++)
{
switchstru(&words2
[i
], &output2
[i
]);
}
}
void radixSort1()
{
int exp
;
int max
= 50000;
for (exp
= 1; max
/ exp
> 0; exp
*= 10) countSort1(exp
);
}
void radixSort2()
{
int exp
;
int max
= 50000;
for (exp
= 1; max
/ exp
> 0; exp
*= 10) countSort2(exp
);
}
void switchstru(Nodeptr a
, Nodeptr b
)
{
*a
= *b
;
}
这是我研究生阶段导师布置的第一个任务,所以写的比较认真 😃。其实当时老师给了源代码,她的本意让我在字典树的基础上能否用ac自动机的方法做优化,我思忖了一番觉得不太行,于是认认真真对源码进行了解读。 ↩︎