Pagini recente » Cod sursa (job #3303899) | Cod sursa (job #3319372) | Cod sursa (job #3311141) | Cod sursa (job #3319356) | Cod sursa (job #3354304)
#include <stdio.h>
#include <string>
#include <queue>
#include <vector>
#include <unordered_map>
using namespace std;
#define TEXT_LEN 1000005
#define WORD_LEN 10005
#define ALPHABET_SIZE 26
#define NUM_WORDS 105
char text[TEXT_LEN];
char word[WORD_LEN];
int word_freq[NUM_WORDS];
struct TrieNode {
TrieNode() : children{}, failure(nullptr), output(nullptr), visit_count(0) {}
TrieNode* children[ALPHABET_SIZE];
/**
* Pointer to another node that represents the `longest suffix for current node`,
* equivalent to `the largest prefix in trie that is suffix for current node`.
* This is the node where we can restart the matching from.
* This link (blue link) is used for fast prefix matching when advancing the main text.
*/
TrieNode* failure;
/**
* Pointer to the next node in the dictionary that can be reached by following failure arcs.
* This link is used to reconstruct the solution: the prefixes that match and that are a word in the dictionary.
* This is an optimization to avoid iterating failure links that are not a complete word when reconstructing the solution.
*/
TrieNode *output;
/**
* Ids of all words that end in this node.
*/
vector<int> pattern_ids;
/**
* Number of times this automaton state was reached while scanning the text.
*/
int visit_count;
};
vector<TrieNode*> bfs_order;
void trie_insert(TrieNode *node, const char *str, const int i) {
while (*str) {
const int c = *str - 'a';
if (!node->children[c]) {
node->children[c] = new TrieNode();
}
node = node->children[c];
++str;
}
node->pattern_ids.push_back(i);
}
/**
* For every node sets a failure link to a node representing the longest possible strict suffix of it in the trie.
* For every node sets an output link to the first node in the dictionary that can be reached by following failure links.
*/
void bfs_build_links(TrieNode *root) {
queue<TrieNode*> q;
q.push(root);
root->failure = root;
while (!q.empty()) {
TrieNode *parent = q.front();
q.pop();
bfs_order.push_back(parent);
for (int i = 0; i < ALPHABET_SIZE; ++i) {
TrieNode* child = parent->children[i];
if (!child) {
continue;
}
q.push(child);
// Compute failure node for child
if (parent == root) {
// Nodes on the first level don't have suffixes. Assigning the root as failure link.
child->failure = root;
} else {
const TrieNode *fail = parent->failure;
while (fail != root && !fail->children[i]) {
fail = fail->failure;
}
if (fail->children[i] && fail->children[i] != child) {
child->failure = fail->children[i];
} else {
child->failure = root;
}
}
// Compute the output node for child
if (child->failure->pattern_ids.size() > 0) {
child->output = child->failure;
} else {
child->output = child->failure->output;
}
}
}
}
void aho_corasick_print_matches(TrieNode *root, const char *suffix, unordered_map<int, string>& index_to_word) {
TrieNode *node = root;
int pos = 0;
while (*suffix != '\0') {
const int c = *suffix - 'a';
// Follow failure links and advance the matching.
while (node != root && !node->children[c]) {
node = node->failure;
}
if (node->children[c]) {
node = node->children[c];
}
// Print exact matches
for (int id : node->pattern_ids) {
printf("%s:%d, ", index_to_word[id].c_str(), pos);
word_freq[id]++;
}
// Print suffix matches
TrieNode *out = node->output;
while (out) {
for (int id : out->pattern_ids) {
printf("%s:%d, ", index_to_word[id].c_str(), pos);
word_freq[id]++;
}
out = out->output;
}
suffix++;
pos++;
}
}
/**
* Optimally counts the number of occurrences for each word in dictionary.
*/
void aho_corasick_count_occurrences(TrieNode *root, const char *suffix) {
TrieNode *node = root;
// Scan text and count state visits
while (*suffix) {
int c = *suffix - 'a';
while (node != root && !node->children[c]) {
node = node->failure;
}
if (node->children[c]) {
node = node->children[c];
}
node->visit_count++;
++suffix;
}
// Propagate counts upward through failure links in reverse BFS order
for (int i = bfs_order.size() - 1; i >= 0; --i) {
node = bfs_order[i];
if (node->failure) {
node->failure->visit_count += node->visit_count;
}
for (int id : node->pattern_ids) {
word_freq[id] = node->visit_count;
}
}
}
int main() {
freopen("ahocorasick.in", "r", stdin);
freopen("ahocorasick.out", "w", stdout);
int n; // n <= 100
auto *root = new TrieNode();
unordered_map<int, string> index_to_word;
const bool print_matches = false;
scanf("%s", text);
scanf("%d", &n);
for (int i = 0; i < n; ++i) {
scanf("%s", word);
trie_insert(root, word, i);
if (print_matches) {
index_to_word.insert({i, string(word)});
}
}
bfs_build_links(root);
if (print_matches) {
aho_corasick_print_matches(root, text, index_to_word);
} else {
aho_corasick_count_occurrences(root, text);
for (int i = 0; i < n; ++i) {
printf("%d\n", word_freq[i]);
}
}
return 0;
}