Cod sursa(job #1744163)

Utilizator mouse_wirelessMouse Wireless mouse_wireless Data 19 august 2016 13:38:48
Problema Aho-Corasick Scor 95
Compilator cpp Status done
Runda Arhiva educationala Marime 6.4 kb
#include <cstdio>
#include <cstring>
#include <cstdlib>

typedef unsigned char BYTE;
typedef unsigned int DWORD;
typedef bool BOOL;
#define FALSE false
#define TRUE true
#define CHECK(expr) if (!expr) return FALSE;
#define CHECKRET(expr, retval) if (!expr) return retval;

void *fakeMalloc(int sizeToAllocate, BYTE **fakeMallocBuffer, int *fakeMallocSizeRemaining) {
	CHECKRET((*fakeMallocSizeRemaining) - sizeToAllocate >= 0, NULL);
	*fakeMallocSizeRemaining -= sizeToAllocate;
	void *returnValue = (void*)(*fakeMallocBuffer);
	*fakeMallocBuffer += sizeToAllocate;
	return returnValue;
}

#define AhoCorasick_AlphabetSize 26

struct AhoCorasick_Node {
	DWORD finishedWord;
	DWORD numberOfMatches;
	AhoCorasick_Node *next[AhoCorasick_AlphabetSize], *fail;
};

inline BYTE AhoCorasick_ByteConversion(BYTE b) {
	return b - 'a';
}

inline void printIdentation(DWORD identationLevel) {
	for (DWORD i = 0; i < identationLevel; ++i)
		printf("%s", " |");
}

void printTrie(DWORD identationLevel, AhoCorasick_Node *node) {
	printf("%X", node);
	if (node->finishedWord != (DWORD)(-1))
		printf(" %u", node->finishedWord);
	putchar('\n');
	for (DWORD i = 0; i < AhoCorasick_AlphabetSize; ++i)
	if (node->next[i]) {
		printIdentation(identationLevel);
		printf(" |%c ", i + 'a');
		printTrie(identationLevel + 1, node->next[i]);
	}
}

AhoCorasick_Node *AhoCorasick_NodeConstructor(BYTE **fakeMallocBuffer, int *fakeMallocSizeRemaining) {
	AhoCorasick_Node *nod = (AhoCorasick_Node*)fakeMalloc(sizeof(AhoCorasick_Node), fakeMallocBuffer, fakeMallocSizeRemaining);
	CHECKRET(nod, NULL);
	nod->finishedWord = (DWORD)(-1);
	nod->numberOfMatches = 0;
	memset(nod->next, 0, sizeof(nod->next));
	nod->fail = NULL;
	return nod;
}

BYTE AhoCorasick_Insert(AhoCorasick_Node *nod, BYTE *word, DWORD sizeRemaining, DWORD wordID, BYTE **fakeMallocBuffer, int *fakeMallocSizeRemaining) {
	BYTE next = AhoCorasick_ByteConversion(*word);
	if (nod->next[next] == NULL) {
		nod->next[next] = AhoCorasick_NodeConstructor(fakeMallocBuffer, fakeMallocSizeRemaining);
		CHECKRET(nod->next[next] != NULL, (BYTE)(-1));
	}
	return next;
}

void AhoCorasick_SetFailsBFS(AhoCorasick_Node *nod, AhoCorasick_Node **nodeQueue, int *queueSize) {
	nod->fail = nod;
	int qindex = 0; *queueSize = 1;
	for (nodeQueue[0] = nod; qindex < *queueSize; ++qindex) {
		AhoCorasick_Node *front = nodeQueue[qindex];
		for (int i = 0; i < AhoCorasick_AlphabetSize; ++i) {
			AhoCorasick_Node *curNode = front->next[i];
			if (curNode == NULL) continue;
			AhoCorasick_Node *dolar = front->fail;
			for (; dolar != nod && dolar->next[i] == NULL; dolar = dolar->fail)
				;
			if (dolar->next[i] != NULL && dolar->next[i] != front->next[i]) dolar = dolar->next[i];
			curNode->fail = dolar;
			nodeQueue[(*queueSize)++] = curNode;
		}
	}

	nod->fail = NULL;
}

#define AmmountToMalloc 5384000
#define MaxBigwordLen 1000010
#define SmallWordLen 10010
#define MaxNumberOfWords 110

BYTE buf[MaxBigwordLen];
BYTE words[MaxNumberOfWords][SmallWordLen];

#define MAX(i, j) (((i) > (j)) ? (i) : (j))

BOOL AhoCorasick_Precompute(
	BYTE words[][SmallWordLen],
	DWORD *wordSizes,
	DWORD numWords,
	BYTE *fakeMallocBuffer,
	int fakeMallocSizeRemaining,
	//OUT parameters:
	AhoCorasick_Node **root,
	AhoCorasick_Node ***nodeQueue,
	int *queueSize) {

	DWORD totalLen = 0;
	for (DWORD i = 0; i < numWords; i++)
		totalLen += wordSizes[i];
	*nodeQueue = (AhoCorasick_Node**)fakeMalloc(sizeof(AhoCorasick_Node*)* (totalLen + 2), &fakeMallocBuffer, &fakeMallocSizeRemaining);
	CHECK(*nodeQueue);

	*root = AhoCorasick_NodeConstructor(&fakeMallocBuffer, &fakeMallocSizeRemaining);

	CHECK(*root);
	for (DWORD i = 0; i < numWords; ++i) {
		AhoCorasick_Node *curNode = *root;
		BYTE next = 0;
		BYTE *positionInWord = words[i];
		DWORD sizeRemaining = wordSizes[i];
		while (sizeRemaining) {
			next = AhoCorasick_Insert(curNode, positionInWord, wordSizes[i], i, &fakeMallocBuffer, &fakeMallocSizeRemaining);
			curNode = curNode->next[next];
			++positionInWord;
			--sizeRemaining;
		}
		curNode->finishedWord = i;
	}
	AhoCorasick_SetFailsBFS(*root, *nodeQueue, queueSize);

	return TRUE;
}

BOOL AhoCorasick_Matching(
	BYTE *buffer,
	DWORD buffSz,
	AhoCorasick_Node *root,
	AhoCorasick_Node **nodeQueue,
	int queueSize,
	DWORD *numberOfMatches) {

	AhoCorasick_Node *curNode = root;

	for (DWORD i = 0; i < buffSz; i++) {
		BYTE curByte = AhoCorasick_ByteConversion(buffer[i]);
		for (; curNode->next[curByte] == NULL && curNode != root; curNode = curNode->fail)
			;
		if (curNode->next[curByte] != NULL) curNode = curNode->next[curByte];
		++(curNode->numberOfMatches);
	}

	for (int qindex = queueSize - 1; qindex >= 0; --qindex) {
		AhoCorasick_Node *front = nodeQueue[qindex];
		CHECK(front);
		if (front->fail != NULL)
			front->fail->numberOfMatches += front->numberOfMatches;

		DWORD finWord = front->finishedWord;
		if (finWord != (DWORD)(-1))
			numberOfMatches[finWord] = front->numberOfMatches;
	}
	return TRUE;
}

BOOL DoEverything() {
	scanf("%s", buf);
	DWORD numberOfWords;
	scanf("%u", &numberOfWords);
	DWORD wordSizes[MaxNumberOfWords];
	for (DWORD i = 0; i < numberOfWords; ++i) {
		scanf("%s", words[i]);
		wordSizes[i] = strlen((char*)words[i]);
	}

	AhoCorasick_Node *root;
	AhoCorasick_Node **nodeQueue;
	int queueSize;
	BYTE *fakeMallocBuffer = (BYTE*)malloc(AmmountToMalloc);
	DWORD numberOfMatches[MaxNumberOfWords] = { 0 };
	CHECK(AhoCorasick_Precompute(words, wordSizes, numberOfWords, fakeMallocBuffer, AmmountToMalloc, &root, &nodeQueue, &queueSize));

#ifndef INFOARENA
	//printTrie(0, root);
#endif

	if (!AhoCorasick_Matching(buf, strlen((char*)buf), root, nodeQueue, queueSize, numberOfMatches))
		return FALSE;

	BYTE primitiveHash[SmallWordLen >> 2] = { 0 };
	for (DWORD i = 0; i < numberOfWords; ++i) {
		if (!primitiveHash[wordSizes[i]]) {
			primitiveHash[wordSizes[i]] = 1;
			continue;
		}
		for (DWORD j = i - 1; j != (DWORD)(-1); --j)
		if (!strcmp((char*)words[i], (char*)words[j]))
			numberOfMatches[i] = numberOfMatches[j] = MAX(numberOfMatches[i], numberOfMatches[j]);
	}

	for (DWORD i = 0; i < numberOfWords; i++)
		printf("%u\n", numberOfMatches[i]);

	return TRUE;
}

int main() {
#ifdef INFOARENA
	freopen("ahocorasick.in", "r", stdin);
	freopen("ahocorasick.out", "w", stdout);
#else
	freopen("fis.in", "r", stdin);
#endif
	DoEverything();
	return 0;
}