Search engine
Simple search engine to find words in documents.
#include <stdlib.h> #include <string.h> #include <assert.h> #include <stdio.h> #include <stdbool.h> #define strdup _strdup struct document { const char* url; const char* title; }; struct documents { struct document** data; int size; int capacity; }; void document_delete(struct document* p) { free(p); } int documents_reserve(struct documents* p, int n) { if (n > p->capacity) { void* pnew = realloc(p->data, n * sizeof(p->data[0])); if (pnew) { p->data = pnew; p->capacity = n; } else return 0; /*out of mem*/ } return p->capacity; } int documents_push(struct documents* p, struct document* pitem) { if (p->size + 1 > p->capacity) { int n = p->capacity * 2; if (n == 0) n = 1; if (documents_reserve(p, n) == 0) { document_delete(pitem); /*design choice*/ return 0; } } p->data[p->size] = pitem; p->size++; return p->size - 1; } void documents_destroy(struct documents* p) { for (int i = 0; i < p->size; i++) document_delete(p->data[i]); free(p->data); } #define BITSET_NUM_BITS_PER_WORD (CHAR_BIT * (int)sizeof(unsigned long)) struct bitset { unsigned long* bits; int size; }; inline bool bitset_getbit(struct bitset* bitset, int index) { assert(index / BITSET_NUM_BITS_PER_WORD < bitset->size); return (bitset->bits[index / BITSET_NUM_BITS_PER_WORD] & (1ul << index % BITSET_NUM_BITS_PER_WORD)) != 0; } int bitset_resize(struct bitset* p, int newSize); struct search_index_entry { struct search_index_entry* next; unsigned int hash; char* key; struct bitset bitset; }; struct search_index { struct search_index_entry** table; unsigned int capacity; int size; }; int bitset_resize(struct bitset* p, int newSize) { if (newSize > p->size) { int oldsize = p->size; unsigned long* pnew = realloc(p->bits, newSize * sizeof(p->bits[0])); if (pnew) { memset(pnew + oldsize, 0, sizeof(p->bits[0]) * (newSize - oldsize)); p->bits = pnew; p->size = newSize; } else { return 0; /*out of mem*/ } } return p->size; } void bitset_setbit(struct bitset* bitset, int index, bool b) { int wordindex = index / BITSET_NUM_BITS_PER_WORD; if ((wordindex + 1) > bitset->size) { if (bitset_resize(bitset, wordindex + 1) == 0) return; } unsigned long bit = 1ul << index % BITSET_NUM_BITS_PER_WORD; if (b) bitset->bits[wordindex] |= bit; else bitset->bits[wordindex] &= ~bit; } void bitset_destroy(struct bitset* p) { free(p->bits); } void search_index_remove_all(struct search_index* pMap) { if (pMap->table != NULL) { for (unsigned int i = 0; i < pMap->capacity; i++) { struct search_index_entry* pentry = pMap->table[i]; while (pentry != NULL) { struct search_index_entry* pentryCurrent = pentry; bitset_destroy(&pentryCurrent->bitset); pentry = pentry->next; free(pentryCurrent->key); free(pentryCurrent); } } free(pMap->table); pMap->table = NULL; pMap->size = 0; } } void search_index_destroy(struct search_index* pMap) { search_index_remove_all(pMap); } unsigned int stringhash(const char* key) { unsigned int uHashVal = 2166136261U; unsigned int uFirst = 0; unsigned int uLast = (unsigned int)strlen(key); unsigned int uStride = 1 + uLast / 10; for (; uFirst < uLast; uFirst += uStride) { uHashVal = 16777619U * uHashVal ^ (unsigned int)key[uFirst]; } return (uHashVal); } struct bitset* search_index_find(struct search_index* pMap, const char* key) { struct bitset* p = NULL; unsigned int hash = stringhash(key); int index = hash % pMap->capacity; struct search_index_entry* pentry = pMap->table[index]; for (; pentry != NULL; pentry = pentry->next) { if (pentry->hash == hash && strcmp(pentry->key, key) == 0) { p = &pentry->bitset; break; } } return p; } int search_index_set(struct search_index* pMap, const char* key, int docindex) { int result = 0; if (pMap->table == NULL) { if (pMap->capacity < 1) { pMap->capacity = 1000; } pMap->table = calloc(pMap->capacity, sizeof(pMap->table[0])); } if (pMap->table != NULL) { unsigned int hash = stringhash(key); int index = hash % pMap->capacity; struct search_index_entry* pentry = pMap->table[index]; for (; pentry != NULL; pentry = pentry->next) { if (pentry->hash == hash && strcmp(pentry->key, key) == 0) { break; } } if (pentry == NULL) { pentry = calloc(1, sizeof(*pentry)); pentry->hash = hash; pentry->key = strdup(key); pentry->next = pMap->table[index]; pMap->table[index] = pentry; pMap->size++; result = 0; bitset_setbit(&pentry->bitset, docindex, 1); } else { result = 1; bitset_setbit(&pentry->bitset, docindex, 1); } } return result; } void find_docs(int nwords, const char* words[/*nwords*/], struct search_index* search_index, struct bitset* result) { bool bFirst = true; for (int i = 0; i < nwords; i++) { struct bitset* bitset = search_index_find(search_index, words[i]); if (bitset) { if (bFirst) { for (int j = 0; j < bitset->size; j++) { if ((j + 1) > result->size) { bitset_resize(result, j + 1); } result->bits[j] = bitset->bits[j]; } bFirst = false; } else { for (int j = 0; j < bitset->size; j++) { if (j > result->size) { bitset_resize(result, j + 1); } result->bits[j] &= bitset->bits[j]; } } } } } void print_results(struct bitset* bitset, struct documents* documents) { for (int i = 0; i < bitset->size * BITSET_NUM_BITS_PER_WORD; i++) { if (bitset_getbit(bitset, i)) { printf("%s\n", documents->data[i]->title); } } } int main(void) { struct documents documents = { 0 }; struct search_index search_index = { 0 }; struct document* pdoc = calloc(1, sizeof * pdoc); if (pdoc) { pdoc->url = strdup("doc1"); pdoc->title = strdup("document 1"); int docindex1 = documents_push(&documents, pdoc); search_index_set(&search_index, "1", docindex1); search_index_set(&search_index, "document", docindex1); struct document* pdoc2 = calloc(1, sizeof * pdoc2); if (pdoc2) { pdoc2->url = strdup("doc2"); pdoc2->title = strdup("document 2"); int docindex2 = documents_push(&documents, pdoc2); search_index_set(&search_index, "2", docindex2); search_index_set(&search_index, "document", docindex2); struct bitset bitset = { 0 }; find_docs(1, (const char* []) { "1" }, &search_index, &bitset); print_results(&bitset, &documents); bitset_destroy(&bitset); printf("---\n"); struct bitset bitset2 = { 0 }; find_docs(1, (const char* []) {"document"}, &search_index, &bitset2); print_results(&bitset2, &documents); bitset_destroy(&bitset2); } } documents_destroy(&documents); search_index_destroy(&search_index); }