From 0e3ef580b5d265ee59d50d35053e989b8c4277c2 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Wed, 28 Sep 2011 17:41:06 +0000 Subject: Add ZixHash git-svn-id: http://svn.drobilla.net/zix/trunk@39 df6676b4-ccc9-40e5-b5d6-7c4628a128e3 --- src/hash.c | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++ test/hash_test.c | 157 ++++++++++++++++++++++++++++++++++++ test/patree_bench.c | 41 +++++++++- wscript | 2 + zix/common.h | 8 +- zix/hash.h | 75 +++++++++++++++++ 6 files changed, 504 insertions(+), 5 deletions(-) create mode 100644 src/hash.c create mode 100644 test/hash_test.c create mode 100644 zix/hash.h diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 0000000..c267757 --- /dev/null +++ b/src/hash.c @@ -0,0 +1,226 @@ +/* + Copyright 2011 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include +#include +#include + +#include "zix/hash.h" + +/** + Primes, each slightly less than twice its predecessor, and as far away + from powers of two as possible. +*/ +static const unsigned sizes[] = { + 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, + 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, + 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741, 0 +}; + +typedef struct _Entry { + const void* key; ///< Hash key + void* data; ///< Value + struct _Entry* next; ///< Next entry in bucket + unsigned hash; ///< Non-modulo hash value (for cheap rehash) +} Entry; + +struct ZixHashImpl { + ZixHashFunc hash_func; + ZixEqualFunc key_equal_func; + Entry** buckets; + const unsigned* n_buckets; + unsigned count; +}; + +ZixHash* +zix_hash_new(ZixHashFunc hash_func, + ZixEqualFunc key_equal_func) + +{ + ZixHash* hash = (ZixHash*)malloc(sizeof(ZixHash)); + hash->hash_func = hash_func; + hash->key_equal_func = key_equal_func; + hash->count = 0; + hash->n_buckets = &sizes[0]; + hash->buckets = malloc(*hash->n_buckets * sizeof(Entry*)); + memset(hash->buckets, 0, *hash->n_buckets * sizeof(Entry*)); + + return hash; +} + +void +zix_hash_free(ZixHash* hash) +{ + for (unsigned b = 0; b < *hash->n_buckets; ++b) { + Entry* bucket = hash->buckets[b]; + for (Entry* e = bucket; e;) { + Entry* next = e->next; + free(e); + e = next; + } + } + + free(hash->buckets); + free(hash); +} + +unsigned +zix_string_hash(const void* key) +{ + // Trusty old DJB hash + const char* str = (const char*)key; + unsigned h = 5381; + for (const char* s = str; *s != '\0'; ++s) { + h = (h << 5) + h + *s; // h = h * 33 + c + } + return h; +} + +bool +zix_string_equal(const void* a, const void* b) +{ + return !strcmp((const char*)a, (const char*)b); +} + +static void +insert_entry(Entry** bucket, + Entry* entry) +{ + entry->next = *bucket; + *bucket = entry; +} + +static ZixStatus +rehash(ZixHash* hash, unsigned new_n_buckets) +{ + Entry** new_buckets = (Entry**)malloc(new_n_buckets * sizeof(Entry*)); + if (!new_buckets) { + return ZIX_STATUS_NO_MEM; + } + + memset(new_buckets, 0, new_n_buckets * sizeof(Entry*)); + + for (unsigned b = 0; b < *hash->n_buckets; ++b) { + for (Entry* e = hash->buckets[b]; e;) { + Entry* const next = e->next; + const unsigned h = e->hash % new_n_buckets; + insert_entry(&new_buckets[h], e); + e = next; + } + } + + free(hash->buckets); + hash->buckets = new_buckets; + + return ZIX_STATUS_SUCCESS; +} + +static Entry* +find_entry(const ZixHash* hash, + const void* key, + unsigned h) +{ + for (Entry* e = hash->buckets[h]; e; e = e->next) { + if (hash->key_equal_func(e->key, key)) { + return e; + } + } + + return NULL; +} + +void* +zix_hash_find(const ZixHash* hash, const void* key) +{ + const unsigned h = hash->hash_func(key) % *hash->n_buckets; + Entry* const entry = find_entry(hash, key, h); + return entry ? entry->data : 0; +} + +ZixStatus +zix_hash_insert(ZixHash* hash, const void* key, void* data) +{ + unsigned h_nomod = hash->hash_func(key); + unsigned h = h_nomod % *hash->n_buckets; + + Entry* elem = find_entry(hash, key, h); + if (elem) { + assert(elem->hash == h_nomod); + return ZIX_STATUS_EXISTS; + } + + elem = (Entry*)malloc(sizeof(Entry)); + if (!elem) { + return ZIX_STATUS_NO_MEM; + } + elem->key = key; + elem->data = data; + elem->next = NULL; + elem->hash = h_nomod; + const unsigned next_n_buckets = *(hash->n_buckets + 1); + if (next_n_buckets != 0 && (hash->count + 1) >= next_n_buckets) { + if (!rehash(hash, next_n_buckets)) { + h = h_nomod % *(++hash->n_buckets); + } + } + + insert_entry(&hash->buckets[h], elem); + ++hash->count; + return ZIX_STATUS_SUCCESS; +} + +ZixStatus +zix_hash_remove(ZixHash* hash, const void* key) +{ + unsigned h = hash->hash_func(key) % *hash->n_buckets; + + Entry** next_ptr = &hash->buckets[h]; + for (Entry* e = hash->buckets[h]; e; e = e->next) { + if (hash->key_equal_func(e->key, key)) { + *next_ptr = e->next; + free(e); + return ZIX_STATUS_SUCCESS; + } + next_ptr = &e->next; + } + + if (hash->n_buckets != sizes) { + const unsigned prev_n_buckets = *(hash->n_buckets - 1); + if (hash->count - 1 <= prev_n_buckets) { + if (!rehash(hash, prev_n_buckets)) { + --hash->n_buckets; + } + } + } + + --hash->count; + return ZIX_STATUS_NOT_FOUND; +} + +ZIX_API +void +zix_hash_foreach(const ZixHash* hash, + void (*f)(const void* key, void* value, void* user_data), + void* user_data) +{ + for (unsigned b = 0; b < *hash->n_buckets; ++b) { + Entry* bucket = hash->buckets[b]; + for (Entry* e = bucket; e; e = e->next) { + f(e->key, e->data, user_data); + } + } +} + diff --git a/test/hash_test.c b/test/hash_test.c new file mode 100644 index 0000000..ec8ea6f --- /dev/null +++ b/test/hash_test.c @@ -0,0 +1,157 @@ +/* + Copyright 2011 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include +#include +#include + +#include "zix/hash.h" + +static const char* strings[] = { + "one", "two", "three", "four", "five", "six", "seven", "eight", + "2one", "2two", "2three", "2four", "2five", "2six", "2seven", "2eight", + "3one", "3two", "3three", "3four", "3five", "3six", "3seven", "3eight", + "4one", "4two", "4three", "4four", "4five", "4six", "4seven", "4eight", + "5one", "5two", "5three", "5four", "5five", "5six", "5seven", "5eight", + "6one", "6two", "6three", "6four", "6five", "6six", "6seven", "6eight", + "7one", "7two", "7three", "7four", "7five", "7six", "7seven", "7eight", + "8one", "8two", "8three", "8four", "8five", "8six", "8seven", "8eight", + "9one", "9two", "9three", "9four", "9five", "9six", "9seven", "9eight", + "Aone", "Atwo", "Athree", "Afour", "Afive", "Asix", "Aseven", "Aeight", + "Bone", "Btwo", "Bthree", "Bfour", "Bfive", "Bsix", "Bseven", "Beight", + "Cone", "Ctwo", "Cthree", "Cfour", "Cfive", "Csix", "Cseven", "Ceight", + "Done", "Dtwo", "Dthree", "Dfour", "Dfive", "Dsix", "Dseven", "Deight", +}; + +static int +test_fail(const char* fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "error: "); + vfprintf(stderr, fmt, args); + va_end(args); + return 1; +} + +static unsigned n_checked = 0; + +static void +check(const void* key, void* value, void* user_data) +{ + if (key == value) { + ++n_checked; + } else { + fprintf(stderr, "ERROR: %s != %s\n", + (const char*)key, (const char*)value); + } +} + +int +main(int argc, char** argv) +{ + ZixHash* hash = zix_hash_new(zix_string_hash, zix_string_equal); + + const size_t n_strings = sizeof(strings) / sizeof(char*); + + // Insert each string + for (size_t i = 0; i < n_strings; ++i) { + ZixStatus st = zix_hash_insert(hash, strings[i], (char*)strings[i]); + if (st) { + return test_fail("Failed to insert `%s'\n", strings[i]); + } + } + + //zix_hash_print_dot(hash, stdout); + + // Attempt to insert each string again + for (size_t i = 0; i < n_strings; ++i) { + ZixStatus st = zix_hash_insert(hash, strings[i], (char*)strings[i]); + if (st != ZIX_STATUS_EXISTS) { + return test_fail("Double inserted `%s'\n", strings[i]); + } + } + + // Search for each string + for (size_t i = 0; i < n_strings; ++i) { + const char* match = (const char*)zix_hash_find(hash, strings[i]); + if (!match) { + return test_fail("Failed to find `%s'\n", strings[i]); + } + if (match != strings[i]) { + return test_fail("Bad match for `%s'\n", strings[i]); + } + } + + // Try some false matches + const char* not_indexed[] = { + "ftp://example.org/not-there-at-all", + "http://example.org/foobar", + "http://", + "http://otherdomain.com" + }; + const size_t n_not_indexed = sizeof(not_indexed) / sizeof(char*); + for (size_t i = 0; i < n_not_indexed; ++i) { + const char* match = (const char*)zix_hash_find(hash, not_indexed[i]); + if (match) { + return test_fail("Unexpectedly found `%s'\n", not_indexed[i]); + } + } + + // Remove strings + for (size_t i = 0; i < n_strings; ++i) { + // Remove string + ZixStatus st = zix_hash_remove(hash, strings[i]); + if (st) { + return test_fail("Failed to remove `%s'\n", strings[i]); + } + + // Ensure second removal fails + st = zix_hash_remove(hash, strings[i]); + if (st != ZIX_STATUS_NOT_FOUND) { + return test_fail("Unexpectedly removed `%s' twice\n", strings[i]); + } + + // Check to ensure remaining strings are still present + for (size_t j = i + 1; j < n_strings; ++j) { + const char* match = (const char*)zix_hash_find(hash, strings[j]); + if (!match) { + return test_fail("Failed to find `%s' after remove\n", strings[j]); + } + if (match != strings[j]) { + return test_fail("Bad match for `%s' after remove\n", strings[j]); + } + } + } + + // Insert each string again (to check non-empty desruction) + for (size_t i = 0; i < n_strings; ++i) { + ZixStatus st = zix_hash_insert(hash, strings[i], (char*)strings[i]); + if (st) { + return test_fail("Failed to insert `%s'\n", strings[i]); + } + } + + // Check key == value (and test zix_hash_foreach) + zix_hash_foreach(hash, check, NULL); + if (n_checked != n_strings) { + return test_fail("Check failed\n"); + } + + zix_hash_free(hash); + + return 0; +} diff --git a/test/patree_bench.c b/test/patree_bench.c index 02998d6..64e8042 100644 --- a/test/patree_bench.c +++ b/test/patree_bench.c @@ -23,8 +23,9 @@ #include -#include "zix/patree.h" #include "zix/fat_patree.h" +#include "zix/hash.h" +#include "zix/patree.h" const unsigned seed = 1; @@ -52,7 +53,7 @@ main(int argc, char** argv) return test_fail("Failed to open file %s\n", file); } - size_t max_n_strings = 100000; + size_t max_n_strings = 10000000; /* Read input strings */ char** strings = NULL; @@ -87,12 +88,13 @@ main(int argc, char** argv) FILE* insert_dat = fopen("insert.dat", "w"); FILE* search_dat = fopen("search.dat", "w"); - fprintf(insert_dat, "# n\tGHashTable\tZixPatree\tZixFatPatree\n"); - fprintf(search_dat, "# n\tGHashTable\tZixPatree\tZixFatPatree\n"); + fprintf(insert_dat, "# n\tGHashTable\tZixHash\tZixPatree\tZixFatPatree\n"); + fprintf(search_dat, "# n\tGHashTable\tZixHash\tZixPatree\tZixFatPatree\n"); for (size_t n = 1; n <= n_strings; n *= 2) { printf("Benchmarking n = %zu\n", n); ZixPatree* patree = zix_patree_new(); + ZixHash* zhash = zix_hash_new(zix_string_hash, zix_string_equal); ZixFatPatree* fat_patree = zix_fat_patree_new(); GHashTable* hash = g_hash_table_new(g_str_hash, g_str_equal); fprintf(insert_dat, "%zu", n); @@ -107,6 +109,16 @@ main(int argc, char** argv) } fprintf(insert_dat, "\t%lf", bench_end(&insert_start)); + // ZixHash + insert_start = bench_start(); + for (size_t i = 0; i < n; ++i) { + ZixStatus st = zix_hash_insert(zhash, strings[i], strings[i]); + if (st && st != ZIX_STATUS_EXISTS) { + return test_fail("Failed to insert `%s'\n", strings[i]); + } + } + fprintf(insert_dat, "\t%lf", bench_end(&insert_start)); + // ZixPatree insert_start = bench_start(); for (size_t i = 0; i < n; ++i) { @@ -119,12 +131,14 @@ main(int argc, char** argv) // ZixFatPatree insert_start = bench_start(); + /* for (size_t i = 0; i < n; ++i) { ZixStatus st = zix_fat_patree_insert(fat_patree, strings[i]); if (st && st != ZIX_STATUS_EXISTS) { return test_fail("Failed to insert `%s'\n", strings[i]); } } + */ fprintf(insert_dat, "\t%lf\n", bench_end(&insert_start)); // Benchmark search @@ -141,6 +155,21 @@ main(int argc, char** argv) } fprintf(search_dat, "\t%lf", bench_end(&search_start)); + // ZixHash + srand(seed); + search_start = bench_start(); + for (size_t i = 0; i < n; ++i) { + const size_t index = rand() % n; + const char* match = NULL; + if (!(match = zix_hash_find(zhash, strings[index]))) { + return test_fail("Hash: Failed to find `%s'\n", strings[index]); + } + if (strcmp(match, strings[index])) { + return test_fail("Hash: Bad match for `%s'\n", strings[index]); + } + } + fprintf(search_dat, "\t%lf", bench_end(&search_start)); + // ZixPatree srand(seed); search_start = bench_start(); @@ -151,6 +180,7 @@ main(int argc, char** argv) return test_fail("Patree: Failed to find `%s'\n", strings[index]); } if (strcmp(match, strings[index])) { + return test_fail("Patree: Bad match for `%s'\n", strings[index]); } } @@ -159,6 +189,7 @@ main(int argc, char** argv) // ZixFatPatree srand(seed); search_start = bench_start(); + /* for (size_t i = 0; i < n; ++i) { const size_t index = rand() % n; char* match = NULL; @@ -169,10 +200,12 @@ main(int argc, char** argv) return test_fail("FatPatree: Bad match for `%s'\n", strings[index]); } } + */ fprintf(search_dat, "\t%lf\n", bench_end(&search_start)); zix_patree_free(patree); zix_fat_patree_free(fat_patree); + zix_hash_free(zhash); g_hash_table_unref(hash); } diff --git a/wscript b/wscript index b2521f1..27eb91f 100644 --- a/wscript +++ b/wscript @@ -63,6 +63,7 @@ def configure(conf): print('') tests = [ + 'hash_test', 'patree_test', 'ring_test', 'sorted_array_test', @@ -79,6 +80,7 @@ def build(bld): lib_source = ''' src/fat_patree.c + src/hash.c src/patree.c src/ring.c src/sorted_array.c diff --git a/zix/common.h b/zix/common.h index 2107df5..a7edf76 100644 --- a/zix/common.h +++ b/zix/common.h @@ -17,6 +17,8 @@ #ifndef ZIX_COMMON_H #define ZIX_COMMON_H +#include + /** @addtogroup zix @{ @@ -55,7 +57,11 @@ typedef enum { typedef int (*ZixComparator)(const void* a, const void* b, void* user_data); /** - @} + Function for testing equality of two elements. +*/ +typedef bool (*ZixEqualFunc)(const void* a, const void* b); + +/**@} */ #endif /* ZIX_COMMON_H */ diff --git a/zix/hash.h b/zix/hash.h new file mode 100644 index 0000000..44521f1 --- /dev/null +++ b/zix/hash.h @@ -0,0 +1,75 @@ +/* + Copyright 2011 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef ZIX_HASH_H +#define ZIX_HASH_H + +#include "zix/common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ZixHashImpl ZixHash; + +/** + Function for computing the hash of an element. +*/ +typedef unsigned (*ZixHashFunc)(const void* key); + +ZIX_API +ZixHash* +zix_hash_new(ZixHashFunc hash_func, + ZixEqualFunc key_equal_func); + +ZIX_API +void +zix_hash_free(ZixHash* hash); + +ZIX_API +unsigned +zix_string_hash(const void* key); + +ZIX_API +bool +zix_string_equal(const void* a, const void* b); + +ZIX_API +ZixStatus +zix_hash_insert(ZixHash* hash, + const void* key, + void* data); + +ZIX_API +ZixStatus +zix_hash_remove(ZixHash* hash, const void* key); + +ZIX_API +void* +zix_hash_find(const ZixHash* hash, + const void* key); + +ZIX_API +void +zix_hash_foreach(const ZixHash* hash, + void (*f)(const void* key, void* value, void* user_data), + void* user_data); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* ZIX_HASH_H */ -- cgit v1.2.1