From f522f8e480e8ef95ecb5e597ff3a22700d9ac9cf Mon Sep 17 00:00:00 2001 From: David Robillard Date: Fri, 10 Sep 2021 20:11:44 -0400 Subject: Rewrite ZixHash as a flat table with open addressing --- benchmark/dict_bench.c | 82 +++++----- include/.clang-tidy | 1 + include/zix/hash.h | 277 ++++++++++++++++++++++++++++----- meson.build | 2 +- src/hash.c | 403 +++++++++++++++++++++++++++++++------------------ test/.clang-tidy | 1 + test/hash_test.c | 314 ++++++++++++++++++++++++++++---------- 7 files changed, 776 insertions(+), 304 deletions(-) diff --git a/benchmark/dict_bench.c b/benchmark/dict_bench.c index 952aae3..62c1283 100644 --- a/benchmark/dict_bench.c +++ b/benchmark/dict_bench.c @@ -1,5 +1,5 @@ /* - Copyright 2011-2020 David Robillard + Copyright 2011-2021 David Robillard Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above @@ -25,7 +25,6 @@ ZIX_DISABLE_GLIB_WARNINGS #include ZIX_RESTORE_WARNINGS -#include #include #include #include @@ -35,7 +34,7 @@ ZIX_RESTORE_WARNINGS #include typedef struct { - void* buf; + char* buf; size_t len; } ZixChunk; @@ -49,10 +48,16 @@ lcg64(const uint64_t i) return (a * i) + c; } -static uint32_t +ZIX_PURE_FUNC static const void* +identity(const void* record) +{ + return record; +} + +static size_t zix_chunk_hash(const ZixChunk* const chunk) { - return zix_digest32(0u, chunk->buf, chunk->len); + return zix_digest(0u, chunk->buf, chunk->len); } static bool @@ -88,27 +93,27 @@ main(int argc, char** argv) return test_fail("Failed to open file %s\n", file); } - size_t max_n_strings = 100000000; + size_t max_n_strings = 1u << 20u; /* Read input strings */ - char** strings = NULL; - size_t* lengths = NULL; - size_t n_strings = 0; - char* buf = (char*)calloc(1, 1); - size_t buf_len = 1; - size_t this_str_len = 0; + ZixChunk* chunks = NULL; + size_t n_chunks = 0; + char* buf = (char*)calloc(1, 1); + size_t buf_len = 1; + size_t this_str_len = 0; for (int c = 0; (c = fgetc(fd)) != EOF;) { - if (isspace(c)) { + if (c == '\n') { if (this_str_len == 0) { continue; } - strings = (char**)realloc(strings, (n_strings + 1) * sizeof(char*)); - lengths = (size_t*)realloc(lengths, (n_strings + 1) * sizeof(size_t)); - strings[n_strings] = (char*)malloc(buf_len); - lengths[n_strings] = this_str_len; - memcpy(strings[n_strings], buf, buf_len); + + chunks = (ZixChunk*)realloc(chunks, (n_chunks + 1) * sizeof(ZixChunk)); + + chunks[n_chunks].buf = (char*)malloc(buf_len); + chunks[n_chunks].len = this_str_len; + memcpy(chunks[n_chunks].buf, buf, buf_len); this_str_len = 0; - if (++n_strings == max_n_strings) { + if (++n_chunks == max_n_strings) { break; } } else { @@ -129,12 +134,13 @@ main(int argc, char** argv) fprintf(insert_dat, "# n\tGHashTable\tZixHash\n"); fprintf(search_dat, "# n\tGHashTable\tZixHash\n"); - for (size_t n = n_strings / 16; n <= n_strings; n *= 2) { + for (size_t n = n_chunks / 16; n <= n_chunks; n *= 2) { printf("Benchmarking n = %zu\n", n); - GHashTable* hash = g_hash_table_new(g_str_hash, g_str_equal); - ZixHash* zhash = zix_hash_new((ZixHashFunc)zix_chunk_hash, - (ZixEqualFunc)zix_chunk_equal, - sizeof(ZixChunk)); + GHashTable* hash = g_hash_table_new(g_str_hash, g_str_equal); + + ZixHash* zhash = zix_hash_new( + identity, (ZixHashFunc)zix_chunk_hash, (ZixEqualFunc)zix_chunk_equal); + fprintf(insert_dat, "%zu", n); fprintf(search_dat, "%zu", n); @@ -143,17 +149,16 @@ main(int argc, char** argv) // GHashTable struct timespec insert_start = bench_start(); for (size_t i = 0; i < n; ++i) { - g_hash_table_insert(hash, strings[i], strings[i]); + g_hash_table_insert(hash, chunks[i].buf, chunks[i].buf); } fprintf(insert_dat, "\t%lf", bench_end(&insert_start)); // ZixHash insert_start = bench_start(); for (size_t i = 0; i < n; ++i) { - const ZixChunk chunk = {strings[i], lengths[i] + 1}; - ZixStatus st = zix_hash_insert(zhash, &chunk, NULL); + ZixStatus st = zix_hash_insert(zhash, &chunks[i]); if (st && st != ZIX_STATUS_EXISTS) { - return test_fail("Failed to insert `%s'\n", strings[i]); + return test_fail("Failed to insert `%s'\n", (const char*)chunks[i].buf); } } fprintf(insert_dat, "\t%lf\n", bench_end(&insert_start)); @@ -164,9 +169,10 @@ main(int argc, char** argv) struct timespec search_start = bench_start(); for (size_t i = 0; i < n; ++i) { const size_t index = (size_t)(lcg64(seed + i) % n); - char* match = (char*)g_hash_table_lookup(hash, strings[index]); - if (!!strcmp(match, strings[index])) { - return test_fail("Bad match for `%s'\n", strings[index]); + char* match = (char*)g_hash_table_lookup(hash, chunks[index].buf); + if (!!strcmp(match, chunks[index].buf)) { + return test_fail( + "GHashTable: Bad match `%s' for `%s'\n", match, chunks[index].buf); } } fprintf(search_dat, "\t%lf", bench_end(&search_start)); @@ -175,17 +181,15 @@ main(int argc, char** argv) search_start = bench_start(); for (size_t i = 0; i < n; ++i) { const size_t index = (size_t)(lcg64(seed + i) % n); - const ZixChunk key = {strings[index], lengths[index] + 1}; const ZixChunk* match = NULL; - if (!(match = (const ZixChunk*)zix_hash_find(zhash, &key))) { - return test_fail("Hash: Failed to find `%s'\n", strings[index]); + if (!(match = + (const ZixChunk*)zix_hash_find_record(zhash, &chunks[index]))) { + return test_fail("Hash: Failed to find `%s'\n", chunks[index].buf); } - if (!!strcmp((const char*)match->buf, strings[index])) { - return test_fail("Hash: Bad match %p for `%s': `%s'\n", - (const void*)match, - strings[index], - (const char*)match->buf); + if (!!strcmp(match->buf, chunks[index].buf)) { + return test_fail( + "ZixHash: Bad match `%s' for `%s'\n", match->buf, chunks[index].buf); } } fprintf(search_dat, "\t%lf\n", bench_end(&search_start)); diff --git a/include/.clang-tidy b/include/.clang-tidy index fec15b8..fc5b3c5 100644 --- a/include/.clang-tidy +++ b/include/.clang-tidy @@ -5,6 +5,7 @@ Checks: > -clang-diagnostic-unused-function, -clang-diagnostic-unused-macros, -hicpp-multiway-paths-covered, + -hicpp-uppercase-literal-suffix, -llvmlibc-*, WarningsAsErrors: '*' HeaderFilterRegex: '.*' diff --git a/include/zix/hash.h b/include/zix/hash.h index 6312d7b..f08d267 100644 --- a/include/zix/hash.h +++ b/include/zix/hash.h @@ -1,5 +1,5 @@ /* - Copyright 2011-2020 David Robillard + Copyright 2011-2021 David Robillard Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above @@ -19,8 +19,8 @@ #include "zix/common.h" +#include #include -#include #ifdef __cplusplus extern "C" { @@ -33,90 +33,289 @@ extern "C" { @{ */ +// ZIX_HASH_KEY_TYPE can be defined to make the API more type-safe +#if defined(ZIX_HASH_KEY_TYPE) +typedef ZIX_HASH_KEY_TYPE ZixHashKey; +#else +typedef void ZixHashKey; +#endif + +// ZIX_HASH_RECORD_TYPE can be defined to make the API more type-safe +#if defined(ZIX_HASH_RECORD_TYPE) +typedef ZIX_HASH_RECORD_TYPE ZixHashRecord; +#else +typedef void ZixHashRecord; +#endif + +// ZIX_HASH_SEARCH_DATA_TYPE can be defined to make the API more type-safe +#if defined(ZIX_HASH_SEARCH_DATA_TYPE) +typedef ZIX_HASH_SEARCH_DATA_TYPE ZixHashSearchData; +#else +typedef void ZixHashSearchData; +#endif + +/** + A hash table. + + This is an open addressing hash table that stores pointers to arbitrary user + data. Internally, everything is stored in a single flat array that is + resized when necessary. + + The single user-provided pointer that is stored in the table is called a + "record". A record contains a "key", which is accessed via a user-provided + function. This design supports storing large records (which may be + expensive to allocate/construct) without requiring an entire record to + search. Simple atomic values can be stored by providing a trivial identity + function as a key function. + + The table uses power of 2 sizes with a growth factor of 2, so that hash + values can be folded into an array index using bitwise AND as a fast modulo. + This means that only the necessary low bits of the hash value will be used, + so the hash function must be well-balanced within this range. More or less + any good modern hash algorithm will be fine, but beware, for example, hash + functions that assume they are targeting a table with a prime size. + + Since this doubles and halves in size, it may not be an optimal choice if + memory reuse is a priority. A growth factor of 1.5 with fast range + reduction may be a better choice there, at the cost of requiring 128-bit + arithmetic on 64-bit platforms, and indexing operations being slightly more + expensive. +*/ typedef struct ZixHashImpl ZixHash; -/// Function for computing the hash of an element -typedef uint32_t (*ZixHashFunc)(const void* value); +/// A full hash code for a key which is not folded down to the table size +typedef size_t ZixHashCode; + +/** + An iterator to an entry in a hash table. + + This is really just an index, but should be considered opaque to the user + and only used via the provided API and equality comparison. +*/ +typedef size_t ZixHashIter; + +/// User function for computing the hash of a key +typedef ZixHashCode (*ZixHashFunc)(const ZixHashKey* key); + +/// User function for determining if two keys are truly equal +typedef bool (*ZixKeyEqualFunc)(const ZixHashKey* a, const ZixHashKey* b); -/// Function to visit a hash element -typedef void (*ZixHashVisitFunc)(void* value, void* user_data); +/// User function for determining if a key matches in a custom search +typedef bool (*ZixKeyMatchFunc)(const ZixHashKey* key, + const ZixHashSearchData* user_data); + +/// User function for getting the key of a record +typedef const ZixHashKey* (*ZixKeyFunc)(const ZixHashRecord* record); /** - Create a new hash table. + A "plan" (position) to insert a record in a hash table. - To minimize space overhead, unlike many hash tables this stores a single - value, not a key and a value. Any size of value can be stored, but all the - values in the hash table must be the same size, and the values must be safe - to copy with memcpy. To get key:value behaviour, simply insert a struct - with a key and value into the hash. + This contains everything necessary to insert a record, except the record + itself. It is exposed to split up insertion so that records only need to be + allocated if an existing record is not found, but the contents should be + considered opaque to the user. +*/ +typedef struct { + ZixHashCode code; ///< Hash code used to find this position + ZixHashIter index; ///< Index into hash table +} ZixHashInsertPlan; - @param hash_func The hashing function. - @param equal_func A function to test value equality. - @param value_size The size of the values to be stored. +/** + Create a new hash table. + + @param key_func A function to retrieve the key from a record. + @param hash_func The key hashing function. + @param equal_func A function to test keys for equality. */ ZIX_API ZixHash* -zix_hash_new(ZixHashFunc hash_func, ZixEqualFunc equal_func, size_t value_size); +zix_hash_new(ZixKeyFunc key_func, + ZixHashFunc hash_func, + ZixKeyEqualFunc equal_func); /// Free `hash` ZIX_API void zix_hash_free(ZixHash* hash); -/// Return the number of elements in `hash` +/// Return an iterator to the first record in a hash, or the end if it is empty +ZIX_PURE_API +ZixHashIter +zix_hash_begin(const ZixHash* hash); + +/// Return an iterator one past the last possible record in a hash +ZIX_PURE_API +ZixHashIter +zix_hash_end(const ZixHash* hash); + +/// Return the record pointed to by an iterator +ZIX_PURE_API +ZixHashRecord* +zix_hash_get(const ZixHash* hash, ZixHashIter i); + +/// Return an iterator that has been advanced to the next record in a hash +ZIX_PURE_API +ZixHashIter +zix_hash_next(const ZixHash* hash, ZixHashIter i); + +/// Return the number of elements in a hash ZIX_PURE_API size_t zix_hash_size(const ZixHash* hash); /** - Insert an item into `hash`. + Find the best position to insert a record with the given key. + + This searches the hash table and returns either the position of an existing + equivalent record, or the best available position to insert a new one. The + difference can be determined with zix_hash_record_at(). + + If the returned position is not occupied, then it is valid to insert a + record with this key using zix_hash_insert_at() until the hash table is + modified (which invalidates the position). +*/ +ZIX_API +ZixHashInsertPlan +zix_hash_plan_insert(const ZixHash* hash, const ZixHashKey* key); + +/** + Find the best position to insert a record with a custom search. - If no matching value is found, ZIX_STATUS_SUCCESS will be returned, and @p - inserted will be pointed to the copy of `value` made in the new hash node. + This is an advanced low-level version of zix_hash_plan_insert(). It allows + a precalculated hash code to be given, along with a custom search predicate. + These must be compatible with the functions that manage the hash table: + trivial usage would be to pass the equality function used by the hash and + the key to search for. - If a matching value already exists, ZIX_STATUS_EXISTS will be returned, and - `inserted` will be pointed to the existing value. + This is exposed to make it possible to avoid constructing a key at all when + potentially inserting. For example, if keys are structures with a fixed + header and a variably-sized body, and you have a separate header and body + this can be used to find an insert position without having to allocate a + contiguous key. + + Note that care must be taken when using this function: improper use can + corrupt the hash table. The hash code given must be correct for the key to + be inserted, and the predicate must return true only if the key it is called + with (the first argument) matches the key to be inserted. +*/ +ZIX_API +ZixHashInsertPlan +zix_hash_plan_insert_prehashed(const ZixHash* hash, + ZixHashCode code, + ZixKeyMatchFunc predicate, + const ZixHashSearchData* user_data); + +/** + Return the record at the given position, or null. + + This is used to determine if a position returned from zix_hash_plan_insert() + can be used to insert a new record, or to access the existing matching + record. +*/ +ZIX_PURE_API +ZixHashRecord* +zix_hash_record_at(const ZixHash* hash, ZixHashInsertPlan position); + +/** + Insert a record at a specific position. + + The position must have been found with an earlier call to + zix_hash_plan_insert(), and no modifications must have been made to the hash + table since. + + @param hash The hash table. + + @param position The position for the new record. + + @param record The record to insert which, on success, can now be considered + owned by the hash table. + + @return ZIX_STATUS_SUCCESS, ZIX_STATUS_EXISTS if a record already exists at + this position, or ZIX_STATUS_NO_MEM if growing the hash table failed. +*/ +ZIX_API +ZixStatus +zix_hash_insert_at(ZixHash* hash, + ZixHashInsertPlan position, + ZixHashRecord* record); + +/** + Insert a record. + + This is a trivial wrapper for zix_hash_plan_insert() and + zix_hash_insert_at() that is more convenient when record construction is not + expensive. @param hash The hash table. - @param value The value to be inserted. - @param inserted The copy of `value` in the hash table. + + @param record The record to insert which, on success, can now be considered + owned by the hash table. + @return ZIX_STATUS_SUCCESS, ZIX_STATUS_EXISTS, or ZIX_STATUS_NO_MEM. */ ZIX_API ZixStatus -zix_hash_insert(ZixHash* hash, const void* value, void** inserted); +zix_hash_insert(ZixHash* hash, ZixHashRecord* record); /** - Remove an item from `hash`. + Erase a record at a specific position. + + @param hash The hash table to remove the record from. + + @param i Iterator to the record to remove. This must be a valid iterator + from an earlier call to zix_hash_find(), that is, the hash table must not + have been modified since. + + @param removed Set to the removed record, or null. + + @return ZIX_STATUS_SUCCES or ZIX_STATUS_BAD_ARG if `i` does not point at a + removable record. +*/ +ZIX_API +ZixStatus +zix_hash_erase(ZixHash* hash, ZixHashIter i, ZixHashRecord** removed); + +/** + Remove a record. @param hash The hash table. - @param value The value to remove. + @param key The key of the record to remove. + @param removed Set to the removed record, or null. @return ZIX_STATUS_SUCCES or ZIX_STATUS_NOT_FOUND. */ ZIX_API ZixStatus -zix_hash_remove(ZixHash* hash, const void* value); +zix_hash_remove(ZixHash* hash, const ZixHashKey* key, ZixHashRecord** removed); /** - Search for an item in `hash`. + Find the position of a record with a given key. - @param hash The hash table. - @param value The value to search for. + @param hash The hash table to search. + + @param key The key of the desired record. + + @return An iterator to the matching record, or the end iterator if no such + record exists. */ ZIX_API -void* -zix_hash_find(const ZixHash* hash, const void* value); +ZixHashIter +zix_hash_find(const ZixHash* hash, const ZixHashKey* key); /** - Call `f` on each value in `hash`. + Find a record with a given key. - @param hash The hash table. - @param f The function to call on each value. - @param user_data The user_data parameter passed to `f`. + This is essentially the same as zix_hash_find(), but returns a pointer to + the record for convenience. + + @param hash The hash table to search. + + @param key The key of the desired record. + + @return A pointer to the matching record, of null if no such record exists. */ ZIX_API -void -zix_hash_foreach(ZixHash* hash, ZixHashVisitFunc f, void* user_data); +ZixHashRecord* +zix_hash_find_record(const ZixHash* hash, const ZixHashKey* key); /** @} diff --git a/meson.build b/meson.build index cdde084..7b8e872 100644 --- a/meson.build +++ b/meson.build @@ -34,7 +34,7 @@ if get_option('strict') elif cc.get_id() == 'gcc' c_warnings = [ # '-Waggregate-return', - '-Walloc-size-larger-than=4096', + '-Walloc-size-larger-than=16384', '-Walloc-zero', '-Walloca', '-Wanalyzer-too-complex', diff --git a/src/hash.c b/src/hash.c index d9556ed..5bc9d60 100644 --- a/src/hash.c +++ b/src/hash.c @@ -1,5 +1,5 @@ /* - Copyright 2011-2020 David Robillard + Copyright 2011-2021 David Robillard Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above @@ -17,214 +17,327 @@ #include "zix/hash.h" #include +#include +#include #include -#include - -/** - Primes, each slightly less than twice its predecessor, and as far away - from powers of two as possible. -*/ -static const unsigned sizes[] = { - 53, 97, 193, 389, 769, 1543, 3079, - 6151, 12289, 24593, 49157, 98317, 196613, 393241, - 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, - 100663319, 201326611, 402653189, 805306457, 1610612741, 0}; typedef struct ZixHashEntry { - struct ZixHashEntry* next; ///< Next entry in bucket - uint32_t hash; ///< Non-modulo hash value - // Value follows here (access with zix_hash_value) + ZixHashCode hash; ///< Non-folded hash value + ZixHashRecord* value; ///< Pointer to user-owned record } ZixHashEntry; struct ZixHashImpl { - ZixHashFunc hash_func; - ZixEqualFunc equal_func; - ZixHashEntry** buckets; - const unsigned* n_buckets; - size_t value_size; - unsigned count; + ZixKeyFunc key_func; ///< User-provided key accessor + ZixHashFunc hash_func; ///< User-provided hashing function + ZixKeyEqualFunc equal_func; ///< User-provided equality comparison function + size_t count; ///< Number of records stored in the table + size_t mask; ///< Bit mask for fast modulo (n_entries - 1) + size_t n_entries; ///< Power of two table size + ZixHashEntry* entries; ///< Pointer to dynamically allocated table }; -static inline void* -zix_hash_value(ZixHashEntry* entry) -{ - return entry + 1; -} +static const size_t min_n_entries = 4u; +static const size_t tombstone = 0xDEADu; ZixHash* -zix_hash_new(ZixHashFunc hash_func, ZixEqualFunc equal_func, size_t value_size) +zix_hash_new(const ZixKeyFunc key_func, + const ZixHashFunc hash_func, + const ZixKeyEqualFunc equal_func) { - ZixHash* hash = (ZixHash*)malloc(sizeof(ZixHash)); - if (hash) { - hash->hash_func = hash_func; - hash->equal_func = equal_func; - hash->n_buckets = &sizes[0]; - hash->value_size = value_size; - hash->count = 0; - if (!(hash->buckets = - (ZixHashEntry**)calloc(*hash->n_buckets, sizeof(ZixHashEntry*)))) { - free(hash); - return NULL; - } + ZixHash* const hash = (ZixHash*)malloc(sizeof(ZixHash)); + if (!hash) { + return NULL; } + + hash->key_func = key_func; + hash->hash_func = hash_func; + hash->equal_func = equal_func; + hash->count = 0u; + hash->n_entries = min_n_entries; + hash->mask = hash->n_entries - 1u; + + hash->entries = (ZixHashEntry*)calloc(hash->n_entries, sizeof(ZixHashEntry)); + if (!hash->entries) { + free(hash); + return NULL; + } + return hash; } void -zix_hash_free(ZixHash* hash) +zix_hash_free(ZixHash* const hash) { - if (!hash) { - return; + if (hash) { + free(hash->entries); + free(hash); } +} - for (unsigned b = 0; b < *hash->n_buckets; ++b) { - ZixHashEntry* bucket = hash->buckets[b]; - for (ZixHashEntry* e = bucket; e;) { - ZixHashEntry* next = e->next; - free(e); - e = next; - } - } +ZixHashIter +zix_hash_begin(const ZixHash* const hash) +{ + return hash->entries[0u].value ? 0u : zix_hash_next(hash, 0u); +} + +ZixHashIter +zix_hash_end(const ZixHash* const hash) +{ + return hash->n_entries; +} + +ZixHashRecord* +zix_hash_get(const ZixHash* hash, const ZixHashIter i) +{ + assert(i < hash->n_entries); - free(hash->buckets); - free(hash); + return hash->entries[i].value; +} + +ZixHashIter +zix_hash_next(const ZixHash* const hash, ZixHashIter i) +{ + do { + ++i; + } while (i < hash->n_entries && !hash->entries[i].value); + + return i; } size_t -zix_hash_size(const ZixHash* hash) +zix_hash_size(const ZixHash* const hash) { return hash->count; } -static inline void -insert_entry(ZixHashEntry** bucket, ZixHashEntry* entry) +static inline size_t +fold_hash(const ZixHashCode h_nomod, const size_t mask) +{ + return h_nomod & mask; +} + +static inline bool +is_empty(const ZixHashEntry* const entry) +{ + return !entry->value && !entry->hash; +} + +static inline bool +is_match(const ZixHash* const hash, + const ZixHashCode code, + const size_t entry_index, + ZixKeyEqualFunc predicate, + const void* const user_data) +{ + const ZixHashEntry* const entry = &hash->entries[entry_index]; + + return entry->value && entry->hash == code && + predicate(hash->key_func(entry->value), user_data); +} + +static inline size_t +next_index(const ZixHash* const hash, const size_t i) { - entry->next = *bucket; - *bucket = entry; + return (i == hash->mask) ? 0u : (i + 1u); } -static inline ZixStatus -rehash(ZixHash* hash, unsigned new_n_buckets) +static inline ZixHashIter +find_entry(const ZixHash* const hash, + const ZixHashKey* const key, + const size_t h, + const ZixHashCode code) { - ZixHashEntry** new_buckets = - (ZixHashEntry**)calloc(new_n_buckets, sizeof(ZixHashEntry*)); - if (!new_buckets) { + size_t i = h; + + while (!is_match(hash, code, i, hash->equal_func, key) && + !is_empty(&hash->entries[i])) { + i = next_index(hash, i); + } + + return i; +} + +static ZixStatus +rehash(ZixHash* const hash, const size_t old_n_entries) +{ + ZixHashEntry* const old_entries = hash->entries; + const size_t new_n_entries = hash->n_entries; + + // Replace the array in the hash first so we can use find_entry() normally + hash->entries = (ZixHashEntry*)calloc(new_n_entries, sizeof(ZixHashEntry)); + if (!hash->entries) { return ZIX_STATUS_NO_MEM; } - const unsigned old_n_buckets = *hash->n_buckets; - for (unsigned b = 0; b < old_n_buckets; ++b) { - for (ZixHashEntry* e = hash->buckets[b]; e;) { - ZixHashEntry* const next = e->next; - const unsigned h = e->hash % new_n_buckets; - insert_entry(&new_buckets[h], e); - e = next; + // Reinsert every element into the new array + for (size_t i = 0u; i < old_n_entries; ++i) { + ZixHashEntry* const entry = &old_entries[i]; + + if (entry->value) { + assert(hash->mask == hash->n_entries - 1u); + const size_t new_h = fold_hash(entry->hash, hash->mask); + const size_t new_i = find_entry(hash, entry->value, new_h, entry->hash); + + hash->entries[new_i] = *entry; } } - free(hash->buckets); - hash->buckets = new_buckets; + free(old_entries); + return ZIX_STATUS_SUCCESS; +} + +static ZixStatus +grow(ZixHash* const hash) +{ + const size_t old_n_entries = hash->n_entries; + + hash->n_entries <<= 1u; + hash->mask = hash->n_entries - 1u; + + return rehash(hash, old_n_entries); +} + +static ZixStatus +shrink(ZixHash* const hash) +{ + if (hash->n_entries > min_n_entries) { + const size_t old_n_entries = hash->n_entries; + + hash->n_entries >>= 1u; + hash->mask = hash->n_entries - 1u; + + return rehash(hash, old_n_entries); + } return ZIX_STATUS_SUCCESS; } -static inline ZixHashEntry* -find_entry(const ZixHash* hash, - const void* key, - const unsigned h, - const unsigned h_nomod) +ZixHashIter +zix_hash_find(const ZixHash* const hash, const ZixHashKey* const key) +{ + const ZixHashCode h_nomod = hash->hash_func(key); + const size_t h = fold_hash(h_nomod, hash->mask); + const ZixHashIter i = find_entry(hash, key, h, h_nomod); + + return is_empty(&hash->entries[i]) ? hash->n_entries : i; +} + +ZixHashRecord* +zix_hash_find_record(const ZixHash* const hash, const ZixHashKey* const key) { - for (ZixHashEntry* e = hash->buckets[h]; e; e = e->next) { - if (e->hash == h_nomod && hash->equal_func(zix_hash_value(e), key)) { - return e; + const ZixHashCode h_nomod = hash->hash_func(key); + const size_t h = fold_hash(h_nomod, hash->mask); + + return hash->entries[find_entry(hash, key, h, h_nomod)].value; +} + +ZixHashInsertPlan +zix_hash_plan_insert_prehashed(const ZixHash* const hash, + const ZixHashCode code, + const ZixKeyMatchFunc predicate, + const void* const user_data) +{ + // Calculate an ideal initial position + ZixHashInsertPlan pos = {code, fold_hash(code, hash->mask)}; + + // Search for a free position starting at the ideal one + const size_t start_index = pos.index; + size_t first_tombstone = SIZE_MAX; + while (!is_empty(&hash->entries[pos.index])) { + if (is_match(hash, code, pos.index, predicate, user_data)) { + return pos; + } + + if (first_tombstone == SIZE_MAX && !hash->entries[pos.index].value) { + assert(hash->entries[pos.index].hash == tombstone); + first_tombstone = pos.index; // Remember the first/best free index + } + + pos.index = next_index(hash, pos.index); + if (pos.index == start_index) { + break; // Rare edge case: entire table is full of entries/tombstones } } - return NULL; + + // If we found a tombstone before an empty slot, place the new element there + pos.index = first_tombstone < SIZE_MAX ? first_tombstone : pos.index; + assert(!hash->entries[pos.index].value); + + return pos; } -void* -zix_hash_find(const ZixHash* hash, const void* value) +ZixHashInsertPlan +zix_hash_plan_insert(const ZixHash* const hash, const ZixHashKey* const key) { - const unsigned h_nomod = hash->hash_func(value); - const unsigned h = h_nomod % *hash->n_buckets; - ZixHashEntry* const entry = find_entry(hash, value, h, h_nomod); - return entry ? zix_hash_value(entry) : 0; + return zix_hash_plan_insert_prehashed( + hash, hash->hash_func(key), hash->equal_func, key); } -ZixStatus -zix_hash_insert(ZixHash* hash, const void* value, void** inserted) +ZixHashRecord* +zix_hash_record_at(const ZixHash* const hash, const ZixHashInsertPlan position) { - unsigned h_nomod = hash->hash_func(value); - unsigned h = h_nomod % *hash->n_buckets; + return hash->entries[position.index].value; +} - ZixHashEntry* elem = find_entry(hash, value, h, h_nomod); - if (elem) { - assert(elem->hash == h_nomod); - if (inserted) { - *inserted = zix_hash_value(elem); - } +ZixStatus +zix_hash_insert_at(ZixHash* const hash, + const ZixHashInsertPlan position, + ZixHashRecord* const record) +{ + if (hash->entries[position.index].value) { return ZIX_STATUS_EXISTS; } - elem = (ZixHashEntry*)malloc(sizeof(ZixHashEntry) + hash->value_size); - if (!elem) { - return ZIX_STATUS_NO_MEM; - } - elem->next = NULL; - elem->hash = h_nomod; - memcpy(elem + 1, value, hash->value_size); - - const unsigned next_n_buckets = *(hash->n_buckets + 1); - if (next_n_buckets != 0 && (hash->count + 1) >= next_n_buckets) { - if (!rehash(hash, next_n_buckets)) { - h = h_nomod % *(++hash->n_buckets); - } - } + // Set entry to new value + ZixHashEntry* const entry = &hash->entries[position.index]; + assert(!entry->value); + entry->hash = position.code; + entry->value = record; - insert_entry(&hash->buckets[h], elem); - ++hash->count; - if (inserted) { - *inserted = zix_hash_value(elem); + // Update size and rehash if we exceeded the maximum load + const size_t max_load = hash->n_entries / 2u + hash->n_entries / 8u; + if (++hash->count >= max_load) { + return grow(hash); } + return ZIX_STATUS_SUCCESS; } ZixStatus -zix_hash_remove(ZixHash* hash, const void* value) -{ - const unsigned h_nomod = hash->hash_func(value); - const unsigned h = h_nomod % *hash->n_buckets; - - ZixHashEntry** next_ptr = &hash->buckets[h]; - for (ZixHashEntry* e = hash->buckets[h]; e; e = e->next) { - if (h_nomod == e->hash && hash->equal_func(zix_hash_value(e), value)) { - *next_ptr = e->next; - free(e); - --hash->count; - return ZIX_STATUS_SUCCESS; - } - next_ptr = &e->next; - } +zix_hash_insert(ZixHash* const hash, ZixHashRecord* const record) +{ + const ZixHashKey* const key = hash->key_func(record); + const ZixHashInsertPlan position = zix_hash_plan_insert(hash, key); - if (hash->n_buckets != sizes) { - const unsigned prev_n_buckets = *(hash->n_buckets - 1); - if (hash->count - 1 <= prev_n_buckets) { - if (!rehash(hash, prev_n_buckets)) { - --hash->n_buckets; - } - } + return zix_hash_insert_at(hash, position, record); +} + +ZixStatus +zix_hash_erase(ZixHash* const hash, + const ZixHashIter i, + ZixHashRecord** const removed) +{ + // Replace entry with a tombstone + *removed = hash->entries[i].value; + hash->entries[i].hash = tombstone; + hash->entries[i].value = NULL; + + // Decrease element count and rehash if necessary + --hash->count; + if (hash->count < hash->n_entries / 4u) { + return shrink(hash); } - return ZIX_STATUS_NOT_FOUND; + return ZIX_STATUS_SUCCESS; } -void -zix_hash_foreach(ZixHash* hash, ZixHashVisitFunc f, void* user_data) +ZixStatus +zix_hash_remove(ZixHash* const hash, + const ZixHashKey* const key, + ZixHashRecord** const removed) { - for (unsigned b = 0; b < *hash->n_buckets; ++b) { - ZixHashEntry* bucket = hash->buckets[b]; - for (ZixHashEntry* e = bucket; e; e = e->next) { - f(zix_hash_value(e), user_data); - } - } + const ZixHashIter i = zix_hash_find(hash, key); + + return i == hash->n_entries ? ZIX_STATUS_NOT_FOUND + : zix_hash_erase(hash, i, removed); } diff --git a/test/.clang-tidy b/test/.clang-tidy index b8084d5..0139c99 100644 --- a/test/.clang-tidy +++ b/test/.clang-tidy @@ -11,6 +11,7 @@ Checks: > -cppcoreguidelines-avoid-non-const-global-variables, -google-readability-casting, -hicpp-multiway-paths-covered, + -hicpp-uppercase-literal-suffix, -llvm-header-guard, -llvmlibc-*, -misc-no-recursion, diff --git a/test/hash_test.c b/test/hash_test.c index d82b76a..68556b0 100644 --- a/test/hash_test.c +++ b/test/hash_test.c @@ -14,37 +14,29 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#undef NDEBUG + +#define ZIX_HASH_KEY_TYPE const char +#define ZIX_HASH_RECORD_TYPE const char + +#include "test_data.h" #include "test_malloc.h" #include "zix/common.h" #include "zix/digest.h" #include "zix/hash.h" +#include #include #include #include #include #include +#include #include static bool expect_failure = false; -static const char* strings[] = { - "one", "two", "three", "four", "five", "six", "seven", "eight", - "2one", "2two", "2three", "2four", "2five", "2six", "2seven", "2eight", - "3one", "3two", "3three", "3four", "3five", "3six", "3seven", "3eight", - "4one", "4two", "4three", "4four", "4five", "4six", "4seven", "4eight", - "5one", "5two", "5three", "5four", "5five", "5six", "5seven", "5eight", - "6one", "6two", "6three", "6four", "6five", "6six", "6seven", "6eight", - "7one", "7two", "7three", "7four", "7five", "7six", "7seven", "7eight", - "8one", "8two", "8three", "8four", "8five", "8six", "8seven", "8eight", - "9one", "9two", "9three", "9four", "9five", "9six", "9seven", "9eight", - "Aone", "Atwo", "Athree", "Afour", "Afive", "Asix", "Aseven", "Aeight", - "Bone", "Btwo", "Bthree", "Bfour", "Bfive", "Bsix", "Bseven", "Beight", - "Cone", "Ctwo", "Cthree", "Cfour", "Cfive", "Csix", "Cseven", "Ceight", - "Done", "Dtwo", "Dthree", "Dfour", "Dfive", "Dsix", "Dseven", "Deight", -}; - ZIX_LOG_FUNC(1, 2) static int test_fail(const char* fmt, ...) @@ -60,161 +52,323 @@ test_fail(const char* fmt, ...) return 1; } -static unsigned n_checked = 0; +ZIX_PURE_FUNC static const char* +identity(const char* record) +{ + return record; +} + +/// Decent hash function using zix_digest (murmur2) +ZIX_PURE_FUNC static size_t +decent_string_hash(const char* const str) +{ + return zix_digest(0u, str, strlen(str)); +} -static void -check(void* value, void* ZIX_UNUSED(user_data)) +/// Terrible hash function from K&R first edition +ZIX_PURE_FUNC static size_t +terrible_string_hash(const char* str) { - if (strlen(*(const char* const*)value) >= 3) { - ++n_checked; - } else { - fprintf(stderr, "ERROR: %s\n", (const char*)value); + size_t hash = 0u; + uint8_t c = 0u; + + while ((c = (uint8_t)*str++)) { + hash += c; } + + return hash; } -static uint32_t -string_ptr_hash(const void* value) +ZIX_PURE_FUNC static size_t +string_hash_aligned(const char* const str) { - const char* const str = *(const char* const*)value; + size_t length = strlen(str); - return zix_digest32(0u, str, strlen(str)); + length = (length + (sizeof(size_t) - 1)) / sizeof(size_t) * sizeof(size_t); + return zix_digest_aligned(0u, str, length); +} + +ZIX_PURE_FUNC static size_t +string_hash32(const char* const str) +{ + return (size_t)zix_digest32(0u, str, strlen(str)); } +ZIX_PURE_FUNC static size_t +string_hash64(const char* const str) +{ + return (size_t)zix_digest64(0u, str, strlen(str)); +} + +ZIX_PURE_FUNC static size_t +string_hash32_aligned(const char* const str) +{ + size_t length = strlen(str); + + length = (length + 3u) / 4u * 4u; + return (size_t)zix_digest32_aligned(0u, str, length); +} + +#if UINTPTR_MAX >= UINT64_MAX + +ZIX_PURE_FUNC static size_t +string_hash64_aligned(const char* const str) +{ + size_t length = strlen(str); + + length = (length + 7u) / 8u * 8u; + return (size_t)zix_digest64_aligned(0u, str, length); +} + +#endif + static bool -string_ptr_equal(const void* a, const void* b) +string_equal(const char* const a, const char* const b) { - return !strcmp(*(const char* const*)a, *(const char* const*)b); + return !strcmp(a, b); } static int -stress(void) +stress_with(const ZixHashFunc hash_func, const size_t n_elems) { - ZixHash* hash = - zix_hash_new(string_ptr_hash, string_ptr_equal, sizeof(const char*)); + ZixHash* hash = zix_hash_new(identity, hash_func, string_equal); if (!hash) { return test_fail("Failed to allocate hash\n"); } - const size_t n_strings = sizeof(strings) / sizeof(const char*); + static const size_t string_length = 15; + + char* const buffer = (char*)calloc(1, n_elems * (string_length + 1)); + char** const strings = (char**)calloc(sizeof(char*), n_elems); + if (!buffer || !strings) { + free(buffer); + free(strings); + return test_fail("Failed to allocate strings\n"); + } + + uint32_t seed = 1u; + for (size_t i = 0u; i < n_elems; ++i) { + strings[i] = buffer + i * (string_length + 1); + assert((uintptr_t)strings[i] % sizeof(size_t) == 0); + assert((uintptr_t)strings[i] % sizeof(uint32_t) == 0); + + for (size_t j = 0u; j < string_length; ++j) { + seed = lcg32(seed); + strings[i][j] = (char)('!' + (seed % 92)); + } + } // Insert each string - for (size_t i = 0; i < n_strings; ++i) { - void* inserted = NULL; - ZixStatus st = zix_hash_insert(hash, &strings[i], &inserted); + for (size_t i = 0; i < n_elems; ++i) { + ZixStatus st = zix_hash_insert(hash, strings[i]); if (st) { return test_fail("Failed to insert `%s'\n", strings[i]); } - - if (*(const void* const*)inserted != strings[i]) { - return test_fail("Corrupt insertion %s != %s\n", - strings[i], - *(const char* const*)inserted); - } } // Ensure hash size is correct - if (zix_hash_size(hash) != n_strings) { - return test_fail("Hash size %" PRIuPTR " != %" PRIuPTR "\n", - zix_hash_size(hash), - n_strings); + if (zix_hash_size(hash) != n_elems) { + return test_fail( + "Hash size %" PRIuPTR " != %" PRIuPTR "\n", zix_hash_size(hash), n_elems); } - // zix_hash_print_dot(hash, stdout); - // Attempt to insert each string again - for (size_t i = 0; i < n_strings; ++i) { - void* inserted = NULL; - ZixStatus st = zix_hash_insert(hash, &strings[i], &inserted); + for (size_t i = 0; i < n_elems; ++i) { + ZixStatus st = zix_hash_insert(hash, strings[i]); if (st != ZIX_STATUS_EXISTS) { - return test_fail("Double inserted `%s'\n", strings[i]); + return test_fail( + "Double inserted `%s' (%s)\n", strings[i], zix_strerror(st)); } } // Search for each string - for (size_t i = 0; i < n_strings; ++i) { - const char* const* match = - (const char* const*)zix_hash_find(hash, &strings[i]); + for (size_t i = 0; i < n_elems; ++i) { + const char* match = (const char*)zix_hash_find_record(hash, strings[i]); if (!match) { return test_fail("Failed to find `%s'\n", strings[i]); } - if (*match != strings[i]) { - return test_fail("Bad match for `%s': `%s'\n", strings[i], *match); + if (match != strings[i]) { + return test_fail("Bad match for `%s': `%s'\n", strings[i], match); } } - // Try some false matches - const char* not_indexed[] = {"ftp://example.org/not-there-at-all", - "http://example.org/foobar", - "http://", - "http://otherdomain.com"}; - const size_t n_not_indexed = sizeof(not_indexed) / sizeof(char*); - for (size_t i = 0; i < n_not_indexed; ++i) { - const char* const* match = - (const char* const*)zix_hash_find(hash, ¬_indexed[i]); + static const char* const not_indexed_string = "__not__indexed__"; + + // Do a quick smoke test to ensure that false matches don't succeed + char* const not_indexed = (char*)calloc(1, strlen(not_indexed_string) + 1); + if (not_indexed) { + memcpy(not_indexed, not_indexed_string, strlen(not_indexed_string) + 1); + const char* match = (const char*)zix_hash_find_record(hash, not_indexed); if (match) { - return test_fail("Unexpectedly found `%s'\n", not_indexed[i]); + return test_fail("Unexpectedly found `%s'\n", not_indexed); } + free(not_indexed); } // Remove strings - for (size_t i = 0; i < n_strings; ++i) { + for (size_t i = 0; i < n_elems; ++i) { const size_t initial_size = zix_hash_size(hash); // Remove string - ZixStatus st = zix_hash_remove(hash, &strings[i]); + const char* removed = NULL; + ZixStatus st = zix_hash_remove(hash, strings[i], &removed); if (st) { return test_fail("Failed to remove `%s'\n", strings[i]); } + // Ensure the removed value is what we expected + if (removed != strings[i]) { + return test_fail("Removed `%s` instead of `%s`\n", removed, strings[i]); + } + // Ensure size is updated if (zix_hash_size(hash) != initial_size - 1) { return test_fail("Removing node did not decrease hash size\n"); } // Ensure second removal fails - st = zix_hash_remove(hash, &strings[i]); + st = zix_hash_remove(hash, strings[i], &removed); if (st != ZIX_STATUS_NOT_FOUND) { return test_fail("Unexpectedly removed `%s' twice\n", strings[i]); } + // Ensure value can no longer be found + assert(zix_hash_find(hash, strings[i]) == zix_hash_end(hash)); + // Check to ensure remaining strings are still present - for (size_t j = i + 1; j < n_strings; ++j) { - const char* const* match = - (const char* const*)zix_hash_find(hash, &strings[j]); + for (size_t j = i + 1; j < n_elems; ++j) { + const char* match = (const char*)zix_hash_find_record(hash, strings[j]); if (!match) { return test_fail("Failed to find `%s' after remove\n", strings[j]); } - if (*match != strings[j]) { + if (match != strings[j]) { return test_fail("Bad match for `%s' after remove\n", strings[j]); } } } // Insert each string again (to check non-empty destruction) - for (size_t i = 0; i < n_strings; ++i) { - ZixStatus st = zix_hash_insert(hash, &strings[i], NULL); + for (size_t i = 0; i < n_elems; ++i) { + ZixHashInsertPlan plan = zix_hash_plan_insert(hash, strings[i]); + + assert(!zix_hash_record_at(hash, plan)); + ZixStatus st = zix_hash_insert_at(hash, plan, strings[i]); if (st) { return test_fail("Failed to insert `%s'\n", strings[i]); } } // Check key == value (and test zix_hash_foreach) - zix_hash_foreach(hash, check, NULL); - if (n_checked != n_strings) { + size_t n_checked = 0u; + for (ZixHashIter i = zix_hash_begin(hash); i != zix_hash_end(hash); + i = zix_hash_next(hash, i)) { + const char* const string = (const char*)zix_hash_get(hash, i); + assert(string); + if (strlen(string) < 3) { + return test_fail("Corrupt value `%s'\n", string); + } + + ++n_checked; + } + if (n_checked != n_elems) { return test_fail("Check failed\n"); } + free(strings); + free(buffer); zix_hash_free(hash); return 0; } +static int +stress(const size_t n_elems) +{ + if (stress_with(decent_string_hash, n_elems) || + stress_with(terrible_string_hash, n_elems / 4) || + stress_with(string_hash_aligned, n_elems / 4) || + stress_with(string_hash32, n_elems / 4) || + stress_with(string_hash64, n_elems / 4) || + stress_with(string_hash32_aligned, n_elems / 4)) { + return 1; + } + +#if UINTPTR_MAX >= UINT64_MAX + if (stress_with(string_hash64_aligned, n_elems / 4)) { + return 1; + } +#endif + + return 0; +} + +/// Identity hash function for numeric strings for explicitly hitting cases +ZIX_PURE_FUNC static size_t +identity_index_hash(const char* const str) +{ + return strtoul(str, NULL, 10); +} + +static void +test_all_tombstones(void) +{ + /* This tests an edge case where a minimum-sized table can be entirely full + of tombstones. If the search loop is not written carefully, then this can + result in a hang. This has been seen to occur in real code, though it's + very hard to hit with a decent hash function. So, we use the above + degenerate index hashing function to explicitly place elements exactly + where we want to hit this case. */ + +#define N_STRINGS 4 + + static const char* original_strings[N_STRINGS] = { + "0 a", + "1 a", + "2 a", + "3 a", + }; + + static const char* collision_strings[N_STRINGS] = { + "0 b", + "1 b", + "2 b", + "3 b", + }; + + ZixStatus st = ZIX_STATUS_SUCCESS; + ZixHash* hash = zix_hash_new(identity, identity_index_hash, string_equal); + + // Insert each element then immediately remove it + for (unsigned i = 0u; i < N_STRINGS; ++i) { + const char* removed = NULL; + + assert(!zix_hash_insert(hash, original_strings[i])); + assert(!zix_hash_remove(hash, original_strings[i], &removed)); + } + + // Now the table should be "empty" but contain tombstones + assert(zix_hash_size(hash) == 0); + + // Insert clashing elements which should hit the "all tombstones" case + for (unsigned i = 0u; i < N_STRINGS; ++i) { + assert(!zix_hash_insert(hash, collision_strings[i])); + assert(!st); + } + + zix_hash_free(hash); + +#undef N_STRINGS +} + int main(void) { zix_hash_free(NULL); + test_all_tombstones(); + + static const size_t n_elems = 1024u; - if (stress()) { + if (stress(n_elems)) { return 1; } @@ -224,7 +378,7 @@ main(void) expect_failure = true; for (size_t i = 0; i < total_n_allocs; ++i) { test_malloc_reset(i); - stress(); + stress(n_elems); } test_malloc_reset((size_t)-1); -- cgit v1.2.1