From f8ce6cd6fbc197db2a62beb51a7c8072f9a8722d Mon Sep 17 00:00:00 2001 From: David Robillard Date: Mon, 19 Sep 2011 06:14:27 +0000 Subject: Add ZixFatPatree git-svn-id: http://svn.drobilla.net/zix/trunk@24 df6676b4-ccc9-40e5-b5d6-7c4628a128e3 --- src/fat_patree.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++ test/patree_bench.c | 54 +++++++++--- wscript | 1 + zix/fat_patree.h | 71 ++++++++++++++++ 4 files changed, 354 insertions(+), 10 deletions(-) create mode 100644 src/fat_patree.c create mode 100644 zix/fat_patree.h diff --git a/src/fat_patree.c b/src/fat_patree.c new file mode 100644 index 0000000..278da2a --- /dev/null +++ b/src/fat_patree.c @@ -0,0 +1,238 @@ +/* + Copyright 2011 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#define _XOPEN_SOURCE 500 + +#include +#include +#include +#include +#include +#include + +#include "zix/common.h" +#include "zix/fat_patree.h" + +#define N_CHILDREN 256 + +typedef uint_fast8_t n_edges_t; + +typedef struct _ZixFatPatreeNode { + char* label_first; /* Start of incoming label */ + char* label_last; /* End of incoming label */ + char* str; /* String stored at this node */ + struct _ZixFatPatreeNode* children[N_CHILDREN]; /* Children of this node */ +} ZixFatPatreeNode; + +struct _ZixFatPatree { + ZixFatPatreeNode* root; /* Root of the tree */ +}; + +ZIX_API +ZixFatPatree* +zix_fat_patree_new(void) +{ + ZixFatPatree* t = (ZixFatPatree*)malloc(sizeof(ZixFatPatree)); + memset(t, '\0', sizeof(struct _ZixFatPatree)); + + t->root = (ZixFatPatreeNode*)malloc(sizeof(ZixFatPatreeNode)); + memset(t->root, '\0', sizeof(ZixFatPatreeNode)); + + return t; +} + +static void +zix_fat_patree_free_rec(ZixFatPatreeNode* n) +{ + if (n) { + for (int i = 0; i < N_CHILDREN; ++i) { + zix_fat_patree_free_rec(n->children[i]); + } + } +} + +ZIX_API +void +zix_fat_patree_free(ZixFatPatree* t) +{ + zix_fat_patree_free_rec(t->root); + free(t->root); + free(t); +} + +static inline bool +patree_find_edge(ZixFatPatreeNode* n, uint8_t c, n_edges_t* index) +{ + if (n->children[c]) { + *index = c; + return true; + } + return false; +} + +static void +patree_add_edge(ZixFatPatreeNode* n, + char* str, + char* first, + char* last) +{ + assert(last >= first); + const int index = (uint8_t)first[0]; + assert(!n->children[index]); + + ZixFatPatreeNode* new_node = malloc(sizeof(ZixFatPatreeNode)); + n->children[index] = new_node; + n->children[index]->label_first = first; + n->children[index]->label_last = last; + n->children[index]->str = str; + for (int i = 0; i < N_CHILDREN; ++i) { + n->children[index]->children[i] = NULL; + } +} + +/* Split an outgoing edge (to a child) at the given index */ +static void +patree_split_edge(ZixFatPatreeNode* child, + size_t index) +{ + char* const first = child->label_first + index; + char* const last = child->label_last; + assert(last >= first); + + ZixFatPatreeNode* new_node = malloc(sizeof(ZixFatPatreeNode)); + new_node->label_first = first; + new_node->label_last = last; + new_node->str = child->str; + memcpy(new_node->children, child->children, + sizeof(ZixFatPatreeNode*) * N_CHILDREN); + + child->label_last = child->label_first + (index - 1); // chop end + child->str = NULL; + + for (int i = 0; i < N_CHILDREN; ++i) { + child->children[i] = NULL; + } + const int new_node_index = (int)first[0]; + assert(new_node_index < N_CHILDREN); + assert(!child->children[new_node_index]); + child->children[new_node_index] = new_node; +} + +static ZixStatus +patree_insert_internal(ZixFatPatreeNode* n, char* str, char* first, char* last) +{ + n_edges_t child_i; + assert(first <= last); + + if (patree_find_edge(n, *first, &child_i)) { + ZixFatPatreeNode* child = n->children[child_i]; + char* const label_first = child->label_first; + char* const label_last = child->label_last; + size_t split_i = 0; + const size_t label_len = label_last - label_first + 1; + const size_t s_len = last - first + 1; + const size_t max_len = (s_len < label_len) ? s_len : label_len; + + while (split_i < max_len && first[split_i] == label_first[split_i]) { + ++split_i; + } + child = n->children[child_i]; + + if (label_len < s_len) { + if (split_i == label_len) { + return patree_insert_internal(child, str, first + label_len, last); + } else { + patree_split_edge(child, split_i); + return patree_insert_internal(child, str, first + split_i, last); + } + } else if (label_len != split_i) { + patree_split_edge(child, split_i); + if (split_i != s_len) { + patree_add_edge(child, str, first + split_i, last); + } else { + assert(!child->str); + child->str = str; + } + return ZIX_STATUS_SUCCESS; + } else if (label_len == s_len && !child->str) { + child->str = str; + } else { + return ZIX_STATUS_EXISTS; + } + + } else { + patree_add_edge(n, str, first, last); + } + + return ZIX_STATUS_SUCCESS; +} + +ZIX_API +ZixStatus +zix_fat_patree_insert(ZixFatPatree* t, const char* str) +{ + const size_t len = strlen(str); + // FIXME: awful casts + return patree_insert_internal(t->root, (char*)str, (char*)str, (char*)str + len - 1); +} + +ZIX_API +ZixStatus +zix_fat_patree_find(ZixFatPatree* t, const char* p, char** match) +{ + ZixFatPatreeNode* n = t->root; + n_edges_t child_i; + + *match = NULL; + + while (*p != '\0') { + if (patree_find_edge(n, p[0], &child_i)) { + ZixFatPatreeNode* const child = n->children[child_i]; + const char* const label_first = child->label_first; + const char* const label_last = child->label_last; + + /* Prefix compare search string and label */ + const char* l = label_first; + while (*p != '\0' && l <= label_last) { + if (*l++ != *p++) { + return ZIX_STATUS_NOT_FOUND; + } + } + + if (!*p) { + /* Reached end of search string */ + if (l == label_last + 1) { + /* Reached end of label string as well, match */ + *match = child->str; + return *match ? ZIX_STATUS_SUCCESS : ZIX_STATUS_NOT_FOUND; + } else { + /* Label string is longer, no match (prefix match) */ + return ZIX_STATUS_NOT_FOUND; + } + } else { + /* Match at this node, continue search downwards. + Possibly we have prematurely hit a leaf, so the next + edge search will fail. + */ + n = child; + } + } else { + return ZIX_STATUS_NOT_FOUND; + } + } + + return ZIX_STATUS_NOT_FOUND; +} diff --git a/test/patree_bench.c b/test/patree_bench.c index 559110a..aca4d45 100644 --- a/test/patree_bench.c +++ b/test/patree_bench.c @@ -24,6 +24,7 @@ #include #include "zix/patree.h" +#include "zix/fat_patree.h" static int test_fail(const char* fmt, ...) @@ -49,6 +50,8 @@ main(int argc, char** argv) return test_fail("Failed to open file %s\n", file); } + size_t max_n_strings = 500000; + /* Read input strings */ char** strings = NULL; size_t n_strings = 0; @@ -64,7 +67,9 @@ main(int argc, char** argv) strings[n_strings] = malloc(buf_len); memcpy(strings[n_strings], buf, buf_len); this_str_len = 0; - ++n_strings; + if (++n_strings == max_n_strings) { + break; + } } else { ++this_str_len; if (buf_len < this_str_len + 1) { @@ -76,14 +81,18 @@ main(int argc, char** argv) } } + fclose(fd); + FILE* insert_dat = fopen("insert.dat", "w"); FILE* search_dat = fopen("search.dat", "w"); - fprintf(insert_dat, "# n\tGHashTable\tZixPatree\n"); - fprintf(search_dat, "# n\tGHashTable\tZixPatree\n"); + fprintf(insert_dat, "# n\tGHashTable\tZixPatree\tZixFatPatree\n"); + fprintf(search_dat, "# n\tGHashTable\tZixPatree\tZixFatPatree\n"); for (size_t n = 1; n <= n_strings; n *= 2) { - ZixPatree* patree = zix_patree_new(); - GHashTable* hash = g_hash_table_new(g_str_hash, g_str_equal); + printf("Benchmarking n = %zu\n", n); + ZixPatree* patree = zix_patree_new(); + ZixFatPatree* fat_patree = zix_fat_patree_new(); + GHashTable* hash = g_hash_table_new(g_str_hash, g_str_equal); fprintf(insert_dat, "%zu", n); fprintf(search_dat, "%zu", n); @@ -99,7 +108,18 @@ main(int argc, char** argv) // ZixPatree insert_start = bench_start(); for (size_t i = 0; i < n; ++i) { - if (zix_patree_insert(patree, strings[i])) { + ZixStatus st = zix_patree_insert(patree, strings[i]); + if (st && st != ZIX_STATUS_EXISTS) { + return test_fail("Failed to insert `%s'\n", strings[i]); + } + } + fprintf(insert_dat, "\t%lf", bench_end(&insert_start)); + + // ZixFatPatree + insert_start = bench_start(); + for (size_t i = 0; i < n; ++i) { + ZixStatus st = zix_fat_patree_insert(fat_patree, strings[i]); + if (st && st != ZIX_STATUS_EXISTS) { return test_fail("Failed to insert `%s'\n", strings[i]); } } @@ -111,7 +131,7 @@ main(int argc, char** argv) struct timespec search_start = bench_start(); for (size_t i = 0; i < n; ++i) { char* match = g_hash_table_lookup(hash, strings[i]); - if (match != strings[i]) { + if (strcmp(match, strings[i])) { return test_fail("Bad match for `%s'\n", strings[i]); } } @@ -122,15 +142,29 @@ main(int argc, char** argv) for (size_t i = 0; i < n; ++i) { char* match = NULL; if (zix_patree_find(patree, strings[i], &match)) { - return test_fail("Failed to find `%s'\n", strings[i]); + return test_fail("Patree: Failed to find `%s'\n", strings[i]); } - if (match != strings[i]) { - return test_fail("Bad match for `%s'\n", strings[i]); + if (strcmp(match, strings[i])) { + return test_fail("Patree: Bad match for `%s'\n", strings[i]); + } + } + fprintf(search_dat, "\t%lf", bench_end(&search_start)); + + // ZixFatPatree + search_start = bench_start(); + for (size_t i = 0; i < n; ++i) { + char* match = NULL; + if (zix_fat_patree_find(fat_patree, strings[i], &match)) { + return test_fail("FatPatree: Failed to find `%s'\n", strings[i]); + } + if (strcmp(match, strings[i])) { + return test_fail("FatPatree: Bad match for `%s'\n", strings[i]); } } fprintf(search_dat, "\t%lf\n", bench_end(&search_start)); zix_patree_free(patree); + zix_fat_patree_free(fat_patree); g_hash_table_unref(hash); } diff --git a/wscript b/wscript index d2d0d85..bde4b68 100644 --- a/wscript +++ b/wscript @@ -78,6 +78,7 @@ def build(bld): autowaf.build_pc(bld, 'ZIX', ZIX_VERSION, []) lib_source = ''' + src/fat_patree.c src/patree.c src/ring.c src/sorted_array.c diff --git a/zix/fat_patree.h b/zix/fat_patree.h new file mode 100644 index 0000000..057ac51 --- /dev/null +++ b/zix/fat_patree.h @@ -0,0 +1,71 @@ +/* + Copyright 2011 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef ZIX_FAT_PATREE_H +#define ZIX_FAT_PATREE_H + +#include "zix/common.h" + +/** + @addtogroup zix + @{ + @name Patree + @{ +*/ + +typedef struct _ZixFatPatree ZixFatPatree; + +/** + Construct a new Patree. +*/ +ZIX_API +ZixFatPatree* +zix_fat_patree_new(void); + +/** + Destroy @a t. +*/ +ZIX_API +void +zix_fat_patree_free(ZixFatPatree* t); + +/** + Print a DOT description of @a t to @a fd. +*/ +ZIX_API +void +zix_fat_patree_print_dot(const ZixFatPatree* t, FILE* fd); + +/** + Insert @a str into @a t. +*/ +ZIX_API +ZixStatus +zix_fat_patree_insert(ZixFatPatree* t, const char* str); + +/** + Search for @a str in @a t. +*/ +ZIX_API +ZixStatus +zix_fat_patree_find(ZixFatPatree* t, const char* str, char** match); + +/** + @} + @} +*/ + +#endif /* ZIX_FAT_PATREE_H */ -- cgit v1.2.1