From cd4cf53953cb7b2d24fe1cd6f23870eff7cc3e1d Mon Sep 17 00:00:00 2001 From: David Robillard Date: Mon, 11 Jul 2016 18:33:04 +0000 Subject: Add alternative dictionary structures git-svn-id: http://svn.drobilla.net/zix/trunk@109 df6676b4-ccc9-40e5-b5d6-7c4628a128e3 --- zix/ampatree.c | 320 +++++++++++++++++++++++++++++++++++++++++++++++++++++ zix/ampatree.h | 66 +++++++++++ zix/bitset.c | 108 ++++++++++++++++++ zix/bitset.h | 97 ++++++++++++++++ zix/trie.c | 341 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ zix/trie.h | 66 +++++++++++ 6 files changed, 998 insertions(+) create mode 100644 zix/ampatree.c create mode 100644 zix/ampatree.h create mode 100644 zix/bitset.c create mode 100644 zix/bitset.h create mode 100644 zix/trie.c create mode 100644 zix/trie.h (limited to 'zix') diff --git a/zix/ampatree.c b/zix/ampatree.c new file mode 100644 index 0000000..7fec6cd --- /dev/null +++ b/zix/ampatree.c @@ -0,0 +1,320 @@ +/* + Copyright 2011-2016 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#define _XOPEN_SOURCE 500 + +#include +#include +#include +#include +#include + +#include "zix/ampatree.h" +#include "zix/bitset.h" +#include "zix/common.h" +#include "zix/trie_util.h" + +#define ZIX_AMPATREE_BITS 256 +#define ZIX_AMPATREE_ELEMS (ZIX_BITSET_ELEMS(ZIX_AMPATREE_BITS)) + +typedef size_t n_edges_t; + +typedef struct ZixAMPatreeNode { + const uint8_t* label_first; /* Start of incoming label */ + const uint8_t* label_last; /* End of incoming label */ + const uint8_t* str; /* String stored at this node */ + n_edges_t n_children; /* Number of outgoing edges */ + + ZixBitsetTally tally[ZIX_AMPATREE_ELEMS]; + ZixBitset bits[ZIX_AMPATREE_ELEMS]; + + struct ZixAMPatreeNode* children[]; +} ZixAMPatreeNode; + +struct _ZixAMPatree { + ZixAMPatreeNode* root; /* Root of the tree */ +}; + +ZIX_PRIVATE ZixAMPatreeNode** +zix_ampatree_get_child_ptr(ZixAMPatreeNode* node, n_edges_t index) +{ + return node->children + index; +} + +ZIX_PRIVATE void +zix_ampatree_print_rec(ZixAMPatreeNode* node, FILE* fd) +{ + if (node->label_first) { + size_t edge_label_len = node->label_last - node->label_first + 1; + char* edge_label = (char*)calloc(1, edge_label_len + 1); + strncpy(edge_label, node->label_first, edge_label_len); + fprintf(fd, "\t\"%p\" [label=<" + "" + "", (void*)node, edge_label); + free(edge_label); + if (node->str) { + fprintf(fd, "", node->str); + } + fprintf(fd, "
%s
\"%s\"
>,shape=\"plaintext\"] ;\n"); + } else { + fprintf(fd, "\t\"%p\" [label=\"\"] ;\n", (void*)node); + } + + for (n_edges_t i = 0; i < node->n_children; ++i) { + ZixAMPatreeNode* const child = *zix_ampatree_get_child_ptr(node, i); + zix_ampatree_print_rec(child, fd); + fprintf(fd, "\t\"%p\" -> \"%p\" ;\n", (void*)node, (void*)child); + } +} + +ZIX_API +void +zix_ampatree_print_dot(const ZixAMPatree* t, FILE* fd) +{ + fprintf(fd, "digraph Patree { \n"); + zix_ampatree_print_rec(t->root, fd); + fprintf(fd, "}\n"); +} + +ZIX_PRIVATE size_t +zix_ampatree_node_size(n_edges_t n_children) +{ + return sizeof(ZixAMPatreeNode) + (n_children * sizeof(ZixAMPatreeNode*)); +} + +ZIX_PRIVATE ZixAMPatreeNode* +realloc_node(ZixAMPatreeNode* n, int n_children) +{ + return (ZixAMPatreeNode*)realloc(n, zix_ampatree_node_size(n_children)); +} + +ZIX_API ZixAMPatree* +zix_ampatree_new(void) +{ + ZixAMPatree* t = (ZixAMPatree*)calloc(1, sizeof(ZixAMPatree)); + + t->root = calloc(1, sizeof(ZixAMPatreeNode)); + + return t; +} + +ZIX_PRIVATE void +zix_ampatree_free_rec(ZixAMPatreeNode* n) +{ + if (n) { + for (n_edges_t i = 0; i < n->n_children; ++i) { + zix_ampatree_free_rec(*zix_ampatree_get_child_ptr(n, i)); + } + free(n); + } +} + +ZIX_API void +zix_ampatree_free(ZixAMPatree* t) +{ + zix_ampatree_free_rec(t->root); + free(t); +} + +ZIX_PRIVATE inline bool +patree_is_leaf(const ZixAMPatreeNode* n) +{ + return n->n_children == 0; +} + +ZIX_PRIVATE size_t +ampatree_find_edge(const ZixAMPatreeNode* n, const uint8_t c) +{ + return zix_bitset_count_up_to_if(n->bits, n->tally, c); +} + +#ifndef NDEBUG +ZIX_PRIVATE bool +zix_ampatree_node_check(ZixAMPatreeNode* const n) +{ + for (n_edges_t i = 0; i < n->n_children; ++i) { + assert(*zix_ampatree_get_child_ptr(n, i) != NULL); + } + return true; +} +#endif + +ZIX_PRIVATE void +patree_add_edge(ZixAMPatreeNode** n_ptr, + const uint8_t* str, + const uint8_t* first, + const uint8_t* last) +{ + ZixAMPatreeNode* n = *n_ptr; + + assert(last >= first); + + ZixAMPatreeNode* const child = realloc_node(NULL, 0); + zix_bitset_clear(child->bits, child->tally, ZIX_AMPATREE_BITS); + child->label_first = first; + child->label_last = last; + child->str = str; + child->n_children = 0; + + n = realloc_node(n, n->n_children + 1); + ZixAMPatreeNode* m = malloc(zix_ampatree_node_size(n->n_children + 1)); + m->label_first = n->label_first; + m->label_last = n->label_last; + m->str = n->str; + m->n_children = n->n_children + 1; + + memcpy(m->bits, n->bits, ZIX_AMPATREE_ELEMS * sizeof(ZixBitset)); + memcpy(m->tally, n->tally, ZIX_AMPATREE_ELEMS * sizeof(ZixBitsetTally)); + zix_bitset_set(m->bits, m->tally, first[0]); + + const n_edges_t l = ampatree_find_edge(m, first[0]); + memcpy(zix_ampatree_get_child_ptr(m, 0), + zix_ampatree_get_child_ptr(n, 0), + l * sizeof(ZixAMPatreeNode*)); + *zix_ampatree_get_child_ptr(m, l) = child; + memcpy(zix_ampatree_get_child_ptr(m, l + 1), + zix_ampatree_get_child_ptr(n, l), + (n->n_children - l) * sizeof(ZixAMPatreeNode*)); + free(n); + n = m; + + *n_ptr = n; + + assert(zix_ampatree_node_check(n)); + assert(zix_ampatree_node_check(child)); +} + +/* Split an outgoing edge (to a child) at the given index */ +ZIX_PRIVATE void +patree_split_edge(ZixAMPatreeNode** child_ptr, + size_t index) +{ + ZixAMPatreeNode* child = *child_ptr; + + const size_t grandchild_size = zix_ampatree_node_size(child->n_children); + + ZixAMPatreeNode* const grandchild = realloc_node(NULL, child->n_children); + memcpy(grandchild, child, grandchild_size); + grandchild->label_first = child->label_first + index; + + child = realloc_node(child, 1); + child->n_children = 1; + child->label_last = child->label_first + (index - 1); + child->str = NULL; + *zix_ampatree_get_child_ptr(child, 0) = grandchild; + zix_bitset_clear(child->bits, child->tally, ZIX_AMPATREE_BITS); + zix_bitset_set(child->bits, child->tally, grandchild->label_first[0]); + + *child_ptr = child; + + assert(zix_ampatree_node_check(child)); + assert(zix_ampatree_node_check(grandchild)); +} + +ZIX_PRIVATE ZixStatus +patree_insert_internal(ZixAMPatreeNode** n_ptr, + const uint8_t* str, + const uint8_t* first, + const size_t len) +{ + ZixAMPatreeNode* n = *n_ptr; + const n_edges_t child_i = ampatree_find_edge(n, *first); + if (child_i != (n_edges_t)-1) { + ZixAMPatreeNode** child_ptr = zix_ampatree_get_child_ptr(n, child_i); + ZixAMPatreeNode* child = *child_ptr; + const uint8_t* const label_first = child->label_first; + const uint8_t* const label_last = child->label_last; + const size_t label_len = label_last - label_first + 1; + const size_t split_i = zix_trie_change_index(first, label_first, label_len); + + if (first[split_i]) { + if (split_i == label_len) { + return patree_insert_internal(child_ptr, str, first + label_len, len); + } else { + patree_split_edge(child_ptr, split_i); + return patree_insert_internal(child_ptr, str, first + split_i, len); + } + } else if (label_len != split_i) { + patree_split_edge(child_ptr, split_i); + if (first[split_i]) { + patree_add_edge(child_ptr, str, first + split_i, str + len - 1); + } else { + assert(!(*child_ptr)->str); + (*child_ptr)->str = str; + } + return ZIX_STATUS_SUCCESS; + } else if (label_first[split_i] && !child->str) { + child->str = str; + } else { + return ZIX_STATUS_EXISTS; + } + } else { + patree_add_edge(n_ptr, str, first, str + len - 1); + } + + return ZIX_STATUS_SUCCESS; +} + +ZIX_API ZixStatus +zix_ampatree_insert(ZixAMPatree* t, const char* str, size_t len) +{ + assert(strlen(str) == len); + + const uint8_t* ustr = (const uint8_t*)str; + return patree_insert_internal(&t->root, ustr, ustr, len); +} + +ZIX_API ZixStatus +zix_ampatree_find(const ZixAMPatree* t, const char* const str, const char** match) +{ + ZixAMPatreeNode* n = t->root; + const uint8_t* p = str; + n_edges_t child_i; + while ((child_i = ampatree_find_edge(n, p[0])) != (n_edges_t)-1) { + assert(child_i <= n->n_children); + ZixAMPatreeNode* const child = *zix_ampatree_get_child_ptr(n, child_i); + + /* Prefix compare search string and label */ + const uint8_t* const l = child->label_first; + const size_t len = child->label_last - l + 1; + const int change_index = zix_trie_change_index(p, l, len); + + p += change_index; + + if (!*p) { + /* Reached end of search string */ + if (l + change_index == child->label_last + 1) { + /* Reached end of label string as well, match */ + *match = child->str; + return *match ? ZIX_STATUS_SUCCESS : ZIX_STATUS_NOT_FOUND; + } else { + /* Label string is longer, no match (prefix match) */ + return ZIX_STATUS_NOT_FOUND; + } + } else { + /* Didn't reach end of search string */ + if (patree_is_leaf(n)) { + /* Hit leaf early, no match */ + return ZIX_STATUS_NOT_FOUND; + } else { + /* Match at this node, continue search downwards (or match) */ + n = child; + } + } + } + + return ZIX_STATUS_NOT_FOUND; +} diff --git a/zix/ampatree.h b/zix/ampatree.h new file mode 100644 index 0000000..33e8acf --- /dev/null +++ b/zix/ampatree.h @@ -0,0 +1,66 @@ +/* + Copyright 2011-2016 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef ZIX_AMPATREE_H +#define ZIX_AMPATREE_H + +#include "zix/common.h" + +/** + @addtogroup zix + @{ + @name AMPatree + @{ +*/ + +typedef struct _ZixAMPatree ZixAMPatree; + +/** + Construct a new Patree. +*/ +ZIX_API ZixAMPatree* +zix_ampatree_new(void); + +/** + Destroy `t`. +*/ +ZIX_API void +zix_ampatree_free(ZixAMPatree* t); + +/** + Print a DOT description of `t` to `fd`. +*/ +ZIX_API void +zix_ampatree_print_dot(const ZixAMPatree* t, FILE* fd); + +/** + Insert `str` into `t`. +*/ +ZIX_API ZixStatus +zix_ampatree_insert(ZixAMPatree* t, const char* str, size_t len); + +/** + Search for `str` in `t`. +*/ +ZIX_API ZixStatus +zix_ampatree_find(const ZixAMPatree* t, const char* str, const char** match); + +/** + @} + @} +*/ + +#endif /* ZIX_AMPATREE_H */ diff --git a/zix/bitset.c b/zix/bitset.c new file mode 100644 index 0000000..155fc82 --- /dev/null +++ b/zix/bitset.c @@ -0,0 +1,108 @@ +/* + Copyright 2014-2016 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include + +#include + +#include "zix/bitset.h" + +/** Number of bits per ZixBitset element (ala CHAR_BIT). */ +#define ZIX_BITSET_ELEM_BIT (CHAR_BIT * sizeof(ZixBitset)) + +ZIX_API void +zix_bitset_clear(ZixBitset* b, ZixBitsetTally* t, size_t n_bits) +{ + memset(b, 0, ZIX_BITSET_ELEMS(n_bits) * sizeof(ZixBitset)); + memset(t, 0, ZIX_BITSET_ELEMS(n_bits) * sizeof(ZixBitsetTally)); +} + +ZIX_API void +zix_bitset_set(ZixBitset* b, ZixBitsetTally* t, size_t i) +{ + const size_t e = i / ZIX_BITSET_ELEM_BIT; + const size_t ebit = i & (ZIX_BITSET_ELEM_BIT - 1); // i % ELEM_BIT + const ZixBitset mask = (ZixBitset)1 << ebit; + + if (!(b[e] & mask)) { + ++t[e]; + } + + b[e] |= mask; +} + +ZIX_API void +zix_bitset_reset(ZixBitset* b, ZixBitsetTally* t, size_t i) +{ + const size_t e = i / ZIX_BITSET_ELEM_BIT; + const size_t ebit = i & (ZIX_BITSET_ELEM_BIT - 1); // i % ELEM_BIT + const ZixBitset mask = (ZixBitset)1 << ebit; + + if (b[e] & mask) { + --t[e]; + } + + b[e] &= ~mask; +} + +ZIX_API bool +zix_bitset_get(const ZixBitset* b, size_t i) +{ + const size_t e = i / ZIX_BITSET_ELEM_BIT; + const size_t ebit = i & (ZIX_BITSET_ELEM_BIT - 1); // i % ELEM_BIT + const ZixBitset mask = (ZixBitset)1 << ebit; + + return b[e] & mask; +} + +ZIX_API size_t +zix_bitset_count_up_to(const ZixBitset* b, const ZixBitsetTally* t, size_t i) +{ + const size_t full_elems = i / ZIX_BITSET_ELEM_BIT; + const size_t extra = i & (ZIX_BITSET_ELEM_BIT - 1); // i % ELEM_BIT + + size_t count = 0; + for (size_t e = 0; e < full_elems; ++e) { + count += t[e]; + } + + const ZixBitset mask = ~(~(ZixBitset)0 << extra); + count += __builtin_popcountl(b[full_elems] & mask); + + return count; +} + +ZIX_API size_t +zix_bitset_count_up_to_if(const ZixBitset* b, const ZixBitsetTally* t, size_t i) +{ + const size_t full_elems = i / ZIX_BITSET_ELEM_BIT; + const size_t extra = i & (ZIX_BITSET_ELEM_BIT - 1); // i % ELEM_BIT + + const ZixBitset get_mask = (ZixBitset)1 << extra; + if (!(b[full_elems] & get_mask)) { + return (size_t)-1; + } + + size_t count = 0; + for (size_t e = 0; e < full_elems; ++e) { + count += t[e]; + } + + const ZixBitset mask = ~(~(ZixBitset)0 << extra); + count += __builtin_popcountl(b[full_elems] & mask); + + return count; +} diff --git a/zix/bitset.h b/zix/bitset.h new file mode 100644 index 0000000..eadf1d5 --- /dev/null +++ b/zix/bitset.h @@ -0,0 +1,97 @@ +/* + Copyright 2014-2016 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef ZIX_BITSET_H +#define ZIX_BITSET_H + +#include +#include +#include + +#include "zix/common.h" + +/** + @addtogroup zix + @{ + @name Bitset + @{ +*/ + +/** + A bitset (always referred to by pointer, actually an array). +*/ +typedef unsigned long ZixBitset; + +/** + Tally of the number of bits in one ZixBitset element. +*/ +typedef uint8_t ZixBitsetTally; + +/** + The number of bits per ZixBitset array element. +*/ +#define ZIX_BITSET_BITS_PER_ELEM (CHAR_BIT * sizeof(ZixBitset)) + +/** + The number of bitset elements needed for the given number of bits. +*/ +#define ZIX_BITSET_ELEMS(n_bits) \ + ((n_bits / ZIX_BITSET_BITS_PER_ELEM) + \ + (n_bits % ZIX_BITSET_BITS_PER_ELEM ? 1 : 0)) + +/** + Clear a Bitset. +*/ +ZIX_API void +zix_bitset_clear(ZixBitset* b, ZixBitsetTally* t, size_t n_bits); + +/** + Set bit `i` in `t` to 1. +*/ +ZIX_API void +zix_bitset_set(ZixBitset* b, ZixBitsetTally* t, size_t i); + +/** + Clear bit `i` in `t` (set to 0). +*/ +ZIX_API void +zix_bitset_reset(ZixBitset* b, ZixBitsetTally* t, size_t i); + +/** + Return the `i`th bit in `t`. +*/ +ZIX_API bool +zix_bitset_get(const ZixBitset* b, size_t i); + +/** + Return the number of set bits in `b` up to bit `i` (non-inclusive). +*/ +ZIX_API size_t +zix_bitset_count_up_to(const ZixBitset* b, const ZixBitsetTally* t, size_t i); + +/** + Return the number of set bits in `b` up to bit `i` (non-inclusive) if bit + `i` is set, or (size_t)-1 otherwise. +*/ +ZIX_API size_t +zix_bitset_count_up_to_if(const ZixBitset* b, const ZixBitsetTally* t, size_t i); + +/** + @} + @} +*/ + +#endif /* ZIX_BITSET_H */ diff --git a/zix/trie.c b/zix/trie.c new file mode 100644 index 0000000..71fe281 --- /dev/null +++ b/zix/trie.c @@ -0,0 +1,341 @@ +/* + Copyright 2011-2016 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#define _XOPEN_SOURCE 500 + +#include +#include +#include +#include +#include + +// #define ZIX_TRIE_LINEAR_SEARCH 1 + +#include "zix/common.h" +#include "zix/trie.h" +#include "zix/trie_util.h" + +//typedef uint_fast8_t n_edges_t; +typedef uintptr_t n_edges_t; + +typedef struct { + const uint8_t* str; /* String stored at this node */ + n_edges_t num_children; /* Number of outgoing edges */ + + /** Array of first child characters, followed by parallel array of pointers + to ZixTrieNode pointers. */ + uint8_t children[]; +} ZixTrieNode; + +struct _ZixTrie { + ZixTrieNode* root; /* Root of the tree */ +}; + +/** Round `size` up to the next multiple of the word size. */ +ZIX_PRIVATE uint32_t +zix_trie_pad(uint32_t size) +{ + return (size + sizeof(void*) - 1) & (~(sizeof(void*) - 1)); +} + +ZIX_PRIVATE ZixTrieNode** +zix_trie_get_children(ZixTrieNode* node) +{ + return (ZixTrieNode**)(node->children + + zix_trie_pad(node->num_children)); +} + +ZIX_PRIVATE ZixTrieNode** +zix_trie_get_child_ptr(ZixTrieNode* node, n_edges_t index) +{ + return zix_trie_get_children(node) + index; +} + +ZIX_PRIVATE void +zix_trie_print_rec(ZixTrieNode* node, FILE* fd) +{ + if (node->str) { + fprintf(fd, "\t\"%p\" [label=\"%s\"] ;\n", (void*)node, node->str); + } else { + fprintf(fd, "\t\"%p\" [label=\"\"] ;\n", (void*)node); + } + + for (n_edges_t i = 0; i < node->num_children; ++i) { + ZixTrieNode* const child = *zix_trie_get_child_ptr(node, i); + zix_trie_print_rec(child, fd); + fprintf(fd, "\t\"%p\" -> \"%p\" [ label = \"%c\" ];\n", + (void*)node, (void*)child, node->children[i]); + } +} + +ZIX_API +void +zix_trie_print_dot(const ZixTrie* t, FILE* fd) +{ + fprintf(fd, "digraph Trie { \n"); + zix_trie_print_rec(t->root, fd); + fprintf(fd, "}\n"); +} + +ZIX_PRIVATE size_t +zix_trie_node_size(n_edges_t num_children) +{ + return (sizeof(ZixTrieNode) + + zix_trie_pad(num_children) + + (num_children * sizeof(ZixTrieNode*))); +} + +ZIX_PRIVATE ZixTrieNode* +realloc_node(ZixTrieNode* n, int num_children) +{ + return (ZixTrieNode*)realloc(n, zix_trie_node_size(num_children)); +} + +ZIX_API ZixTrie* +zix_trie_new(void) +{ + ZixTrie* t = (ZixTrie*)calloc(1, sizeof(ZixTrie)); + + t->root = calloc(1, sizeof(ZixTrieNode)); + + return t; +} + +ZIX_PRIVATE void +zix_trie_free_rec(ZixTrieNode* n) +{ + if (n) { + for (n_edges_t i = 0; i < n->num_children; ++i) { + zix_trie_free_rec(*zix_trie_get_child_ptr(n, i)); + } + free(n); + } +} + +ZIX_API void +zix_trie_free(ZixTrie* t) +{ + zix_trie_free_rec(t->root); + free(t); +} + +ZIX_PRIVATE inline bool +trie_is_leaf(const ZixTrieNode* n) +{ + return n->num_children == 0; +} + +ZIX_PRIVATE bool +trie_find_edge(const ZixTrieNode* n, const uint8_t c, n_edges_t* const index) +{ + *index = zix_trie_find_key(n->children, n->num_children, c); + return *index < n->num_children && n->children[*index] == c; +} + +#ifndef NDEBUG +ZIX_PRIVATE bool +zix_trie_node_check(ZixTrieNode* const n) +{ + for (n_edges_t i = 0; i < n->num_children; ++i) { + assert(n->children[i] != '\0'); + assert(*zix_trie_get_child_ptr(n, i) != NULL); + } + return true; +} +#endif + +ZIX_PRIVATE size_t +zix_trie_add_child(ZixTrieNode** n_ptr, + ZixTrieNode* child, + const uint8_t first) +{ + ZixTrieNode* n = *n_ptr; + + n = realloc_node(n, n->num_children + 1); +#ifdef ZIX_TRIE_LINEAR_SEARCH + const n_edges_t l = n->num_children; + memmove(n->children + zix_trie_pad(n->num_children + 1), + n->children + zix_trie_pad(n->num_children), + n->num_children * sizeof(ZixTrieNode*)); + ++n->num_children; + n->children[n->num_children - 1] = first; + *zix_trie_get_child_ptr(n, n->num_children - 1) = child; +#else + const n_edges_t l = zix_trie_find_key(n->children, n->num_children, first); + ZixTrieNode* m = malloc(zix_trie_node_size(n->num_children + 1)); + m->str = n->str; + m->num_children = n->num_children + 1; + + memcpy(m->children, n->children, l); + m->children[l] = first; + memcpy(m->children + l + 1, n->children + l, n->num_children - l); + + memcpy(zix_trie_get_child_ptr(m, 0), + zix_trie_get_child_ptr(n, 0), + l * sizeof(ZixTrieNode*)); + *zix_trie_get_child_ptr(m, l) = child; + memcpy(zix_trie_get_child_ptr(m, l + 1), + zix_trie_get_child_ptr(n, l), + (n->num_children - l) * sizeof(ZixTrieNode*)); + free(n); + n = m; +#endif + + *n_ptr = n; + + return l; + + assert(zix_trie_node_check(n)); + //assert(zix_trie_node_check(child)); +} + +ZIX_PRIVATE ZixTrieNode** +trie_insert_tail(ZixTrieNode** n_ptr, + const uint8_t* str, + const uint8_t* first, + const size_t len) +{ + assert(first[0]); + ZixTrieNode* c = NULL; + while (first[0]) { + assert(zix_trie_node_check(*n_ptr)); + + c = realloc_node(NULL, 1); + c->str = NULL; + c->num_children = 0; + + const size_t l = zix_trie_add_child(n_ptr, c, first[0]); + n_ptr = zix_trie_get_child_ptr(*n_ptr, l); + assert(*n_ptr == c); + + ++first; + } + c->num_children = 0; + c->str = str; + assert(zix_trie_node_check(c)); + assert(*n_ptr == c); + return n_ptr; +} + +ZIX_PRIVATE ZixStatus +trie_insert_internal(ZixTrieNode** n_ptr, + const uint8_t* str, + const uint8_t* first, + const size_t len) +{ + assert(zix_trie_node_check(*n_ptr)); + ZixTrieNode* n = *n_ptr; + n_edges_t child_i; + while (trie_find_edge(n, first[0], &child_i)) { + assert(child_i <= n->num_children); + ZixTrieNode** const child_ptr = zix_trie_get_child_ptr(n, child_i); + ZixTrieNode* const child = *child_ptr; + assert(child_ptr); + + assert(zix_trie_node_check(child)); + + if (!first[0]) { + /* Reached end of search string */ + if (child->str) { + /* Found exact string in trie. */ + return ZIX_STATUS_EXISTS; + } else { + /* Set node value to inserted string */ + child->str = str; + return ZIX_STATUS_SUCCESS; + } + } else { + /* Didn't reach end of search string */ + if (trie_is_leaf(n)) { + /* Hit leaf early, insert underneath */ + ZixTrieNode** c = trie_insert_tail(n_ptr, str, first, len); + assert(zix_trie_node_check(*n_ptr)); + assert(zix_trie_node_check(*c)); + return ZIX_STATUS_SUCCESS; + } else { + /* Match at this node, continue search downwards (or match) */ + n_ptr = child_ptr; + assert(*n_ptr); + n = *n_ptr; + assert(n == child); + ++first; + } + } + } + + if (!first[0]) { + if ((*n_ptr)->str) { + return ZIX_STATUS_EXISTS; + } else { + (*n_ptr)->str = str; + return ZIX_STATUS_SUCCESS; + } + } + + /* Hit leaf early, insert underneath */ + assert(zix_trie_node_check(*n_ptr)); + ZixTrieNode** c = trie_insert_tail(n_ptr, str, first, len); + assert(zix_trie_node_check(*n_ptr)); + assert(zix_trie_node_check(*c)); + return ZIX_STATUS_SUCCESS; +} + +ZIX_API ZixStatus +zix_trie_insert(ZixTrie* t, const char* str, size_t len) +{ + assert(strlen(str) == len); + + const uint8_t* ustr = (const uint8_t*)str; + return trie_insert_internal(&t->root, ustr, ustr, len); +} + +ZIX_API ZixStatus +zix_trie_find(const ZixTrie* t, const char* const str, const char** match) +{ + ZixTrieNode* n = t->root; + const char* p = str; + n_edges_t child_i; + + while (trie_find_edge(n, p[0], &child_i)) { + assert(child_i <= n->num_children); + ZixTrieNode* const child = *zix_trie_get_child_ptr(n, child_i); + + ++p; + + if (!*p) { + /* Reached end of search string */ + if (child->str) { + /* Found exact string in trie. */ + *match = (const char*)child->str; + return ZIX_STATUS_SUCCESS; + } else { + /* Prefix match, but exact string not present. */ + return ZIX_STATUS_NOT_FOUND; + } + } else { + /* Didn't reach end of search string */ + if (trie_is_leaf(n)) { + /* Hit leaf early, no match */ + return ZIX_STATUS_NOT_FOUND; + } else { + /* Match at this node, continue search downwards (or match) */ + n = child; + } + } + } + + return ZIX_STATUS_NOT_FOUND; +} diff --git a/zix/trie.h b/zix/trie.h new file mode 100644 index 0000000..9843460 --- /dev/null +++ b/zix/trie.h @@ -0,0 +1,66 @@ +/* + Copyright 2011-2016 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef ZIX_TRIE_H +#define ZIX_TRIE_H + +#include "zix/common.h" + +/** + @addtogroup zix + @{ + @name Trie + @{ +*/ + +typedef struct _ZixTrie ZixTrie; + +/** + Construct a new Trie. +*/ +ZIX_API ZixTrie* +zix_trie_new(void); + +/** + Destroy `t`. +*/ +ZIX_API void +zix_trie_free(ZixTrie* t); + +/** + Print a DOT description of `t` to `fd`. +*/ +ZIX_API void +zix_trie_print_dot(const ZixTrie* t, FILE* fd); + +/** + Insert `str` into `t`. +*/ +ZIX_API ZixStatus +zix_trie_insert(ZixTrie* t, const char* str, size_t len); + +/** + Search for `str` in `t`. +*/ +ZIX_API ZixStatus +zix_trie_find(const ZixTrie* t, const char* str, const char** match); + +/** + @} + @} +*/ + +#endif /* ZIX_TRIE_H */ -- cgit v1.2.1