From 0c67a1bd0cd5f40fe93584a2e3713cb8963d6835 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Thu, 27 Jan 2011 19:29:55 +0000 Subject: Initial check-in (branched from ). git-svn-id: http://svn.drobilla.net/sord/trunk@2 3d64ff67-21c5-427c-a301-fe4f08042e5a --- src/sord.c | 1142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/sord_test.c | 347 +++++++++++++++++ 2 files changed, 1489 insertions(+) create mode 100644 src/sord.c create mode 100644 src/sord_test.c (limited to 'src') diff --git a/src/sord.c b/src/sord.c new file mode 100644 index 0000000..23c1de4 --- /dev/null +++ b/src/sord.c @@ -0,0 +1,1142 @@ +/* Sord, a lightweight RDF model library. + * Copyright 2010-2011 David Robillard + * + * Sord is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Sord is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +/** @file + * Sord Implementation. + * + * Tuples are represented as simple arrays of SordID, of length 4, + * which represent statements (RDF triples) with an optional + * context. When contexts are not used, only the first 3 elements + * are considered. + */ + +// C99 +#include +#include +#include +#include +#include +#include +#include + +// GLib +#include + +#include "sord-config.h" +#include "sord/sord.h" + +#define SORD_LOG(prefix, ...) fprintf(stderr, "[Sord::" prefix "] " __VA_ARGS__) + +#ifdef SORD_DEBUG_ITER + #define SORD_ITER_LOG(...) SORD_LOG("iter", __VA_ARGS__) +#else + #define SORD_ITER_LOG(...) +#endif +#ifdef SORD_DEBUG_SEARCH + #define SORD_FIND_LOG(...) SORD_LOG("search", __VA_ARGS__) +#else + #define SORD_FIND_LOG(...) +#endif +#ifdef SORD_DEBUG_WRITE + #define SORD_WRITE_LOG(...) SORD_LOG("write", __VA_ARGS__) +#else + #define SORD_WRITE_LOG(...) +#endif + +#define NUM_ORDERS 12 +#define STATEMENT_LEN 3 +#define TUP_LEN STATEMENT_LEN + 1 +#define DEFAULT_ORDER SPO +#define DEFAULT_GRAPH_ORDER GSPO + +#define TUP_FMT "(%d %d %d %d)" +#define TUP_FMT_ARGS(t) ((t)[0]), ((t)[1]), ((t)[2]), ((t)[3]) + +#define TUP_S 0 +#define TUP_P 1 +#define TUP_O 2 +#define TUP_G 3 + +/** Triple ordering */ +typedef enum { + SPO, ///< Subject, Predicate, Object + SOP, ///< Subject, Object, Predicate + OPS, ///< Object, Predicate, Subject + OSP, ///< Object, Subject, Predicate + PSO, ///< Predicate, Subject, Object + POS, ///< Predicate, Object, Subject + GSPO, ///< Graph, Subject, Predicate, Object + GSOP, ///< Graph, Subject, Object, Predicate + GOPS, ///< Graph, Object, Predicate, Subject + GOSP, ///< Graph, Object, Subject, Predicate + GPSO, ///< Graph, Predicate, Subject, Object + GPOS, ///< Graph, Predicate, Object, Subject +} SordOrder; + +/** String name of each ordering (array indexed by SordOrder) */ +static const char* const order_names[NUM_ORDERS] = { + "spo", "sop", "ops", "osp", "pso", "pos", + "gspo", "gsop", "gops", "gosp", "gpso", "gpos" +}; + +/** Tuples of indices for each order, from most to least significant + * (array indexed by SordOrder) + */ +static const int orderings[NUM_ORDERS][TUP_LEN] = { + { 0,1,2,3}, { 0,2,1,3}, { 2,1,0,3}, { 2,0,1,3}, { 1,0,2,3}, { 1,2,0,3}, + {3,0,1,2 }, {3,0,2,1 }, {3,2,1,0 }, {3,2,0,1 }, {3,1,0,2 }, {3,1,2,0 } +}; + +/** Store Metadata */ +typedef struct { + SordCount n_tuples; + SordCount n_nodes; +} SordMeta; + +/** Store */ +struct _Sord { + GHashTable* names; ///< URI or blank node identifier string => ID + GHashTable* literals; ///< Literal => ID + + /** Index for each possible triple ordering (may or may not exist). + * If an index for e.g. SPO exists, it is a dictionary with + * (S P O) keys (as a SordTuplrID), and ignored values. + */ + GSequence* indices[NUM_ORDERS]; + + void (*user_data_free)(void*); ///< Destructor for node user data + + SordMeta* meta; +}; + +/** Mode for searching or iteration */ +typedef enum { + ALL, ///< Iterate to end of store, returning all results, no filtering + SINGLE, ///< Iteration over a single element (exact search) + RANGE, ///< Iterate over range with equal prefix + FILTER_RANGE, ///< Iterate over range with equal prefix, filtering + FILTER_ALL ///< Iterate to end of store, filtering +} SearchMode; + +/** Iterator over some range of a store */ +struct _SordIter { + Sord sord; ///< Store this is an iterator for + GSequenceIter* cur; ///< Current DB cursor + SordTuple pat; ///< Iteration pattern (in ordering order) + int ordering[TUP_LEN]; ///< Store ordering + SearchMode mode; ///< Iteration mode + int n_prefix; ///< Length of range prefix (RANGE, FILTER_RANGE) + bool end; ///< True iff reached end + bool skip_graphs; ///< True iff iteration should ignore graphs +}; + +#define PACKED __attribute__((__packed__)) + +/** Node (value in `nodes' hash table) */ +struct _SordNode { + uint8_t type; ///< SordNodeType + SordCount refs; + int size; ///< Size of node's data (not including type) + void* user_data; ///< Opaque user data + char data[]; ///< String for URI or BLANK, Literal for LITERAL +} PACKED; + +/** Literal value (possibly with type and/or language) */ +typedef struct _SordLiteral { + SordNode type; ///< Literal data type (ID of a URI node, or 0) + uint8_t lang_size; ///< Length of language, including terminator (or 0) + char data[]; ///< Language (terminated), followed by value string +} PACKED Literal; + +static char* +sord_literal_node_get_value(SordNode node) +{ + assert(node->type == SORD_LITERAL); + Literal* lit = (Literal*)node->data; + return lit->data + lit->lang_size; +} + +static unsigned +sord_literal_hash(const void* n) +{ + SordNode node = (SordNode)n; + return g_str_hash(sord_literal_node_get_value(node)); +} + +static gboolean +sord_literal_equal(const void* a, const void* b) +{ + SordNode a_node = (SordNode)a; + SordNode b_node = (SordNode)b; + // FIXME: type, lang + return g_str_equal(sord_literal_node_get_value(a_node), + sord_literal_node_get_value(b_node)); +} + +static inline int +sord_node_compare(Sord sord, const SordNode a, const SordNode b) +{ + if (a->type != b->type) + return a->type - b->type; + + Literal* lit_a; + Literal* lit_b; + switch ((SordNodeType)a->type) { + case SORD_URI: + case SORD_BLANK: + return strcmp(a->data, b->data); + case SORD_LITERAL: + // TODO: lang, type + lit_a = (Literal*)a->data; + lit_b = (Literal*)b->data; + return strcmp(sord_literal_node_get_value(a), sord_literal_node_get_value(b)); + } + assert(false); + return 0; +} + +/** Compare two IDs (dereferencing if necessary). + * The null ID, 0, is treated as a minimum (it is less than every other + * possible ID, except itself). This allows it to be used as a wildcard + * when searching, ensuring the search will reach the minimum possible + * key in the tree and iteration from that point will produce the entire + * result set. + */ +static inline int +sord_id_compare(Sord sord, const SordID a, const SordID b) +{ + if (a == b || !a || !b) { + return (const char*)a - (const char*)b; + } else { + SordNode a_node = sord_node_load(sord, a); + SordNode b_node = sord_node_load(sord, b); + const int ret = sord_node_compare(sord, a_node, b_node); + return ret; + } +} + +/** Return true iff IDs are equivalent, or one is a wildcard */ +static inline bool +sord_id_match(const SordID a, const SordID b) +{ + return !a || !b || (a == b); +} + +static inline bool +sord_tuple_match_inline(const SordTuple x, const SordTuple y) +{ + return sord_id_match(x[0], y[0]) + && sord_id_match(x[1], y[1]) + && sord_id_match(x[2], y[2]) + && sord_id_match(x[3], y[3]); +} + +bool +sord_tuple_match(const SordTuple x, const SordTuple y) +{ + return sord_tuple_match_inline(x, y); +} + +void +sord_tuple_load(Sord sord, SordTuple tup, SordNode* s, SordNode* p, SordNode* o) +{ + *s = sord_node_load(sord, tup[TUP_S]); + *p = sord_node_load(sord, tup[TUP_P]); + *o = sord_node_load(sord, tup[TUP_O]); +} + +/** Compare two tuple IDs lexicographically. + * NULL IDs (equal to 0) are treated as wildcards, always less than every + * other possible ID, except itself. + */ +static int +sord_tuple_compare(const void* x_ptr, const void* y_ptr, void* user_data) +{ + Sord const sord = (Sord)user_data; + SordID* const x = (SordID*)x_ptr; + SordID* const y = (SordID*)y_ptr; + + for (int i = 0; i < TUP_LEN; ++i) { + const int cmp = sord_id_compare(sord, x[i], y[i]); + if (cmp) + return cmp; + } + + return 0; +} + +static inline bool +sord_iter_next(SordIter iter) +{ + if (!iter->skip_graphs) { + iter->cur = g_sequence_iter_next(iter->cur); + return !g_sequence_iter_is_end(iter->cur); + } + + const SordID* key = (const SordID*)g_sequence_get(iter->cur); + const SordTuple initial = { key[0], key[1], key[2], key[3] }; + while (true) { + iter->cur = g_sequence_iter_next(iter->cur); + if (g_sequence_iter_is_end(iter->cur)) + return true; + + key = (const SordID*)g_sequence_get(iter->cur); + for (int i = 0; i < 3; ++i) + if (key[i] != initial[i]) + return false; + } + assert(false); +} + +/** Seek forward as necessary until @a iter points at a match. + * @return true iff iterator reached end of valid range. + */ +static inline bool +sord_iter_seek_match(SordIter iter) +{ + for (iter->end = true; + !g_sequence_iter_is_end(iter->cur); + sord_iter_next(iter)) { + const SordID* const key = (const SordID*)g_sequence_get(iter->cur); + if (sord_tuple_match_inline(key, iter->pat)) + return (iter->end = false); + } + return true; +} + +/** Seek forward as necessary until @a iter points at a match, or the prefix + * no longer matches iter->pat. + * @return true iff iterator reached end of valid range. + */ +static inline bool +sord_iter_seek_match_range(SordIter iter) +{ + if (iter->end) + return true; + + do { + const SordID* key = (const SordID*)g_sequence_get(iter->cur); + + if (sord_tuple_match_inline(key, iter->pat)) + return false; // Found match + + for (int i = 0; i < iter->n_prefix; ++i) { + if (!sord_id_match(key[i], iter->pat[i])) { + iter->end = true; // Reached end of valid range + return true; + } + } + } while (!sord_iter_next(iter)); + + return (iter->end = true); // Reached end +} + +static SordIter +sord_iter_new(Sord sord, GSequenceIter* cur, const SordTuple pat, + SordOrder order, SearchMode mode, int n_prefix) +{ + const int* ordering = orderings[order]; + + SordIter iter = malloc(sizeof(struct _SordIter)); + iter->sord = sord; + iter->cur = cur; + iter->mode = mode; + iter->n_prefix = n_prefix; + iter->end = false; + iter->skip_graphs = order < GSPO; + for (int i = 0; i < TUP_LEN; ++i) { + iter->pat[i] = pat[ordering[i]]; + iter->ordering[i] = ordering[i]; + } + + switch (iter->mode) { + case ALL: + case SINGLE: + case RANGE: + assert(sord_tuple_match_inline( + (const SordID*)g_sequence_get(iter->cur), + iter->pat)); + break; + case FILTER_RANGE: + sord_iter_seek_match_range(iter); + break; + case FILTER_ALL: + sord_iter_seek_match(iter); + break; + } + +#ifdef SORD_DEBUG_ITER + SordTuple value; + sord_iter_get(iter, value); + SORD_ITER_LOG("New %p pat=" TUP_FMT " cur=" TUP_FMT " end=%d\n", (void*)iter, + TUP_FMT_ARGS(pat), TUP_FMT_ARGS(value), iter->end); +#endif + return iter; +} + +Sord +sord_iter_get_sord(SordIter iter) +{ + return iter->sord; +} + +void +sord_iter_get(SordIter iter, SordTuple id) +{ + const SordID* key = (const SordID*)g_sequence_get(iter->cur); + id[iter->ordering[0]] = key[0]; + id[iter->ordering[1]] = key[1]; + id[iter->ordering[2]] = key[2]; + id[iter->ordering[3]] = key[3]; +} + +bool +sord_iter_increment(SordIter iter) +{ + if (iter->end) + return true; + + const SordID* key; + iter->end = sord_iter_next(iter); + if (!iter->end) { + switch (iter->mode) { + case ALL: + // At the end if the cursor is (assigned above) + break; + case SINGLE: + iter->end = true; + break; + case RANGE: + // At the end if the MSN no longer matches + key = (const SordID*)g_sequence_get(iter->cur); + assert(key); + for (int i = 0; i < iter->n_prefix; ++i) { + if (!sord_id_match(key[i], iter->pat[i])) { + iter->end = true; + break; + } + } + break; + case FILTER_RANGE: + // Seek forward to next match, stopping if prefix changes + sord_iter_seek_match_range(iter); + break; + case FILTER_ALL: + // Seek forward to next match + sord_iter_seek_match(iter); + break; + } + } + + if (iter->end) { + SORD_ITER_LOG("%p Reached end\n", (void*)iter); + return true; + } else { +#ifdef SORD_DEBUG_ITER + SordTuple tup; + sord_iter_get(iter, tup); + SORD_ITER_LOG("%p Increment to " TUP_FMT "\n", (void*)iter, TUP_FMT_ARGS(tup)); +#endif + return false; + } +} + +bool +sord_iter_is_end(SordIter iter) +{ + return !iter || iter->end; +} + +void +sord_iter_free(SordIter iter) +{ + SORD_ITER_LOG("%p Free\n", (void*)iter); + if (iter) { + free(iter); + } +} + +/** Return true iff @a sord has an index for @a order. + * If @a graph_search is true, @a order will be modified to be the + * corresponding order with a G prepended (so G will be the MSN). + */ +static inline bool +sord_has_index(Sord sord, SordOrder* order, int* n_prefix, bool graph_search) +{ + if (graph_search) { + *order += GSPO; + *n_prefix += 1; + } + + return sord->indices[*order]; +} + +/** Return the best available index for a pattern. + * @param pat Pattern in standard (S P O G) order + * @param mode Set to the (best) iteration mode for iterating over results + * @param n_prefix Set to the length of the range prefix + * (for @a mode == RANGE and @a mode == FILTER_RANGE) + */ +static inline SordOrder +sord_best_index(Sord sord, const SordTuple pat, SearchMode* mode, int* n_prefix) +{ + const bool graph_search = (pat[TUP_G] != 0); + + const unsigned sig + = (pat[0] ? 1 : 0) * 0x100 + + (pat[1] ? 1 : 0) * 0x010 + + (pat[2] ? 1 : 0) * 0x001; + + SordOrder good[2]; + + // Good orderings that don't require filtering + *mode = RANGE; + *n_prefix = 0; + switch (sig) { + case 0x000: *mode = ALL; return graph_search ? DEFAULT_GRAPH_ORDER : DEFAULT_ORDER; + case 0x001: *mode = RANGE; good[0] = OPS; good[1] = OSP; *n_prefix = 1; break; + case 0x010: *mode = RANGE; good[0] = POS; good[1] = PSO; *n_prefix = 1; break; + case 0x011: *mode = RANGE; good[0] = OPS; good[1] = POS; *n_prefix = 2; break; + case 0x100: *mode = RANGE; good[0] = SPO; good[1] = SOP; *n_prefix = 1; break; + case 0x101: *mode = RANGE; good[0] = SOP; good[1] = OSP; *n_prefix = 2; break; + case 0x110: *mode = RANGE; good[0] = SPO; good[1] = PSO; *n_prefix = 2; break; + case 0x111: *mode = SINGLE; return graph_search ? DEFAULT_GRAPH_ORDER : DEFAULT_ORDER; + } + + if (sord_has_index(sord, &good[0], n_prefix, graph_search)) { + return good[0]; + } else if (sord_has_index(sord, &good[1], n_prefix, graph_search)) { + return good[1]; + } + + // Not so good orderings that require filtering, but can + // still be constrained to a range + switch (sig) { + case 0x011: *mode = FILTER_RANGE; good[0] = OSP; good[1] = PSO; *n_prefix = 1; break; + case 0x101: *mode = FILTER_RANGE; good[0] = SPO; good[1] = OPS; *n_prefix = 1; break; + case 0x110: *mode = FILTER_RANGE; good[0] = SOP; good[1] = POS; *n_prefix = 1; break; + default: break; + } + + if (*mode == FILTER_RANGE) { + if (sord_has_index(sord, &good[0], n_prefix, graph_search)) { + return good[0]; + } else if (sord_has_index(sord, &good[1], n_prefix, graph_search)) { + return good[1]; + } + } + + if (graph_search) { + *mode = FILTER_RANGE; + *n_prefix = 1; + return DEFAULT_GRAPH_ORDER; + } else { + *mode = FILTER_ALL; + return DEFAULT_ORDER; + } +} + +static GSequence* +sord_new_btree() +{ + return g_sequence_new(NULL); +} + +Sord +sord_new() +{ + Sord sord = (Sord)malloc(sizeof(struct _Sord)); + sord->names = g_hash_table_new_full(g_str_hash, g_str_equal, 0, 0); + sord->literals = g_hash_table_new_full(sord_literal_hash, sord_literal_equal, 0, 0); + sord->user_data_free = NULL; + + for (unsigned i = 0; i < NUM_ORDERS; ++i) + sord->indices[i] = NULL; + + return sord; +} + +void +sord_free(Sord sord) +{ + if (!sord) + return; + + free(sord->meta); + g_hash_table_unref(sord->names); + g_hash_table_unref(sord->literals); + for (unsigned i = 0; i < NUM_ORDERS; ++i) + if (sord->indices[i]) + g_sequence_free(sord->indices[i]); + + free(sord); +} + +void +sord_set_option(Sord sord, const char* key, const char* value, + SordNodeType type, const char* datatype, const char* lang) +{ + const char* const prefix = "http://drobilla.net/ns/sord#"; + const size_t prefix_len = strlen(prefix); + if (strncmp(key, prefix, prefix_len)) { + fprintf(stderr, "Unknown option %s\n", key); + return; + } + + const char* option = key + prefix_len; + const bool value_is_true = !strcmp(value, "true") || !strcmp(value, "1") || !strcmp(value, "yes"); + if (!strcmp(option, "index-all")) { + for (int i = 0; i < NUM_ORDERS; ++i) { + sord->indices[i] = sord_new_btree(); + } + } else if (!strncmp(option, "index-", 6) && value_is_true) { + for (int i = 0; i < NUM_ORDERS; ++i) { + if (!strcmp(option + 6, order_names[i])) { + sord->indices[i] = sord_new_btree(); + return; + } + } + } else { + fprintf(stderr, "Unknown option %s\n", key); + } +} + +bool +sord_open(Sord sord) +{ + sord->meta = malloc(sizeof(SordMeta)); + memset(sord->meta, 0, sizeof(SordMeta)); + + bool no_indices = true; + for (unsigned i = 0; i < NUM_ORDERS; ++i) { + if (sord->indices[i]) { + no_indices = false; + break; + } + } + + if (no_indices) { + // Use default indexing, avoids O(n) in all cases + sord->indices[SPO] = sord_new_btree(); + sord->indices[OPS] = sord_new_btree(); + sord->indices[PSO] = sord_new_btree(); + sord->indices[GSPO] = sord_new_btree(); // XXX: default? do on demand? + } + + if (!sord->indices[DEFAULT_ORDER]) + sord->indices[DEFAULT_ORDER] = sord_new_btree(); + + return true; +} + +static void +sord_drop_node(Sord sord, SordID id) +{ + // FIXME: leak? +} + +int +sord_num_tuples(Sord sord) +{ + return sord->meta->n_tuples; +} + +int +sord_num_nodes(Sord sord) +{ + return sord->meta->n_nodes; +} + +void +sord_node_set_user_data_free_function(Sord sord, void (*f)(void*)) +{ + sord->user_data_free = f; +} + +SordIter +sord_begin(Sord sord) +{ + if (sord_num_tuples(sord) == 0) { + return NULL; + } else { + GSequenceIter* cur = g_sequence_get_begin_iter(sord->indices[DEFAULT_ORDER]); + SordTuple pat = { 0, 0, 0, 0 }; + return sord_iter_new(sord, cur, pat, DEFAULT_ORDER, ALL, 0); + } +} + +SordIter +sord_graphs_begin(Sord read) +{ + return NULL; +} + +static inline GSequenceIter* +index_search(Sord sord, GSequence* db, const SordTuple search_key) +{ + return g_sequence_search( + db, (void*)search_key, sord_tuple_compare, sord); +} + +static inline GSequenceIter* +index_lower_bound(Sord sord, GSequence* db, const SordTuple search_key) +{ + GSequenceIter* i = g_sequence_search( + db, (void*)search_key, sord_tuple_compare, sord); + + /* i is now at the position where search_key would be inserted, + but this is not necessarily a match, and we need the leftmost... + */ + + if (g_sequence_iter_is_begin(i)) { + return i; + } else if (g_sequence_iter_is_end(i)) { + i = g_sequence_iter_prev(i); + } + + if (!sord_tuple_match_inline(search_key, g_sequence_get(i))) { + // No match, but perhaps immediately after a match + GSequenceIter* const prev = g_sequence_iter_prev(i); + if (!sord_tuple_match_inline(search_key, g_sequence_get(prev))) { + return i; // No match (caller must check) + } else { + i = prev; + } + } + + /* i now points to some matching element, + but not necessarily the first... + */ + assert(sord_tuple_match_inline(search_key, g_sequence_get(i))); + + while (!g_sequence_iter_is_begin(i)) { + if (sord_tuple_match_inline(search_key, g_sequence_get(i))) { + GSequenceIter* const prev = g_sequence_iter_prev(i); + if (sord_tuple_match_inline(search_key, g_sequence_get(prev))) { + i = prev; + continue; + } + } + break; + } + + return i; +} + +SordIter +sord_find(Sord sord, const SordTuple pat) +{ + if (!pat[0] && !pat[1] && !pat[2] && !pat[3]) + return sord_begin(sord); + + SearchMode mode; + int prefix_len; + const SordOrder index_order = sord_best_index(sord, pat, &mode, &prefix_len); + const int* const ordering = orderings[index_order]; + + SORD_FIND_LOG("Find " TUP_FMT " index=%s mode=%d prefix_len=%d ordering=%d%d%d%d\n", + TUP_FMT_ARGS(pat), order_names[index_order], mode, prefix_len, + ordering[0], ordering[1], ordering[2], ordering[3]); + + // It's easiest to think about this algorithm in terms of (S P O) ordering, + // assuming (A B C) == (S P O). For other orderings this is not actually + // the case, but it works the same way. + const SordID a = pat[ordering[0]]; // Most Significant Node (MSN) + const SordID b = pat[ordering[1]]; // ... + const SordID c = pat[ordering[2]]; // ... + const SordID d = pat[ordering[3]]; // Least Significant Node (LSN) + + if (a && b && c) + mode = SINGLE; // No duplicate tuples (Sord is a set) + + SordTuple search_key = { a, b, c, d }; + GSequence* const db = sord->indices[index_order]; + GSequenceIter* const cur = index_lower_bound(sord, db, search_key); + const SordID* const key = (const SordID*)g_sequence_get(cur); + if (!key || ( (mode == RANGE || mode == SINGLE) + && !sord_tuple_match_inline(search_key, key) )) { + SORD_FIND_LOG("No match found\n"); + return NULL; + } + + return sord_iter_new(sord, cur, pat, index_order, mode, prefix_len); +} + +static SordID +sord_lookup_name(Sord sord, const char* str, int str_len) +{ + return g_hash_table_lookup(sord->names, str); +} + +static SordNode +sord_new_literal_node(Sord sord, SordNode type, + const char* str, int str_len, + const char* lang, uint8_t lang_len, + int* node_size) +{ + const uint8_t lang_size = lang ? lang_len + 1 : lang_len; + *node_size = sizeof(struct _SordNode) + sizeof(struct _SordLiteral) + str_len + 1 + lang_size; + SordNode node = malloc(*node_size); + node->type = SORD_LITERAL; + Literal* lit = (Literal*)node->data; + lit->type = type; + lit->lang_size = lang_size; + if (lang) + memcpy(lit->data, lang, lang_size); + + memcpy(lit->data + lang_size, str, str_len + 1); + return node; +} + +static SordNode +sord_lookup_literal(Sord sord, SordNode type, + const char* str, int str_len, + const char* lang, uint8_t lang_len) +{ + int node_size; + SordNode node = sord_new_literal_node(sord, type, str, str_len, lang, lang_len, &node_size); + + SordNode id = g_hash_table_lookup(sord->literals, node); + free(node); + if (id) { + return id; + } else { + return 0; + } +} + +SordNode +sord_node_load(Sord sord, SordID id) +{ + return (SordNode)id; +} + +static SordNode +sord_new_node(Sord sord, SordNodeType type, const char* buf, int buf_len, int* node_size) +{ + *node_size = sizeof(struct _SordNode) + buf_len; + SordNode node = malloc(*node_size); + node->type = type; + memcpy(node->data, buf, buf_len); + return node; +} + +SordNodeType +sord_node_get_type(SordNode ref) +{ + return ref->type; +} + +const char* +sord_node_get_string(SordNode ref) +{ + switch (ref->type) { + case SORD_URI: + case SORD_BLANK: + return ref->data; + break; + case SORD_LITERAL: + return ((Literal*)&ref->data)->data; + break; + } + assert(false); + return NULL; +} + +void +sord_node_set_user_data(SordNode ref, void* user_data) +{ + ref->user_data = user_data; +} + +void* +sord_node_get_user_data(SordNode ref) +{ + return ref->user_data; +} + +const char* +sord_literal_get_lang(SordNode ref) +{ + Literal* lit = (Literal*)&ref->data; + if (lit->lang_size > 0) + return lit->data; + else + return NULL; +} + +SordNode +sord_literal_get_datatype(SordNode ref) +{ + Literal* lit = (Literal*)&ref->data; + return lit->type; +} + +static void +sord_add_node(Sord sord, SordNode node, int node_size) +{ + node->refs = 0; + ++sord->meta->n_nodes; +} + +SordID +sord_get_uri_counted(Sord sord, bool create, const char* str, int str_len) +{ + SordID id = sord_lookup_name(sord, str, str_len); + if (id || !create) + return id; + + int node_size; + id = sord_new_node(sord, SORD_URI, str, str_len + 1, &node_size); + + assert(id); + g_hash_table_insert(sord->names, (void*)g_strdup(str), (void*)id); + sord_add_node(sord, id, node_size); + + return id; +} + +SordID +sord_get_uri(Sord sord, bool create, const char* str) +{ + return sord_get_uri_counted(sord, create, str, strlen(str)); +} + +SordID +sord_get_blank_counted(Sord sord, bool create, const char* str, int str_len) +{ + SordID id = sord_lookup_name(sord, str, str_len); + if (id || !create) + return id; + + int node_size; + id = sord_new_node(sord, SORD_BLANK, str, str_len + 1, &node_size); + + assert(id); + g_hash_table_insert(sord->names, (void*)g_strdup(str), (void*)id); + sord_add_node(sord, id, node_size); + + return id; +} + +SordID +sord_get_blank(Sord sord, bool create, const char* str) +{ + return sord_get_blank_counted(sord, create, str, strlen(str)); +} + +SordID +sord_get_literal_counted(Sord sord, bool create, SordID type, + const char* str, int str_len, + const char* lang, uint8_t lang_len) +{ + SordID id = sord_lookup_literal(sord, type, str, str_len, lang, lang_len); + if (id || !create) + return id; + + int node_size; + id = sord_new_literal_node(sord, type, str, str_len, lang, lang_len, &node_size); + + g_hash_table_insert(sord->literals, (void*)id, id); + sord_add_node(sord, id, node_size); + + return id; +} + +SordID +sord_get_literal(Sord sord, bool create, SordID type, const char* str, const char* lang) +{ + return sord_get_literal_counted(sord, create, type, str, strlen(str), + lang, lang ? strlen(lang) : 0); +} + +static void +sord_add_tuple_ref(Sord sord, const SordID id) +{ + if (id) { + SordNode node = sord_node_load(sord, id); + ++node->refs; + } +} + +static void +sord_drop_tuple_ref(Sord sord, const SordID id) +{ + if (id) { + SordNode node = sord_node_load(sord, id); + if (--node->refs == 0) { + sord_drop_node(sord, id); + } + } +} + +static inline bool +sord_add_to_index(Sord sord, const SordTuple tup, SordOrder order) +{ + assert(sord->indices[order]); + const int* const ordering = orderings[order]; + const SordTuple key = { + tup[ordering[0]], tup[ordering[1]], tup[ordering[2]], tup[ordering[3]] + }; + GSequenceIter* const cur = index_search(sord, sord->indices[order], key); + if (!g_sequence_iter_is_end(cur) + && !sord_tuple_compare(g_sequence_get(cur), key, sord)) { + return false; // Tuple already stored in this index + } + + // FIXME: memory leak? + SordID* key_copy = malloc(sizeof(SordTuple)); + memcpy(key_copy, key, sizeof(SordTuple)); + g_sequence_insert_before(cur, key_copy); + return true; +} + +void +sord_add(Sord sord, const SordTuple tup) +{ + SORD_WRITE_LOG("Add " TUP_FMT "\n", TUP_FMT_ARGS(tup)); + assert(tup[0] && tup[1] && tup[2]); + + for (unsigned i = 0; i < NUM_ORDERS; ++i) { + if (sord->indices[i]) { + if (!sord_add_to_index(sord, tup, i)) { + assert(i == 0); // Assuming index coherency + return; // Tuple already stored, do nothing + } + } + } + + for (int i = 0; i < TUP_LEN; ++i) + sord_add_tuple_ref(sord, tup[i]); + + ++sord->meta->n_tuples; + assert(sord->meta->n_tuples == g_sequence_get_length(sord->indices[SPO])); +} + +void +sord_remove(Sord sord, const SordTuple tup) +{ + SORD_WRITE_LOG("Remove " TUP_FMT "\n", TUP_FMT_ARGS(tup)); + + for (unsigned i = 0; i < NUM_ORDERS; ++i) { + if (sord->indices[i]) { + const int* const ordering = orderings[i]; + const SordTuple key = { + tup[ordering[0]], tup[ordering[1]], tup[ordering[2]], tup[ordering[3]] + }; + GSequenceIter* const cur = index_search(sord, sord->indices[i], key); + if (!g_sequence_iter_is_end(cur)) { + g_sequence_remove(cur); + } else { + assert(i == 0); // Assuming index coherency + return; // Tuple not found, do nothing + } + } + } + + for (int i = 0; i < TUP_LEN; ++i) + sord_drop_tuple_ref(sord, tup[i]); + + --sord->meta->n_tuples; +} + +void +sord_remove_iter(Sord sord, SordIter iter) +{ + SordTuple tup; + sord_iter_get(iter, tup); + + SORD_WRITE_LOG("Remove " TUP_FMT "\n", TUP_FMT_ARGS(tup)); + + // TODO: Directly remove the iterator's index (avoid one search) + + for (unsigned i = 0; i < NUM_ORDERS; ++i) { + if (sord->indices[i]) { + const int* const ordering = orderings[i]; + const SordTuple key = { + tup[ordering[0]], tup[ordering[1]], tup[ordering[2]], tup[ordering[3]] + }; + GSequenceIter* const cur = index_search(sord, sord->indices[i], key); + if (!g_sequence_iter_is_end(cur)) { + g_sequence_remove(cur); + } else { + assert(i == 0); // Assuming index coherency + } + } + } + + for (int i = 0; i < TUP_LEN; ++i) + sord_drop_tuple_ref(sord, tup[i]); + + --sord->meta->n_tuples; + + iter->end = g_sequence_iter_is_end(iter->cur); +} + +void +sord_remove_graph(Sord sord, SordID graph) +{ + #if 0 + if (!sord->indices[GSPO]) + return; + + // Remove all tuples in graph from non-graph indices + BDBCUR* cur = tcbdbcurnew(sord->indices[GSPO]); + const SordTuple search_key = { graph, 0, 0, 0 }; + int key_size = sizeof(SordTuple); + tcbdbcurjump(cur, &search_key, key_size); + do { + const SordID* key = (const SordID*)tcbdbcurkey3(cur, &key_size); + if (!key || key[0] != graph) + break; + + for (unsigned i = 0; i < GSPO; ++i) { + if (sord->indices[i]) { + const int* const ordering = orderings[i]; + const SordTuple tup = { key[1], key[2], key[3], key[0] }; // Key in SPOG order + const SordTuple subkey = { + tup[ordering[0]], tup[ordering[1]], tup[ordering[2]], tup[ordering[3]] + }; + if (!tcbdbout(sord->indices[i], &subkey, sizeof(SordTuple))) + fprintf(stderr, "Failed to remove key " TUP_FMT "\n", TUP_FMT_ARGS(subkey)); + } + } + + --sord->meta->n_tuples; + } while (tcbdbcurnext(cur)); + + // Remove all tuples in graph from graph indices + for (unsigned i = GSPO; i < NUM_ORDERS; ++i) { + if (sord->indices[i]) { + BDBCUR* cur = tcbdbcurnew(sord->indices[i]); + tcbdbcurjump(cur, &search_key, key_size); + while (true) { + const SordID* key = (const SordID*)tcbdbcurkey3(cur, &key_size); + if (!key || key[0] != graph) { + break; + } else if (i == GSPO) { + for (int i = 0; i < TUP_LEN; ++i) { + sord_drop_tuple_ref(sord, key[i]); + } + } + if (!tcbdbcurout(cur)) + break; + } + } + } + #endif +} diff --git a/src/sord_test.c b/src/sord_test.c new file mode 100644 index 0000000..371e5c1 --- /dev/null +++ b/src/sord_test.c @@ -0,0 +1,347 @@ +/* Sord, a lightweight RDF model library. + * Copyright 2010-2011 David Robillard + * + * Sord is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Sord is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +#define _XOPEN_SOURCE 500 + +#include +#include +#include +#include "sord/sord.h" + +static const int DIGITS = 3; +static const int MAX_NUM = 999; + +typedef struct { SordTuple query; int expected_num_results; } QueryTest; + +static SordID +uri(Sord sord, int num) +{ + if (num == 0) + return 0; + + char uri[] = "eg:000"; + const size_t uri_len = 3 + DIGITS; + char* uri_num = uri + 3; // First `0' + snprintf(uri_num, DIGITS + 1, "%0*d", DIGITS, num); + return sord_get_uri_counted(sord, true, uri, uri_len); +} + +void +generate(Sord sord, size_t n_tuples, size_t n_objects_per) +{ + fprintf(stderr, "Generating %zu (S P *) tuples with %zu objects each\n", + n_tuples, n_objects_per); + + for (size_t i = 0; i < n_tuples; ++i) { + int num = (i * n_objects_per) + 1; + + SordID ids[2 + n_objects_per]; + for (size_t j = 0; j < 2 + n_objects_per; ++j) { + ids[j] = uri(sord, num++); + } + + for (size_t j = 0; j < n_objects_per; ++j) { + SordTuple tup = { ids[0], ids[1], ids[2 + j] }; + sord_add(sord, tup); + } + } + + // Add some literals + SordTuple tup; + tup[0] = uri(sord, 98); + tup[1] = uri(sord, 4); + tup[2] = sord_get_literal(sord, true, 0, "hello", NULL); + tup[3] = 0; + sord_add(sord, tup); + tup[2] = sord_get_literal(sord, true, 0, "hi", NULL); + sord_add(sord, tup); + + tup[0] = uri(sord, 14); + tup[2] = sord_get_literal(sord, true, 0, "bonjour", "fr"); + sord_add(sord, tup); + tup[2] = sord_get_literal(sord, true, 0, "salut", "fr"); + sord_add(sord, tup); + + // Attempt to add some duplicates + sord_add(sord, tup); + sord_add(sord, tup); + + // Add a blank node subject + tup[0] = sord_get_blank(sord, true, "ablank"); + sord_add(sord, tup); + + tup[1] = uri(sord, 6); + tup[2] = uri(sord, 7); + sord_add(sord, tup); +} + +/** Trivial function to return EXIT_FAILURE (useful as a breakpoint) */ +int +test_fail() +{ + return EXIT_FAILURE; +} + +#define TUP_FMT "(%6s %6s %6s)" +#define TUP_FMT_ARGS(t) \ + (sord_node_load(sord, (t)[0]) \ + ? sord_node_get_string(sord_node_load(sord, (t)[0])) : "*"), \ + (sord_node_load(sord, (t)[1]) \ + ? sord_node_get_string(sord_node_load(sord, (t)[1])) : "*"), \ + (sord_node_load(sord, (t)[2]) \ + ? sord_node_get_string(sord_node_load(sord, (t)[2])) : "*") + +int +test_read(Sord sord, const size_t n_tuples, const int n_objects_per) +{ + int ret = EXIT_SUCCESS; + + SordTuple id; + + SordIter iter = sord_begin(sord); + if (sord_iter_get_sord(iter) != sord) { + fprintf(stderr, "Fail: Iterator has incorrect sord pointer\n"); + return test_fail(); + } + + for (; !sord_iter_is_end(iter); sord_iter_increment(iter)) + sord_iter_get(iter, id); + + // Attempt to increment past end + if (!sord_iter_increment(iter)) { + fprintf(stderr, "Fail: Successfully incremented past end\n"); + return test_fail(); + } + + sord_iter_free(iter); + +#define NUM_PATTERNS 9 + + QueryTest patterns[NUM_PATTERNS] = { + { { 0, 0, 0 }, (n_tuples * n_objects_per) + 6 }, + { { uri(sord, 9), uri(sord, 9), uri(sord, 9) }, 0 }, + { { uri(sord, 1), uri(sord, 2), uri(sord, 4) }, 1 }, + { { uri(sord, 3), uri(sord, 4), uri(sord, 0) }, 2 }, + { { uri(sord, 0), uri(sord, 2), uri(sord, 4) }, 1 }, + { { uri(sord, 0), uri(sord, 0), uri(sord, 4) }, 1 }, + { { uri(sord, 1), uri(sord, 0), uri(sord, 0) }, 2 }, + { { uri(sord, 1), uri(sord, 0), uri(sord, 4) }, 1 }, + { { uri(sord, 0), uri(sord, 2), uri(sord, 0) }, 2 } }; + + for (unsigned i = 0; i < NUM_PATTERNS; ++i) { + QueryTest test = patterns[i]; + SordTuple pat = { test.query[0], test.query[1], test.query[2], 0 }; + fprintf(stderr, "Query " TUP_FMT "... ", TUP_FMT_ARGS(pat)); + + iter = sord_find(sord, pat); + int num_results = 0; + for (; !sord_iter_is_end(iter); sord_iter_increment(iter)) { + sord_iter_get(iter, id); + ++num_results; + if (!sord_tuple_match(pat, id)) { + sord_iter_free(iter); + fprintf(stderr, "Fail: Query result " TUP_FMT " does not match pattern\n", + TUP_FMT_ARGS(id)); + return test_fail(); + } + } + sord_iter_free(iter); + if (num_results != test.expected_num_results) { + fprintf(stderr, "Fail: Expected %d results, got %d\n", + test.expected_num_results, num_results); + return test_fail(); + } + fprintf(stderr, "OK (%u matches)\n", test.expected_num_results); + } + + // Query blank node subject + SordTuple pat = { sord_get_blank(sord, true, "ablank"), 0, 0 }; + if (!pat[0]) { + fprintf(stderr, "Blank node subject lost\n"); + return test_fail(); + } + fprintf(stderr, "Query " TUP_FMT "... ", TUP_FMT_ARGS(pat)); + iter = sord_find(sord, pat); + int num_results = 0; + for (; !sord_iter_is_end(iter); sord_iter_increment(iter)) { + sord_iter_get(iter, id); + ++num_results; + if (!sord_tuple_match(pat, id)) { + sord_iter_free(iter); + fprintf(stderr, "Fail: Query result " TUP_FMT " does not match pattern\n", + TUP_FMT_ARGS(id)); + return test_fail(); + } + } + fprintf(stderr, "OK\n"); + sord_iter_free(iter); + if (num_results != 2) { + fprintf(stderr, "Blank node subject query failed\n"); + return test_fail(); + } + + // Test nested queries + fprintf(stderr, "Nested Queries... "); + pat[0] = pat[1] = pat[2] = 0; + SordID last_subject = 0; + iter = sord_find(sord, pat); + for (; !sord_iter_is_end(iter); sord_iter_increment(iter)) { + sord_iter_get(iter, id); + if (id[0] == last_subject) + continue; + + SordTuple subpat = { id[0], 0, 0 }; + SordIter subiter = sord_find(sord, subpat); + int num_sub_results = 0; + for (; !sord_iter_is_end(subiter); sord_iter_increment(subiter)) { + SordTuple subid; + sord_iter_get(subiter, subid); + if (!sord_tuple_match(subpat, subid)) { + sord_iter_free(iter); + sord_iter_free(subiter); + fprintf(stderr, "Fail: Nested query result does not match pattern\n"); + return test_fail(); + } + ++num_sub_results; + } + sord_iter_free(subiter); + if (num_sub_results != n_objects_per) { + fprintf(stderr, "Fail: Nested query failed (got %d results, expected %d)\n", + num_sub_results, n_objects_per); + return test_fail(); + } + last_subject = id[0]; + } + fprintf(stderr, "OK\n\n"); + sord_iter_free(iter); + + return ret; +} + +int +test_write(Sord sord, const size_t n_tuples, const int n_objects_per) +{ + int ret = EXIT_SUCCESS; + + fprintf(stderr, "Removing Statements... "); + + // Remove statements + SordIter iter; + for (iter = sord_begin(sord); !sord_iter_is_end(iter);) { + sord_remove_iter(sord, iter); + } + sord_iter_free(iter); + + const int num_tuples = sord_num_tuples(sord); + if (num_tuples != 0) { + fprintf(stderr, "Fail: All tuples removed but %d tuples remain\n", num_tuples); + return test_fail(); + } + + fprintf(stderr, "OK\n\n"); + + return ret; +} + +int +main(int argc, char** argv) +{ + static const size_t n_tuples = 300; + static const int n_objects_per = 2; + + sord_free(NULL); // Shouldn't crash + + // Create with default options + Sord sord = sord_new("testdb"); + sord_set_option(sord, "http://unknown", "something", SORD_LITERAL, NULL, NULL); + sord_open(sord); + generate(sord, n_tuples, n_objects_per); + + if (test_read(sord, n_tuples, n_objects_per)) { + sord_free(sord); + return EXIT_FAILURE; + } + + // Check interning merges equivalent values + SordID uri_id = sord_get_uri(sord, true, "http://example.org"); + SordID blank_id = sord_get_uri(sord, true, "testblank"); + SordID lit_id = sord_get_literal(sord, true, uri_id, "hello", NULL); + //sord_clear_cache(write); + SordID uri_id2 = sord_get_uri(sord, false, "http://example.org"); + SordID blank_id2 = sord_get_uri(sord, false, "testblank"); + SordID lit_id2 = sord_get_literal(sord, false, uri_id, "hello", NULL); + if (uri_id2 != uri_id) { + fprintf(stderr, "Fail: URI interning failed (duplicates)\n"); + goto fail; + } else if (blank_id2 != blank_id) { + fprintf(stderr, "Fail: Blank node interning failed (duplicates)\n"); + goto fail; + } else if (lit_id2 != lit_id) { + fprintf(stderr, "Fail: Literal interning failed (duplicates)\n"); + goto fail; + } + + // Check interning doesn't clash non-equivalent values + SordID uri_id3 = sord_get_uri(sord, false, "http://example.orgX"); + SordID blank_id3 = sord_get_uri(sord, false, "testblankX"); + SordID lit_id3 = sord_get_literal(sord, false, uri_id, "helloX", NULL); + if (uri_id3 == uri_id) { + fprintf(stderr, "Fail: URI interning failed (clash)\n"); + goto fail; + } else if (blank_id3 == blank_id) { + fprintf(stderr, "Fail: Blank node interning failed (clash)\n"); + goto fail; + } else if (lit_id3 == lit_id) { + fprintf(stderr, "Fail: Literal interning failed (clash)\n"); + goto fail; + } + + sord_free(sord); + + // Test each pattern type with each index + static const char* const index_names[6] = { + "spo", "sop", "ops", "osp", "pso", "pos" + }; + + char* option = strdup("http://drobilla.net/ns/sord#index-xxx"); + const size_t option_len = strlen(option); + for (int i = 0; i < 6; ++i) { + strncpy(option + option_len - 3, index_names[i], 3); + sord = sord_new("testdb"); + sord_set_option(sord, option, "true", SORD_LITERAL, NULL, NULL); + printf("Testing Index `%s'\n", index_names[i]); + sord_open(sord); + generate(sord, n_tuples, n_objects_per); + if (test_read(sord, n_tuples, n_objects_per)) + goto fail; + sord_free(sord); + } + free(option); + + sord = sord_new("testdb"); + sord_open(sord); + if (test_write(sord, n_tuples, n_objects_per)) + goto fail; + + sord_free(sord); + + return EXIT_SUCCESS; + +fail: + sord_free(sord); + return EXIT_FAILURE; +} -- cgit v1.2.1