From 1fd33e0a85bdf6bcc4f8138940462c4a4a391175 Mon Sep 17 00:00:00 2001
From: David Robillard
Date: Mon, 14 Oct 2019 23:26:41 +0200
Subject: Add support for converting literals to canonical form

---
 src/canon.c        | 239 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/node.c         |   2 +-
 src/node.h         |   4 ++
 src/serdi.c        |  21 ++++--
 src/string_utils.h |   2 +-
 5 files changed, 262 insertions(+), 6 deletions(-)
 create mode 100644 src/canon.c

(limited to 'src')

diff --git a/src/canon.c b/src/canon.c
new file mode 100644
index 00000000..99351f9e
--- /dev/null
+++ b/src/canon.c
@@ -0,0 +1,239 @@
+/*
+  Copyright 2019-2020 David Robillard
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "cursor.h"
+#include "namespaces.h"
+#include "node.h"
+#include "statement.h"
+#include "string_utils.h"
+#include "world.h"
+
+#include "exess/exess.h"
+#include "serd/serd.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+/// State of a canonicalising sink, handed to the event handler as `data`
+typedef struct {
+  const SerdWorld* world;
+  const SerdSink* target;
+  SerdCanonFlags flags;
+} SerdCanonData;
+
+/**
+   Build a new literal node holding the canonical form of `node`.
+
+   `*out` is reset to NULL first.  It stays NULL, with a success status, when
+   exess_datatype_from_uri() returns EXESS_NOTHING for the datatype, so the
+   caller can tell "nothing to convert" apart from a failure.  A node stored
+   in `*out` is released by the caller with serd_node_free().
+
+   @param out Receives the new node, or NULL if none was created.
+   @param node Literal to convert; the caller in this file only passes nodes
+   for which serd_node_datatype() is non-NULL.
+   @return The exess result.  On failure `status` is non-zero and `count` is
+   what the failing exess call reported, which the caller uses as an offset
+   into the literal when logging.
+*/
+static ExessResult
+make_canonical(SerdNode** const out, const SerdNode* const SERD_NONNULL node)
+{
+  *out = NULL;
+
+  const char* str = serd_node_string(node);
+  const SerdNode* datatype = serd_node_datatype(node);
+  ExessResult r = {EXESS_SUCCESS, 0};
+
+  // A datatype that is not a URI is reported as a bad value
+  if (serd_node_type(datatype) != SERD_URI) {
+    r.status = EXESS_BAD_VALUE;
+    return r;
+  }
+
+  // NOTE(review): for rdf:langString only the string itself is copied into the
+  // new node; confirm that dropping everything else on the node is intended
+  const char* datatype_uri = serd_node_string(datatype);
+  if (!strcmp(datatype_uri, NS_RDF "langString")) {
+    *out = serd_new_string(serd_node_string_view(node));
+    return r;
+  }
+
+  // No exess datatype for this URI: leave *out NULL and report success
+  const ExessDatatype value_type = exess_datatype_from_uri(datatype_uri);
+  if (value_type == EXESS_NOTHING) {
+    return r;
+  }
+
+  // Measure canonical form to know how much space to allocate for node.
+  // Bounded datatypes are parsed into a variant, others are rewritten as text.
+  ExessVariant variant = exess_make_nothing(EXESS_SUCCESS);
+  if (exess_datatype_is_bounded(value_type)) {
+    r = exess_read_variant(&variant, value_type, str);
+    if (!r.status) {
+      r = exess_write_variant(variant, 0, NULL);
+    }
+  } else {
+    r = exess_write_canonical(str, value_type, 0, NULL);
+  }
+
+  if (r.status) {
+    return r;
+  }
+
+  // Allocate one block: literal header, padded string, then datatype URI node
+  const size_t datatype_uri_len = serd_node_length(datatype);
+  const size_t len = serd_node_pad_size(r.count);
+  const size_t total_len = sizeof(SerdNode) + len + datatype_uri_len;
+
+  SerdNode* const result =
+    serd_node_malloc(total_len, SERD_HAS_DATATYPE, SERD_LITERAL);
+
+  // Write canonical form directly into node, this time with a real buffer
+  char* buf = serd_node_buffer(result);
+  if (exess_datatype_is_bounded(value_type)) {
+    r = exess_write_variant(variant, r.count + 1, buf);
+    result->n_bytes = r.count;
+  } else {
+    r = exess_write_canonical(str, value_type, r.count + 1, buf);
+    result->n_bytes = r.count;
+  }
+
+  if (r.status) {
+    serd_node_free(result);
+    return r;
+  }
+
+  // The datatype node starts right after the padded literal string
+  SerdNode* const datatype_node = result + 1 + (len / sizeof(SerdNode));
+  char* const datatype_buf = serd_node_buffer(datatype_node);
+
+  datatype_node->n_bytes = datatype_uri_len;
+  datatype_node->type = SERD_URI;
+  memcpy(datatype_buf, datatype_uri, datatype_uri_len + 1);
+
+  /* serd_node_check_padding(datatype_node); */
+  /* serd_node_check_padding(result); */
+
+  *out = result;
+  return r;
+}
+
+/**
+   Forward a statement to the target, canonicalising its object if possible.
+
+   Statements whose object is not a literal with a datatype are passed on
+   untouched.  If conversion fails, the problem is logged (as a warning with
+   SERD_CANON_LAX, otherwise as an error).  Without SERD_CANON_LAX nothing is
+   written and SERD_ERR_INVALID is returned; with it, the original statement
+   is written instead.
+*/
+static SerdStatus
+serd_canon_on_statement(SerdCanonData* data,
+                        SerdStatementFlags flags,
+                        const SerdStatement* statement)
+{
+  const SerdNode* object = serd_statement_object(statement);
+  if (serd_node_type(object) != SERD_LITERAL || !serd_node_datatype(object)) {
+    return serd_sink_write_statement(data->target, flags, statement);
+  }
+
+  SerdNode* normo = NULL;
+  ExessResult r = make_canonical(&normo, object);
+  if (r.status) {
+    const bool lax = (data->flags & SERD_CANON_LAX);
+    const SerdLogLevel level =
+      lax ? SERD_LOG_LEVEL_WARNING : SERD_LOG_LEVEL_ERR;
+
+    if (statement->cursor) {
+      // Adjust column to point at the exact error location in the literal
+      const SerdCursor cursor = {statement->cursor->file,
+                                 statement->cursor->line,
+                                 statement->cursor->col + 1 +
+                                   (unsigned)r.count};
+
+      serd_world_logf_internal(data->world,
+                               SERD_ERR_INVALID,
+                               level,
+                               &cursor,
+                               "invalid literal (%s)\n",
+                               exess_strerror(r.status));
+    } else {
+      serd_world_logf_internal(data->world,
+                               SERD_ERR_INVALID,
+                               level,
+                               NULL,
+                               "invalid literal (%s)\n",
+                               exess_strerror(r.status));
+    }
+
+    if (!lax) {
+      return SERD_ERR_INVALID;
+    }
+  }
+
+  // No canonical node (unknown datatype, or a lax failure): write as-is
+  if (!normo) {
+    return serd_sink_write_statement(data->target, flags, statement);
+  }
+
+  // Same statement with the object replaced; normo is ours to free afterwards
+  const SerdStatus st = serd_sink_write(data->target,
+                                        flags,
+                                        statement->nodes[0],
+                                        statement->nodes[1],
+                                        normo,
+                                        statement->nodes[3]);
+  serd_node_free(normo);
+  return st;
+}
+
+/// Handle an event: canonicalise statements, pass anything else to the target
+static SerdStatus
+serd_canon_on_event(SerdCanonData* data, const SerdEvent* event)
+{
+  return (event->type == SERD_STATEMENT)
+           ? serd_canon_on_statement(
+               data, event->statement.flags, event->statement.statement)
+           : serd_sink_write_event(data->target, event);
+}
+
+/**
+   Create a sink that canonicalises literals and forwards events to `target`.
+
+   The sink's state is allocated here and handed to serd_sink_new() together
+   with free() to release it.
+
+   @return A new sink, or NULL if the state could not be allocated.
+*/
+SerdSink*
+serd_canon_new(const SerdWorld* world,
+               const SerdSink* target,
+               const SerdCanonFlags flags)
+{
+  SerdCanonData* const data = (SerdCanonData*)calloc(1, sizeof(SerdCanonData));
+  if (!data) {
+    return NULL;
+  }
+
+  data->world = world;
+  data->target = target;
+  data->flags = flags;
+
+  return serd_sink_new(data, (SerdEventFunc)serd_canon_on_event, free);
+}
diff --git a/src/node.c b/src/node.c
index 7d1177e4..0283b14f 100644
--- a/src/node.c
+++ b/src/node.c
@@ -48,7 +48,7 @@ typedef struct {
 static SerdNode*
 serd_new_from_uri(const SerdURIView uri, const SerdURIView base);
 
-static size_t
+size_t
 serd_node_pad_size(const size_t n_bytes)
 {
   const size_t pad = sizeof(SerdNode) - (n_bytes + 2) % sizeof(SerdNode);
diff --git a/src/node.h b/src/node.h
index 5206b109..6470d939 100644
--- a/src/node.h
+++ b/src/node.h
@@ -62,6 +62,10 @@ void
 serd_node_set(SerdNode* SERD_NONNULL* SERD_NONNULL dst,
               const SerdNode* SERD_NONNULL src);
 
+SERD_CONST_FUNC
+size_t
+serd_node_pad_size(const size_t n_bytes);
+
 SERD_PURE_FUNC
 size_t
 serd_node_total_size(const SerdNode* SERD_NONNULL node);
diff --git a/src/serdi.c b/src/serdi.c
index 70c52682..b028b862 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -55,6 +55,7 @@ print_usage(const char* name, bool error)
   fprintf(os, "Usage: %s [OPTION]... INPUT...\n", name);
   fprintf(os, "Read and write RDF syntax.\n");
   fprintf(os, "Use - for INPUT to read from standard input.\n\n");
+  fprintf(os, " -C Convert literals to canonical form.\n");
   fprintf(os, " -I BASE_URI Input base URI.\n");
   fprintf(os, " -V Validate inputs.\n");
   fprintf(os, " -a Write ASCII output if possible.\n");
@@ -150,6 +151,7 @@ main(int argc, char** argv)
   bool osyntax_set = false;
   bool validate = false;
   bool use_model = false;
+  bool normalise = false;
   bool quiet = false;
   size_t stack_size = 4194304;
   const char* input_string = NULL;
@@ -163,7 +165,9 @@ main(int argc, char** argv)
       break;
     }
 
-    if (argv[a][1] == 'I') {
+    if (argv[a][1] == 'C') {
+      normalise = true;
+    } else if (argv[a][1] == 'I') {
       if (++a == argc) {
         return missing_arg(argv[0], 'I');
       }
@@ -316,7 +320,7 @@ main(int argc, char** argv)
 
   SerdModel* model = NULL;
   SerdSink* inserter = NULL;
-  const SerdSink* sink = NULL;
+  const SerdSink* out_sink = NULL;
   if (use_model) {
     const SerdModelFlags flags =
       SERD_INDEX_SPO | (input_has_graphs ? SERD_INDEX_GRAPHS : 0u) |
@@ -324,9 +328,17 @@ main(int argc, char** argv)
 
     model = serd_model_new(world, flags);
     inserter = serd_inserter_new(model, env, NULL);
-    sink = inserter;
+    out_sink = inserter;
   } else {
-    sink = serd_writer_sink(writer);
+    out_sink = serd_writer_sink(writer);
+  }
+
+  const SerdSink* sink = out_sink;
+
+  SerdSink* canon = NULL;
+  if (normalise) {
+    // NOTE(review): reader_flags is passed where SerdCanonFlags is expected
+    sink = canon = serd_canon_new(world, out_sink, reader_flags);
   }
 
   if (quiet) {
@@ -410,6 +422,7 @@ main(int argc, char** argv)
     serd_range_free(range);
   }
 
+  serd_sink_free(canon);
   serd_sink_free(inserter);
   serd_model_free(model);
   serd_writer_free(writer);
diff --git a/src/string_utils.h b/src/string_utils.h
index 0e9eee43..a302bc49 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -62,7 +62,7 @@ is_xdigit(const int c)
 }
 
 static inline bool
-is_space(const char c)
+is_space(const int c)
 {
   switch (c) {
   case ' ':
-- 
cgit v1.2.1