From 89612ec05f596d135640413e093251fb9691ca14 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 30 May 2021 12:23:07 -0400 Subject: Add support for converting literals to canonical form --- src/canon.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/serdi.c | 20 ++++-- src/string.c | 2 + src/string_utils.h | 8 +-- 4 files changed, 214 insertions(+), 8 deletions(-) create mode 100644 src/canon.c (limited to 'src') diff --git a/src/canon.c b/src/canon.c new file mode 100644 index 00000000..84d20d0c --- /dev/null +++ b/src/canon.c @@ -0,0 +1,192 @@ +/* + Copyright 2019-2022 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "caret.h" +#include "namespaces.h" +#include "node.h" +#include "statement.h" +#include "string_utils.h" + +#include "exess/exess.h" +#include "serd/serd.h" + +#include +#include +#include + +typedef struct { + const SerdWorld* world; + const SerdSink* target; + SerdCanonFlags flags; +} SerdCanonData; + +static ExessResult +build_typed(SerdNode** const out, + const SerdNode* const SERD_NONNULL node, + const SerdNode* const SERD_NONNULL datatype) +{ + *out = NULL; + + const char* str = serd_node_string(node); + const char* datatype_uri = serd_node_string(datatype); + ExessResult r = {EXESS_SUCCESS, 0}; + + if (!strcmp(datatype_uri, NS_RDF "langString")) { + *out = serd_new_string(serd_node_string_view(node)); + return r; + } + + const ExessDatatype value_type = exess_datatype_from_uri(datatype_uri); + if (value_type == EXESS_NOTHING) { + return r; + } + + // Measure canonical form to know how much space to allocate for node + if ((r = exess_write_canonical(str, value_type, 0, NULL)).status) { + return r; + } + + // Allocate node + const size_t datatype_uri_len = serd_node_length(datatype); + const size_t datatype_size = serd_node_total_size(datatype); + const size_t len = serd_node_pad_length(r.count); + const size_t total_len = sizeof(SerdNode) + len + datatype_size; + SerdNode* const result = serd_node_malloc(total_len); + + result->length = r.count; + result->flags = SERD_HAS_DATATYPE; + result->type = SERD_LITERAL; + + // Write canonical form directly into node + exess_write_canonical(str, value_type, r.count + 1, serd_node_buffer(result)); + + SerdNode* const datatype_node = result + 1 + (len / sizeof(SerdNode)); + char* const datatype_buf = serd_node_buffer(datatype_node); + + datatype_node->length = datatype_uri_len; + datatype_node->type = SERD_URI; + memcpy(datatype_buf, datatype_uri, datatype_uri_len + 1); + + *out = result; + return r; +} + +static ExessResult +build_tagged(SerdNode** const out, + const SerdNode* const SERD_NONNULL node, + const SerdNode* const SERD_NONNULL language) +{ +#define MAX_LANG_LEN 48 // RFC5646 requires 35, RFC4646 recommends 42 + + const size_t node_len = serd_node_length(node); + const char* const lang = serd_node_string(language); + const size_t lang_len = serd_node_length(language); + if (lang_len > MAX_LANG_LEN) { + const ExessResult r = {EXESS_NO_SPACE, node_len}; + return r; + } + + // Convert language tag to lower-case + char canonical_lang[MAX_LANG_LEN] = {0}; + for (size_t i = 0u; i < lang_len; ++i) { + canonical_lang[i] = serd_to_lower(lang[i]); + } + + // Make a new literal that is otherwise identical + *out = serd_new_literal(serd_node_string_view(node), + serd_node_flags(node), + SERD_SUBSTRING(canonical_lang, lang_len)); + + const ExessResult r = {EXESS_SUCCESS, node_len}; + return r; + +#undef MAX_LANG_LEN +} + +static SerdStatus +serd_canon_on_statement(SerdCanonData* const data, + const SerdStatementFlags flags, + const SerdStatement* const statement) +{ + const SerdNode* const object = serd_statement_object(statement); + const SerdNode* const datatype = serd_node_datatype(object); + const SerdNode* const language = serd_node_language(object); + if (!datatype && !language) { + return serd_sink_write_statement(data->target, flags, statement); + } + + SerdNode* normo = NULL; + const ExessResult r = datatype ? build_typed(&normo, object, datatype) + : build_tagged(&normo, object, language); + + if (r.status) { + SerdCaret caret = {NULL, 0u, 0u}; + const bool lax = (data->flags & SERD_CANON_LAX); + + if (statement->caret) { + // Adjust column to point at the error within the literal + caret.file = statement->caret->file; + caret.line = statement->caret->line; + caret.col = statement->caret->col + 1 + (unsigned)r.count; + } + + serd_logf_at(data->world, + lax ? SERD_LOG_LEVEL_WARNING : SERD_LOG_LEVEL_ERROR, + statement->caret ? &caret : NULL, + "invalid literal (%s)", + exess_strerror(r.status)); + + if (!lax) { + return SERD_ERR_INVALID; + } + } + + if (!normo) { + return serd_sink_write_statement(data->target, flags, statement); + } + + const SerdStatus st = serd_sink_write(data->target, + flags, + statement->nodes[0], + statement->nodes[1], + normo, + statement->nodes[3]); + serd_node_free(normo); + return st; +} + +static SerdStatus +serd_canon_on_event(SerdCanonData* const data, const SerdEvent* const event) +{ + return (event->type == SERD_STATEMENT) + ? serd_canon_on_statement( + data, event->statement.flags, event->statement.statement) + : serd_sink_write_event(data->target, event); +} + +SerdSink* +serd_canon_new(const SerdWorld* const world, + const SerdSink* const target, + const SerdCanonFlags flags) +{ + SerdCanonData* const data = (SerdCanonData*)calloc(1, sizeof(SerdCanonData)); + + data->world = world; + data->target = target; + data->flags = flags; + + return serd_sink_new(data, (SerdEventFunc)serd_canon_on_event, free); +} diff --git a/src/serdi.c b/src/serdi.c index 97601b83..243be98e 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -56,6 +56,7 @@ print_usage(const char* const name, const bool error) fprintf(os, "Usage: %s [OPTION]... INPUT...\n", name); fprintf(os, "Read and write RDF syntax.\n"); fprintf(os, "Use - for INPUT to read from standard input.\n\n"); + fprintf(os, " -C Convert literals to canonical form.\n"); fprintf(os, " -I BASE_URI Input base URI.\n"); fprintf(os, " -a Write ASCII output if possible.\n"); fprintf(os, " -b Fast bulk output for large serialisations.\n"); @@ -153,6 +154,7 @@ main(int argc, char** argv) bool no_inline = false; bool osyntax_set = false; bool use_model = false; + bool canonical = false; bool quiet = false; size_t stack_size = 4194304; const char* input_string = NULL; @@ -169,7 +171,9 @@ main(int argc, char** argv) for (int o = 1; argv[a][o]; ++o) { const char opt = argv[a][o]; - if (opt == 'a') { + if (opt == 'C') { + canonical = true; + } else if (opt == 'a') { writer_flags |= SERD_WRITE_ASCII; } else if (opt == 'b') { bulk_write = true; @@ -337,7 +341,7 @@ main(int argc, char** argv) SerdModel* model = NULL; SerdSink* inserter = NULL; - const SerdSink* sink = NULL; + const SerdSink* out_sink = NULL; if (use_model) { const SerdModelFlags flags = (input_has_graphs ? SERD_STORE_GRAPHS : 0u); @@ -354,9 +358,16 @@ main(int argc, char** argv) } inserter = serd_inserter_new(model, NULL); - sink = inserter; + out_sink = inserter; } else { - sink = serd_writer_sink(writer); + out_sink = serd_writer_sink(writer); + } + + const SerdSink* sink = out_sink; + + SerdSink* canon = NULL; + if (canonical) { + sink = canon = serd_canon_new(world, out_sink, reader_flags); } if (quiet) { @@ -455,6 +466,7 @@ main(int argc, char** argv) serd_cursor_free(everything); } + serd_sink_free(canon); serd_sink_free(inserter); serd_model_free(model); serd_writer_free(writer); diff --git a/src/string.c b/src/string.c index 11b53050..6942b7b6 100644 --- a/src/string.c +++ b/src/string.c @@ -62,6 +62,8 @@ serd_strerror(const SerdStatus status) return "Invalid or unresolved URI"; case SERD_ERR_BAD_INDEX: return "No optimal model index available"; + case SERD_ERR_INVALID: + return "Invalid data"; } return "Unknown error"; diff --git a/src/string_utils.h b/src/string_utils.h index 54f7877c..5cf7ba8c 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -60,7 +60,7 @@ is_xdigit(const int c) } static inline bool -is_space(const char c) +is_space(const int c) { switch (c) { case ' ': @@ -89,16 +89,16 @@ is_windows_path(const char* path) } static inline char -serd_to_upper(const char c) +serd_to_lower(const char c) { - return (char)((c >= 'a' && c <= 'z') ? c - 32 : c); + return (char)((c >= 'A' && c <= 'Z') ? c + 32 : c); } static inline int serd_strncasecmp(const char* s1, const char* s2, size_t n) { for (; n > 0 && *s2; s1++, s2++, --n) { - if (serd_to_upper(*s1) != serd_to_upper(*s2)) { + if (serd_to_lower(*s1) != serd_to_lower(*s2)) { return ((*(const uint8_t*)s1 < *(const uint8_t*)s2) ? -1 : +1); } } -- cgit v1.2.1