diff options
author | David Robillard <d@drobilla.net> | 2019-10-14 23:26:41 +0200 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2020-10-27 13:13:59 +0100 |
commit | 80fb6d0ff7c093466ac70b38be5676b868516c08 (patch) | |
tree | 9589dad1cae377a3c7e11aa7983106ac9d24afd0 /src | |
parent | 7f1d50b40814db24573b9eb425566ce1d44d2e85 (diff) | |
download | serd-80fb6d0ff7c093466ac70b38be5676b868516c08.tar.gz serd-80fb6d0ff7c093466ac70b38be5676b868516c08.tar.bz2 serd-80fb6d0ff7c093466ac70b38be5676b868516c08.zip |
Add support for basic literal normalisation
Diffstat (limited to 'src')
-rw-r--r-- | src/normalise.c | 273 | ||||
-rw-r--r-- | src/serdi.c | 18 | ||||
-rw-r--r-- | src/string_utils.h | 2 |
3 files changed, 289 insertions, 4 deletions
diff --git a/src/normalise.c b/src/normalise.c new file mode 100644 index 00000000..34f97f71 --- /dev/null +++ b/src/normalise.c @@ -0,0 +1,273 @@ +/* + Copyright 2019-2020 David Robillard <http://drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "namespaces.h" +#include "node.h" +#include "statement.h" +#include "string_utils.h" + +#include "serd/serd.h" + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +typedef struct +{ + const SerdEnv* env; + const SerdSink* target; +} SerdNormaliserData; + +/// Return true iff `c` is "+" or "-" +static inline bool +is_sign(const int c) +{ + return c == '+' || c == '-'; +} + +/// Return true iff `c` is "0" +static inline bool +is_zero(const int c) +{ + return c == '0'; +} + +/// Return true iff `c` is "." +static inline bool +is_point(const int c) +{ + return c == '.'; +} + +/// Return a view of `buf` with leading and trailing whitespace trimmed +static SerdStringView +trim(const char* buf, const size_t len) +{ + SerdStringView view = {buf, len}; + + while (view.len > 0 && is_space(*view.buf)) { + ++view.buf; + --view.len; + } + + while (is_space(view.buf[view.len - 1])) { + --view.len; + } + + return view; +} + +/// Scan `s` forwards as long as `pred` is true for the character it points at +static inline const char* +scan(const char** s, bool (*pred)(const int)) +{ + while (pred(**s)) { + ++(*s); + } + + return *s; +} + +/// Skip `s` forward once if `pred` is true for the character it points at +static inline const char** +skip(const char** s, bool (*pred)(const int)) +{ + *s += pred(**s); + return s; +} + +static SerdNode* +serd_normalise_decimal(const char* str) +{ + const char* s = str; // Cursor + const char* sign = scan(&s, is_space); // Sign + const char* first = scan(skip(&s, is_sign), is_zero); // First non-zero + const char* point = scan(&s, is_digit); // Decimal point + const char* last = scan(skip(&s, is_point), is_digit); // Last digit + const char* end = scan(&s, is_space); // Last non-space + + if (*end != '\0') { + return NULL; + } else if (*point == '.') { + while (*(last - 1) == '0') { + --last; + } + } + + char* buf = (char*)calloc(1, (size_t)(end - sign) + 4u); + char* b = buf; + if (*sign == '-') { + *b++ = '-'; + } + + if (*first == '.' || first == last) { + *b++ = '0'; // Add missing leading zero (before point) + } + + memcpy(b, first, (size_t)(last - first)); + b += last - first; + + if (*point != '.') { + *b++ = '.'; + *b++ = '0'; + } else if (point == last - 1) { + *b++ = '0'; // Add missing trailing zero (after point) + } + + const char* const datatype = NS_XSD "decimal"; + SerdNode* node = serd_new_literal( + buf, (size_t)(b - buf), datatype, strlen(datatype), NULL, 0); + + free(buf); + return node; +} + +static SerdNode* +serd_normalise_integer(const char* str, const SerdNode* datatype) +{ + const char* s = str; // Cursor + const char* sign = scan(&s, is_space); // Sign + const char* first = scan(skip(&s, is_sign), is_zero); // First non-zero + const char* last = scan(&s, is_digit); // Last digit + const char* end = scan(&s, is_space); // Last non-space + + if (*end != '\0') { + return NULL; + } + + char* const buf = (char*)calloc(1, (size_t)(end - sign) + 2u); + char* b = buf; + if (*sign == '-') { + *b++ = '-'; + } + + if (first == last) { + *b = '0'; + } else { + memcpy(b, first, (size_t)(last - first)); + } + + SerdNode* node = serd_new_typed_literal(buf, datatype); + + free(buf); + return node; +} + +SerdNode* +serd_node_normalise(const SerdEnv* env, const SerdNode* const node) +{ +#define INTEGER_TYPE_LEN 19 + + static const char int_types[13][INTEGER_TYPE_LEN] = {"byte", + "int", + "integer", + "long", + "negativeInteger", + "nonNegativeInteger", + "nonPositiveInteger", + "positiveInteger", + "short", + "unsignedByte", + "unsignedInt", + "unsignedLong", + "unsignedShort"}; + + const char* str = serd_node_string(node); + SerdNode* datatype = serd_env_expand(env, serd_node_datatype(node)); + if (node->type != SERD_LITERAL || !datatype) { + return NULL; + } + + const char* datatype_uri = serd_node_string(datatype); + SerdNode* result = NULL; + if (!strcmp(datatype_uri, NS_XSD "boolean")) { + const SerdStringView trimmed = trim(str, serd_node_length(node)); + if (trimmed.len) { + if (!strncmp(trimmed.buf, "false", trimmed.len) || + !strncmp(trimmed.buf, "0", trimmed.len)) { + result = serd_new_boolean(false); + } else if (!strncmp(trimmed.buf, "true", trimmed.len) || + !strncmp(trimmed.buf, "1", trimmed.len)) { + result = serd_new_boolean(true); + } + } + } else if (!strcmp(datatype_uri, NS_XSD "float")) { + result = serd_new_float((float)serd_strtod(str, NULL)); + } else if (!strcmp(datatype_uri, NS_XSD "double")) { + result = serd_new_double(serd_strtod(str, NULL)); + } else if (!strcmp(datatype_uri, NS_XSD "decimal")) { + result = serd_normalise_decimal(str); + } else if (!strncmp(datatype_uri, NS_XSD, strlen(NS_XSD)) && + bsearch(datatype_uri + strlen(NS_XSD), + &int_types, + sizeof(int_types) / INTEGER_TYPE_LEN, + INTEGER_TYPE_LEN, + (int (*)(const void*, const void*))strcmp)) { + result = serd_normalise_integer(str, datatype); + } + + serd_node_free(datatype); + return result; +} + +static SerdStatus +serd_normaliser_on_statement(SerdNormaliserData* data, + SerdStatementFlags flags, + const SerdStatement* statement) +{ + const SerdNode* object = serd_statement_object(statement); + SerdNode* normo = serd_node_normalise(data->env, object); + + if (normo) { + const SerdStatus st = serd_sink_write(data->target, + flags, + statement->nodes[0], + statement->nodes[1], + normo, + statement->nodes[3]); + + serd_node_free(normo); + return st; + } + + return serd_sink_write_statement(data->target, flags, statement); +} + +static SerdStatus +serd_normaliser_on_event(SerdNormaliserData* data, const SerdEvent* event) +{ + return (event->type == SERD_STATEMENT) + ? serd_normaliser_on_statement(data, + event->statement.flags, + event->statement.statement) + : serd_sink_write_event(data->target, event); +} + +SerdSink* +serd_normaliser_new(const SerdSink* target, const SerdEnv* env) +{ + SerdNormaliserData* data = + (SerdNormaliserData*)calloc(1, sizeof(SerdNormaliserData)); + + data->env = env; + data->target = target; + + SerdSink* sink = serd_sink_new(data, free); + + serd_sink_set_event_func(sink, (SerdEventFunc)serd_normaliser_on_event); + + return sink; +} diff --git a/src/serdi.c b/src/serdi.c index c3127e8c..7f4880fd 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -63,6 +63,7 @@ print_usage(const char* name, bool error) fprintf(os, " -k BYTES Parser stack size.\n"); fprintf(os, " -l Lax (non-strict) parsing.\n"); fprintf(os, " -m Build and serialise a model (no streaming).\n"); + fprintf(os, " -n Normalise literals.\n"); fprintf(os, " -o SYNTAX Output syntax: turtle/ntriples/nquads.\n"); fprintf(os, " -p PREFIX Add PREFIX to blank node IDs.\n"); fprintf(os, " -q Suppress all output except data.\n"); @@ -138,6 +139,7 @@ main(int argc, char** argv) bool osyntax_set = false; bool validate = false; bool use_model = false; + bool normalise = false; bool quiet = false; size_t stack_size = 4194304; const char* input_string = NULL; @@ -170,6 +172,8 @@ main(int argc, char** argv) writer_flags |= SERD_WRITE_LAX; } else if (argv[a][1] == 'm') { use_model = true; + } else if (argv[a][1] == 'n') { + normalise = true; } else if (argv[a][1] == 'q') { quiet = true; } else if (argv[a][1] == 'v') { @@ -271,7 +275,7 @@ main(int argc, char** argv) SerdModel* model = NULL; SerdSink* inserter = NULL; - const SerdSink* sink = NULL; + const SerdSink* out_sink = NULL; if (use_model) { const SerdModelFlags flags = SERD_INDEX_SPO | (input_has_graphs ? SERD_INDEX_GRAPHS : 0u) | @@ -280,9 +284,16 @@ main(int argc, char** argv) model = serd_model_new(world, flags); inserter = serd_inserter_new(model, env, NULL); - sink = inserter; + out_sink = inserter; } else { - sink = serd_writer_get_sink(writer); + out_sink = serd_writer_get_sink(writer); + } + + const SerdSink* sink = out_sink; + + SerdSink* normaliser = NULL; + if (normalise) { + sink = normaliser = serd_normaliser_new(out_sink, env); } if (quiet) { @@ -359,6 +370,7 @@ main(int argc, char** argv) serd_range_free(range); } + serd_sink_free(normaliser); serd_node_free(input_name); serd_sink_free(inserter); serd_model_free(model); diff --git a/src/string_utils.h b/src/string_utils.h index 4bd36721..3f3d8c12 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -64,7 +64,7 @@ is_xdigit(const int c) } static inline bool -is_space(const char c) +is_space(const int c) { switch (c) { case ' ': case '\f': case '\n': case '\r': case '\t': case '\v': |