diff options
author | David Robillard <d@drobilla.net> | 2023-12-02 15:53:25 -0500 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 339f9d90d1fe001978d15e1c007a3861a7145453 (patch) | |
tree | d29ec45610ad04a79dd353fbb2609bbb2931148f | |
parent | 4711fdf527f416faee8ff19e15f050d4b48dcfb2 (diff) | |
download | serd-339f9d90d1fe001978d15e1c007a3861a7145453.tar.gz serd-339f9d90d1fe001978d15e1c007a3861a7145453.tar.bz2 serd-339f9d90d1fe001978d15e1c007a3861a7145453.zip |
[WIP] Add support for converting literals to canonical form
-rw-r--r-- | NEWS | 1 | ||||
-rw-r--r-- | doc/man/serd-pipe.1 | 14 | ||||
-rw-r--r-- | include/serd/canon.h | 47 | ||||
-rw-r--r-- | include/serd/serd.h | 1 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | src/canon.c | 220 | ||||
-rw-r--r-- | src/string_utils.h | 8 | ||||
-rw-r--r-- | test/extra/canon/bad-boolean.ttl | 5 | ||||
-rw-r--r-- | test/extra/canon/bad-decimal-leading.ttl | 4 | ||||
-rw-r--r-- | test/extra/canon/bad-decimal-trailing.ttl | 4 | ||||
-rw-r--r-- | test/extra/canon/bad-empty-boolean.ttl | 5 | ||||
-rw-r--r-- | test/extra/canon/bad-integer-leading.ttl | 4 | ||||
-rw-r--r-- | test/extra/canon/bad-integer-trailing.ttl | 4 | ||||
-rw-r--r-- | test/extra/canon/bad-lang-long.ttl | 3 | ||||
-rw-r--r-- | test/extra/canon/manifest.ttl | 65 | ||||
-rw-r--r-- | test/extra/canon/test-canon.nt | 70 | ||||
-rw-r--r-- | test/extra/canon/test-canon.ttl | 76 | ||||
-rw-r--r-- | test/meson.build | 9 | ||||
-rw-r--r-- | test/test_canon.c | 103 | ||||
-rw-r--r-- | tools/serd-pipe.c | 25 |
20 files changed, 657 insertions, 13 deletions
@@ -3,6 +3,7 @@ serd (1.1.1) unstable; urgency=medium * Add SerdBuffer for mutable buffers to keep SerdChunk const-correct * Add SerdWorld for shared library state * Add extensible logging API + * Add support for converting literals to canonical form * Add support for parsing variables * Add support for writing terse output with minimal newlines * Add support for xsd:float and xsd:double literals diff --git a/doc/man/serd-pipe.1 b/doc/man/serd-pipe.1 index 793737f9..d731f0b4 100644 --- a/doc/man/serd-pipe.1 +++ b/doc/man/serd-pipe.1 @@ -8,7 +8,7 @@ .Nd read and write RDF data .Sh SYNOPSIS .Nm serd-pipe -.Op Fl afhlqtvx +.Op Fl Cafhlqtvx .Op Fl B Ar base .Op Fl b Ar bytes .Op Fl c Ar prefix @@ -68,6 +68,18 @@ When the input is a file, the URI of the file is automatically used as the base URI. This option can be used to override that, or to provide a base URI for input from stdin or a string. +.It Fl C +Convert literals to canonical form. +Literals with supported XSD datatypes will be parsed and rewritten canonically. +Invalid literals will cause an error. +All numeric datatypes are supported, as well as +.Vt boolean , +.Vt duration , +.Vt dateTime , +.Vt time , +.Vt hexBinary , +and +.Vt base64Binary . .It Fl a Write ASCII output. If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8. 
diff --git a/include/serd/canon.h b/include/serd/canon.h new file mode 100644 index 00000000..862a4db0 --- /dev/null +++ b/include/serd/canon.h @@ -0,0 +1,47 @@ +// Copyright 2011-2022 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#ifndef SERD_CANON_H +#define SERD_CANON_H + +#include "serd/attributes.h" +#include "serd/sink.h" +#include "serd/world.h" +#include "zix/attributes.h" + +#include <stdint.h> + +SERD_BEGIN_DECLS + +/** + @defgroup serd_canon Canon + @ingroup serd_streaming + @{ +*/ + +/// Flags that control canonical node transformation +typedef enum { + SERD_CANON_LAX = 1U << 0U, ///< Tolerate and pass through invalid input +} SerdCanonFlag; + +/// Bitwise OR of SerdCanonFlag values +typedef uint32_t SerdCanonFlags; + +/** + Return a new sink that transforms literals to canonical form where possible. + + The returned sink acts like `target` in all respects, except literal nodes + in statements may be modified from the original. +*/ +SERD_API SerdSink* ZIX_ALLOCATED +serd_canon_new(const SerdWorld* ZIX_NONNULL world, + const SerdSink* ZIX_NONNULL target, + SerdCanonFlags flags); + +/** + @} +*/ + +SERD_END_DECLS + +#endif // SERD_CANON_H diff --git a/include/serd/serd.h b/include/serd/serd.h index 88be5daa..d264192f 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -69,6 +69,7 @@ @{ */ +#include "serd/canon.h" #include "serd/env.h" #include "serd/event.h" #include "serd/sink.h" diff --git a/meson.build b/meson.build index 7bd2e560..82dc839b 100644 --- a/meson.build +++ b/meson.build @@ -129,6 +129,7 @@ include_dirs = include_directories('include') c_headers = files( 'include/serd/attributes.h', 'include/serd/buffer.h', + 'include/serd/canon.h', 'include/serd/caret.h', 'include/serd/env.h', 'include/serd/event.h', @@ -158,6 +159,7 @@ sources = files( 'src/block_dumper.c', 'src/buffer.c', 'src/byte_source.c', + 'src/canon.c', 'src/caret.c', 'src/env.c', 'src/input_stream.c', diff --git a/src/canon.c b/src/canon.c new file 
mode 100644 index 00000000..c0ce8ef4 --- /dev/null +++ b/src/canon.c @@ -0,0 +1,220 @@ +// Copyright 2019-2022 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#include "caret.h" // IWYU pragma: keep +#include "memory.h" +#include "namespaces.h" +#include "node.h" +#include "statement.h" // IWYU pragma: keep +#include "string_utils.h" + +#include "exess/exess.h" +#include "serd/canon.h" +#include "serd/caret.h" +#include "serd/event.h" +#include "serd/log.h" +#include "serd/memory.h" +#include "serd/node.h" +#include "serd/sink.h" +#include "serd/statement.h" +#include "serd/status.h" +#include "serd/string_view.h" +#include "serd/world.h" +#include "zix/attributes.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +typedef struct { + const SerdWorld* world; + const SerdSink* target; + SerdCanonFlags flags; +} SerdCanonData; + +static ExessResult +build_typed(SerdAllocator* const ZIX_NONNULL allocator, + SerdNode** const out, + const SerdNode* const ZIX_NONNULL node, + const SerdNode* const ZIX_NONNULL datatype) +{ + *out = NULL; + + const char* str = serd_node_string(node); + const char* datatype_uri = serd_node_string(datatype); + ExessResult r = {EXESS_SUCCESS, 0}; + + if (!strcmp(datatype_uri, NS_RDF "langString")) { + *out = + serd_node_new(allocator, serd_a_string_view(serd_node_string_view(node))); + return r; + } + + const ExessDatatype value_type = exess_datatype_from_uri(datatype_uri); + if (value_type == EXESS_NOTHING) { + return r; + } + + // Measure canonical form to know how much space to allocate for node + if ((r = exess_write_canonical(str, value_type, 0, NULL)).status) { + return r; + } + + // Allocate node + const size_t datatype_uri_len = serd_node_length(datatype); + const size_t datatype_size = serd_node_total_size(datatype); + const size_t len = serd_node_pad_length(r.count); + const size_t total_len = sizeof(SerdNode) + len + datatype_size; + SerdNode* const result = 
serd_node_malloc(allocator, total_len); + if (!result) { + r.status = EXESS_NO_SPACE; + return r; + } + + result->length = r.count; + result->flags = SERD_HAS_DATATYPE; + result->type = SERD_LITERAL; + + // Write canonical form directly into node + exess_write_canonical(str, value_type, r.count + 1, serd_node_buffer(result)); + + SerdNode* const datatype_node = result + 1 + (len / sizeof(SerdNode)); + char* const datatype_buf = serd_node_buffer(datatype_node); + + datatype_node->length = datatype_uri_len; + datatype_node->type = SERD_URI; + memcpy(datatype_buf, datatype_uri, datatype_uri_len + 1); + + *out = result; + return r; +} + +static ExessResult +build_tagged(SerdAllocator* const ZIX_NONNULL allocator, + SerdNode** const out, + const SerdNode* const ZIX_NONNULL node, + const SerdNode* const ZIX_NONNULL language) +{ +#define MAX_LANG_LEN 48 // RFC5646 requires 35, RFC4646 recommends 42 + + const size_t node_len = serd_node_length(node); + const char* const lang = serd_node_string(language); + const size_t lang_len = serd_node_length(language); + if (lang_len > MAX_LANG_LEN) { + const ExessResult r = {EXESS_NO_SPACE, node_len}; + return r; + } + + // Convert language tag to lower-case + char canonical_lang[MAX_LANG_LEN] = {0}; + for (size_t i = 0U; i < lang_len; ++i) { + canonical_lang[i] = serd_to_lower(lang[i]); + } + + // Make a new literal that is otherwise identical + *out = + serd_node_new(allocator, + serd_a_literal(serd_node_string_view(node), + serd_node_flags(node), + serd_substring(canonical_lang, lang_len))); + + const ExessResult r = {EXESS_SUCCESS, node_len}; + return r; + +#undef MAX_LANG_LEN +} + +static SerdStatus +serd_canon_on_statement(SerdCanonData* const data, + const SerdStatementFlags flags, + const SerdStatement* const statement) +{ + SerdAllocator* const allocator = serd_world_allocator(data->world); + const SerdNode* const object = serd_statement_object(statement); + const SerdNode* const datatype = serd_node_datatype(object); + 
const SerdNode* const language = serd_node_language(object); + if (!datatype && !language) { + return serd_sink_write_statement(data->target, flags, statement); + } + + SerdNode* normo = NULL; + const ExessResult r = datatype + ? build_typed(allocator, &normo, object, datatype) + : build_tagged(allocator, &normo, object, language); + + if (r.status) { + SerdCaret caret = {NULL, 0U, 0U}; + const bool lax = (data->flags & SERD_CANON_LAX); + + if (statement->caret) { + // Adjust column to point at the error within the literal + caret.document = statement->caret->document; + caret.line = statement->caret->line; + caret.col = statement->caret->col + 1 + (unsigned)r.count; + } + + serd_logf_at(data->world, + lax ? SERD_LOG_LEVEL_WARNING : SERD_LOG_LEVEL_ERROR, + statement->caret ? &caret : NULL, + "invalid literal (%s)", + exess_strerror(r.status)); + + if (!lax) { + return r.status == EXESS_NO_SPACE ? SERD_BAD_ALLOC : SERD_BAD_LITERAL; + } + } + + if (!normo) { + return serd_sink_write_statement(data->target, flags, statement); + } + + const SerdStatus st = serd_sink_write(data->target, + flags, + statement->nodes[0], + statement->nodes[1], + normo, + statement->nodes[3]); + serd_node_free(allocator, normo); + return st; +} + +static SerdStatus +serd_canon_on_event(void* const handle, const SerdEvent* const event) +{ + SerdCanonData* const data = (SerdCanonData*)handle; + + return (event->type == SERD_STATEMENT) + ? 
serd_canon_on_statement( + data, event->statement.flags, event->statement.statement) + : serd_sink_write_event(data->target, event); +} + +SerdSink* +serd_canon_new(const SerdWorld* const world, + const SerdSink* const target, + const SerdCanonFlags flags) +{ + assert(world); + assert(target); + + SerdCanonData* const data = + (SerdCanonData*)serd_wcalloc(world, 1, sizeof(SerdCanonData)); + + if (!data) { + return NULL; + } + + data->world = world; + data->target = target; + data->flags = flags; + + SerdSink* const sink = + serd_sink_new(serd_world_allocator(world), data, serd_canon_on_event, free); + + if (!sink) { + serd_wfree(world, data); + } + + return sink; +} diff --git a/src/string_utils.h b/src/string_utils.h index 2517b270..3337f012 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -67,7 +67,7 @@ is_utf8_continuation(const uint8_t c) } static inline bool -is_space(const char c) +is_space(const int c) { switch (c) { case ' ': @@ -102,16 +102,16 @@ hex_digit_value(const uint8_t c) } static inline char -serd_to_upper(const char c) +serd_to_lower(const char c) { - return (char)((c >= 'a' && c <= 'z') ? c - 32 : c); + return (char)((c >= 'A' && c <= 'Z') ? c + 32 : c); } static inline int serd_strncasecmp(const char* s1, const char* s2, size_t n) { for (; n > 0 && *s2; s1++, s2++, --n) { - if (serd_to_upper(*s1) != serd_to_upper(*s2)) { + if (serd_to_lower(*s1) != serd_to_lower(*s2)) { return (*s1 < *s2) ? -1 : +1; } } diff --git a/test/extra/canon/bad-boolean.ttl b/test/extra/canon/bad-boolean.ttl new file mode 100644 index 00000000..c4fc3eb5 --- /dev/null +++ b/test/extra/canon/bad-boolean.ttl @@ -0,0 +1,5 @@ +@base <http://example.org/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] <boolean> " ja "^^xsd:boolean . 
+ diff --git a/test/extra/canon/bad-decimal-leading.ttl b/test/extra/canon/bad-decimal-leading.ttl new file mode 100644 index 00000000..0d18eac7 --- /dev/null +++ b/test/extra/canon/bad-decimal-leading.ttl @@ -0,0 +1,4 @@ +@base <http://example.org/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] <decimal> " junk 1234.5678 "^^xsd:decimal . diff --git a/test/extra/canon/bad-decimal-trailing.ttl b/test/extra/canon/bad-decimal-trailing.ttl new file mode 100644 index 00000000..10882ef5 --- /dev/null +++ b/test/extra/canon/bad-decimal-trailing.ttl @@ -0,0 +1,4 @@ +@base <http://example.org/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] <decimal> " 1234.5678 junk "^^xsd:decimal . diff --git a/test/extra/canon/bad-empty-boolean.ttl b/test/extra/canon/bad-empty-boolean.ttl new file mode 100644 index 00000000..9a390c46 --- /dev/null +++ b/test/extra/canon/bad-empty-boolean.ttl @@ -0,0 +1,5 @@ +@base <http://example.org/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] <boolean> ""^^xsd:boolean . + diff --git a/test/extra/canon/bad-integer-leading.ttl b/test/extra/canon/bad-integer-leading.ttl new file mode 100644 index 00000000..80c1a6af --- /dev/null +++ b/test/extra/canon/bad-integer-leading.ttl @@ -0,0 +1,4 @@ +@base <http://example.org/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] <integer> " junk 987654321 "^^xsd:integer . diff --git a/test/extra/canon/bad-integer-trailing.ttl b/test/extra/canon/bad-integer-trailing.ttl new file mode 100644 index 00000000..a94a9ec4 --- /dev/null +++ b/test/extra/canon/bad-integer-trailing.ttl @@ -0,0 +1,4 @@ +@base <http://example.org/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] <integer> " 987654321 junk "^^xsd:integer . diff --git a/test/extra/canon/bad-lang-long.ttl b/test/extra/canon/bad-lang-long.ttl new file mode 100644 index 00000000..f84df07f --- /dev/null +++ b/test/extra/canon/bad-lang-long.ttl @@ -0,0 +1,3 @@ +@base <http://example.org/> . 
+ +[] <tagged> "hello"@ridiculously-long-lang-tag-beyond-even-RFC4646-recommendation . diff --git a/test/extra/canon/manifest.ttl b/test/extra/canon/manifest.ttl new file mode 100644 index 00000000..143928ee --- /dev/null +++ b/test/extra/canon/manifest.ttl @@ -0,0 +1,65 @@ +@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdft: <http://www.w3.org/ns/rdftest#> . + +<> + a mf:Manifest ; + rdfs:comment "Serd canonical literal writing test suite" ; + mf:entries ( + <#bad-boolean> + <#bad-decimal-leading> + <#bad-decimal-trailing> + <#bad-empty-boolean> + <#bad-integer-leading> + <#bad-integer-trailing> + <#bad-lang-long> + <#test-canon> + ) . + +<#bad-boolean> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Invalid xsd:boolean syntax" ; + mf:action <bad-boolean.ttl> ; + mf:name "bad-boolean" . + +<#bad-decimal-leading> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Invalid xsd:decimal syntax (leading garbage)" ; + mf:action <bad-decimal-leading.ttl> ; + mf:name "bad-decimal-leading" . + +<#bad-decimal-trailing> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Invalid xsd:decimal syntax (trailing garbage)" ; + mf:action <bad-decimal-trailing.ttl> ; + mf:name "bad-decimal-trailing" . + +<#bad-empty-boolean> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Invalid xsd:boolean syntax (no value)" ; + mf:action <bad-empty-boolean.ttl> ; + mf:name "bad-empty-boolean" . + +<#bad-integer-leading> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Invalid xsd:integer syntax (leading garbage)" ; + mf:action <bad-integer-leading.ttl> ; + mf:name "bad-integer-leading" . + +<#bad-integer-trailing> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Invalid xsd:integer syntax (trailing garbage)" ; + mf:action <bad-integer-trailing.ttl> ; + mf:name "bad-integer-trailing" . 
+ +<#bad-lang-long> + a rdft:TestTurtleNegativeEval ; + rdfs:comment "Overly long language tag" ; + mf:action <bad-lang-long.ttl> ; + mf:name "bad-lang-long" . + +<#test-canon> + a rdft:TestTurtleEval ; + mf:action <test-canon.ttl> ; + mf:name "test-canon" ; + mf:result <test-canon.nt> . diff --git a/test/extra/canon/test-canon.nt b/test/extra/canon/test-canon.nt new file mode 100644 index 00000000..ff492890 --- /dev/null +++ b/test/extra/canon/test-canon.nt @@ -0,0 +1,70 @@ +_:b1 <http://example.org/boolean> "false"^^<http://www.w3.org/2001/XMLSchema#boolean> . +_:b1 <http://example.org/boolean> "false"^^<http://www.w3.org/2001/XMLSchema#boolean> . +_:b1 <http://example.org/boolean> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> . +_:b1 <http://example.org/boolean> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> . +_:b1 <http://example.org/ieee754> "1.0E2"^^<http://www.w3.org/2001/XMLSchema#float> . +_:b1 <http://example.org/ieee754> "-1.0E2"^^<http://www.w3.org/2001/XMLSchema#float> . +_:b1 <http://example.org/ieee754> "1.0E3"^^<http://www.w3.org/2001/XMLSchema#double> . +_:b1 <http://example.org/ieee754> "-1.0E3"^^<http://www.w3.org/2001/XMLSchema#double> . +_:b1 <http://example.org/machine> "9223372036854775807"^^<http://www.w3.org/2001/XMLSchema#long> . +_:b1 <http://example.org/machine> "-9223372036854775808"^^<http://www.w3.org/2001/XMLSchema#long> . +_:b1 <http://example.org/machine> "2147483647"^^<http://www.w3.org/2001/XMLSchema#int> . +_:b1 <http://example.org/machine> "-2147483648"^^<http://www.w3.org/2001/XMLSchema#int> . +_:b1 <http://example.org/machine> "32767"^^<http://www.w3.org/2001/XMLSchema#short> . +_:b1 <http://example.org/machine> "-32768"^^<http://www.w3.org/2001/XMLSchema#short> . +_:b1 <http://example.org/machine> "127"^^<http://www.w3.org/2001/XMLSchema#byte> . +_:b1 <http://example.org/machine> "-128"^^<http://www.w3.org/2001/XMLSchema#byte> . 
+_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedLong> . +_:b1 <http://example.org/machine> "18446744073709551615"^^<http://www.w3.org/2001/XMLSchema#unsignedLong> . +_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedInt> . +_:b1 <http://example.org/machine> "4294967295"^^<http://www.w3.org/2001/XMLSchema#unsignedInt> . +_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedShort> . +_:b1 <http://example.org/machine> "65535"^^<http://www.w3.org/2001/XMLSchema#unsignedShort> . +_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedByte> . +_:b1 <http://example.org/machine> "255"^^<http://www.w3.org/2001/XMLSchema#unsignedByte> . +_:b1 <http://example.org/decimal> "0.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "0.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-0.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "36893488147419103232.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . 
+_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-36893488147419103232.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/decimal> "-0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> . +_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> . +_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> . +_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> . +_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> . +_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> . +_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> . +_:b1 <http://example.org/integer> "0"^^<http://www.w3.org/2001/XMLSchema#nonPositiveInteger> . +_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#nonPositiveInteger> . +_:b1 <http://example.org/integer> "-1"^^<http://www.w3.org/2001/XMLSchema#negativeInteger> . +_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#negativeInteger> . +_:b1 <http://example.org/integer> "0"^^<http://www.w3.org/2001/XMLSchema#nonNegativeInteger> . 
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#nonNegativeInteger> . +_:b1 <http://example.org/integer> "1"^^<http://www.w3.org/2001/XMLSchema#positiveInteger> . +_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#positiveInteger> . +_:b1 <http://example.org/langString> "no language tag" . +_:b1 <http://example.org/taggedString> "english"@en-ca . +_:b1 <http://example.org/time> "P1Y6M"^^<http://www.w3.org/2001/XMLSchema#duration> . +_:b1 <http://example.org/time> "12:15:01Z"^^<http://www.w3.org/2001/XMLSchema#time> . +_:b1 <http://example.org/time> "2004-04-12Z"^^<http://www.w3.org/2001/XMLSchema#date> . +_:b1 <http://example.org/binary> "A1B7F080"^^<http://www.w3.org/2001/XMLSchema#hexBinary> . +_:b1 <http://example.org/binary> "Zm9vYmF="^^<http://www.w3.org/2001/XMLSchema#base64Binary> . +_:b1 <http://example.org/other> "untyped" . +_:b1 <http://example.org/other> <http://example.org/uri> . +_:b1 <http://example.org/other> "notxsd"^^<http://example.org/sometype> . +_:b1 <http://example.org/other> "unsupported"^^<http://www.w3.org/2001/XMLSchema#name> . diff --git a/test/extra/canon/test-canon.ttl b/test/extra/canon/test-canon.ttl new file mode 100644 index 00000000..0d0b4682 --- /dev/null +++ b/test/extra/canon/test-canon.ttl @@ -0,0 +1,76 @@ +@base <http://example.org/> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . 
+ +[ + <boolean> " false "^^xsd:boolean , + " 0 "^^xsd:boolean , + " true "^^xsd:boolean , + " 1 "^^xsd:boolean ; + <ieee754> " +0100.0 "^^xsd:float , + " -0100.0 "^^xsd:float , + " +01000.0 "^^xsd:double , + " -01000.0 "^^xsd:double ; + <machine> " +09223372036854775807 "^^xsd:long , + " -09223372036854775808 "^^xsd:long , + " +02147483647 "^^xsd:int , + " -02147483648 "^^xsd:int , + " +032767 "^^xsd:short , + " -032768 "^^xsd:short , + " +0127 "^^xsd:byte , + " -0128 "^^xsd:byte , + " 01 "^^xsd:unsignedLong , + " 018446744073709551615 "^^xsd:unsignedLong , + " 01 "^^xsd:unsignedInt , + " 04294967295 "^^xsd:unsignedInt , + " 01 "^^xsd:unsignedShort , + " 065535 "^^xsd:unsignedShort , + " 01 "^^xsd:unsignedByte , + " 0255 "^^xsd:unsignedByte ; + <decimal> " 00 "^^xsd:decimal , + " +0 "^^xsd:decimal , + " -0 "^^xsd:decimal , + " 36893488147419103232 "^^xsd:decimal , + " 0036893488147419103232 "^^xsd:decimal , + " +36893488147419103232 "^^xsd:decimal , + " +0036893488147419103232 "^^xsd:decimal , + " +0036893488147419103232. "^^xsd:decimal , + " +0036893488147419103232.00 "^^xsd:decimal , + " +0036893488147419103232.12300 "^^xsd:decimal , + " -36893488147419103232 "^^xsd:decimal , + " -0036893488147419103232 "^^xsd:decimal , + " -0036893488147419103232. 
"^^xsd:decimal , + " -0036893488147419103232.00 "^^xsd:decimal , + " -0036893488147419103232.12300 "^^xsd:decimal , + " 00.12300 "^^xsd:decimal , + " .12300 "^^xsd:decimal , + " +.12300 "^^xsd:decimal , + " +00.12300 "^^xsd:decimal , + " -.12300 "^^xsd:decimal , + " -00.12300 "^^xsd:decimal ; + <integer> " 36893488147419103232 "^^xsd:integer , + " 0036893488147419103232 "^^xsd:integer , + " +36893488147419103232 "^^xsd:integer , + " +0036893488147419103232 "^^xsd:integer , + " -36893488147419103232 "^^xsd:integer , + " -0036893488147419103232 "^^xsd:integer , + " 00 "^^xsd:nonPositiveInteger , + " -036893488147419103232 "^^xsd:nonPositiveInteger , + " -01 "^^xsd:negativeInteger , + " -036893488147419103232 "^^xsd:negativeInteger , + " 00 "^^xsd:nonNegativeInteger , + " 036893488147419103232 "^^xsd:nonNegativeInteger , + " +01 "^^xsd:positiveInteger , + " 036893488147419103232 "^^xsd:positiveInteger ; + <langString> "no language tag"^^rdf:langString ; + <taggedString> "english"@EN-CA ; + <time> " P1Y6M0D "^^xsd:duration , + " 12:15:01+00:00 "^^xsd:time , + " 02004-04-12+00:00 "^^xsd:date ; + <binary> "A 1 B7 F080"^^xsd:hexBinary , + " Zm 9v Y m F="^^xsd:base64Binary ; + <other> "untyped" , + <uri> , + "notxsd"^^<sometype> , + "unsupported"^^xsd:name +] . 
diff --git a/test/meson.build b/test/meson.build index 5c0ac7e8..24180efa 100644 --- a/test/meson.build +++ b/test/meson.build @@ -29,6 +29,7 @@ ttl_metadata_file_paths = [ 'extra/abbreviate/manifest.ttl', 'extra/bad/manifest.ttl', 'extra/big/manifest.ttl', + 'extra/canon/manifest.ttl', 'extra/full/manifest.ttl', 'extra/good/manifest.ttl', 'extra/lax/manifest.ttl', @@ -124,6 +125,7 @@ subdir('headers') ############## unit_tests = [ + 'canon', 'caret', 'env', 'free_null', @@ -444,6 +446,13 @@ test_suites = { '-a', ['-b', '1'], ], + 'canon': [ + files('extra/canon/manifest.ttl'), + ns_serdtest + 'canon/', + '--', + '-C', + '-a', + ], 'fast': [ files('extra/perfect/manifest.ttl'), ns_serdtest + 'perfect/', diff --git a/test/test_canon.c b/test/test_canon.c new file mode 100644 index 00000000..1a569664 --- /dev/null +++ b/test/test_canon.c @@ -0,0 +1,103 @@ +// Copyright 2021 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#undef NDEBUG + +#include "failing_allocator.h" + +#include "serd/canon.h" +#include "serd/event.h" +#include "serd/node.h" +#include "serd/sink.h" +#include "serd/status.h" +#include "serd/string_view.h" +#include "serd/world.h" + +#include <assert.h> +#include <stddef.h> + +static SerdStatus +ignore_event(void* handle, const SerdEvent* event) +{ + (void)handle; + (void)event; + return SERD_SUCCESS; +} + +static void +test_new_failed_alloc(void) +{ + SerdFailingAllocator allocator = serd_failing_allocator(); + + SerdWorld* const world = serd_world_new(&allocator.base); + + SerdSink* target = serd_sink_new(&allocator.base, NULL, ignore_event, NULL); + const size_t n_setup_allocs = allocator.n_allocations; + + // Successfully allocate a canon to count the number of allocations + SerdSink* canon = serd_canon_new(world, target, 0U); + assert(canon); + + // Test that each allocation failing is handled gracefully + const size_t n_new_allocs = allocator.n_allocations - n_setup_allocs; + for (size_t i = 0; i < n_new_allocs; ++i) { + 
allocator.n_remaining = i; + assert(!serd_canon_new(world, target, 0U)); + } + + serd_sink_free(canon); + serd_sink_free(target); + serd_world_free(world); +} + +static void +test_write_failed_alloc(void) +{ + const SerdStringView s_string = serd_string("http://example.org/s"); + const SerdStringView p_string = serd_string("http://example.org/p"); + const SerdStringView o_string = serd_string("012.340"); + const SerdStringView xsd_float = + serd_string("http://www.w3.org/2001/XMLSchema#float"); + + SerdFailingAllocator allocator = serd_failing_allocator(); + + SerdWorld* const world = serd_world_new(&allocator.base); + + const SerdNode* const s = + serd_node_new(&allocator.base, serd_a_uri(s_string)); + + const SerdNode* const p = + serd_node_new(&allocator.base, serd_a_uri(p_string)); + + const SerdNode* const o = + serd_node_new(&allocator.base, serd_a_typed_literal(o_string, xsd_float)); + + SerdSink* target = serd_sink_new(&allocator.base, NULL, ignore_event, NULL); + SerdSink* canon = serd_canon_new(world, target, 0U); + const size_t n_setup_allocs = allocator.n_allocations; + + // Successfully write statement to count the number of allocations + assert(canon); + assert(!serd_sink_write(canon, 0U, s, p, o, NULL)); + + // Test that each allocation failing is handled gracefully + const size_t n_new_allocs = allocator.n_allocations - n_setup_allocs; + for (size_t i = 0; i < n_new_allocs; ++i) { + allocator.n_remaining = i; + + const SerdStatus st = serd_sink_write(canon, 0U, s, p, o, NULL); + assert(st == SERD_BAD_ALLOC); + } + + serd_sink_free(canon); + serd_sink_free(target); + serd_world_free(world); +} + +int +main(void) +{ + test_new_failed_alloc(); + test_write_failed_alloc(); + return 0; +} diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c index 3f1c3f53..bdb871fa 100644 --- a/tools/serd-pipe.c +++ b/tools/serd-pipe.c @@ -3,6 +3,7 @@ #include "console.h" +#include "serd/canon.h" #include "serd/env.h" #include "serd/input_stream.h" #include "serd/log.h" @@ 
-37,6 +38,7 @@ print_usage(const char* const name, const bool error) "Read and write RDF syntax.\n" "Use - for INPUT to read from standard input.\n\n" " -B BASE_URI Base URI.\n" + " -C Convert literals to canonical form.\n" " -a Write ASCII output.\n" " -b BYTES I/O block size.\n" " -c PREFIX Chop PREFIX from matching blank node IDs.\n" @@ -115,6 +117,7 @@ main(int argc, char** argv) SerdReaderFlags reader_flags = 0; SerdWriterFlags writer_flags = 0; bool osyntax_set = false; + bool canonical = false; bool quiet = false; size_t block_size = 4096U; size_t stack_size = 1048576U; @@ -140,7 +143,9 @@ main(int argc, char** argv) for (int o = 1; argv[a][o]; ++o) { const char opt = argv[a][o]; - if (opt == 'a') { + if (opt == 'C') { + canonical = true; + } else if (opt == 'a') { writer_flags |= SERD_WRITE_ASCII; } else if (opt == 'f') { writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM); @@ -305,6 +310,13 @@ main(int argc, char** argv) SerdWriter* const writer = serd_writer_new(world, output_syntax, writer_flags, env, &out, block_size); + const SerdSink* sink = serd_writer_sink(writer); + + SerdSink* canon = NULL; + if (canonical) { + sink = canon = serd_canon_new(world, sink, reader_flags); + } + if (quiet) { serd_set_log_func(world, serd_quiet_log_func, NULL); } @@ -321,12 +333,8 @@ main(int argc, char** argv) const char* position = input_string; SerdInputStream string_in = serd_open_input_string(&position); - SerdReader* const reader = - serd_reader_new(world, - input_syntax ? input_syntax : SERD_TRIG, - reader_flags, - env, - serd_writer_sink(writer)); + SerdReader* const reader = serd_reader_new( + world, input_syntax ? input_syntax : SERD_TRIG, reader_flags, env, sink); serd_reader_add_blank_prefix(reader, add_prefix); @@ -361,7 +369,7 @@ main(int argc, char** argv) serd_choose_syntax(world, input_syntax, inputs[i]), reader_flags, env, - serd_writer_sink(writer), + sink, stack_size, inputs[i], n_inputs > 1 ? 
prefix : add_prefix, @@ -371,6 +379,7 @@ main(int argc, char** argv) } free(prefix); + serd_sink_free(canon); serd_writer_free(writer); serd_node_free(NULL, input_name); serd_env_free(env); |