aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2023-12-02 15:53:25 -0500
committerDavid Robillard <d@drobilla.net>2023-12-02 18:49:08 -0500
commit339f9d90d1fe001978d15e1c007a3861a7145453 (patch)
treed29ec45610ad04a79dd353fbb2609bbb2931148f
parent4711fdf527f416faee8ff19e15f050d4b48dcfb2 (diff)
downloadserd-339f9d90d1fe001978d15e1c007a3861a7145453.tar.gz
serd-339f9d90d1fe001978d15e1c007a3861a7145453.tar.bz2
serd-339f9d90d1fe001978d15e1c007a3861a7145453.zip
[WIP] Add support for converting literals to canonical form
-rw-r--r--NEWS1
-rw-r--r--doc/man/serd-pipe.114
-rw-r--r--include/serd/canon.h47
-rw-r--r--include/serd/serd.h1
-rw-r--r--meson.build2
-rw-r--r--src/canon.c220
-rw-r--r--src/string_utils.h8
-rw-r--r--test/extra/canon/bad-boolean.ttl5
-rw-r--r--test/extra/canon/bad-decimal-leading.ttl4
-rw-r--r--test/extra/canon/bad-decimal-trailing.ttl4
-rw-r--r--test/extra/canon/bad-empty-boolean.ttl5
-rw-r--r--test/extra/canon/bad-integer-leading.ttl4
-rw-r--r--test/extra/canon/bad-integer-trailing.ttl4
-rw-r--r--test/extra/canon/bad-lang-long.ttl3
-rw-r--r--test/extra/canon/manifest.ttl65
-rw-r--r--test/extra/canon/test-canon.nt70
-rw-r--r--test/extra/canon/test-canon.ttl76
-rw-r--r--test/meson.build9
-rw-r--r--test/test_canon.c103
-rw-r--r--tools/serd-pipe.c25
20 files changed, 657 insertions, 13 deletions
diff --git a/NEWS b/NEWS
index b82e70a9..a222c99b 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,7 @@ serd (1.1.1) unstable; urgency=medium
* Add SerdBuffer for mutable buffers to keep SerdChunk const-correct
* Add SerdWorld for shared library state
* Add extensible logging API
+ * Add support for converting literals to canonical form
* Add support for parsing variables
* Add support for writing terse output with minimal newlines
* Add support for xsd:float and xsd:double literals
diff --git a/doc/man/serd-pipe.1 b/doc/man/serd-pipe.1
index 793737f9..d731f0b4 100644
--- a/doc/man/serd-pipe.1
+++ b/doc/man/serd-pipe.1
@@ -8,7 +8,7 @@
.Nd read and write RDF data
.Sh SYNOPSIS
.Nm serd-pipe
-.Op Fl afhlqtvx
+.Op Fl Cafhlqtvx
.Op Fl B Ar base
.Op Fl b Ar bytes
.Op Fl c Ar prefix
@@ -68,6 +68,18 @@ When the input is a file,
the URI of the file is automatically used as the base URI.
This option can be used to override that,
or to provide a base URI for input from stdin or a string.
+.It Fl C
+Convert literals to canonical form.
+Literals with supported XSD datatypes will be parsed and rewritten canonically.
+Invalid literals will cause an error.
+All numeric datatypes are supported, as well as
+.Vt boolean ,
+.Vt duration ,
+.Vt dateTime ,
+.Vt time ,
+.Vt hexBinary ,
+and
+.Vt base64Binary .
.It Fl a
Write ASCII output.
If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8.
diff --git a/include/serd/canon.h b/include/serd/canon.h
new file mode 100644
index 00000000..862a4db0
--- /dev/null
+++ b/include/serd/canon.h
@@ -0,0 +1,47 @@
+// Copyright 2011-2022 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#ifndef SERD_CANON_H
+#define SERD_CANON_H
+
+#include "serd/attributes.h"
+#include "serd/sink.h"
+#include "serd/world.h"
+#include "zix/attributes.h"
+
+#include <stdint.h>
+
+SERD_BEGIN_DECLS
+
+/**
+ @defgroup serd_canon Canon
+ @ingroup serd_streaming
+ @{
+*/
+
+/// Flags that control how literals are converted to canonical form
+typedef enum {
+ SERD_CANON_LAX = 1U << 0U, ///< Tolerate invalid input: log a warning and pass the literal through unchanged
+} SerdCanonFlag;
+
+/// Bitwise OR of SerdCanonFlag values
+typedef uint32_t SerdCanonFlags;
+
+/**
+ Return a new sink that transforms literals to canonical form where possible.
+
+ The returned sink acts like `target` in all respects, except literal nodes
+ in statements may be modified from the original.
+
+ Unless #SERD_CANON_LAX is set, a statement with an invalid literal is not
+ passed on to `target`, and an error is logged and returned instead.
+
+ @param world World used for allocation and for logging invalid literals.
+ @param target Sink that receives all events, with literals canonicalized.
+ @param flags Bitwise OR of SerdCanonFlag values.
+
+ @return A new sink which must be freed with serd_sink_free(), or null if
+ allocation fails.
+*/
+SERD_API SerdSink* ZIX_ALLOCATED
+serd_canon_new(const SerdWorld* ZIX_NONNULL world,
+ const SerdSink* ZIX_NONNULL target,
+ SerdCanonFlags flags);
+
+/**
+ @}
+*/
+
+SERD_END_DECLS
+
+#endif // SERD_CANON_H
diff --git a/include/serd/serd.h b/include/serd/serd.h
index 88be5daa..d264192f 100644
--- a/include/serd/serd.h
+++ b/include/serd/serd.h
@@ -69,6 +69,7 @@
@{
*/
+#include "serd/canon.h"
#include "serd/env.h"
#include "serd/event.h"
#include "serd/sink.h"
diff --git a/meson.build b/meson.build
index 7bd2e560..82dc839b 100644
--- a/meson.build
+++ b/meson.build
@@ -129,6 +129,7 @@ include_dirs = include_directories('include')
c_headers = files(
'include/serd/attributes.h',
'include/serd/buffer.h',
+ 'include/serd/canon.h',
'include/serd/caret.h',
'include/serd/env.h',
'include/serd/event.h',
@@ -158,6 +159,7 @@ sources = files(
'src/block_dumper.c',
'src/buffer.c',
'src/byte_source.c',
+ 'src/canon.c',
'src/caret.c',
'src/env.c',
'src/input_stream.c',
diff --git a/src/canon.c b/src/canon.c
new file mode 100644
index 00000000..c0ce8ef4
--- /dev/null
+++ b/src/canon.c
@@ -0,0 +1,220 @@
+// Copyright 2019-2022 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#include "caret.h" // IWYU pragma: keep
+#include "memory.h"
+#include "namespaces.h"
+#include "node.h"
+#include "statement.h" // IWYU pragma: keep
+#include "string_utils.h"
+
+#include "exess/exess.h"
+#include "serd/canon.h"
+#include "serd/caret.h"
+#include "serd/event.h"
+#include "serd/log.h"
+#include "serd/memory.h"
+#include "serd/node.h"
+#include "serd/sink.h"
+#include "serd/statement.h"
+#include "serd/status.h"
+#include "serd/string_view.h"
+#include "serd/world.h"
+#include "zix/attributes.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+ const SerdWorld* world; ///< World used for allocation and logging
+ const SerdSink* target; ///< Sink that receives the (possibly rewritten) events
+ SerdCanonFlags flags; ///< Bitwise OR of SerdCanonFlag values
+} SerdCanonData;
+
+/**
+   Build a canonical copy of a literal that has a datatype.
+
+   On return, `out` points to a newly allocated node which the caller must
+   free, or is null if no rewritten node was produced (because the datatype is
+   not supported, the literal is invalid, or allocation failed).
+
+   @param allocator Allocator used for the new node.
+   @param out Set to the new canonical node, or null.
+   @param node The original literal node.
+   @param datatype The datatype node of `node`.
+
+   @return A result with status #EXESS_SUCCESS if the literal was rewritten or
+   should simply be passed through, #EXESS_NO_SPACE if allocation failed, or
+   the error from exess if the literal could not be converted.
+*/
+static ExessResult
+build_typed(SerdAllocator* const ZIX_NONNULL  allocator,
+            SerdNode** const                  out,
+            const SerdNode* const ZIX_NONNULL node,
+            const SerdNode* const ZIX_NONNULL datatype)
+{
+  *out = NULL;
+
+  const char* str          = serd_node_string(node);
+  const char* datatype_uri = serd_node_string(datatype);
+  ExessResult r            = {EXESS_SUCCESS, 0};
+
+  if (!strcmp(datatype_uri, NS_RDF "langString")) {
+    // Rewrite as a plain string node built from the literal's text only
+    *out =
+      serd_node_new(allocator, serd_a_string_view(serd_node_string_view(node)));
+
+    if (!*out) {
+      // Report the failure instead of silently passing the original through
+      r.status = EXESS_NO_SPACE;
+    }
+
+    return r;
+  }
+
+  // Unsupported datatypes are left alone (success with a null output)
+  const ExessDatatype value_type = exess_datatype_from_uri(datatype_uri);
+  if (value_type == EXESS_NOTHING) {
+    return r;
+  }
+
+  // Measure canonical form to know how much space to allocate for node
+  if ((r = exess_write_canonical(str, value_type, 0, NULL)).status) {
+    return r;
+  }
+
+  // Allocate node with room for the padded string and the datatype node
+  const size_t    datatype_uri_len = serd_node_length(datatype);
+  const size_t    datatype_size    = serd_node_total_size(datatype);
+  const size_t    len              = serd_node_pad_length(r.count);
+  const size_t    total_len        = sizeof(SerdNode) + len + datatype_size;
+  SerdNode* const result           = serd_node_malloc(allocator, total_len);
+  if (!result) {
+    r.status = EXESS_NO_SPACE;
+    return r;
+  }
+
+  result->length = r.count;
+  result->flags  = SERD_HAS_DATATYPE;
+  result->type   = SERD_LITERAL;
+
+  // Write canonical form directly into node (same input as measured above)
+  exess_write_canonical(str, value_type, r.count + 1, serd_node_buffer(result));
+
+  // The datatype node lives after the header and the padded string body
+  SerdNode* const datatype_node = result + 1 + (len / sizeof(SerdNode));
+  char* const     datatype_buf  = serd_node_buffer(datatype_node);
+
+  datatype_node->length = datatype_uri_len;
+  datatype_node->type   = SERD_URI;
+  memcpy(datatype_buf, datatype_uri, datatype_uri_len + 1);
+
+  *out = result;
+  return r;
+}
+
+/**
+   Build a copy of a language-tagged literal with a lower-case language tag.
+
+   On return, `out` points to a newly allocated node which the caller must
+   free, or is null if the tag is too long or allocation failed.
+
+   @param allocator Allocator used for the new node.
+   @param out Set to the new canonical node, or null.
+   @param node The original literal node.
+   @param language The language node of `node`.
+
+   @return A result with status #EXESS_SUCCESS on success, or #EXESS_NO_SPACE
+   if the tag is longer than the supported maximum or allocation failed.  The
+   count is always the length of the literal's string.
+*/
+static ExessResult
+build_tagged(SerdAllocator* const ZIX_NONNULL  allocator,
+             SerdNode** const                  out,
+             const SerdNode* const ZIX_NONNULL node,
+             const SerdNode* const ZIX_NONNULL language)
+{
+#define MAX_LANG_LEN 48 // RFC5646 requires 35, RFC4646 recommends 42
+
+  *out = NULL;
+
+  const size_t      node_len = serd_node_length(node);
+  const char* const lang     = serd_node_string(language);
+  const size_t      lang_len = serd_node_length(language);
+  if (lang_len > MAX_LANG_LEN) {
+    const ExessResult r = {EXESS_NO_SPACE, node_len};
+    return r;
+  }
+
+  // Convert language tag to lower-case (no terminator needed, length is used)
+  char canonical_lang[MAX_LANG_LEN] = {0};
+  for (size_t i = 0U; i < lang_len; ++i) {
+    canonical_lang[i] = serd_to_lower(lang[i]);
+  }
+
+  // Make a new literal that is otherwise identical
+  *out =
+    serd_node_new(allocator,
+                  serd_a_literal(serd_node_string_view(node),
+                                 serd_node_flags(node),
+                                 serd_substring(canonical_lang, lang_len)));
+
+  // Report allocation failure instead of silently passing the original through
+  ExessResult r = {EXESS_SUCCESS, node_len};
+  if (!*out) {
+    r.status = EXESS_NO_SPACE;
+  }
+
+  return r;
+
+#undef MAX_LANG_LEN
+}
+
+/**
+   Handle a statement by rewriting its object in canonical form if possible.
+
+   Objects with neither a datatype nor a language are forwarded untouched.
+   If conversion fails, a message is logged at the statement's position, and
+   the statement is either rejected with an error status, or (in lax mode)
+   forwarded unchanged.
+*/
+static SerdStatus
+serd_canon_on_statement(SerdCanonData* const       data,
+                        const SerdStatementFlags   flags,
+                        const SerdStatement* const statement)
+{
+  SerdAllocator* const  allocator = serd_world_allocator(data->world);
+  const SerdNode* const object    = serd_statement_object(statement);
+  const SerdNode* const datatype  = serd_node_datatype(object);
+  const SerdNode* const language  = serd_node_language(object);
+
+  SerdNode* canonical = NULL;
+  if (datatype || language) {
+    ExessResult result;
+    if (datatype) {
+      result = build_typed(allocator, &canonical, object, datatype);
+    } else {
+      result = build_tagged(allocator, &canonical, object, language);
+    }
+
+    if (result.status) {
+      const bool tolerant = (data->flags & SERD_CANON_LAX) != 0U;
+
+      // Shift the column so the message points into the literal itself
+      SerdCaret position = {NULL, 0U, 0U};
+      if (statement->caret) {
+        position.document = statement->caret->document;
+        position.line     = statement->caret->line;
+        position.col = statement->caret->col + 1 + (unsigned)result.count;
+      }
+
+      serd_logf_at(data->world,
+                   tolerant ? SERD_LOG_LEVEL_WARNING : SERD_LOG_LEVEL_ERROR,
+                   statement->caret ? &position : NULL,
+                   "invalid literal (%s)",
+                   exess_strerror(result.status));
+
+      if (!tolerant) {
+        if (result.status == EXESS_NO_SPACE) {
+          return SERD_BAD_ALLOC;
+        }
+
+        return SERD_BAD_LITERAL;
+      }
+    }
+  }
+
+  // Nothing was rewritten, so forward the original statement as-is
+  if (!canonical) {
+    return serd_sink_write_statement(data->target, flags, statement);
+  }
+
+  // Forward the statement with the canonical node in place of the object
+  const SerdStatus status = serd_sink_write(data->target,
+                                            flags,
+                                            statement->nodes[0],
+                                            statement->nodes[1],
+                                            canonical,
+                                            statement->nodes[3]);
+
+  serd_node_free(allocator, canonical);
+  return status;
+}
+
+/// Sink callback: canonicalize statements, forward everything else untouched
+static SerdStatus
+serd_canon_on_event(void* const handle, const SerdEvent* const event)
+{
+  SerdCanonData* const self = (SerdCanonData*)handle;
+
+  if (event->type != SERD_STATEMENT) {
+    return serd_sink_write_event(self->target, event);
+  }
+
+  return serd_canon_on_statement(
+    self, event->statement.flags, event->statement.statement);
+}
+
+/**
+   Free the handle of a canon sink with the allocator that created it.
+
+   The handle is allocated from the world's allocator, so it must be released
+   through the same allocator rather than with the C library free().
+*/
+static void
+serd_canon_data_free(void* const ptr)
+{
+  SerdCanonData* const data = (SerdCanonData*)ptr;
+  if (data) {
+    serd_wfree(data->world, data);
+  }
+}
+
+/**
+   Create a sink that canonicalizes literals before passing them to `target`.
+
+   The sink keeps a reference to `world` for allocation and logging, so the
+   world must remain valid until the returned sink has been freed.
+
+   @return A new sink, or null if allocation fails.
+*/
+SerdSink*
+serd_canon_new(const SerdWorld* const world,
+               const SerdSink* const  target,
+               const SerdCanonFlags   flags)
+{
+  assert(world);
+  assert(target);
+
+  SerdCanonData* const data =
+    (SerdCanonData*)serd_wcalloc(world, 1, sizeof(SerdCanonData));
+
+  if (!data) {
+    return NULL;
+  }
+
+  data->world  = world;
+  data->target = target;
+  data->flags  = flags;
+
+  SerdSink* const sink = serd_sink_new(
+    serd_world_allocator(world), data, serd_canon_on_event, serd_canon_data_free);
+
+  if (!sink) {
+    // The sink never took ownership of the handle, so release it here
+    serd_wfree(world, data);
+  }
+
+  return sink;
+}
diff --git a/src/string_utils.h b/src/string_utils.h
index 2517b270..3337f012 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -67,7 +67,7 @@ is_utf8_continuation(const uint8_t c)
}
static inline bool
-is_space(const char c)
+is_space(const int c)
{
switch (c) {
case ' ':
@@ -102,16 +102,16 @@ hex_digit_value(const uint8_t c)
}
static inline char
-serd_to_upper(const char c)
+serd_to_lower(const char c)
{
- return (char)((c >= 'a' && c <= 'z') ? c - 32 : c);
+ return (char)((c >= 'A' && c <= 'Z') ? c + 32 : c);
}
static inline int
serd_strncasecmp(const char* s1, const char* s2, size_t n)
{
for (; n > 0 && *s2; s1++, s2++, --n) {
- if (serd_to_upper(*s1) != serd_to_upper(*s2)) {
+ if (serd_to_lower(*s1) != serd_to_lower(*s2)) {
return (*s1 < *s2) ? -1 : +1;
}
}
diff --git a/test/extra/canon/bad-boolean.ttl b/test/extra/canon/bad-boolean.ttl
new file mode 100644
index 00000000..c4fc3eb5
--- /dev/null
+++ b/test/extra/canon/bad-boolean.ttl
@@ -0,0 +1,5 @@
+@base <http://example.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] <boolean> " ja "^^xsd:boolean .
+
diff --git a/test/extra/canon/bad-decimal-leading.ttl b/test/extra/canon/bad-decimal-leading.ttl
new file mode 100644
index 00000000..0d18eac7
--- /dev/null
+++ b/test/extra/canon/bad-decimal-leading.ttl
@@ -0,0 +1,4 @@
+@base <http://example.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] <decimal> " junk 1234.5678 "^^xsd:decimal .
diff --git a/test/extra/canon/bad-decimal-trailing.ttl b/test/extra/canon/bad-decimal-trailing.ttl
new file mode 100644
index 00000000..10882ef5
--- /dev/null
+++ b/test/extra/canon/bad-decimal-trailing.ttl
@@ -0,0 +1,4 @@
+@base <http://example.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] <decimal> " 1234.5678 junk "^^xsd:decimal .
diff --git a/test/extra/canon/bad-empty-boolean.ttl b/test/extra/canon/bad-empty-boolean.ttl
new file mode 100644
index 00000000..9a390c46
--- /dev/null
+++ b/test/extra/canon/bad-empty-boolean.ttl
@@ -0,0 +1,5 @@
+@base <http://example.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] <boolean> ""^^xsd:boolean .
+
diff --git a/test/extra/canon/bad-integer-leading.ttl b/test/extra/canon/bad-integer-leading.ttl
new file mode 100644
index 00000000..80c1a6af
--- /dev/null
+++ b/test/extra/canon/bad-integer-leading.ttl
@@ -0,0 +1,4 @@
+@base <http://example.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] <integer> " junk 987654321 "^^xsd:integer .
diff --git a/test/extra/canon/bad-integer-trailing.ttl b/test/extra/canon/bad-integer-trailing.ttl
new file mode 100644
index 00000000..a94a9ec4
--- /dev/null
+++ b/test/extra/canon/bad-integer-trailing.ttl
@@ -0,0 +1,4 @@
+@base <http://example.org/> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] <integer> " 987654321 junk "^^xsd:integer .
diff --git a/test/extra/canon/bad-lang-long.ttl b/test/extra/canon/bad-lang-long.ttl
new file mode 100644
index 00000000..f84df07f
--- /dev/null
+++ b/test/extra/canon/bad-lang-long.ttl
@@ -0,0 +1,3 @@
+@base <http://example.org/> .
+
+[] <tagged> "hello"@ridiculously-long-lang-tag-beyond-even-RFC4646-recommendation .
diff --git a/test/extra/canon/manifest.ttl b/test/extra/canon/manifest.ttl
new file mode 100644
index 00000000..143928ee
--- /dev/null
+++ b/test/extra/canon/manifest.ttl
@@ -0,0 +1,65 @@
+@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix rdft: <http://www.w3.org/ns/rdftest#> .
+
+<>
+ a mf:Manifest ;
+ rdfs:comment "Serd canonical literal writing test suite" ;
+ mf:entries (
+ <#bad-boolean>
+ <#bad-decimal-leading>
+ <#bad-decimal-trailing>
+ <#bad-empty-boolean>
+ <#bad-integer-leading>
+ <#bad-integer-trailing>
+ <#bad-lang-long>
+ <#test-canon>
+ ) .
+
+<#bad-boolean>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Invalid xsd:boolean syntax" ;
+ mf:action <bad-boolean.ttl> ;
+ mf:name "bad-boolean" .
+
+<#bad-decimal-leading>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Invalid xsd:decimal syntax (leading garbage)" ;
+ mf:action <bad-decimal-leading.ttl> ;
+ mf:name "bad-decimal-leading" .
+
+<#bad-decimal-trailing>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Invalid xsd:decimal syntax (trailing garbage)" ;
+ mf:action <bad-decimal-trailing.ttl> ;
+ mf:name "bad-decimal-trailing" .
+
+<#bad-empty-boolean>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Invalid xsd:boolean syntax (no value)" ;
+ mf:action <bad-empty-boolean.ttl> ;
+ mf:name "bad-empty-boolean" .
+
+<#bad-integer-leading>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Invalid xsd:integer syntax (leading garbage)" ;
+ mf:action <bad-integer-leading.ttl> ;
+ mf:name "bad-integer-leading" .
+
+<#bad-integer-trailing>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Invalid xsd:integer syntax (trailing garbage)" ;
+ mf:action <bad-integer-trailing.ttl> ;
+ mf:name "bad-integer-trailing" .
+
+<#bad-lang-long>
+ a rdft:TestTurtleNegativeEval ;
+ rdfs:comment "Overly long language tag" ;
+ mf:action <bad-lang-long.ttl> ;
+ mf:name "bad-lang-long" .
+
+<#test-canon>
+ a rdft:TestTurtleEval ;
+ mf:action <test-canon.ttl> ;
+ mf:name "test-canon" ;
+ mf:result <test-canon.nt> .
diff --git a/test/extra/canon/test-canon.nt b/test/extra/canon/test-canon.nt
new file mode 100644
index 00000000..ff492890
--- /dev/null
+++ b/test/extra/canon/test-canon.nt
@@ -0,0 +1,70 @@
+_:b1 <http://example.org/boolean> "false"^^<http://www.w3.org/2001/XMLSchema#boolean> .
+_:b1 <http://example.org/boolean> "false"^^<http://www.w3.org/2001/XMLSchema#boolean> .
+_:b1 <http://example.org/boolean> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> .
+_:b1 <http://example.org/boolean> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> .
+_:b1 <http://example.org/ieee754> "1.0E2"^^<http://www.w3.org/2001/XMLSchema#float> .
+_:b1 <http://example.org/ieee754> "-1.0E2"^^<http://www.w3.org/2001/XMLSchema#float> .
+_:b1 <http://example.org/ieee754> "1.0E3"^^<http://www.w3.org/2001/XMLSchema#double> .
+_:b1 <http://example.org/ieee754> "-1.0E3"^^<http://www.w3.org/2001/XMLSchema#double> .
+_:b1 <http://example.org/machine> "9223372036854775807"^^<http://www.w3.org/2001/XMLSchema#long> .
+_:b1 <http://example.org/machine> "-9223372036854775808"^^<http://www.w3.org/2001/XMLSchema#long> .
+_:b1 <http://example.org/machine> "2147483647"^^<http://www.w3.org/2001/XMLSchema#int> .
+_:b1 <http://example.org/machine> "-2147483648"^^<http://www.w3.org/2001/XMLSchema#int> .
+_:b1 <http://example.org/machine> "32767"^^<http://www.w3.org/2001/XMLSchema#short> .
+_:b1 <http://example.org/machine> "-32768"^^<http://www.w3.org/2001/XMLSchema#short> .
+_:b1 <http://example.org/machine> "127"^^<http://www.w3.org/2001/XMLSchema#byte> .
+_:b1 <http://example.org/machine> "-128"^^<http://www.w3.org/2001/XMLSchema#byte> .
+_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedLong> .
+_:b1 <http://example.org/machine> "18446744073709551615"^^<http://www.w3.org/2001/XMLSchema#unsignedLong> .
+_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedInt> .
+_:b1 <http://example.org/machine> "4294967295"^^<http://www.w3.org/2001/XMLSchema#unsignedInt> .
+_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedShort> .
+_:b1 <http://example.org/machine> "65535"^^<http://www.w3.org/2001/XMLSchema#unsignedShort> .
+_:b1 <http://example.org/machine> "1"^^<http://www.w3.org/2001/XMLSchema#unsignedByte> .
+_:b1 <http://example.org/machine> "255"^^<http://www.w3.org/2001/XMLSchema#unsignedByte> .
+_:b1 <http://example.org/decimal> "0.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "0.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-0.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "36893488147419103232.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-36893488147419103232.0"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-36893488147419103232.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/decimal> "-0.123"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> .
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> .
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> .
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> .
+_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> .
+_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#integer> .
+_:b1 <http://example.org/integer> "0"^^<http://www.w3.org/2001/XMLSchema#nonPositiveInteger> .
+_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#nonPositiveInteger> .
+_:b1 <http://example.org/integer> "-1"^^<http://www.w3.org/2001/XMLSchema#negativeInteger> .
+_:b1 <http://example.org/integer> "-36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#negativeInteger> .
+_:b1 <http://example.org/integer> "0"^^<http://www.w3.org/2001/XMLSchema#nonNegativeInteger> .
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#nonNegativeInteger> .
+_:b1 <http://example.org/integer> "1"^^<http://www.w3.org/2001/XMLSchema#positiveInteger> .
+_:b1 <http://example.org/integer> "36893488147419103232"^^<http://www.w3.org/2001/XMLSchema#positiveInteger> .
+_:b1 <http://example.org/langString> "no language tag" .
+_:b1 <http://example.org/taggedString> "english"@en-ca .
+_:b1 <http://example.org/time> "P1Y6M"^^<http://www.w3.org/2001/XMLSchema#duration> .
+_:b1 <http://example.org/time> "12:15:01Z"^^<http://www.w3.org/2001/XMLSchema#time> .
+_:b1 <http://example.org/time> "2004-04-12Z"^^<http://www.w3.org/2001/XMLSchema#date> .
+_:b1 <http://example.org/binary> "A1B7F080"^^<http://www.w3.org/2001/XMLSchema#hexBinary> .
+_:b1 <http://example.org/binary> "Zm9vYmF="^^<http://www.w3.org/2001/XMLSchema#base64Binary> .
+_:b1 <http://example.org/other> "untyped" .
+_:b1 <http://example.org/other> <http://example.org/uri> .
+_:b1 <http://example.org/other> "notxsd"^^<http://example.org/sometype> .
+_:b1 <http://example.org/other> "unsupported"^^<http://www.w3.org/2001/XMLSchema#name> .
diff --git a/test/extra/canon/test-canon.ttl b/test/extra/canon/test-canon.ttl
new file mode 100644
index 00000000..0d0b4682
--- /dev/null
+++ b/test/extra/canon/test-canon.ttl
@@ -0,0 +1,76 @@
+@base <http://example.org/> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[
+ <boolean> " false "^^xsd:boolean ,
+ " 0 "^^xsd:boolean ,
+ " true "^^xsd:boolean ,
+ " 1 "^^xsd:boolean ;
+ <ieee754> " +0100.0 "^^xsd:float ,
+ " -0100.0 "^^xsd:float ,
+ " +01000.0 "^^xsd:double ,
+ " -01000.0 "^^xsd:double ;
+ <machine> " +09223372036854775807 "^^xsd:long ,
+ " -09223372036854775808 "^^xsd:long ,
+ " +02147483647 "^^xsd:int ,
+ " -02147483648 "^^xsd:int ,
+ " +032767 "^^xsd:short ,
+ " -032768 "^^xsd:short ,
+ " +0127 "^^xsd:byte ,
+ " -0128 "^^xsd:byte ,
+ " 01 "^^xsd:unsignedLong ,
+ " 018446744073709551615 "^^xsd:unsignedLong ,
+ " 01 "^^xsd:unsignedInt ,
+ " 04294967295 "^^xsd:unsignedInt ,
+ " 01 "^^xsd:unsignedShort ,
+ " 065535 "^^xsd:unsignedShort ,
+ " 01 "^^xsd:unsignedByte ,
+ " 0255 "^^xsd:unsignedByte ;
+ <decimal> " 00 "^^xsd:decimal ,
+ " +0 "^^xsd:decimal ,
+ " -0 "^^xsd:decimal ,
+ " 36893488147419103232 "^^xsd:decimal ,
+ " 0036893488147419103232 "^^xsd:decimal ,
+ " +36893488147419103232 "^^xsd:decimal ,
+ " +0036893488147419103232 "^^xsd:decimal ,
+ " +0036893488147419103232. "^^xsd:decimal ,
+ " +0036893488147419103232.00 "^^xsd:decimal ,
+ " +0036893488147419103232.12300 "^^xsd:decimal ,
+ " -36893488147419103232 "^^xsd:decimal ,
+ " -0036893488147419103232 "^^xsd:decimal ,
+ " -0036893488147419103232. "^^xsd:decimal ,
+ " -0036893488147419103232.00 "^^xsd:decimal ,
+ " -0036893488147419103232.12300 "^^xsd:decimal ,
+ " 00.12300 "^^xsd:decimal ,
+ " .12300 "^^xsd:decimal ,
+ " +.12300 "^^xsd:decimal ,
+ " +00.12300 "^^xsd:decimal ,
+ " -.12300 "^^xsd:decimal ,
+ " -00.12300 "^^xsd:decimal ;
+ <integer> " 36893488147419103232 "^^xsd:integer ,
+ " 0036893488147419103232 "^^xsd:integer ,
+ " +36893488147419103232 "^^xsd:integer ,
+ " +0036893488147419103232 "^^xsd:integer ,
+ " -36893488147419103232 "^^xsd:integer ,
+ " -0036893488147419103232 "^^xsd:integer ,
+ " 00 "^^xsd:nonPositiveInteger ,
+ " -036893488147419103232 "^^xsd:nonPositiveInteger ,
+ " -01 "^^xsd:negativeInteger ,
+ " -036893488147419103232 "^^xsd:negativeInteger ,
+ " 00 "^^xsd:nonNegativeInteger ,
+ " 036893488147419103232 "^^xsd:nonNegativeInteger ,
+ " +01 "^^xsd:positiveInteger ,
+ " 036893488147419103232 "^^xsd:positiveInteger ;
+ <langString> "no language tag"^^rdf:langString ;
+ <taggedString> "english"@EN-CA ;
+ <time> " P1Y6M0D "^^xsd:duration ,
+ " 12:15:01+00:00 "^^xsd:time ,
+ " 02004-04-12+00:00 "^^xsd:date ;
+ <binary> "A 1 B7 F080"^^xsd:hexBinary ,
+ " Zm 9v Y m F="^^xsd:base64Binary ;
+ <other> "untyped" ,
+ <uri> ,
+ "notxsd"^^<sometype> ,
+ "unsupported"^^xsd:name
+] .
diff --git a/test/meson.build b/test/meson.build
index 5c0ac7e8..24180efa 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -29,6 +29,7 @@ ttl_metadata_file_paths = [
'extra/abbreviate/manifest.ttl',
'extra/bad/manifest.ttl',
'extra/big/manifest.ttl',
+ 'extra/canon/manifest.ttl',
'extra/full/manifest.ttl',
'extra/good/manifest.ttl',
'extra/lax/manifest.ttl',
@@ -124,6 +125,7 @@ subdir('headers')
##############
unit_tests = [
+ 'canon',
'caret',
'env',
'free_null',
@@ -444,6 +446,13 @@ test_suites = {
'-a',
['-b', '1'],
],
+ 'canon': [
+ files('extra/canon/manifest.ttl'),
+ ns_serdtest + 'canon/',
+ '--',
+ '-C',
+ '-a',
+ ],
'fast': [
files('extra/perfect/manifest.ttl'),
ns_serdtest + 'perfect/',
diff --git a/test/test_canon.c b/test/test_canon.c
new file mode 100644
index 00000000..1a569664
--- /dev/null
+++ b/test/test_canon.c
@@ -0,0 +1,103 @@
+// Copyright 2021 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#undef NDEBUG
+
+#include "failing_allocator.h"
+
+#include "serd/canon.h"
+#include "serd/event.h"
+#include "serd/node.h"
+#include "serd/sink.h"
+#include "serd/status.h"
+#include "serd/string_view.h"
+#include "serd/world.h"
+
+#include <assert.h>
+#include <stddef.h>
+
+/// Sink callback that accepts every event and does nothing with it
+static SerdStatus
+ignore_event(void* handle, const SerdEvent* event)
+{
+  (void)event;
+  (void)handle;
+
+  return SERD_SUCCESS;
+}
+
+/// Check that serd_canon_new() returns null whenever an allocation fails
+static void
+test_new_failed_alloc(void)
+{
+  SerdFailingAllocator allocator = serd_failing_allocator();
+  SerdWorld* const     world     = serd_world_new(&allocator.base);
+
+  SerdSink* const target =
+    serd_sink_new(&allocator.base, NULL, ignore_event, NULL);
+
+  const size_t n_before = allocator.n_allocations;
+
+  // Create one canon successfully to learn how many allocations it makes
+  SerdSink* const canon = serd_canon_new(world, target, 0U);
+  assert(canon);
+
+  const size_t n_needed = allocator.n_allocations - n_before;
+
+  // Fail at each of those allocations in turn and expect a null result
+  size_t budget = 0U;
+  while (budget < n_needed) {
+    allocator.n_remaining = budget;
+    assert(!serd_canon_new(world, target, 0U));
+    ++budget;
+  }
+
+  serd_sink_free(canon);
+  serd_sink_free(target);
+  serd_world_free(world);
+}
+
+/// Check that writing a statement reports SERD_BAD_ALLOC if allocation fails
+static void
+test_write_failed_alloc(void)
+{
+  const SerdStringView s_string = serd_string("http://example.org/s");
+  const SerdStringView p_string = serd_string("http://example.org/p");
+  const SerdStringView o_string = serd_string("012.340");
+  const SerdStringView xsd_float =
+    serd_string("http://www.w3.org/2001/XMLSchema#float");
+
+  SerdFailingAllocator allocator = serd_failing_allocator();
+
+  SerdWorld* const world = serd_world_new(&allocator.base);
+
+  // The nodes are owned by this test and freed explicitly below
+  SerdNode* const s = serd_node_new(&allocator.base, serd_a_uri(s_string));
+  SerdNode* const p = serd_node_new(&allocator.base, serd_a_uri(p_string));
+  SerdNode* const o =
+    serd_node_new(&allocator.base, serd_a_typed_literal(o_string, xsd_float));
+
+  SerdSink*    target = serd_sink_new(&allocator.base, NULL, ignore_event, NULL);
+  SerdSink*    canon  = serd_canon_new(world, target, 0U);
+  const size_t n_setup_allocs = allocator.n_allocations;
+
+  // Successfully write statement to count the number of allocations
+  assert(canon);
+  assert(!serd_sink_write(canon, 0U, s, p, o, NULL));
+
+  // Test that each allocation failing is handled gracefully
+  const size_t n_new_allocs = allocator.n_allocations - n_setup_allocs;
+  for (size_t i = 0; i < n_new_allocs; ++i) {
+    allocator.n_remaining = i;
+
+    const SerdStatus st = serd_sink_write(canon, 0U, s, p, o, NULL);
+    assert(st == SERD_BAD_ALLOC);
+  }
+
+  serd_sink_free(canon);
+  serd_sink_free(target);
+
+  // Release the statement nodes, which were previously leaked
+  serd_node_free(&allocator.base, o);
+  serd_node_free(&allocator.base, p);
+  serd_node_free(&allocator.base, s);
+
+  serd_world_free(world);
+}
+
+int
+main(void)
+{
+ test_new_failed_alloc(); // Allocation failure while creating the canon sink
+ test_write_failed_alloc(); // Allocation failure while writing a statement
+ return 0;
+}
diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c
index 3f1c3f53..bdb871fa 100644
--- a/tools/serd-pipe.c
+++ b/tools/serd-pipe.c
@@ -3,6 +3,7 @@
#include "console.h"
+#include "serd/canon.h"
#include "serd/env.h"
#include "serd/input_stream.h"
#include "serd/log.h"
@@ -37,6 +38,7 @@ print_usage(const char* const name, const bool error)
"Read and write RDF syntax.\n"
"Use - for INPUT to read from standard input.\n\n"
" -B BASE_URI Base URI.\n"
+ " -C Convert literals to canonical form.\n"
" -a Write ASCII output.\n"
" -b BYTES I/O block size.\n"
" -c PREFIX Chop PREFIX from matching blank node IDs.\n"
@@ -115,6 +117,7 @@ main(int argc, char** argv)
SerdReaderFlags reader_flags = 0;
SerdWriterFlags writer_flags = 0;
bool osyntax_set = false;
+ bool canonical = false;
bool quiet = false;
size_t block_size = 4096U;
size_t stack_size = 1048576U;
@@ -140,7 +143,9 @@ main(int argc, char** argv)
for (int o = 1; argv[a][o]; ++o) {
const char opt = argv[a][o];
- if (opt == 'a') {
+ if (opt == 'C') {
+ canonical = true;
+ } else if (opt == 'a') {
writer_flags |= SERD_WRITE_ASCII;
} else if (opt == 'f') {
writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM);
@@ -305,6 +310,13 @@ main(int argc, char** argv)
SerdWriter* const writer =
serd_writer_new(world, output_syntax, writer_flags, env, &out, block_size);
+ const SerdSink* sink = serd_writer_sink(writer);
+
+ SerdSink* canon = NULL;
+ if (canonical) {
+ sink = canon = serd_canon_new(world, sink, reader_flags);
+ }
+
if (quiet) {
serd_set_log_func(world, serd_quiet_log_func, NULL);
}
@@ -321,12 +333,8 @@ main(int argc, char** argv)
const char* position = input_string;
SerdInputStream string_in = serd_open_input_string(&position);
- SerdReader* const reader =
- serd_reader_new(world,
- input_syntax ? input_syntax : SERD_TRIG,
- reader_flags,
- env,
- serd_writer_sink(writer));
+ SerdReader* const reader = serd_reader_new(
+ world, input_syntax ? input_syntax : SERD_TRIG, reader_flags, env, sink);
serd_reader_add_blank_prefix(reader, add_prefix);
@@ -361,7 +369,7 @@ main(int argc, char** argv)
serd_choose_syntax(world, input_syntax, inputs[i]),
reader_flags,
env,
- serd_writer_sink(writer),
+ sink,
stack_size,
inputs[i],
n_inputs > 1 ? prefix : add_prefix,
@@ -371,6 +379,7 @@ main(int argc, char** argv)
}
free(prefix);
+ serd_sink_free(canon);
serd_writer_free(writer);
serd_node_free(NULL, input_name);
serd_env_free(env);