From 6ca012d4fb97c02a2206aebd42aef1f9cd5e1993 Mon Sep 17 00:00:00 2001
From: David Robillard
Date: Sun, 26 Feb 2023 16:57:46 -0500
Subject: [WIP] Partially resurrect CURIE nodes

---
Review notes (amended relative to the commit named above, so the blob hashes
on the "index" lines describe the original post-image, not this one):

 * read_PrefixedName(): with SERD_READ_CURIES set, the ':' consumed by
   skip_byte() is now appended to the node before the local name is read.
   The non-CURIE path looks up the node text as the bare prefix name right
   after skip_byte(), so the colon is presumably not stored by it; without
   the explicit push a CURIE node would read "rdftype" rather than
   "rdf:type", which write_curie_node() could not expand.
 * SERD_BLANK, SERD_VARIABLE, and every reader flag after
   SERD_READ_VARIABLES change numeric value in this patch.

 include/serd/node.h   | 12 ++++++++++--
 include/serd/reader.h | 30 +++++++++++++++++++++++++-----
 scripts/serd_bench.py |  2 +-
 src/inserter.c        |  7 +++++++
 src/read_turtle.c     | 32 ++++++++++++++++++++------------
 src/writer.c          | 27 +++++++++++++++++++++++++++
 tools/serd-pipe.c     |  7 +++++++
 7 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/include/serd/node.h b/include/serd/node.h
index 941be5d1..78f5baae 100644
--- a/include/serd/node.h
+++ b/include/serd/node.h
@@ -70,6 +70,14 @@ typedef enum {
   */
   SERD_URI = 2,
 
+  /**
+     CURIE, a shortened URI.
+
+     Value is an unquoted CURIE string relative to the current environment,
+     e.g. "rdf:type". @see [CURIE Syntax 1.0](http://www.w3.org/TR/curie)
+  */
+  SERD_CURIE = 3,
+
   /**
      A blank node.
 
@@ -80,7 +88,7 @@ typedef enum {
      @see [RDF 1.1
      Turtle](http://www.w3.org/TR/turtle/#grammar-production-BLANK_NODE_LABEL)
   */
-  SERD_BLANK = 3,
+  SERD_BLANK = 4,
 
   /**
      A variable node.
@@ -93,7 +101,7 @@ typedef enum {
      @see [SPARQL 1.1 Query
      Language](https://www.w3.org/TR/sparql11-query/#rVar)
   */
-  SERD_VARIABLE = 4,
+  SERD_VARIABLE = 5,
 } SerdNodeType;
 
 /// Node flags, which ORed together make a #SerdNodeFlags
diff --git a/include/serd/reader.h b/include/serd/reader.h
index 57c8b2c3..a500bb88 100644
--- a/include/serd/reader.h
+++ b/include/serd/reader.h
@@ -54,6 +54,26 @@ typedef enum {
   */
   SERD_READ_VARIABLES = 1U << 1U,
 
+  /**
+     Read prefixed name (CURIE) references exactly without expanding them.
+
+     Normally, the reader expands all prefixed names to full URIs based on the
+     prefixes in the current environment, and considers failure to expand a
+     syntax error. This flag disables that expansion so prefixed names will be
+     emitted directly as CURIE nodes.
+
+     Note that these nodes rely on some context which can change over time, and
+     may even be undefined initially, so this flag should be used with caution.
+     Most applications should leave it off and avoid using CURIE nodes
+     entirely, because they are error-prone compared to working with complete
+     URIs. However, it can be useful for error-tolerance, or in constrained or
+     high-performance streaming contexts. For example, to re-indent a Turtle
+     file and ignore any possibly undefined prefixed names, this flag can be
+     used to disable expansion, which also boosts performance since it avoids
+     the lookup and expansion overhead.
+  */
+  SERD_READ_CURIES = 1U << 2U,
+
   /**
      Read relative URI references exactly without resolving them.
 
@@ -61,7 +81,7 @@ typedef enum {
      flag disables that, so that URI references are passed to the sink exactly
      as they are in the input.
   */
-  SERD_READ_RELATIVE = 1U << 2U,
+  SERD_READ_RELATIVE = 1U << 3U,
 
   /**
      Read blank node labels without adding a prefix unique to the document.
@@ -75,7 +95,7 @@ typedef enum {
      corruption. Specifically, if data from separate documents parsed with this
      flag is combined, the IDs from each document may clash.
   */
-  SERD_READ_GLOBAL = 1U << 3U,
+  SERD_READ_GLOBAL = 1U << 4U,
 
   /**
      Read generated blank node labels exactly without adjusting them.
@@ -90,7 +110,7 @@ typedef enum {
      anonymous nodes, the generated IDs for those nodes may clash with IDs from
      the input document.
   */
-  SERD_READ_GENERATED = 1U << 4U,
+  SERD_READ_GENERATED = 1U << 5U,
 
   /**
      Generate blank node labels with suffixes left-padded with zeros.
@@ -100,7 +120,7 @@ typedef enum {
      nodes). In particular, this can be used to preserve blank node ordering
      from documents when the statements are sorted, such as in a model.
   */
-  SERD_READ_ORDERED = 1U << 5U,
+  SERD_READ_ORDERED = 1U << 6U,
 
   /**
      Read URIs with percent-encoded UTF-8 characters decoded.
@@ -109,7 +129,7 @@ typedef enum {
      preserved. This flags enables UTF-8 decoding of URIs, so octet escapes
      like "%7E" in URIs will be decoded to UTF-8 characters like "~".
   */
-  SERD_READ_DECODED = 1U << 6U,
+  SERD_READ_DECODED = 1U << 7U,
 } SerdReaderFlag;
 
 /// Bitwise OR of SerdReaderFlag values
diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py
index 35869ce6..3d1ef4ac 100755
--- a/scripts/serd_bench.py
+++ b/scripts/serd_bench.py
@@ -278,7 +278,7 @@ example:
     serd_opts = "-I turtle -I verbatim -O turtle -O verbatim -O expanded"
 
     progs = [
-        "tools/serd-pipe " + serd_opts,
+        "tools/serd-pipe -I lax " + serd_opts,
         "tools/serd-sort " + serd_opts,
     ] + args.run
 
diff --git a/src/inserter.c b/src/inserter.c
index f3b8631b..13af697e 100644
--- a/src/inserter.c
+++ b/src/inserter.c
@@ -45,6 +45,13 @@ can_insert(SerdWorld* const world, const SerdNode* const node)
     }
     break;
 
+  case SERD_CURIE:
+    serd_logf(world,
+              SERD_LOG_LEVEL_ERROR,
+              "attempt to insert CURIE %s into model",
+              serd_node_string(node));
+    return false;
+
   case SERD_BLANK:
   case SERD_VARIABLE:
     break;
diff --git a/src/read_turtle.c b/src/read_turtle.c
index e8e57dfc..6457d608 100644
--- a/src/read_turtle.c
+++ b/src/read_turtle.c
@@ -393,18 +393,26 @@ read_PrefixedName(SerdReader* const reader,
 
   skip_byte(reader, ':');
 
-  // Search environment for the prefix URI
-  const ZixStringView prefix     = serd_node_string_view(dest);
-  const ZixStringView prefix_uri = serd_env_find_prefix(reader->env, prefix);
-  if (!prefix_uri.length) {
-    return r_err(reader, st, "unknown prefix \"%s\"", prefix.data);
-  }
-
-  // Pop back to the start of the string
-  serd_stack_pop_to(&reader->stack, string_start_offset);
-  dest->length = 0U;
-  dest->type   = SERD_URI;
-  push_bytes(reader, dest, (const uint8_t*)prefix_uri.data, prefix_uri.length);
+  if ((reader->flags & SERD_READ_CURIES)) {
+    // Keep the prefix as written and restore the ':' skipped above, so the
+    // node reads like "rdf:type" once the local name is appended below
+    dest->type = SERD_CURIE;
+    push_bytes(reader, dest, (const uint8_t*)":", 1U);
+  } else {
+    // Search environment for the prefix URI
+    const ZixStringView name = serd_node_string_view(dest);
+    const ZixStringView uri  = serd_env_find_prefix(reader->env, name);
+    if (!uri.length) {
+      return r_err(reader, st, "unknown prefix \"%s\"", name.data);
+    }
+
+    // Pop back to the start of the string and replace it
+    serd_stack_pop_to(&reader->stack, string_start_offset);
+    dest->length = 0U;
+    dest->type   = SERD_URI;
+    push_bytes(reader, dest, (const uint8_t*)uri.data, uri.length);
+  }
+
   if ((st = read_PN_LOCAL(reader, dest, ate_dot)) > SERD_FAILURE) {
     return st;
   }
diff --git a/src/writer.c b/src/writer.c
index faf5b7d9..db04c8f9 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -954,6 +954,30 @@ write_uri_node(SerdWriter* const writer,
   return write_full_uri_node(writer, node);
 }
 
+// Write a CURIE abbreviated if supported, otherwise as an expanded <URI>
+SERD_NODISCARD static SerdStatus
+write_curie_node(SerdWriter* const writer, const SerdNode* const node)
+{
+  writer->last_sep = SEP_NONE;
+
+  const ZixStringView curie = serd_node_string_view(node);
+  if (supports_abbrev(writer)) {
+    return write_lname(writer, curie.data, curie.length);
+  }
+
+  ZixStringView prefix = {NULL, 0};
+  ZixStringView suffix = {NULL, 0};
+  SerdStatus    st     = SERD_SUCCESS;
+  if ((st = serd_env_expand_in_place(writer->env, curie, &prefix, &suffix))) {
+    return w_err(writer, st, "unknown namespace prefix in '%s'", curie.data);
+  }
+
+  TRY(st, esink("<", 1, writer));
+  TRY(st, ewrite_uri(writer, prefix.data, prefix.length));
+  TRY(st, ewrite_uri(writer, suffix.data, suffix.length));
+  return esink(">", 1, writer);
+}
+
 SERD_NODISCARD static SerdStatus
 write_blank(SerdWriter* const writer,
             const SerdNode*   node,
@@ -1012,6 +1036,9 @@ write_node(SerdWriter* const writer,
   case SERD_URI:
     st = write_uri_node(writer, node, field);
     break;
+  case SERD_CURIE:
+    st = write_curie_node(writer, node);
+    break;
   case SERD_BLANK:
     st = write_blank(writer, node, field, flags);
     break;
diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c
index 4fed5a1f..d837f6f3 100644
--- a/tools/serd-pipe.c
+++ b/tools/serd-pipe.c
@@ -190,6 +190,13 @@ main(const int argc, char* const* const argv)
     }
   }
 
+  // Enable direct CURIE transmission for lax streams where supported
+  if ((opts.common.input.flags & SERD_READ_LAX) &&
+      opts.common.output.syntax != SERD_NTRIPLES &&
+      opts.common.output.syntax != SERD_NQUADS) {
+    opts.common.input.flags |= SERD_READ_CURIES;
+  }
+
   // Every argument past the last option is an input
   opts.inputs   = argv + iter.a;
   opts.n_inputs = argc - iter.a;
-- 
cgit v1.2.1