diff options
author | David Robillard <d@drobilla.net> | 2023-02-26 16:57:46 -0500 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 6ca012d4fb97c02a2206aebd42aef1f9cd5e1993 (patch) | |
tree | 4e69f79738c2a4e5c4533a173595a280a4e0fa14 | |
parent | 2d5e6aa234faeb406911ed44f56038dc73f8ff8e (diff) | |
download | serd-1.x.tar.gz serd-1.x.tar.bz2 serd-1.x.zip |
[WIP] Partially resurrect CURIE nodes (1.x)
-rw-r--r-- | include/serd/node.h | 12 | ||||
-rw-r--r-- | include/serd/reader.h | 30 | ||||
-rwxr-xr-x | scripts/serd_bench.py | 2 | ||||
-rw-r--r-- | src/inserter.c | 7 | ||||
-rw-r--r-- | src/read_turtle.c | 29 | ||||
-rw-r--r-- | src/writer.c | 26 | ||||
-rw-r--r-- | tools/serd-pipe.c | 7 |
7 files changed, 93 insertions, 20 deletions
diff --git a/include/serd/node.h b/include/serd/node.h index 941be5d1..78f5baae 100644 --- a/include/serd/node.h +++ b/include/serd/node.h @@ -71,6 +71,14 @@ typedef enum { SERD_URI = 2, /** + CURIE, a shortened URI. + + Value is an unquoted CURIE string relative to the current environment, + e.g. "rdf:type". @see [CURIE Syntax 1.0](http://www.w3.org/TR/curie) + */ + SERD_CURIE = 3, + + /** A blank node. A blank node is a resource that has no URI. The identifier of a blank @@ -80,7 +88,7 @@ typedef enum { @see [RDF 1.1 Turtle](http://www.w3.org/TR/turtle/#grammar-production-BLANK_NODE_LABEL) */ - SERD_BLANK = 3, + SERD_BLANK = 4, /** A variable node. @@ -93,7 +101,7 @@ typedef enum { @see [SPARQL 1.1 Query Language](https://www.w3.org/TR/sparql11-query/#rVar) */ - SERD_VARIABLE = 4, + SERD_VARIABLE = 5, } SerdNodeType; /// Node flags, which ORed together make a #SerdNodeFlags diff --git a/include/serd/reader.h b/include/serd/reader.h index 57c8b2c3..a500bb88 100644 --- a/include/serd/reader.h +++ b/include/serd/reader.h @@ -55,13 +55,33 @@ typedef enum { SERD_READ_VARIABLES = 1U << 1U, /** + Read prefixed name (CURIE) references exactly without expanding them. + + Normally, the reader expands all prefixed names to full URIs based on the + prefixes in the current environment, and considers failure to expand a + syntax error. This flag disables that expansion so prefixed names will be + emitted directly as CURIE nodes. + + Note that these nodes rely on some context which can change over time, and + may even be undefined initially, so this flag should be used with caution. + Most applications should leave it off and avoid using CURIE nodes + entirely, because they are error-prone compared to working with complete + URIs. However, it can be useful for error-tolerance, or in constrained or + high-performance streaming contexts. 
For example, to re-indent a Turtle + file and ignore any possibly undefined prefixed names, this flag can be + used to disable expansion, which also boosts performance since it avoids + the lookup and expansion overhead. + */ + SERD_READ_CURIES = 1U << 2U, + + /** Read relative URI references exactly without resolving them. Normally, the reader expands all relative URIs against the base URI. This flag disables that, so that URI references are passed to the sink exactly as they are in the input. */ - SERD_READ_RELATIVE = 1U << 2U, + SERD_READ_RELATIVE = 1U << 3U, /** Read blank node labels without adding a prefix unique to the document. @@ -75,7 +95,7 @@ typedef enum { corruption. Specifically, if data from separate documents parsed with this flag is combined, the IDs from each document may clash. */ - SERD_READ_GLOBAL = 1U << 3U, + SERD_READ_GLOBAL = 1U << 4U, /** Read generated blank node labels exactly without adjusting them. @@ -90,7 +110,7 @@ typedef enum { anonymous nodes, the generated IDs for those nodes may clash with IDs from the input document. */ - SERD_READ_GENERATED = 1U << 4U, + SERD_READ_GENERATED = 1U << 5U, /** Generate blank node labels with suffixes left-padded with zeros. @@ -100,7 +120,7 @@ typedef enum { nodes). In particular, this can be used to preserve blank node ordering from documents when the statements are sorted, such as in a model. */ - SERD_READ_ORDERED = 1U << 5U, + SERD_READ_ORDERED = 1U << 6U, /** Read URIs with percent-encoded UTF-8 characters decoded. @@ -109,7 +129,7 @@ typedef enum { preserved. This flags enables UTF-8 decoding of URIs, so octet escapes like "%7E" in URIs will be decoded to UTF-8 characters like "~". 
*/ - SERD_READ_DECODED = 1U << 6U, + SERD_READ_DECODED = 1U << 7U, } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py index 35869ce6..3d1ef4ac 100755 --- a/scripts/serd_bench.py +++ b/scripts/serd_bench.py @@ -278,7 +278,7 @@ example: serd_opts = "-I turtle -I verbatim -O turtle -O verbatim -O expanded" progs = [ - "tools/serd-pipe " + serd_opts, + "tools/serd-pipe -I lax " + serd_opts, "tools/serd-sort " + serd_opts, ] + args.run diff --git a/src/inserter.c b/src/inserter.c index f3b8631b..13af697e 100644 --- a/src/inserter.c +++ b/src/inserter.c @@ -45,6 +45,13 @@ can_insert(SerdWorld* const world, const SerdNode* const node) } break; + case SERD_CURIE: + serd_logf(world, + SERD_LOG_LEVEL_ERROR, + "attempt to insert CURIE %s into model", + serd_node_string(node)); + return false; + case SERD_BLANK: case SERD_VARIABLE: break; diff --git a/src/read_turtle.c b/src/read_turtle.c index e8e57dfc..6457d608 100644 --- a/src/read_turtle.c +++ b/src/read_turtle.c @@ -393,18 +393,23 @@ read_PrefixedName(SerdReader* const reader, skip_byte(reader, ':'); - // Search environment for the prefix URI - const ZixStringView prefix = serd_node_string_view(dest); - const ZixStringView prefix_uri = serd_env_find_prefix(reader->env, prefix); - if (!prefix_uri.length) { - return r_err(reader, st, "unknown prefix \"%s\"", prefix.data); - } - - // Pop back to the start of the string - serd_stack_pop_to(&reader->stack, string_start_offset); - dest->length = 0U; - dest->type = SERD_URI; - push_bytes(reader, dest, (const uint8_t*)prefix_uri.data, prefix_uri.length); + if ((reader->flags & SERD_READ_CURIES)) { + dest->type = SERD_CURIE; + } else { + // Search environment for the prefix URI + const ZixStringView name = serd_node_string_view(dest); + const ZixStringView uri = serd_env_find_prefix(reader->env, name); + if (!uri.length) { + return r_err(reader, st, "unknown prefix \"%s\"", name.data); + } + + // Pop back to the 
start of the string and replace it + serd_stack_pop_to(&reader->stack, string_start_offset); + dest->length = 0U; + dest->type = SERD_URI; + push_bytes(reader, dest, (const uint8_t*)uri.data, uri.length); + } + if ((st = read_PN_LOCAL(reader, dest, ate_dot)) > SERD_FAILURE) { return st; } diff --git a/src/writer.c b/src/writer.c index faf5b7d9..db04c8f9 100644 --- a/src/writer.c +++ b/src/writer.c @@ -955,6 +955,29 @@ write_uri_node(SerdWriter* const writer, } SERD_NODISCARD static SerdStatus +write_curie_node(SerdWriter* const writer, const SerdNode* const node) +{ + writer->last_sep = SEP_NONE; + + const ZixStringView curie = serd_node_string_view(node); + if (supports_abbrev(writer)) { + return write_lname(writer, curie.data, curie.length); + } + + ZixStringView prefix = {NULL, 0}; + ZixStringView suffix = {NULL, 0}; + SerdStatus st = SERD_SUCCESS; + if ((st = serd_env_expand_in_place(writer->env, curie, &prefix, &suffix))) { + return w_err(writer, st, "unknown namespace prefix in '%s'", curie.data); + } + + TRY(st, esink("<", 1, writer)); + TRY(st, ewrite_uri(writer, prefix.data, prefix.length)); + TRY(st, ewrite_uri(writer, suffix.data, suffix.length)); + return esink(">", 1, writer); +} + +SERD_NODISCARD static SerdStatus write_blank(SerdWriter* const writer, const SerdNode* node, const SerdField field, @@ -1012,6 +1035,9 @@ write_node(SerdWriter* const writer, case SERD_URI: st = write_uri_node(writer, node, field); break; + case SERD_CURIE: + st = write_curie_node(writer, node); + break; case SERD_BLANK: st = write_blank(writer, node, field, flags); break; diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c index 4fed5a1f..d837f6f3 100644 --- a/tools/serd-pipe.c +++ b/tools/serd-pipe.c @@ -190,6 +190,13 @@ main(const int argc, char* const* const argv) } } + // Enable direct CURIE transmission for lax streams where supported + if ((opts.common.input.flags & SERD_READ_LAX) && + opts.common.output.syntax != SERD_NTRIPLES && + opts.common.output.syntax != 
SERD_NQUADS) { + opts.common.input.flags |= SERD_READ_CURIES; + } + // Every argument past the last option is an input opts.inputs = argv + iter.a; opts.n_inputs = argc - iter.a; |