From 6ca012d4fb97c02a2206aebd42aef1f9cd5e1993 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 26 Feb 2023 16:57:46 -0500 Subject: [WIP] Partially resurrect CURIE nodes --- include/serd/node.h | 12 ++++++++++-- include/serd/reader.h | 30 +++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/serd/node.h b/include/serd/node.h index 941be5d1..78f5baae 100644 --- a/include/serd/node.h +++ b/include/serd/node.h @@ -70,6 +70,14 @@ typedef enum { */ SERD_URI = 2, + /** + CURIE, a shortened URI. + + Value is an unquoted CURIE string relative to the current environment, + e.g. "rdf:type". @see [CURIE Syntax 1.0](http://www.w3.org/TR/curie) + */ + SERD_CURIE = 3, + /** A blank node. @@ -80,7 +88,7 @@ typedef enum { @see [RDF 1.1 Turtle](http://www.w3.org/TR/turtle/#grammar-production-BLANK_NODE_LABEL) */ - SERD_BLANK = 3, + SERD_BLANK = 4, /** A variable node. @@ -93,7 +101,7 @@ typedef enum { @see [SPARQL 1.1 Query Language](https://www.w3.org/TR/sparql11-query/#rVar) */ - SERD_VARIABLE = 4, + SERD_VARIABLE = 5, } SerdNodeType; /// Node flags, which ORed together make a #SerdNodeFlags diff --git a/include/serd/reader.h b/include/serd/reader.h index 57c8b2c3..a500bb88 100644 --- a/include/serd/reader.h +++ b/include/serd/reader.h @@ -54,6 +54,26 @@ typedef enum { */ SERD_READ_VARIABLES = 1U << 1U, + /** + Read prefixed name (CURIE) references exactly without expanding them. + + Normally, the reader expands all prefixed names to full URIs based on the + prefixes in the current environment, and considers failure to expand a + syntax error. This flag disables that expansion so prefixed names will be + emitted directly as CURIE nodes. + + Note that these nodes rely on some context which can change over time, and + may even be undefined initially, so this flag should be used with caution. 
+ Most applications should leave it off and avoid using CURIE nodes + entirely, because they are error-prone compared to working with complete + URIs. However, it can be useful for error-tolerance, or in constrained or + high-performance streaming contexts. For example, to re-indent a Turtle + file and ignore any possibly undefined prefixed names, this flag can be + used to disable expansion, which also boosts performance since it avoids + the lookup and expansion overhead. + */ + SERD_READ_CURIES = 1U << 2U, + /** Read relative URI references exactly without resolving them. @@ -61,7 +81,7 @@ typedef enum { flag disables that, so that URI references are passed to the sink exactly as they are in the input. */ - SERD_READ_RELATIVE = 1U << 2U, + SERD_READ_RELATIVE = 1U << 3U, /** Read blank node labels without adding a prefix unique to the document. @@ -75,7 +95,7 @@ typedef enum { corruption. Specifically, if data from separate documents parsed with this flag is combined, the IDs from each document may clash. */ - SERD_READ_GLOBAL = 1U << 3U, + SERD_READ_GLOBAL = 1U << 4U, /** Read generated blank node labels exactly without adjusting them. @@ -90,7 +110,7 @@ typedef enum { anonymous nodes, the generated IDs for those nodes may clash with IDs from the input document. */ - SERD_READ_GENERATED = 1U << 4U, + SERD_READ_GENERATED = 1U << 5U, /** Generate blank node labels with suffixes left-padded with zeros. @@ -100,7 +120,7 @@ typedef enum { nodes). In particular, this can be used to preserve blank node ordering from documents when the statements are sorted, such as in a model. */ - SERD_READ_ORDERED = 1U << 5U, + SERD_READ_ORDERED = 1U << 6U, /** Read URIs with percent-encoded UTF-8 characters decoded. @@ -109,7 +129,7 @@ typedef enum { preserved. This flag enables UTF-8 decoding of URIs, so octet escapes like "%7E" in URIs will be decoded to UTF-8 characters like "~". 
*/ - SERD_READ_DECODED = 1U << 6U, + SERD_READ_DECODED = 1U << 7U, } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values -- cgit v1.2.1