From 4339b6f9cb0da8a9d6519077f4a0ecc385cc382c Mon Sep 17 00:00:00 2001
From: David Robillard <d@drobilla.net>
Date: Thu, 12 Aug 2021 12:56:03 -0400
Subject: Add a reader flag to disable generated blank label avoidance

---
 include/serd/serd.h | 15 +++++++++++++++
 src/node_syntax.c   |  9 +++++++--
 src/read_ntriples.c |  6 ++++--
 tools/console.c     |  1 +
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/include/serd/serd.h b/include/serd/serd.h
index e99e4e6a..3f59ded6 100644
--- a/include/serd/serd.h
+++ b/include/serd/serd.h
@@ -2299,6 +2299,21 @@ typedef enum {
      a syntax error.
   */
   SERD_READ_VERBATIM = 1u << 2u,
+
+  /**
+     Read generated blank node labels exactly without adjusting them.
+
+     Normally, the reader will adapt blank node labels in the input that clash
+     with its scheme for generating new ones, for example mapping "_:b123" to
+     "_:B123". This flag disables that, so that blank node labels are passed
+     to the sink exactly as they are in the input.
+
+     Note that this flag should be used carefully, since it can result in data
+     corruption. Specifically, if the input is a syntax like Turtle with
+     anonymous nodes, the generated IDs for those nodes may clash with IDs from
+     the input document.
+  */
+  SERD_READ_GENERATED = 1u << 3u,
 } SerdReaderFlag;
 
 /// Bitwise OR of SerdReaderFlag values
diff --git a/src/node_syntax.c b/src/node_syntax.c
index 0c45e33a..edf5cbf5 100644
--- a/src/node_syntax.c
+++ b/src/node_syntax.c
@@ -55,8 +55,13 @@ serd_node_from_syntax_in(const char* const str,
   SerdSink* const sink = serd_sink_new(&object, on_node_string_event, NULL);
 
   SerdByteSource* const source = serd_byte_source_new_string(doc, NULL);
-  SerdReader* const reader = serd_reader_new(
-    world, syntax, SERD_READ_VERBATIM, env, sink, 1024 + doc_len);
+  SerdReader* const reader =
+    serd_reader_new(world,
+                    syntax,
+                    SERD_READ_VERBATIM | SERD_READ_GENERATED,
+                    env,
+                    sink,
+                    1024 + doc_len);
 
   serd_reader_start(reader, source);
   serd_reader_read_document(reader);
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
index aa8f5468..08c489fe 100644
--- a/src/read_ntriples.c
+++ b/src/read_ntriples.c
@@ -256,13 +256,15 @@ read_STRING_LITERAL(SerdReader* const reader,
 static SerdStatus
 adjust_blank_id(SerdReader* const reader, char* const buf)
 {
-  if (!(reader->flags & SERD_READ_VERBATIM) &&
+  if (!(reader->flags & SERD_READ_GENERATED) &&
       is_digit(buf[reader->bprefix_len + 1])) {
     const char tag = buf[reader->bprefix_len];
     if (tag == 'b') {
-      buf[reader->bprefix_len] = 'B'; // Prevent clash
+      // Presumably generated ID like b123 in the input, adjust to B123
+      buf[reader->bprefix_len] = 'B';
       reader->seen_genid = true;
     } else if (tag == 'B' && reader->seen_genid) {
+      // We've seen both b123 and B123 styles, abort due to possible clashes
       return r_err(reader,
                    SERD_ERR_ID_CLASH,
                    "found both `b' and `B' blank IDs, prefix required");
diff --git a/tools/console.c b/tools/console.c
index e13bf8a5..2a2905d9 100644
--- a/tools/console.c
+++ b/tools/console.c
@@ -70,6 +70,7 @@ serd_set_input_option(const SerdStringView name,
     {"lax", SERD_READ_LAX},
     {"variables", SERD_READ_VARIABLES},
     {"verbatim", SERD_READ_VERBATIM},
+    {"generated", SERD_READ_GENERATED},
     {NULL, SERD_READ_LAX},
   };
 
-- 
cgit v1.2.1