From d4aec28ba8ad24d5aef3ee12beeb1b805148eab1 Mon Sep 17 00:00:00 2001 From: David Robillard <d@drobilla.net> Date: Thu, 12 Aug 2021 13:42:25 -0400 Subject: Make blank node prefixing automatic Though potentially useful, I don't think the complexity cost of the old interface (both to the implementation and to the user) is worth it. A special tool to transform blank node labels (for example with regular expressions) would be a better approach to this if it's ever needed in the future. --- doc/serdi.1 | 15 --------------- include/serd/serd.h | 25 ------------------------- src/read_ntriples.c | 6 ++---- src/reader.c | 39 +++++++++++++++++---------------------- src/reader.h | 2 +- src/world.h | 1 + src/writer.c | 30 +----------------------------- test/multifile/output.nq | 4 ++-- test/run_test_suite.py | 4 ---- test/test_overflow.c | 2 +- test/test_reader_writer.c | 14 -------------- tools/serdi.c | 38 ++------------------------------------ 12 files changed, 27 insertions(+), 153 deletions(-) diff --git a/doc/serdi.1 b/doc/serdi.1 index 698bec29..c6356953 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -10,11 +10,9 @@ .Op Fl F Ar pattern | Fl G Ar pattern .Op Fl I Ar base .Op Fl b Ar bytes -.Op Fl c Ar prefix .Op Fl i Ar syntax .Op Fl k Ar bytes .Op Fl o Ar syntax -.Op Fl p Ar prefix .Op Fl r Ar root .Op Fl s Ar string .Op Fl w Ar filename @@ -80,13 +78,6 @@ This is the number of bytes in a file that will be read or written at once. The default is 4096, which should perform well in most cases. Note that this only applies to files, standard input and output are always processed one byte at a time. .Pp -.It Fl c Ar prefix -Chop -.Ar prefix -from matching blank node IDs. -This is the inverse of -.Fl p . -.Pp .It Fl f Fast and loose mode. This disables shortening URIs into prefixed names or relative URI references. @@ -185,12 +176,6 @@ Tolerate invalid UTF-8 by writing the replacement character when necessary. Note that data may be lost when using this option! .El .Pp -.It Fl p Ar prefix -Add -.Ar prefix -to blank node IDs. -This can be used to avoid clashes between blank node IDs in input documents. -.Pp .It Fl q Suppress all output except data. .Pp diff --git a/include/serd/serd.h b/include/serd/serd.h index c30c656b..1f46e092 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -2338,20 +2338,6 @@ serd_reader_new(SerdWorld* SERD_NONNULL world, const SerdSink* SERD_NONNULL sink, size_t stack_size); -/** - Set a prefix to be added to all blank node identifiers. - - This is useful when multiple files are to be parsed into the same output (a - model or a file). Since Serd preserves blank node IDs, this could cause - conflicts where two non-equivalent blank nodes are merged, resulting in - corrupt data. By setting a unique blank node prefix for each parsed file, - this can be avoided, while preserving blank node names. -*/ -SERD_API -void -serd_reader_add_blank_prefix(SerdReader* SERD_NONNULL reader, - const char* SERD_NULLABLE prefix); - /// Prepare to read from a byte source SERD_API SerdStatus @@ -2587,17 +2573,6 @@ SERD_API char* SERD_NONNULL serd_buffer_sink_finish(SerdBuffer* SERD_NONNULL stream); -/** - Set a prefix to be removed from matching blank node identifiers. - - This is the counterpart to serd_reader_add_blank_prefix() which can be used - to "undo" added prefixes. -*/ -SERD_API -void -serd_writer_chop_blank_prefix(SerdWriter* SERD_NONNULL writer, - const char* SERD_NULLABLE prefix); - /** Set the current output base URI, and emit a directive if applicable. diff --git a/src/read_ntriples.c b/src/read_ntriples.c index 08c489fe..c107207d 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -287,10 +287,8 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, return st; } - if (!(*dest = push_node(reader, - SERD_BLANK, - reader->bprefix ? reader->bprefix : "", - reader->bprefix_len))) { + if (!(*dest = push_node( + reader, SERD_BLANK, reader->bprefix, reader->bprefix_len))) { return SERD_ERR_OVERFLOW; } diff --git a/src/reader.c b/src/reader.c index 7a640cc6..c8a66c42 100644 --- a/src/reader.c +++ b/src/reader.c @@ -24,6 +24,7 @@ #include "stack.h" #include "statement.h" #include "system.h" +#include "world.h" #include <assert.h> #include <stdarg.h> @@ -70,11 +71,10 @@ set_blank_id(SerdReader* const reader, SerdNode* const node, const size_t buf_size) { - char* buf = (char*)(node + 1); - const char* prefix = reader->bprefix ? (const char*)reader->bprefix : ""; + char* const buf = (char*)(node + 1); - node->length = - (size_t)snprintf(buf, buf_size, "%sb%u", prefix, reader->next_id++); + node->length = (size_t)snprintf( + buf, buf_size, "%sb%u", reader->bprefix, reader->next_id++); } size_t @@ -201,6 +201,13 @@ serd_reader_read_document(SerdReader* const reader) return SERD_ERR_BAD_CALL; } + if (!(reader->flags & SERD_READ_GLOBAL)) { + reader->bprefix_len = (size_t)snprintf(reader->bprefix, + sizeof(reader->bprefix), + "f%u", + ++reader->world->next_document_id); + } + if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source->prepared) { SerdStatus st = serd_reader_prepare(reader); if (st) { @@ -263,6 +270,12 @@ serd_reader_new(SerdWorld* const world, assert(me->rdf_rest); assert(me->rdf_nil); + if (!(flags & SERD_READ_GLOBAL)) { + me->bprefix[0] = 'f'; + me->bprefix[1] = '0'; + me->bprefix_len = 2; + } + return me; } @@ -276,27 +289,9 @@ serd_reader_free(SerdReader* const reader) serd_reader_finish(reader); serd_free_aligned(reader->stack.buf); - free(reader->bprefix); free(reader); } -void -serd_reader_add_blank_prefix(SerdReader* const reader, const char* const prefix) -{ - assert(reader); - - free(reader->bprefix); - reader->bprefix_len = 0; - reader->bprefix = NULL; - - const size_t prefix_len = prefix ? strlen(prefix) : 0; - if (prefix_len) { - reader->bprefix_len = prefix_len; - reader->bprefix = (char*)malloc(reader->bprefix_len + 1); - memcpy(reader->bprefix, prefix, reader->bprefix_len + 1); - } -} - static SerdStatus skip_bom(SerdReader* const me) { diff --git a/src/reader.h b/src/reader.h index 23d38009..ecf420f4 100644 --- a/src/reader.h +++ b/src/reader.h @@ -51,7 +51,7 @@ struct SerdReaderImpl { SerdReaderFlags flags; unsigned next_id; uint8_t* buf; - char* bprefix; + char bprefix[24]; size_t bprefix_len; bool strict; ///< True iff strict parsing bool seen_genid; diff --git a/src/world.h b/src/world.h index 8cc99e61..f1124023 100644 --- a/src/world.h +++ b/src/world.h @@ -41,6 +41,7 @@ struct SerdWorldImpl { } blank; uint32_t next_blank_id; + uint32_t next_document_id; bool stderr_color; }; diff --git a/src/writer.c b/src/writer.c index 5f178a6f..5f9fdaab 100644 --- a/src/writer.c +++ b/src/writer.c @@ -130,8 +130,6 @@ struct SerdWriterImpl { size_t anon_stack_size; SerdByteSink* byte_sink; WriteContext context; - char* bprefix; - size_t bprefix_len; Sep last_sep; int indent; bool empty; @@ -893,15 +891,7 @@ write_blank(SerdWriter* const writer, SerdStatus st = SERD_SUCCESS; TRY(st, esink("_:", 2, writer)); - if (!st && writer->bprefix && - !strncmp(node_str, writer->bprefix, writer->bprefix_len)) { - TRY(st, - esink(node_str + writer->bprefix_len, - node->length - writer->bprefix_len, - writer)); - } else { - TRY(st, esink(node_str, node->length, writer)); - } + TRY(st, esink(node_str, node->length, writer)); writer->last_sep = SEP_NONE; return st; @@ -1344,23 +1334,6 @@ serd_writer_new(SerdWorld* world, return writer; } -void -serd_writer_chop_blank_prefix(SerdWriter* writer, const char* prefix) -{ - assert(writer); - - free(writer->bprefix); - writer->bprefix_len = 0; - writer->bprefix = NULL; - - const size_t prefix_len = prefix ? strlen(prefix) : 0; - if (prefix_len) { - writer->bprefix_len = prefix_len; - writer->bprefix = (char*)malloc(writer->bprefix_len + 1); - memcpy(writer->bprefix, prefix, writer->bprefix_len + 1); - } -} - SerdStatus serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri) { @@ -1445,7 +1418,6 @@ serd_writer_free(SerdWriter* writer) serd_writer_finish(writer); free(writer->anon_stack); - free(writer->bprefix); serd_node_free(writer->root_node); free(writer); } diff --git a/test/multifile/output.nq b/test/multifile/output.nq index dd35dc4d..cf3ea93e 100644 --- a/test/multifile/output.nq +++ b/test/multifile/output.nq @@ -1,3 +1,3 @@ -_:f0b1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Type> . _:f1b1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Type> . -_:f1b2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/OtherType> <http://example.org/graph> . +_:f2b1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Type> . +_:f2b2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/OtherType> <http://example.org/graph> . diff --git a/test/run_test_suite.py b/test/run_test_suite.py index 13bb3c7d..457e7f81 100755 --- a/test/run_test_suite.py +++ b/test/run_test_suite.py @@ -58,8 +58,6 @@ def test_thru( isyntax, "-o", isyntax, - "-p", - "foo", "-w", out_path, "-I", @@ -76,8 +74,6 @@ def test_thru( isyntax, "-o", osyntax, - "-c", - "foo", "-w", thru_path, "-o", diff --git a/test/test_overflow.c b/test/test_overflow.c index 2a77135c..bb2513d5 100644 --- a/test/test_overflow.c +++ b/test/test_overflow.c @@ -22,7 +22,7 @@ #include <stdio.h> static const size_t min_stack_size = 4 * sizeof(size_t) + 230u; -static const size_t max_stack_size = 1024u; +static const size_t max_stack_size = 2048u; static SerdStatus test_size(SerdWorld* const world, diff --git a/test/test_reader_writer.c b/test/test_reader_writer.c index c0af313a..1bad22cd 100644 --- a/test/test_reader_writer.c +++ b/test/test_reader_writer.c @@ -51,9 +51,6 @@ test_writer(const char* const path) assert(writer); - serd_writer_chop_blank_prefix(writer, "tmp"); - serd_writer_chop_blank_prefix(writer, NULL); - const SerdNode* lit = serd_nodes_string(nodes, SERD_STRING("hello")); const SerdSink* const iface = serd_writer_sink(writer); @@ -152,17 +149,6 @@ test_reader(const char* path) assert(serd_reader_read_document(reader) == SERD_ERR_BAD_CALL); assert(serd_reader_read_chunk(reader) == SERD_ERR_BAD_CALL); - serd_reader_add_blank_prefix(reader, "tmp"); - -#if defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wnonnull" -#endif - serd_reader_add_blank_prefix(reader, NULL); -#if defined(__GNUC__) -# pragma GCC diagnostic pop -#endif - SerdByteSource* byte_source = serd_byte_source_new_filename(path, 4096); assert(!serd_reader_start(reader, byte_source)); assert(!serd_reader_read_document(reader)); diff --git a/tools/serdi.c b/tools/serdi.c index 1e21fe19..b1542727 100644 --- a/tools/serdi.c +++ b/tools/serdi.c @@ -46,7 +46,6 @@ print_usage(const char* const name, const bool error) " -G PATTERN Only include statements matching PATTERN.\n" " -I BASE_URI Input base URI.\n" " -b BYTES I/O block size.\n" - " -c PREFIX Chop PREFIX from matching blank node IDs.\n" " -f Fast and loose mode (possibly ugly output).\n" " -h Display this help and exit.\n" " -i SYNTAX Input syntax (turtle/ntriples/trig/nquads),\n" @@ -55,7 +54,6 @@ print_usage(const char* const name, const bool error) " -m Build a model in memory before writing.\n" " -o SYNTAX Output syntax (empty/turtle/ntriples/nquads),\n" " or flag (ascii/expanded/verbatim/terse/lax).\n" - " -p PREFIX Add PREFIX to blank node IDs.\n" " -q Suppress all output except data.\n" " -r ROOT_URI Keep relative URIs within ROOT_URI.\n" " -s STRING Parse STRING as input.\n" @@ -140,7 +138,6 @@ read_file(SerdWorld* const world, const SerdSink* const sink, const size_t stack_size, const char* const filename, - const char* const add_prefix, const size_t block_size) { SerdByteSource* byte_source = serd_open_input(filename, block_size); @@ -155,8 +152,6 @@ read_file(SerdWorld* const world, SerdReader* reader = serd_reader_new(world, syntax, flags, env, sink, stack_size); - serd_reader_add_blank_prefix(reader, add_prefix); - SerdStatus st = serd_reader_start(reader, byte_source); st = st ? st : serd_reader_read_document(reader); @@ -187,8 +182,6 @@ main(int argc, char** argv) const char* input_string = NULL; const char* in_pattern = NULL; const char* out_pattern = NULL; - const char* add_prefix = ""; - const char* chop_prefix = NULL; const char* root_uri = NULL; const char* out_filename = NULL; int a = 1; @@ -247,13 +240,6 @@ main(int argc, char** argv) } block_size = (size_t)size; break; - } else if (opt == 'c') { - if (argv[a][o + 1] || ++a == argc) { - return missing_arg(prog, 'c'); - } - - chop_prefix = argv[a]; - break; } else if (opt == 'i') { if (argv[a][o + 1] || ++a == argc) { return missing_arg(prog, 'i'); @@ -290,13 +276,6 @@ main(int argc, char** argv) osyntax_set = output_syntax != SERD_SYNTAX_EMPTY || !strcmp(argv[a], "empty"); - break; - } else if (opt == 'p') { - if (argv[a][o + 1] || ++a == argc) { - return missing_arg(prog, 'p'); - } - - add_prefix = argv[a]; break; } else if (opt == 'r') { if (argv[a][o + 1] || ++a == argc) { @@ -439,8 +418,6 @@ main(int argc, char** argv) serd_writer_set_root_uri(writer, SERD_STRING(root_uri)); } - serd_writer_chop_blank_prefix(writer, chop_prefix); - SerdStatus st = SERD_SUCCESS; if (input_string) { SerdByteSource* const byte_source = @@ -454,8 +431,6 @@ main(int argc, char** argv) sink, stack_size); - serd_reader_add_blank_prefix(reader, add_prefix); - if (!(st = serd_reader_start(reader, byte_source))) { st = serd_reader_read_document(reader); } @@ -464,11 +439,8 @@ main(int argc, char** argv) serd_byte_source_free(byte_source); } - size_t prefix_len = 0; - char* prefix = NULL; - if (n_inputs > 1) { - prefix_len = 8 + strlen(add_prefix); - prefix = (char*)calloc(1, prefix_len); + if (n_inputs == 1) { + reader_flags |= SERD_READ_GLOBAL; } for (int i = 0; !st && i < n_inputs; ++i) { @@ -479,10 +451,6 @@ main(int argc, char** argv) } } - if (n_inputs > 1) { - snprintf(prefix, prefix_len, "f%d%s", i, add_prefix); - } - if ((st = read_file(world, serd_choose_input_syntax(world, input_syntax, inputs[i]), @@ -491,12 +459,10 @@ main(int argc, char** argv) sink, stack_size, inputs[i], - n_inputs > 1 ? prefix : add_prefix, block_size))) { break; } } - free(prefix); if (st <= SERD_FAILURE && use_model) { const SerdSink* writer_sink = serd_writer_sink(writer); -- cgit v1.2.1