diff options
-rw-r--r-- | meson.build | 1 | ||||
-rw-r--r-- | src/n3.c | 92 | ||||
-rw-r--r-- | src/read_nquads.c | 124 | ||||
-rw-r--r-- | src/read_nquads.h | 29 | ||||
-rw-r--r-- | src/read_ntriples.c | 13 | ||||
-rw-r--r-- | src/read_ntriples.h | 10 | ||||
-rw-r--r-- | src/reader.c | 6 | ||||
-rw-r--r-- | src/reader.h | 6 | ||||
-rw-r--r-- | test/extra/lax/manifest.ttl | 7 | ||||
-rw-r--r-- | test/extra/lax/test-bad-string.nq | 3 |
10 files changed, 178 insertions, 113 deletions
diff --git a/meson.build b/meson.build index cd6c77f5..df3f237f 100644 --- a/meson.build +++ b/meson.build @@ -157,6 +157,7 @@ sources = files( 'src/env.c', 'src/n3.c', 'src/node.c', + 'src/read_nquads.c', 'src/read_ntriples.c', 'src/read_utf8.c', 'src/reader.c', @@ -23,12 +23,6 @@ #include <stdio.h> #include <string.h> -static bool -fancy_syntax(const SerdReader* const reader) -{ - return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG; -} - static SerdStatus read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest); @@ -145,11 +139,6 @@ read_String(SerdReader* const reader, SerdNode* const node) return SERD_SUCCESS; } - if (!fancy_syntax(reader)) { - return r_err( - reader, SERD_BAD_SYNTAX, "syntax does not support long literals"); - } - skip_byte(reader, q3); return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1); } @@ -315,10 +304,6 @@ read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest) static SerdStatus read_IRIREF(SerdReader* const reader, SerdNode** const dest) { - if (!fancy_syntax(reader)) { - return read_IRI(reader, dest); - } - SerdStatus st = SERD_SUCCESS; TRY(st, eat_byte_check(reader, '<')); @@ -589,17 +574,6 @@ read_object(SerdReader* const reader, bool simple = (ctx->subject != 0); SerdNode* o = 0; const int c = peek_byte(reader); - if (!fancy_syntax(reader)) { - switch (c) { - case '"': - case ':': - case '<': - case '_': - break; - default: - return r_err(reader, SERD_BAD_SYNTAX, "expected: ':', '<', or '_'"); - } - } switch (c) { case EOF: @@ -672,11 +646,9 @@ read_object(SerdReader* const reader, } } + ctx->object = o; if (!st && emit && simple && o) { st = emit_statement(reader, *ctx, o); - } else if (!st && !emit) { - ctx->object = o; - return SERD_SUCCESS; } serd_stack_pop_to(&reader->stack, orig_stack_size); @@ -1128,65 +1100,3 @@ read_turtleTrigDoc(SerdReader* const reader) return SERD_SUCCESS; } - -SerdStatus -read_nquads_statement(SerdReader* const reader) -{ - SerdStatus st = SERD_SUCCESS; - SerdStatementFlags flags = 0; - ReadContext ctx = {0, 0, 0, 0, &flags}; - bool ate_dot = false; - int s_type = 0; - - read_ws_star(reader); - if (peek_byte(reader) == EOF) { - return SERD_FAILURE; - } - - if (peek_byte(reader) == '@') { - return r_err(reader, SERD_BAD_SYNTAX, "syntax does not support directives"); - } - - // subject predicate object - if ((st = read_subject(reader, ctx, &ctx.subject, &s_type)) || - !read_ws_star(reader) || (st = read_IRIREF(reader, &ctx.predicate)) || - !read_ws_star(reader) || - (st = read_object(reader, &ctx, false, &ate_dot))) { - return st; - } - - if (!ate_dot) { // graphLabel? - read_ws_star(reader); - switch (peek_byte(reader)) { - case '.': - break; - case '_': - TRY(st, read_BLANK_NODE_LABEL(reader, &ctx.graph, &ate_dot)); - break; - default: - TRY(st, read_IRIREF(reader, &ctx.graph)); - } - - // Terminating '.' - read_ws_star(reader); - TRY(st, eat_byte_check(reader, '.')); - } - - return emit_statement(reader, ctx, ctx.object); -} - -SerdStatus -read_nquadsDoc(SerdReader* const reader) -{ - SerdStatus st = SERD_SUCCESS; - - while (!reader->source.eof && !st) { - const size_t orig_stack_size = reader->stack.size; - - st = read_nquads_statement(reader); - - serd_stack_pop_to(&reader->stack, orig_stack_size); - } - - return st; -} diff --git a/src/read_nquads.c b/src/read_nquads.c new file mode 100644 index 00000000..b4e200d4 --- /dev/null +++ b/src/read_nquads.c @@ -0,0 +1,124 @@ +// Copyright 2011-2021 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#include "read_nquads.h" + +#include "caret.h" +#include "node.h" +#include "read_ntriples.h" +#include "reader.h" +#include "stack.h" +#include "statement.h" +#include "try.h" + +#include "serd/caret.h" +#include "serd/node.h" +#include "serd/sink.h" +#include "serd/statement.h" + +#include <stdbool.h> +#include <stdio.h> + +/// [6] graphLabel +static SerdStatus +read_graphLabel(SerdReader* const reader, SerdNode** const dest) +{ + return read_nt_subject(reader, dest); // Equivalent rule +} + +/// [2] statement +static SerdStatus +read_nquads_statement(SerdReader* const reader) +{ + SerdStatementFlags flags = 0; + ReadContext ctx = {0, 0, 0, 0, &flags}; + SerdStatus st = SERD_SUCCESS; + bool ate_dot = false; + + // Read subject and predicate + if ((st = read_nt_subject(reader, &ctx.subject)) || + (st = skip_horizontal_whitespace(reader)) || + (st = read_nt_predicate(reader, &ctx.predicate)) || + (st = skip_horizontal_whitespace(reader))) { + return st; + } + + // Preserve the caret for error reporting and read object + SerdCaret orig_caret = reader->source.caret; + if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) || + (st = skip_horizontal_whitespace(reader))) { + return st; + } + + if (!ate_dot) { + if (peek_byte(reader) == '.') { + eat_byte(reader); + } else { + TRY(st, read_graphLabel(reader, &ctx.graph)); + skip_horizontal_whitespace(reader); + TRY(st, eat_byte_check(reader, '.')); + } + } + + serd_node_zero_pad(ctx.object); + const SerdStatement statement = { + {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret}; + + return serd_sink_write_statement(reader->sink, *ctx.flags, &statement); +} + +SerdStatus +read_nquads_line(SerdReader* const reader) +{ + const size_t orig_stack_size = reader->stack.size; + SerdStatus st = SERD_SUCCESS; + + skip_horizontal_whitespace(reader); + + switch (peek_byte(reader)) { + case EOF: + return SERD_FAILURE; + + case '\n': + case '\r': + return read_EOL(reader); + + case '#': + return read_comment(reader); + + default: + if (!(st = read_nquads_statement(reader))) { + skip_horizontal_whitespace(reader); + if (peek_byte(reader) == '#') { + st = read_comment(reader); + } + } + break; + } + + serd_stack_pop_to(&reader->stack, orig_stack_size); + + return (st || peek_byte(reader) == EOF) ? st : read_EOL(reader); +} + +SerdStatus +read_nquadsDoc(SerdReader* const reader) +{ + // Read the first line + SerdStatus st = read_nquads_line(reader); + if (st == SERD_FAILURE || !tolerate_status(reader, st)) { + return st; + } + + // Continue reading lines for as long as possible + for (st = SERD_SUCCESS; !st;) { + st = read_nquads_line(reader); + if (st > SERD_FAILURE && !reader->strict && tolerate_status(reader, st)) { + serd_reader_skip_until_byte(reader, '\n'); + st = SERD_SUCCESS; + } + } + + // If we made it this far, we succeeded at reading at least one line + return st > SERD_FAILURE ? st : SERD_SUCCESS; +} diff --git a/src/read_nquads.h b/src/read_nquads.h new file mode 100644 index 00000000..571c6f96 --- /dev/null +++ b/src/read_nquads.h @@ -0,0 +1,29 @@ +// Copyright 2011-2021 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#ifndef SERD_SRC_READ_NQUADS_H +#define SERD_SRC_READ_NQUADS_H + +#include "serd/reader.h" +#include "serd/status.h" + +// Nonterminals + +/** + Read a single NQuads line. + + May read a statement, but may also just skip some input like comments or + extra whitespace. +*/ +SerdStatus +read_nquads_line(SerdReader* reader); + +/** + Read a complete NQuads document. + + RDF 1.1 NQuads: [1] nquadsDoc +*/ +SerdStatus +read_nquadsDoc(SerdReader* reader); + +#endif // SERD_SRC_READ_NQUADS_H diff --git a/src/read_ntriples.c b/src/read_ntriples.c index 3063a667..3d3af2ce 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -191,13 +191,18 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) return tolerate_status(reader, st) ? SERD_SUCCESS : st; } -SerdStatus +/** + Read an absolute IRI. + + This is a stricter subset of [8] IRIREF in the NTriples grammar, since a + scheme is required. Handling this in the parser results in better error + messages. +*/ +static SerdStatus read_IRI(SerdReader* const reader, SerdNode** const dest) { SerdStatus st = SERD_SUCCESS; - if ((st = eat_byte_check(reader, '<'))) { - return st; - } + TRY(st, eat_byte_check(reader, '<')); if (!(*dest = push_node(reader, SERD_URI, "", 0))) { return SERD_BAD_STACK; diff --git a/src/read_ntriples.h b/src/read_ntriples.h index bc76fed6..e6051fde 100644 --- a/src/read_ntriples.h +++ b/src/read_ntriples.h @@ -56,16 +56,6 @@ SerdStatus read_EOL(SerdReader* reader); /** - Read an absolute IRI. - - This is a stricter subset of [8] IRIREF in the NTriples grammar, since a - scheme is required. Handling this in the parser results in better error - messages. -*/ -SerdStatus -read_IRI(SerdReader* reader, SerdNode** dest); - -/** Read an IRI reference suffix into an existing node. RDF 1.1 NTriples: [8] IRIREF diff --git a/src/reader.c b/src/reader.c index 057f1d7f..1c99b033 100644 --- a/src/reader.c +++ b/src/reader.c @@ -6,6 +6,7 @@ #include "byte_source.h" #include "namespaces.h" #include "node.h" +#include "read_nquads.h" #include "read_ntriples.h" #include "stack.h" #include "statement.h" @@ -362,11 +363,12 @@ serd_reader_read_chunk(SerdReader* const reader) if (peek_byte(reader) == 0) { // Skip leading null byte, for reading from a null-delimited socket - st = skip_byte(reader, 0); + serd_byte_source_advance(&reader->source); + return SERD_FAILURE; } return st ? st - : (reader->syntax == SERD_NQUADS) ? read_nquads_statement(reader) + : (reader->syntax == SERD_NQUADS) ? read_nquads_line(reader) : read_n3_statement(reader); } diff --git a/src/reader.h b/src/reader.h index 559c9cee..0907d4cc 100644 --- a/src/reader.h +++ b/src/reader.h @@ -91,12 +91,6 @@ SerdStatus read_n3_statement(SerdReader* reader); SerdStatus -read_nquads_statement(SerdReader* reader); - -SerdStatus -read_nquadsDoc(SerdReader* reader); - -SerdStatus read_turtleTrigDoc(SerdReader* reader); static inline int diff --git a/test/extra/lax/manifest.ttl b/test/extra/lax/manifest.ttl index c68f0176..9291b7b3 100644 --- a/test/extra/lax/manifest.ttl +++ b/test/extra/lax/manifest.ttl @@ -6,6 +6,7 @@ a mf:Manifest ; rdfs:comment "Serd lax parsing test suite" ; mf:entries ( + <#test-bad-string-nq> <#test-bad-string-nt> <#test-bad-string-ttl> <#test-bad-uri-nq> @@ -18,6 +19,12 @@ <#test-out-of-range-unicode> ) . +<#test-bad-string-nq> + a rdft:TestNQuadsNegativeSyntax ; + mf:action <test-bad-string.nq> ; + mf:name "test-bad-string-nq" ; + mf:result <test-bad-string-out.nq> . + <#test-bad-string-nt> a rdft:TestNTriplesNegativeSyntax ; mf:action <test-bad-string.nt> ; diff --git a/test/extra/lax/test-bad-string.nq b/test/extra/lax/test-bad-string.nq new file mode 100644 index 00000000..72eb9621 --- /dev/null +++ b/test/extra/lax/test-bad-string.nq @@ -0,0 +1,3 @@ +<http://example.org/s1> <http://example.org/p1> "Truncated line +<http://example.org/s1> <http://example.org/p1> "Bad escape \? " . +<http://example.org/s1> <http://example.org/p2> "Good" . |