diff options
author | David Robillard <d@drobilla.net> | 2019-12-18 19:09:49 -0500 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2021-03-08 23:23:06 -0500 |
commit | fc3e5a0a7c9c64d275fec498f0e80dc02444c12d (patch) | |
tree | 538ab16491a4726431233fe22b2cd08b001281c6 | |
parent | b7948f8c9ad54c30e2579fd5da4626c6f3de325a (diff) | |
download | serd-fc3e5a0a7c9c64d275fec498f0e80dc02444c12d.tar.gz serd-fc3e5a0a7c9c64d275fec498f0e80dc02444c12d.tar.bz2 serd-fc3e5a0a7c9c64d275fec498f0e80dc02444c12d.zip |
Add support for parsing variables
This adds a reader flag and serdi option for extending a syntax with support
for SPARQL-like variables, for storing things like patterns or simple queries.
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | doc/serdi.1 | 7 | ||||
-rw-r--r-- | include/serd/serd.h | 17 | ||||
-rw-r--r-- | src/env.c | 1 | ||||
-rw-r--r-- | src/n3.c | 79 | ||||
-rw-r--r-- | src/node.c | 3 | ||||
-rw-r--r-- | src/reader.c | 1 | ||||
-rw-r--r-- | src/reader.h | 1 | ||||
-rw-r--r-- | src/serdi.c | 3 | ||||
-rw-r--r-- | src/writer.c | 14 | ||||
-rw-r--r-- | test/bad/bad-var.ttl | 2 | ||||
-rw-r--r-- | test/bad/manifest.ttl | 6 | ||||
-rw-r--r-- | test/meson.build | 7 | ||||
-rw-r--r-- | test/pattern/bad-pattern.nq | 1 | ||||
-rw-r--r-- | test/pattern/bad-pattern.ttl | 2 | ||||
-rw-r--r-- | test/pattern/manifest.ttl | 35 | ||||
-rw-r--r-- | test/pattern/test-pattern.nt | 6 | ||||
-rw-r--r-- | test/pattern/test-pattern.ttl | 10 | ||||
-rw-r--r-- | test/test_overflow.c | 4 |
19 files changed, 192 insertions, 10 deletions
@@ -3,6 +3,7 @@ serd (1.0.1) unstable; * Add SerdBuffer for mutable buffers to keep SerdChunk const-correct * Add SerdWorld for shared library state * Add option for writing terse output without newlines + * Add support for parsing variables * Add support for writing terse collections * Add support for xsd:float and xsd:double literals * Bring read/write interface closer to C standard @@ -16,7 +17,7 @@ serd (1.0.1) unstable; * Use a fixed-size reader stack * Use char* for strings in public API - -- David Robillard <d@drobilla.net> Wed, 13 Jan 2021 13:29:44 +0000 + -- David Robillard <d@drobilla.net> Wed, 13 Jan 2021 13:37:55 +0000 serd (0.30.11) unstable; diff --git a/doc/serdi.1 b/doc/serdi.1 index c834ce42..b89bfba7 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -139,6 +139,13 @@ Display version information and exit. Write output to the given .Ar filename instead of stdout. +.Pp +.It Fl x +Support parsing variable nodes. +Variables can be written in SPARQL style, for example +.Dq ?var +or +.Dq $var . .El .Sh EXIT STATUS .Nm diff --git a/include/serd/serd.h b/include/serd/serd.h index e36dd24b..b125bb3d 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -151,7 +151,8 @@ typedef uint32_t SerdStatementFlags; An RDF node, in the abstract sense, can be either a resource, literal, or a blank. This type is more precise, because syntactically there are two ways - to refer to a resource (by URI or CURIE). + to refer to a resource (by URI or CURIE). Serd also has support for + variable nodes to support some features, which are not RDF nodes. There are also two ways to refer to a blank node in syntax (by ID or anonymously), but this is handled by statement flags rather than distinct @@ -190,7 +191,16 @@ typedef enum { is meaningful only within this serialisation. @see [RDF 1.1 Turtle](http://www.w3.org/TR/turtle/#grammar-production-BLANK_NODE_LABEL) */ - SERD_BLANK = 4 + SERD_BLANK = 4, + + /** + A variable node + + Value is a variable name without any syntactic prefix, like "name", + which is meaningful only within this serialisation. @see [SPARQL 1.1 + Query Language](https://www.w3.org/TR/sparql11-query/#rVar) + */ + SERD_VARIABLE = 5 } SerdNodeType; /// Flags indicating certain string properties relevant to serialisation @@ -294,7 +304,8 @@ typedef struct { /// Reader options typedef enum { - SERD_READ_LAX = 1u << 0u ///< Tolerate invalid input where possible + SERD_READ_LAX = 1u << 0u, ///< Tolerate invalid input where possible + SERD_READ_VARIABLES = 1u << 1u ///< Support variable nodes } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values @@ -355,6 +355,7 @@ serd_env_expand(const SerdEnv* env, const SerdNode* node) case SERD_CURIE: return expand_curie(env, node); case SERD_BLANK: + case SERD_VARIABLE: return serd_node_copy(node); } } @@ -929,10 +929,53 @@ read_literal(SerdReader* reader, SerdNode** dest, bool* ate_dot) } static SerdStatus +read_VARNAME(SerdReader* reader, SerdNode** dest) +{ + // Simplified from SPARQL: VARNAME ::= (PN_CHARS_U | [0-9])+ + SerdNode* n = *dest; + SerdStatus st = SERD_SUCCESS; + int c = 0; + peek_byte(reader); + while ((c = peek_byte(reader))) { + if (is_digit(c) || c == '_') { + push_byte(reader, n, eat_byte_safe(reader, c)); + } else if ((st = read_PN_CHARS(reader, n))) { + st = st > SERD_FAILURE ? st : SERD_SUCCESS; + break; + } + } + + return st; +} + +static SerdStatus +read_Var(SerdReader* reader, SerdNode** dest) +{ + if (!(reader->flags & SERD_READ_VARIABLES)) { + return r_err( + reader, SERD_ERR_BAD_SYNTAX, "syntax does not support variables\n"); + } + + if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) { + return SERD_ERR_OVERFLOW; + } + + assert(peek_byte(reader) == '$' || peek_byte(reader) == '?'); + serd_byte_source_advance(reader->source); + + return read_VARNAME(reader, dest); +} + +static SerdStatus read_verb(SerdReader* reader, SerdNode** dest) { const size_t orig_stack_size = reader->stack.size; - if (peek_byte(reader) == '<') { + + switch (peek_byte(reader)) { + case '$': + case '?': + return read_Var(reader, dest); + case '<': return read_IRIREF(reader, dest); } @@ -1110,6 +1153,12 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) case '<': case '_': break; + case '$': + case '?': + if (reader->flags & SERD_READ_VARIABLES) { + break; + } + break; default: return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected: ':', '<', or '_'\n"); } @@ -1119,6 +1168,10 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) case EOF: case ')': return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected object\n"); + case '$': + case '?': + ret = read_Var(reader, &o); + break; case '[': simple = false; ret = read_anon(reader, *ctx, false, &o); @@ -1341,6 +1394,10 @@ read_subject(SerdReader* reader, ReadContext ctx, SerdNode** dest, int* s_type) SerdStatus st = SERD_SUCCESS; bool ate_dot = false; switch ((*s_type = peek_byte(reader))) { + case '$': + case '?': + st = read_Var(reader, dest); + break; case '[': st = read_anon(reader, ctx, true, dest); break; @@ -1598,6 +1655,7 @@ read_n3_statement(SerdReader* reader) if (s_type == '(' || (s_type == '[' && !*ctx.flags)) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid graph name\n"); } + ctx.graph = ctx.subject; ctx.subject = NULL; TRY(st, read_wrappedGraph(reader, &ctx)); @@ -1613,6 +1671,7 @@ read_n3_statement(SerdReader* reader) } return st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX; + } else if (!ate_dot) { read_ws_star(reader); st = (eat_byte_check(reader, '.') == '.') ? SERD_SUCCESS @@ -1671,10 +1730,22 @@ read_nquadsDoc(SerdReader* reader) return SERD_ERR_BAD_SYNTAX; } - // subject predicate object if ((st = read_subject(reader, ctx, &ctx.subject, &s_type)) || - !read_ws_star(reader) || (st = read_IRIREF(reader, &ctx.predicate)) || - !read_ws_star(reader) || + !read_ws_star(reader)) { + return st; + } + + switch (peek_byte(reader)) { + case '$': + case '?': + st = read_Var(reader, &ctx.predicate); + break; + case '<': + st = read_IRIREF(reader, &ctx.predicate); + break; + } + + if (st || !read_ws_star(reader) || (st = read_object(reader, &ctx, false, &ate_dot))) { return st; } @@ -165,7 +165,8 @@ serd_node_zero_pad(SerdNode* node) SerdNode* serd_new_simple_node(SerdNodeType type, const SerdStringView str) { - if (type != SERD_BLANK && type != SERD_CURIE && type != SERD_URI) { + if (type != SERD_BLANK && type != SERD_CURIE && type != SERD_URI && + type != SERD_VARIABLE) { return NULL; } diff --git a/src/reader.c b/src/reader.c index 1c9469af..4c4bffa5 100644 --- a/src/reader.c +++ b/src/reader.c @@ -172,6 +172,7 @@ serd_reader_new(SerdWorld* const world, me->sink = sink; me->stack = serd_stack_new(stack_size, sizeof(SerdNode)); me->syntax = syntax; + me->flags = flags; me->next_id = 1; me->strict = !(flags & SERD_READ_LAX); diff --git a/src/reader.h b/src/reader.h index 62feeee6..9bdbf72f 100644 --- a/src/reader.h +++ b/src/reader.h @@ -60,6 +60,7 @@ struct SerdReaderImpl { SerdByteSource* source; SerdStack stack; SerdSyntax syntax; + SerdReaderFlags flags; unsigned next_id; uint8_t* buf; char* bprefix; diff --git a/src/serdi.c b/src/serdi.c index 1925cd88..2801a2da 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -72,6 +72,7 @@ print_usage(const char* name, bool error) fprintf(os, " -s INPUT Parse INPUT as string.\n"); fprintf(os, " -t Write terser output without newlines.\n"); fprintf(os, " -v Display version information and exit.\n"); + fprintf(os, " -x Support parsing variable nodes like `?x'.\n"); return error ? 1 : 0; } @@ -246,6 +247,8 @@ main(int argc, char** argv) return missing_arg(argv[0], 'w'); } out_filename = argv[a]; + } else if (argv[a][1] == 'x') { + reader_flags |= SERD_READ_VARIABLES; } else { SERDI_ERRORF("invalid option -- '%s'\n", argv[a] + 1); return print_usage(argv[0], true); diff --git a/src/writer.c b/src/writer.c index b1404a91..fbc9dd5d 100644 --- a/src/writer.c +++ b/src/writer.c @@ -881,6 +881,18 @@ write_blank(SerdWriter* const writer, } SERD_WARN_UNUSED_RESULT static SerdStatus +write_variable(SerdWriter* const writer, const SerdNode* node) +{ + SerdStatus st = SERD_SUCCESS; + + TRY(st, esink("?", 1, writer)); + TRY(st, esink(serd_node_string(node), node->n_bytes, writer)); + + writer->last_sep = SEP_NONE; + return st; +} + +SERD_WARN_UNUSED_RESULT static SerdStatus write_node(SerdWriter* writer, const SerdNode* node, const SerdField field, @@ -895,6 +907,8 @@ write_node(SerdWriter* writer, return write_curie(writer, node, field, flags); case SERD_BLANK: return write_blank(writer, node, field, flags); + case SERD_VARIABLE: + return write_variable(writer, node); } return SERD_ERR_INTERNAL; diff --git a/test/bad/bad-var.ttl b/test/bad/bad-var.ttl new file mode 100644 index 00000000..29b5b008 --- /dev/null +++ b/test/bad/bad-var.ttl @@ -0,0 +1,2 @@ +?s + <http://example.org/p> <http://example.org/o> . diff --git a/test/bad/manifest.ttl b/test/bad/manifest.ttl index 5f208983..f76a9d1b 100644 --- a/test/bad/manifest.ttl +++ b/test/bad/manifest.ttl @@ -71,6 +71,7 @@ <#bad-string> <#bad-subject> <#bad-uri-escape> + <#bad-var> <#bad-verb> <#invalid-char-in-local> <#invalid-char-in-prefix> @@ -400,6 +401,11 @@ mf:name "bad-uri-escape" ; mf:action <bad-uri-escape.ttl> . +<#bad-var> + rdf:type rdft:TestTurtleNegativeSyntax ; + mf:name "bad-var" ; + mf:action <bad-var.ttl> . + <#bad-verb> rdf:type rdft:TestTurtleNegativeSyntax ; mf:name "bad-verb" ; diff --git a/test/meson.build b/test/meson.build index ac1fb205..3d12d211 100644 --- a/test/meson.build +++ b/test/meson.build @@ -159,6 +159,13 @@ if get_option('utils') suite: ['rdf', 'serd'], timeout: 240) + manifest = files('pattern' / 'manifest.ttl') + base_uri = serd_base + 'pattern' + '/' + test('pattern', run_test_suite, + args: script_args + [manifest, base_uri, '--', '-x'], + suite: ['rdf', 'serd'], + timeout: 240) + ### Run the lax suite with lax parsing enabled as well manifest = files('lax/manifest.ttl') base_uri = serd_base + 'lax/' diff --git a/test/pattern/bad-pattern.nq b/test/pattern/bad-pattern.nq new file mode 100644 index 00000000..a7e0c994 --- /dev/null +++ b/test/pattern/bad-pattern.nq @@ -0,0 +1 @@ +<http://example.org/s> ?pµ <http://example.org/o> . diff --git a/test/pattern/bad-pattern.ttl b/test/pattern/bad-pattern.ttl new file mode 100644 index 00000000..5f3dbfdd --- /dev/null +++ b/test/pattern/bad-pattern.ttl @@ -0,0 +1,2 @@ +?sµ + <http://example.org/p> <http://example.org/o> . diff --git a/test/pattern/manifest.ttl b/test/pattern/manifest.ttl new file mode 100644 index 00000000..a179a64d --- /dev/null +++ b/test/pattern/manifest.ttl @@ -0,0 +1,35 @@ +@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdft: <http://www.w3.org/ns/rdftest#> . + +<> + rdf:type mf:Manifest ; + rdfs:comment "Serd pattern syntax test cases" ; + mf:entries ( + <#bad-pattern-nq> + <#bad-pattern-ttl> + <#test-pattern-nt> + <#test-pattern-ttl> + ) . + +<#bad-pattern-nq> + rdf:type rdft:TestNQuadsNegativeSyntax ; + mf:name "bad-pattern-nq" ; + mf:action <bad-pattern.nq> . + +<#bad-pattern-ttl> + rdf:type rdft:TestTurtleNegativeSyntax ; + mf:name "bad-pattern" ; + mf:action <bad-pattern.ttl> . + +<#test-pattern-nt> + rdf:type rdft:TestNTriplesPositiveSyntax ; + mf:name "test-pattern-nt" ; + mf:action <test-pattern.nt> . + +<#test-pattern-ttl> + rdf:type rdft:TestTurtleEval ; + mf:name "test-pattern" ; + mf:action <test-pattern.ttl> ; + mf:result <test-pattern.nt> . diff --git a/test/pattern/test-pattern.nt b/test/pattern/test-pattern.nt new file mode 100644 index 00000000..ddfe6d3c --- /dev/null +++ b/test/pattern/test-pattern.nt @@ -0,0 +1,6 @@ +?s <http://example.org/p1> <http://example.org/o1> . +<http://example.org/s> ?p <http://example.org/o1> . +<http://example.org/s> <http://example.org/p1> ?o . +<http://example.org/s> <http://example.org/p2> _:b1 . +_:b1 ?2p <http://example.org/o2> . +_:b1 <http://example.org/p3> ?_o . diff --git a/test/pattern/test-pattern.ttl b/test/pattern/test-pattern.ttl new file mode 100644 index 00000000..3742e5ed --- /dev/null +++ b/test/pattern/test-pattern.ttl @@ -0,0 +1,10 @@ +?s + <http://example.org/p1> <http://example.org/o1> . + +<http://example.org/s> + ?p <http://example.org/o1> ; + <http://example.org/p1> ?o ; + <http://example.org/p2> [ + ?2p <http://example.org/o2> ; + <http://example.org/p3> ?_o + ] . diff --git a/test/test_overflow.c b/test/test_overflow.c index 5a65d887..851c8289 100644 --- a/test/test_overflow.c +++ b/test/test_overflow.c @@ -26,7 +26,8 @@ test(SerdWorld* world, SerdSink* sink, const char* str, size_t stack_size) { SerdByteSource* byte_source = serd_byte_source_new_string(str, NULL); - SerdReader* reader = serd_reader_new(world, SERD_TURTLE, 0, sink, stack_size); + SerdReader* reader = + serd_reader_new(world, SERD_TURTLE, SERD_READ_VARIABLES, sink, stack_size); serd_reader_start(reader, byte_source); const SerdStatus st = serd_reader_read_document(reader); @@ -55,6 +56,7 @@ main(void) {":s :p \"literal\"", sizes + 264}, {":s :p \"verb\"", sizes + 263}, {":s :p _:blank .", sizes + 276}, + {":s :p ?o .", sizes + 295}, {":s :p true .", sizes + 295}, {":s :p true .", sizes + 329}, {":s :p \"\"@en .", sizes + 302}, |