diff options
author | David Robillard <d@drobilla.net> | 2020-10-10 12:31:29 +0200 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372 (patch) | |
tree | 5f5f7009e96379aa0e31cb1db3757f8d6589669f | |
parent | e8f392d57bf6eba9b62509a32e4073e8b34b18e2 (diff) | |
download | serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.tar.gz serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.tar.bz2 serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.zip |
[WIP] Simplify reader and writer flags
[WIP] Testing?
-rw-r--r-- | include/serd/reader.h | 41 | ||||
-rw-r--r-- | include/serd/writer.h | 49 | ||||
-rw-r--r-- | src/read_ntriples.c | 3 | ||||
-rw-r--r-- | src/read_turtle.c | 2 | ||||
-rw-r--r-- | src/writer.c | 8 | ||||
-rw-r--r-- | tools/serd-pipe.c | 2 |
6 files changed, 89 insertions, 16 deletions
diff --git a/include/serd/reader.h b/include/serd/reader.h index b6b9cac3..e25565cb 100644 --- a/include/serd/reader.h +++ b/include/serd/reader.h @@ -30,10 +30,43 @@ typedef struct SerdReaderImpl SerdReader; /// Reader options typedef enum { - SERD_READ_LAX = 1U << 0U, ///< Tolerate invalid input where possible - SERD_READ_VARIABLES = 1U << 1U, ///< Support variable nodes - SERD_READ_PREFIXED = 1U << 2U, ///< Do not expand prefixed names - SERD_READ_RELATIVE = 1U << 3U, ///< Do not expand relative URI references + /** + Tolerate invalid input where possible. + + This will attempt to ignore invalid input and continue reading. Invalid + Unicode characters will be replaced with the replacement character, and + various other syntactic problems will be ignored. If there are more + severe problems, the reader will try to skip the statement and continue + parsing. This should work reasonably well for line-based syntaxes like + NTriples and NQuads, but abbreviated Turtle or TriG may not recover. + + Note that this flag should be used carefully, since it can result in data + loss. + */ + SERD_READ_LAX = 1U << 0U, + + /** + Support reading variable nodes. + + As an extension, serd supports reading variable nodes with SPARQL-like + syntax, for example "?foo" or "$bar". This can be used for storing + graph patterns and templates. + */ + SERD_READ_VARIABLES = 1U << 1U, + + /** + Read URIs and blank node labels exactly. + + Normally, the reader expands all relative URIs, and may adjust blank node + labels to avoid clashing with generated ones. This flag disables all of + this processing, so that URI references and blank nodes are passed to the + sink exactly as they are in the input. + + Note that this does not apply to CURIEs, since serd deliberately does not + have a way to represent CURIE nodes. A bad namespace prefix is considered + a syntax error. 
+ */ + SERD_READ_VERBATIM = 1U << 2U, } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values diff --git a/include/serd/writer.h b/include/serd/writer.h index b85c7b82..812b1851 100644 --- a/include/serd/writer.h +++ b/include/serd/writer.h @@ -36,11 +36,50 @@ typedef struct SerdWriterImpl SerdWriter; does not support abbreviation and is always ASCII. */ typedef enum { - SERD_WRITE_ASCII = 1U << 0U, ///< Escape all non-ASCII characters - SERD_WRITE_UNQUALIFIED = 1U << 1U, ///< Do not shorten URIs into CURIEs - SERD_WRITE_UNRESOLVED = 1U << 2U, ///< Do not make URIs relative - SERD_WRITE_LAX = 1U << 3U, ///< Tolerate lossy output - SERD_WRITE_TERSE = 1U << 4U, ///< Write terser output without newlines + /** + Escape all non-ASCII characters. + + Although all the supported syntaxes are UTF-8 by definition, this can be + used to escape all non-ASCII characters so that data will survive + transmission through ASCII-only channels. + */ + SERD_WRITE_ASCII = 1U << 0U, + + /** + Write expanded URIs instead of prefixed names. + + This will avoid shortening URIs into CURIEs entirely, even if the output + syntax supports prefixed names. This can be useful for making chunks of + syntax context-free. + */ + SERD_WRITE_EXPANDED = 1U << 1U, + + /** + Write URI references exactly as they are received. + + Normally, the writer resolves URIs against the base URI, so it can + potentially write them as relative URI references. This flag disables + that, so URI nodes are written exactly as they are received. + */ + SERD_WRITE_VERBATIM = 1U << 2U, + + /** + Write terser output without newlines. + + For Turtle and TriG, this enables a terser form of output which only has + newlines at the top level. This can result in very long lines, but is + more compact and useful for making these abbreviated syntaxes line-based. + */ + SERD_WRITE_TERSE = 1U << 3U, + + /** + Tolerate lossy output. + + This will tolerate input that cannot be written without loss, in + particular invalid UTF-8 text. 
Note that this flag should be used + carefully, since it can result in data loss. + */ + SERD_WRITE_LAX = 1U << 4U, } SerdWriterFlag; /// Bitwise OR of #SerdWriterFlag values diff --git a/src/read_ntriples.c b/src/read_ntriples.c index 6822b64f..7a43e4c2 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -240,7 +240,8 @@ read_STRING_LITERAL(SerdReader* const reader, static bool avoid_blank_clashes(const SerdReader* const reader) { - return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG; + return (reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG) && + !(reader->flags & SERD_READ_VERBATIM); } static SerdStatus diff --git a/src/read_turtle.c b/src/read_turtle.c index 042393e0..613b33d2 100644 --- a/src/read_turtle.c +++ b/src/read_turtle.c @@ -372,7 +372,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) return st; } - return (reader->flags & SERD_READ_RELATIVE) + return (reader->flags & SERD_READ_VERBATIM) ? SERD_SUCCESS : resolve_IRIREF(reader, *dest, string_start_offset); } diff --git a/src/writer.c b/src/writer.c index 31daa71c..fa1abd9f 100644 --- a/src/writer.c +++ b/src/writer.c @@ -801,10 +801,10 @@ write_literal(SerdWriter* const writer, SERD_NODISCARD static SerdStatus write_full_uri_node(SerdWriter* const writer, const SerdNode* const node) { - SerdStatus st = SERD_SUCCESS; - const bool resolve_disabled = writer->flags & SERD_WRITE_UNRESOLVED; + SerdStatus st = SERD_SUCCESS; + const bool verbatim = (writer->flags & SERD_WRITE_VERBATIM); - if (resolve_disabled || !serd_env_base_uri(writer->env)) { + if (verbatim || !serd_env_base_uri(writer->env)) { // Resolution disabled or we have no base URI, simply write the node TRY(st, esink("<", 1, writer)); TRY(st, write_uri_from_node(writer, node)); @@ -855,7 +855,7 @@ write_uri_node(SerdWriter* const writer, SerdStringView prefix = {NULL, 0}; SerdStringView suffix = {NULL, 0}; - if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) && + if (has_scheme && 
!(writer->flags & SERD_WRITE_EXPANDED) && !serd_env_qualify(writer->env, string, &prefix, &suffix)) { TRY(st, write_lname(writer, prefix.data, prefix.length)); TRY(st, esink(":", 1, writer)); diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c index 8bf26f2a..080fc9a1 100644 --- a/tools/serd-pipe.c +++ b/tools/serd-pipe.c @@ -141,7 +141,7 @@ main(int argc, char** argv) if (opt == 'a') { writer_flags |= SERD_WRITE_ASCII; } else if (opt == 'f') { - writer_flags |= (SERD_WRITE_UNQUALIFIED | SERD_WRITE_UNRESOLVED); + writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM); } else if (opt == 'h') { return print_usage(prog, false); } else if (opt == 'l') { |