[WIP] Simplify reader and writer flags

[WIP] Testing?
author: David Robillard <d@drobilla.net> 2020-10-10 12:31:29 +0200
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:08 -0500
commit: 155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372 (patch)
tree: 5f5f7009e96379aa0e31cb1db3757f8d6589669f
parent: e8f392d57bf6eba9b62509a32e4073e8b34b18e2 (diff)
download: serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.tar.gz
serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.tar.bz2
serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.zip
6 files changed, 89 insertions, 16 deletions
diff --git a/include/serd/reader.h b/include/serd/reader.h
index b6b9cac3..e25565cb 100644
--- a/include/serd/reader.h
+++ b/include/serd/reader.h
@@ -30,10 +30,43 @@ typedef struct SerdReaderImpl SerdReader;
 
 /// Reader options
 typedef enum {
-  SERD_READ_LAX       = 1U << 0U, ///< Tolerate invalid input where possible
-  SERD_READ_VARIABLES = 1U << 1U, ///< Support variable nodes
-  SERD_READ_PREFIXED  = 1U << 2U, ///< Do not expand prefixed names
-  SERD_READ_RELATIVE  = 1U << 3U, ///< Do not expand relative URI references
+  /**
+     Tolerate invalid input where possible.
+
+     This will attempt to ignore invalid input and continue reading.  Invalid
+     Unicode characters will be replaced with the replacement character, and
+     various other syntactic problems will be ignored.  If there are more
+     severe problems, the reader will try to skip the statement and continue
+     parsing.  This should work reasonably well for line-based syntaxes like
+     NTriples and NQuads, but abbreviated Turtle or TriG may not recover.
+
+     Note that this flag should be used carefully, since it can result in data
+     loss.
+  */
+  SERD_READ_LAX = 1U << 0U,
+
+  /**
+     Support reading variable nodes.
+
+     As an extension, serd supports reading variable nodes with SPARQL-like
+     syntax, for example "?foo" or "$bar".  This can be used for storing
+     graph patterns and templates.
+  */
+  SERD_READ_VARIABLES = 1U << 1U,
+
+  /**
+     Read URIs and blank node labels exactly.
+
+     Normally, the reader expands all relative URIs, and may adjust blank node
+     labels to avoid clashing with generated ones.  This flag disables all of
+     this processing, so that URI references and blank nodes are passed to the
+     sink exactly as they are in the input.
+
+     Note that this does not apply to CURIEs, since serd deliberately does not
+     have a way to represent CURIE nodes.  A bad namespace prefix is considered
+     a syntax error.
+  */
+  SERD_READ_VERBATIM = 1U << 2U,
 } SerdReaderFlag;
 
 /// Bitwise OR of SerdReaderFlag values
diff --git a/include/serd/writer.h b/include/serd/writer.h
index b85c7b82..812b1851 100644
--- a/include/serd/writer.h
+++ b/include/serd/writer.h
@@ -36,11 +36,50 @@ typedef struct SerdWriterImpl SerdWriter;
    does not support abbreviation and is always ASCII.
 */
 typedef enum {
-  SERD_WRITE_ASCII       = 1U << 0U, ///< Escape all non-ASCII characters
-  SERD_WRITE_UNQUALIFIED = 1U << 1U, ///< Do not shorten URIs into CURIEs
-  SERD_WRITE_UNRESOLVED  = 1U << 2U, ///< Do not make URIs relative
-  SERD_WRITE_LAX         = 1U << 3U, ///< Tolerate lossy output
-  SERD_WRITE_TERSE       = 1U << 4U, ///< Write terser output without newlines
+  /**
+     Escape all non-ASCII characters.
+
+     Although all the supported syntaxes are UTF-8 by definition, this can be
+     used to escape all non-ASCII characters so that data will survive
+     transmission through ASCII-only channels.
+  */
+  SERD_WRITE_ASCII = 1U << 0U,
+
+  /**
+     Write expanded URIs instead of prefixed names.
+
+     This will avoid shortening URIs into CURIEs entirely, even if the output
+     syntax supports prefixed names.  This can be useful for making chunks of
+     syntax context-free.
+  */
+  SERD_WRITE_EXPANDED = 1U << 1U,
+
+  /**
+     Write URI references exactly as they are received.
+
+     Normally, the writer resolves URIs against the base URI, so it can
+     potentially write them as relative URI references.  This flag disables
+     that, so URI nodes are written exactly as they are received.
+  */
+  SERD_WRITE_VERBATIM = 1U << 2U,
+
+  /**
+     Write terser output without newlines.
+
+     For Turtle and TriG, this enables a terser form of output which only has
+     newlines at the top level.  This can result in very long lines, but is
+     more compact and useful for making these abbreviated syntaxes line-based.
+  */
+  SERD_WRITE_TERSE = 1U << 3U,
+
+  /**
+     Tolerate lossy output.
+
+     This will tolerate input that cannot be written without loss, in
+     particular invalid UTF-8 text.  Note that this flag should be used
+     carefully, since it can result in data loss.
+  */
+  SERD_WRITE_LAX = 1U << 4U,
 } SerdWriterFlag;
 
 /// Bitwise OR of #SerdWriterFlag values
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
index 6822b64f..7a43e4c2 100644
--- a/src/read_ntriples.c
+++ b/src/read_ntriples.c
@@ -240,7 +240,8 @@ read_STRING_LITERAL(SerdReader* const reader,
 static bool
 avoid_blank_clashes(const SerdReader* const reader)
 {
-  return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
+  return (reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG) &&
+         !(reader->flags & SERD_READ_VERBATIM);
 }
 
 static SerdStatus
diff --git a/src/read_turtle.c b/src/read_turtle.c
index 042393e0..613b33d2 100644
--- a/src/read_turtle.c
+++ b/src/read_turtle.c
@@ -372,7 +372,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
     return st;
   }
 
-  return (reader->flags & SERD_READ_RELATIVE)
+  return (reader->flags & SERD_READ_VERBATIM)
            ? SERD_SUCCESS
            : resolve_IRIREF(reader, *dest, string_start_offset);
 }
diff --git a/src/writer.c b/src/writer.c
index 31daa71c..fa1abd9f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -801,10 +801,10 @@ write_literal(SerdWriter* const        writer,
 SERD_NODISCARD static SerdStatus
 write_full_uri_node(SerdWriter* const writer, const SerdNode* const node)
 {
-  SerdStatus st               = SERD_SUCCESS;
-  const bool resolve_disabled = writer->flags & SERD_WRITE_UNRESOLVED;
+  SerdStatus st       = SERD_SUCCESS;
+  const bool verbatim = (writer->flags & SERD_WRITE_VERBATIM);
 
-  if (resolve_disabled || !serd_env_base_uri(writer->env)) {
+  if (verbatim || !serd_env_base_uri(writer->env)) {
     // Resolution disabled or we have no base URI, simply write the node
     TRY(st, esink("<", 1, writer));
     TRY(st, write_uri_from_node(writer, node));
@@ -855,7 +855,7 @@ write_uri_node(SerdWriter* const     writer,
 
     SerdStringView prefix = {NULL, 0};
     SerdStringView suffix = {NULL, 0};
-    if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) &&
+    if (has_scheme && !(writer->flags & SERD_WRITE_EXPANDED) &&
         !serd_env_qualify(writer->env, string, &prefix, &suffix)) {
       TRY(st, write_lname(writer, prefix.data, prefix.length));
       TRY(st, esink(":", 1, writer));
diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c
index 8bf26f2a..080fc9a1 100644
--- a/tools/serd-pipe.c
+++ b/tools/serd-pipe.c
@@ -141,7 +141,7 @@ main(int argc, char** argv)
       if (opt == 'a') {
         writer_flags |= SERD_WRITE_ASCII;
       } else if (opt == 'f') {
-        writer_flags |= (SERD_WRITE_UNQUALIFIED | SERD_WRITE_UNRESOLVED);
+        writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM);
       } else if (opt == 'h') {
         return print_usage(prog, false);
       } else if (opt == 'l') {
author	David Robillard <d@drobilla.net>	2020-10-10 12:31:29 +0200
committer	David Robillard <d@drobilla.net>	2023-12-02 18:49:08 -0500
commit	155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372 (patch)
tree	5f5f7009e96379aa0e31cb1db3757f8d6589669f
parent	e8f392d57bf6eba9b62509a32e4073e8b34b18e2 (diff)
download	serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.tar.gz serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.tar.bz2 serd-155f5e2f24c24f5b5ffbf13fcea5cf1a355ec372.zip