From 343124df71010055c2c1e6cdcadd13d23b2c013a Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 5 Feb 2023 18:39:49 -0500 Subject: [WIP] Add support for URI hex escape decoding --- include/serd/reader.h | 9 +++++++++ include/serd/writer.h | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/serd/reader.h b/include/serd/reader.h index 78b51d00..57c8b2c3 100644 --- a/include/serd/reader.h +++ b/include/serd/reader.h @@ -101,6 +101,15 @@ typedef enum { from documents when the statements are sorted, such as in a model. */ SERD_READ_ORDERED = 1U << 5U, + + /** + Read URIs with percent-encoded UTF-8 characters decoded. + + Normally, percent-encoded octets in URIs are treated as plain text and + preserved. This flag enables UTF-8 decoding of URIs, so octet escapes + like "%7E" in URIs will be decoded to UTF-8 characters like "~". + */ + SERD_READ_DECODED = 1U << 6U, } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values diff --git a/include/serd/writer.h b/include/serd/writer.h index bf54c46c..e3915a86 100644 --- a/include/serd/writer.h +++ b/include/serd/writer.h @@ -42,6 +42,10 @@ typedef enum { Although all the supported syntaxes are UTF-8 by definition, this can be used to escape all non-ASCII characters so that data will survive transmission through ASCII-only channels. + + Non-printable-ASCII characters will be written as "U" escapes like + "\u007F" in string literals, and as hex-encoded UTF-8 bytes like "%7F" in + URIs. */ SERD_WRITE_ASCII = 1U << 0U, @@ -96,6 +100,18 @@ typedef enum { implicit context, so it will only be readable in a suitable enviromnent. */ SERD_WRITE_CONTEXTUAL = 1U << 6U, + + /** + Escape additional characters in RDF Test Cases format. + + This writes "extended" characters as printable ASCII, using "U" escapes in + URIs instead of hex-encoding (escapes like "\u007F" instead of "%7F"). 
+ This is the format used by the outputs in the Turtle test suite (which + predates RDF 1.1 NTriples). This style makes NTriples output + non-canonical, so it generally shouldn't be used except for compatibility + purposes. See <http://www.w3.org/TR/rdf-testcases/#ntriples>. + */ + SERD_WRITE_ESCAPES = 1U << 7U, } SerdWriterFlag; /// Bitwise OR of #SerdWriterFlag values -- cgit v1.2.1