diff options
author | David Robillard <d@drobilla.net> | 2023-03-24 20:59:54 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:07 -0500 |
commit | 65cbb4a13f615658282677fcf04685bae63e893c (patch) | |
tree | b9c66e757cf28ce96906d3426300811645753173 | |
parent | c661dbe50d7f634ec5b2863260f41f098fc9c882 (diff) | |
download | serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.gz serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.bz2 serd-65cbb4a13f615658282677fcf04685bae63e893c.zip |
Support writing all escapes in Turtle and TriG prefixed names
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | src/.clang-tidy | 1 | ||||
-rw-r--r-- | src/turtle.h | 6 | ||||
-rw-r--r-- | src/writer.c | 141 | ||||
-rw-r--r-- | test/extra/qualify/qualify-in.ttl | 3 | ||||
-rw-r--r-- | test/extra/qualify/qualify-out.ttl | 7 | ||||
-rw-r--r-- | test/test_writer.c | 74 |
7 files changed, 158 insertions, 77 deletions
@@ -13,10 +13,11 @@ serd (1.1.1) unstable; urgency=medium * Rename SerdChunk to SerdStringView * Simplify statement flags * Simplify writer style options and write UTF-8 by default + * Support writing all escapes in Turtle and TriG prefixed names * Use a fixed-size reader stack * Use char* for strings in public API - -- David Robillard <d@drobilla.net> Wed, 13 Jul 2022 20:39:07 +0000 + -- David Robillard <d@drobilla.net> Wed, 13 Jul 2022 21:43:56 +0000 serd (0.32.0) stable; urgency=medium diff --git a/src/.clang-tidy b/src/.clang-tidy index 638041cc..c2df3e44 100644 --- a/src/.clang-tidy +++ b/src/.clang-tidy @@ -9,7 +9,6 @@ Checks: > -clang-analyzer-valist.Uninitialized, -clang-diagnostic-unused-function, -concurrency-mt-unsafe, - -google-readability-todo, -hicpp-multiway-paths-covered, -hicpp-signed-bitwise, -llvm-header-guard, diff --git a/src/turtle.h b/src/turtle.h index 6e7e3a8d..f794e1e8 100644 --- a/src/turtle.h +++ b/src/turtle.h @@ -8,7 +8,6 @@ #include "string_utils.h" #include <stdbool.h> -#include <string.h> static inline bool is_PN_CHARS_U(const int c) @@ -26,7 +25,10 @@ is_PN_CHARS(const int c) static inline bool is_PN_LOCAL_ESC(const int c) { - return strchr("!#$%&\'()*+,-./;=?@_~", c) != NULL; + return c == '!' || c == '#' || c == '$' || c == '%' || c == '&' || + c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' || + c == ',' || c == '-' || c == '.' || c == '/' || c == ';' || c == '=' || + c == '?' || c == '@' || c == '\\' || c == '_' || c == '~'; } #endif // SERD_SRC_TURTLE_H diff --git a/src/writer.c b/src/writer.c index 7201c976..60c17e11 100644 --- a/src/writer.c +++ b/src/writer.c @@ -9,6 +9,7 @@ #include "string_utils.h" #include "system.h" #include "try.h" +#include "turtle.h" #include "uri_utils.h" #include "world.h" @@ -281,10 +282,10 @@ esink(const void* buf, size_t len, SerdWriter* writer) // Write a single character as a Unicode escape // (Caller prints any single byte characters that don't need escaping) static size_t -write_character(SerdWriter* writer, - const uint8_t* utf8, - size_t* size, - SerdStatus* st) +write_character(SerdWriter* const writer, + const uint8_t* const utf8, + size_t* const size, + SerdStatus* const st) { char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; const uint32_t c = parse_utf8_char(utf8, size); @@ -395,64 +396,77 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node) return ewrite_uri(writer, serd_node_string(node), serd_node_length(node)); } -static bool -lname_must_escape(const char c) +SERD_NODISCARD static SerdStatus +write_utf8_percent_escape(SerdWriter* const writer, + const char* const utf8, + const size_t n_bytes) { - /* This arbitrary list of characters, most of which have nothing to do with - Turtle, must be handled as special cases here because the RDF and SPARQL - WGs are apparently intent on making the once elegant Turtle a baroque - and inconsistent mess, throwing elegance and extensibility completely - out the window for no good reason. + static const char hex_chars[] = "0123456789ABCDEF"; - Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped - in local names, so they are not escaped here. */ + SerdStatus st = SERD_SUCCESS; + char escape[4] = {'%', 0, 0, 0}; - switch (c) { - case '\'': - case '!': - case '#': - case '$': - case '%': - case '&': - case '(': - case ')': - case '*': - case '+': - case ',': - case '/': - case ';': - case '=': - case '?': - case '@': - case '~': - return true; - default: - break; + for (size_t i = 0U; i < n_bytes; ++i) { + const uint8_t byte = (uint8_t)utf8[i]; + escape[1] = hex_chars[byte >> 4U]; + escape[2] = hex_chars[byte & 0x0FU]; + + TRY(st, esink(escape, 3, writer)); } - return false; + + return st; +} + +SERD_NODISCARD static SerdStatus +write_PN_LOCAL_ESC(SerdWriter* const writer, const char c) +{ + const char buf[2] = {'\\', c}; + + return esink(buf, sizeof(buf), writer); +} + +SERD_NODISCARD static SerdStatus +write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes) +{ + return is_PN_LOCAL_ESC(utf8[0]) + ? write_PN_LOCAL_ESC(writer, utf8[0]) + : write_utf8_percent_escape(writer, utf8, n_bytes); } SERD_NODISCARD static SerdStatus -write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes) +write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes) { SerdStatus st = SERD_SUCCESS; - for (size_t i = 0; i < n_bytes; ++i) { - size_t j = i; // Index of next character that must be escaped - for (; j < n_bytes; ++j) { - if (lname_must_escape(utf8[j])) { - break; - } - } + if (!n_bytes) { + return st; + } - // Bulk write all characters up to this special one - TRY(st, esink(&utf8[i], j - i, writer)); - if ((i = j) == n_bytes) { - break; // Reached end + /* Thanks to the horribly complicated Turtle grammar for prefixed names, + making sure we never write an invalid character is tedious. We need to + handle the first and last characters separately since they have different + sets of valid characters. */ + + // Write first character + size_t first_size = 0U; + const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size); + if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) { + TRY(st, esink(utf8, first_size, writer)); + } else { + TRY(st, write_lname_escape(writer, utf8, first_size)); + } + + // Write middle and last characters + for (size_t i = first_size; i < n_bytes;) { + size_t c_size = 0U; + const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size); + + if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) { + TRY(st, esink(&utf8[i], c_size, writer)); + } else { + TRY(st, write_lname_escape(writer, &utf8[i], c_size)); } - // Write escape - TRY(st, esink("\\", 1, writer)); - TRY(st, esink(&utf8[i], 1, writer)); + i += c_size; } return st; @@ -780,20 +794,6 @@ write_literal(SerdWriter* const writer, return st; } -// Return true iff `buf` is a valid prefixed name prefix or suffix -static bool -is_name(const char* buf, const size_t len) -{ - // TODO: This is more strict than it should be - for (size_t i = 0; i < len; ++i) { - if (!(is_alpha(buf[i]) || is_digit(buf[i]))) { - return false; - } - } - - return true; -} - SERD_NODISCARD static SerdStatus write_full_uri_node(SerdWriter* const writer, const SerdNode* const node) { @@ -837,12 +837,12 @@ write_uri_node(SerdWriter* const writer, const SerdField field) { SerdStatus st = SERD_SUCCESS; - const SerdNode* prefix = NULL; - SerdStringView suffix = {NULL, 0}; const char* const node_str = serd_node_string(node); const bool has_scheme = serd_uri_string_has_scheme(node_str); if (supports_abbrev(writer)) { + const SerdNode* prefix_node = NULL; + SerdStringView suffix = {NULL, 0}; if (field == SERD_PREDICATE && !strcmp(node_str, NS_RDF "type")) { return esink("a", 1, writer); } @@ -852,12 +852,11 @@ write_uri_node(SerdWriter* const writer, } if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) && - serd_env_qualify_in_place(writer->env, node, &prefix, &suffix) && - is_name(serd_node_string(prefix), serd_node_length(prefix)) && - is_name(suffix.data, suffix.length)) { - TRY(st, write_uri_from_node(writer, prefix)); + serd_env_qualify_in_place(writer->env, node, &prefix_node, &suffix)) { + const SerdStringView prefix = serd_node_string_view(prefix_node); + TRY(st, write_lname(writer, prefix.data, prefix.length)); TRY(st, esink(":", 1, writer)); - return ewrite_uri(writer, suffix.data, suffix.length); + return write_lname(writer, suffix.data, suffix.length); } } diff --git a/test/extra/qualify/qualify-in.ttl b/test/extra/qualify/qualify-in.ttl index 04afc07f..b30e1721 100644 --- a/test/extra/qualify/qualify-in.ttl +++ b/test/extra/qualify/qualify-in.ttl @@ -6,5 +6,8 @@ <http://example.org/a-subject> <http://example.org/a-predicate> <http://example.org/a-object> . +<http://example.org/special-!#$%&'()*+,-./;=?@_~-chars> + <http://example.org/p> <http://example.org/o> . + <http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> <http://www.w3.org/2000/01/rdf-schema#label> "nil" . diff --git a/test/extra/qualify/qualify-out.ttl b/test/extra/qualify/qualify-out.ttl index f4dd15d4..79148017 100644 --- a/test/extra/qualify/qualify-out.ttl +++ b/test/extra/qualify/qualify-out.ttl @@ -3,8 +3,11 @@ eg:s eg:p eg:o . -<http://example.org/a-subject> - <http://example.org/a-predicate> <http://example.org/a-object> . +eg:a-subject + eg:a-predicate eg:a-object . + +eg:special-\!\#\$\%\&\'\(\)\*\+\,-.\/\;\=\?\@_\~-chars + eg:p eg:o . () <http://www.w3.org/2000/01/rdf-schema#label> "nil" . diff --git a/test/test_writer.c b/test/test_writer.c index 6066b6e3..a4d92c5b 100644 --- a/test/test_writer.c +++ b/test/test_writer.c @@ -19,6 +19,7 @@ #include <assert.h> #include <stdint.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> static void @@ -325,6 +326,78 @@ test_write_empty_syntax(void) serd_world_free(world); } +static void +check_pname_escape(const char* const lname, const char* const expected) +{ + SerdWorld* world = serd_world_new(); + SerdEnv* env = serd_env_new(serd_empty_string()); + SerdBuffer buffer = {NULL, 0}; + + SerdWriter* writer = + serd_writer_new(world, SERD_TURTLE, 0U, env, serd_buffer_sink, &buffer); + + assert(writer); + + static const char* const prefix = "http://example.org/"; + const size_t prefix_len = strlen(prefix); + + serd_env_set_prefix(env, serd_string("eg"), serd_string(prefix)); + + SerdNode* s = serd_new_uri(serd_string("http://example.org/s")); + SerdNode* p = serd_new_uri(serd_string("http://example.org/p")); + + char* const uri = (char*)calloc(1, prefix_len + strlen(lname) + 1); + memcpy(uri, prefix, prefix_len + 1); + memcpy(uri + prefix_len, lname, strlen(lname) + 1); + + SerdNode* node = serd_new_uri(serd_string(uri)); + assert(!serd_sink_write(serd_writer_sink(writer), 0, s, p, node, NULL)); + serd_node_free(node); + + free(uri); + serd_node_free(p); + serd_node_free(s); + serd_writer_free(writer); + serd_env_free(env); + + char* out = serd_buffer_sink_finish(&buffer); + assert(!strcmp((char*)out, expected)); + serd_free(out); + + serd_world_free(world); +} + +static void +test_write_pname_escapes(void) +{ + // Check that '.' is escaped only at the start and end + check_pname_escape(".xyz", "eg:s\n\teg:p eg:\\.xyz .\n"); + check_pname_escape("w.yz", "eg:s\n\teg:p eg:w.yz .\n"); + check_pname_escape("wx.z", "eg:s\n\teg:p eg:wx.z .\n"); + check_pname_escape("wxy.", "eg:s\n\teg:p eg:wxy\\. .\n"); + + // Check that ':' is not escaped anywhere + check_pname_escape(":xyz", "eg:s\n\teg:p eg::xyz .\n"); + check_pname_escape("w:yz", "eg:s\n\teg:p eg:w:yz .\n"); + check_pname_escape("wx:z", "eg:s\n\teg:p eg:wx:z .\n"); + check_pname_escape("wxy:", "eg:s\n\teg:p eg:wxy: .\n"); + + // Check that special characters like '~' are escaped everywhere + check_pname_escape("~xyz", "eg:s\n\teg:p eg:\\~xyz .\n"); + check_pname_escape("w~yz", "eg:s\n\teg:p eg:w\\~yz .\n"); + check_pname_escape("wx~z", "eg:s\n\teg:p eg:wx\\~z .\n"); + check_pname_escape("wxy~", "eg:s\n\teg:p eg:wxy\\~ .\n"); + + // Check that out of range multi-byte characters are escaped everywhere + static const char first_escape[] = {(char)0xC3U, (char)0xB7U, 'y', 'z', 0}; + static const char mid_escape[] = {'w', (char)0xC3U, (char)0xB7U, 'z', 0}; + static const char last_escape[] = {'w', 'x', (char)0xC3U, (char)0xB7U, 0}; + + check_pname_escape((const char*)first_escape, "eg:s\n\teg:p eg:%C3%B7yz .\n"); + check_pname_escape((const char*)mid_escape, "eg:s\n\teg:p eg:w%C3%B7z .\n"); + check_pname_escape((const char*)last_escape, "eg:s\n\teg:p eg:wx%C3%B7 .\n"); +} + int main(void) { @@ -336,6 +409,7 @@ main(void) test_write_error(); test_writer_stack_overflow(); test_write_empty_syntax(); + test_write_pname_escapes(); return 0; } |