diff options
author | David Robillard <d@drobilla.net> | 2023-03-24 20:59:54 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:07 -0500 |
commit | 65cbb4a13f615658282677fcf04685bae63e893c (patch) | |
tree | b9c66e757cf28ce96906d3426300811645753173 /src | |
parent | c661dbe50d7f634ec5b2863260f41f098fc9c882 (diff) | |
download | serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.gz serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.bz2 serd-65cbb4a13f615658282677fcf04685bae63e893c.zip |
Support writing all escapes in Turtle and TriG prefixed names
Diffstat (limited to 'src')
-rw-r--r-- | src/.clang-tidy | 1 |
-rw-r--r-- | src/turtle.h | 6 |
-rw-r--r-- | src/writer.c | 141 |
3 files changed, 74 insertions, 74 deletions
diff --git a/src/.clang-tidy b/src/.clang-tidy index 638041cc..c2df3e44 100644 --- a/src/.clang-tidy +++ b/src/.clang-tidy @@ -9,7 +9,6 @@ Checks: > -clang-analyzer-valist.Uninitialized, -clang-diagnostic-unused-function, -concurrency-mt-unsafe, - -google-readability-todo, -hicpp-multiway-paths-covered, -hicpp-signed-bitwise, -llvm-header-guard, diff --git a/src/turtle.h b/src/turtle.h index 6e7e3a8d..f794e1e8 100644 --- a/src/turtle.h +++ b/src/turtle.h @@ -8,7 +8,6 @@ #include "string_utils.h" #include <stdbool.h> -#include <string.h> static inline bool is_PN_CHARS_U(const int c) @@ -26,7 +25,10 @@ is_PN_CHARS(const int c) static inline bool is_PN_LOCAL_ESC(const int c) { - return strchr("!#$%&\'()*+,-./;=?@_~", c) != NULL; + return c == '!' || c == '#' || c == '$' || c == '%' || c == '&' || + c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' || + c == ',' || c == '-' || c == '.' || c == '/' || c == ';' || c == '=' || + c == '?' || c == '@' || c == '\\' || c == '_' || c == '~'; } #endif // SERD_SRC_TURTLE_H diff --git a/src/writer.c b/src/writer.c index 7201c976..60c17e11 100644 --- a/src/writer.c +++ b/src/writer.c @@ -9,6 +9,7 @@ #include "string_utils.h" #include "system.h" #include "try.h" +#include "turtle.h" #include "uri_utils.h" #include "world.h" @@ -281,10 +282,10 @@ esink(const void* buf, size_t len, SerdWriter* writer) // Write a single character as a Unicode escape // (Caller prints any single byte characters that don't need escaping) static size_t -write_character(SerdWriter* writer, - const uint8_t* utf8, - size_t* size, - SerdStatus* st) +write_character(SerdWriter* const writer, + const uint8_t* const utf8, + size_t* const size, + SerdStatus* const st) { char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; const uint32_t c = parse_utf8_char(utf8, size); @@ -395,64 +396,77 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node) return ewrite_uri(writer, serd_node_string(node), serd_node_length(node)); } -static bool 
-lname_must_escape(const char c) +SERD_NODISCARD static SerdStatus +write_utf8_percent_escape(SerdWriter* const writer, + const char* const utf8, + const size_t n_bytes) { - /* This arbitrary list of characters, most of which have nothing to do with - Turtle, must be handled as special cases here because the RDF and SPARQL - WGs are apparently intent on making the once elegant Turtle a baroque - and inconsistent mess, throwing elegance and extensibility completely - out the window for no good reason. + static const char hex_chars[] = "0123456789ABCDEF"; - Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped - in local names, so they are not escaped here. */ + SerdStatus st = SERD_SUCCESS; + char escape[4] = {'%', 0, 0, 0}; - switch (c) { - case '\'': - case '!': - case '#': - case '$': - case '%': - case '&': - case '(': - case ')': - case '*': - case '+': - case ',': - case '/': - case ';': - case '=': - case '?': - case '@': - case '~': - return true; - default: - break; + for (size_t i = 0U; i < n_bytes; ++i) { + const uint8_t byte = (uint8_t)utf8[i]; + escape[1] = hex_chars[byte >> 4U]; + escape[2] = hex_chars[byte & 0x0FU]; + + TRY(st, esink(escape, 3, writer)); } - return false; + + return st; +} + +SERD_NODISCARD static SerdStatus +write_PN_LOCAL_ESC(SerdWriter* const writer, const char c) +{ + const char buf[2] = {'\\', c}; + + return esink(buf, sizeof(buf), writer); +} + +SERD_NODISCARD static SerdStatus +write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes) +{ + return is_PN_LOCAL_ESC(utf8[0]) + ? 
write_PN_LOCAL_ESC(writer, utf8[0]) + : write_utf8_percent_escape(writer, utf8, n_bytes); } SERD_NODISCARD static SerdStatus -write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes) +write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes) { SerdStatus st = SERD_SUCCESS; - for (size_t i = 0; i < n_bytes; ++i) { - size_t j = i; // Index of next character that must be escaped - for (; j < n_bytes; ++j) { - if (lname_must_escape(utf8[j])) { - break; - } - } + if (!n_bytes) { + return st; + } - // Bulk write all characters up to this special one - TRY(st, esink(&utf8[i], j - i, writer)); - if ((i = j) == n_bytes) { - break; // Reached end + /* Thanks to the horribly complicated Turtle grammar for prefixed names, + making sure we never write an invalid character is tedious. We need to + handle the first and last characters separately since they have different + sets of valid characters. */ + + // Write first character + size_t first_size = 0U; + const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size); + if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) { + TRY(st, esink(utf8, first_size, writer)); + } else { + TRY(st, write_lname_escape(writer, utf8, first_size)); + } + + // Write middle and last characters + for (size_t i = first_size; i < n_bytes;) { + size_t c_size = 0U; + const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size); + + if (is_PN_CHARS(c) || c == ':' || (c == '.' 
&& (i + 1U < n_bytes))) { + TRY(st, esink(&utf8[i], c_size, writer)); + } else { + TRY(st, write_lname_escape(writer, &utf8[i], c_size)); } - // Write escape - TRY(st, esink("\\", 1, writer)); - TRY(st, esink(&utf8[i], 1, writer)); + i += c_size; } return st; @@ -780,20 +794,6 @@ write_literal(SerdWriter* const writer, return st; } -// Return true iff `buf` is a valid prefixed name prefix or suffix -static bool -is_name(const char* buf, const size_t len) -{ - // TODO: This is more strict than it should be - for (size_t i = 0; i < len; ++i) { - if (!(is_alpha(buf[i]) || is_digit(buf[i]))) { - return false; - } - } - - return true; -} - SERD_NODISCARD static SerdStatus write_full_uri_node(SerdWriter* const writer, const SerdNode* const node) { @@ -837,12 +837,12 @@ write_uri_node(SerdWriter* const writer, const SerdField field) { SerdStatus st = SERD_SUCCESS; - const SerdNode* prefix = NULL; - SerdStringView suffix = {NULL, 0}; const char* const node_str = serd_node_string(node); const bool has_scheme = serd_uri_string_has_scheme(node_str); if (supports_abbrev(writer)) { + const SerdNode* prefix_node = NULL; + SerdStringView suffix = {NULL, 0}; if (field == SERD_PREDICATE && !strcmp(node_str, NS_RDF "type")) { return esink("a", 1, writer); } @@ -852,12 +852,11 @@ write_uri_node(SerdWriter* const writer, } if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) && - serd_env_qualify_in_place(writer->env, node, &prefix, &suffix) && - is_name(serd_node_string(prefix), serd_node_length(prefix)) && - is_name(suffix.data, suffix.length)) { - TRY(st, write_uri_from_node(writer, prefix)); + serd_env_qualify_in_place(writer->env, node, &prefix_node, &suffix)) { + const SerdStringView prefix = serd_node_string_view(prefix_node); + TRY(st, write_lname(writer, prefix.data, prefix.length)); TRY(st, esink(":", 1, writer)); - return ewrite_uri(writer, suffix.data, suffix.length); + return write_lname(writer, suffix.data, suffix.length); } } |