diff options
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | src/.clang-tidy | 1 | ||||
-rw-r--r-- | src/n3.c | 42 | ||||
-rw-r--r-- | src/writer.c | 151 | ||||
-rw-r--r-- | test/test_writer.c | 77 |
5 files changed, 186 insertions, 88 deletions
@@ -19,10 +19,11 @@ serd (1.0.1) unstable; * Rename SerdChunk to SerdStringView * Simplify streaming API and improve pretty printing * Simplify writer style options + * Support writing all escapes in Turtle and TriG prefixed names * Use a fixed-size reader stack * Use char* for strings in public API - -- David Robillard <d@drobilla.net> Sat, 17 Jul 2021 18:19:48 +0000 + -- David Robillard <d@drobilla.net> Sat, 31 Jul 2021 23:27:35 +0000 serd (0.30.11) unstable; diff --git a/src/.clang-tidy b/src/.clang-tidy index 6029eeaa..5cf5e873 100644 --- a/src/.clang-tidy +++ b/src/.clang-tidy @@ -10,7 +10,6 @@ Checks: > -bugprone-suspicious-string-compare, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, -concurrency-mt-unsafe, - -google-readability-todo, -hicpp-multiway-paths-covered, -hicpp-signed-bitwise, -llvm-header-guard, @@ -658,6 +658,13 @@ read_anon(SerdReader* const reader, return eat_byte_check(reader, ']'); } +static bool +node_has_string(const SerdNode* const node, const SerdStringView string) +{ + return node->length == string.len && + !memcmp(serd_node_string(node), string.buf, string.len); +} + // Read a "named" object: a boolean literal or a prefixed name static SerdStatus read_named_object(SerdReader* const reader, @@ -672,38 +679,33 @@ read_named_object(SerdReader* const reader, characters, so this is more tedious to deal with in a non-tokenizing parser like this one. - Deal with this here by first reading the prefix into a tentative node. If - it turns out to be "true" or "false", switch it to a boolean literal after - the fact. */ + Deal with this here by trying to read a prefixed node, then if it turns + out to actually be "true" or "false", switch it to a boolean literal. */ if (!(*dest = push_node(reader, SERD_URI, "", 0))) { return SERD_ERR_OVERFLOW; } - const size_t string_start_offset = reader->stack.size; - SerdNode* const node = *dest; - SerdStatus st = SERD_SUCCESS; - while (!(st = read_PN_CHARS_BASE(reader, node))) { - } + SerdNode* node = *dest; + SerdStatus st = SERD_SUCCESS; - if (st > SERD_FAILURE) { - return st; - } + // Attempt to read a prefixed name + st = read_PrefixedName(reader, node, true, ate_dot, reader->stack.size); - if ((node->length == 4 && !memcmp(serd_node_string(node), "true", 4)) || - (node->length == 5 && !memcmp(serd_node_string(node), "false", 5))) { - node->flags |= SERD_HAS_DATATYPE; - node->type = SERD_LITERAL; + // Check if this is actually a special boolean node + if (st == SERD_FAILURE && (node_has_string(node, SERD_STRING("true")) || + node_has_string(node, SERD_STRING("false")))) { + node->flags = SERD_HAS_DATATYPE; + node->type = SERD_LITERAL; return push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN) ? SERD_SUCCESS : SERD_ERR_OVERFLOW; } - if ((st = read_PN_PREFIX_tail(reader, node)) > SERD_FAILURE || - (st = read_PrefixedName( - reader, node, false, ate_dot, string_start_offset))) { - st = (st > SERD_FAILURE) ? st : SERD_ERR_BAD_SYNTAX; - return r_err(reader, st, "expected prefixed name"); + // Any other failure is a syntax error + if (st) { + st = st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX; + return r_err(reader, st, "expected prefixed name or boolean"); } return SERD_SUCCESS; diff --git a/src/writer.c b/src/writer.c index e73533e7..d139ef9d 100644 --- a/src/writer.c +++ b/src/writer.c @@ -23,6 +23,7 @@ #include "string_utils.h" #include "system.h" #include "try.h" +#include "turtle.h" #include "uri_utils.h" #include "world.h" @@ -395,64 +396,97 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node) return ewrite_uri(writer, serd_node_string(node), node->length); } -static bool -lname_must_escape(const char c) +SERD_WARN_UNUSED_RESULT static SerdStatus +write_utf8_percent_escape(SerdWriter* const writer, + const char* const utf8, + const size_t n_bytes) { - /* This arbitrary list of characters, most of which have nothing to do with - Turtle, must be handled as special cases here because the RDF and SPARQL - WGs are apparently intent on making the once elegant Turtle a baroque - and inconsistent mess, throwing elegance and extensibility completely - out the window for no good reason. + SerdStatus st = SERD_SUCCESS; + char escape[4] = {0, 0, 0, 0}; - Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped - in local names, so they are not escaped here. */ + for (size_t i = 0u; i < n_bytes; ++i) { + snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]); + TRY(st, esink(escape, 3, writer)); + } - switch (c) { - case '\'': - case '!': - case '#': - case '$': - case '%': - case '&': - case '(': - case ')': - case '*': - case '+': - case ',': - case '/': - case ';': - case '=': - case '?': - case '@': - case '~': - return true; - default: - break; + return st; +} + +SERD_WARN_UNUSED_RESULT static SerdStatus +write_PN_LOCAL_ESC(SerdWriter* const writer, const char c) +{ + SerdStatus st = SERD_SUCCESS; + + if (!(st = esink("\\", 1, writer))) { + st = esink(&c, 1, writer); } - return false; + + return st; +} + +SERD_WARN_UNUSED_RESULT static SerdStatus +write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes) +{ + SerdStatus st = SERD_SUCCESS; + + if (is_PN_LOCAL_ESC(utf8[0])) { + st = write_PN_LOCAL_ESC(writer, utf8[0]); + } else { + st = write_utf8_percent_escape(writer, utf8, n_bytes); + } + + return st; } SERD_WARN_UNUSED_RESULT static SerdStatus write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes) { SerdStatus st = SERD_SUCCESS; - for (size_t i = 0; i < n_bytes; ++i) { - size_t j = i; // Index of next character that must be escaped - for (; j < n_bytes; ++j) { - if (lname_must_escape(utf8[j])) { - break; - } + if (!n_bytes) { + return st; + } + + /* Thanks to the horribly complicated Turtle grammar for prefixed names, + making sure we never write an invalid character is tedious. We need to + handle the first and last characters separately since they have different + sets of valid characters. */ + + // Write first character + size_t first_size = 0u; + const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size); + if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) { + st = esink(utf8, first_size, writer); + } else { + st = write_lname_escape(writer, utf8, first_size); + } + + // Write middle characters + size_t i = first_size; + while (!st && i < n_bytes - 1u) { + size_t c_size = 0u; + const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size); + if (i + c_size >= n_bytes) { + break; } - // Bulk write all characters up to this special one - TRY(st, esink(&utf8[i], j - i, writer)); - if ((i = j) == n_bytes) { - break; // Reached end + if (is_PN_CHARS(c) || c == '.' || c == ':') { + st = esink(&utf8[i], c_size, writer); + } else { + st = write_lname_escape(writer, &utf8[i], c_size); } - // Write escape - TRY(st, esink("\\", 1, writer)); - TRY(st, esink(&utf8[i], 1, writer)); + i += c_size; + } + + // Write last character + if (!st && i < n_bytes) { + size_t last_size = 0u; + const int last = (int)parse_utf8_char((const uint8_t*)utf8 + i, &last_size); + if (is_PN_CHARS(last) || last == ':') { + st = esink(&utf8[i], last_size, writer); + } else { + st = write_lname_escape(writer, &utf8[i], last_size); + } } return st; @@ -756,20 +790,6 @@ write_literal(SerdWriter* const writer, return st; } -// Return true iff `buf` is a valid prefixed name prefix or suffix -static bool -is_name(const char* buf, const size_t len) -{ - // TODO: This is more strict than it should be - for (size_t i = 0; i < len; ++i) { - if (!(is_alpha(buf[i]) || is_digit(buf[i]) || lname_must_escape(buf[i]))) { - return false; - } - } - - return true; -} - SERD_WARN_UNUSED_RESULT static SerdStatus write_full_uri_node(SerdWriter* const writer, const SerdNode* const node) { @@ -812,11 +832,12 @@ write_uri_node(SerdWriter* const writer, const SerdNode* const node, const SerdField field) { - SerdStatus st = SERD_SUCCESS; - SerdStringView prefix = {NULL, 0}; - SerdStringView suffix = {NULL, 0}; - const char* node_str = serd_node_string(node); - const bool has_scheme = serd_uri_string_has_scheme(node_str); + SerdStatus st = SERD_SUCCESS; + SerdStringView prefix = {NULL, 0}; + SerdStringView suffix = {NULL, 0}; + const SerdStringView node_view = serd_node_string_view(node); + const char* node_str = serd_node_string(node); + const bool has_scheme = serd_uri_string_has_scheme(node_str); if (supports_abbrev(writer)) { if (field == SERD_PREDICATE && serd_node_equals(node, writer->world->rdf_type)) { @@ -828,9 +849,7 @@ write_uri_node(SerdWriter* const writer, } if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) && - !serd_env_qualify( - writer->env, serd_node_string_view(node), &prefix, &suffix) && - is_name(prefix.buf, prefix.len) && is_name(suffix.buf, suffix.len)) { + !serd_env_qualify(writer->env, node_view, &prefix, &suffix)) { TRY(st, write_lname(writer, prefix.buf, prefix.len)); TRY(st, esink(":", 1, writer)); return write_lname(writer, suffix.buf, suffix.len); diff --git a/test/test_writer.c b/test/test_writer.c index fd650ecd..9a59d92f 100644 --- a/test/test_writer.c +++ b/test/test_writer.c @@ -22,6 +22,7 @@ #include <errno.h> #include <stdint.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> static void @@ -329,6 +330,81 @@ test_write_bad_uri(void) serd_world_free(world); } +static void +check_pname_escape(const char* const lname, const char* const expected) +{ + SerdWorld* world = serd_world_new(); + SerdNodes* nodes = serd_world_nodes(world); + SerdEnv* env = serd_env_new(SERD_EMPTY_STRING()); + SerdBuffer buffer = {NULL, 0}; + SerdByteSink* byte_sink = serd_byte_sink_new_buffer(&buffer); + + SerdWriter* writer = serd_writer_new(world, SERD_TURTLE, 0u, env, byte_sink); + assert(writer); + + static const char* const prefix = "http://example.org/"; + const size_t prefix_len = strlen(prefix); + + serd_env_set_prefix(env, SERD_STRING("eg"), SERD_STRING(prefix)); + + const SerdNode* s = + serd_nodes_uri(nodes, SERD_STRING("http://example.org/s")); + + const SerdNode* p = + serd_nodes_uri(nodes, SERD_STRING("http://example.org/p")); + + char* const uri = (char*)calloc(1, prefix_len + strlen(lname) + 1); + memcpy(uri, prefix, prefix_len + 1); + memcpy(uri + prefix_len, lname, strlen(lname) + 1); + + const SerdNode* node = serd_nodes_uri(nodes, SERD_STRING(uri)); + + assert(!serd_sink_write(serd_writer_sink(writer), 0, s, p, node, NULL)); + + serd_writer_free(writer); + serd_byte_sink_free(byte_sink); + serd_env_free(env); + + char* out = serd_buffer_sink_finish(&buffer); + + assert(!strcmp((char*)out, expected)); + serd_free(out); + + free(uri); + serd_world_free(world); +} + +static void +test_write_pname_escapes(void) +{ + // Check that '.' is escaped only at the start and end + check_pname_escape(".xyz", "eg:s\n\teg:p eg:\\.xyz .\n"); + check_pname_escape("w.yz", "eg:s\n\teg:p eg:w.yz .\n"); + check_pname_escape("wx.z", "eg:s\n\teg:p eg:wx.z .\n"); + check_pname_escape("wxy.", "eg:s\n\teg:p eg:wxy\\. .\n"); + + // Check that ':' is not escaped anywhere + check_pname_escape(":xyz", "eg:s\n\teg:p eg::xyz .\n"); + check_pname_escape("w:yz", "eg:s\n\teg:p eg:w:yz .\n"); + check_pname_escape("wx:z", "eg:s\n\teg:p eg:wx:z .\n"); + check_pname_escape("wxy:", "eg:s\n\teg:p eg:wxy: .\n"); + + // Check that special characters like '~' are escaped everywhere + check_pname_escape("~xyz", "eg:s\n\teg:p eg:\\~xyz .\n"); + check_pname_escape("w~yz", "eg:s\n\teg:p eg:w\\~yz .\n"); + check_pname_escape("wx~z", "eg:s\n\teg:p eg:wx\\~z .\n"); + check_pname_escape("wxy~", "eg:s\n\teg:p eg:wxy\\~ .\n"); + + // Check that out of range multi-byte characters are escaped everywhere + static const char first_escape[] = {(char)0xC3u, (char)0xB7u, 'y', 'z', 0}; + static const char mid_escape[] = {'w', (char)0xC3u, (char)0xB7u, 'z', 0}; + static const char last_escape[] = {'w', 'x', (char)0xC3u, (char)0xB7u, 0}; + + check_pname_escape((const char*)first_escape, "eg:s\n\teg:p eg:%C3%B7yz .\n"); + check_pname_escape((const char*)mid_escape, "eg:s\n\teg:p eg:w%C3%B7z .\n"); + check_pname_escape((const char*)last_escape, "eg:s\n\teg:p eg:wx%C3%B7 .\n"); +} + int main(void) { @@ -339,6 +415,7 @@ main(void) test_write_error(); test_write_empty_syntax(); test_write_bad_uri(); + test_write_pname_escapes(); return 0; } |