diff options
author | David Robillard <d@drobilla.net> | 2021-07-22 15:26:22 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2022-01-14 19:37:51 -0500 |
commit | ca3a7049506cd1ba91326a36fc02a7319657728c (patch) | |
tree | 7289c877d9dc6286ab1826fab45e8a1612e8bc20 /src | |
parent | 21f17ad27b3e805003e50b0f5fcbe606bfef0f3b (diff) | |
download | serd-ca3a7049506cd1ba91326a36fc02a7319657728c.tar.gz serd-ca3a7049506cd1ba91326a36fc02a7319657728c.tar.bz2 serd-ca3a7049506cd1ba91326a36fc02a7319657728c.zip |
Preserve long or short quoting from input documents
Diffstat (limited to 'src')
-rw-r--r-- | src/n3.c | 5 | ||||
-rw-r--r-- | src/node.c | 141 | ||||
-rw-r--r-- | src/nodes.c | 19 | ||||
-rw-r--r-- | src/read_ntriples.c | 22 | ||||
-rw-r--r-- | src/string.c | 51 | ||||
-rw-r--r-- | src/string_utils.h | 5 | ||||
-rw-r--r-- | src/writer.c | 3 |
7 files changed, 88 insertions, 158 deletions
@@ -114,7 +114,7 @@ read_STRING_LITERAL_LONG(SerdReader* const reader, eat_byte_safe(reader, q3); break; } - ref->flags |= SERD_HAS_QUOTE; + if (!(st = push_byte(reader, ref, c))) { st = read_character(reader, ref, (uint8_t)q2); } @@ -151,7 +151,10 @@ read_String(SerdReader* const reader, SerdNode* const node) return SERD_SUCCESS; } + // Long string eat_byte_safe(reader, q3); + node->flags |= SERD_IS_LONG; + return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1); } @@ -173,8 +173,8 @@ serd_new_simple_node(const SerdNodeType type, const SerdStringView str) return NULL; } - SerdNodeFlags flags = 0; - const size_t length = str.buf ? serd_strlen(str.buf, &flags) : 0; + SerdNodeFlags flags = 0u; + const size_t length = str.buf ? str.len : 0u; SerdNode* node = serd_node_malloc(length, flags, type); if (node) { @@ -193,87 +193,93 @@ serd_new_simple_node(const SerdNodeType type, const SerdStringView str) SerdNode* serd_new_string(const SerdStringView str) { - SerdNodeFlags flags = 0; - const size_t length = serd_substrlen(str.buf, str.len, &flags); - SerdNode* node = serd_node_malloc(length, flags, SERD_LITERAL); - - memcpy(serd_node_buffer(node), str.buf, str.len); - node->length = length; - - serd_node_check_padding(node); - return node; -} - -/// Internal pre-measured implementation of serd_new_plain_literal -static SerdNode* -serd_new_plain_literal_i(const SerdStringView str, - SerdNodeFlags flags, - const SerdStringView lang) -{ - assert(str.len); - assert(lang.len); - - flags |= SERD_HAS_LANGUAGE; - - const size_t len = serd_node_pad_length(str.len); - const size_t total_len = len + sizeof(SerdNode) + lang.len; + SerdNodeFlags flags = 0u; + SerdNode* node = serd_node_malloc(str.len, flags, SERD_LITERAL); - SerdNode* node = serd_node_malloc(total_len, flags, SERD_LITERAL); - memcpy(serd_node_buffer(node), str.buf, str.len); - node->length = str.len; + if (node) { + if (str.buf && str.len) { + memcpy(serd_node_buffer(node), str.buf, str.len); + } - SerdNode* lang_node = node + 1 + (len / sizeof(SerdNode)); - lang_node->type = SERD_LITERAL; - lang_node->length = lang.len; - memcpy(serd_node_buffer(lang_node), lang.buf, lang.len); - serd_node_check_padding(lang_node); + node->length = str.len; + serd_node_check_padding(node); + } - serd_node_check_padding(node); return node; } -SerdNode* -serd_new_plain_literal(const SerdStringView str, const SerdStringView lang) +SERD_PURE_FUNC +static bool +is_langtag(const SerdStringView string) { - if (!lang.len) { - return serd_new_string(str); + // First character must be a letter + size_t i = 0; + if (!string.len || !is_alpha(string.buf[i])) { + return false; } - SerdNodeFlags flags = 0; - serd_strlen(str.buf, &flags); + // First component must be all letters + while (++i < string.len && string.buf[i] && string.buf[i] != '-') { + if (!is_alpha(string.buf[i])) { + return false; + } + } + + // Following components can have letters and digits + while (i < string.len && string.buf[i] == '-') { + while (++i < string.len && string.buf[i] && string.buf[i] != '-') { + const char c = string.buf[i]; + if (!is_alpha(c) && !is_digit(c)) { + return false; + } + } + } - return serd_new_plain_literal_i(str, flags, lang); + return true; } SerdNode* -serd_new_typed_literal(const SerdStringView str, - const SerdStringView datatype_uri) +serd_new_literal(const SerdStringView string, + const SerdNodeFlags flags, + const SerdStringView meta) { - if (!datatype_uri.len) { - return serd_new_string(str); + if (!(flags & (SERD_HAS_DATATYPE | SERD_HAS_LANGUAGE))) { + SerdNode* node = serd_node_malloc(string.len, flags, SERD_LITERAL); + + memcpy(serd_node_buffer(node), string.buf, string.len); + node->length = string.len; + serd_node_check_padding(node); + return node; } - if (!strcmp(datatype_uri.buf, NS_RDF "langString")) { + if ((flags & SERD_HAS_DATATYPE) && (flags & SERD_HAS_LANGUAGE)) { return NULL; } - SerdNodeFlags flags = 0u; - serd_strlen(str.buf, &flags); + if (!meta.len) { + return NULL; + } - flags |= SERD_HAS_DATATYPE; + if (((flags & SERD_HAS_DATATYPE) && + (!serd_uri_string_has_scheme(meta.buf) || + !strcmp(meta.buf, NS_RDF "langString"))) || + ((flags & SERD_HAS_LANGUAGE) && !is_langtag(meta))) { + return NULL; + } - const size_t len = serd_node_pad_length(str.len); - const size_t total_len = len + sizeof(SerdNode) + datatype_uri.len; + const size_t len = serd_node_pad_length(string.len); + const size_t meta_len = serd_node_pad_length(meta.len); + const size_t meta_size = sizeof(SerdNode) + meta_len; - SerdNode* node = serd_node_malloc(total_len, flags, SERD_LITERAL); - memcpy(serd_node_buffer(node), str.buf, str.len); - node->length = str.len; + SerdNode* node = serd_node_malloc(len + meta_size, flags, SERD_LITERAL); + memcpy(serd_node_buffer(node), string.buf, string.len); + node->length = string.len; - SerdNode* datatype_node = node + 1 + (len / sizeof(SerdNode)); - datatype_node->length = datatype_uri.len; - datatype_node->type = SERD_URI; - memcpy(serd_node_buffer(datatype_node), datatype_uri.buf, datatype_uri.len); - serd_node_check_padding(datatype_node); + SerdNode* meta_node = node + 1u + (len / sizeof(SerdNode)); + meta_node->length = meta.len; + meta_node->type = (flags & SERD_HAS_DATATYPE) ? SERD_URI : SERD_LITERAL; + memcpy(serd_node_buffer(meta_node), meta.buf, meta.len); + serd_node_check_padding(meta_node); serd_node_check_padding(node); return node; @@ -564,8 +570,9 @@ serd_new_double(const double d) const ExessResult r = exess_write_double(d, sizeof(buf), buf); return r.status ? NULL - : serd_new_typed_literal(SERD_SUBSTRING(buf, r.count), - SERD_STRING(EXESS_XSD_URI "double")); + : serd_new_literal(SERD_SUBSTRING(buf, r.count), + SERD_HAS_DATATYPE, + SERD_STRING(EXESS_XSD_URI "double")); } SerdNode* @@ -576,15 +583,17 @@ serd_new_float(const float f) const ExessResult r = exess_write_float(f, sizeof(buf), buf); return r.status ? NULL - : serd_new_typed_literal(SERD_SUBSTRING(buf, r.count), - SERD_STRING(EXESS_XSD_URI "float")); + : serd_new_literal(SERD_SUBSTRING(buf, r.count), + SERD_HAS_DATATYPE, + SERD_STRING(EXESS_XSD_URI "float")); } SerdNode* serd_new_boolean(bool b) { - return serd_new_typed_literal(b ? SERD_STRING("true") : SERD_STRING("false"), - serd_node_string_view(&serd_xsd_boolean.node)); + return serd_new_literal(b ? SERD_STRING("true") : SERD_STRING("false"), + SERD_HAS_DATATYPE, + serd_node_string_view(&serd_xsd_boolean.node)); } SerdNode* diff --git a/src/nodes.c b/src/nodes.c index 412d0d24..51f354bb 100644 --- a/src/nodes.c +++ b/src/nodes.c @@ -156,20 +156,13 @@ serd_nodes_string(SerdNodes* const nodes, const SerdStringView string) return serd_nodes_manage(nodes, serd_new_string(string)); } -const SerdNode* -serd_nodes_plain_literal(SerdNodes* const nodes, - const SerdStringView string, - const SerdStringView language) -{ - return serd_nodes_manage(nodes, serd_new_plain_literal(string, language)); -} - -const SerdNode* -serd_nodes_typed_literal(SerdNodes* const nodes, - const SerdStringView string, - const SerdStringView datatype_uri) +const SerdNode* SERD_ALLOCATED +serd_nodes_literal(SerdNodes* const nodes, + const SerdStringView string, + const SerdNodeFlags flags, + const SerdStringView meta) { - return serd_nodes_manage(nodes, serd_new_typed_literal(string, datatype_uri)); + return serd_nodes_manage(nodes, serd_new_literal(string, flags, meta)); } const SerdNode* diff --git a/src/read_ntriples.c b/src/read_ntriples.c index 78e46634..017c4dcf 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -206,24 +206,8 @@ read_IRI(SerdReader* const reader, SerdNode** const dest) SerdStatus read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c) { - if (!(c & 0x80)) { - switch (c) { - case 0xA: - case 0xD: - dest->flags |= SERD_HAS_NEWLINE; - break; - case '"': - case '\'': - dest->flags |= SERD_HAS_QUOTE; - break; - default: - break; - } - - return push_byte(reader, dest, c); - } - - return read_utf8_continuation(reader, dest, c); + return !(c & 0x80) ? push_byte(reader, dest, c) + : read_utf8_continuation(reader, dest, c); } /// [9] STRING_LITERAL_QUOTE @@ -432,11 +416,9 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest) eat_byte_safe(reader, 'b'); return push_byte(reader, dest, '\b'); case 'n': - dest->flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'n'); return push_byte(reader, dest, '\n'); case 'r': - dest->flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'r'); return push_byte(reader, dest, '\r'); case 'f': diff --git a/src/string.c b/src/string.c index 97c1432b..2d25fb80 100644 --- a/src/string.c +++ b/src/string.c @@ -14,13 +14,9 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include "string_utils.h" - #include "serd/serd.h" -#include <assert.h> #include <stdlib.h> -#include <string.h> void serd_free(void* const ptr) @@ -66,50 +62,3 @@ serd_strerror(const SerdStatus status) return "Unknown error"; } - -static void -serd_update_flags(const char c, SerdNodeFlags* const flags) -{ - switch (c) { - case '\r': - case '\n': - *flags |= SERD_HAS_NEWLINE; - break; - case '"': - *flags |= SERD_HAS_QUOTE; - break; - default: - break; - } -} - -size_t -serd_substrlen(const char* const str, - const size_t len, - SerdNodeFlags* const flags) -{ - assert(flags); - - size_t i = 0; - *flags = 0; - for (; i < len && str[i]; ++i) { - serd_update_flags(str[i], flags); - } - - return i; -} - -size_t -serd_strlen(const char* const str, SerdNodeFlags* const flags) -{ - if (flags) { - size_t i = 0; - *flags = 0; - for (; str[i]; ++i) { - serd_update_flags(str[i], flags); - } - return i; - } - - return strlen(str); -} diff --git a/src/string_utils.h b/src/string_utils.h index a411b90d..54f7877c 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -17,8 +17,6 @@ #ifndef SERD_STRING_UTILS_H #define SERD_STRING_UTILS_H -#include "serd/serd.h" - #include <stdbool.h> #include <stddef.h> #include <stdint.h> @@ -90,9 +88,6 @@ is_windows_path(const char* path) (path[2] == '/' || path[2] == '\\'); } -size_t -serd_substrlen(const char* str, size_t len, SerdNodeFlags* flags); - static inline char serd_to_upper(const char c) { diff --git a/src/writer.c b/src/writer.c index c6a91d78..5416d144 100644 --- a/src/writer.c +++ b/src/writer.c @@ -737,8 +737,7 @@ write_literal(SerdWriter* const writer, } SerdStatus st = SERD_SUCCESS; - if (supports_abbrev(writer) && - (node->flags & (SERD_HAS_NEWLINE | SERD_HAS_QUOTE))) { + if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) { TRY(st, esink("\"\"\"", 3, writer)); TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length)); TRY(st, esink("\"\"\"", 3, writer)); |