diff options
author | David Robillard <d@drobilla.net> | 2021-07-22 15:26:22 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 5e4538756d601e6a941c5290777af95ea8848e1a (patch) | |
tree | 9868e188a48a528e9908fcf695147f75790c3a56 /src | |
parent | 64024d0fa6a6dc048b2b846738846da597025f56 (diff) | |
download | serd-5e4538756d601e6a941c5290777af95ea8848e1a.tar.gz serd-5e4538756d601e6a941c5290777af95ea8848e1a.tar.bz2 serd-5e4538756d601e6a941c5290777af95ea8848e1a.zip |
[WIP] Preserve long or short quoting from input documents
Diffstat (limited to 'src')
-rw-r--r-- | src/node.c | 148 | ||||
-rw-r--r-- | src/read_ntriples.c | 22 | ||||
-rw-r--r-- | src/read_turtle.c | 4 | ||||
-rw-r--r-- | src/string.c | 53 | ||||
-rw-r--r-- | src/string_utils.h | 5 | ||||
-rw-r--r-- | src/uri_utils.h | 6 | ||||
-rw-r--r-- | src/writer.c | 3 |
7 files changed, 87 insertions, 154 deletions
@@ -11,7 +11,6 @@ #include "serd/buffer.h" #include "serd/node.h" #include "serd/status.h" -#include "serd/string.h" #include "serd/string_view.h" #include "serd/uri.h" #include "serd/write_result.h" @@ -195,88 +194,92 @@ serd_new_token(const SerdNodeType type, const SerdStringView str) SerdNode* serd_new_string(const SerdStringView str) { - SerdNodeFlags flags = 0; - const size_t length = serd_substrlen(str.data, str.length, &flags); - SerdNode* node = serd_node_malloc(length, flags, SERD_LITERAL); - - memcpy(serd_node_buffer(node), str.data, str.length); - node->length = length; - - serd_node_check_padding(node); - return node; -} - -/// Internal pre-measured implementation of serd_new_plain_literal -static SerdNode* -serd_new_plain_literal_i(const SerdStringView str, - SerdNodeFlags flags, - const SerdStringView lang) -{ - assert(str.length); - assert(lang.length); - - flags |= SERD_HAS_LANGUAGE; - - const size_t len = serd_node_pad_length(str.length); - const size_t total_len = len + sizeof(SerdNode) + lang.length; + SerdNodeFlags flags = 0U; + SerdNode* node = serd_node_malloc(str.length, flags, SERD_LITERAL); - SerdNode* node = serd_node_malloc(total_len, flags, SERD_LITERAL); - memcpy(serd_node_buffer(node), str.data, str.length); - node->length = str.length; + if (node) { + if (str.data && str.length) { + memcpy(serd_node_buffer(node), str.data, str.length); + } - SerdNode* lang_node = node + 1 + (len / sizeof(SerdNode)); - lang_node->type = SERD_LITERAL; - lang_node->length = lang.length; - memcpy(serd_node_buffer(lang_node), lang.data, lang.length); - serd_node_check_padding(lang_node); + node->length = str.length; + serd_node_check_padding(node); + } - serd_node_check_padding(node); return node; } -SerdNode* -serd_new_plain_literal(const SerdStringView str, const SerdStringView lang) +ZIX_PURE_FUNC static bool +is_langtag(const SerdStringView string) { - if (!lang.length) { - return serd_new_string(str); + // First character must be a letter + size_t i = 0; + if (!string.length || !is_alpha(string.data[i])) { + return false; } - SerdNodeFlags flags = 0; - serd_strlen(str.data, &flags); + // First component must be all letters + while (++i < string.length && string.data[i] && string.data[i] != '-') { + if (!is_alpha(string.data[i])) { + return false; + } + } - return serd_new_plain_literal_i(str, flags, lang); + // Following components can have letters and digits + while (i < string.length && string.data[i] == '-') { + while (++i < string.length && string.data[i] && string.data[i] != '-') { + const char c = string.data[i]; + if (!is_alpha(c) && !is_digit(c)) { + return false; + } + } + } + + return true; } SerdNode* -serd_new_typed_literal(const SerdStringView str, - const SerdStringView datatype_uri) +serd_new_literal(const SerdStringView string, + const SerdNodeFlags flags, + const SerdStringView meta) { - if (!datatype_uri.length) { - return serd_new_string(str); + if (!(flags & (SERD_HAS_DATATYPE | SERD_HAS_LANGUAGE))) { + SerdNode* node = serd_node_malloc(string.length, flags, SERD_LITERAL); + + memcpy(serd_node_buffer(node), string.data, string.length); + node->length = string.length; + serd_node_check_padding(node); + return node; } - if (!strcmp(datatype_uri.data, NS_RDF "langString")) { + if ((flags & SERD_HAS_DATATYPE) && (flags & SERD_HAS_LANGUAGE)) { return NULL; } - SerdNodeFlags flags = 0U; - serd_strlen(str.data, &flags); + if (!meta.length) { + return NULL; + } - flags |= SERD_HAS_DATATYPE; + if (((flags & SERD_HAS_DATATYPE) && + (!serd_uri_string_has_scheme(meta.data) || + !strcmp(meta.data, NS_RDF "langString"))) || + ((flags & SERD_HAS_LANGUAGE) && !is_langtag(meta))) { + return NULL; + } - const size_t len = serd_node_pad_length(str.length); - const size_t total_len = len + sizeof(SerdNode) + datatype_uri.length; + const size_t len = serd_node_pad_length(string.length); + const size_t meta_len = serd_node_pad_length(meta.length); + const size_t meta_size = sizeof(SerdNode) + meta_len; - SerdNode* node = serd_node_malloc(total_len, flags, SERD_LITERAL); - memcpy(serd_node_buffer(node), str.data, str.length); - node->length = str.length; + SerdNode* node = serd_node_malloc(len + meta_size, flags, SERD_LITERAL); + memcpy(serd_node_buffer(node), string.data, string.length); + node->length = string.length; - SerdNode* datatype_node = node + 1 + (len / sizeof(SerdNode)); - datatype_node->length = datatype_uri.length; - datatype_node->type = SERD_URI; - memcpy( - serd_node_buffer(datatype_node), datatype_uri.data, datatype_uri.length); - serd_node_check_padding(datatype_node); + SerdNode* meta_node = node + 1U + (len / sizeof(SerdNode)); + meta_node->length = meta.length; + meta_node->type = (flags & SERD_HAS_DATATYPE) ? SERD_URI : SERD_LITERAL; + memcpy(serd_node_buffer(meta_node), meta.data, meta.length); + serd_node_check_padding(meta_node); serd_node_check_padding(node); return node; @@ -548,13 +551,6 @@ typedef size_t (*SerdWriteLiteralFunc)(const void* user_data, size_t buf_size, char* buf); -SerdNode* -serd_new_boolean(bool b) -{ - return serd_new_typed_literal(b ? serd_string("true") : serd_string("false"), - serd_node_string_view(&serd_xsd_boolean.node)); -} - static SerdNode* serd_new_custom_literal(const void* const user_data, const size_t len, @@ -589,8 +585,9 @@ serd_new_double(const double d) const ExessResult r = exess_write_double(d, sizeof(buf), buf); return r.status ? NULL - : serd_new_typed_literal(serd_substring(buf, r.count), - serd_string(EXESS_XSD_URI "double")); + : serd_new_literal(serd_substring(buf, r.count), + SERD_HAS_DATATYPE, + serd_string(EXESS_XSD_URI "double")); } SerdNode* @@ -601,8 +598,17 @@ serd_new_float(const float f) const ExessResult r = exess_write_float(f, sizeof(buf), buf); return r.status ? NULL - : serd_new_typed_literal(serd_substring(buf, r.count), - serd_string(EXESS_XSD_URI "float")); + : serd_new_literal(serd_substring(buf, r.count), + SERD_HAS_DATATYPE, + serd_string(EXESS_XSD_URI "float")); +} + +SerdNode* +serd_new_boolean(bool b) +{ + return serd_new_literal(b ? serd_string("true") : serd_string("false"), + SERD_HAS_DATATYPE, + serd_node_string_view(&serd_xsd_boolean.node)); } SerdNode* diff --git a/src/read_ntriples.c b/src/read_ntriples.c index bec59c13..6822b64f 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -190,24 +190,8 @@ read_IRI(SerdReader* const reader, SerdNode** const dest) SerdStatus read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c) { - if (!(c & 0x80)) { - switch (c) { - case 0xA: - case 0xD: - dest->flags |= SERD_HAS_NEWLINE; - break; - case '"': - case '\'': - dest->flags |= SERD_HAS_QUOTE; - break; - default: - break; - } - - return push_byte(reader, dest, c); - } - - return read_utf8_continuation(reader, dest, c); + return !(c & 0x80) ? push_byte(reader, dest, c) + : read_utf8_continuation(reader, dest, c); } SerdStatus @@ -423,10 +407,8 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest) case 'b': return (st = skip_byte(reader, 'b')) ? st : push_byte(reader, dest, '\b'); case 'n': - dest->flags |= SERD_HAS_NEWLINE; return (st = skip_byte(reader, 'n')) ? st : push_byte(reader, dest, '\n'); case 'r': - dest->flags |= SERD_HAS_NEWLINE; return (st = skip_byte(reader, 'r')) ? st : push_byte(reader, dest, '\r'); case 'f': return (st = skip_byte(reader, 'f')) ? st : push_byte(reader, dest, '\f'); diff --git a/src/read_turtle.c b/src/read_turtle.c index c3970a1e..22269741 100644 --- a/src/read_turtle.c +++ b/src/read_turtle.c @@ -115,7 +115,6 @@ read_STRING_LITERAL_LONG(SerdReader* const reader, push_byte(reader, ref, c); st = read_string_escape(reader, ref); } else { - ref->flags |= SERD_HAS_QUOTE; if (!(st = push_byte(reader, ref, c))) { st = read_character(reader, ref, (uint8_t)q2); } @@ -151,7 +150,10 @@ read_String(SerdReader* const reader, SerdNode* const node) return SERD_SUCCESS; } + // Long string skip_byte(reader, q3); + node->flags |= SERD_IS_LONG; + return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1); } diff --git a/src/string.c b/src/string.c index 8cc839bd..ed3149d0 100644 --- a/src/string.c +++ b/src/string.c @@ -1,16 +1,10 @@ // Copyright 2011-2020 David Robillard <d@drobilla.net> // SPDX-License-Identifier: ISC -#include "string_utils.h" - #include "serd/memory.h" -#include "serd/node.h" #include "serd/status.h" -#include "serd/string.h" -#include <assert.h> #include <stdlib.h> -#include <string.h> void serd_free(void* const ptr) @@ -68,50 +62,3 @@ serd_strerror(const SerdStatus status) return "Unknown error"; } - -static void -serd_update_flags(const char c, SerdNodeFlags* const flags) -{ - switch (c) { - case '\r': - case '\n': - *flags |= SERD_HAS_NEWLINE; - break; - case '"': - *flags |= SERD_HAS_QUOTE; - break; - default: - break; - } -} - -size_t -serd_substrlen(const char* const str, - const size_t len, - SerdNodeFlags* const flags) -{ - assert(flags); - - size_t i = 0; - *flags = 0; - for (; i < len && str[i]; ++i) { - serd_update_flags(str[i], flags); - } - - return i; -} - -size_t -serd_strlen(const char* const str, SerdNodeFlags* const flags) -{ - if (flags) { - size_t i = 0; - *flags = 0; - for (; str[i]; ++i) { - serd_update_flags(str[i], flags); - } - return i; - } - - return strlen(str); -} diff --git a/src/string_utils.h b/src/string_utils.h index 9de03fa0..2517b270 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -4,8 +4,6 @@ #ifndef SERD_SRC_STRING_UTILS_H #define SERD_SRC_STRING_UTILS_H -#include "serd/node.h" - #include <stdbool.h> #include <stddef.h> #include <stdint.h> @@ -97,9 +95,6 @@ is_windows_path(const char* path) (path[2] == '/' || path[2] == '\\'); } -size_t -serd_substrlen(const char* str, size_t len, SerdNodeFlags* flags); - static inline uint8_t hex_digit_value(const uint8_t c) { diff --git a/src/uri_utils.h b/src/uri_utils.h index 004129d2..76060d6a 100644 --- a/src/uri_utils.h +++ b/src/uri_utils.h @@ -4,10 +4,12 @@ #ifndef SERD_SRC_URI_UTILS_H #define SERD_SRC_URI_UTILS_H -#include "serd/attributes.h" - #include "string_utils.h" +#include "serd/attributes.h" +#include "serd/string_view.h" +#include "serd/uri.h" + #include <stdbool.h> #include <string.h> diff --git a/src/writer.c b/src/writer.c index 94c75625..329a29ad 100644 --- a/src/writer.c +++ b/src/writer.c @@ -778,8 +778,7 @@ write_literal(SerdWriter* const writer, } } - if (supports_abbrev(writer) && - (node->flags & (SERD_HAS_NEWLINE | SERD_HAS_QUOTE))) { + if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) { TRY(st, esink("\"\"\"", 3, writer)); TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length)); TRY(st, esink("\"\"\"", 3, writer)); |