From 0a62fc5f6aafd3e3f67d861634014d7e894c7bfd Mon Sep 17 00:00:00 2001 From: David Robillard Date: Thu, 20 Jan 2011 07:31:58 +0000 Subject: Rework character reading functions to support reading multi-byte characters (take a string dest parameter instead of returning uchar). Escape ntriples output. Pass all good read tests with output verification. git-svn-id: http://svn.drobilla.net/serd/trunk@8 490d8e77-9747-427b-9fa3-0b8f29cee8a0 --- doc/reference.doxygen.in | 8 +- serd/serd.h | 105 +++++++++-------- src/namespaces.c | 46 +------- src/reader.c | 287 +++++++++++++++++++++++------------------------ src/serdi.c | 84 ++------------ src/string.c | 65 +++++++++++ src/uri.c | 49 +++----- src/write.c | 170 ++++++++++++++++++++++++++++ wscript | 8 +- 9 files changed, 472 insertions(+), 350 deletions(-) create mode 100644 src/string.c create mode 100644 src/write.c diff --git a/doc/reference.doxygen.in b/doc/reference.doxygen.in index 66fda410..db972e34 100644 --- a/doc/reference.doxygen.in +++ b/doc/reference.doxygen.in @@ -270,7 +270,7 @@ SUBGROUPING = YES # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. -TYPEDEF_HIDES_STRUCT = NO +TYPEDEF_HIDES_STRUCT = YES # The SYMBOL_CACHE_SIZE determines the size of the internal cache use to # determine which symbols to keep in memory and which to flush to disk. @@ -297,7 +297,7 @@ SYMBOL_CACHE_SIZE = 0 # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. @@ -480,14 +480,14 @@ SHOW_DIRECTORIES = NO # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. -SHOW_FILES = YES +SHOW_FILES = NO # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. -SHOW_NAMESPACES = YES +SHOW_NAMESPACES = NO # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from diff --git a/serd/serd.h b/serd/serd.h index d3d2b03d..dd4b25b7 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -49,6 +49,10 @@ * @{ */ +typedef struct SerdNamespacesImpl* SerdNamespaces; +typedef struct SerdReaderImpl* SerdReader; + + /** RDF syntax */ typedef enum { SERD_TURTLE = 1, @@ -57,56 +61,31 @@ typedef enum { /** Type of RDF node. */ typedef enum { - BLANK = 1, - URI = 2, - QNAME = 3, - LITERAL = 4 + BLANK = 1, ///< Blank node (resource with no URI) + URI = 2, ///< URI (universal identifier) + QNAME = 3, ///< CURIE/QName (URI shortened with a namespace) + LITERAL = 4 ///< Literal string (with optional lang or datatype) } SerdNodeType; - -/** @name String - * @{ - */ - -/** Measured UTF-8 string. */ -typedef struct { - size_t n_bytes; - size_t n_chars; - uint8_t buf[]; -} SerdString; - -/** Create a new UTF-8 string from @a utf8. */ -SERD_API -SerdString* -serd_string_new(const uint8_t* utf8); - -/** Copy @a string. */ -SERD_API -SerdString* -serd_string_copy(const SerdString* string); - -/** @} */ - - /** @name URIs * @{ */ -/** Range of memory. */ +/* Range of memory. */ typedef struct { const uint8_t* buf; size_t len; } SerdRange; -/** Parsed URI. */ +/* Parsed URI. */ typedef struct { - SerdRange scheme; - SerdRange authority; - SerdRange path_base; - SerdRange path; - SerdRange query; - SerdRange fragment; - bool base_uri_has_authority; + SerdRange scheme; ///< Scheme + SerdRange authority; ///< Authority + SerdRange path_base; ///< Path prefix if relative + SerdRange path; ///< Path suffix + SerdRange query; ///< Query + SerdRange fragment; ///< Fragment + bool base_uri_has_authority; ///< True iff base URI has authority } SerdURI; /** Return true iff @a utf8 is a relative URI string. */ @@ -129,11 +108,52 @@ SERD_API bool serd_uri_write(const SerdURI* uri, FILE* file); +/** Sink function for raw string output. */ +typedef size_t (*SerdSink)(const uint8_t* buf, size_t len, void* stream); + +/** Serialise @a uri with a series of calls to @a sink. */ +SERD_API +size_t +serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream); + +/** @} */ + +/** @name String + * @{ + */ + +/** Measured UTF-8 string. */ +typedef struct { + size_t n_bytes; ///< Size in bytes including trailing null byte + size_t n_chars; ///< Length in characters + uint8_t buf[]; ///< Buffer +} SerdString; + +/** Create a new UTF-8 string from @a utf8. */ +SERD_API +SerdString* +serd_string_new(const uint8_t* utf8); + +/** Copy @a string. */ +SERD_API +SerdString* +serd_string_copy(const SerdString* string); + /** Serialise @a uri to a string. */ SERD_API SerdString* -serd_uri_serialise(const SerdURI* uri, - SerdURI* out); +serd_string_new_from_uri(const SerdURI* uri, + SerdURI* out); + +SERD_API +bool +serd_write_node(FILE* file, + const SerdURI* base_uri, + SerdNamespaces ns, + SerdNodeType type, + const SerdString* str, + const SerdString* datatype, + const SerdString* lang); /** @} */ @@ -142,9 +162,6 @@ serd_uri_serialise(const SerdURI* uri, * @{ */ -/** Reader. */ -typedef struct SerdReaderImpl* SerdReader; - /** Handler for base URI changes. */ typedef bool (*SerdBaseHandler)(void* handle, const SerdString* uri); @@ -194,8 +211,6 @@ serd_reader_free(SerdReader reader); * @{ */ -typedef struct SerdNamespacesImpl* SerdNamespaces; - /** Create a new namespaces dictionary. */ SERD_API SerdNamespaces diff --git a/src/namespaces.c b/src/namespaces.c index fab53ea3..f18ebcff 100644 --- a/src/namespaces.c +++ b/src/namespaces.c @@ -32,48 +32,6 @@ struct SerdNamespacesImpl { size_t n_namespaces; }; -static inline size_t -utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes) -{ - size_t n_chars = 0; - size_t i = 0; - for (; utf8[i]; ++i) { - if ((utf8[i] & 0xC0) != 0x80) { - // Does not start with `10', start of a new character - ++n_chars; - } - } - if (out_n_bytes) { - *out_n_bytes = i + 1; - } - return n_chars; -} - -SERD_API -SerdString* -serd_string_new(const uint8_t* utf8) -{ - size_t n_bytes; - size_t n_chars = utf8_strlen(utf8, &n_bytes); - SerdString* const str = malloc(sizeof(SerdString) + n_bytes); - str->n_bytes = n_bytes; - str->n_chars = n_chars; - memcpy(str->buf, utf8, str->n_bytes); - return str; -} - -SERD_API -SerdString* -serd_string_copy(const SerdString* s) -{ - if (s) { - SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes); - memcpy(copy, s, sizeof(SerdString) + s->n_bytes); - return copy; - } - return NULL; -} - SERD_API SerdNamespaces serd_namespaces_new() @@ -148,9 +106,9 @@ serd_namespaces_expand(SerdNamespaces ns, SerdNamespace* const record = serd_namespaces_find(ns, qname->buf, colon - qname->buf); if (record) { uri_prefix->buf = record->uri->buf; - uri_prefix->len = record->uri->n_bytes; + uri_prefix->len = record->uri->n_bytes - 1; uri_suffix->buf = colon + 1; - uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 1; + uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 2; return true; } return false; diff --git a/src/reader.c b/src/reader.c index bf0eea11..52999180 100644 --- a/src/reader.c +++ b/src/reader.c @@ -140,22 +140,6 @@ readahead(SerdReader parser, uint8_t* pre, int n) return true; } -static inline unsigned -utf8_char_len(const uint8_t b0) -{ - if ((b0 & 0x80) == 0) { // Starts with `0' - return 1; - } else if ((b0 & 0xE0) == 0xC0) { // Starts with `110' - return 2; - } else if ((b0 & 0xF0) == 0xE0) { // Starts with `1110' - return 3; - } else if ((b0 & 0xF8) == 0xF0) { // Starts with `11110' - return 4; - } else { - return 0; - } -} - static inline uchar peek_utf8_char(SerdReader parser, unsigned* n_bytes) { @@ -334,52 +318,82 @@ read_hex(SerdReader parser) } } -static inline uchar -read_hex_escape(SerdReader parser, unsigned length) +static inline bool +read_hex_escape(SerdReader parser, unsigned length, Ref dest) { - uchar ret = 0; - uint8_t chars[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - uint8_t code[4] = { 0, 0, 0, 0 }; + uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (unsigned i = 0; i < length; ++i) { - chars[i] = read_hex(parser); - } - - sscanf((const char*)chars, "%X", (uint32_t*)code); - const uint32_t code_num = *(uint32_t*)code; - if (code_num < 0x80) { - fprintf(stderr, "1 byte UTF-8 escape\n"); - return code[0]; - } else if (code_num < 0x800) { - fprintf(stderr, "2 byte UTF-8 escape\n"); - fprintf(stderr, "B0 %X\n", code[0]); - fprintf(stderr, "B1 %X\n", code[1]); - fprintf(stderr, "B2 %X\n", code[2]); - fprintf(stderr, "B3 %X\n", code[3]); - ret = ((0xC0 + ((code[3] & 0x1F) << 2) + ((code[4] & 0xC0) >> 6)) << 8) - + (code[4] & 0x3F); - fprintf(stderr, "RET %X\n", ret); - } else if (code_num < 0x10000) { - fprintf(stderr, "3 byte UTF-8 escape\n"); + buf[i] = read_hex(parser); + } + + uint32_t c; + sscanf((const char*)buf, "%X", &c); + + unsigned size = 0; + if (c < 0x00000080) { + size = 1; + } else if (c < 0x00000800) { + size = 2; + } else if (c < 0x00010000) { + size = 3; + } else if (c < 0x00200000) { + size = 4; + } else if (c < 0x04000000) { + size = 5; + } else if (c < 0x80000000) { + size = 6; } else { - fprintf(stderr, "4 byte UTF-8 escape\n"); + return false; } - return ret; + + // Build output in buf + // (Note # of bytes = # of leading 1 bits in first byte) + switch (size) { + case 6: + buf[5] = 0x80 | (uint8_t)(c & 0x3F); + c >>= 6; + c |= (4 << 24); // set bit 2 + case 5: + buf[4] = 0x80 | (uint8_t)(c & 0x3F); + c >>= 6; + c |= (8 << 18); // set bit 3 + case 4: + buf[3] = 0x80 | (uint8_t)(c & 0x3F); + c >>= 6; + c |= (16 << 12); // set bit 4 + case 3: + buf[2] = 0x80 | (uint8_t)(c & 0x3F); + c >>= 6; + c |= (32 << 6); // set bit 5 + case 2: + buf[1] = 0x80 | (uint8_t)(c & 0x3F); + c >>= 6; + c |= 0xC0; // set bits 6 and 7 + case 1: + buf[0] = (uint8_t)c; + } + + for (unsigned i = 0; i < size; ++i) { + push_char(parser, dest, buf[i]); + } + return true; } -static inline uchar -character_escape(SerdReader parser, const uchar esc) +static inline bool +read_character_escape(SerdReader parser, Ref dest) { - switch (esc) { + switch (peek_char(parser)) { case '\\': - return eat_char(parser, '\\'); + push_char(parser, dest, eat_char(parser, '\\')); + return true; case 'u': - eat_char(parser, esc); - return read_hex_escape(parser, 4); + eat_char(parser, 'u'); + return read_hex_escape(parser, 4, dest); case 'U': - eat_char(parser, esc); - return read_hex_escape(parser, 8); + eat_char(parser, 'U'); + return read_hex_escape(parser, 8, dest); default: - return 0; + return false; } } @@ -387,117 +401,124 @@ character_escape(SerdReader parser, const uchar esc) // | '\U' hex hex hex hex hex hex hex hex // | '\\' // | [#x20-#x5B] | [#x5D-#x10FFFF] -static inline uchar -read_character(SerdReader parser) +static inline bool +read_character(SerdReader parser, Ref dest) { const uchar c = peek_char(parser); - uchar esc; switch (c) { case '\\': eat_char(parser, '\\'); - esc = character_escape(parser, peek_char(parser)); - if (esc) { - return esc; + if (read_character_escape(parser, dest)) { + return true; } else { - return error(parser, "illegal escape `\\%c'\n", esc); + return error(parser, "invalid escape `\\%c'\n", peek_char(parser)); } default: if (in_range(c, 0x20, 0x5B) || in_range(c, 0x5D, 0x10FFF)) { - return eat_char(parser, c); + push_char(parser, dest, eat_char(parser, c)); + return true; } else { - return error(parser, "illegal character `%c'\n", c); + return error(parser, "invalid character `%c'\n", c); } } } -static inline uchar -echaracter_escape(SerdReader parser, const uchar esc) +static inline bool +read_echaracter_escape(SerdReader parser, Ref dest) { - const uchar ret = character_escape(parser, esc); - if (ret) { - return ret; + if (read_character_escape(parser, dest)) { + return true; } - switch (esc) { + switch (peek_char(parser)) { case 't': eat_char(parser, 't'); - return '\t'; + push_char(parser, dest, '\t'); + return true; case 'n': eat_char(parser, 'n'); - return '\n'; + push_char(parser, dest, '\n'); + return true; case 'r': eat_char(parser, 'r'); - return '\r'; + push_char(parser, dest, '\r'); + return true; default: - return 0; + return false; } } // [39] echaracter ::= character | '\t' | '\n' | '\r' -static inline uchar -read_echaracter(SerdReader parser) +static inline bool +read_echaracter(SerdReader parser, Ref dest) { uchar c = peek_char(parser); - uchar esc; switch (c) { case '\\': eat_char(parser, '\\'); - esc = echaracter_escape(parser, peek_char(parser)); - if (esc) { - return esc; + if (read_echaracter_escape(parser, peek_char(parser))) { + return true; } else { - return error(parser, "illegal escape `\\%c'\n", esc); + return error(parser, "illegal escape `\\%c'\n", peek_char(parser)); } default: - return read_character(parser); + return read_character(parser, dest); } } -static inline uchar -scharacter_escape(SerdReader parser, const uchar esc) +static inline bool +read_scharacter_escape(SerdReader parser, Ref dest) { - const uchar ret = echaracter_escape(parser, esc); - if (ret) { - return ret; - } else if (esc == '"') { - return eat_char(parser, '"'); + if (read_echaracter_escape(parser, dest)) { + return true; + } else if (peek_char(parser) == '"') { + push_char(parser, dest, eat_char(parser, '"')); + return true; } - return 0; + return false; } -static inline uchar -ucharacter_escape(SerdReader parser, const uchar esc) +static inline bool +read_ucharacter_escape(SerdReader parser, Ref dest) { - const uchar ret = echaracter_escape(parser, esc); - if (ret) { - return ret; - } else if (esc == '>') { - return eat_char(parser, '>'); + if (read_echaracter_escape(parser, dest)) { + return true; + } else if (peek_char(parser) == '>') { + push_char(parser, dest, eat_char(parser, '>')); + return true; } - return 0; + return false; } // [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD -static inline uchar -read_lcharacter(SerdReader parser, bool* is_escape) +static inline bool +read_lcharacter(SerdReader parser, Ref dest) { - *is_escape = false; - const uchar c = peek_char(parser); - uchar esc; + uchar c = peek_char(parser); + uint8_t pre[3]; switch (c) { + case '"': + readahead(parser, pre, 3); + if (pre[1] == '\"' && pre[2] == '\"') { + eat_char(parser, '\"'); + eat_char(parser, '\"'); + eat_char(parser, '\"'); + return false; + } else { + push_char(parser, dest, eat_char(parser, '"')); + return true; + } case '\\': eat_char(parser, '\\'); - esc = scharacter_escape(parser, peek_char(parser)); - if (esc) { - *is_escape = true; - return esc; + if (read_scharacter_escape(parser, dest)) { + return true; } else { - return error(parser, "illegal escape `\\%c'\n", esc); + return error(parser, "illegal escape `\\%c'\n", peek_char(parser)); } case 0x9: case 0xA: case 0xD: - eat_char(parser, c); + push_char(parser, dest, eat_char(parser, c)); return c; default: - return read_echaracter(parser); + return read_echaracter(parser, dest); } } @@ -506,48 +527,39 @@ static inline bool read_scharacter(SerdReader parser, Ref dest) { uchar c = peek_char(parser); - uchar esc; switch (c) { case '\\': eat_char(parser, '\\'); - esc = scharacter_escape(parser, peek_char(parser)); - if (esc) { - push_char(parser, dest, esc); + if (read_scharacter_escape(parser, dest)) { return true; } else { - return error(parser, "illegal escape `\\%c'\n", esc); + return error(parser, "illegal escape `\\%c'\n", peek_char(parser)); } case '\"': return false; default: - c = read_character(parser); - if (c) { - push_char(parser, dest, c); - } - return c; + return read_character(parser, dest); } } // Spec: [41] ucharacter ::= ( character - #x3E ) | '\>' // Actual: [41] ucharacter ::= ( echaracter - #x3E ) | '\>' -static inline uchar -read_ucharacter(SerdReader parser) +static inline bool +read_ucharacter(SerdReader parser, Ref dest) { - const uchar c = peek_char(parser); - uchar esc; + uchar c = peek_char(parser); switch (c) { case '\\': eat_char(parser, '\\'); - esc = ucharacter_escape(parser, peek_char(parser)); - if (esc) { - return esc; + if (read_ucharacter_escape(parser, dest)) { + return true; } else { - return error(parser, "illegal escape `\\%c'\n", esc); + return error(parser, "illegal escape `\\%c'\n", peek_char(parser)); } case '>': - return 0; + return false; default: - return read_character(parser); + return read_character(parser, dest); } } @@ -607,21 +619,7 @@ read_longString(SerdReader parser) { eat_string(parser, "\"\"\"", 3); Ref str = push_string(parser, "", 1); - uchar c; - bool is_escape = false; - while ((c = read_lcharacter(parser, &is_escape)) != 0) { - if (c == '\"' && !is_escape) { - uint8_t pre[2]; - readahead(parser, pre, 2); - if (pre[0] == '\"' && pre[1] == '\"') { - eat_char(parser, '\"'); - eat_char(parser, '\"'); - return str; - } - } - push_char(parser, str, c); - } - eat_string(parser, "\"\"\"", 3); + while (read_lcharacter(parser, str)) {} return str; } @@ -658,11 +656,8 @@ read_quotedString(SerdReader parser) static inline Ref read_relativeURI(SerdReader parser) { - uchar c; - Ref str = push_string(parser, "", 1); - while ((c = read_ucharacter(parser)) != 0) { - push_char(parser, str, c); - } + Ref str = push_string(parser, "", 1); + while (read_ucharacter(parser, str)) {} return str; } diff --git a/src/serdi.c b/src/serdi.c index f5be93c3..3f91e040 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -49,7 +49,7 @@ event_base(void* handle, assert(false); return false; } - base_uri_str = serd_uri_serialise(&abs_base_uri, &base_uri); + base_uri_str = serd_string_new_from_uri(&abs_base_uri, &base_uri); // FIXME: double parse serd_uri_parse(base_uri_str->buf, &base_uri); } else { @@ -83,7 +83,7 @@ event_prefix(void* handle, return false; } SerdURI new_abs_uri; - SerdString* abs_uri_string = serd_uri_serialise(&abs_uri, &new_abs_uri); + SerdString* abs_uri_string = serd_string_new_from_uri(&abs_uri, &new_abs_uri); serd_namespaces_add(state->ns, name, abs_uri_string); } else { serd_namespaces_add(state->ns, name, uri_string); @@ -91,77 +91,6 @@ event_prefix(void* handle, return true; } -static inline bool -write_node(State* state, - const SerdString* str, - SerdNodeType type, - const SerdString* datatype, - const SerdString* lang) -{ - SerdRange uri_prefix; - SerdRange uri_suffix; - switch (type) { - case BLANK: - fwrite("_:", 1, 2, state->out_fd); - fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd); - break; - case QNAME: - if (!serd_namespaces_expand(state->ns, str, &uri_prefix, &uri_suffix)) { - fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf); - return false; - } - fwrite("<", 1, 1, state->out_fd); - fwrite(uri_prefix.buf, 1, uri_prefix.len - 1, state->out_fd); - fwrite(uri_suffix.buf, 1, uri_suffix.len - 1, state->out_fd); - fwrite(">", 1, 1, state->out_fd); - break; - case URI: - if (serd_uri_string_is_relative(str->buf)) { - SerdURI uri; - if (serd_uri_parse(str->buf, &uri)) { - SerdURI abs_uri; - if (serd_uri_resolve(&uri, &state->base_uri, &abs_uri)) { - fwrite("<", 1, 1, state->out_fd); - serd_uri_write(&abs_uri, state->out_fd); - fwrite(">", 1, 1, state->out_fd); - return true; - } - } - } else { - fwrite("<", 1, 1, state->out_fd); - fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd); - fwrite(">", 1, 1, state->out_fd); - return true; - } - return false; - case LITERAL: - fwrite("\"", 1, 1, state->out_fd); - for (size_t i = 0; i < str->n_bytes - 1; ++i) { - const char c = str->buf[i]; - switch (c) { - case '\\': fwrite("\\\\", 1, 2, state->out_fd); break; - case '\n': fwrite("\\n", 1, 2, state->out_fd); break; - case '\r': fwrite("\\r", 1, 2, state->out_fd); break; - case '\t': fwrite("\\t", 1, 2, state->out_fd); break; - case '"': fwrite("\\\"", 1, 2, state->out_fd); break; - default: - fwrite(&c, 1, 1, state->out_fd); - } - } - fwrite("\"", 1, 1, state->out_fd); - if (lang) { - fwrite("@\"", 1, 2, state->out_fd); - fwrite(lang->buf, 1, lang->n_bytes - 1, state->out_fd); - fwrite("\"", 1, 1, state->out_fd); - } else if (datatype) { - fwrite("^^", 1, 2, state->out_fd); - write_node(state, datatype, URI, NULL, NULL); - } - break; - } - return true; -} - static bool event_statement(void* handle, const SerdString* graph, @@ -176,11 +105,14 @@ event_statement(void* handle, { State* const state = (State*)handle; FILE* const fd = state->out_fd; - write_node(state, subject, subject_type, NULL, NULL); + serd_write_node(fd, &state->base_uri, state->ns, + subject_type, subject, NULL, NULL); fwrite(" ", 1, 1, fd); - write_node(state, predicate, predicate_type, NULL, NULL); + serd_write_node(fd, &state->base_uri, state->ns, + predicate_type, predicate, NULL, NULL); fwrite(" ", 1, 1, fd); - write_node(state, object, object_type, object_datatype, object_lang); + serd_write_node(fd, &state->base_uri, state->ns, + object_type, object, object_datatype, object_lang); fwrite(" .\n", 1, 3, fd); return true; } diff --git a/src/string.c b/src/string.c new file mode 100644 index 00000000..301a98cc --- /dev/null +++ b/src/string.c @@ -0,0 +1,65 @@ +/* Serd, an RDF serialisation library. + * Copyright 2011 David Robillard + * + * Serd is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Serd is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include "serd/serd.h" + +static inline size_t +utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes) +{ + size_t n_chars = 0; + size_t i = 0; + for (; utf8[i]; ++i) { + if ((utf8[i] & 0xC0) != 0x80) { + // Does not start with `10', start of a new character + ++n_chars; + } + } + if (out_n_bytes) { + *out_n_bytes = i + 1; + } + return n_chars; +} + +SERD_API +SerdString* +serd_string_new(const uint8_t* utf8) +{ + size_t n_bytes; + size_t n_chars = utf8_strlen(utf8, &n_bytes); + SerdString* const str = malloc(sizeof(SerdString) + n_bytes); + str->n_bytes = n_bytes; + str->n_chars = n_chars; + memcpy(str->buf, utf8, str->n_bytes); + return str; +} + +SERD_API +SerdString* +serd_string_copy(const SerdString* s) +{ + if (s) { + SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes); + memcpy(copy, s, sizeof(SerdString) + s->n_bytes); + return copy; + } + return NULL; +} diff --git a/src/uri.c b/src/uri.c index d98f07ff..1ff7a6d9 100644 --- a/src/uri.c +++ b/src/uri.c @@ -260,10 +260,9 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) return true; } -typedef size_t (*Sink)(const void* data, size_t size, size_t nmemb, void* stream); - -static size_t -serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream) +SERD_API +size_t +serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream) { /* See http://tools.ietf.org/html/rfc3986#section-5.3 */ @@ -271,16 +270,16 @@ serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream) #define WRITE(buf, len) \ write_size += len; \ if (len) { \ - sink(buf, 1, len, stream); \ + sink((const uint8_t*)buf, len, stream); \ } #define WRITE_CHAR(c) WRITE(&(c), 1) #define WRITE_COMPONENT(prefix, field, suffix) \ if ((field).len) { \ - for (const char* c = prefix; *c != '\0'; ++c) { \ + for (const uint8_t* c = (const uint8_t*)prefix; *c != '\0'; ++c) { \ WRITE(c, 1); \ } \ WRITE((field).buf, (field).len); \ - for (const char* c = suffix; *c != '\0'; ++c) { \ + for (const uint8_t* c = (const uint8_t*)suffix; *c != '\0'; ++c) { \ WRITE(c, 1); \ } \ } @@ -354,26 +353,9 @@ serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream) // Note uri->fragment.buf includes the leading `#' WRITE_COMPONENT("", uri->fragment, ""); } - WRITE("\0", 1); return write_size; } -SERD_API -bool -serd_uri_write(const SerdURI* uri, FILE* file) -{ - //#if 0 - SerdURI flat_uri; - SerdString* const flat_uri_str = serd_uri_serialise(uri, &flat_uri); - if (flat_uri_str) { - fwrite(flat_uri_str->buf, 1, flat_uri_str->n_bytes - 1, file); - free(flat_uri_str); - return true; - } - return false; - //#endif - //return (serd_uri_serialise_internal(uri, (Sink)fwrite, file) > 0); -} static size_t serd_uri_string_length(const SerdURI* uri) @@ -393,18 +375,17 @@ serd_uri_string_length(const SerdURI* uri) } static size_t -string_write(const void* data, size_t size, size_t nmemb, void* stream) +string_sink(const uint8_t* buf, size_t len, void* stream) { - uint8_t** ptr = (uint8_t**)stream; - const size_t write_size = (size * nmemb); - memcpy(*ptr, data, write_size); - *ptr += write_size; - return nmemb; + uint8_t** ptr = (uint8_t**)stream; + memcpy(*ptr, buf, len); + *ptr += len; + return len; } SERD_API SerdString* -serd_uri_serialise(const SerdURI* uri, SerdURI* out) +serd_string_new_from_uri(const SerdURI* uri, SerdURI* out) { const size_t len = serd_uri_string_length(uri); SerdString* str = malloc(sizeof(SerdString) + len + 1); @@ -412,10 +393,10 @@ serd_uri_serialise(const SerdURI* uri, SerdURI* out) str->n_chars = len; // FIXME: UTF-8 uint8_t* ptr = str->buf; - const size_t actual_len = serd_uri_serialise_internal(uri, string_write, &ptr); + const size_t actual_len = serd_uri_serialise(uri, string_sink, &ptr); - str->buf[actual_len] = '\0'; - str->n_bytes = actual_len; + str->buf[actual_len + 1] = '\0'; + str->n_bytes = actual_len + 1; str->n_chars = str->n_bytes - 1; #ifdef URI_DEBUG diff --git a/src/write.c b/src/write.c new file mode 100644 index 00000000..c2b92d78 --- /dev/null +++ b/src/write.c @@ -0,0 +1,170 @@ +/* Serd, an RDF serialisation library. + * Copyright 2011 David Robillard + * + * Serd is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Serd is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "serd/serd.h" + +static size_t +file_sink(const uint8_t* buf, size_t len, void* stream) +{ + FILE* file = (FILE*)stream; + return fwrite(buf, 1, len, file); +} + +static inline bool +serd_write_uri(FILE* file, const SerdURI* uri) +{ + return serd_uri_serialise(uri, file_sink, file); +} + +static bool +serd_write_ascii(const uint8_t* utf8, size_t n_bytes, FILE* out_fd, const uint8_t esc) +{ + for (size_t i = 0; i < n_bytes;) { + uint8_t in = utf8[i++]; + switch (in) { + case '\\': fwrite("\\\\", 1, 2, out_fd); continue; + case '\n': fwrite("\\n", 1, 2, out_fd); continue; + case '\r': fwrite("\\r", 1, 2, out_fd); continue; + case '\t': fwrite("\\t", 1, 2, out_fd); continue; + case '"': if (esc == '"') { fwrite("\\\"", 1, 2, out_fd); continue; } + default: break; + } + + if (in == esc) { + fprintf(out_fd, "\\u%04X", esc); + continue; + } + + uint32_t c = 0; + size_t size = 0; + if ((in & 0x80) == 0) { // Starts with `0' + size = 1; + c = in & 0x7F; + if ((in >= 0x20) && (in <= 0x7E)) { // Printable ASCII + fwrite(&in, 1, 1, out_fd); + continue; + } + } else if ((in & 0xE0) == 0xC0) { // Starts with `110' + size = 2; + c = in & 0x1F; + } else if ((in & 0xF0) == 0xE0) { // Starts with `1110' + size = 3; + c = in & 0x0F; + } else if ((in & 0xF8) == 0xF0) { // Starts with `11110' + size = 4; + c = in & 0x07; + } else if ((in & 0xFC) == 0xF8) { // Starts with `111110' + size = 5; + c = in & 0x03; + } else if ((in & 0xFE) == 0xFC) { // Starts with `1111110' + size = 6; + c = in & 0x01; + } else { + fprintf(stderr, "invalid UTF-8 at offset %zu: %X\n", i, in); + return false; + } + +#define READ_BYTE() do { \ + assert(i < n_bytes); \ + in = utf8[i++] & 0x3f; \ + c <<= 6; \ + c |= in; \ + } while (0) + + switch (size) { + case 6: READ_BYTE(); + case 5: READ_BYTE(); + case 4: READ_BYTE(); + case 3: READ_BYTE(); + case 2: READ_BYTE(); + } + + if (c < 0xFFFF) { + fprintf(out_fd, "\\u%04X", c); + } else { + fprintf(out_fd, "\\U%08X", c); + } + } + return true; +} + +SERD_API +bool +serd_write_node(FILE* fd, + const SerdURI* base_uri, + SerdNamespaces ns, + SerdNodeType type, + const SerdString* str, + const SerdString* datatype, + const SerdString* lang) +{ + SerdRange uri_prefix; + SerdRange uri_suffix; + switch (type) { + case BLANK: + fwrite("_:", 1, 2, fd); + fwrite(str->buf, 1, str->n_bytes - 1, fd); + break; + case QNAME: + if (!serd_namespaces_expand(ns, str, &uri_prefix, &uri_suffix)) { + fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf); + return false; + } + fwrite("<", 1, 1, fd); + serd_write_ascii(uri_prefix.buf, uri_prefix.len, fd, '>'); + serd_write_ascii(uri_suffix.buf, uri_suffix.len, fd, '>'); + fwrite(">", 1, 1, fd); + break; + case URI: + if (serd_uri_string_is_relative(str->buf)) { + SerdURI uri; + if (serd_uri_parse(str->buf, &uri)) { + SerdURI abs_uri; + if (serd_uri_resolve(&uri, base_uri, &abs_uri)) { + fwrite("<", 1, 1, fd); + serd_write_uri(fd, &abs_uri); + fwrite(">", 1, 1, fd); + return true; + } + } + } else { + fwrite("<", 1, 1, fd); + serd_write_ascii(str->buf, str->n_bytes - 1, fd, '>'); + fwrite(">", 1, 1, fd); + return true; + } + return false; + case LITERAL: + fwrite("\"", 1, 1, fd); + serd_write_ascii(str->buf, str->n_bytes - 1, fd, '"'); + fwrite("\"", 1, 1, fd); + if (lang) { + fwrite("@\"", 1, 2, fd); + fwrite(lang->buf, 1, lang->n_bytes - 1, fd); + fwrite("\"", 1, 1, fd); + } else if (datatype) { + fwrite("^^", 1, 2, fd); + serd_write_node(fd, base_uri, ns, URI, datatype, NULL, NULL); + } + break; + } + return true; +} + diff --git a/wscript b/wscript index 3367977e..b701c98c 100644 --- a/wscript +++ b/wscript @@ -54,7 +54,13 @@ def build(bld): # Pkgconfig file autowaf.build_pc(bld, 'SERD', SERD_VERSION, ['REDLAND']) - lib_source = 'src/reader.c src/namespaces.c src/uri.c' + lib_source = ''' + src/namespaces.c + src/reader.c + src/string.c + src/uri.c + src/write.c + ''' # Library obj = bld(features = 'c cshlib') -- cgit v1.2.1