diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/namespaces.c | 157 | ||||
-rw-r--r-- | src/reader.c | 1249 | ||||
-rw-r--r-- | src/serdi.c | 234 | ||||
-rw-r--r-- | src/uri.c | 428 |
4 files changed, 2068 insertions, 0 deletions
diff --git a/src/namespaces.c b/src/namespaces.c new file mode 100644 index 00000000..fab53ea3 --- /dev/null +++ b/src/namespaces.c @@ -0,0 +1,157 @@ +/* Serd, an RDF serialisation library. + * Copyright 2011 David Robillard <d@drobilla.net> + * + * Serd is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Serd is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include "serd/serd.h" + +typedef struct { + SerdString* name; + SerdString* uri; +} SerdNamespace; + +struct SerdNamespacesImpl { + SerdNamespace* namespaces; + size_t n_namespaces; +}; + +static inline size_t +utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes) +{ + size_t n_chars = 0; + size_t i = 0; + for (; utf8[i]; ++i) { + if ((utf8[i] & 0xC0) != 0x80) { + // Does not start with `10', start of a new character + ++n_chars; + } + } + if (out_n_bytes) { + *out_n_bytes = i + 1; + } + return n_chars; +} + +SERD_API +SerdString* +serd_string_new(const uint8_t* utf8) +{ + size_t n_bytes; + size_t n_chars = utf8_strlen(utf8, &n_bytes); + SerdString* const str = malloc(sizeof(SerdString) + n_bytes); + str->n_bytes = n_bytes; + str->n_chars = n_chars; + memcpy(str->buf, utf8, str->n_bytes); + return str; +} + +SERD_API +SerdString* +serd_string_copy(const SerdString* s) +{ + if (s) { + SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes); + memcpy(copy, s, sizeof(SerdString) + 
s->n_bytes); + return copy; + } + return NULL; +} + +SERD_API +SerdNamespaces +serd_namespaces_new() +{ + SerdNamespaces ns = malloc(sizeof(struct SerdNamespacesImpl)); + ns->namespaces = NULL; + ns->n_namespaces = 0; + return ns; +} + +SERD_API +void +serd_namespaces_free(SerdNamespaces ns) +{ + for (size_t i = 0; i < ns->n_namespaces; ++i) { + free(ns->namespaces[i].name); + free(ns->namespaces[i].uri); + } + free(ns->namespaces); + free(ns); +} + +static inline SerdNamespace* +serd_namespaces_find(SerdNamespaces ns, + const uint8_t* name, + size_t name_len) +{ + for (size_t i = 0; i < ns->n_namespaces; ++i) { + const SerdString* ns_name = ns->namespaces[i].name; + if (ns_name->n_bytes == name_len + 1) { + if (!memcmp(ns_name->buf, name, name_len)) { + return &ns->namespaces[i]; + } + } + } + return NULL; +} + +SERD_API +void +serd_namespaces_add(SerdNamespaces ns, + const SerdString* name, + const SerdString* uri) +{ + assert(name); + assert(uri); + SerdNamespace* const record = serd_namespaces_find(ns, name->buf, name->n_chars); + if (record) { + free(record->uri); + record->uri = serd_string_copy(uri); + } else { + ++ns->n_namespaces; + ns->namespaces = realloc(ns->namespaces, + ns->n_namespaces * sizeof(SerdNamespace)); + ns->namespaces[ns->n_namespaces - 1].name = serd_string_copy(name); + ns->namespaces[ns->n_namespaces - 1].uri = serd_string_copy(uri); + } +} + +SERD_API +bool +serd_namespaces_expand(SerdNamespaces ns, + const SerdString* qname, + SerdRange* uri_prefix, + SerdRange* uri_suffix) +{ + const uint8_t* colon = memchr((const char*)qname->buf, ':', qname->n_bytes); + if (!colon) { + return false; // Illegal qname + } + + SerdNamespace* const record = serd_namespaces_find(ns, qname->buf, colon - qname->buf); + if (record) { + uri_prefix->buf = record->uri->buf; + uri_prefix->len = record->uri->n_bytes; + uri_suffix->buf = colon + 1; + uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 1; + return true; + } + return false; +} diff --git 
a/src/reader.c b/src/reader.c new file mode 100644 index 00000000..66dec851 --- /dev/null +++ b/src/reader.c @@ -0,0 +1,1249 @@ +/* Serd, an RDF serialisation library. + * Copyright 2011 David Robillard <d@drobilla.net> + * + * Serd is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Serd is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "serd/serd.h" + +#define TRY_THROW(exp) if (!(exp)) goto except; +#define TRY_RET(exp) if (!(exp)) return 0; + +#define STACK_CHUNK_SIZE 4096 +#define STACK_INITIAL_TOP 8 +#ifndef NDEBUG +#define STACK_DEBUG 1 +#endif + +typedef struct { + const uint8_t* filename; + unsigned line; + unsigned col; +} Cursor; + +typedef struct { + uint8_t* buf; ///< Stack memory + size_t buf_size; ///< Allocated size of buf (>= size) + size_t size; ///< Conceptual size of stack in buf +} Stack; + +typedef uint32_t uchar; + +typedef size_t Ref; + +static const int32_t READ_BUF_LEN = 4096; +static const int32_t MAX_READAHEAD = 8; + +struct SerdReaderImpl { + void* handle; + SerdBaseHandler base_handler; + SerdPrefixHandler prefix_handler; + SerdStatementHandler statement_handler; + FILE* fd; + Stack stack; + Cursor cur; + uint8_t* buf; + unsigned next_id; + int err; + uint8_t* read_buf; + int32_t read_head; ///< Offset into read_buf + bool eof; +#ifdef 
STACK_DEBUG + Ref* alloc_stack; ///< Stack of push offsets + size_t n_allocs; ///< Number of stack pushes +#endif +}; + +typedef struct { + SerdNodeType type; + Ref value; + Ref datatype; + Ref lang; +} Node; + +static inline uchar +error(SerdReader parser, const char* fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "error: %s:%u:%u: ", + parser->cur.filename, parser->cur.line, parser->cur.col); + vfprintf(stderr, fmt, args); + parser->err = 1; + return 0; +} + +static Node +make_node(SerdNodeType type, Ref value, Ref datatype, Ref lang) +{ + const Node ret = { type, value, datatype, lang }; + return ret; +} + +static inline bool +page(SerdReader parser) +{ + parser->read_head = 0; + const int32_t n_read = fread(parser->read_buf, 1, READ_BUF_LEN, parser->fd); + if (n_read == 0) { + parser->read_buf[0] = '\0'; + parser->eof = true; + return false; + } else if (n_read < READ_BUF_LEN) { + parser->read_buf[n_read] = '\0'; + } + return true; +} + +static inline bool +readahead(SerdReader parser, uint8_t* pre, int n) +{ + uint8_t* ptr = parser->read_buf + parser->read_head; + for (int i = 0; i < n; ++i) { + if (parser->read_head + i >= READ_BUF_LEN) { + //fprintf(stderr, "PAGE FAULT DURING READAHEAD\n"); + if (!page(parser)) { + return false; + } + ptr = parser->read_buf; + parser->read_head = -i; + memcpy(parser->read_buf + parser->read_head, pre, i); + assert(parser->read_buf[parser->read_head] == pre[0]); + } + if ((pre[i] = *ptr++) == '\0') { + return false; + } + } + return true; +} + +static inline uchar +read_utf8_char(SerdReader parser) +{ + if (parser->read_head == READ_BUF_LEN) { + return error(parser, "page fault\n"); + } + const uchar c = parser->read_buf[parser->read_head++]; + switch (c) { + case '\n': ++parser->cur.line; parser->cur.col = 0; break; + default: ++parser->cur.col; + } + /*while ((byte & 0xC0) == 0x80) { + // Starts with `10', continuation byte + character += (byte & 0x7F); + byte = getc(parser->fd); + }*/ + return c; +} 
+ +static inline uchar +peek_char(SerdReader parser) +{ + if (parser->eof) { + return EOF; + } + return parser->read_buf[parser->read_head]; +} + +static inline uchar +eat_char(SerdReader parser, const uchar character) +{ + const uchar c = parser->read_buf[parser->read_head++]; + switch (c) { + case '\0': return error(parser, "unexpected end of file\n"); + case '\n': ++parser->cur.line; parser->cur.col = 0; break; + default: ++parser->cur.col; + } + + if (c != character) { + return error(parser, "expected `%c', not `%c'\n", character, c); + } + if (parser->read_head == READ_BUF_LEN) { + TRY_RET(page(parser)); + } + assert(parser->read_head < READ_BUF_LEN); + if (parser->read_buf[parser->read_head] == '\0') { + parser->eof = true; + } + return c; +} + +static inline void +eat_string(SerdReader parser, const char* str, unsigned n) +{ + for (unsigned i = 0; i < n; ++i) { + eat_char(parser, str[i]); + } +} + +static inline bool +in_range(const uchar c, const uchar min, const uchar max) +{ + return (c >= min && c <= max); +} + +#ifdef STACK_DEBUG +static inline bool +stack_is_top_string(SerdReader parser, Ref ref) +{ + return ref == parser->alloc_stack[parser->n_allocs - 1]; +} +#endif + +static inline uint8_t* +stack_push(SerdReader parser, size_t n_bytes) +{ + const size_t new_size = parser->stack.size + n_bytes; + if (parser->stack.buf_size < new_size) { + parser->stack.buf_size = ((new_size / STACK_CHUNK_SIZE) + 1) * STACK_CHUNK_SIZE; + parser->stack.buf = realloc(parser->stack.buf, parser->stack.buf_size); + } + uint8_t* const ret = (parser->stack.buf + parser->stack.size); + parser->stack.size = new_size; + return ret; +} + +static inline intptr_t +pad_size(intptr_t size) +{ + return (size + 7) & (~7); +} + +// Make a new string from a non-UTF-8 C string (internal use only) +static Ref +push_string(SerdReader parser, const char* c_str, size_t n_bytes) +{ + // Align strings to 64-bits (assuming malloc/realloc are aligned to 64-bits) + const size_t stack_size = 
pad_size((intptr_t)parser->stack.size); + const size_t pad = stack_size - parser->stack.size; + SerdString* const str = (SerdString*)( + stack_push(parser, pad + sizeof(SerdString) + n_bytes) + pad); + str->n_bytes = n_bytes; + str->n_chars = n_bytes - 1; + memcpy(str->buf, c_str, n_bytes); +#ifdef STACK_DEBUG + parser->alloc_stack = realloc(parser->alloc_stack, sizeof(uint8_t*) * (++parser->n_allocs)); + parser->alloc_stack[parser->n_allocs - 1] = ((uint8_t*)str - parser->stack.buf); +#endif + return (uint8_t*)str - parser->stack.buf; +} + +static inline SerdString* +deref(SerdReader parser, const Ref ref) +{ + if (ref) { + return (SerdString*)(parser->stack.buf + ref); + } + return NULL; +} + +static inline void +push_char(SerdReader parser, Ref ref, const uchar c) +{ + #ifdef STACK_DEBUG + assert(stack_is_top_string(parser, ref)); + #endif + stack_push(parser, 1); + SerdString* const str = deref(parser, ref); + ++str->n_bytes; + if ((c & 0xC0) == 0x80) { + fprintf(stderr, "PUSH WIDE CHAR %X\n", c); + } else { + ++str->n_chars; + } + str->buf[str->n_bytes - 2] = c; + str->buf[str->n_bytes - 1] = '\0'; +} + +static void +pop_string(SerdReader parser, Ref ref) +{ + if (ref) { + #ifdef STACK_DEBUG + assert(stack_is_top_string(parser, ref)); + --parser->n_allocs; + #endif + parser->stack.size -= deref(parser, ref)->n_bytes; + } +} + +static inline void +emit_statement(SerdReader parser, + const Node* g, const Node* s, const Node* p, const Node* o) +{ + parser->statement_handler(parser->handle, + g ? 
deref(parser, g->value) : NULL, + deref(parser, s->value), s->type, + deref(parser, p->value), p->type, + deref(parser, o->value), o->type, + deref(parser, o->datatype), deref(parser, o->lang)); +} + + +static bool read_predicateObjectList(SerdReader parser, const Node* subject); + +// [40] hex ::= [#x30-#x39] | [#x41-#x46] +static inline uchar +read_hex(SerdReader parser) +{ + const uchar c = peek_char(parser); + if (in_range(c, 0x30, 0x39) || in_range(c, 0x41, 0x46)) { + return c; + } else { + return error(parser, "illegal hexadecimal digit `%c'\n", c); + } +} + +static inline uchar +read_hex_escape(SerdReader parser, unsigned length) +{ + uchar ret = 0; + unsigned mult = 1; + for (unsigned i = 0; i < length; ++i) { + const uchar c = read_hex(parser); + ret += (c * mult); + mult *= 8; + } + return ret; +} + +static inline uchar +character_escape(SerdReader parser, const uchar esc) +{ + switch (esc) { + case '\\': + return eat_char(parser, '\\'); + case 'u': + eat_char(parser, esc); + return read_hex_escape(parser, 4); + case 'U': + eat_char(parser, esc); + return read_hex_escape(parser, 8); + default: + return 0; + } +} + +// [38] character ::= '\u' hex hex hex hex +// | '\U' hex hex hex hex hex hex hex hex +// | '\\' +// | [#x20-#x5B] | [#x5D-#x10FFFF] +static inline uchar +read_character(SerdReader parser) +{ + const uchar c = peek_char(parser); + uchar esc; + switch (c) { + case '\\': + eat_char(parser, '\\'); + esc = character_escape(parser, peek_char(parser)); + if (esc) { + return esc; + } else { + return error(parser, "illegal escape `\\%c'\n", esc); + } + default: + if (in_range(c, 0x20, 0x5B) || in_range(c, 0x5D, 0x10FFF)) { + return eat_char(parser, c); + } else { + return error(parser, "illegal character `%c'\n", c); + } + } +} + +static inline uchar +echaracter_escape(SerdReader parser, const uchar esc) +{ + const uchar ret = character_escape(parser, esc); + if (ret) { + return ret; + } + switch (esc) { + case 't': + eat_char(parser, 't'); + return 
'\t'; + case 'n': + eat_char(parser, 'n'); + return '\n'; + case 'r': + eat_char(parser, 'r'); + return '\r'; + default: + return 0; + } +} + +// [39] echaracter ::= character | '\t' | '\n' | '\r' +static inline uchar +read_echaracter(SerdReader parser) +{ + uchar c = peek_char(parser); + uchar esc; + switch (c) { + case '\\': + eat_char(parser, '\\'); + esc = echaracter_escape(parser, peek_char(parser)); + if (esc) { + return esc; + } else { + return error(parser, "illegal escape `\\%c'\n", esc); + } + default: + return read_character(parser); + } +} + +static inline uchar +scharacter_escape(SerdReader parser, const uchar esc) +{ + const uchar ret = echaracter_escape(parser, esc); + if (ret) { + return ret; + } else if (esc == '"') { + return eat_char(parser, '"'); + } + return 0; +} + +static inline uchar +ucharacter_escape(SerdReader parser, const uchar esc) +{ + const uchar ret = echaracter_escape(parser, esc); + if (ret) { + return ret; + } else if (esc == '>') { + return eat_char(parser, '>'); + } + return 0; +} + +// [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD +static inline uchar +read_lcharacter(SerdReader parser, bool* is_escape) +{ + *is_escape = false; + const uchar c = peek_char(parser); + uchar esc; + switch (c) { + case '\\': + eat_char(parser, '\\'); + esc = scharacter_escape(parser, peek_char(parser)); + if (esc) { + *is_escape = true; + return esc; + } else { + return error(parser, "illegal escape `\\%c'\n", esc); + } + case 0x9: case 0xA: case 0xD: + eat_char(parser, c); + return c; + default: + return read_echaracter(parser); + } +} + +// [42] scharacter ::= ( echaracter - #x22 ) | '\"' +static inline uchar +read_scharacter(SerdReader parser) +{ + const uchar c = peek_char(parser); + uchar esc; + switch (c) { + case '\\': + eat_char(parser, '\\'); + esc = scharacter_escape(parser, peek_char(parser)); + if (esc) { + return esc; + } else { + return error(parser, "illegal escape `\\%c'\n", esc); + } + case '\"': + return 0; + default: + 
return read_character(parser); + } +} + +// Spec: [41] ucharacter ::= ( character - #x3E ) | '\>' +// Actual: [41] ucharacter ::= ( echaracter - #x3E ) | '\>' +static inline uchar +read_ucharacter(SerdReader parser) +{ + const uchar c = peek_char(parser); + uchar esc; + switch (c) { + case '\\': + eat_char(parser, '\\'); + esc = ucharacter_escape(parser, peek_char(parser)); + if (esc) { + return esc; + } else { + return error(parser, "illegal escape `\\%c'\n", esc); + } + case '>': + return 0; + default: + return read_character(parser); + } +} + +// [10] comment ::= '#' ( [^#xA #xD] )* +static void +read_comment(SerdReader parser) +{ + eat_char(parser, '#'); + uchar c; + while (((c = peek_char(parser)) != 0xA) && (c != 0xD)) { + eat_char(parser, c); + } +} + +// [24] ws ::= #x9 | #xA | #xD | #x20 | comment +static inline bool +read_ws(SerdReader parser, bool required) +{ + const uchar c = peek_char(parser); + switch (c) { + case '\0': + assert(parser->eof); + return false; + case 0x9: case 0xA: case 0xD: case 0x20: + eat_char(parser, c); + return true; + case '#': + read_comment(parser); + return true; + default: + if (required) { + error(parser, "expected whitespace\n"); + } + return false; + } +} + +static inline bool +read_ws_plus(SerdReader parser) +{ + if (read_ws(parser, true)) { + while (read_ws(parser, false)) {} + return true; + } + return false; +} + +static inline void +read_ws_star(SerdReader parser) +{ + while (read_ws(parser, false)) {} +} + +// [37] longSerdString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22 +static Ref +read_longString(SerdReader parser) +{ + eat_string(parser, "\"\"\"", 3); + Ref str = push_string(parser, "", 1); + uchar c; + bool is_escape = false; + while ((c = read_lcharacter(parser, &is_escape)) != 0) { + if (c == '\"' && !is_escape) { + uint8_t pre[2]; + readahead(parser, pre, 2); + if (pre[0] == '\"' && pre[1] == '\"') { + eat_char(parser, '\"'); + eat_char(parser, '\"'); + return str; + } + } + push_char(parser, str, c); + 
} + eat_string(parser, "\"\"\"", 3); + return str; +} + +// [36] string ::= #x22 scharacter* #x22 +static Ref +read_string(SerdReader parser) +{ + eat_char(parser, '\"'); + Ref str = push_string(parser, "", 1); + uchar c; + while ((c = read_scharacter(parser)) != 0) { + push_char(parser, str, c); + } + eat_char(parser, '\"'); + return str; +} + +// [35] quotedString ::= string | longSerdString +static Ref +read_quotedString(SerdReader parser) +{ + uint8_t pre[3]; + readahead(parser, pre, 3); + assert(pre[0] == '\"'); + switch (pre[1]) { + case '\"': + if (pre[2] == '\"') + return read_longString(parser); + else + return read_string(parser); + default: + return read_string(parser); + } +} + +// [34] relativeURI ::= ucharacter* +static inline Ref +read_relativeURI(SerdReader parser) +{ + uchar c; + Ref str = push_string(parser, "", 1); + while ((c = read_ucharacter(parser)) != 0) { + push_char(parser, str, c); + } + return str; +} + +// [30] nameStartChar ::= [A-Z] | "_" | [a-z] +// | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] +// | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] +// | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +static inline uchar +read_nameStartChar(SerdReader parser, bool required) +{ + const uchar c = peek_char(parser); + if (in_range(c, 'A', 'Z') || (c == '_') || in_range(c, 'a', 'z')) { + return eat_char(parser, c); + } else { + if (required) { + error(parser, "illegal character `%c'\n", c); + } + return 0; + } +} + +// [31] nameChar ::= nameStartChar | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] +static inline uchar +read_nameChar(SerdReader parser) +{ + uchar c = read_nameStartChar(parser, false); + if (c) + return c; + + switch ((c = peek_char(parser))) { + case '-': case 0xB7: case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return eat_char(parser, c); + default: + if (in_range(c, 0x300, 0x036F) 
|| in_range(c, 0x203F, 0x2040)) { + return eat_char(parser, c); + } + } + return 0; +} + +// [33] prefixName ::= ( nameStartChar - '_' ) nameChar* +static Ref +read_prefixName(SerdReader parser) +{ + uchar c = peek_char(parser); + if (c == '_') { + error(parser, "unexpected `_'\n"); + return 0; + } + TRY_RET(c = read_nameStartChar(parser, false)); + Ref str = push_string(parser, "", 1); + push_char(parser, str, c); + while ((c = read_nameChar(parser)) != 0) { + push_char(parser, str, c); + } + return str; +} + +// [32] name ::= nameStartChar nameChar* +static Ref +read_name(SerdReader parser, Ref dest, bool required) +{ + uchar c = read_nameStartChar(parser, required); + if (!c) { + if (required) { + error(parser, "illegal character at start of name\n"); + } + return 0; + } + do { + push_char(parser, dest, c); + } while ((c = read_nameChar(parser)) != 0); + return dest; +} + +// [29] language ::= [a-z]+ ('-' [a-z0-9]+ )* +static Ref +read_language(SerdReader parser) +{ + const uchar start = peek_char(parser); + if (!in_range(start, 'a', 'z')) { + error(parser, "unexpected `%c'\n", start); + return 0; + } + Ref str = push_string(parser, "", 1); + push_char(parser, str, start); + uchar c; + while ((c = peek_char(parser)) && in_range(c, 'a', 'z')) { + push_char(parser, str, eat_char(parser, c)); + } + if (peek_char(parser) == '-') { + push_char(parser, str, eat_char(parser, '-')); + while ((c = peek_char(parser)) && (in_range(c, 'a', 'z') || in_range(c, '0', '9'))) { + push_char(parser, str, eat_char(parser, c)); + } + } + return str; +} + +// [28] uriref ::= '<' relativeURI '>' +static Ref +read_uriref(SerdReader parser) +{ + eat_char(parser, '<'); + Ref const str = read_relativeURI(parser); + eat_char(parser, '>'); + return str; +} + +// [27] qname ::= prefixName? ':' name? 
+static Ref +read_qname(SerdReader parser) +{ + Ref prefix = read_prefixName(parser); + if (!prefix) { + prefix = push_string(parser, "", 1); + } + push_char(parser, prefix, eat_char(parser, ':')); + Ref str = read_name(parser, prefix, false); + if (parser->err) + return 0; + return str ? str : prefix; +} + + +static Ref +read_0_9(SerdReader parser, Ref str, bool at_least_one) +{ + uchar c; + if (at_least_one) { + TRY_RET(in_range(c = peek_char(parser), '0', '9')); + push_char(parser, str, eat_char(parser, c)); + } + while (in_range((c = peek_char(parser)), '0', '9')) { + push_char(parser, str, eat_char(parser, c)); + } + return str; +} + +// [19] exponent ::= [eE] ('-' | '+')? [0-9]+ +// [18] decimal ::= ( '-' | '+' )? ( [0-9]+ '.' [0-9]* +// | '.' ([0-9])+ +// | ([0-9])+ ) +// [17] double ::= ( '-' | '+' )? ( [0-9]+ '.' [0-9]* exponent +// | '.' ([0-9])+ exponent +// | ([0-9])+ exponent ) +// [16] integer ::= ( '-' | '+' ) ? [0-9]+ +static bool +read_number(SerdReader parser, Node* dest) +{ + #define XSD_DECIMAL "http://www.w3.org/2001/XMLSchema#decimal" + #define XSD_DOUBLE "http://www.w3.org/2001/XMLSchema#double" + #define XSD_INTEGER "http://www.w3.org/2001/XMLSchema#integer" + Ref str = push_string(parser, "", 1); + uchar c = peek_char(parser); + bool has_decimal = false; + Ref datatype = 0; + if (c == '-' || c == '+') { + push_char(parser, str, eat_char(parser, c)); + } + if ((c = peek_char(parser)) == '.') { + has_decimal = true; + // decimal case 2 (e.g. '.0' or `-.0' or `+.0') + push_char(parser, str, eat_char(parser, c)); + TRY_THROW(str = read_0_9(parser, str, true)); + } else { + // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ... 
+ TRY_THROW(str = read_0_9(parser, str, true)); + if ((c = peek_char(parser)) == '.') { + has_decimal = true; + push_char(parser, str, eat_char(parser, c)); + TRY_THROW(str = read_0_9(parser, str, false)); + } + } + c = peek_char(parser); + if (c == 'e' || c == 'E') { + // double + push_char(parser, str, eat_char(parser, c)); + str = read_0_9(parser, str, true); + datatype = push_string(parser, XSD_DOUBLE, strlen(XSD_DOUBLE) + 1); + } else if (has_decimal) { + datatype = push_string(parser, XSD_DECIMAL, strlen(XSD_DECIMAL) + 1); + } else { + datatype = push_string(parser, XSD_INTEGER, strlen(XSD_INTEGER) + 1); + } + *dest = make_node(LITERAL, str, datatype, 0); + assert(dest->value); + return true; +except: + pop_string(parser, datatype); + pop_string(parser, str); + return false; +} + +// [25] resource ::= uriref | qname +static bool +read_resource(SerdReader parser, Node* dest) +{ + switch (peek_char(parser)) { + case '<': + *dest = make_node(URI, read_uriref(parser), 0, 0); + break; + default: + *dest = make_node(QNAME, read_qname(parser), 0, 0); + } + return (dest->value != 0); +} + +// [14] literal ::= quotedString ( '@' language )? 
| datatypeSerdString +// | integer | double | decimal | boolean +static bool +read_literal(SerdReader parser, Node* dest) +{ + Ref str = 0; + Node datatype = { 0, 0, 0, 0 }; + const uchar c = peek_char(parser); + if (in_range(c, '0', '9') || c == '-' || c == '+') { + return read_number(parser, dest); + } else if (c == '\"') { + str = read_quotedString(parser); + if (!str) { + return false; + } + + Ref lang = 0; + switch (peek_char(parser)) { + case '^': + eat_char(parser, '^'); + eat_char(parser, '^'); + TRY_THROW(read_resource(parser, &datatype)); + break; + case '@': + eat_char(parser, '@'); + TRY_THROW(lang = read_language(parser)); + } + *dest = make_node(LITERAL, str, datatype.value, lang); + } else { + *dest = make_node(QNAME, read_qname(parser), 0, 0); + } + return true; +except: + pop_string(parser, str); + return false; +} + +// [12] predicate ::= resource +static Node +read_predicate(SerdReader parser) +{ + Node node = { 0, 0, 0, 0 }; + read_resource(parser, &node); + return node; +} + +// [9] verb ::= predicate | 'a' +static Node +read_verb(SerdReader parser) +{ + #define RDF_TYPE "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + uint8_t pre[2]; + readahead(parser, pre, 2); + switch (pre[0]) { + case 'a': + switch (pre[1]) { + case 0x9: case 0xA: case 0xD: case 0x20: + eat_char(parser, pre[0]); + return make_node(URI, push_string(parser, RDF_TYPE, 48), 0, 0); + default: break; // fall through + } + default: + return read_predicate(parser); + } +} + +// [26] nodeID ::= '_:' name +static Ref +read_nodeID(SerdReader parser) +{ + eat_char(parser, '_'); + eat_char(parser, ':'); + Ref str = push_string(parser, "", 1); + return read_name(parser, str, true); +} + +static Ref +blank_id(SerdReader parser) +{ + char str[32]; + const int len = snprintf(str, 32, "genid%u", parser->next_id++); + return push_string(parser, str, len + 1); +} + +// Spec: [21] blank ::= nodeID | '[]' | '[' predicateObjectList ']' | collection +// Actual: [21] blank ::= nodeID | '[ ws* 
//                     ]' | '[' ws* predicateObjectList ws* ']' | collection
// Reads a blank node into `dest'.  For the `[ ... ]' form a fresh id is
// generated and the enclosed predicate-object list is read with that node as
// its subject.  Results of the nested reads are not checked here.
static bool
read_blank(SerdReader parser, Node* dest)
{
	switch (peek_char(parser)) {
	case '_':
		*dest = make_node(BLANK, read_nodeID(parser), 0, 0);
		return true;
	case '[':
		eat_char(parser, '[');
		read_ws_star(parser);
		if (peek_char(parser) == ']') {
			eat_char(parser, ']');
			*dest = make_node(BLANK, blank_id(parser), 0, 0);
			return true;
		} else {
			*dest = make_node(BLANK, blank_id(parser), 0, 0);
			read_predicateObjectList(parser, dest);
			read_ws_star(parser);
			eat_char(parser, ']');
			return true;
		}
	default:
		error(parser, "illegal blank node\n");
	}
	// TODO: collections
	return false;
}

/** True iff `c' can directly follow an object: whitespace, `#', `.' or `;'. */
inline static bool
is_object_end(const uchar c)
{
	switch (c) {
	case 0x9: case 0xA: case 0xD: case 0x20:
	case '#': case '.': case ';':
		return true;
	default:
		return false;
	}
}

// [13] object ::= resource | blank | literal
// Recurses, calling statement_handler for every statement encountered.
// Returns true iff an object was read and its statement emitted.  The
// object's strings are popped before returning, in reverse push order.
static bool
read_object(SerdReader parser, const Node* subject, const Node* predicate)
{
	static const char* const XSD_BOOLEAN     = "http://www.w3.org/2001/XMLSchema#boolean";
	static const size_t      XSD_BOOLEAN_LEN = 40;

	bool        ret = false;
	Node        o   = { 0, 0, 0, 0 };
	const uchar c   = peek_char(parser);
	// Dispatch on the first character where that is unambiguous
	switch (c) {
	case '[': case '(': case '_':
		TRY_THROW(ret = read_blank(parser, &o));
		break;
	case '<': case ':':
		TRY_THROW(ret = read_resource(parser, &o));
		break;
	case '\"': case '+': case '-':
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
		TRY_THROW(ret = read_literal(parser, &o));
		break;
	case '.':
		TRY_THROW(ret = read_literal(parser, &o));
	default: break;
	}

	if (!ret) {
		assert(o.value == 0);
		/* Either a boolean literal, or a qname.
		   Unfortunately there is no way to distinguish these without a lot of readahead,
		   since `true' or `false' could be the start of a qname.
		*/
		uint8_t pre[6];
		readahead(parser, pre, 6);
		if (!strncmp((char*)pre, "true", 4) && is_object_end(pre[4])) {
			eat_string(parser, "true", 4);
			const Ref value    = push_string(parser, "true", 5);
			const Ref datatype = push_string(parser, XSD_BOOLEAN, XSD_BOOLEAN_LEN + 1);
			o = make_node(LITERAL, value, datatype, 0);
		} else if (!strncmp((char*)pre, "false", 5) && is_object_end(pre[5])) {
			eat_string(parser, "false", 5);
			const Ref value    = push_string(parser, "false", 6);
			const Ref datatype = push_string(parser, XSD_BOOLEAN, XSD_BOOLEAN_LEN + 1);
			o = make_node(LITERAL, value, datatype, 0);
		} else if (!is_object_end(c)) {
			o = make_node(QNAME, read_qname(parser), 0, 0);
		}
	}

	if (o.value != 0) {
		emit_statement(parser, NULL, subject, predicate, &o);
		pop_string(parser, o.lang);
		pop_string(parser, o.datatype);
		pop_string(parser, o.value);
		return true;
	}
except:
	pop_string(parser, o.lang);
	pop_string(parser, o.datatype);
	pop_string(parser, o.value);
	return false;
}

// Spec:   [8] objectList ::= object ( ',' object )*
// Actual: [8] objectList ::= object ( ws* ',' ws* object )*
// Stops and returns false at the first object that fails to parse.
static bool
read_objectList(SerdReader parser, const Node* subject, const Node* predicate)
{
	TRY_RET(read_object(parser, subject, predicate));
	read_ws_star(parser);
	while (peek_char(parser) == ',') {
		eat_char(parser, ',');
		read_ws_star(parser);
		TRY_RET(read_object(parser, subject, predicate));
		read_ws_star(parser);
	}
	return true;
}

// Spec:   [7] predicateObjectList ::= verb objectList ( ';' verb objectList )* ( ';' )?
// Actual: [7] predicateObjectList ::= verb ws+ objectList ( ws* ';' ws* verb ws+ objectList )* ( ';' )?
// Reads every `verb objectList' group for `subject', emitting one statement
// per object.  Each predicate string is popped as soon as its object list has
// been read.  Returns false at end of input or on a parse failure.
static bool
read_predicateObjectList(SerdReader parser, const Node* subject)
{
	if (parser->eof) {
		return false;
	}
	Node predicate = read_verb(parser);
	read_ws_plus(parser);
	TRY_THROW(read_objectList(parser, subject, &predicate));
	pop_string(parser, predicate.value);
	predicate.value = 0;  // Already popped: keep `except' from popping it again
	read_ws_star(parser);
	while (peek_char(parser) == ';') {
		eat_char(parser, ';');
		read_ws_star(parser);
		switch (peek_char(parser)) {
		case '.': case ']':
			// Trailing `;' with nothing after it
			return true;
		default:
			TRY_THROW((predicate = read_verb(parser)).value != 0);
			read_ws_plus(parser);
			TRY_THROW(read_objectList(parser, subject, &predicate));
			pop_string(parser, predicate.value);
			predicate.value = 0;
			read_ws_star(parser);
		}
	}
	//pop_string(parser, predicate.value);
	return true;
except:
	pop_string(parser, predicate.value);
	return false;
}

// [11] subject ::= resource | blank
// The returned node has value 0 if no subject could be read.
static Node
read_subject(SerdReader parser)
{
	Node subject = { 0, 0, 0, 0 };
	switch (peek_char(parser)) {
	case '[': case '(': case '_':
		read_blank(parser, &subject);
		break;
	default:
		read_resource(parser, &subject);
	}
	return subject;
}

// Spec:   [6] triples ::= subject predicateObjectList
// Actual: [6] triples ::= subject ws+ predicateObjectList
// The subject string is popped once its predicate-object list has been read.
static bool
read_triples(SerdReader parser)
{
	const Node subject = read_subject(parser);
	if (subject.value != 0) {
		read_ws_plus(parser);
		const bool ret = read_predicateObjectList(parser, &subject);
		pop_string(parser, subject.value);
		return ret;
	}
	return false;
}

// [5] base ::= '@base' ws+ uriref
// Passes the URI to base_handler, then pops it.
static bool
read_base(SerdReader parser)
{
	// `@' is already eaten in read_directive
	eat_string(parser, "base", 4);
	read_ws_plus(parser);
	Ref uri;
	TRY_RET(uri = read_uriref(parser));
	parser->base_handler(parser->handle, deref(parser, uri));
	pop_string(parser, uri);
	return true;
}

// Spec:   [4] prefixID ::= '@prefix' ws+ prefixName?
':' uriref +// Actual: [4] prefixID ::= '@prefix' ws+ prefixName? ':' ws* uriref +static bool +read_prefixID(SerdReader parser) +{ + // `@' is already eaten in read_directive + eat_string(parser, "prefix", 6); + read_ws_plus(parser); + bool ret = false; + Ref name = read_prefixName(parser); + if (!name) { + name = push_string(parser, "", 1); + } + TRY_THROW(eat_char(parser, ':') == ':'); + read_ws_star(parser); + Ref uri = 0; + TRY_THROW(uri = read_uriref(parser)); + ret = parser->prefix_handler(parser->handle, + deref(parser, name), + deref(parser, uri)); + pop_string(parser, uri); +except: + pop_string(parser, name); + return ret; +} + +// [3] directive ::= prefixID | base +static bool +read_directive(SerdReader parser) +{ + eat_char(parser, '@'); + switch (peek_char(parser)) { + case 'b': + return read_base(parser); + case 'p': + return read_prefixID(parser); + default: + return error(parser, "illegal directive\n"); + } +} + +// Spec: [1] statement ::= directive '.' | triples '.' | ws+ +// Actual: [1] statement ::= directive ws* '.' | triples ws* '.' 
| ws+ +static bool +read_statement(SerdReader parser) +{ + read_ws_star(parser); + if (parser->eof) { + return true; + } + switch (peek_char(parser)) { + case '@': + TRY_RET(read_directive(parser)); + break; + default: + TRY_RET(read_triples(parser)); + break; + } + read_ws_star(parser); + return eat_char(parser, '.'); +} + +// [1] turtleDoc ::= statement +static bool +read_turtleDoc(SerdReader parser) +{ + while (!parser->err && !parser->eof) { + TRY_RET(read_statement(parser)); + } + return true;//!parser->err; +} + +SERD_API +SerdReader +serd_reader_new(SerdSyntax syntax, + void* handle, + SerdBaseHandler base_handler, + SerdPrefixHandler prefix_handler, + SerdStatementHandler statement_handler) +{ + const Cursor cur = { NULL, 0, 0 }; + SerdReader reader = malloc(sizeof(struct SerdReaderImpl)); + reader->handle = handle; + reader->base_handler = base_handler; + reader->prefix_handler = prefix_handler; + reader->statement_handler = statement_handler; + reader->fd = 0; + reader->stack.buf = malloc(STACK_CHUNK_SIZE); + reader->stack.buf_size = STACK_CHUNK_SIZE; + reader->stack.size = 8; + reader->cur = cur; + reader->next_id = 1; + reader->err = 0; + reader->read_buf = (uint8_t*)malloc(READ_BUF_LEN) + MAX_READAHEAD; + reader->read_head = 0; + reader->eof = false; +#ifdef STACK_DEBUG + reader->alloc_stack = 0; + reader->n_allocs = 0; +#endif + return reader; +} + +SERD_API +bool +serd_reader_read_file(SerdReader reader, FILE* file, const uint8_t* name) +{ + SerdReader const me = (SerdReader)reader; + const Cursor cur = { name, 1, 1 }; + me->fd = file; + me->cur = cur; + fread(me->read_buf, 1, READ_BUF_LEN, file); + const bool ret = read_turtleDoc(me); + me->fd = 0; + me->cur = cur; + return ret; +} + +SERD_API +void +serd_reader_free(SerdReader reader) +{ + SerdReader const me = (SerdReader)reader; +#ifdef STACK_DEBUG + free(me->alloc_stack); +#endif + free(me->stack.buf); + free(me->read_buf - MAX_READAHEAD); + free(me); +} diff --git a/src/serdi.c b/src/serdi.c 
new file mode 100644 index 00000000..f5be93c3 --- /dev/null +++ b/src/serdi.c @@ -0,0 +1,234 @@ +/* Serd, an RDF serialisation library. + * Copyright 2011 David Robillard <d@drobilla.net> + * + * Serd is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Serd is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "serd/serd.h" + +typedef struct { + FILE* out_fd; + SerdNamespaces ns; + SerdString* base_uri_str; + SerdURI base_uri; +} State; + +static bool +event_base(void* handle, + const SerdString* uri_str) +{ + State* const state = (State*)handle; + + SerdURI uri; + if (!serd_uri_parse(uri_str->buf, &uri)) { + return false; + } + + SerdURI base_uri = {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},false}; + SerdString* base_uri_str; + if (!uri.scheme.len) { + // URI has no scheme (relative by definition), resolve + SerdURI abs_base_uri; + if (!serd_uri_resolve(&uri, &state->base_uri, &abs_base_uri)) { + fprintf(stderr, "error: failed to resolve new base URI\n"); + assert(false); + return false; + } + base_uri_str = serd_uri_serialise(&abs_base_uri, &base_uri); + // FIXME: double parse + serd_uri_parse(base_uri_str->buf, &base_uri); + } else { + // Absolute URI, use literally as new base URI + base_uri_str = serd_string_copy(uri_str); + // FIXME: double parse + serd_uri_parse(base_uri_str->buf, &base_uri); + } + + // Replace the old base URI + free(state->base_uri_str); + 
state->base_uri_str = base_uri_str; + state->base_uri = base_uri; + + return true; +} + +static bool +event_prefix(void* handle, + const SerdString* name, + const SerdString* uri_string) +{ + State* const state = (State*)handle; + if (serd_uri_string_is_relative(uri_string->buf)) { + SerdURI uri; + if (!serd_uri_parse(uri_string->buf, &uri)) { + return false; + } + SerdURI abs_uri; + if (!serd_uri_resolve(&uri, &state->base_uri, &abs_uri)) { + return false; + } + SerdURI new_abs_uri; + SerdString* abs_uri_string = serd_uri_serialise(&abs_uri, &new_abs_uri); + serd_namespaces_add(state->ns, name, abs_uri_string); + } else { + serd_namespaces_add(state->ns, name, uri_string); + } + return true; +} + +static inline bool +write_node(State* state, + const SerdString* str, + SerdNodeType type, + const SerdString* datatype, + const SerdString* lang) +{ + SerdRange uri_prefix; + SerdRange uri_suffix; + switch (type) { + case BLANK: + fwrite("_:", 1, 2, state->out_fd); + fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd); + break; + case QNAME: + if (!serd_namespaces_expand(state->ns, str, &uri_prefix, &uri_suffix)) { + fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf); + return false; + } + fwrite("<", 1, 1, state->out_fd); + fwrite(uri_prefix.buf, 1, uri_prefix.len - 1, state->out_fd); + fwrite(uri_suffix.buf, 1, uri_suffix.len - 1, state->out_fd); + fwrite(">", 1, 1, state->out_fd); + break; + case URI: + if (serd_uri_string_is_relative(str->buf)) { + SerdURI uri; + if (serd_uri_parse(str->buf, &uri)) { + SerdURI abs_uri; + if (serd_uri_resolve(&uri, &state->base_uri, &abs_uri)) { + fwrite("<", 1, 1, state->out_fd); + serd_uri_write(&abs_uri, state->out_fd); + fwrite(">", 1, 1, state->out_fd); + return true; + } + } + } else { + fwrite("<", 1, 1, state->out_fd); + fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd); + fwrite(">", 1, 1, state->out_fd); + return true; + } + return false; + case LITERAL: + fwrite("\"", 1, 1, state->out_fd); + for 
(size_t i = 0; i < str->n_bytes - 1; ++i) { + const char c = str->buf[i]; + switch (c) { + case '\\': fwrite("\\\\", 1, 2, state->out_fd); break; + case '\n': fwrite("\\n", 1, 2, state->out_fd); break; + case '\r': fwrite("\\r", 1, 2, state->out_fd); break; + case '\t': fwrite("\\t", 1, 2, state->out_fd); break; + case '"': fwrite("\\\"", 1, 2, state->out_fd); break; + default: + fwrite(&c, 1, 1, state->out_fd); + } + } + fwrite("\"", 1, 1, state->out_fd); + if (lang) { + fwrite("@\"", 1, 2, state->out_fd); + fwrite(lang->buf, 1, lang->n_bytes - 1, state->out_fd); + fwrite("\"", 1, 1, state->out_fd); + } else if (datatype) { + fwrite("^^", 1, 2, state->out_fd); + write_node(state, datatype, URI, NULL, NULL); + } + break; + } + return true; +} + +static bool +event_statement(void* handle, + const SerdString* graph, + const SerdString* subject, + SerdNodeType subject_type, + const SerdString* predicate, + SerdNodeType predicate_type, + const SerdString* object, + SerdNodeType object_type, + const SerdString* object_datatype, + const SerdString* object_lang) +{ + State* const state = (State*)handle; + FILE* const fd = state->out_fd; + write_node(state, subject, subject_type, NULL, NULL); + fwrite(" ", 1, 1, fd); + write_node(state, predicate, predicate_type, NULL, NULL); + fwrite(" ", 1, 1, fd); + write_node(state, object, object_type, object_datatype, object_lang); + fwrite(" .\n", 1, 3, fd); + return true; +} + +int +main(int argc, char** argv) +{ + if (/*argc != 2 && */argc != 3) { + fprintf(stderr, "Bad parameters\n"); + return 1; + } + + const uint8_t* const in_filename = (uint8_t*)argv[1]; + const uint8_t* base_uri_str = in_filename; + + SerdURI base_uri; + if (argc > 2) { + base_uri_str = (const uint8_t*)argv[2]; + if (!serd_uri_parse(base_uri_str, &base_uri)) { + fprintf(stderr, "invalid base uri: %s\n", base_uri_str); + return 1; + } + + } + + FILE* const in_fd = fopen((const char*)in_filename, "r"); + FILE* out_fd = stdout; + + if (!in_fd) { + 
fprintf(stderr, "failed to open file\n"); + return 1; + } + + //SerdURI null_uri = {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}}; + State state = { out_fd, serd_namespaces_new(), serd_string_new(base_uri_str), base_uri }; + + SerdReader reader = serd_reader_new( + SERD_TURTLE, &state, event_base, event_prefix, event_statement); + + const bool success = serd_reader_read_file(reader, in_fd, in_filename); + serd_reader_free(reader); + fclose(in_fd); + serd_namespaces_free(state.ns); + free(state.base_uri_str); + + if (success) { + return 0; + } + + return 1; +} diff --git a/src/uri.c b/src/uri.c new file mode 100644 index 00000000..d98f07ff --- /dev/null +++ b/src/uri.c @@ -0,0 +1,428 @@ +/* Serd, an RDF serialisation library. + * Copyright 2011 David Robillard <d@drobilla.net> + * + * Serd is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Serd is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "serd/serd.h" + +//#define URI_DEBUG 1 + +/** Return true if @a c lies within [min...max] (inclusive) */ +static inline bool +in_range(const char c, const char min, const char max) +{ + return (c >= min && c <= max); +} + +/** RFC2234: ALPHA := %x41-5A / %x61-7A ; A-Z / a-z */ +static inline bool +is_alpha(const uint8_t c) +{ + return in_range(c, 'A', 'Z') || in_range(c, 'a', 'z'); +} + +/** RFC2234: DIGIT ::= %x30-39 ; 0-9 */ +static inline bool +is_digit(const uint8_t c) +{ + return in_range(c, '0', '9'); +} + +/** Return true if @a uri is relative (i.e. does not start with a scheme) */ +SERD_API +bool +serd_uri_string_is_relative(const uint8_t* utf8) +{ + // RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + if (!is_alpha(utf8[0])) { + return true; // Invalid scheme initial character, URI is relative + } + for (uint8_t c = *++utf8; (c = *utf8) != '\0'; ++utf8) { + switch (c) { + case ':': + return false; // End of scheme, URI is absolute + case '+': case '-': case '.': + break; // Valid scheme character, continue + default: + if (!is_alpha(c) && !is_digit(c)) { + return true; // Invalid scheme character, URI is relative + } + } + } + + return true; +} + +#ifdef URI_DEBUG +static void +serd_uri_dump(const SerdURI* uri, FILE* file) +{ +#define PRINT_PART(range, name) \ + if (range.buf) { \ + fprintf(stderr, " " name " = "); \ + fwrite((range).buf, 1, (range).len, stderr); \ + fprintf(stderr, "\n"); \ + } + + PRINT_PART(uri->scheme, "scheme"); + PRINT_PART(uri->authority, "authority"); + PRINT_PART(uri->path_base, "path_base"); + PRINT_PART(uri->path, "path"); + PRINT_PART(uri->query, "query"); + PRINT_PART(uri->fragment, "fragment"); +} +#endif + +SERD_API +bool +serd_uri_parse(const uint8_t* utf8, SerdURI* uri) +{ + static const SerdURI null_uri = {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},false}; + *uri = null_uri; + assert(uri->path_base.buf == NULL); + 
assert(uri->path_base.len == 0); + assert(uri->authority.len == 0); + + const uint8_t* ptr = utf8; + + /* See http://tools.ietf.org/html/rfc3986#section-3 + URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + */ + + /* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ + if (is_alpha(*ptr)) { + for (uint8_t c = *++ptr; true; c = *++ptr) { + switch (c) { + case '\0': case '/': case '?': case '#': + ptr = utf8; + goto path; // Relative URI (starts with path by definition) + case ':': + uri->scheme.buf = utf8; + uri->scheme.len = (ptr++) - utf8; + goto maybe_authority; // URI with scheme + case '+': case '-': case '.': + continue; + default: + if (is_alpha(c) || is_digit(c)) { + continue; + } + } + } + } + + /* S3.2: The authority component is preceded by a double slash ("//") + and is terminated by the next slash ("/"), question mark ("?"), + or number sign ("#") character, or by the end of the URI. + */ +maybe_authority: + if (*ptr == '/' && *(ptr + 1) == '/') { + ptr += 2; + uri->authority.buf = ptr; + assert(uri->authority.len == 0); + for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) { + switch (c) { + case '/': goto path; + case '?': goto query; + case '#': goto fragment; + default: + ++uri->authority.len; + } + } + } + + /* RFC3986 S3.3: The path is terminated by the first question mark ("?") + or number sign ("#") character, or by the end of the URI. + */ +path: + switch (*ptr) { + case '?': goto query; + case '#': goto fragment; + case '\0': goto end; + default: break; + } + uri->path.buf = ptr; + uri->path.len = 0; + for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) { + switch (c) { + case '?': goto query; + case '#': goto fragment; + default: + ++uri->path.len; + } + } + + /* RFC3986 S3.4: The query component is indicated by the first question + mark ("?") character and terminated by a number sign ("#") character + or by the end of the URI. 
+ */ +query: + if (*ptr == '?') { + uri->query.buf = ++ptr; + for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) { + switch (c) { + case '#': + goto fragment; + default: + ++uri->query.len; + } + } + } + + /* RFC3986 S3.5: A fragment identifier component is indicated by the + presence of a number sign ("#") character and terminated by the end + of the URI. + */ +fragment: + if (*ptr == '#') { + uri->fragment.buf = ptr; + while (*ptr++ != '\0') { + ++uri->fragment.len; + } + } + +end: + #ifdef URI_DEBUG + fprintf(stderr, "PARSE URI <%s>\n", utf8); + serd_uri_dump(uri, stderr); + fprintf(stderr, "\n"); + #endif + + return true; +} + +SERD_API +bool +serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) +{ + assert(!r->scheme.len); // r is relative + + /** See http://tools.ietf.org/html/rfc3986#section-5.2.2 */ + + t->path_base.buf = NULL; + t->path_base.len = 0; + t->base_uri_has_authority = base->authority.len; + if (r->scheme.len) { + t->scheme = r->scheme; + t->authority = r->authority; + t->path = r->path; + t->query = r->query; + } else { + if (r->authority.len) { + t->authority = r->authority; + t->path = r->path; + t->query = r->query; + } else { + t->path = r->path; + if (!r->path.len) { + t->path_base = base->path; + if (r->query.len) { + t->query = r->query; + } else { + t->query = base->query; + } + } else { + if (r->path.buf[0] != '/') { + t->path_base = base->path; + } + t->query = r->query; + } + t->authority = base->authority; + } + t->scheme = base->scheme; + } + t->fragment = r->fragment; + + #ifdef URI_DEBUG + fprintf(stderr, "RESOLVE URI\nBASE:\n"); + serd_uri_dump(base, stderr); + fprintf(stderr, "URI:\n"); + serd_uri_dump(r, stderr); + fprintf(stderr, "RESULT:\n"); + serd_uri_dump(t, stderr); + fprintf(stderr, "\n"); + #endif + return true; +} + +typedef size_t (*Sink)(const void* data, size_t size, size_t nmemb, void* stream); + +static size_t +serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream) +{ + /* See 
http://tools.ietf.org/html/rfc3986#section-5.3 */ + + size_t write_size = 0; +#define WRITE(buf, len) \ + write_size += len; \ + if (len) { \ + sink(buf, 1, len, stream); \ + } +#define WRITE_CHAR(c) WRITE(&(c), 1) +#define WRITE_COMPONENT(prefix, field, suffix) \ + if ((field).len) { \ + for (const char* c = prefix; *c != '\0'; ++c) { \ + WRITE(c, 1); \ + } \ + WRITE((field).buf, (field).len); \ + for (const char* c = suffix; *c != '\0'; ++c) { \ + WRITE(c, 1); \ + } \ + } + + WRITE_COMPONENT("", uri->scheme, ":"); + WRITE_COMPONENT("//", uri->authority, ""); + if (uri->path_base.len) { + if (!uri->path.buf && (uri->fragment.buf || uri->query.buf)) { + WRITE_COMPONENT("", uri->path_base, ""); + } else { + /* Merge paths, removing dot components. + See http://tools.ietf.org/html/rfc3986#section-5.2.3 + */ + if (uri->base_uri_has_authority && !uri->path_base.len) { + WRITE("/", 1); + WRITE_COMPONENT("", uri->path, ""); + } else { + const uint8_t* uri_first = uri->path.buf; + const uint8_t* uri_end = uri_first; + size_t up = 1; + if (uri_first) { + // Count and skip leading dot components + uri_end = uri->path.buf + uri->path.len; + while (uri_first < uri_end) { + if (!memcmp((const char*)uri_first, "./", 2)) { + uri_first += 2; + } else if (!memcmp((const char*)uri_first, "../", 3)) { + ++up; + uri_first += 3; + } else if (!memcmp((const char*)uri_first, "..", 2)) { + ++up; + uri_first += 2; + } else if (!memcmp((const char*)uri_first, ".", 1)) { + ++uri_first; + } else if (!memcmp((const char*)uri_first, "//", 1)) { + ++uri_first; + } else { + break; + } + } + } + + if (uri->path.buf && uri->path_base.buf) { + // Find the up'th last slash + const uint8_t* base_last = uri->path_base.buf + uri->path_base.len - 1; + //for (; base_last > uri->path_base.buf; --base_last) { + do { + if (*base_last == '/') { + --up; + } + } while (up > 0 && (--base_last > uri->path_base.buf)); + + // Write base URI prefix + const size_t base_len = base_last - uri->path_base.buf + 1; + 
WRITE(uri->path_base.buf, base_len); + + } else { + // Relative path is just query or fragment, append it to full base URI + WRITE_COMPONENT("", uri->path_base, ""); + } + + // Write URI suffix + WRITE(uri_first, uri_end - uri_first); + } + } + } else { + WRITE_COMPONENT("", uri->path, ""); + } + WRITE_COMPONENT("?", uri->query, ""); + if (uri->fragment.buf) { + // Note uri->fragment.buf includes the leading `#' + WRITE_COMPONENT("", uri->fragment, ""); + } + WRITE("\0", 1); + return write_size; +} + +SERD_API +bool +serd_uri_write(const SerdURI* uri, FILE* file) +{ + //#if 0 + SerdURI flat_uri; + SerdString* const flat_uri_str = serd_uri_serialise(uri, &flat_uri); + if (flat_uri_str) { + fwrite(flat_uri_str->buf, 1, flat_uri_str->n_bytes - 1, file); + free(flat_uri_str); + return true; + } + return false; + //#endif + //return (serd_uri_serialise_internal(uri, (Sink)fwrite, file) > 0); +} + +static size_t +serd_uri_string_length(const SerdURI* uri) +{ + size_t len = uri->path_base.len; + +#define ADD_LEN(field, n_delims) \ + if ((field).len) { len += (field).len + (n_delims); } + + ADD_LEN(uri->path, 1); // + possible leading `/' + ADD_LEN(uri->scheme, 1); // + trailing `:' + ADD_LEN(uri->authority, 2); // + leading `//' + ADD_LEN(uri->query, 1); // + leading `?' 
+ ADD_LEN(uri->fragment, 1); // + leading `#' + + return len; +} + +static size_t +string_write(const void* data, size_t size, size_t nmemb, void* stream) +{ + uint8_t** ptr = (uint8_t**)stream; + const size_t write_size = (size * nmemb); + memcpy(*ptr, data, write_size); + *ptr += write_size; + return nmemb; +} + +SERD_API +SerdString* +serd_uri_serialise(const SerdURI* uri, SerdURI* out) +{ + const size_t len = serd_uri_string_length(uri); + SerdString* str = malloc(sizeof(SerdString) + len + 1); + str->n_bytes = len + 1; + str->n_chars = len; // FIXME: UTF-8 + + uint8_t* ptr = str->buf; + const size_t actual_len = serd_uri_serialise_internal(uri, string_write, &ptr); + + str->buf[actual_len] = '\0'; + str->n_bytes = actual_len; + str->n_chars = str->n_bytes - 1; + + #ifdef URI_DEBUG + fwrite("URI: `'", 1, 6, stderr); + fwrite(str->buf, 1, str->n_bytes - 1, stderr); + fwrite("'\n", 1, 2, stderr); + #endif + + return str; +} |