/* Serd, an RDF serialisation library. * Copyright 2011 David Robillard * * Serd is free software: you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Serd is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public * License for details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include #include #include #include "serd/serd.h" //#define URI_DEBUG 1 /** Return true if @a c lies within [min...max] (inclusive) */ static inline bool in_range(const char c, const char min, const char max) { return (c >= min && c <= max); } /** RFC2234: ALPHA := %x41-5A / %x61-7A ; A-Z / a-z */ static inline bool is_alpha(const uint8_t c) { return in_range(c, 'A', 'Z') || in_range(c, 'a', 'z'); } /** RFC2234: DIGIT ::= %x30-39 ; 0-9 */ static inline bool is_digit(const uint8_t c) { return in_range(c, '0', '9'); } SERD_API bool serd_uri_string_has_scheme(const uint8_t* utf8) { // RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) if (!is_alpha(utf8[0])) { return false; // Invalid scheme initial character, URI is relative } for (uint8_t c = *++utf8; (c = *utf8) != '\0'; ++utf8) { switch (c) { case ':': return true; // End of scheme case '+': case '-': case '.': break; // Valid scheme character, continue default: if (!is_alpha(c) && !is_digit(c)) { return false; // Invalid scheme character } } } return false; } #ifdef URI_DEBUG static void serd_uri_dump(const SerdURI* uri, FILE* file) { #define PRINT_PART(range, name) \ if (range.buf) { \ fprintf(stderr, " " name " = "); \ fwrite((range).buf, 1, (range).len, stderr); \ fprintf(stderr, "\n"); \ } PRINT_PART(uri->scheme, "scheme"); PRINT_PART(uri->authority, "authority"); PRINT_PART(uri->path_base, "path_base"); PRINT_PART(uri->path, "path"); PRINT_PART(uri->query, "query"); PRINT_PART(uri->fragment, "fragment"); } #endif SERD_API bool serd_uri_parse(const uint8_t* utf8, SerdURI* uri) { *uri = SERD_URI_NULL; assert(uri->path_base.buf == NULL); assert(uri->path_base.len == 0); assert(uri->authority.len == 0); const uint8_t* ptr = utf8; /* See http://tools.ietf.org/html/rfc3986#section-3 URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] */ /* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ if (is_alpha(*ptr)) { for (uint8_t c = *++ptr; true; c = *++ptr) { switch (c) { case '\0': case '/': case '?': case '#': ptr = utf8; goto path; // Relative URI (starts with path by definition) case ':': uri->scheme.buf = utf8; uri->scheme.len = (ptr++) - utf8; goto maybe_authority; // URI with scheme case '+': case '-': case '.': continue; default: if (is_alpha(c) || is_digit(c)) { continue; } } } } /* S3.2: The authority component is preceded by a double slash ("//") and is terminated by the next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end of the URI. */ maybe_authority: if (*ptr == '/' && *(ptr + 1) == '/') { ptr += 2; uri->authority.buf = ptr; assert(uri->authority.len == 0); for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) { switch (c) { case '/': goto path; case '?': goto query; case '#': goto fragment; default: ++uri->authority.len; } } } /* RFC3986 S3.3: The path is terminated by the first question mark ("?") or number sign ("#") character, or by the end of the URI. */ path: switch (*ptr) { case '?': goto query; case '#': goto fragment; case '\0': goto end; default: break; } uri->path.buf = ptr; uri->path.len = 0; for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) { switch (c) { case '?': goto query; case '#': goto fragment; default: ++uri->path.len; } } /* RFC3986 S3.4: The query component is indicated by the first question mark ("?") character and terminated by a number sign ("#") character or by the end of the URI. */ query: if (*ptr == '?') { uri->query.buf = ++ptr; for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) { switch (c) { case '#': goto fragment; default: ++uri->query.len; } } } /* RFC3986 S3.5: A fragment identifier component is indicated by the presence of a number sign ("#") character and terminated by the end of the URI. */ fragment: if (*ptr == '#') { uri->fragment.buf = ptr; while (*ptr++ != '\0') { ++uri->fragment.len; } } end: #ifdef URI_DEBUG fprintf(stderr, "PARSE URI <%s>\n", utf8); serd_uri_dump(uri, stderr); fprintf(stderr, "\n"); #endif return true; } SERD_API bool serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) { assert(!r->scheme.len); // r is relative // See http://tools.ietf.org/html/rfc3986#section-5.2.2 t->path_base.buf = NULL; t->path_base.len = 0; if (r->scheme.len) { t->scheme = r->scheme; t->authority = r->authority; t->path = r->path; t->query = r->query; } else { if (r->authority.len) { t->authority = r->authority; t->path = r->path; t->query = r->query; } else { t->path = r->path; if (!r->path.len) { t->path_base = base->path; if (r->query.len) { t->query = r->query; } else { t->query = base->query; } } else { if (r->path.buf[0] != '/') { t->path_base = base->path; } t->query = r->query; } t->authority = base->authority; } t->scheme = base->scheme; } t->fragment = r->fragment; #ifdef URI_DEBUG fprintf(stderr, "RESOLVE URI\nBASE:\n"); serd_uri_dump(base, stderr); fprintf(stderr, "URI:\n"); serd_uri_dump(r, stderr); fprintf(stderr, "RESULT:\n"); serd_uri_dump(t, stderr); fprintf(stderr, "\n"); #endif return true; } SERD_API size_t serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream) { // See http://tools.ietf.org/html/rfc3986#section-5.3 size_t write_size = 0; #define WRITE(buf, len) \ write_size += len; \ if (len) { \ sink((const uint8_t*)buf, len, stream); \ } #define WRITE_CHAR(c) WRITE(&(c), 1) #define WRITE_COMPONENT(prefix, field, suffix) \ if ((field).len) { \ for (const uint8_t* c = (const uint8_t*)prefix; *c != '\0'; ++c) { \ WRITE(c, 1); \ } \ WRITE((field).buf, (field).len); \ for (const uint8_t* c = (const uint8_t*)suffix; *c != '\0'; ++c) { \ WRITE(c, 1); \ } \ } WRITE_COMPONENT("", uri->scheme, ":"); WRITE_COMPONENT("//", uri->authority, ""); if (uri->path_base.len) { if (!uri->path.buf && (uri->fragment.buf || uri->query.buf)) { WRITE_COMPONENT("", uri->path_base, ""); } else { /* Merge paths, removing dot components. See http://tools.ietf.org/html/rfc3986#section-5.2.3 */ const uint8_t* uri_first = uri->path.buf; const uint8_t* uri_end = uri_first; size_t up = 1; if (uri_first) { // Count and skip leading dot components uri_end = uri->path.buf + uri->path.len; while (uri_first < uri_end) { if (!memcmp((const char*)uri_first, "./", 2)) { uri_first += 2; } else if (!memcmp((const char*)uri_first, "../", 3)) { ++up; uri_first += 3; } else if (!memcmp((const char*)uri_first, "..", 2)) { ++up; uri_first += 2; } else if (!memcmp((const char*)uri_first, ".", 1)) { ++uri_first; } else if (!memcmp((const char*)uri_first, "//", 1)) { ++uri_first; } else { break; } } if (uri->path.buf && uri->path_base.buf) { // Find the up'th last slash const uint8_t* base_last = uri->path_base.buf + uri->path_base.len - 1; do { if (*base_last == '/') { --up; } } while (up > 0 && (--base_last > uri->path_base.buf)); // Write base URI prefix const size_t base_len = base_last - uri->path_base.buf + 1; WRITE(uri->path_base.buf, base_len); } else { // Relative path is just query or fragment, append it to full base URI WRITE_COMPONENT("", uri->path_base, ""); } // Write URI suffix WRITE(uri_first, uri_end - uri_first); } } } else { WRITE_COMPONENT("", uri->path, ""); } WRITE_COMPONENT("?", uri->query, ""); if (uri->fragment.buf) { // Note uri->fragment.buf includes the leading `#' WRITE_COMPONENT("", uri->fragment, ""); } return write_size; } static size_t serd_uri_string_length(const SerdURI* uri) { size_t len = uri->path_base.len; #define ADD_LEN(field, n_delims) \ if ((field).len) { len += (field).len + (n_delims); } ADD_LEN(uri->path, 1); // + possible leading `/' ADD_LEN(uri->scheme, 1); // + trailing `:' ADD_LEN(uri->authority, 2); // + leading `//' ADD_LEN(uri->query, 1); // + leading `?' ADD_LEN(uri->fragment, 1); // + leading `#' return len; } static size_t string_sink(const void* buf, size_t len, void* stream) { uint8_t** ptr = (uint8_t**)stream; memcpy(*ptr, buf, len); *ptr += len; return len; } SERD_API SerdString* serd_string_new_from_uri(const SerdURI* uri, SerdURI* out) { const size_t len = serd_uri_string_length(uri); SerdString* str = malloc(sizeof(SerdString) + len + 1); str->n_bytes = len + 1; str->n_chars = len; // FIXME: UTF-8 uint8_t* ptr = str->buf; const size_t actual_len = serd_uri_serialise(uri, string_sink, &ptr); str->buf[actual_len] = '\0'; str->n_bytes = actual_len + 1; str->n_chars = str->n_bytes - 1; // FIXME: UTF-8 #ifdef URI_DEBUG fwrite("URI: `'", 1, 6, stderr); fwrite(str->buf, 1, str->n_bytes - 1, stderr); fwrite("'\n", 1, 2, stderr); #endif return str; }