about | summary | refs | log | tree | commit | diff | stats
path: root/src/n3.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/n3.c')
-rw-r--r-- src/n3.c 563
1 files changed, 25 insertions, 538 deletions
diff --git a/src/n3.c b/src/n3.c
index 582beae4..777c83d3 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -18,12 +18,11 @@
#include "env.h"
#include "namespaces.h"
#include "node.h"
-#include "read_utf8.h"
+#include "read_ntriples.h"
#include "reader.h"
#include "stack.h"
#include "string_utils.h"
#include "try.h"
-#include "uri_utils.h"
#include "serd/serd.h"
@@ -31,19 +30,8 @@
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
-#include <stdlib.h>
#include <string.h>
-#if defined(__clang__) && __clang_major__ >= 10
-# define SERD_FALLTHROUGH __attribute__((fallthrough))
-_Pragma("clang diagnostic push")
-_Pragma("clang diagnostic ignored \"-Wmissing-declarations\"")
-#elif defined(__GNUC__) && __GNUC__ >= 7
-# define SERD_FALLTHROUGH __attribute__((fallthrough))
-#else
-# define SERD_FALLTHROUGH
-#endif
-
static bool
fancy_syntax(const SerdReader* const reader)
{
@@ -56,189 +44,31 @@ read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest);
static SerdStatus
read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot);
-static uint8_t
-read_HEX(SerdReader* const reader)
-{
- const int c = peek_byte(reader);
- if (is_xdigit(c)) {
- return (uint8_t)eat_byte_safe(reader, c);
- }
-
- r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid hexadecimal digit `%c'", c);
- return 0;
-}
-
-// Read UCHAR escape, initial \ is already eaten by caller
+// whitespace ::= #x9 | #xA | #xD | #x20 | comment
static SerdStatus
-read_UCHAR(SerdReader* const reader,
- SerdNode* const dest,
- uint32_t* const char_code)
+read_whitespace(SerdReader* const reader)
{
- const int b = peek_byte(reader);
- unsigned length = 0;
- switch (b) {
- case 'U':
- length = 8;
- break;
- case 'u':
- length = 4;
- break;
- default:
- return SERD_ERR_BAD_SYNTAX;
- }
- eat_byte_safe(reader, b);
-
- uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
- for (unsigned i = 0; i < length; ++i) {
- if (!(buf[i] = read_HEX(reader))) {
- return SERD_ERR_BAD_SYNTAX;
- }
- }
-
- char* endptr = NULL;
- const uint32_t code = (uint32_t)strtoul((const char*)buf, &endptr, 16);
- assert(endptr == (char*)buf + length);
-
- unsigned size = 0;
- if (code < 0x00000080) {
- size = 1;
- } else if (code < 0x00000800) {
- size = 2;
- } else if (code < 0x00010000) {
- size = 3;
- } else if (code < 0x00110000) {
- size = 4;
- } else {
- r_err(
- reader, SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range", code);
-
- *char_code = 0xFFFD;
- const SerdStatus st = push_bytes(reader, dest, replacement_char, 3);
- return st ? st : SERD_SUCCESS;
- }
-
- // Build output in buf
- // (Note # of bytes = # of leading 1 bits in first byte)
- uint32_t c = code;
- switch (size) {
- case 4:
- buf[3] = (uint8_t)(0x80u | (c & 0x3Fu));
- c >>= 6;
- c |= (16 << 12); // set bit 4
- SERD_FALLTHROUGH;
- case 3:
- buf[2] = (uint8_t)(0x80u | (c & 0x3Fu));
- c >>= 6;
- c |= (32 << 6); // set bit 5
- SERD_FALLTHROUGH;
- case 2:
- buf[1] = (uint8_t)(0x80u | (c & 0x3Fu));
- c >>= 6;
- c |= 0xC0; // set bits 6 and 7
- SERD_FALLTHROUGH;
- case 1:
- buf[0] = (uint8_t)c;
- SERD_FALLTHROUGH;
- default:
- break;
- }
-
- *char_code = code;
- return push_bytes(reader, dest, buf, size);
-}
-
-// Read ECHAR escape, initial \ is already eaten by caller
-static SerdStatus
-read_ECHAR(SerdReader* const reader, SerdNode* const dest)
-{
- const int c = peek_byte(reader);
- switch (c) {
- case 't':
- eat_byte_safe(reader, 't');
- return push_byte(reader, dest, '\t');
- case 'b':
- eat_byte_safe(reader, 'b');
- return push_byte(reader, dest, '\b');
- case 'n':
- dest->flags |= SERD_HAS_NEWLINE;
- eat_byte_safe(reader, 'n');
- return push_byte(reader, dest, '\n');
- case 'r':
- dest->flags |= SERD_HAS_NEWLINE;
- eat_byte_safe(reader, 'r');
- return push_byte(reader, dest, '\r');
- case 'f':
- eat_byte_safe(reader, 'f');
- return push_byte(reader, dest, '\f');
- case '\\':
- case '"':
- case '\'':
- return push_byte(reader, dest, eat_byte_safe(reader, c));
- default:
- return SERD_ERR_BAD_SYNTAX;
- }
-}
-
-// Read one character (possibly multi-byte)
-// The first byte, c, has already been eaten by caller
-static SerdStatus
-read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
-{
- if (!(c & 0x80)) {
- switch (c) {
- case 0xA:
- case 0xD:
- dest->flags |= SERD_HAS_NEWLINE;
- break;
- case '"':
- case '\'':
- dest->flags |= SERD_HAS_QUOTE;
- break;
- default:
- break;
- }
-
- return push_byte(reader, dest, c);
- }
-
- return read_utf8_continuation(reader, dest, c);
-}
-
-// [10] comment ::= '#' ( [^#xA #xD] )*
-static void
-read_comment(SerdReader* const reader)
-{
- eat_byte_safe(reader, '#');
- int c = 0;
- while (((c = peek_byte(reader)) != 0xA) && c != 0xD && c != EOF && c) {
- eat_byte_safe(reader, c);
- }
-}
-
-// [24] ws ::= #x9 | #xA | #xD | #x20 | comment
-static bool
-read_ws(SerdReader* const reader)
-{
- const int c = peek_byte(reader);
- switch (c) {
- case 0x9:
- case 0xA:
- case 0xD:
- case 0x20:
- eat_byte_safe(reader, c);
- return true;
+ switch (peek_byte(reader)) {
+ case '\t':
+ case '\n':
+ case '\r':
+ case ' ':
+ eat_byte_safe(reader, peek_byte(reader));
+ return SERD_SUCCESS;
case '#':
read_comment(reader);
- return true;
+ return SERD_SUCCESS;
default:
- return false;
+ break;
}
+
+ return SERD_FAILURE;
}
static bool
read_ws_star(SerdReader* const reader)
{
- while (read_ws(reader)) {
+ while (!read_whitespace(reader)) {
}
return true;
@@ -301,44 +131,6 @@ read_STRING_LITERAL_LONG(SerdReader* const reader,
return tolerate_status(reader, st) ? SERD_SUCCESS : st;
}
-// STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE
-// Initial quote is already eaten by caller
-static SerdStatus
-read_STRING_LITERAL(SerdReader* const reader,
- SerdNode* const ref,
- const uint8_t q)
-{
- SerdStatus st = SERD_SUCCESS;
-
- while (tolerate_status(reader, st)) {
- const int c = peek_byte(reader);
- uint32_t code = 0;
- switch (c) {
- case EOF:
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "end of file in short string");
- case '\n':
- case '\r':
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string");
- case '\\':
- eat_byte_safe(reader, c);
- if ((st = read_ECHAR(reader, ref)) &&
- (st = read_UCHAR(reader, ref, &code))) {
- return r_err(reader, st, "invalid escape `\\%c'", peek_byte(reader));
- }
- break;
- default:
- if (c == q) {
- eat_byte_safe(reader, q);
- return SERD_SUCCESS;
- } else {
- st = read_character(reader, ref, (uint8_t)eat_byte_safe(reader, c));
- }
- }
- }
-
- return tolerate_status(reader, st) ? SERD_SUCCESS : st;
-}
-
static SerdStatus
read_String(SerdReader* const reader, SerdNode* const node)
{
@@ -373,65 +165,6 @@ read_String(SerdReader* const reader, SerdNode* const node)
return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1);
}
-static bool
-is_PN_CHARS_BASE(const uint32_t c)
-{
- return ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00F6) ||
- (c >= 0x00F8 && c <= 0x02FF) || (c >= 0x0370 && c <= 0x037D) ||
- (c >= 0x037F && c <= 0x1FFF) || (c >= 0x200C && c <= 0x200D) ||
- (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) ||
- (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) ||
- (c >= 0xFDF0 && c <= 0xFFFD) || (c >= 0x10000 && c <= 0xEFFFF));
-}
-
-static SerdStatus
-read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
-{
- uint32_t code = 0;
- const int c = peek_byte(reader);
- SerdStatus st = SERD_SUCCESS;
- if (is_alpha(c)) {
- st = push_byte(reader, dest, eat_byte_safe(reader, c));
- } else if (c == EOF || !(c & 0x80)) {
- return SERD_FAILURE;
- } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
- return st;
- } else if (!is_PN_CHARS_BASE(code)) {
- r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name", code);
- if (reader->strict) {
- return SERD_ERR_BAD_SYNTAX;
- }
- }
- return st;
-}
-
-static bool
-is_PN_CHARS(const uint32_t c)
-{
- return (is_PN_CHARS_BASE(c) || c == 0xB7 || (c >= 0x0300 && c <= 0x036F) ||
- (c >= 0x203F && c <= 0x2040));
-}
-
-static SerdStatus
-read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
-{
- uint32_t code = 0;
- const int c = peek_byte(reader);
- SerdStatus st = SERD_SUCCESS;
- if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
- st = push_byte(reader, dest, eat_byte_safe(reader, c));
- } else if (c == EOF || !(c & 0x80)) {
- return SERD_FAILURE;
- } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
- return st;
- } else if (!is_PN_CHARS(code)) {
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name", code);
- }
- return st;
-}
-
static SerdStatus
read_PERCENT(SerdReader* const reader, SerdNode* const dest)
{
@@ -592,67 +325,6 @@ read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest)
return st;
}
-static SerdStatus
-read_LANGTAG(SerdReader* const reader)
-{
- int c = peek_byte(reader);
- if (!is_alpha(c)) {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'", c);
- }
-
- SerdNode* node = push_node(reader, SERD_LITERAL, "", 0);
- if (!node) {
- return SERD_ERR_OVERFLOW;
- }
-
- SerdStatus st = SERD_SUCCESS;
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
- while ((c = peek_byte(reader)) && is_alpha(c)) {
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
- }
- while (peek_byte(reader) == '-') {
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-')));
- while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) {
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
- }
- }
- return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_IRIREF_scheme(SerdReader* const reader, SerdNode* const dest)
-{
- int c = peek_byte(reader);
- if (!is_alpha(c)) {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad IRI scheme start `%c'", c);
- }
-
- SerdStatus st = SERD_SUCCESS;
- while ((c = peek_byte(reader)) != EOF) {
- if (c == '>') {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "missing IRI scheme");
- }
-
- if (!is_uri_scheme_char(c)) {
- return r_err(reader,
- SERD_ERR_BAD_SYNTAX,
- "bad IRI scheme char U+%04X (%c)",
- (unsigned)c,
- (char)c);
- }
-
- if ((st = push_byte(reader, dest, eat_byte_safe(reader, c)))) {
- return st;
- }
-
- if (c == ':') {
- return SERD_SUCCESS; // End of scheme
- }
- }
-
- return SERD_FAILURE;
-}
-
typedef struct {
SerdReader* reader;
SerdNode* node;
@@ -714,6 +386,10 @@ resolve_IRIREF(SerdReader* const reader,
static SerdStatus
read_IRIREF(SerdReader* const reader, SerdNode** const dest)
{
+ if (!fancy_syntax(reader)) {
+ return read_IRI(reader, dest);
+ }
+
SerdStatus st = SERD_SUCCESS;
if ((st = eat_byte_check(reader, '<'))) {
return st;
@@ -725,68 +401,14 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
const size_t string_start_offset = reader->stack.size;
- if (!fancy_syntax(reader) && (st = read_IRIREF_scheme(reader, *dest))) {
- return r_err(reader, st, "expected IRI scheme");
- }
-
- uint32_t code = 0;
- while (st <= SERD_FAILURE) {
- const int c = eat_byte_safe(reader, peek_byte(reader));
- switch (c) {
- case '"':
- case '<':
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c);
- case '>':
- return (reader->flags & SERD_READ_RELATIVE)
- ? SERD_SUCCESS
- : resolve_IRIREF(reader, *dest, string_start_offset);
- case '\\':
- if (read_UCHAR(reader, *dest, &code)) {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape");
- }
- switch (code) {
- case 0:
- case ' ':
- case '<':
- case '>':
- return r_err(reader,
- SERD_ERR_BAD_SYNTAX,
- "invalid escaped IRI character U+%04X",
- code);
- default:
- break;
- }
- break;
- case '^':
- case '`':
- case '{':
- case '|':
- case '}':
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c);
- default:
- if (c <= 0x20) {
- st = r_err(reader,
- SERD_ERR_BAD_SYNTAX,
- "invalid IRI character (escape %%%02X)",
- (unsigned)c);
- if (reader->strict) {
- break;
- }
-
- if (!(st = push_byte(reader, *dest, c))) {
- st = SERD_FAILURE;
- }
- } else if (!(c & 0x80)) {
- st = push_byte(reader, *dest, c);
- } else {
- st = read_utf8_continuation(reader, *dest, (uint8_t)c);
- }
- }
+ st = read_IRIREF_suffix(reader, *dest);
+ if (!tolerate_status(reader, st)) {
+ return st;
}
- return tolerate_status(reader, st) ? SERD_SUCCESS : st;
+ return (reader->flags & SERD_READ_RELATIVE)
+ ? SERD_SUCCESS
+ : resolve_IRIREF(reader, *dest, string_start_offset);
}
static SerdStatus
@@ -970,44 +592,6 @@ read_literal(SerdReader* const reader,
}
static SerdStatus
-read_VARNAME(SerdReader* const reader, SerdNode** const dest)
-{
- // Simplified from SPARQL: VARNAME ::= (PN_CHARS_U | [0-9])+
- SerdNode* n = *dest;
- SerdStatus st = SERD_SUCCESS;
- int c = 0;
- peek_byte(reader);
- while ((c = peek_byte(reader))) {
- if (is_digit(c) || c == '_') {
- st = push_byte(reader, n, eat_byte_safe(reader, c));
- } else if ((st = read_PN_CHARS(reader, n))) {
- st = st > SERD_FAILURE ? st : SERD_SUCCESS;
- break;
- }
- }
-
- return st;
-}
-
-static SerdStatus
-read_Var(SerdReader* const reader, SerdNode** const dest)
-{
- if (!(reader->flags & SERD_READ_VARIABLES)) {
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "syntax does not support variables");
- }
-
- if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) {
- return SERD_ERR_OVERFLOW;
- }
-
- assert(peek_byte(reader) == '$' || peek_byte(reader) == '?');
- serd_byte_source_advance(reader->source);
-
- return read_VARNAME(reader, dest);
-}
-
-static SerdStatus
read_verb(SerdReader* reader, SerdNode** dest)
{
const size_t orig_stack_size = reader->stack.size;
@@ -1055,83 +639,6 @@ read_verb(SerdReader* reader, SerdNode** dest)
return SERD_SUCCESS;
}
-static bool
-avoid_blank_clashes(const SerdReader* const reader)
-{
- return fancy_syntax(reader) && !(reader->flags & SERD_READ_EXACT_BLANKS);
-}
-
-static SerdStatus
-adjust_blank_id(SerdReader* const reader, char* const buf)
-{
- if (avoid_blank_clashes(reader) && is_digit(buf[reader->bprefix_len + 1])) {
- const char tag = buf[reader->bprefix_len];
- if (tag == 'b') {
- buf[reader->bprefix_len] = 'B'; // Prevent clash
- reader->seen_genid = true;
- } else if (tag == 'B' && reader->seen_genid) {
- return r_err(reader,
- SERD_ERR_ID_CLASH,
- "found both `b' and `B' blank IDs, prefix required");
- }
- }
-
- return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_BLANK_NODE_LABEL(SerdReader* const reader,
- SerdNode** const dest,
- bool* const ate_dot)
-{
- SerdStatus st = SERD_SUCCESS;
-
- eat_byte_safe(reader, '_');
- TRY(st, eat_byte_check(reader, ':'));
-
- if (!(*dest = push_node(reader,
- SERD_BLANK,
- reader->bprefix ? reader->bprefix : "",
- reader->bprefix_len))) {
- return SERD_ERR_OVERFLOW;
- }
-
- // Read first: (PN_CHARS | '_' | [0-9])
- SerdNode* const n = *dest;
- int c = peek_byte(reader);
- if (is_digit(c) || c == '_') {
- TRY(st, push_byte(reader, n, eat_byte_safe(reader, c)));
- } else if ((st = read_PN_CHARS(reader, n))) {
- return r_err(reader, st, "invalid name start");
- }
-
- // Read middle: (PN_CHARS | '.')*
- while ((c = peek_byte(reader))) {
- if (c == '.') {
- TRY(st, push_byte(reader, n, eat_byte_safe(reader, c)));
- } else if ((st = read_PN_CHARS(reader, n))) {
- break;
- }
- }
-
- if (st > SERD_FAILURE) {
- return st;
- }
-
- // Deal with annoying edge case of having eaten the trailing dot
- char* const buf = serd_node_buffer(n);
- if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, n)) {
- --n->length;
- serd_stack_pop(&reader->stack, 1);
- *ate_dot = true;
- }
-
- // Adjust ID to avoid clashes with generated IDs if necessary
- st = adjust_blank_id(reader, buf);
-
- return tolerate_status(reader, st) ? SERD_SUCCESS : st;
-}
-
static SerdStatus
read_anon(SerdReader* const reader,
ReadContext ctx,
@@ -1312,10 +819,6 @@ read_objectList(SerdReader* const reader, ReadContext ctx, bool* const ate_dot)
{
SerdStatus st = SERD_SUCCESS;
TRY(st, read_object(reader, &ctx, true, ate_dot));
- if (!fancy_syntax(reader) && peek_delim(reader, ',')) {
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "syntax does not support abbreviation");
- }
while (st <= SERD_FAILURE && !*ate_dot && eat_delim(reader, ',')) {
st = read_object(reader, &ctx, true, ate_dot);
@@ -1698,10 +1201,6 @@ read_n3_statement(SerdReader* const reader)
case EOF:
return SERD_FAILURE;
case '@':
- if (!fancy_syntax(reader)) {
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "syntax does not support directives");
- }
TRY(st, read_directive(reader));
read_ws_star(reader);
break;
@@ -1761,14 +1260,6 @@ read_n3_statement(SerdReader* const reader)
return st;
}
-static void
-skip_until(SerdReader* const reader, const uint8_t byte)
-{
- for (int c = 0; (c = peek_byte(reader)) && c != EOF && c != byte;) {
- eat_byte_safe(reader, c);
- }
-}
-
SerdStatus
read_turtleTrigDoc(SerdReader* const reader)
{
@@ -1855,7 +1346,3 @@ read_nquadsDoc(SerdReader* const reader)
}
return st;
}
-
-#if defined(__clang__) && __clang_major__ >= 10
-_Pragma("clang diagnostic pop")
-#endif