aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2023-03-31 10:50:12 -0400
committer David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit d22653dfe356e3da1354cdb0f7915e29c4a33e3b (patch)
tree 496e678ba61a436e2bf0b11b079bf6115ba630fa
parent 469034ec4ae5c0b5230ca30c40aaa9b1432c13a2 (diff)
download serd-d22653dfe356e3da1354cdb0f7915e29c4a33e3b.tar.gz
serd-d22653dfe356e3da1354cdb0f7915e29c4a33e3b.tar.bz2
serd-d22653dfe356e3da1354cdb0f7915e29c4a33e3b.zip
Factor out NTriples reader
-rw-r--r-- meson.build 1
-rw-r--r-- src/n3.c 534
-rw-r--r-- src/read_ntriples.c 737
-rw-r--r-- src/read_ntriples.h 186
-rw-r--r-- src/reader.c 42
-rw-r--r-- src/reader.h 15
-rw-r--r-- src/string_utils.h 16
-rw-r--r-- test/extra/bad/bad-blank-node-label.nt 1
-rw-r--r-- test/extra/bad/bad-trailing-garbage.nt 1
-rw-r--r-- test/extra/bad/manifest.ttl 13
-rw-r--r-- test/extra/good/manifest.ttl 7
-rw-r--r-- test/extra/lax/manifest.ttl 7
-rw-r--r-- test/extra/lax/test-out-of-range-unicode.nt (renamed from test/extra/good/test-out-of-range-unicode.nt) 0
-rw-r--r-- test/extra/lax/test-out-of-range-unicode.ttl (renamed from test/extra/good/test-out-of-range-unicode.ttl) 0
-rw-r--r-- test/test_overflow.c 3
-rw-r--r-- test/test_reader.c 49
16 files changed, 1084 insertions, 528 deletions
diff --git a/meson.build b/meson.build
index 09a170e8..cd6c77f5 100644
--- a/meson.build
+++ b/meson.build
@@ -157,6 +157,7 @@ sources = files(
'src/env.c',
'src/n3.c',
'src/node.c',
+ 'src/read_ntriples.c',
'src/read_utf8.c',
'src/reader.c',
'src/sink.c',
diff --git a/src/n3.c b/src/n3.c
index 967f5162..2c64ab0f 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -1,14 +1,14 @@
// Copyright 2011-2023 David Robillard <d@drobilla.net>
// SPDX-License-Identifier: ISC
+#include "byte_source.h"
#include "namespaces.h"
#include "node.h"
-#include "read_utf8.h"
+#include "read_ntriples.h"
#include "reader.h"
#include "stack.h"
#include "string_utils.h"
#include "try.h"
-#include "uri_utils.h"
#include "serd/node.h"
#include "serd/reader.h"
@@ -23,16 +23,6 @@
#include <stdio.h>
#include <string.h>
-#if defined(__clang__) && __clang_major__ >= 10
-# define SERD_FALLTHROUGH __attribute__((fallthrough))
-_Pragma("clang diagnostic push")
-_Pragma("clang diagnostic ignored \"-Wmissing-declarations\"")
-#elif defined(__GNUC__) && __GNUC__ >= 7
-# define SERD_FALLTHROUGH __attribute__((fallthrough))
-#else
-# define SERD_FALLTHROUGH
-#endif
-
static bool
fancy_syntax(const SerdReader* const reader)
{
@@ -45,187 +35,29 @@ read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest);
static SerdStatus
read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot);
-static uint8_t
-read_HEX(SerdReader* const reader)
-{
- const int c = peek_byte(reader);
- if (is_xdigit(c)) {
- return (uint8_t)eat_byte_safe(reader, c);
- }
-
- r_err(reader, SERD_BAD_SYNTAX, "invalid hexadecimal digit '%c'", c);
- return 0;
-}
-
-// Read UCHAR escape, initial \ is already eaten by caller
+// whitespace ::= #x9 | #xA | #xD | #x20 | comment
static SerdStatus
-read_UCHAR(SerdReader* const reader,
- SerdNode* const dest,
- uint32_t* const char_code)
+read_whitespace(SerdReader* const reader)
{
- const int b = peek_byte(reader);
- unsigned length = 0;
- switch (b) {
- case 'U':
- length = 8;
- break;
- case 'u':
- length = 4;
- break;
- default:
- return SERD_BAD_SYNTAX;
- }
-
- skip_byte(reader, b);
-
- // Read character code point in hex
- uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
- uint32_t code = 0U;
- for (unsigned i = 0U; i < length; ++i) {
- if (!(buf[i] = read_HEX(reader))) {
- return SERD_BAD_SYNTAX;
- }
-
- code = (code << (i ? 4U : 0U)) | hex_digit_value(buf[i]);
- }
-
- // Determine the encoded size from the code point
- unsigned size = 0;
- if (code < 0x00000080) {
- size = 1;
- } else if (code < 0x00000800) {
- size = 2;
- } else if (code < 0x00010000) {
- size = 3;
- } else if (code < 0x00110000) {
- size = 4;
- } else {
- r_err(reader, SERD_BAD_SYNTAX, "unicode character 0x%X out of range", code);
-
- *char_code = 0xFFFD;
- const SerdStatus st = push_bytes(reader, dest, replacement_char, 3);
- return st ? st : SERD_SUCCESS;
- }
-
- // Build output in buf
- // (Note # of bytes = # of leading 1 bits in first byte)
- uint32_t c = code;
- switch (size) {
- case 4:
- buf[3] = (uint8_t)(0x80U | (c & 0x3FU));
- c >>= 6;
- c |= (16 << 12); // set bit 4
- SERD_FALLTHROUGH;
- case 3:
- buf[2] = (uint8_t)(0x80U | (c & 0x3FU));
- c >>= 6;
- c |= (32 << 6); // set bit 5
- SERD_FALLTHROUGH;
- case 2:
- buf[1] = (uint8_t)(0x80U | (c & 0x3FU));
- c >>= 6;
- c |= 0xC0; // set bits 6 and 7
- SERD_FALLTHROUGH;
- case 1:
- buf[0] = (uint8_t)c;
- SERD_FALLTHROUGH;
+ switch (peek_byte(reader)) {
+ case '\t':
+ case '\n':
+ case '\r':
+ case ' ':
+ return serd_byte_source_advance(&reader->source);
+ case '#':
+ return read_comment(reader);
default:
break;
}
- *char_code = code;
- return push_bytes(reader, dest, buf, size);
-}
-
-// Read ECHAR escape, initial \ is already eaten by caller
-static SerdStatus
-read_ECHAR(SerdReader* const reader, SerdNode* const dest)
-{
- SerdStatus st = SERD_SUCCESS;
- const int c = peek_byte(reader);
- switch (c) {
- case 't':
- return (st = skip_byte(reader, 't')) ? st : push_byte(reader, dest, '\t');
- case 'b':
- return (st = skip_byte(reader, 'b')) ? st : push_byte(reader, dest, '\b');
- case 'n':
- dest->flags |= SERD_HAS_NEWLINE;
- return (st = skip_byte(reader, 'n')) ? st : push_byte(reader, dest, '\n');
- case 'r':
- dest->flags |= SERD_HAS_NEWLINE;
- return (st = skip_byte(reader, 'r')) ? st : push_byte(reader, dest, '\r');
- case 'f':
- return (st = skip_byte(reader, 'f')) ? st : push_byte(reader, dest, '\f');
- case '\\':
- case '"':
- case '\'':
- return push_byte(reader, dest, eat_byte_safe(reader, c));
- default:
- return SERD_BAD_SYNTAX;
- }
-}
-
-// Read one character (possibly multi-byte)
-// The first byte, c, has already been eaten by caller
-static SerdStatus
-read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
-{
- if (!(c & 0x80)) {
- switch (c) {
- case 0xA:
- case 0xD:
- dest->flags |= SERD_HAS_NEWLINE;
- break;
- case '"':
- case '\'':
- dest->flags |= SERD_HAS_QUOTE;
- break;
- default:
- break;
- }
-
- return push_byte(reader, dest, c);
- }
-
- return read_utf8_continuation(reader, dest, c);
-}
-
-// [10] comment ::= '#' ( [^#xA #xD] )*
-static void
-read_comment(SerdReader* const reader)
-{
- skip_byte(reader, '#');
-
- int c = 0;
- while (((c = peek_byte(reader)) != 0xA) && c != 0xD && c != EOF && c) {
- skip_byte(reader, c);
- }
-}
-
-// [24] ws ::= #x9 | #xA | #xD | #x20 | comment
-static bool
-read_ws(SerdReader* const reader)
-{
- const int c = peek_byte(reader);
- switch (c) {
- case 0x9:
- case 0xA:
- case 0xD:
- case 0x20:
- skip_byte(reader, c);
- return true;
- case '#':
- read_comment(reader);
- return true;
- default:
- return false;
- }
+ return SERD_FAILURE;
}
static bool
read_ws_star(SerdReader* const reader)
{
- while (read_ws(reader)) {
+ while (!read_whitespace(reader)) {
}
return true;
@@ -249,18 +81,6 @@ eat_delim(SerdReader* const reader, const uint8_t delim)
return false;
}
-static SerdStatus
-read_string_escape(SerdReader* const reader, SerdNode* const ref)
-{
- SerdStatus st = SERD_SUCCESS;
- uint32_t code = 0;
- if ((st = read_ECHAR(reader, ref)) && (st = read_UCHAR(reader, ref, &code))) {
- return r_err(reader, st, "invalid escape '\\%c'", peek_byte(reader));
- }
-
- return st;
-}
-
// STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE
// Initial triple quotes are already eaten by caller
static SerdStatus
@@ -302,39 +122,6 @@ read_STRING_LITERAL_LONG(SerdReader* const reader,
return tolerate_status(reader, st) ? SERD_SUCCESS : st;
}
-// STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE
-// Initial quote is already eaten by caller
-static SerdStatus
-read_STRING_LITERAL(SerdReader* const reader,
- SerdNode* const ref,
- const uint8_t q)
-{
- SerdStatus st = SERD_SUCCESS;
-
- while (tolerate_status(reader, st)) {
- const int c = peek_byte(reader);
- switch (c) {
- case EOF:
- return r_err(reader, SERD_BAD_SYNTAX, "end of file in short string");
- case '\n':
- case '\r':
- return r_err(reader, SERD_BAD_SYNTAX, "line end in short string");
- case '\\':
- skip_byte(reader, c);
- TRY(st, read_string_escape(reader, ref));
- break;
- default:
- if (c == q) {
- return skip_byte(reader, c);
- }
-
- st = read_character(reader, ref, (uint8_t)eat_byte_safe(reader, c));
- }
- }
-
- return tolerate_status(reader, st) ? SERD_SUCCESS : st;
-}
-
static SerdStatus
read_String(SerdReader* const reader, SerdNode* const node)
{
@@ -367,80 +154,6 @@ read_String(SerdReader* const reader, SerdNode* const node)
return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1);
}
-static bool
-is_PN_CHARS_BASE(const uint32_t c)
-{
- return ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00F6) ||
- (c >= 0x00F8 && c <= 0x02FF) || (c >= 0x0370 && c <= 0x037D) ||
- (c >= 0x037F && c <= 0x1FFF) || (c >= 0x200C && c <= 0x200D) ||
- (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) ||
- (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) ||
- (c >= 0xFDF0 && c <= 0xFFFD) || (c >= 0x10000 && c <= 0xEFFFF));
-}
-
-static SerdStatus
-read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
-{
- uint32_t code = 0;
- const int c = peek_byte(reader);
- SerdStatus st = SERD_SUCCESS;
-
- if (is_alpha(c)) {
- return push_byte(reader, dest, eat_byte_safe(reader, c));
- }
-
- if (c == EOF || !(c & 0x80)) {
- return SERD_FAILURE;
- }
-
- if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
- return st;
- }
-
- if (!is_PN_CHARS_BASE(code)) {
- r_err(reader, SERD_BAD_SYNTAX, "invalid character U+%04X in name", code);
- if (reader->strict) {
- return SERD_BAD_SYNTAX;
- }
- }
-
- return st;
-}
-
-static bool
-is_PN_CHARS(const uint32_t c)
-{
- return (is_PN_CHARS_BASE(c) || c == 0xB7 || (c >= 0x0300 && c <= 0x036F) ||
- (c >= 0x203F && c <= 0x2040));
-}
-
-static SerdStatus
-read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
-{
- uint32_t code = 0;
- const int c = peek_byte(reader);
- SerdStatus st = SERD_SUCCESS;
-
- if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
- return push_byte(reader, dest, eat_byte_safe(reader, c));
- }
-
- if (c == EOF || !(c & 0x80)) {
- return SERD_FAILURE;
- }
-
- if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
- return st;
- }
-
- if (!is_PN_CHARS(code)) {
- return r_err(
- reader, SERD_BAD_SYNTAX, "invalid character U+%04X in name", code);
- }
-
- return st;
-}
-
static SerdStatus
read_PERCENT(SerdReader* const reader, SerdNode* const dest)
{
@@ -600,69 +313,12 @@ read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest)
}
static SerdStatus
-read_LANGTAG(SerdReader* const reader)
-{
- int c = peek_byte(reader);
- if (!is_alpha(c)) {
- return r_err(reader, SERD_BAD_SYNTAX, "unexpected '%c'", c);
- }
-
- SerdNode* node = push_node(reader, SERD_LITERAL, "", 0);
- if (!node) {
- return SERD_BAD_STACK;
- }
-
- SerdStatus st = SERD_SUCCESS;
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
- while ((c = peek_byte(reader)) && is_alpha(c)) {
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
- }
- while (peek_byte(reader) == '-') {
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-')));
- while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) {
- TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
- }
- }
- return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_IRIREF_scheme(SerdReader* const reader, SerdNode* const dest)
+read_IRIREF(SerdReader* const reader, SerdNode** const dest)
{
- int c = peek_byte(reader);
- if (!is_alpha(c)) {
- return r_err(reader, SERD_BAD_SYNTAX, "bad IRI scheme start '%c'", c);
- }
-
- SerdStatus st = SERD_SUCCESS;
- while ((c = peek_byte(reader)) != EOF) {
- if (c == '>') {
- return r_err(reader, SERD_BAD_SYNTAX, "missing IRI scheme");
- }
-
- if (!is_uri_scheme_char(c)) {
- return r_err(reader,
- SERD_BAD_SYNTAX,
- "bad IRI scheme char U+%04X (%c)",
- (unsigned)c,
- (char)c);
- }
-
- if ((st = push_byte(reader, dest, eat_byte_safe(reader, c)))) {
- return st;
- }
-
- if (c == ':') {
- return SERD_SUCCESS; // End of scheme
- }
+ if (!fancy_syntax(reader)) {
+ return read_IRI(reader, dest);
}
- return SERD_FAILURE;
-}
-
-static SerdStatus
-read_IRIREF(SerdReader* const reader, SerdNode** const dest)
-{
SerdStatus st = SERD_SUCCESS;
TRY(st, eat_byte_check(reader, '<'));
@@ -670,64 +326,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
return SERD_BAD_STACK;
}
- if (!fancy_syntax(reader) && (st = read_IRIREF_scheme(reader, *dest))) {
- return r_err(reader, st, "expected IRI scheme");
- }
-
- uint32_t code = 0;
- while (st <= SERD_FAILURE) {
- const int c = eat_byte_safe(reader, peek_byte(reader));
- switch (c) {
- case '"':
- case '<':
- return r_err(reader, SERD_BAD_SYNTAX, "invalid IRI character '%c'", c);
- case '>':
- return SERD_SUCCESS;
- case '\\':
- if (read_UCHAR(reader, *dest, &code)) {
- return r_err(reader, SERD_BAD_SYNTAX, "invalid IRI escape");
- }
- switch (code) {
- case 0:
- case ' ':
- case '<':
- case '>':
- return r_err(reader,
- SERD_BAD_SYNTAX,
- "invalid escaped IRI character U+%04X",
- code);
- default:
- break;
- }
- break;
- case '^':
- case '`':
- case '{':
- case '|':
- case '}':
- return r_err(reader, SERD_BAD_SYNTAX, "invalid IRI character '%c'", c);
- default:
- if (c <= 0x20) {
- st = r_err(reader,
- SERD_BAD_SYNTAX,
- "invalid IRI character (escape %%%02X)",
- (unsigned)c);
- if (reader->strict) {
- break;
- }
-
- if (!(st = push_byte(reader, *dest, c))) {
- st = SERD_FAILURE;
- }
- } else if (!(c & 0x80)) {
- st = push_byte(reader, *dest, c);
- } else {
- st = read_utf8_continuation(reader, *dest, (uint8_t)c);
- }
- }
- }
-
- return tolerate_status(reader, st) ? SERD_SUCCESS : st;
+ return read_IRIREF_suffix(reader, *dest);
}
static SerdStatus
@@ -925,78 +524,6 @@ read_verb(SerdReader* const reader, SerdNode** const dest)
}
static SerdStatus
-adjust_blank_id(SerdReader* const reader, char* const buf)
-{
- if (fancy_syntax(reader) && is_digit(buf[reader->bprefix_len + 1])) {
- const char tag = buf[reader->bprefix_len];
- if (tag == 'b') {
- buf[reader->bprefix_len] = 'B'; // Prevent clash
- reader->seen_genid = true;
- } else if (tag == 'B' && reader->seen_genid) {
- return r_err(reader,
- SERD_BAD_LABEL,
- "found both 'b' and 'B' blank IDs, prefix required");
- }
- }
-
- return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_BLANK_NODE_LABEL(SerdReader* const reader,
- SerdNode** const dest,
- bool* const ate_dot)
-{
- SerdStatus st = SERD_SUCCESS;
-
- skip_byte(reader, '_');
- TRY(st, eat_byte_check(reader, ':'));
-
- if (!(*dest = push_node(reader,
- SERD_BLANK,
- reader->bprefix ? reader->bprefix : "",
- reader->bprefix_len))) {
- return SERD_BAD_STACK;
- }
-
- // Read first: (PN_CHARS | '_' | [0-9])
- SerdNode* const n = *dest;
- int c = peek_byte(reader);
- if (is_digit(c) || c == '_') {
- TRY(st, push_byte(reader, n, eat_byte_safe(reader, c)));
- } else if ((st = read_PN_CHARS(reader, n))) {
- st = st > SERD_FAILURE ? st : SERD_BAD_SYNTAX;
- return r_err(reader, st, "invalid name start");
- }
-
- // Read middle: (PN_CHARS | '.')*
- while ((c = peek_byte(reader))) {
- if (c == '.') {
- TRY(st, push_byte(reader, n, eat_byte_safe(reader, c)));
- } else if ((st = read_PN_CHARS(reader, n))) {
- break;
- }
- }
-
- if (st > SERD_FAILURE) {
- return st;
- }
-
- // Deal with annoying edge case of having eaten the trailing dot
- char* const buf = serd_node_buffer(n);
- if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, n)) {
- --n->length;
- serd_stack_pop(&reader->stack, 1);
- *ate_dot = true;
- }
-
- // Adjust ID to avoid clashes with generated IDs if necessary
- st = adjust_blank_id(reader, buf);
-
- return tolerate_status(reader, st) ? SERD_SUCCESS : st;
-}
-
-static SerdStatus
read_anon(SerdReader* const reader,
ReadContext ctx,
const bool subject,
@@ -1164,10 +691,6 @@ read_objectList(SerdReader* const reader, ReadContext ctx, bool* const ate_dot)
{
SerdStatus st = SERD_SUCCESS;
TRY(st, read_object(reader, &ctx, true, ate_dot));
- if (!fancy_syntax(reader) && peek_delim(reader, ',')) {
- return r_err(
- reader, SERD_BAD_SYNTAX, "syntax does not support abbreviation");
- }
while (st <= SERD_FAILURE && !*ate_dot && eat_delim(reader, ',')) {
st = read_object(reader, &ctx, true, ate_dot);
@@ -1528,10 +1051,6 @@ read_n3_statement(SerdReader* const reader)
case EOF:
return SERD_FAILURE;
case '@':
- if (!fancy_syntax(reader)) {
- return r_err(
- reader, SERD_BAD_SYNTAX, "syntax does not support directives");
- }
TRY(st, read_directive(reader));
read_ws_star(reader);
break;
@@ -1590,19 +1109,6 @@ read_n3_statement(SerdReader* const reader)
}
SerdStatus
-serd_reader_skip_until_byte(SerdReader* const reader, const uint8_t byte)
-{
- int c = peek_byte(reader);
-
- while (c != byte && c != EOF) {
- skip_byte(reader, c);
- c = peek_byte(reader);
- }
-
- return c == EOF ? SERD_FAILURE : SERD_SUCCESS;
-}
-
-SerdStatus
read_turtleTrigDoc(SerdReader* const reader)
{
while (!reader->source.eof) {
@@ -1684,7 +1190,3 @@ read_nquadsDoc(SerdReader* const reader)
return st;
}
-
-#if defined(__clang__) && __clang_major__ >= 10
-_Pragma("clang diagnostic pop")
-#endif
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
new file mode 100644
index 00000000..3063a667
--- /dev/null
+++ b/src/read_ntriples.c
@@ -0,0 +1,737 @@
+// Copyright 2011-2021 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#include "read_ntriples.h"
+
+#include "caret.h"
+#include "node.h"
+#include "read_utf8.h"
+#include "reader.h"
+#include "stack.h"
+#include "statement.h"
+#include "string_utils.h"
+#include "try.h"
+#include "uri_utils.h"
+
+#include "serd/caret.h"
+#include "serd/sink.h"
+#include "serd/statement.h"
+#include "serd/syntax.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Utilities
+
+static inline bool
+codepoint_in_range(const uint32_t c, const uint32_t min, const uint32_t max)
+{
+ return c >= min && c <= max;
+}
+
+bool
+is_PN_CHARS_BASE(const uint32_t c)
+{
+ return (codepoint_in_range(c, 'A', 'Z') || codepoint_in_range(c, 'a', 'z') ||
+ codepoint_in_range(c, 0x000C0U, 0x000D6U) ||
+ codepoint_in_range(c, 0x000D8U, 0x000F6U) ||
+ codepoint_in_range(c, 0x000F8U, 0x002FFU) ||
+ codepoint_in_range(c, 0x00370U, 0x0037DU) ||
+ codepoint_in_range(c, 0x0037FU, 0x01FFFU) ||
+ codepoint_in_range(c, 0x0200CU, 0x0200DU) ||
+ codepoint_in_range(c, 0x02070U, 0x0218FU) ||
+ codepoint_in_range(c, 0x02C00U, 0x02FEFU) ||
+ codepoint_in_range(c, 0x03001U, 0x0D7FFU) ||
+ codepoint_in_range(c, 0x0F900U, 0x0FDCFU) ||
+ codepoint_in_range(c, 0x0FDF0U, 0x0FFFDU) ||
+ codepoint_in_range(c, 0x10000U, 0xEFFFFU));
+}
+
+/**
+ Read an initial prefixed name character.
+
+ RDF 1.1 NTriples: [158s] PN_CHARS_U
+*/
+static SerdStatus
+read_PN_CHARS_U(SerdReader* reader, SerdNode* dest);
+
+// Terminals
+
+/// [144s] LANGTAG
+SerdStatus
+read_LANGTAG(SerdReader* const reader)
+{
+ int c = peek_byte(reader);
+ if (!is_alpha(c)) {
+ return r_err(reader, SERD_BAD_SYNTAX, "expected A-Z or a-z");
+ }
+
+ SerdNode* node = push_node(reader, SERD_LITERAL, "", 0);
+ if (!node) {
+ return SERD_BAD_STACK;
+ }
+
+ SerdStatus st = SERD_SUCCESS;
+ TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
+ while ((c = peek_byte(reader)) && is_alpha(c)) {
+ TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
+ }
+ while (peek_byte(reader) == '-') {
+ TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-')));
+ while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) {
+ TRY(st, push_byte(reader, node, eat_byte_safe(reader, c)));
+ }
+ }
+ return SERD_SUCCESS;
+}
+
+static bool
+is_EOL(const int c)
+{
+ return c == '\n' || c == '\r';
+}
+
+/// [7] EOL
+SerdStatus
+read_EOL(SerdReader* const reader)
+{
+ if (!is_EOL(peek_byte(reader))) {
+ return r_err(reader, SERD_BAD_SYNTAX, "expected a line ending");
+ }
+
+ while (is_EOL(peek_byte(reader))) {
+ eat_byte(reader);
+ }
+
+ return SERD_SUCCESS;
+}
+
+static SerdStatus
+read_IRI_scheme(SerdReader* const reader, SerdNode* const dest)
+{
+ int c = peek_byte(reader);
+ if (!is_alpha(c)) {
+ return r_err(
+ reader, SERD_BAD_SYNTAX, "'%c' is not a valid first IRI character", c);
+ }
+
+ SerdStatus st = SERD_SUCCESS;
+ while (!st && (c = peek_byte(reader)) != EOF) {
+ if (c == ':') {
+ return SERD_SUCCESS; // End of scheme
+ }
+
+ st = is_uri_scheme_char(c)
+ ? push_byte(reader, dest, eat_byte_safe(reader, c))
+ : r_err(reader,
+ SERD_BAD_SYNTAX,
+ "U+%04X is not a valid IRI scheme character",
+ (unsigned)c);
+ }
+
+ return st ? st : SERD_BAD_SYNTAX;
+}
+
+SerdStatus
+read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
+{
+ SerdStatus st = SERD_SUCCESS;
+ uint32_t code = 0U;
+
+ while (st <= SERD_FAILURE) {
+ const int c = eat_byte(reader);
+ switch (c) {
+ case EOF:
+ return r_err(reader, SERD_BAD_SYNTAX, "unexpected end of file");
+
+ case ' ':
+ case '"':
+ case '<':
+ case '^':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ return r_err(
+ reader, SERD_BAD_SYNTAX, "'%c' is not a valid IRI character", c);
+
+ case '>':
+ return SERD_SUCCESS;
+
+ case '\\':
+ TRY(st, read_UCHAR(reader, node, &code));
+
+ if (!code || code == ' ' || code == '<' || code == '>') {
+ return r_err(
+ reader, SERD_BAD_SYNTAX, "U+%04X is not a valid IRI character", code);
+ }
+
+ break;
+
+ default:
+ if (c <= 0x20) {
+ st = r_err(reader,
+ SERD_BAD_SYNTAX,
+ "control character U+%04X is not a valid IRI character",
+ (uint32_t)c);
+
+ if (reader->strict) {
+ return st;
+ }
+ }
+
+ st = ((uint8_t)c & 0x80)
+ ? read_utf8_continuation(reader, node, (uint8_t)c)
+ : push_byte(reader, node, c);
+ }
+ }
+
+ return tolerate_status(reader, st) ? SERD_SUCCESS : st;
+}
+
+SerdStatus
+read_IRI(SerdReader* const reader, SerdNode** const dest)
+{
+ SerdStatus st = SERD_SUCCESS;
+ if ((st = eat_byte_check(reader, '<'))) {
+ return st;
+ }
+
+ if (!(*dest = push_node(reader, SERD_URI, "", 0))) {
+ return SERD_BAD_STACK;
+ }
+
+ if ((st = read_IRI_scheme(reader, *dest))) {
+ return r_err(reader, st, "expected IRI scheme");
+ }
+
+ return read_IRIREF_suffix(reader, *dest);
+}
+
+SerdStatus
+read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
+{
+ if (!(c & 0x80)) {
+ switch (c) {
+ case 0xA:
+ case 0xD:
+ dest->flags |= SERD_HAS_NEWLINE;
+ break;
+ case '"':
+ case '\'':
+ dest->flags |= SERD_HAS_QUOTE;
+ break;
+ default:
+ break;
+ }
+
+ return push_byte(reader, dest, c);
+ }
+
+ return read_utf8_continuation(reader, dest, c);
+}
+
+SerdStatus
+read_string_escape(SerdReader* const reader, SerdNode* const ref)
+{
+ SerdStatus st = SERD_SUCCESS;
+ uint32_t code = 0;
+ if ((st = read_ECHAR(reader, ref)) && (st = read_UCHAR(reader, ref, &code))) {
+ return r_err(reader, st, "invalid escape '\\%c'", peek_byte(reader));
+ }
+
+ return st;
+}
+
+SerdStatus
+read_STRING_LITERAL(SerdReader* const reader,
+ SerdNode* const ref,
+ const uint8_t q)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ while (tolerate_status(reader, st)) {
+ const int c = peek_byte(reader);
+ switch (c) {
+ case EOF:
+ return r_err(reader, SERD_BAD_SYNTAX, "end of file in short string");
+ case '\n':
+ case '\r':
+ return r_err(reader, SERD_BAD_SYNTAX, "line end in short string");
+ case '\\':
+ skip_byte(reader, c);
+ TRY(st, read_string_escape(reader, ref));
+ break;
+ default:
+ if (c == q) {
+ return skip_byte(reader, c);
+ }
+
+ st = read_character(reader, ref, (uint8_t)eat_byte_safe(reader, c));
+ }
+ }
+
+ return tolerate_status(reader, st) ? SERD_SUCCESS : st;
+}
+
+static bool
+avoid_blank_clashes(const SerdReader* const reader)
+{
+ return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
+}
+
+static SerdStatus
+adjust_blank_id(SerdReader* const reader, char* const buf)
+{
+ if (avoid_blank_clashes(reader) && is_digit(buf[reader->bprefix_len + 1])) {
+ const char tag = buf[reader->bprefix_len];
+ if (tag == 'b') {
+ buf[reader->bprefix_len] = 'B'; // Prevent clash
+ reader->seen_genid = true;
+ } else if (tag == 'B' && reader->seen_genid) {
+ return r_err(reader,
+ SERD_BAD_LABEL,
+ "found both 'b' and 'B' blank IDs, prefix required");
+ }
+ }
+
+ return SERD_SUCCESS;
+}
+
+SerdStatus
+read_BLANK_NODE_LABEL(SerdReader* const reader,
+ SerdNode** const dest,
+ bool* const ate_dot)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ skip_byte(reader, '_');
+ TRY(st, eat_byte_check(reader, ':'));
+
+ int c = peek_byte(reader);
+ if (c == ':') {
+ // The spec says PN_CHARS_U, the tests say no colon, so exclude it here
+ return r_err(reader, SERD_BAD_SYNTAX, "expected blank node label");
+ }
+
+ if (!(*dest = push_node(reader,
+ SERD_BLANK,
+ reader->bprefix ? reader->bprefix : "",
+ reader->bprefix_len))) {
+ return SERD_BAD_STACK;
+ }
+
+ // Read first: (PN_CHARS_U | [0-9])
+ SerdNode* const n = *dest;
+ if (is_digit(c)) {
+ TRY(st, push_byte(reader, n, eat_byte_safe(reader, c)));
+ } else {
+ TRY(st, read_PN_CHARS_U(reader, *dest));
+ }
+
+ // Read middle: (PN_CHARS | '.')*
+ while (!st && (c = peek_byte(reader))) {
+ st = (c == '.') ? push_byte(reader, n, eat_byte_safe(reader, c))
+ : read_PN_CHARS(reader, n);
+ }
+
+ if (st > SERD_FAILURE) {
+ return st;
+ }
+
+ // Deal with annoying edge case of having eaten the trailing dot
+ char* const buf = serd_node_buffer(n);
+ if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, n)) {
+ --n->length;
+ serd_stack_pop(&reader->stack, 1);
+ *ate_dot = true;
+ }
+
+ // Adjust ID to avoid clashes with generated IDs if necessary
+ st = adjust_blank_id(reader, buf);
+
+ return tolerate_status(reader, st) ? SERD_SUCCESS : st;
+}
+
+static unsigned
+utf8_from_codepoint(uint8_t* const out, const uint32_t code)
+{
+ const unsigned size = utf8_num_bytes_for_codepoint(code);
+ uint32_t c = code;
+
+ assert(size <= 4U);
+
+ if (size == 4U) {
+ out[3] = (uint8_t)(0x80U | (c & 0x3FU));
+ c >>= 6;
+ c |= 0x10000;
+ }
+
+ if (size >= 3U) {
+ out[2] = (uint8_t)(0x80U | (c & 0x3FU));
+ c >>= 6;
+ c |= 0x800;
+ }
+
+ if (size >= 2U) {
+ out[1] = (uint8_t)(0x80U | (c & 0x3FU));
+ c >>= 6;
+ c |= 0xC0;
+ }
+
+ if (size >= 1U) {
+ out[0] = (uint8_t)c;
+ }
+
+ return size;
+}
+
+SerdStatus
+read_UCHAR(SerdReader* const reader,
+ SerdNode* const node,
+ uint32_t* const code_point)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ // Peek at the first character to determine which type of escape this is
+ const int b = peek_byte(reader);
+ unsigned length = 0U;
+ switch (b) {
+ case 'U':
+ length = 8;
+ break;
+ case 'u':
+ length = 4;
+ break;
+ default:
+ return r_err(reader, SERD_BAD_SYNTAX, "expected 'U' or 'u'");
+ }
+
+ TRY(st, skip_byte(reader, b));
+
+ // Read character code point in hex
+ uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+ uint32_t code = 0U;
+ for (unsigned i = 0; i < length; ++i) {
+ if (!(buf[i] = read_HEX(reader))) {
+ return SERD_BAD_SYNTAX;
+ }
+
+ code = (code << (i ? 4U : 0U)) | hex_digit_value(buf[i]);
+ }
+
+ // Reuse buf to write the UTF-8
+ const unsigned size = utf8_from_codepoint(buf, code);
+ if (!size) {
+ *code_point = 0xFFFD;
+ return (reader->strict
+ ? r_err(reader, SERD_BAD_SYNTAX, "U+%X is out of range", code)
+ : push_bytes(reader, node, replacement_char, 3));
+ }
+
+ *code_point = code;
+ return push_bytes(reader, node, buf, size);
+}
+
+SerdStatus
+read_ECHAR(SerdReader* const reader, SerdNode* const dest)
+{
+ SerdStatus st = SERD_SUCCESS;
+ const int c = peek_byte(reader);
+ switch (c) {
+ case 't':
+ return (st = skip_byte(reader, 't')) ? st : push_byte(reader, dest, '\t');
+ case 'b':
+ return (st = skip_byte(reader, 'b')) ? st : push_byte(reader, dest, '\b');
+ case 'n':
+ dest->flags |= SERD_HAS_NEWLINE;
+ return (st = skip_byte(reader, 'n')) ? st : push_byte(reader, dest, '\n');
+ case 'r':
+ dest->flags |= SERD_HAS_NEWLINE;
+ return (st = skip_byte(reader, 'r')) ? st : push_byte(reader, dest, '\r');
+ case 'f':
+ return (st = skip_byte(reader, 'f')) ? st : push_byte(reader, dest, '\f');
+ case '\\':
+ case '"':
+ case '\'':
+ return push_byte(reader, dest, eat_byte_safe(reader, c));
+ default:
+ return SERD_BAD_SYNTAX;
+ }
+}
+
+SerdStatus
+read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
+{
+ uint32_t code = 0;
+ const int c = peek_byte(reader);
+ SerdStatus st = SERD_SUCCESS;
+
+ if (is_alpha(c)) {
+ return push_byte(reader, dest, eat_byte_safe(reader, c));
+ }
+
+ if (c == EOF || !(c & 0x80)) {
+ return SERD_FAILURE;
+ }
+
+ TRY(st, read_utf8_code_point(reader, dest, &code, (uint8_t)c));
+
+ if (!is_PN_CHARS_BASE(code)) {
+ r_err(
+ reader, SERD_BAD_SYNTAX, "U+%04X is not a valid name character", code);
+ if (reader->strict) {
+ return SERD_BAD_SYNTAX;
+ }
+ }
+
+ return st;
+}
+
+SerdStatus
+read_PN_CHARS_U(SerdReader* const reader, SerdNode* const dest)
+{
+ const int c = peek_byte(reader);
+
+ return (c == ':' || c == '_')
+ ? push_byte(reader, dest, eat_byte_safe(reader, c))
+ : read_PN_CHARS_BASE(reader, dest);
+}
+
+SerdStatus
+read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
+{
+ const int c = peek_byte(reader);
+ SerdStatus st = SERD_SUCCESS;
+
+ if (c == EOF) {
+ return SERD_NO_DATA;
+ }
+
+ if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
+ return push_byte(reader, dest, eat_byte_safe(reader, c));
+ }
+
+ if (!(c & 0x80)) {
+ return SERD_FAILURE;
+ }
+
+ uint32_t code = 0U;
+ TRY(st, read_utf8_code_point(reader, dest, &code, (uint8_t)c));
+
+ if (!is_PN_CHARS_BASE(code) && code != 0xB7 &&
+ !(code >= 0x0300 && code <= 0x036F) &&
+ !(code >= 0x203F && code <= 0x2040)) {
+ return r_err(
+ reader, SERD_BAD_SYNTAX, "U+%04X is not a valid name character", code);
+ }
+
+ return st;
+}
+
+uint8_t
+read_HEX(SerdReader* const reader)
+{
+ const int c = peek_byte(reader);
+ if (is_xdigit(c)) {
+ return (uint8_t)eat_byte_safe(reader, c);
+ }
+
+ r_err(reader, SERD_BAD_SYNTAX, "invalid hexadecimal digit '%c'", c);
+ return 0;
+}
+
+// Nonterminals
+
+// comment ::= '#' ( [^#xA #xD] )*
+SerdStatus
+read_comment(SerdReader* const reader)
+{
+ skip_byte(reader, '#');
+
+ for (int c = peek_byte(reader); c && c != '\n' && c != '\r' && c != EOF;) {
+ skip_byte(reader, c);
+ c = peek_byte(reader);
+ }
+
+ return SERD_SUCCESS;
+}
+
+/// [6] literal
+static SerdStatus
+read_literal(SerdReader* const reader, SerdNode** const dest)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) {
+ return SERD_BAD_STACK;
+ }
+
+ skip_byte(reader, '"');
+ TRY(st, read_STRING_LITERAL(reader, *dest, '"'));
+
+ SerdNode* datatype = NULL;
+ switch (peek_byte(reader)) {
+ case '@':
+ skip_byte(reader, '@');
+ TRY(st, read_LANGTAG(reader));
+ (*dest)->flags |= SERD_HAS_LANGUAGE;
+ break;
+ case '^':
+ skip_byte(reader, '^');
+ TRY(st, eat_byte_check(reader, '^'));
+ TRY(st, read_IRI(reader, &datatype));
+ (*dest)->flags |= SERD_HAS_DATATYPE;
+ break;
+ }
+
+ return st;
+}
+
+/// [3] subject
+SerdStatus
+read_nt_subject(SerdReader* const reader, SerdNode** const dest)
+{
+ bool ate_dot = false;
+
+ switch (peek_byte(reader)) {
+ case '<':
+ return read_IRI(reader, dest);
+ case '_':
+ return read_BLANK_NODE_LABEL(reader, dest, &ate_dot);
+ default:
+ break;
+ }
+
+ return r_err(reader, SERD_BAD_SYNTAX, "expected '<' or '_'");
+}
+
+/// [4] predicate
+SerdStatus
+read_nt_predicate(SerdReader* const reader, SerdNode** const dest)
+{
+ return read_IRI(reader, dest);
+}
+
+/// [5] object
+SerdStatus
+read_nt_object(SerdReader* const reader,
+ SerdNode** const dest,
+ bool* const ate_dot)
+{
+ *ate_dot = false;
+
+ switch (peek_byte(reader)) {
+ case '"':
+ return read_literal(reader, dest);
+ case '<':
+ return read_IRI(reader, dest);
+ case '_':
+ return read_BLANK_NODE_LABEL(reader, dest, ate_dot);
+ default:
+ break;
+ }
+
+ return r_err(reader, SERD_BAD_SYNTAX, "expected '<', '_', or '\"'");
+}
+
+/// [2] triple
+static SerdStatus
+read_triple(SerdReader* const reader)
+{
+ SerdStatementFlags flags = 0;
+ ReadContext ctx = {0, 0, 0, 0, &flags};
+ SerdStatus st = SERD_SUCCESS;
+ bool ate_dot = false;
+
+ // Read subject and predicate
+ if ((st = read_nt_subject(reader, &ctx.subject)) ||
+ (st = skip_horizontal_whitespace(reader)) ||
+ (st = read_nt_predicate(reader, &ctx.predicate)) ||
+ (st = skip_horizontal_whitespace(reader))) {
+ return st;
+ }
+
+ // Preserve the caret for error reporting and read object
+ SerdCaret orig_caret = reader->source.caret;
+ if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) ||
+ (st = skip_horizontal_whitespace(reader))) {
+ return st;
+ }
+
+ if (!ate_dot && (st = eat_byte_check(reader, '.'))) {
+ return st;
+ }
+
+ if (ctx.object) {
+ serd_node_zero_pad(ctx.object);
+ }
+
+ const SerdStatement statement = {
+ {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret};
+
+ return serd_sink_write_statement(reader->sink, *ctx.flags, &statement);
+}
+
+static SerdStatus
+read_line(SerdReader* const reader)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ skip_horizontal_whitespace(reader);
+
+ switch (peek_byte(reader)) {
+ case EOF:
+ return SERD_FAILURE;
+
+ case '\n':
+ case '\r':
+ return read_EOL(reader);
+
+ case '#':
+ st = read_comment(reader);
+ break;
+
+ default:
+ if (!(st = read_triple(reader))) {
+ skip_horizontal_whitespace(reader);
+ if (peek_byte(reader) == '#') {
+ st = read_comment(reader);
+ }
+ }
+ break;
+ }
+
+ return (st || peek_byte(reader) == EOF) ? st : read_EOL(reader);
+}
+
+/// [1] ntriplesDoc
+SerdStatus
+read_ntriplesDoc(SerdReader* const reader)
+{
+ // Record the initial stack size and read the first line
+ const size_t orig_stack_size = reader->stack.size;
+ SerdStatus st = read_line(reader);
+
+ // Return early if we failed to read anything at all
+ serd_stack_pop_to(&reader->stack, orig_stack_size);
+ if (st == SERD_FAILURE || !tolerate_status(reader, st)) {
+ return st;
+ }
+
+ // Continue reading lines for as long as possible
+ for (st = SERD_SUCCESS; !st;) {
+ st = read_line(reader);
+ serd_stack_pop_to(&reader->stack, orig_stack_size);
+
+ if (st > SERD_FAILURE && !reader->strict && tolerate_status(reader, st)) {
+ serd_reader_skip_until_byte(reader, '\n');
+ st = SERD_SUCCESS;
+ }
+ }
+
+ // If we made it this far, we succeeded at reading at least one line
+ return st > SERD_FAILURE ? st : SERD_SUCCESS;
+}
diff --git a/src/read_ntriples.h b/src/read_ntriples.h
new file mode 100644
index 00000000..bc76fed6
--- /dev/null
+++ b/src/read_ntriples.h
@@ -0,0 +1,186 @@
+// Copyright 2011-2021 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#ifndef SERD_SRC_READ_NTRIPLES_H
+#define SERD_SRC_READ_NTRIPLES_H
+
+#include "serd/node.h"
+#include "serd/reader.h"
+#include "serd/status.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+// Utilities
+
+/**
+ Return true if the codepoint `c` is a valid PN_CHARS_BASE character.
+
+ RDF 1.1 NTriples: [157s] PN_CHARS_BASE
+*/
+bool
+is_PN_CHARS_BASE(uint32_t c);
+
+/**
+ Read one (possibly multi-byte) character.
+
+ The caller must have already eaten the first byte, `c`.
+*/
+SerdStatus
+read_character(SerdReader* reader, SerdNode* dest, uint8_t c);
+
+/**
+ Read one string literal escape.
+
+ The caller must have already eaten the first byte, a backslash.
+*/
+SerdStatus
+read_string_escape(SerdReader* reader, SerdNode* ref);
+
+// Terminals
+
+/**
+ Read a language tag starting after the '@'.
+
+ RDF 1.1 NTriples: [144s] LANGTAG
+*/
+SerdStatus
+read_LANGTAG(SerdReader* reader);
+
+/**
+ Read an end of line.
+
+ RDF 1.1 NTriples: [7] EOL
+*/
+SerdStatus
+read_EOL(SerdReader* reader);
+
+/**
+ Read an absolute IRI.
+
+ This is a stricter subset of [8] IRIREF in the NTriples grammar, since a
+ scheme is required. Handling this in the parser results in better error
+ messages.
+*/
+SerdStatus
+read_IRI(SerdReader* reader, SerdNode** dest);
+
+/**
+ Read an IRI reference suffix into an existing node.
+
+ RDF 1.1 NTriples: [8] IRIREF
+*/
+SerdStatus
+read_IRIREF_suffix(SerdReader* reader, SerdNode* node);
+
+/**
+ Read a string literal that is delimited by single (not triple) `q` quotes.
+
+ RDF 1.1 NTriples: [9] STRING_LITERAL_QUOTE
+ RDF 1.1 Turtle: [23] STRING_LITERAL_SINGLE_QUOTE
+*/
+SerdStatus
+read_STRING_LITERAL(SerdReader* reader, SerdNode* ref, uint8_t q);
+
+/**
+ Read a blank node label that comes after "_:".
+
+ RDF 1.1 NTriples: [141s] BLANK_NODE_LABEL
+*/
+SerdStatus
+read_BLANK_NODE_LABEL(SerdReader* reader, SerdNode** dest, bool* ate_dot);
+
+/**
+ Read an escape like "u201C", starting after the initial backslash.
+
+ RDF 1.1 NTriples: [10] UCHAR
+*/
+SerdStatus
+read_UCHAR(SerdReader* reader, SerdNode* node, uint32_t* code_point);
+
+/**
+ Read an escape like "n", starting after the initial backslash.
+
+ RDF 1.1 NTriples: [153s] ECHAR
+*/
+SerdStatus
+read_ECHAR(SerdReader* reader, SerdNode* dest);
+
+/**
+ Read a basic prefixed name character.
+
+ RDF 1.1 NTriples: [157s] PN_CHARS_BASE
+*/
+SerdStatus
+read_PN_CHARS_BASE(SerdReader* reader, SerdNode* dest);
+
+/**
+ Read any prefixed name character.
+
+ RDF 1.1 NTriples: [160s] PN_CHARS
+*/
+SerdStatus
+read_PN_CHARS(SerdReader* reader, SerdNode* dest);
+
+/**
+ Read a single hexadecimal digit.
+
+ RDF 1.1 NTriples: [162s] HEX
+*/
+uint8_t
+read_HEX(SerdReader* reader);
+
+// Nonterminals
+
+/**
+ Read a comment that starts with '#' and ends with the line.
+
+ Not described by a rule in the grammar since RDF 1.1.
+*/
+SerdStatus
+read_comment(SerdReader* reader);
+
+/**
+ Read a subject (IRI or blank).
+
+ RDF 1.1 NTriples: [3] subject
+*/
+SerdStatus
+read_nt_subject(SerdReader* reader, SerdNode** dest);
+
+/**
+ Read a predicate (IRI).
+
+ RDF 1.1 NTriples: [4] predicate
+*/
+SerdStatus
+read_nt_predicate(SerdReader* reader, SerdNode** dest);
+
+/**
+ Read an object (IRI or blank or literal).
+
+ RDF 1.1 NTriples: [5] object
+*/
+SerdStatus
+read_nt_object(SerdReader* reader, SerdNode** dest, bool* ate_dot);
+
+/**
+ Read a variable that starts with '?' or '$'.
+
+ This is an extension that serd uses in certain contexts to support
+ patterns.
+
+ Restricted version of SPARQL 1.1: [108] Var
+*/
+SerdStatus
+read_Var(SerdReader* reader, SerdNode** dest);
+
+/**
+ Read a complete NTriples document.
+
+ RDF 1.1 NTriples: [1] ntriplesDoc
+*/
+SerdStatus
+read_ntriplesDoc(SerdReader* reader);
+
+#endif // SERD_SRC_READ_NTRIPLES_H
diff --git a/src/reader.c b/src/reader.c
index 7eb2323e..057f1d7f 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -6,6 +6,7 @@
#include "byte_source.h"
#include "namespaces.h"
#include "node.h"
+#include "read_ntriples.h"
#include "stack.h"
#include "statement.h"
#include "system.h"
@@ -36,6 +37,29 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...)
return st;
}
+/// Skip over any run of tabs and spaces at the current position
+SerdStatus
+skip_horizontal_whitespace(SerdReader* const reader)
+{
+  for (int c = peek_byte(reader); c == '\t' || c == ' ';
+       c     = peek_byte(reader)) {
+    eat_byte(reader);
+  }
+
+  // There is nothing to fail at here, the status is for the caller's chain
+  return SERD_SUCCESS;
+}
+
+/// Skip input until `byte` is next, returning SERD_FAILURE if EOF comes first
+SerdStatus
+serd_reader_skip_until_byte(SerdReader* const reader, const uint8_t byte)
+{
+  for (;;) {
+    const int next = peek_byte(reader);
+
+    if (next == EOF) {
+      return SERD_FAILURE;
+    }
+
+    if (next == byte) {
+      // Found it, leave it unread for the caller
+      return SERD_SUCCESS;
+    }
+
+    skip_byte(reader, next);
+  }
+}
+
void
set_blank_id(SerdReader* const reader,
SerdNode* const node,
@@ -152,15 +176,27 @@ serd_reader_read_document(SerdReader* const reader)
{
assert(reader);
- if (!reader->source.prepared) {
+ if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source.prepared) {
SerdStatus st = serd_reader_prepare(reader);
if (st) {
return st;
}
}
- return ((reader->syntax == SERD_NQUADS) ? read_nquadsDoc(reader)
- : read_turtleTrigDoc(reader));
+ switch (reader->syntax) {
+ case SERD_SYNTAX_EMPTY:
+ break;
+ case SERD_TURTLE:
+ return read_turtleTrigDoc(reader);
+ case SERD_NTRIPLES:
+ return read_ntriplesDoc(reader);
+ case SERD_NQUADS:
+ return read_nquadsDoc(reader);
+ case SERD_TRIG:
+ return read_turtleTrigDoc(reader);
+ }
+
+ return SERD_SUCCESS;
}
SerdReader*
diff --git a/src/reader.h b/src/reader.h
index 9b9a217e..559c9cee 100644
--- a/src/reader.h
+++ b/src/reader.h
@@ -52,6 +52,9 @@ struct SerdReaderImpl {
bool seen_genid;
};
+SerdStatus
+skip_horizontal_whitespace(SerdReader* reader);
+
SERD_LOG_FUNC(3, 4)
SerdStatus
r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...);
@@ -114,6 +117,18 @@ skip_byte(SerdReader* reader, const int byte)
return serd_byte_source_advance(&reader->source);
}
+/// Consume and return the next byte, or return EOF without advancing
+static inline int
+eat_byte(SerdReader* const reader)
+{
+  const int next = peek_byte(reader);
+
+  if (next == EOF) {
+    return next;
+  }
+
+  serd_byte_source_advance(&reader->source);
+  return next;
+}
+
static inline int SERD_NODISCARD
eat_byte_safe(SerdReader* reader, const int byte)
{
diff --git a/src/string_utils.h b/src/string_utils.h
index 4102a54c..564c58ad 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -114,20 +114,32 @@ utf8_num_bytes(const uint8_t leading)
: 0U; // Invalid
}
+/// Return the number of bytes needed to encode `code` in UTF-8, or zero
+static inline unsigned
+utf8_num_bytes_for_codepoint(const uint32_t code)
+{
+  if (code < 0x00000080) {
+    return 1U;
+  }
+
+  if (code < 0x00000800) {
+    return 2U;
+  }
+
+  if (code < 0x00010000) {
+    return 3U;
+  }
+
+  if (code < 0x00110000) {
+    return 4U;
+  }
+
+  return 0U; // Past U+10FFFF
+}
+
/// Return the code point of a UTF-8 character with known length
static inline uint32_t
-parse_counted_utf8_char(const uint8_t* utf8, size_t size)
+parse_counted_utf8_char(const uint8_t* const utf8, const size_t size)
{
uint32_t c = utf8[0] & ((1U << (8U - size)) - 1U);
+
for (size_t i = 1; i < size; ++i) {
c = (c << 6) | (utf8[i] & 0x3FU);
}
+
return c;
}
/// Parse a UTF-8 character, set *size to the length, and return the code point
static inline uint32_t
-parse_utf8_char(const uint8_t* utf8, size_t* size)
+parse_utf8_char(const uint8_t* const utf8, size_t* const size)
{
switch (*size = utf8_num_bytes(utf8[0])) {
case 1:
diff --git a/test/extra/bad/bad-blank-node-label.nt b/test/extra/bad/bad-blank-node-label.nt
new file mode 100644
index 00000000..d178a623
--- /dev/null
+++ b/test/extra/bad/bad-blank-node-label.nt
@@ -0,0 +1 @@
+<http://example.org/s> <http://example.org/p> _nocolon .
diff --git a/test/extra/bad/bad-trailing-garbage.nt b/test/extra/bad/bad-trailing-garbage.nt
new file mode 100644
index 00000000..79790812
--- /dev/null
+++ b/test/extra/bad/bad-trailing-garbage.nt
@@ -0,0 +1 @@
+<http://example.org/s> <http://example.org/p> <http://example.org/o> . <http://example.org/error>
diff --git a/test/extra/bad/manifest.ttl b/test/extra/bad/manifest.ttl
index 9dda56b2..7ab5a427 100644
--- a/test/extra/bad/manifest.ttl
+++ b/test/extra/bad/manifest.ttl
@@ -10,8 +10,10 @@
<#bad-a-subject>
<#bad-base>
<#bad-blank>
+ <#bad-blank-node-label>
<#bad-blank-predicate>
<#bad-blank-syntax>
+ <#bad-blank-syntax>
<#bad-bom>
<#bad-char-in-local>
<#bad-char-in-prefix>
@@ -78,6 +80,7 @@
<#bad-semicolon-after-subject>
<#bad-string>
<#bad-subject>
+ <#bad-trailing-garbage>
<#bad-true-predicate>
<#bad-true-subject>
<#bad-uri-escape>
@@ -107,6 +110,11 @@
mf:action <bad-blank.ttl> ;
mf:name "bad-blank" .
+<#bad-blank-node-label>
+ a rdft:TestNTriplesNegativeSyntax ;
+ mf:action <bad-blank-node-label.nt> ;
+ mf:name "bad-blank-node-label" .
+
<#bad-blank-predicate>
a rdft:TestTurtleNegativeSyntax ;
mf:action <bad-blank-predicate.ttl> ;
@@ -447,6 +455,11 @@
mf:action <bad-subject.ttl> ;
mf:name "bad-subject" .
+<#bad-trailing-garbage>
+ a rdft:TestNTriplesNegativeSyntax ;
+ mf:action <bad-trailing-garbage.nt> ;
+ mf:name "bad-trailing-garbage" .
+
<#bad-true-predicate>
a rdft:TestTurtleNegativeSyntax ;
mf:action <bad-true-predicate.ttl> ;
diff --git a/test/extra/good/manifest.ttl b/test/extra/good/manifest.ttl
index c8bfa6f8..bce7c564 100644
--- a/test/extra/good/manifest.ttl
+++ b/test/extra/good/manifest.ttl
@@ -29,7 +29,6 @@
<#test-long-utf8>
<#test-no-spaces>
<#test-non-curie-uri>
- <#test-out-of-range-unicode>
<#test-prefix>
<#test-quote-escapes>
<#test-rel>
@@ -177,12 +176,6 @@
mf:name "test-non-curie-uri" ;
mf:result <test-non-curie-uri.nt> .
-<#test-out-of-range-unicode>
- a rdft:TestTurtleEval ;
- mf:action <test-out-of-range-unicode.ttl> ;
- mf:name "test-out-of-range-unicode" ;
- mf:result <test-out-of-range-unicode.nt> .
-
<#test-prefix>
a rdft:TestTurtleEval ;
mf:action <test-prefix.ttl> ;
diff --git a/test/extra/lax/manifest.ttl b/test/extra/lax/manifest.ttl
index 5fd50f50..c68f0176 100644
--- a/test/extra/lax/manifest.ttl
+++ b/test/extra/lax/manifest.ttl
@@ -15,6 +15,7 @@
<#test-bad-utf8-nt>
<#test-bad-utf8-ttl>
<#test-lone-list>
+ <#test-out-of-range-unicode>
) .
<#test-bad-string-nt>
@@ -70,3 +71,9 @@
mf:action <test-lone-list.ttl> ;
mf:name "test-lone-list" ;
mf:result <test-lone-list.nt> .
+
+<#test-out-of-range-unicode>
+ a rdft:TestTurtleNegativeSyntax ;
+ mf:action <test-out-of-range-unicode.ttl> ;
+ mf:name "test-out-of-range-unicode" ;
+ mf:result <test-out-of-range-unicode.nt> .
diff --git a/test/extra/good/test-out-of-range-unicode.nt b/test/extra/lax/test-out-of-range-unicode.nt
index 5def9e31..5def9e31 100644
--- a/test/extra/good/test-out-of-range-unicode.nt
+++ b/test/extra/lax/test-out-of-range-unicode.nt
diff --git a/test/extra/good/test-out-of-range-unicode.ttl b/test/extra/lax/test-out-of-range-unicode.ttl
index 7e64785a..7e64785a 100644
--- a/test/extra/good/test-out-of-range-unicode.ttl
+++ b/test/extra/lax/test-out-of-range-unicode.ttl
diff --git a/test/test_overflow.c b/test/test_overflow.c
index bc0a2aaf..6e018033 100644
--- a/test/test_overflow.c
+++ b/test/test_overflow.c
@@ -59,6 +59,9 @@ test_ntriples_overflow(void)
{
static const char* const test_strings[] = {
"<http://example.org/s> <http://example.org/p> <http://example.org/o> .",
+ "<http://example.org/s> <http://example.org/p> \"literal\" .",
+ "<http://example.org/s> <http://example.org/p> _:blank .",
+ "<http://example.org/s> <http://example.org/p> \"\"@en .",
NULL,
};
diff --git a/test/test_reader.c b/test/test_reader.c
index a5595804..f33c3429 100644
--- a/test/test_reader.c
+++ b/test/test_reader.c
@@ -19,6 +19,7 @@
#endif
#include <assert.h>
+#include <stdbool.h>
#include <stdio.h>
#include <string.h>
@@ -365,6 +366,51 @@ test_read_turtle_chunks(const char* const path)
assert(!zix_remove(path));
}
+/// Stream read callback that must never be called (asserts if it is)
+static size_t
+empty_test_read(void* buf, size_t size, size_t nmemb, void* stream)
+{
+  // Any call at all means that the reader tried to pull input
+  assert(false);
+
+  (void)stream;
+  (void)nmemb;
+  (void)size;
+  (void)buf;
+
+  return 0;
+}
+
+/// Stream error callback used by test_read_empty(), always returns zero
+static int
+empty_test_error(void* stream)
+{
+  const int result = 0;
+
+  (void)stream;
+  return result;
+}
+
+/// Check that a SERD_SYNTAX_EMPTY reader "succeeds" without reading any input
+static void
+test_read_empty(void)
+{
+  SerdWorld* const world = serd_world_new();
+  ReaderTest       rt    = {0, 0, 0, 0};
+  SerdSink* const  sink  = serd_sink_new(&rt, test_sink, NULL);
+  assert(sink);
+
+  SerdReader* const reader = serd_reader_new(world, SERD_SYNTAX_EMPTY, 0, sink);
+  assert(reader);
+
+  // The read callback asserts if called, so any input access fails the test
+  const SerdStatus st = serd_reader_start_stream(
+    reader, empty_test_read, empty_test_error, &rt, NULL, 1);
+  assert(st == SERD_SUCCESS);
+
+  // Reading the document must succeed and produce no statements
+  assert(serd_reader_read_document(reader) == SERD_SUCCESS);
+  assert(rt.n_statement == 0);
+
+  serd_reader_free(reader);
+  serd_sink_free(sink);
+  serd_world_free(world);
+}
+
int
main(void)
{
@@ -379,6 +425,9 @@ main(void)
test_read_string();
test_read_eof_by_page(ttl_path);
test_read_eof_by_byte();
+ test_read_nquads_chunks(nq_path);
+ test_read_turtle_chunks(ttl_path);
+ test_read_empty();
assert(!zix_remove(dir));