aboutsummaryrefslogtreecommitdiffstats
path: root/src/n3.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/n3.c')
-rw-r--r--src/n3.c215
1 files changed, 105 insertions, 110 deletions
diff --git a/src/n3.c b/src/n3.c
index 31dc5d45..c5066581 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -1,9 +1,8 @@
-// Copyright 2011-2023 David Robillard <d@drobilla.net>
+// Copyright 2011-2025 David Robillard <d@drobilla.net>
// SPDX-License-Identifier: ISC
#include "reader.h"
#include "serd_internal.h"
-#include "stack.h"
#include "string_utils.h"
#include "try.h"
#include "uri_utils.h"
@@ -46,7 +45,7 @@ read_HEX(SerdReader* const reader)
return (uint8_t)eat_byte_safe(reader, c);
}
- r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid hexadecimal digit '%c'\n", c);
+ r_err_char(reader, "hexadecimal", c);
return 0;
}
@@ -354,11 +353,11 @@ read_STRING_LITERAL_LONG(SerdReader* const reader,
push_byte(reader, ref, c);
st = read_character(reader, ref, flags, (uint8_t)q2);
}
- } else if (c == EOF) {
- st = r_err(reader, SERD_ERR_BAD_SYNTAX, "end of file in long string\n");
- } else {
+ } else if (c > 0) {
st =
read_character(reader, ref, flags, (uint8_t)eat_byte_safe(reader, c));
+ } else {
+ return r_err(reader, SERD_ERR_BAD_SYNTAX, "end of file in long string\n");
}
}
@@ -457,7 +456,7 @@ read_PN_CHARS_BASE(SerdReader* const reader, const Ref dest)
return push_byte(reader, dest, eat_byte_safe(reader, c));
}
- if (c == EOF || !(c & 0x80)) {
+ if (c < 0x80) {
return SERD_FAILURE;
}
@@ -465,11 +464,7 @@ read_PN_CHARS_BASE(SerdReader* const reader, const Ref dest)
read_utf8_code(reader, dest, &code, (uint8_t)c);
if (!is_PN_CHARS_BASE(code)) {
- r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name\n", code);
- if (reader->strict) {
- return SERD_ERR_BAD_SYNTAX;
- }
+ st = r_err_char(reader, "name", (int)code);
}
return st;
@@ -493,7 +488,7 @@ read_PN_CHARS(SerdReader* const reader, const Ref dest)
return push_byte(reader, dest, eat_byte_safe(reader, c));
}
- if (c == EOF || !(c & 0x80)) {
+ if (c < 0x80) {
return SERD_FAILURE;
}
@@ -501,8 +496,7 @@ read_PN_CHARS(SerdReader* const reader, const Ref dest)
TRY(st, read_utf8_code(reader, dest, &code, (uint8_t)c));
if (!is_PN_CHARS(code)) {
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name\n", code);
+ st = r_err_char(reader, "name", (int)code);
}
return st;
@@ -531,7 +525,7 @@ read_PN_LOCAL_ESC(SerdReader* const reader, const Ref dest)
return ((c == '!') || in_range(c, '#', '/') || (c == ';') || (c == '=') ||
(c == '?') || (c == '@') || (c == '_') || (c == '~'))
? push_byte(reader, dest, eat_byte_safe(reader, c))
- : r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid escape\n");
+ : r_err(reader, SERD_ERR_BAD_SYNTAX, "bad escape\n");
}
static SerdStatus
@@ -589,9 +583,7 @@ read_PN_LOCAL(SerdReader* const reader, const Ref dest, bool* const ate_dot)
SerdNode* const n = deref(reader, dest);
if (trailing_unescaped_dot) {
// Ate trailing dot, pop it from stack/node and inform caller
- --n->n_bytes;
- serd_stack_pop(&reader->stack, 1);
- *ate_dot = true;
+ *ate_dot = pop_last_node_char(reader, n);
}
return (st > SERD_FAILURE) ? st : SERD_SUCCESS;
@@ -599,31 +591,37 @@ read_PN_LOCAL(SerdReader* const reader, const Ref dest, bool* const ate_dot)
// Read the remainder of a PN_PREFIX after some initial characters
static SerdStatus
-read_PN_PREFIX_tail(SerdReader* const reader, const Ref dest)
+read_PN_PREFIX_tail(SerdReader* const reader,
+ const Ref dest,
+ bool* const ate_dot)
{
- int c = 0;
- while ((c = peek_byte(reader)) > 0) { // Middle: (PN_CHARS | '.')*
+ SerdStatus st = SERD_SUCCESS;
+ bool trailing_unescaped_dot = false;
+
+ while (!st) { // Middle: (PN_CHARS | '.')*
+ const int c = peek_byte(reader);
if (c == '.') {
push_byte(reader, dest, eat_byte_safe(reader, c));
- } else if (read_PN_CHARS(reader, dest)) {
- break;
+ trailing_unescaped_dot = true;
+ } else if (!(st = read_PN_CHARS(reader, dest))) {
+ trailing_unescaped_dot = false;
}
}
- const SerdNode* const n = deref(reader, dest);
- if (n->buf[n->n_bytes - 1] == '.' && read_PN_CHARS(reader, dest)) {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with '.'\n");
+ if (trailing_unescaped_dot) {
+ SerdNode* const n = deref(reader, dest);
+ *ate_dot = pop_last_node_char(reader, n);
}
- return SERD_SUCCESS;
+ return st;
}
static SerdStatus
-read_PN_PREFIX(SerdReader* const reader, const Ref dest)
+read_PN_PREFIX(SerdReader* const reader, const Ref dest, bool* const ate_dot)
{
const SerdStatus st = read_PN_CHARS_BASE(reader, dest);
- return st ? st : read_PN_PREFIX_tail(reader, dest);
+ return st ? st : read_PN_PREFIX_tail(reader, dest, ate_dot);
}
static SerdStatus
@@ -631,7 +629,7 @@ read_LANGTAG(SerdReader* const reader, Ref* const dest)
{
int c = peek_byte(reader);
if (!is_alpha(c)) {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected '%c'\n", c);
+ return r_err_char(reader, "language", c);
}
*dest = push_node(reader, SERD_LITERAL, "", 0);
@@ -657,7 +655,7 @@ read_IRIREF_scheme(SerdReader* const reader, const Ref dest)
{
int c = peek_byte(reader);
if (!is_alpha(c)) {
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad IRI scheme start '%c'\n", c);
+ return r_err_char(reader, "IRI scheme start", c);
}
while ((c = peek_byte(reader)) > 0) {
@@ -666,11 +664,7 @@ read_IRIREF_scheme(SerdReader* const reader, const Ref dest)
}
if (!is_uri_scheme_char(c)) {
- return r_err(reader,
- SERD_ERR_BAD_SYNTAX,
- "bad IRI scheme char U+%04X (%c)\n",
- (unsigned)c,
- (char)c);
+ return r_err_char(reader, "IRI scheme", c);
}
push_byte(reader, dest, eat_byte_safe(reader, c));
@@ -704,8 +698,7 @@ read_IRIREF(SerdReader* const reader, Ref* const dest)
case '"':
case '<':
*dest = pop_node(reader, *dest);
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character '%c'\n", c);
+ return r_err_char(reader, "IRI", c);
case '>':
return SERD_SUCCESS;
@@ -713,7 +706,7 @@ read_IRIREF(SerdReader* const reader, Ref* const dest)
case '\\':
if (read_UCHAR(reader, *dest, &code)) {
*dest = pop_node(reader, *dest);
- return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n");
+ return r_err_char(reader, "IRI escape", c);
}
if (code == ' ' || code == '<' || code == '>') {
@@ -731,11 +724,12 @@ read_IRIREF(SerdReader* const reader, Ref* const dest)
case '|':
case '}':
*dest = pop_node(reader, *dest);
- return r_err(
- reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character '%c'\n", c);
+ return r_err_char(reader, "IRI", c);
default:
- if (c <= 0x20) {
+ if (c <= 0) {
+ st = r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected end of file\n");
+ } else if (c <= 0x20) {
st = r_err(reader,
SERD_ERR_BAD_SYNTAX,
"invalid IRI character (escape %%%02X)\n",
@@ -743,8 +737,6 @@ read_IRIREF(SerdReader* const reader, Ref* const dest)
if (!reader->strict) {
st = SERD_FAILURE;
push_byte(reader, *dest, c);
- } else {
- break;
}
} else if (!(c & 0x80)) {
push_byte(reader, *dest, c);
@@ -762,15 +754,10 @@ read_IRIREF(SerdReader* const reader, Ref* const dest)
}
static SerdStatus
-read_PrefixedName(SerdReader* const reader,
- const Ref dest,
- const bool read_prefix,
- bool* const ate_dot)
+read_PrefixedName(SerdReader* const reader, const Ref dest, bool* const ate_dot)
{
SerdStatus st = SERD_SUCCESS;
- if (read_prefix) {
- TRY_FAILING(st, read_PN_PREFIX(reader, dest));
- }
+ TRY_FAILING(st, read_PN_PREFIX(reader, dest, ate_dot));
if (peek_byte(reader) != ':') {
return SERD_FAILURE;
@@ -825,18 +812,16 @@ read_number(SerdReader* const reader,
// all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ...
TRY(st, read_0_9(reader, *dest, true));
if ((c = peek_byte(reader)) == '.') {
- has_decimal = true;
-
// Annoyingly, dot can be end of statement, so tentatively eat
skip_byte(reader, c);
c = peek_byte(reader);
if (!is_digit(c) && c != 'e' && c != 'E') {
- *ate_dot = true; // Force caller to deal with stupid grammar
- return SERD_SUCCESS; // Next byte is not a number character
+ *ate_dot = true; // Force caller to deal with silly grammar
+ } else {
+ has_decimal = true;
+ push_byte(reader, *dest, '.');
+ read_0_9(reader, *dest, false);
}
-
- push_byte(reader, *dest, '.');
- read_0_9(reader, *dest, false);
}
}
c = peek_byte(reader);
@@ -868,7 +853,7 @@ read_iri(SerdReader* const reader, Ref* const dest, bool* const ate_dot)
}
*dest = push_node(reader, SERD_CURIE, "", 0);
- return read_PrefixedName(reader, *dest, true, ate_dot);
+ return read_PrefixedName(reader, *dest, ate_dot);
}
static SerdStatus
@@ -920,29 +905,30 @@ read_verb(SerdReader* const reader, Ref* const dest)
return read_IRIREF(reader, dest);
}
- /* Either a qname, or "a". Read the prefix first, and if it is in fact
- "a", produce that instead.
- */
- *dest = push_node(reader, SERD_CURIE, "", 0);
+ Ref p = push_node(reader, SERD_CURIE, "", 0);
- SerdStatus st = read_PN_PREFIX(reader, *dest);
- bool ate_dot = false;
- const SerdNode* const node = deref(reader, *dest);
- const int next = peek_byte(reader);
- if (!st && node->n_bytes == 1 && node->buf[0] == 'a' && next != ':' &&
- !is_PN_CHARS_BASE((uint32_t)next)) {
- pop_node(reader, *dest);
- *dest = push_node(reader, SERD_URI, NS_RDF "type", 47);
- return SERD_SUCCESS;
+ // Try to read as a prefixed name
+ bool ate_dot = false;
+ SerdStatus st = read_PrefixedName(reader, p, &ate_dot);
+
+ if (st == SERD_FAILURE) {
+ // Check if this is actually the "a" shorthand
+ const SerdNode* const node = deref(reader, p);
+ if (node->n_bytes == 1 && node->buf[0] == 'a') {
+ pop_node(reader, p);
+ p = push_node(reader, SERD_URI, NS_RDF "type", 47);
+ st = SERD_SUCCESS;
+ } else {
+ st = SERD_ERR_BAD_SYNTAX;
+ }
}
- if (st > SERD_FAILURE ||
- (st = read_PrefixedName(reader, *dest, false, &ate_dot)) || ate_dot) {
- *dest = pop_node(reader, *dest);
- st = st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX;
+ if (st) {
+ pop_node(reader, p);
return r_err(reader, st, "bad verb\n");
}
+ *dest = p;
return SERD_SUCCESS;
}
@@ -981,9 +967,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader,
SerdNode* n = deref(reader, ref);
if (n->buf[n->n_bytes - 1] == '.' && read_PN_CHARS(reader, ref)) {
// Ate trailing dot, pop it from stack/node and inform caller
- --n->n_bytes;
- serd_stack_pop(&reader->stack, 1);
- *ate_dot = true;
+ *ate_dot = pop_last_node_char(reader, n);
}
if (fancy_syntax(reader)) {
@@ -1078,6 +1062,40 @@ read_anon(SerdReader* const reader,
: SERD_ERR_BAD_SYNTAX;
}
+// Read a "named" object: a boolean literal or a prefixed name
+static SerdStatus
+read_named_object(SerdReader* const reader,
+ Ref* const dest,
+ Ref* const datatype,
+ bool* const ate_dot)
+{
+ static const char* const XSD_BOOLEAN = NS_XSD "boolean";
+ static const size_t XSD_BOOLEAN_LEN = 40;
+
+ // Try to read as a prefixed name
+ const Ref o = push_node(reader, SERD_CURIE, "", 0);
+ SerdStatus st = read_PrefixedName(reader, o, ate_dot);
+
+ if (st == SERD_FAILURE) {
+ // Check if this is actually a boolean literal
+ SerdNode* const node = deref(reader, o);
+ if ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4)) ||
+ (node->n_bytes == 5 && !memcmp(node->buf, "false", 5))) {
+ node->type = SERD_LITERAL;
+ *datatype = push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN);
+ st = SERD_SUCCESS;
+ }
+ }
+
+ if (st) {
+ pop_node(reader, o);
+ return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected prefixed name\n");
+ }
+
+ *dest = o;
+ return SERD_SUCCESS;
+}
+
/* If emit is true: recurses, calling statement_sink for every statement
encountered, and leaves stack in original calling state (i.e. pops
everything it pushes). */
@@ -1087,9 +1105,6 @@ read_object(SerdReader* const reader,
const bool emit,
bool* const ate_dot)
{
- static const char* const XSD_BOOLEAN = NS_XSD "boolean";
- static const size_t XSD_BOOLEAN_LEN = 40;
-
#ifndef NDEBUG
const size_t orig_stack_size = reader->stack.size;
#endif
@@ -1097,7 +1112,6 @@ read_object(SerdReader* const reader,
SerdStatus st = SERD_FAILURE;
bool simple = (ctx->subject != 0);
- SerdNode* node = NULL;
Ref o = 0;
Ref datatype = 0;
Ref lang = 0;
@@ -1147,27 +1161,8 @@ read_object(SerdReader* const reader,
st = read_literal(reader, &o, &datatype, &lang, &flags, ate_dot);
break;
default:
- /* Either a boolean literal, or a qname. Read the prefix first, and if
- it is in fact a "true" or "false" literal, produce that instead.
- */
- o = push_node(reader, SERD_CURIE, "", 0);
- while (!read_PN_CHARS_BASE(reader, o)) {
- }
- node = deref(reader, o);
- if ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4)) ||
- (node->n_bytes == 5 && !memcmp(node->buf, "false", 5))) {
- node->type = SERD_LITERAL;
- datatype = push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN);
- st = SERD_SUCCESS;
- } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) {
- st = SERD_ERR_BAD_SYNTAX;
- } else {
- if ((st = read_PrefixedName(reader, o, false, ate_dot))) {
- st = st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX;
- pop_node(reader, o);
- return r_err(reader, st, "expected prefixed name\n");
- }
- }
+ // Either a boolean literal or a prefixed name
+ st = read_named_object(reader, &o, &datatype, ate_dot);
}
if (!st && simple && o) {
@@ -1444,12 +1439,12 @@ read_prefixID(SerdReader* const reader, const bool sparql, const bool token)
}
read_ws_star(reader);
- Ref name = push_node(reader, SERD_LITERAL, "", 0);
- TRY_FAILING(st, read_PN_PREFIX(reader, name));
-
- if (eat_byte_check(reader, ':') != ':') {
+ Ref name = push_node(reader, SERD_LITERAL, "", 0);
+ bool ate_dot = false;
+ TRY_FAILING(st, read_PN_PREFIX(reader, name, &ate_dot));
+ if (ate_dot || eat_byte_check(reader, ':') != ':') {
pop_node(reader, name);
- return SERD_ERR_BAD_SYNTAX;
+ return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected a prefix name\n");
}
read_ws_star(reader);