// Copyright 2011-2022 David Robillard // SPDX-License-Identifier: ISC #ifndef SERD_READER_H #define SERD_READER_H #include "serd/attributes.h" #include "serd/env.h" #include "serd/input_stream.h" #include "serd/node.h" #include "serd/sink.h" #include "serd/status.h" #include "serd/syntax.h" #include "serd/world.h" #include "zix/attributes.h" #include #include SERD_BEGIN_DECLS /** @defgroup serd_reader Reader @ingroup serd_reading_writing @{ */ /// Streaming parser that reads a text stream and writes to a statement sink typedef struct SerdReaderImpl SerdReader; /// Reader options typedef enum { /** Tolerate invalid input where possible. This will attempt to ignore invalid input and continue reading. Invalid Unicode characters will be replaced with the replacement character, and various other syntactic problems will be ignored. If there are more severe problems, the reader will try to skip the statement and continue parsing. This should work reasonably well for line-based syntaxes like NTriples and NQuads, but abbreviated Turtle or TriG may not recover. Note that this flag should be used carefully, since it can result in data loss. */ SERD_READ_LAX = 1U << 0U, /** Support reading variable nodes. As an extension, serd supports reading variables nodes with SPARQL-like syntax, for example "?foo" or "$bar". This can be used for storing graph patterns and templates. */ SERD_READ_VARIABLES = 1U << 1U, /** Read prefixed name (CURIE) references exactly without expanding them. Normally, the reader expands all prefixed names to full URIs based on the prefixes in the current environment, and considers failure to expand a syntax error. This flag disables that expansion so prefixed names will be emitted directly as CURIE nodes. Note that these nodes rely on some context which can change over time, and may even be undefined initially, so this flag should be used with caution. Most applications should leave it off and avoid using CURIE nodes entirely, because they are error-prone compared to working with complete URIs. However, it can be useful for error-tolerance, or in constrained or high-performance streaming contexts. For example, to re-indent a Turtle file and ignore any possibly undefined prefixed names, this flag can be used to disable expansion, which also boosts performance since it avoids the lookup and expansion overhead. */ SERD_READ_CURIES = 1U << 2U, /** Read relative URI references exactly without resolving them. Normally, the reader expands all relative URIs against the base URI. This flag disables that, so that URI references are passed to the sink exactly as they are in the input. */ SERD_READ_RELATIVE = 1U << 3U, /** Read blank node labels without adding a prefix unique to the document. Normally, the reader adds a prefix like "f1", "f2", and so on, to blank node labels, to separate the namespaces from separate input documents. This flag disables that, so that blank node labels will be read without any prefix added. Note that this flag should be used carefully, since it can result in data corruption. Specifically, if data from separate documents parsed with this flag is combined, the IDs from each document may clash. */ SERD_READ_GLOBAL = 1U << 4U, /** Read generated blank node labels exactly without adjusting them. Normally, the reader will adapt blank node labels in the input that clash with its scheme for generating new ones, for example mapping "_:b123" to "_:B123". This flag disables that, so that blank node labels are passed to the sink exactly as they are in the input. Note that this flag should be used carefully, since it can result in data corruption. Specifically, if the input is a syntax like Turtle with anonymous nodes, the generated IDs for those nodes may clash with IDs from the input document. */ SERD_READ_GENERATED = 1U << 5U, /** Generate blank node labels with suffixes left-padded with zeros. This is useful because it makes generated blank node IDs like "_:b0000000123" match the numerical order when compared as strings (or as nodes). In particular, this can be used to preserve blank node ordering from documents when the statements are sorted, such as in a model. */ SERD_READ_ORDERED = 1U << 6U, /** Read URIs with percent-encoded UTF-8 characters decoded. Normally, percent-encoded octets in URIs are treated as plain text and preserved. This flags enables UTF-8 decoding of URIs, so octet escapes like "%7E" in URIs will be decoded to UTF-8 characters like "~". */ SERD_READ_DECODED = 1U << 7U, } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values typedef uint32_t SerdReaderFlags; /// Create a new RDF reader SERD_API SerdReader* ZIX_ALLOCATED serd_reader_new(SerdWorld* ZIX_NONNULL world, SerdSyntax syntax, SerdReaderFlags flags, SerdEnv* ZIX_NONNULL env, const SerdSink* ZIX_NONNULL sink); /** Prepare to read some input. This sets up the reader to read from the given input, but will not read any bytes from it. This should be followed by serd_reader_read_chunk() or serd_reader_read_document() to actually read the input. @param reader The reader. @param input An opened input stream to read from. @param input_name The name of the input stream for error messages. @param block_size The number of bytes to read from the stream at once. */ SERD_API SerdStatus serd_reader_start(SerdReader* ZIX_NONNULL reader, SerdInputStream* ZIX_NONNULL input, const SerdNode* ZIX_NULLABLE input_name, size_t block_size); /** Read a single "chunk" of data during an incremental read. This function will read a single top level description, and return. This may be a directive, statement, or several statements; essentially it reads until a '.' is encountered. This is particularly useful for reading directly from a pipe or socket. */ SERD_API SerdStatus serd_reader_read_chunk(SerdReader* ZIX_NONNULL reader); /** Read a complete document from the source. This function will continue pulling from the source until a complete document has been read. Note that this may block when used with streams, for incremental reading use serd_reader_read_chunk(). */ SERD_API SerdStatus serd_reader_read_document(SerdReader* ZIX_NONNULL reader); /** Finish reading from the source. This should be called before starting to read from another source. */ SERD_API SerdStatus serd_reader_finish(SerdReader* ZIX_NONNULL reader); /** Skip over bytes in the input until a specific byte is encountered. Typically used for recording from errors in a line-based syntax by skipping ahead to the next newline. @return #SERD_SUCCESS if the given byte was reached, or #SERD_FAILURE if the end of input is reached. */ SERD_API SerdStatus serd_reader_skip_until_byte(SerdReader* ZIX_NONNULL reader, uint8_t byte); /** Free `reader`. The reader will be finished via `serd_reader_finish()` if necessary. */ SERD_API void serd_reader_free(SerdReader* ZIX_NULLABLE reader); /** @} */ SERD_END_DECLS #endif // SERD_READER_H