diff options
-rw-r--r-- | NEWS | 7 | ||||
-rw-r--r-- | doc/serdi.1 | 10 | ||||
-rw-r--r-- | serd/serd.h | 34 | ||||
-rw-r--r-- | src/reader.c | 78 | ||||
-rw-r--r-- | src/serdi.c | 19 | ||||
-rw-r--r-- | wscript | 4 |
6 files changed, 135 insertions, 17 deletions
@@ -1,3 +1,10 @@ +serd (9999) unstable; + + * Add incremental read interface suitable for reading from infinite streams. + * Add -e option to serdi to use incremental reading. + + -- David Robillard <d@drobilla.net> + serd (0.14.0) stable; * Use path variables in pkgconfig files diff --git a/doc/serdi.1 b/doc/serdi.1 index 23d77ac3..45001c90 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -1,4 +1,4 @@ -.TH SERDI 1 "17 Jan 2012" +.TH SERDI 1 "08 May 2012" .SH NAME .B serdi \- Read and write RDF syntax @@ -17,6 +17,14 @@ Fast bulk output for large serialisations. Chop PREFIX from matching blank node IDs. .TP +\fB\-e\fR +Eat input one character at a time, rather than a page at a time which is the +default. This is useful when reading from a pipe since output will be +generated immediately as input arrives, rather than waiting until an entire +page of input has arrived. With this option serdi uses one page less memory, +but will likely be significantly slower. + +.TP \fB\-f\fR Keep full URIs in input (don't qualify). diff --git a/serd/serd.h b/serd/serd.h index 807664dd..cc2365f4 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -705,6 +705,40 @@ serd_reader_read_file(SerdReader* reader, const uint8_t* uri); /** + Start an incremental read from a file handle. + + Iff @p bulk is true, @p file will be read a page at a time. This is more + efficient, but uses a page of memory and means that an entire page of input + must be ready before any callbacks will fire. To react as soon as input + arrives, set @p bulk to false. +*/ +SERD_API +SerdStatus +serd_reader_start_stream(SerdReader* me, + FILE* file, + const uint8_t* name, + bool bulk); + +/** + Read a single "chunk" of data during an incremental read. + + This function will read a single top level description, and return. This + may be a directive, statement, or several statements; essentially it reads + until a '.' is encountered. This is particularly useful for reading + directly from a pipe or socket. +*/ +SERD_API +SerdStatus +serd_reader_read_chunk(SerdReader* me); + +/** + Finish an incremental read from a file handle. +*/ +SERD_API +SerdStatus +serd_reader_end_stream(SerdReader* me); + +/** Read @c file. */ SERD_API diff --git a/src/reader.c b/src/reader.c index 70dc7fe5..5c623039 100644 --- a/src/reader.c +++ b/src/reader.c @@ -78,7 +78,9 @@ struct SerdReaderImpl { unsigned next_id; uint8_t* read_buf; int32_t read_head; ///< Offset into read_buf + uint8_t read_byte; ///< 1-byte 'buffer' used when not paging bool from_file; ///< True iff reading from @ref fd + bool paging; ///< True iff reading a page at a time bool eof; bool seen_genid; #ifdef SERD_STACK_CHECK @@ -124,14 +126,19 @@ static inline uint8_t eat_byte_safe(SerdReader* reader, const uint8_t byte) { assert(peek_byte(reader) == byte); - ++reader->read_head; switch (byte) { case '\0': reader->eof = true; break; case '\n': ++reader->cur.line; reader->cur.col = 0; break; default: ++reader->cur.col; } - if (reader->from_file && (reader->read_head == SERD_PAGE_SIZE)) { + if (reader->from_file && !reader->paging) { + const int c = fgetc(reader->fd); + reader->read_byte = (c == EOF) ? 0 : (uint8_t)c; + if (c == EOF) { + reader->eof = true; + } + } else if (++reader->read_head == SERD_PAGE_SIZE && reader->paging) { page(reader); } return byte; @@ -548,7 +555,7 @@ read_comment(SerdReader* reader) { eat_byte_safe(reader, '#'); uint8_t c; - while (((c = peek_byte(reader)) != 0xA) && (c != 0xD)) { + while (((c = peek_byte(reader)) != 0xA) && (c != 0xD) && c) { eat_byte_safe(reader, c); } } @@ -1458,14 +1465,17 @@ static void skip_bom(SerdReader* me) { const uint8_t* const b = me->read_buf; - if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) { + if (me->paging && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) { me->read_head += 3; } } SERD_API SerdStatus -serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name) +serd_reader_start_stream(SerdReader* me, + FILE* file, + const uint8_t* name, + bool bulk) { const Cursor cur = { name, 1, 1 }; me->fd = file; @@ -1473,19 +1483,62 @@ serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name) me->cur = cur; me->from_file = true; me->eof = false; - me->read_buf = (uint8_t*)serd_bufalloc(SERD_PAGE_SIZE); + me->paging = bulk; + + if (bulk) { + me->read_buf = (uint8_t*)serd_bufalloc(SERD_PAGE_SIZE); + memset(me->read_buf, '\0', SERD_PAGE_SIZE); + SerdStatus st = page(me); + if (st) { + serd_reader_end_stream(me); + return st; + } + skip_bom(me); + } else { + me->read_buf = &me->read_byte; + me->read_byte = 0; // Don't read to avoid potentially blocking + } - memset(me->read_buf, '\0', SERD_PAGE_SIZE); + return SERD_SUCCESS; +} - SerdStatus st = page(me); - if (!st) { - skip_bom(me); - st = read_turtleDoc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN; +SERD_API +SerdStatus +serd_reader_read_chunk(SerdReader* me) +{ + if (!me->read_byte) { + // Read initial byte + const int c = fgetc(me->fd); + me->read_byte = (c == EOF) ? 0 : (uint8_t)c; + if (c == EOF) { + me->eof = true; + return SERD_FAILURE; + } } + return read_statement(me) ? SERD_SUCCESS : SERD_FAILURE; +} - free(me->read_buf); +SERD_API +SerdStatus +serd_reader_end_stream(SerdReader* me) +{ + if (me->paging) { + free(me->read_buf); + } me->fd = 0; me->read_buf = NULL; + return SERD_SUCCESS; +} + +SERD_API +SerdStatus +serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name) +{ + SerdStatus st = serd_reader_start_stream(me, file, name, true); + if (!st) { + st = read_turtleDoc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN; + serd_reader_end_stream(me); + } return st; } @@ -1499,6 +1552,7 @@ serd_reader_read_string(SerdReader* me, const uint8_t* utf8) me->read_head = 0; me->cur = cur; me->from_file = false; + me->paging = false; me->eof = false; skip_bom(me); diff --git a/src/serdi.c b/src/serdi.c index 74a84992..4c482884 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -46,6 +46,7 @@ print_usage(const char* name, bool error) fprintf(os, "Use - for INPUT to read from standard input.\n\n"); fprintf(os, " -b Fast bulk output for large serialisations.\n"); fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs.\n"); + fprintf(os, " -e Eat input one character at a time.\n"); fprintf(os, " -f Keep full URIs in input (don't qualify).\n"); fprintf(os, " -h Display this help and exit.\n"); fprintf(os, " -i SYNTAX Input syntax (`turtle' or `ntriples').\n"); @@ -89,6 +90,7 @@ main(int argc, char** argv) SerdSyntax input_syntax = SERD_TURTLE; SerdSyntax output_syntax = SERD_NTRIPLES; bool from_file = true; + bool bulk_read = true; bool bulk_write = false; bool full_uris = false; const uint8_t* in_name = NULL; @@ -103,6 +105,8 @@ main(int argc, char** argv) break; } else if (argv[a][1] == 'b') { bulk_write = true; + } else if (argv[a][1] == 'e') { + bulk_read = false; } else if (argv[a][1] == 'f') { full_uris = true; } else if (argv[a][1] == 'h') { @@ -206,9 +210,18 @@ main(int argc, char** argv) serd_writer_chop_blank_prefix(writer, chop_prefix); serd_reader_add_blank_prefix(reader, add_prefix); - const SerdStatus status = (from_file) - ? serd_reader_read_file_handle(reader, in_fd, in_name) - : serd_reader_read_string(reader, input); + SerdStatus status = SERD_SUCCESS; + if (!from_file) { + status = serd_reader_read_string(reader, input); + } else if (bulk_read) { + status = serd_reader_read_file_handle(reader, in_fd, in_name); + } else { + status = serd_reader_start_stream(reader, in_fd, in_name, false); + while (!status) { + status = serd_reader_read_chunk(reader); + } + serd_reader_end_stream(reader); + } serd_reader_free(reader); @@ -9,7 +9,7 @@ from waflib.extras import autowaf as autowaf import waflib.Logs as Logs, waflib.Options as Options # Version of this package (even if built as a child) -SERD_VERSION = '0.14.0' +SERD_VERSION = '0.15.0' SERD_MAJOR_VERSION = '0' # Library version (UNIX style major, minor, micro) @@ -389,6 +389,8 @@ def test(ctx): flags += ' -f' if (num % 3 == 0): flags += ' -r http://www.w3.org/' + if (num % 7 == 0): + flags += ' -e' base_uri = 'http://www.w3.org/2001/sw/DataAccess/df1/' + test.replace('\\', '/') out_filename = test + '.thru' commands += [ |