From b92d598a22fdad8c96a1167362d4bb79015af006 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Wed, 18 May 2011 02:00:03 +0000 Subject: Add test to ensure blank node IDs don't clash with generated IDs. Add handle destructor parameter to serd_reader_new. Add serd_reader_get_handle. Rename serd_reader_set_blank_prefix to serd_reader_add_blank_prefix. Rename serd_reader_read_file to serd_reader_read_file_handle. Add new serd_reader_read_file that takes a path/URI parameter. Add serdi -i option to select input syntax. Add serdi -p and -c options to add/chop a prefix to/from blank IDs. Add optional base_uri parameter to serd_env_new. Add serd_writer_chop_blank_prefix. Bump version to 0.3.0. git-svn-id: http://svn.drobilla.net/serd/trunk@183 490d8e77-9747-427b-9fa3-0b8f29cee8a0 --- serd/serd.h | 33 ++++++++++-- src/env.c | 5 +- src/reader.c | 159 ++++++++++++++++++++++++++++++++++++++++++++---------- src/serdi.c | 94 ++++++++++++++++++++++---------- src/writer.c | 52 ++++++++++++++---- tests/test-id.out | 2 + tests/test-id.ttl | 4 ++ wscript | 17 +++--- 8 files changed, 286 insertions(+), 80 deletions(-) create mode 100644 tests/test-id.out create mode 100644 tests/test-id.ttl diff --git a/serd/serd.h b/serd/serd.h index 4d87c1a3..977c9e51 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -428,7 +428,7 @@ typedef SerdStatus (*SerdEndSink)(void* handle, */ SERD_API SerdEnv* -serd_env_new(); +serd_env_new(const SerdNode* base_uri); /** Free @a ns. @@ -512,11 +512,19 @@ SERD_API SerdReader* serd_reader_new(SerdSyntax syntax, void* handle, + void (*free_handle)(void*), SerdBaseSink base_sink, SerdPrefixSink prefix_sink, SerdStatementSink statement_sink, SerdEndSink end_sink); +/** + Return the @c handle passed to @ref serd_reader_new. +*/ +SERD_API +void* +serd_reader_get_handle(const SerdReader* reader); + /** Set a prefix to be added to all blank node identifiers. @@ -528,17 +536,26 @@ serd_reader_new(SerdSyntax syntax, */ SERD_API void -serd_reader_set_blank_prefix(SerdReader* reader, +serd_reader_add_blank_prefix(SerdReader* reader, const uint8_t* prefix); /** Read @a file. + @param Path or file: URI of file to read. */ SERD_API SerdStatus serd_reader_read_file(SerdReader* reader, - FILE* file, - const uint8_t* name); + const uint8_t* uri); + +/** + Read @a file. +*/ +SERD_API +SerdStatus +serd_reader_read_file_handle(SerdReader* reader, + FILE* file, + const uint8_t* name); /** Read @a utf8. @@ -579,6 +596,14 @@ SERD_API void serd_writer_free(SerdWriter* writer); +/** + Set a prefix to be removed from matching blank node identifiers. +*/ +SERD_API +void +serd_writer_chop_blank_prefix(SerdWriter* writer, + const uint8_t* prefix); + /** Set the current output base URI (and emit directive if applicable). diff --git a/src/env.c b/src/env.c index 64fccaac..a9b8e474 100644 --- a/src/env.c +++ b/src/env.c @@ -35,13 +35,16 @@ struct SerdEnvImpl { SERD_API SerdEnv* -serd_env_new() +serd_env_new(const SerdNode* base_uri) { SerdEnv* env = malloc(sizeof(struct SerdEnvImpl)); env->prefixes = NULL; env->n_prefixes = 0; env->base_uri_node = SERD_NODE_NULL; env->base_uri = SERD_URI_NULL; + if (base_uri) { + serd_env_set_base_uri(env, base_uri); + } return env; } diff --git a/src/reader.c b/src/reader.c index 44910ddf..12109ad3 100644 --- a/src/reader.c +++ b/src/reader.c @@ -65,6 +65,7 @@ static const Node INTERNAL_NODE_NULL = { 0, 0 }; struct SerdReaderImpl { void* handle; + void (*free_handle)(void*); SerdBaseSink base_sink; SerdPrefixSink prefix_sink; SerdStatementSink statement_sink; @@ -75,9 +76,11 @@ struct SerdReaderImpl { Node rdf_nil; FILE* fd; SerdStack stack; + SerdSyntax syntax; Cursor cur; uint8_t* buf; - const uint8_t* blank_prefix; + uint8_t* bprefix; + size_t bprefix_len; unsigned next_id; int err; uint8_t* read_buf; @@ -241,6 +244,23 @@ push_byte(SerdReader* reader, Ref ref, const uint8_t c) str->buf[str->n_bytes] = '\0'; } +static inline void +append_string(SerdReader* reader, Ref ref, const uint8_t* suffix) +{ + #ifdef SERD_STACK_CHECK + assert(stack_is_top_string(reader, ref)); + #endif + size_t n_bytes; + uint32_t flags = 0; + size_t n_chars = serd_strlen(suffix, &n_bytes, &flags); + serd_stack_push(&reader->stack, n_bytes); + SerdString* const str = deref(reader, ref); + assert(str->n_bytes >= str->n_chars); + memcpy(str->buf + str->n_bytes, suffix, n_bytes + 1); + str->n_bytes += n_bytes; + str->n_chars += n_chars; +} + static void pop_string(SerdReader* reader, Ref ref) { @@ -968,20 +988,33 @@ read_nodeID(SerdReader* reader) { eat_byte(reader, '_'); eat_byte(reader, ':'); - Ref str = push_string(reader, "", 0); - return read_name(reader, str, true); + Ref ref = push_string(reader, "", 0); + read_name(reader, ref, true); + SerdString* const str = deref(reader, ref); + if (reader->syntax == SERD_TURTLE + && !strncmp((const char*)str->buf, "genid", 5)) { + // Replace "genid" nodes with "docid" to prevent clashing + memcpy(str->buf, "docid", 5); + } + return ref; } static Ref blank_id(SerdReader* reader) { - const char* prefix = reader->blank_prefix - ? (const char*)reader->blank_prefix - : "genid"; - char str[32]; // FIXME: ensure length of reader->blank_prefix is OK - const int len = snprintf(str, sizeof(str), "%s%u", - prefix, reader->next_id++); - return push_string(reader, str, len); + Ref str; + if (reader->bprefix) { + str = push_string(reader, + (const char*)reader->bprefix, + reader->bprefix_len); + } else { + str = push_string(reader, "", 0); + } + char num[32]; + snprintf(num, sizeof(num), "%u", reader->next_id++); + append_string(reader, str, (const uint8_t*)"genid"); + append_string(reader, str, (const uint8_t*)num); + return str; } // Spec: [21] blank ::= nodeID | '[]' @@ -1368,29 +1401,33 @@ SERD_API SerdReader* serd_reader_new(SerdSyntax syntax, void* handle, + void (*free_handle)(void*), SerdBaseSink base_sink, SerdPrefixSink prefix_sink, SerdStatementSink statement_sink, SerdEndSink end_sink) { const Cursor cur = { NULL, 0, 0 }; - SerdReader* me = malloc(sizeof(struct SerdReaderImpl)); - me->handle = handle; - me->base_sink = base_sink; - me->prefix_sink = prefix_sink; - me->statement_sink = statement_sink; - me->end_sink = end_sink; - me->fd = 0; - me->stack = serd_stack_new(STACK_PAGE_SIZE); - me->cur = cur; - me->blank_prefix = NULL; - me->next_id = 1; - me->read_buf = 0; - me->read_head = 0; - me->eof = false; + SerdReader* me = malloc(sizeof(struct SerdReaderImpl)); + me->handle = handle; + me->free_handle = free_handle; + me->base_sink = base_sink; + me->prefix_sink = prefix_sink; + me->statement_sink = statement_sink; + me->end_sink = end_sink; + me->fd = 0; + me->stack = serd_stack_new(STACK_PAGE_SIZE); + me->syntax = syntax; + me->cur = cur; + me->bprefix = NULL; + me->bprefix_len = 0; + me->next_id = 1; + me->read_buf = 0; + me->read_head = 0; + me->eof = false; #ifdef SERD_STACK_CHECK - me->alloc_stack = 0; - me->n_allocs = 0; + me->alloc_stack = 0; + me->n_allocs = 0; #endif #define RDF_FIRST NS_RDF "first" @@ -1415,20 +1452,84 @@ serd_reader_free(SerdReader* reader) free(reader->alloc_stack); #endif free(reader->stack.buf); + free(reader->bprefix); + if (reader->free_handle) { + reader->free_handle(reader->handle); + } free(reader); } +SERD_API +void* +serd_reader_get_handle(const SerdReader* reader) +{ + return reader->handle; +} + SERD_API void -serd_reader_set_blank_prefix(SerdReader* reader, +serd_reader_add_blank_prefix(SerdReader* reader, const uint8_t* prefix) { - reader->blank_prefix = prefix; + if (reader->bprefix) { + free(reader->bprefix); + reader->bprefix_len = 0; + reader->bprefix = NULL; + } + if (prefix) { + reader->bprefix_len = strlen((const char*)prefix); + reader->bprefix = malloc(reader->bprefix_len + 1); + memcpy(reader->bprefix, prefix, reader->bprefix_len + 1); + } +} + +static const uint8_t* +file_uri_to_path(const uint8_t* uri) +{ + const uint8_t* filename = NULL; + if (serd_uri_string_has_scheme(uri)) { + // Absolute URI, ensure it a file and chop scheme + if (strncmp((const char*)uri, "file:", 5)) { + fprintf(stderr, "Unsupported URI scheme `%s'\n", uri); + return NULL; +#ifdef __WIN32__ + } else if (!strncmp((const char*)uri, "file:///", 8)) { + filename = uri + 8; +#else + } else if (!strncmp((const char*)uri, "file://", 7)) { + filename = uri + 7; +#endif + } else { + filename = uri + 5; + } + } else { + filename = uri; + } + return filename; +} + +SERD_API +SerdStatus +serd_reader_read_file(SerdReader* reader, + const uint8_t* uri) +{ + const uint8_t* path = file_uri_to_path(uri); + if (!path) { + return SERD_ERR_BAD_ARG; + } + + FILE* fd = fopen((const char*)path, "r"); + if (!fd) { + return SERD_ERR_UNKNOWN; + } + SerdStatus ret = serd_reader_read_file_handle(reader, fd, path); + fclose(fd); + return ret; } SERD_API SerdStatus -serd_reader_read_file(SerdReader* me, FILE* file, const uint8_t* name) +serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name) { const Cursor cur = { name, 1, 1 }; me->fd = file; diff --git a/src/serdi.c b/src/serdi.c index bc7c9d18..833e92d2 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -45,8 +45,11 @@ print_usage(const char* name, bool error) fprintf(os, "Read and write RDF syntax.\n"); fprintf(os, "Use - for INPUT to read from standard input.\n\n"); fprintf(os, " -h Display this help and exit\n"); + fprintf(os, " -i SYNTAX Input syntax (`turtle' or `ntriples')\n"); fprintf(os, " -o SYNTAX Output syntax (`turtle' or `ntriples')\n"); fprintf(os, " -s INPUT Parse INPUT as string (terminates options)\n"); + fprintf(os, " -p PREFIX Add PREFIX to blank node IDs\n"); + fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs\n"); fprintf(os, " -v Display version information and exit\n"); return error ? 1 : 0; } @@ -58,6 +61,20 @@ file_sink(const void* buf, size_t len, void* stream) return fwrite(buf, 1, len, file); } +bool +set_syntax(SerdSyntax* syntax, const char* name) +{ + if (!strcmp(name, "turtle")) { + *syntax = SERD_TURTLE; + } else if (!strcmp(name, "ntriples")) { + *syntax = SERD_NTRIPLES; + } else { + fprintf(stderr, "Unknown input format `%s'\n", name); + return false; + } + return true; +} + int main(int argc, char** argv) { @@ -65,14 +82,17 @@ main(int argc, char** argv) return print_usage(argv[0], true); } - FILE* in_fd = NULL; - SerdSyntax output_syntax = SERD_NTRIPLES; - bool from_file = true; - const char* in_name = NULL; + FILE* in_fd = NULL; + SerdSyntax input_syntax = SERD_TURTLE; + SerdSyntax output_syntax = SERD_NTRIPLES; + bool from_file = true; + const uint8_t* in_name = NULL; + const uint8_t* add_prefix = NULL; + const uint8_t* chop_prefix = NULL; int a = 1; for (; a < argc && argv[a][0] == '-'; ++a) { if (argv[a][1] == '\0') { - in_name = "(stdin)"; + in_name = (const uint8_t*)"(stdin)"; in_fd = stdin; break; } else if (argv[a][1] == 'h') { @@ -80,23 +100,38 @@ main(int argc, char** argv) } else if (argv[a][1] == 'v') { return print_version(); } else if (argv[a][1] == 's') { - in_name = "(string)"; + in_name = (const uint8_t*)"(string)"; from_file = false; ++a; break; + } else if (argv[a][1] == 'i') { + if (++a == argc) { + fprintf(stderr, "Missing value for -i\n"); + return 1; + } + if (!set_syntax(&input_syntax, argv[a])) { + return 1; + } } else if (argv[a][1] == 'o') { if (++a == argc) { fprintf(stderr, "Missing value for -o\n"); return 1; } - if (!strcmp(argv[a], "turtle")) { - output_syntax = SERD_TURTLE; - } else if (!strcmp(argv[a], "ntriples")) { - output_syntax = SERD_NTRIPLES; - } else { - fprintf(stderr, "Unknown output format `%s'\n", argv[a]); + if (!set_syntax(&output_syntax, argv[a])) { return 1; } + } else if (argv[a][1] == 'p') { + if (++a == argc) { + fprintf(stderr, "Missing value for -p\n"); + return 1; + } + add_prefix = (const uint8_t*)argv[a]; + } else if (argv[a][1] == 'c') { + if (++a == argc) { + fprintf(stderr, "Missing value for -c\n"); + return 1; + } + chop_prefix = (const uint8_t*)argv[a]; } else { fprintf(stderr, "Unknown option `%s'\n", argv[a]); return print_usage(argv[0], true); @@ -105,7 +140,7 @@ main(int argc, char** argv) const uint8_t* input = (const uint8_t*)argv[a++]; if (from_file) { - in_name = in_name ? in_name : (const char*)input; + in_name = in_name ? in_name : input; if (!in_fd) { if (serd_uri_string_has_scheme(input)) { // INPUT is an absolute URI, ensure it a file and chop scheme @@ -132,27 +167,25 @@ main(int argc, char** argv) } const uint8_t* base_uri_str = NULL; - SerdURI base_uri; if (a < argc) { // Base URI given on command line - const uint8_t* const in_base_uri = (const uint8_t*)argv[a]; - if (serd_uri_parse((const uint8_t*)in_base_uri, &base_uri)) { - fprintf(stderr, "Invalid base URI <%s>\n", argv[2]); - return 1; - } - base_uri_str = in_base_uri; + base_uri_str = (const uint8_t*)argv[a]; } else if (from_file) { // Use input file URI base_uri_str = input; } else { base_uri_str = (const uint8_t*)""; } - if (serd_uri_parse(base_uri_str, &base_uri)) { + SerdURI base_uri = SERD_URI_NULL; + SerdNode base_uri_node = serd_node_new_uri_from_string( + base_uri_str, &base_uri, &base_uri); + + if (!base_uri_node.buf) { fprintf(stderr, "Invalid base URI <%s>\n", base_uri_str); return 1; } FILE* out_fd = stdout; - SerdEnv* env = serd_env_new(); + SerdEnv* env = serd_env_new(&base_uri_node); SerdStyle output_style = SERD_STYLE_RESOLVED; if (output_syntax == SERD_NTRIPLES) { @@ -161,24 +194,28 @@ main(int argc, char** argv) output_style |= SERD_STYLE_ABBREVIATED; } - SerdNode base_uri_node = serd_node_from_string(SERD_URI, base_uri_str); - serd_env_set_base_uri(env, &base_uri_node); - serd_env_get_base_uri(env, &base_uri); - SerdWriter* writer = serd_writer_new( output_syntax, output_style, env, &base_uri, file_sink, out_fd); + if (chop_prefix) { + serd_writer_chop_blank_prefix(writer, chop_prefix); + } + State state = { env, writer }; SerdReader* reader = serd_reader_new( - SERD_TURTLE, state.writer, + input_syntax, state.writer, NULL, (SerdBaseSink)serd_writer_set_base_uri, (SerdPrefixSink)serd_writer_set_prefix, (SerdStatementSink)serd_writer_write_statement, (SerdEndSink)serd_writer_end_anon); + if (add_prefix) { + serd_reader_add_blank_prefix(reader, add_prefix); + } + const SerdStatus status = (from_file) - ? serd_reader_read_file(reader, in_fd, (const uint8_t*)in_name) + ? serd_reader_read_file_handle(reader, in_fd, in_name) : serd_reader_read_string(reader, input); serd_reader_free(reader); @@ -190,6 +227,7 @@ main(int argc, char** argv) serd_writer_finish(state.writer); serd_writer_free(state.writer); serd_env_free(state.env); + serd_node_free(&base_uri_node); return (status == SERD_SUCCESS) ? 0 : 1; } diff --git a/src/writer.c b/src/writer.c index d0120cde..fafde07c 100644 --- a/src/writer.c +++ b/src/writer.c @@ -44,6 +44,8 @@ struct SerdWriterImpl { SerdSink sink; void* stream; WriteContext context; + uint8_t* bprefix; + size_t bprefix_len; unsigned indent; bool empty; }; @@ -214,7 +216,15 @@ write_node(SerdWriter* writer, } // else fall through case SERD_BLANK_ID: writer->sink("_:", 2, writer->stream); - writer->sink(node->buf, node->n_bytes, writer->stream); + if (writer->bprefix + && !strncmp((const char*)node->buf, (const char*)writer->bprefix, + writer->bprefix_len)) { + writer->sink(node->buf + writer->bprefix_len, + node->n_bytes - writer->bprefix_len, + writer->stream); + } else { + writer->sink(node->buf, node->n_bytes, writer->stream); + } break; case SERD_CURIE: switch (writer->syntax) { @@ -445,19 +455,38 @@ serd_writer_new(SerdSyntax syntax, { const WriteContext context = WRITE_CONTEXT_NULL; SerdWriter* writer = malloc(sizeof(struct SerdWriterImpl)); - writer->syntax = syntax; - writer->style = style; - writer->env = env; - writer->base_uri = base_uri ? *base_uri : SERD_URI_NULL; - writer->anon_stack = serd_stack_new(sizeof(WriteContext)); - writer->sink = sink; - writer->stream = stream; - writer->context = context; - writer->indent = 0; - writer->empty = true; + writer->syntax = syntax; + writer->style = style; + writer->env = env; + writer->base_uri = base_uri ? *base_uri : SERD_URI_NULL; + writer->anon_stack = serd_stack_new(sizeof(WriteContext)); + writer->sink = sink; + writer->stream = stream; + writer->context = context; + writer->bprefix = NULL; + writer->bprefix_len = 0; + writer->indent = 0; + writer->empty = true; return writer; } +SERD_API +void +serd_writer_chop_blank_prefix(SerdWriter* writer, + const uint8_t* prefix) +{ + if (writer->bprefix) { + free(writer->bprefix); + writer->bprefix_len = 0; + writer->bprefix = NULL; + } + if (prefix) { + writer->bprefix_len = strlen((const char*)prefix); + writer->bprefix = malloc(writer->bprefix_len + 1); + memcpy(writer->bprefix, prefix, writer->bprefix_len + 1); + } +} + SERD_API SerdStatus serd_writer_set_base_uri(SerdWriter* writer, @@ -512,5 +541,6 @@ serd_writer_free(SerdWriter* writer) SerdWriter* const me = (SerdWriter*)writer; serd_writer_finish(me); serd_stack_free(&writer->anon_stack); + free(writer->bprefix); free(me); } diff --git a/tests/test-id.out b/tests/test-id.out new file mode 100644 index 00000000..722fc6d1 --- /dev/null +++ b/tests/test-id.out @@ -0,0 +1,2 @@ + _:genid1 . +_:docid1 . diff --git a/tests/test-id.ttl b/tests/test-id.ttl new file mode 100644 index 00000000..6bc1c37d --- /dev/null +++ b/tests/test-id.ttl @@ -0,0 +1,4 @@ +@prefix : . + +:c :d [] . +_:genid1 :a :b . diff --git a/wscript b/wscript index 06c96040..16ae242a 100644 --- a/wscript +++ b/wscript @@ -9,7 +9,7 @@ from waflib.extras import autowaf as autowaf import waflib.Logs as Logs, waflib.Options as Options # Version of this package (even if built as a child) -SERD_VERSION = '0.2.0' +SERD_VERSION = '0.3.0' SERD_MAJOR_VERSION = '0' # Library version (UNIX style major, minor, micro) @@ -245,18 +245,21 @@ def test(ctx): autowaf.run_tests(ctx, APPNAME, commands, 1, name='bad') + thru_tests = good_tests + thru_tests.remove('tests/test-id.ttl') # IDs are mapped so files won't be identical + commands = [] - for test in good_tests: + for test in thru_tests: base_uri = 'http://www.w3.org/2001/sw/DataAccess/df1/' + test out_filename = test + '.thru' commands += [ - '%s -o turtle %s/%s \'%s\' | %s - \'%s\' > %s.thru' % ( - './serdi_static', srcdir, test, base_uri, - './serdi_static', base_uri, test) ] - + '%s -o turtle %s/%s \'%s\' | %s -i turtle - \'%s\' | sed \'s/_:docid/_:genid/g\' > %s.thru' % ( + './serdi_static', srcdir, test, base_uri, + './serdi_static', base_uri, test) ] + autowaf.run_tests(ctx, APPNAME, commands, 0, name='turtle-round-trip') Logs.pprint('BOLD', '\nVerifying ntriples => turtle => ntriples') - for test in good_tests: + for test in thru_tests: out_filename = test + '.thru' if not os.access(out_filename, os.F_OK): Logs.pprint('RED', 'FAIL: %s output is missing' % test) -- cgit v1.2.1