/*
  Copyright 2011-2021 David Robillard

  Permission to use, copy, modify, and/or distribute this software for any
  purpose with or without fee is hereby granted, provided that the above
  copyright notice and this permission notice appear in all copies.

  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include "console.h"
#include "system.h"

#include "serd/serd.h"

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Print a fixed message to stderr with the "serdi: " prefix.
   `msg` must be a string literal (it is concatenated at compile time). */
#define SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg)

/* Print a formatted message to stderr with the "serdi: " prefix.
   `fmt` must be a string literal and at least one argument is required. */
#define SERDI_ERRORF(fmt, ...) fprintf(stderr, "serdi: " fmt, __VA_ARGS__)

/* Nodes of the statement captured from a -F/-G pattern string.
   Each field is either NULL or a copy made with serd_node_copy(). */
typedef struct {
  SerdNode* s;
  SerdNode* p;
  SerdNode* o;
  SerdNode* g;
} FilterPattern;

/**
   Print the command line help text.

   @param name Program name shown in the usage line.
   @param error If true, print to stderr (preceded by a blank line),
   otherwise print to stdout.
   @return 1 if `error` is true, otherwise 0, so the result can be returned
   directly from main().
*/
static int
print_usage(const char* const name, const bool error)
{
  FILE* const os = error ? stderr : stdout;

  fprintf(os, "%s", error ? "\n" : "");
  fprintf(os, "Usage: %s [OPTION]... INPUT...\n", name);
  fprintf(os, "Read and write RDF syntax.\n");
  fprintf(os, "Use - for INPUT to read from standard input.\n\n");
  fprintf(os, " -C Convert literals to canonical form.\n");
  fprintf(os, " -F PATTERN Filter out statements that match PATTERN.\n");
  fprintf(os, " -G PATTERN Only include statements matching PATTERN.\n");
  fprintf(os, " -I BASE_URI Input base URI.\n");
  fprintf(os, " -a Write ASCII output if possible.\n");
  fprintf(os, " -b Fast bulk output for large serialisations.\n");
  fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs.\n");
  fprintf(os, " -e Eat input one character at a time.\n");
  fprintf(os, " -f Fast and loose mode (possibly ugly output).\n");
  fprintf(os, " -h Display this help and exit.\n");
  fprintf(os, " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n");
  fprintf(os, " -k BYTES Parser stack size.\n");
  fprintf(os, " -l Lax (non-strict) parsing.\n");
  fprintf(os, " -m Build a model in memory before writing.\n");
  fprintf(os, " -o SYNTAX Output syntax: empty/turtle/ntriples/nquads.\n");
  fprintf(os, " -p PREFIX Add PREFIX to blank node IDs.\n");
  fprintf(os, " -q Suppress all output except data.\n");
  fprintf(os, " -r ROOT_URI Keep relative URIs within ROOT_URI.\n");
  fprintf(os, " -s STRING Parse STRING as input.\n");
  fprintf(os, " -t Write terser output without newlines.\n");
  fprintf(os, " -v Display version information and exit.\n");
  fprintf(os, " -w FILENAME Write output to FILENAME instead of stdout.\n");
  fprintf(os, " -x Support parsing variable nodes like `?x'.\n");

  return error ? 1 : 0;
}

/**
   Report that option `opt` is missing its argument, then print usage.

   @return The error status from print_usage() (1).
*/
static int
missing_arg(const char* const name, const char opt)
{
  SERDI_ERRORF("option requires an argument -- '%c'\n", opt);
  return print_usage(name, true);
}

/**
   Sink callback that captures a single statement into a FilterPattern.

   Events other than statements are ignored.

   @param handle The FilterPattern to fill.
   @param event The event to process.
   @return SERD_ERR_INVALID if a statement has already been captured
   (the pattern subject is set), otherwise SERD_SUCCESS.
*/
static SerdStatus
on_filter_event(void* const handle, const SerdEvent* const event)
{
  if (event->type == SERD_STATEMENT) {
    FilterPattern* const pat = (FilterPattern*)handle;
    if (pat->s) {
      return SERD_ERR_INVALID;
    }

    const SerdStatement* const statement = event->statement.statement;

    pat->s = serd_node_copy(serd_statement_subject(statement));
    pat->p = serd_node_copy(serd_statement_predicate(statement));
    pat->o = serd_node_copy(serd_statement_object(statement));
    pat->g = serd_node_copy(serd_statement_graph(statement));
  }

  return SERD_SUCCESS;
}

/**
   Create a filter sink from a pattern string.

   The string is read as NQuads with variable support enabled, and the
   statement it contains is used as the pattern for serd_filter_new().

   @param world World used to create the pattern reader.
   @param sink Target sink passed to serd_filter_new().
   @param str Pattern string to parse.
   @param inclusive Passed through to serd_filter_new().
   @return A new filter sink which the caller must free with
   serd_sink_free(), or NULL if reading the pattern failed.
*/
static SerdSink*
parse_filter(SerdWorld* const      world,
             const SerdSink* const sink,
             const char* const     str,
             const bool            inclusive)
{
  SerdEnv* const  env         = serd_env_new(SERD_EMPTY_STRING());
  FilterPattern   pat         = {NULL, NULL, NULL, NULL};
  SerdSink*       in_sink     = serd_sink_new(&pat, on_filter_event, NULL);
  SerdByteSource* byte_source = serd_byte_source_new_string(str, NULL);

  SerdReader* reader = serd_reader_new(
    world, SERD_NQUADS, SERD_READ_VARIABLES, env, in_sink, 4096);

  SerdStatus st = serd_reader_start(reader, byte_source);
  if (!st) {
    st = serd_reader_read_document(reader);
  }

  serd_reader_free(reader);
  serd_env_free(env);
  serd_byte_source_free(byte_source);
  serd_sink_free(in_sink);

  SerdSink* const filter =
    st ? NULL : serd_filter_new(sink, pat.s, pat.p, pat.o, pat.g, inclusive);

  // Release the captured nodes on every path: a failed read may still have
  // copied a first statement before the error was raised
  serd_node_free(pat.s);
  serd_node_free(pat.p);
  serd_node_free(pat.o);
  serd_node_free(pat.g);

  return filter;
}

/**
   Read one input into `sink`.

   If `syntax` is unset (zero), it is guessed from `filename` with
   serd_guess_syntax(), falling back to SERD_TRIG.

   @param world World used to create the reader.
   @param syntax Input syntax, or zero to guess.
   @param flags Reader flags.
   @param env Environment passed to the reader.
   @param sink Sink that receives the input's events.
   @param stack_size Reader stack size in bytes.
   @param filename Input to open with serd_open_input().
   @param add_prefix Prefix passed to serd_reader_add_blank_prefix().
   @param bulk_read If true, open the input with a block size of
   SERD_PAGE_SIZE, otherwise 1.
   @return SERD_ERR_UNKNOWN if the input could not be opened, otherwise the
   status of starting the reader or reading the document.
*/
static SerdStatus
read_file(SerdWorld* const      world,
          SerdSyntax            syntax,
          const SerdReaderFlags flags,
          SerdEnv* const        env,
          const SerdSink* const sink,
          const size_t          stack_size,
          const char* const     filename,
          const char* const     add_prefix,
          const bool            bulk_read)
{
  syntax = syntax ? syntax : serd_guess_syntax(filename);
  syntax = syntax ? syntax : SERD_TRIG;

  SerdByteSource* byte_source =
    serd_open_input(filename, bulk_read ? SERD_PAGE_SIZE : 1u);

  if (!byte_source) {
    SERDI_ERRORF(
      "failed to open input file `%s' (%s)\n", filename, strerror(errno));

    return SERD_ERR_UNKNOWN;
  }

  SerdReader* reader =
    serd_reader_new(world, syntax, flags, env, sink, stack_size);

  serd_reader_add_blank_prefix(reader, add_prefix);

  SerdStatus st = serd_reader_start(reader, byte_source);

  st = st ? st : serd_reader_read_document(reader);

  serd_reader_free(reader);
  serd_byte_source_free(byte_source);

  return st;
}

/**
   Program entry point: parse options, read all inputs, and write the output.

   Flag options may be grouped in one argument (like "-ab").  An option that
   takes an argument must be the last character of its argument, with the
   value given as the following argument.  Option parsing stops at the first
   argument that does not start with '-', or at a lone "-".

   @return 0 on success (including SERD_FAILURE status), 1 on error.
*/
int
main(int argc, char** argv)
{
  const char* const prog = argv[0];

  if (argc < 2) {
    return print_usage(prog, true);
  }

  SerdNode*       base          = NULL;
  SerdSyntax      input_syntax  = SERD_SYNTAX_EMPTY;
  SerdSyntax      output_syntax = SERD_SYNTAX_EMPTY;
  SerdReaderFlags reader_flags  = 0;
  SerdWriterFlags writer_flags  = 0;
  bool            bulk_read     = true;
  bool            bulk_write    = false;
  bool            no_inline     = false;
  bool            osyntax_set   = false;
  bool            use_model     = false;
  bool            canonical     = false;
  bool            quiet         = false;
  size_t          stack_size    = 4194304; // 4 MiB
  const char*     input_string  = NULL;
  const char*     in_pattern    = NULL;
  const char*     out_pattern   = NULL;
  const char*     add_prefix    = "";
  const char*     chop_prefix   = NULL;
  const char*     root_uri      = NULL;
  const char*     out_filename  = NULL;
  const char*     base_string   = NULL;

  int a = 1;
  for (; a < argc && argv[a][0] == '-'; ++a) {
    if (argv[a][1] == '\0') {
      break; // A lone "-" is left in place as an input
    }

    for (int o = 1; argv[a][o]; ++o) {
      const char opt = argv[a][o];

      if (opt == 'C') {
        canonical = true;
      } else if (opt == 'a') {
        writer_flags |= SERD_WRITE_ASCII;
      } else if (opt == 'b') {
        bulk_write = true;
      } else if (opt == 'e') {
        bulk_read = false;
      } else if (opt == 'f') {
        no_inline = true;
        writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM);
      } else if (opt == 'h') {
        return print_usage(prog, false);
      } else if (opt == 'l') {
        reader_flags |= SERD_READ_LAX;
        writer_flags |= SERD_WRITE_LAX;
      } else if (opt == 'm') {
        use_model = true;
      } else if (opt == 'q') {
        quiet = true;
      } else if (opt == 't') {
        writer_flags |= SERD_WRITE_TERSE;
      } else if (opt == 'v') {
        return serd_print_version(prog);
      } else if (opt == 'x') {
        reader_flags |= SERD_READ_VARIABLES;
      } else if (opt == 'F') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'F');
        }

        out_pattern = argv[a];
        break;
      } else if (opt == 'G') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'G');
        }

        in_pattern = argv[a];
        break;
      } else if (opt == 'I') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'I');
        }

        // Only remember the string here; the node is created once all
        // arguments are known to be valid, so early exits own nothing
        base_string = argv[a];
        break;
      } else if (opt == 'c') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'c');
        }

        chop_prefix = argv[a];
        break;
      } else if (opt == 'i') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'i');
        }

        if (!(input_syntax = serd_syntax_by_name(argv[a]))) {
          return print_usage(prog, true);
        }
        break;
      } else if (opt == 'k') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'k');
        }

        char*      endptr = NULL;
        const long size   = strtol(argv[a], &endptr, 10);
        if (size <= 0 || size == LONG_MAX || *endptr != '\0') {
          SERDI_ERRORF("invalid stack size `%s'\n", argv[a]);
          return 1;
        }

        stack_size = (size_t)size;
        break;
      } else if (opt == 'o') {
        osyntax_set = true;
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'o');
        }

        // "empty" is accepted explicitly since it maps to the zero syntax,
        // which would otherwise look like an unknown name
        if (!strcmp(argv[a], "empty")) {
          output_syntax = SERD_SYNTAX_EMPTY;
        } else if (!(output_syntax = serd_syntax_by_name(argv[a]))) {
          return print_usage(prog, true);
        }
        break;
      } else if (opt == 'p') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'p');
        }

        add_prefix = argv[a];
        break;
      } else if (opt == 'r') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'r');
        }

        root_uri = argv[a];
        break;
      } else if (opt == 's') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 's');
        }

        input_string = argv[a];
        break;
      } else if (opt == 'w') {
        if (argv[a][o + 1] || ++a == argc) {
          return missing_arg(prog, 'w');
        }

        out_filename = argv[a];
        break;
      } else {
        SERDI_ERRORF("invalid option -- '%c'\n", opt);
        return print_usage(prog, true);
      }
    }
  }

  if (in_pattern && out_pattern) {
    SERDI_ERROR("only one of -F and -G can be given at once\n");
    return 1;
  }

  if (a == argc && !input_string) {
    SERDI_ERROR("missing input\n");
    return 1;
  }

  if (base_string) {
    base = serd_new_uri(SERD_STRING(base_string));
  }

  char* const* const inputs   = argv + a;
  const int          n_inputs = argc - a;

  // The input has graphs if the given syntax does, or any input file name
  // suggests a syntax that does
  bool input_has_graphs = serd_syntax_has_graphs(input_syntax);
  for (int i = a; i < argc; ++i) {
    if (serd_syntax_has_graphs(serd_guess_syntax(argv[i]))) {
      input_has_graphs = true;
      break;
    }
  }

  if (!output_syntax && !osyntax_set) {
    output_syntax = input_has_graphs ? SERD_NQUADS : SERD_NTRIPLES;
  }

  if (!base && n_inputs == 1 &&
      (output_syntax == SERD_NQUADS || output_syntax == SERD_NTRIPLES)) {
    // Choose base URI from the single input path
    char* const input_path = serd_canonical_path(inputs[0]);
    if (!input_path || !(base = serd_new_file_uri(SERD_STRING(input_path),
                                                  SERD_EMPTY_STRING()))) {
      SERDI_ERRORF("unable to determine base URI from path %s\n", inputs[0]);
    }
    serd_free(input_path);
  }

  SerdWorld* const world = serd_world_new();
  SerdEnv* const   env =
    serd_env_new(base ? serd_node_string_view(base) : SERD_EMPTY_STRING());

  serd_set_stream_utf8_mode(stdin);
  if (!out_filename) {
    serd_set_stream_utf8_mode(stdout);
  }

  const SerdDescribeFlags describe_flags =
    no_inline ? SERD_NO_INLINE_OBJECTS : 0u;

  const size_t        block_size = bulk_write ? 4096u : 1u;
  SerdByteSink* const byte_sink =
    out_filename
      ? serd_byte_sink_new_filename(out_filename, block_size)
      : serd_byte_sink_new_function((SerdWriteFunc)fwrite, stdout, block_size);

  if (!byte_sink) {
    perror("serdi: error opening output file");
    serd_env_free(env);
    serd_node_free(base);
    serd_world_free(world);
    return 1;
  }

  SerdWriter* const writer =
    serd_writer_new(world, output_syntax, writer_flags, env, byte_sink);

  SerdModel*      model    = NULL;
  SerdSink*       inserter = NULL;
  const SerdSink* out_sink = NULL;
  if (use_model) {
    const SerdModelFlags flags = (input_has_graphs ? SERD_STORE_GRAPHS : 0u);

    model = serd_model_new(world, SERD_ORDER_SPO, flags);
    if (input_has_graphs) {
      serd_model_add_index(model, SERD_ORDER_GSPO);
    }

    if (!no_inline) {
      serd_model_add_index(model, SERD_ORDER_OPS);
      if (input_has_graphs) {
        serd_model_add_index(model, SERD_ORDER_GOPS);
      }
    }

    inserter = serd_inserter_new(model, NULL);
    out_sink = inserter;
  } else {
    out_sink = serd_writer_sink(writer);
  }

  // Build the sink chain: reader -> [filter] -> [canon] -> out_sink
  const SerdSink* sink = out_sink;

  SerdSink* canon = NULL;
  if (canonical) {
    sink = canon = serd_canon_new(world, out_sink, reader_flags);
  }

  // A filter failure is recorded in st rather than returned immediately, so
  // that nothing is read and everything created so far is still released
  SerdStatus        st      = SERD_SUCCESS;
  SerdSink*         filter  = NULL;
  const char* const pattern = in_pattern ? in_pattern : out_pattern;
  if (pattern) {
    const bool inclusive = (in_pattern != NULL);

    filter = parse_filter(world, sink, pattern, inclusive);
    if (filter) {
      sink = filter;
    } else {
      if (inclusive) {
        SERDI_ERROR("error parsing inclusive filter pattern\n");
      } else {
        SERDI_ERROR("error parsing exclusive filter pattern\n");
      }
      st = SERD_ERR_BAD_ARG;
    }
  }

  if (quiet) {
    serd_set_log_func(world, serd_quiet_log_func, NULL);
  }

  if (root_uri) {
    serd_writer_set_root_uri(writer, SERD_STRING(root_uri));
  }

  serd_writer_chop_blank_prefix(writer, chop_prefix);

  if (!st && input_string) {
    SerdByteSource* const byte_source =
      serd_byte_source_new_string(input_string, NULL);

    SerdReader* const reader =
      serd_reader_new(world,
                      input_syntax ? input_syntax : SERD_TRIG,
                      reader_flags,
                      env,
                      sink,
                      stack_size);

    serd_reader_add_blank_prefix(reader, add_prefix);

    if (!(st = serd_reader_start(reader, byte_source))) {
      st = serd_reader_read_document(reader);
    }

    serd_reader_free(reader);
    serd_byte_source_free(byte_source);
  }

  // With several inputs, each one gets its own blank node prefix
  // ("f" + input index + add_prefix)
  size_t prefix_len = 0;
  char*  prefix     = NULL;
  if (!st && n_inputs > 1) {
    prefix_len = 8 + strlen(add_prefix);
    prefix     = (char*)calloc(1, prefix_len);
    if (!prefix) {
      SERDI_ERROR("failed to allocate blank node prefix\n");
      st = SERD_ERR_UNKNOWN;
    }
  }

  for (int i = 0; !st && i < n_inputs; ++i) {
    if (!base && strcmp(inputs[i], "-")) {
      // No base URI is set, so use this input file's URI as the base
      char* const input_path = serd_canonical_path(inputs[i]);
      if (!input_path) {
        SERDI_ERRORF("failed to resolve path %s\n", inputs[i]);
        st = SERD_ERR_BAD_ARG;
        break;
      }

      SerdNode* const file_uri =
        serd_new_file_uri(SERD_STRING(input_path), SERD_EMPTY_STRING());

      serd_env_set_base_uri(env, serd_node_string_view(file_uri));
      serd_node_free(file_uri);
      serd_free(input_path);
    }

    if (n_inputs > 1) {
      snprintf(prefix, prefix_len, "f%d%s", i, add_prefix);
    }

    st = read_file(world,
                   input_syntax,
                   reader_flags,
                   env,
                   sink,
                   stack_size,
                   inputs[i],
                   n_inputs > 1 ? prefix : add_prefix,
                   bulk_read);
  }
  free(prefix);

  if (st <= SERD_FAILURE && use_model) {
    // Everything was inserted into the model, now write it all out
    const SerdSink* writer_sink = serd_writer_sink(writer);
    SerdCursor*     everything  = serd_model_begin_ordered(
      model, input_has_graphs ? SERD_ORDER_GSPO : SERD_ORDER_SPO);

    serd_env_write_prefixes(env, writer_sink);

    st = serd_describe_range(
      everything,
      writer_sink,
      describe_flags |
        ((output_syntax == SERD_NTRIPLES || output_syntax == SERD_NQUADS)
           ? SERD_NO_INLINE_OBJECTS
           : 0u));

    serd_cursor_free(everything);
  }

  serd_sink_free(canon);
  serd_sink_free(filter);
  serd_sink_free(inserter);
  serd_model_free(model);
  serd_writer_free(writer);
  serd_env_free(env);
  serd_node_free(base);
  serd_world_free(world);

  if (serd_byte_sink_close(byte_sink) || (!out_filename && fclose(stdout))) {
    perror("serdi: write error");
    st = SERD_ERR_UNKNOWN;
  }

  serd_byte_sink_free(byte_sink);

  return (st > SERD_FAILURE) ? 1 : 0;
}