From f93a441065a611cc32874dde67e53a8295c87baf Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 27 May 2018 15:48:25 +0200 Subject: [WIP] Add validation --- tools/serd-validate.c | 479 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 tools/serd-validate.c (limited to 'tools/serd-validate.c') diff --git a/tools/serd-validate.c b/tools/serd-validate.c new file mode 100644 index 00000000..fd203611 --- /dev/null +++ b/tools/serd-validate.c @@ -0,0 +1,479 @@ +/* + Copyright 2011-2022 David Robillard + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "console.h" + +#include "serd/serd.h" + +#include +#include +#include +#include +#include + +#define NS_OWL "http://www.w3.org/2002/07/owl#" +#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +#define NS_RDFS "http://www.w3.org/2000/01/rdf-schema#" + +/* Application (after parsing command-line arguments) */ + +#define SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg) +#define SERDI_ERRORF(fmt, ...) fprintf(stderr, "serdi: " fmt, __VA_ARGS__) + +typedef struct { + const char* base_uri_string; + const char* out_filename; + char* const* inputs; + intptr_t n_inputs; + SerdSyntaxOptions input_options; + size_t block_size; + size_t stack_size; + bool verbose; + bool quiet; +} Options; + +static SerdStatus +consume_source(SerdWorld* const world, + const Options opts, + SerdSyntax syntax, + SerdEnv* const env, + const SerdSink* const sink, + SerdInputStream input, + const SerdNode* const name) +{ + if (!input.stream) { + return SERD_UNKNOWN_ERROR; + } + + SerdStatus st = SERD_SUCCESS; + SerdReader* const reader = serd_reader_new( + world, syntax, opts.input_options.flags, env, sink, opts.stack_size); + + if (!(st = serd_reader_start(reader, &input, name, opts.block_size))) { + st = serd_reader_read_document(reader); + } + + serd_reader_free(reader); + serd_close_input(&input); + return st; +} + +static SerdStatus +read_file(SerdWorld* const world, + const Options opts, + SerdEnv* const env, + const SerdSink* const sink, + const char* const filename) +{ + SerdStatus st = SERD_SUCCESS; + if (!opts.base_uri_string && strcmp(filename, "-")) { + if ((st = serd_set_base_uri_from_path(env, filename))) { + SERDI_ERRORF("failed to determine base URI from path %s\n", filename); + return st; + } + } + + const SerdNode* const name = + serd_nodes_string(serd_world_nodes(world), SERD_STRING(filename)); + + st = consume_source( + world, + opts, + serd_choose_syntax(world, opts.input_options, filename, SERD_TRIG), + env, + sink, + serd_open_tool_input(filename), + name); + + return st; +} + +static SerdEnv* +build_env(SerdWorld* const world, Options opts) +{ + char* const* const inputs = opts.inputs; + const intptr_t n_inputs = opts.n_inputs; + + if (!opts.base_uri_string && n_inputs == 1) { + // Choose base URI from the single input path + char* const input_path = serd_canonical_path(NULL, inputs[0]); + + SerdNode* base = + input_path + ? serd_new_file_uri(NULL, SERD_STRING(input_path), SERD_EMPTY_STRING()) + : NULL; + if (!base) { + SERDI_ERRORF("unable to determine base URI from path %s\n", inputs[0]); + } + + SerdEnv* const env = serd_env_new( + world, base ? serd_node_string_view(base) : SERD_EMPTY_STRING()); + + serd_free(NULL, input_path); + serd_node_free(NULL, base); + return env; + } + + return serd_env_new(world, + opts.base_uri_string ? SERD_STRING(opts.base_uri_string) + : SERD_EMPTY_STRING()); +} + +static SerdModel* +build_model(SerdWorld* const world, const Options opts, bool with_graphs) +{ + (void)opts; // FIXME + + SerdModel* const model = serd_model_new( + world, + with_graphs ? SERD_ORDER_GSPO : SERD_ORDER_SPO, + (with_graphs * (unsigned)SERD_STORE_GRAPHS) | SERD_STORE_CARETS); + + with_graphs = true; // FIXME + + if (with_graphs) { + serd_model_add_index(model, SERD_ORDER_GSPO); + } + + serd_model_add_index(model, SERD_ORDER_POS); + serd_model_add_index(model, SERD_ORDER_GPOS); + + serd_model_add_index(model, SERD_ORDER_PSO); + serd_model_add_index(model, SERD_ORDER_GPSO); + + serd_model_add_index(model, SERD_ORDER_OPS); + if (with_graphs) { + serd_model_add_index(model, SERD_ORDER_GOPS); + } + + return model; +} + +static bool +input_has_graphs(const Options opts) +{ + if (opts.input_options.syntax) { + return serd_syntax_has_graphs(opts.input_options.syntax); + } + + for (intptr_t i = 0u; i < opts.n_inputs; ++i) { + if (serd_syntax_has_graphs(serd_guess_syntax(opts.inputs[i]))) { + return true; + } + } + + return false; +} + +static SerdStatus +read_inputs(SerdWorld* world, + const Options opts, + SerdEnv* env, + const SerdSink* const sink) +{ + SerdStatus st = SERD_SUCCESS; + + size_t prefix_len = 0; + char* prefix = NULL; + if (opts.n_inputs > 1) { + prefix_len = 32; // FIXME + prefix = (char*)calloc(1, prefix_len); + } + + for (intptr_t i = 0; !st && i < opts.n_inputs; ++i) { + if (opts.n_inputs > 1) { + snprintf(prefix, prefix_len, "f%" PRIdPTR, i); + } + + st = read_file(world, opts, env, sink, opts.inputs[i]); + } + + free(prefix); + return st; +} + +/* + Return a model where every object is the file URI of a schema to load. + + The statements in the result are like `?ontology rdfs:seeAlso ?resource`, + where `?ontology` is the URI of the owl:Ontology instance and `?resource` is + a file URI. +*/ +static SerdModel* +get_schemas_model(const Options opts, + SerdWorld* const world, + SerdModel* const model) +{ + static const SerdStringView s_rdf_type = SERD_STRING(NS_RDF "type"); + static const SerdStringView s_owl_Ontology = SERD_STRING(NS_OWL "Ontology"); + static const SerdStringView s_rdfs_seeAlso = SERD_STRING(NS_RDFS "seeAlso"); + + SerdNodes* const nodes = serd_world_nodes(world); + SerdModel* const schemas_model = + serd_model_new(world, SERD_ORDER_SPO, SERD_STORE_CARETS); + + const SerdNode* const rdf_type = serd_nodes_uri(nodes, s_rdf_type); + const SerdNode* const owl_Ontology = serd_nodes_uri(nodes, s_owl_Ontology); + const SerdNode* const rdfs_seeAlso = serd_nodes_uri(nodes, s_rdfs_seeAlso); + + SerdCursor* const i = + serd_model_find(model, NULL, rdf_type, owl_Ontology, NULL); + + for (; !serd_cursor_is_end(i); serd_cursor_advance(i)) { + const SerdStatement* const typing = serd_cursor_get(i); + const SerdNode* const ontology = serd_statement_subject(typing); + + const SerdStatement* const link = + serd_model_get_statement(model, ontology, rdfs_seeAlso, NULL, NULL); + if (link) { + const SerdNode* const resource = serd_statement_object(link); + if (resource && serd_node_type(resource) == SERD_URI) { + if (opts.verbose) { + serd_logf_at(world, + SERD_LOG_LEVEL_NOTICE, + serd_statement_caret(link), + "Loading %s", + serd_node_string(resource)); + } + + const char* const resource_uri = serd_node_string(resource); + if (!strncmp(resource_uri, "file://", strlen("file://"))) { + serd_model_add(schemas_model, ontology, rdfs_seeAlso, resource, NULL); + } + } + } + } + + serd_cursor_free(i); + + return schemas_model; +} + +static SerdStatus +run(Options opts, int argc, char** argv) +{ + const bool with_graphs = input_has_graphs(opts); + + SerdOutputStream out = serd_open_tool_output(opts.out_filename); + if (!out.stream) { + perror("error opening output file"); + return SERD_UNKNOWN_ERROR; + } + + SerdWorld* const world = serd_world_new(NULL); + + const SerdNode* const schema_graph = + serd_nodes_uri(serd_world_nodes(world), + SERD_STRING("http://drobilla.net/sw/serd#schemas")); + + const SerdNode* const data_graph = serd_nodes_uri( + serd_world_nodes(world), SERD_STRING("http://drobilla.net/sw/serd#data")); + + SerdEnv* const env = build_env(world, opts); + SerdModel* const model = build_model(world, opts, with_graphs); + SerdSink* const schema_sink = serd_inserter_new(model, schema_graph); + SerdSink* const data_sink = serd_inserter_new(model, data_graph); + if (!schema_sink || !data_sink) { + SERDI_ERROR("failed to construct data pipeline, aborting\n"); + return SERD_BAD_ARG; // FIXME: ? + } + + if (opts.quiet) { + serd_set_log_func(world, serd_quiet_log_func, NULL); + } + + SerdStatus st = read_inputs(world, opts, env, data_sink); + + if (st <= SERD_FAILURE) { // FIXME: ? + SerdValidator* const validator = serd_validator_new(world); + bool checks_given = false; + + for (int i = 1; i < argc; ++i) { + if (argv[i][0] == '-') { + if (argv[i][1] == 'W') { + serd_validator_enable_checks(validator, argv[++i]); + checks_given = true; + } else if (argv[i][1] == 'X') { + serd_validator_disable_checks(validator, argv[++i]); + checks_given = true; + } else if (argv[i][1] == 's') { + st = read_file(world, opts, env, schema_sink, argv[++i]); + } + } + } + + if (!checks_given) { + serd_validator_enable_checks(validator, "all"); + } + + { + SerdModel* const schemas_model = get_schemas_model(opts, world, model); + + SerdCursor* const i = serd_model_begin(schemas_model); + for (; !serd_cursor_is_end(i); serd_cursor_advance(i)) { + const SerdStatement* const link = serd_cursor_get(i); + const SerdNode* const resource = serd_statement_object(link); + const char* const resource_uri = serd_node_string(resource); + + char* const path = serd_parse_file_uri(NULL, resource_uri, NULL); + if (path) { + st = read_file(world, opts, env, schema_sink, path); + serd_free(NULL, path); + } + } + + serd_cursor_free(i); + serd_model_free(schemas_model); + } + + st = serd_validate(validator, model, data_graph, env); + + serd_validator_free(validator); + } + + serd_sink_free(data_sink); + serd_sink_free(schema_sink); + serd_model_free(model); + serd_env_free(env); + serd_world_free(world); + + if (serd_close_output(&out)) { + perror("serdi: write error"); + st = SERD_UNKNOWN_ERROR; + } + + return st; +} + +/* Command-line interface (before setting up serd) */ + +static SerdStatus +print_usage(const char* const name, const bool error) +{ + static const char* const description = + "Validate RDF data against RDFS and OWL schemas.\n" + "INPUT can be a local filename, or \"-\" to read from standard input.\n\n" + " -B BASE_URI Base URI or path for resolving relative references.\n" + " -I SYNTAX Input syntax (turtle/ntriples/trig/nquads),\n" + " or option (lax/variables/relative/global/generated).\n" + " -V Display version information and exit.\n" + " -W CHECKS Enable checks matching regex CHECKS (or \"all\").\n" + " -X CHECKS Exclude checks matching regex CHECKS (or \"all\").\n" + " -b BYTES I/O block size.\n" + " -h Display this help and exit.\n" + " -k BYTES Parser stack size.\n" + " -v Print verbose messages about loaded resources.\n" + " -s SCHEMA Schema input file.\n"; + + FILE* const os = error ? stderr : stdout; + fprintf(os, "%s", error ? "\n" : ""); + fprintf(os, "Usage: %s [OPTION]... INPUT...\n", name); + fprintf(os, "%s", description); + return error ? SERD_BAD_ARG : SERD_SUCCESS; +} + +static SerdStatus +parse_option(OptionIter* iter, Options* const opts) +{ +#define ARG_ERRORF(fmt, ...) \ + fprintf(stderr, "%s: " fmt, iter->argv[0], __VA_ARGS__) + + const char opt = iter->argv[iter->a][iter->f]; + const char* argument = NULL; + + switch (opt) { + case 'B': + return serd_get_argument(iter, &opts->base_uri_string); + + case 'I': + return serd_parse_input_argument(iter, &opts->input_options); + + case 'V': + return serd_print_version("serd-validate"); + + case 'W': + case 'X': + // Just enable validation and skip the pattern, checks are parsed later + return serd_get_argument(iter, &argument); + + case 'b': + return serd_get_size_argument(iter, &opts->block_size); + + case 'h': + print_usage(iter->argv[0], false); + return SERD_FAILURE; + + case 'k': + return serd_get_size_argument(iter, &opts->stack_size); + + case 'q': + opts->quiet = true; + return serd_option_iter_advance(iter); + + case 's': + // Schema input, ignore here since these are loaded later + return serd_get_argument(iter, &argument); + + case 'v': + opts->verbose = true; + return serd_option_iter_advance(iter); + + case 'w': + return serd_get_argument(iter, &opts->out_filename); + + default: + break; + } + + ARG_ERRORF("invalid option -- '%c'\n", opt); + return print_usage(iter->argv[0], true); +} + +int +main(int argc, char** argv) +{ + Options opts = {NULL, + NULL, + NULL, + 0, + {SERD_SYNTAX_EMPTY, 0u, false}, + 4096u, + 4194304u, + false, + false}; + + // Parse all command line options (which must precede inputs) + SerdStatus st = SERD_SUCCESS; + OptionIter iter = {argv, argc, 1, 1}; + while (!serd_option_iter_is_end(iter)) { + if ((st = parse_option(&iter, &opts))) { + return (st > SERD_FAILURE); + } + } + + // Every argument past the last option is an input + opts.inputs = argv + iter.a; + opts.n_inputs = argc - iter.a; + if (opts.n_inputs == 0) { + fprintf(stderr, "%s: missing input\n", argv[0]); + print_usage(argv[0], true); + return EXIT_FAILURE; + } + + st = st ? st : run(opts, argc, argv); + + return (st <= SERD_FAILURE) ? 0 : (int)st; +} -- cgit v1.2.1