diff options
author | David Robillard <d@drobilla.net> | 2012-03-30 02:19:52 +0000 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2012-03-30 02:19:52 +0000 |
commit | ff3c6fc88657d2e94847ceb00adea7597894d897 (patch) | |
tree | 95931a641722a75fb3ed306e7960f3f179680eaf | |
parent | 43ee05bf7402c47e7034490bfebf08d3f57900e7 (diff) | |
download | sord-ff3c6fc88657d2e94847ceb00adea7597894d897.tar.gz sord-ff3c6fc88657d2e94847ceb00adea7597894d897.tar.bz2 sord-ff3c6fc88657d2e94847ceb00adea7597894d897.zip |
Add convenient sord_search(), sord_ask(), and sord_count().
Add sord_validate tool for validating data against RDF/OWL schemas.
git-svn-id: http://svn.drobilla.net/sord/trunk@211 3d64ff67-21c5-427c-a301-fe4f08042e5a
-rw-r--r-- | ChangeLog | 2 | ||||
-rw-r--r-- | doc/sord_validate.1 | 34 | ||||
-rw-r--r-- | sord/sord.h | 36 | ||||
-rw-r--r-- | src/sord.c | 38 | ||||
-rw-r--r-- | src/sord_test.c | 29 | ||||
-rw-r--r-- | src/sord_validate.c | 389 | ||||
-rw-r--r-- | src/sordi.c | 4 | ||||
-rw-r--r-- | wscript | 20 |
8 files changed, 531 insertions, 21 deletions
@@ -10,6 +10,8 @@ sord (UNRELEASED) unstable; urgency=low * Refuse to intern relative URIs in sord_new_uri*() * Add sord_new_relative_uri() * Add SordInserter for writing to a model via Serd sink functions. + * Add convenient sord_search(), sord_ask(), and sord_count() + * Add sord_validate tool for validating data against RDF/OWL schemas -- David Robillard <d@drobilla.net> (UNRELEASED) diff --git a/doc/sord_validate.1 b/doc/sord_validate.1 new file mode 100644 index 0000000..071611d --- /dev/null +++ b/doc/sord_validate.1 @@ -0,0 +1,34 @@ +.TH SORD_VALIDATE 1 "21 Mar 2012" + +.SH NAME +.B sord_validate \- Validate RDF data + +.SH SYNOPSIS +sord_validate INPUT... + +This is a simple validator which checks that all used properties are actually +defined, and that the domain and range of properties are explicitly correct. +Note that an "error" from this program does not necessarily mean data is +invalid, since it is not required to explicitly list types in RDF, however it +is a good idea to do so. If data type definitions are available with an +xsd:pattern property, literals with that datatype will be checked against the +xsd:pattern (a regular expression) to ensure they are valid. + +This program does not retrieve any data from the web or magical places on the +file system. It only processes files passed directly on the command line. +This means you must pass all used vocabularies to get a useful result. + +.SH AUTHOR +sord_validate was written by David Robillard <d@drobilla.net> + +.SH COPYRIGHT +Copyright \(co 2012 David Robillard. +.br +License: <http://www.opensource.org/licenses/isc-license> +.br +This is free software; you are free to change and redistribute it. +.br +There is NO WARRANTY, to the extent permitted by law. 
+ +.SH "SEE ALSO" +<http://drobilla.net/software/sord> diff --git a/sord/sord.h b/sord/sord.h index 9bd3f02..9c4baf8 100644 --- a/sord/sord.h +++ b/sord/sord.h @@ -367,7 +367,7 @@ SordIter* sord_begin(const SordModel* model); /** - Search for a triple pattern. + Search for statements by a quad pattern. @return an iterator to the first match, or NULL if no matches found. */ SORD_API @@ -375,6 +375,40 @@ SordIter* sord_find(SordModel* model, const SordQuad pat); /** + Search for statements by nodes. + @return an iterator to the first match, or NULL if no matches found. +*/ +SORD_API +SordIter* +sord_search(SordModel* model, + const SordNode* s, + const SordNode* p, + const SordNode* o, + const SordNode* g); + +/** + Return true iff a statement exists. +*/ +SORD_API +bool +sord_ask(SordModel* model, + const SordNode* s, + const SordNode* p, + const SordNode* o, + const SordNode* g); + +/** + Return the number of matching statements. +*/ +SORD_API +uint64_t +sord_count(SordModel* model, + const SordNode* s, + const SordNode* p, + const SordNode* o, + const SordNode* g); + +/** Check if @a model contains a triple pattern. 
*/ SORD_API @@ -816,6 +816,44 @@ sord_find(SordModel* sord, const SordQuad pat) return sord_iter_new(sord, cur, pat, index_order, mode, n_prefix); } +SordIter* +sord_search(SordModel* model, + const SordNode* s, + const SordNode* p, + const SordNode* o, + const SordNode* g) +{ + SordQuad pat = { s, p, o, g }; + return sord_find(model, pat); +} + +bool +sord_ask(SordModel* model, + const SordNode* s, + const SordNode* p, + const SordNode* o, + const SordNode* g) +{ + SordQuad pat = { s, p, o, g }; + return sord_contains(model, pat); +} + +uint64_t +sord_count(SordModel* model, + const SordNode* s, + const SordNode* p, + const SordNode* o, + const SordNode* g) +{ + SordIter* i = sord_search(model, s, p, o, g); + uint64_t n = 0; + for (; !sord_iter_end(i); sord_iter_next(i)) { + ++n; + } + sord_iter_free(i); + return n; +} + bool sord_contains(SordModel* sord, const SordQuad pat) { diff --git a/src/sord_test.c b/src/sord_test.c index 662f920..b871181 100644 --- a/src/sord_test.c +++ b/src/sord_test.c @@ -21,9 +21,9 @@ #include "sord/sord.h" -static const int DIGITS = 3; -static const int MAX_NUM = 999; -static const int n_objects_per = 2; +static const int DIGITS = 3; +static const int MAX_NUM = 999; +static const unsigned n_objects_per = 2; typedef struct { SordQuad query; @@ -68,18 +68,18 @@ generate(SordWorld* world, int num = (i * n_objects_per) + 1; SordNode* ids[2 + n_objects_per]; - for (int j = 0; j < 2 + n_objects_per; ++j) { + for (unsigned j = 0; j < 2 + n_objects_per; ++j) { ids[j] = uri(world, num++); } - for (int j = 0; j < n_objects_per; ++j) { + for (unsigned j = 0; j < n_objects_per; ++j) { SordQuad tup = { ids[0], ids[1], ids[2 + j] }; if (!sord_add(sord, tup)) { return test_fail("Fail: Failed to add quad\n"); } } - for (int j = 0; j < 2 + n_objects_per; ++j) { + for (unsigned j = 0; j < 2 + n_objects_per; ++j) { sord_node_free(world, ids[j]); } } @@ -290,9 +290,8 @@ test_read(SordWorld* world, SordModel* sord, SordNode* g, // Test nested queries 
fprintf(stderr, "Nested Queries... "); - pat[0] = pat[1] = pat[2] = 0; const SordNode* last_subject = 0; - iter = sord_find(sord, pat); + iter = sord_search(sord, NULL, NULL, NULL, NULL); for (; !sord_iter_end(iter); sord_iter_next(iter)) { sord_iter_get(iter, id); if (id[0] == last_subject) @@ -300,7 +299,7 @@ test_read(SordWorld* world, SordModel* sord, SordNode* g, SordQuad subpat = { id[0], 0, 0 }; SordIter* subiter = sord_find(sord, subpat); - int num_sub_results = 0; + uint64_t num_sub_results = 0; if (sord_iter_get_node(subiter, SORD_SUBJECT) != id[0]) { return test_fail("Fail: Incorrect initial submatch\n"); } @@ -322,6 +321,14 @@ test_read(SordWorld* world, SordModel* sord, SordNode* g, " (%d results, expected %d)\n", TUP_FMT_ARGS(subpat), num_sub_results, n_objects_per); } + + uint64_t count = sord_count(sord, id[0], 0, 0, 0); + if (count != num_sub_results) { + return test_fail("Fail: Query " TUP_FMT " sord_count() %d" + "does not match result count %d\n", + TUP_FMT_ARGS(subpat), count, num_sub_results); + } + last_subject = id[0]; } fprintf(stderr, "OK\n\n"); @@ -501,6 +508,10 @@ main(int argc, char** argv) tup[2] = sord_new_literal(world, 0, USTR("hello"), NULL); tup[3] = 0; sord_add(sord, tup); + if (!sord_ask(sord, tup[0], tup[1], tup[2], tup[3])) { + fprintf(stderr, "Failed to add tuple\n"); + goto fail; + } sord_node_free(world, (SordNode*)tup[2]); tup[2] = sord_new_literal(world, 0, USTR("hi"), NULL); sord_add(sord, tup); diff --git a/src/sord_validate.c b/src/sord_validate.c new file mode 100644 index 0000000..3dcd86e --- /dev/null +++ b/src/sord_validate.c @@ -0,0 +1,389 @@ +/* + Copyright 2012 David Robillard <http://drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. 
+ + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#define _BSD_SOURCE // for realpath + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +# include <windows.h> +#endif + +#include "serd/serd.h" +#include "sord/sord.h" +#include "sord_config.h" + +#ifdef HAVE_PCRE +# include <pcre.h> +#endif + +#define USTR(s) ((const uint8_t*)s) + +#define NS_foaf (const uint8_t*)"http://xmlns.com/foaf/0.1/" +#define NS_owl (const uint8_t*)"http://www.w3.org/2002/07/owl#" +#define NS_rdf (const uint8_t*)"http://www.w3.org/1999/02/22-rdf-syntax-ns#" +#define NS_rdfs (const uint8_t*)"http://www.w3.org/2000/01/rdf-schema#" +#define NS_xsd (const uint8_t*)"http://www.w3.org/2001/XMLSchema#" + +typedef struct { + SordNode* foaf_Document; + SordNode* owl_AnnotationProperty; + SordNode* owl_Class; + SordNode* owl_DatatypeProperty; + SordNode* owl_FunctionalProperty; + SordNode* owl_InverseFunctionalProperty; + SordNode* owl_ObjectProperty; + SordNode* owl_OntologyProperty; + SordNode* owl_Thing; + SordNode* owl_equivalentClass; + SordNode* rdf_Property; + SordNode* rdf_type; + SordNode* rdfs_Class; + SordNode* rdfs_Literal; + SordNode* rdfs_Resource; + SordNode* rdfs_domain; + SordNode* rdfs_range; + SordNode* rdfs_subClassOf; + SordNode* xsd_pattern; + SordNode* xsd_string; +} URIs; + +int n_errors = 0; + +int +print_version() +{ + printf("sord_validate " SORD_VERSION + " <http://drobilla.net/software/sord>\n"); + printf("Copyright 2012 David Robillard <http://drobilla.net>.\n" + "License: 
<http://www.opensource.org/licenses/isc>\n" + "This is free software; you are free to change and redistribute it." + "\nThere is NO WARRANTY, to the extent permitted by law.\n"); + return 0; +} + +int +print_usage(const char* name, bool error) +{ + FILE* const os = error ? stderr : stdout; + fprintf(os, "Usage: %s INPUT...\n", name); + fprintf(os, + "Validate RDF data. This is a simple validator which checks\n" + "that all used properties are actually defined. It does not" + "do any fancy file retrieval, the files passed on the command line" + "are the only data that is read. In other words, you must pass" + "the definition of all vocabularies used on the command line.\n"); + return error ? 1 : 0; +} + +uint8_t* +absolute_path(const uint8_t* path) +{ +#ifdef _WIN32 + char* out = (char*)malloc(MAX_PATH); + GetFullPathName((const char*)path, MAX_PATH, out, NULL); + return (uint8_t*)out; +#else + return (uint8_t*)realpath((const char*)path, NULL); +#endif +} + +void +error(const char* msg, const SordQuad quad) +{ + ++n_errors; + fprintf(stderr, "error: %s:\n %s\n %s\n %s\n", + msg, + (const char*)sord_node_get_string(quad[SORD_SUBJECT]), + (const char*)sord_node_get_string(quad[SORD_PREDICATE]), + (const char*)sord_node_get_string(quad[SORD_OBJECT])); +} + +bool +is_subclass_of(SordModel* model, + const URIs* uris, + const SordNode* class, + const SordNode* super) +{ + if (!class) { + return false; + } else if (sord_node_equals(class, super) || + sord_ask(model, class, uris->owl_equivalentClass, super, NULL)) { + return true; + } + + SordIter* i = sord_search(model, class, uris->rdfs_subClassOf, NULL, NULL); + for (; !sord_iter_end(i); sord_iter_next(i)) { + const SordNode* o = sord_iter_get_node(i, SORD_OBJECT); + if (sord_node_equals(class, o)) { + continue; // Class is explicitly subClassOf itself + } + if (is_subclass_of(model, uris, o, super)) { + sord_iter_free(i); + return true; + } + } + sord_iter_free(i); + + return false; +} + +bool +regexp_match(const char* 
pat, const char* str) +{ +#ifdef HAVE_PCRE + const char* error; + int erroffset; + pcre* re = pcre_compile(pat, PCRE_ANCHORED, &error, &erroffset, NULL); + if (!re) { + fprintf(stderr, "Error in regexp \"%s\" at offset %d (%s)\n", + pat, erroffset, error); + return false; + } + + int st = pcre_exec(re, NULL, str, strlen(str), 0, 0, NULL, 0); + if (st < 0) { + fprintf(stderr, "Error %d executing regexp \"%s\"\n", st, pat); + return false; + } +#endif // HAVE_PCRE + return true; +} + +bool +literal_is_valid(SordModel* model, + const URIs* uris, + const SordNode* literal, + const SordNode* type) +{ + if (!type) { + return true; + } + + SordIter* p = sord_search(model, type, uris->xsd_pattern, 0, 0); + const SordNode* pattern = sord_iter_get_node(p, SORD_OBJECT); + if (!pattern) { + fprintf(stderr, "warning: No pattern for datatype <%s>\n", + sord_node_get_string(type)); + return true; + } + if (regexp_match((const char*)sord_node_get_string(pattern), + (const char*)sord_node_get_string(literal))) { + return true; + } + fprintf(stderr, "Literal \"%s\" does not match <%s> pattern \"%s\"\n", + sord_node_get_string(literal), + sord_node_get_string(type), + sord_node_get_string(pattern)); + return false; +} + +bool +check_type(SordModel* model, + URIs* uris, + const SordNode* node, + const SordNode* type) +{ + if (sord_node_equals(type, uris->rdfs_Resource) || + sord_node_equals(type, uris->owl_Thing)) { + return true; + } + + if (sord_node_get_type(node) == SORD_LITERAL) { + if (sord_node_equals(type, uris->rdfs_Literal) || + sord_node_equals(type, uris->xsd_string)) { + return true; + } else { + const SordNode* datatype = sord_node_get_datatype(node); + return is_subclass_of(model, uris, datatype, type) || + literal_is_valid(model, uris, node, type); + } + } else if (sord_node_get_type(node) == SORD_URI) { + if (sord_node_equals(type, uris->foaf_Document)) { + return true; // Questionable... 
+ } else { + SordIter* t = sord_search(model, node, uris->rdf_type, NULL, NULL); + for (; !sord_iter_end(t); sord_iter_next(t)) { + if (is_subclass_of(model, uris, + sord_iter_get_node(t, SORD_OBJECT), + type)) { + sord_iter_free(t); + return true; + } + } + sord_iter_free(t); + return false; + } + } else { + return true; // Blanks often lack explicit types, ignore + } + + return false; +} + +int +main(int argc, char** argv) +{ + if (argc < 2) { + return print_usage(argv[0], true); + } + + SordWorld* world = sord_world_new(); + SordModel* model = sord_new(world, SORD_SPO|SORD_OPS, false); + SerdEnv* env = serd_env_new(&SERD_NODE_NULL); + SerdReader* reader = sord_new_reader(model, env, SERD_TURTLE, NULL); + + for (int a = 1; a < argc; ++a) { + const uint8_t* input = (const uint8_t*)argv[a]; + uint8_t* in_path = absolute_path(serd_uri_to_path(input)); + + if (!in_path) { + fprintf(stderr, "Skipping file %s\n", input); + continue; + } + + SerdURI base_uri; + SerdNode base_uri_node = serd_node_new_file_uri( + in_path, NULL, &base_uri, false); + + serd_env_set_base_uri(env, &base_uri_node); + const SerdStatus st = serd_reader_read_file(reader, in_path); + if (st) { + fprintf(stderr, "error reading %s: %s\n", + in_path, serd_strerror(st)); + } + + serd_node_free(&base_uri_node); + } + +#define URI(prefix, suffix) \ + .prefix##_##suffix = sord_new_uri(world, NS_##prefix #suffix) + + URIs uris = { + URI(foaf, Document), + URI(owl, AnnotationProperty), + URI(owl, Class), + URI(owl, DatatypeProperty), + URI(owl, FunctionalProperty), + URI(owl, InverseFunctionalProperty), + URI(owl, ObjectProperty), + URI(owl, OntologyProperty), + URI(owl, Thing), + URI(owl, equivalentClass), + URI(rdf, Property), + URI(rdf, type), + URI(rdfs, Class), + URI(rdfs, Literal), + URI(rdfs, Resource), + URI(rdfs, domain), + URI(rdfs, range), + URI(rdfs, subClassOf), + URI(xsd, pattern), + URI(xsd, string) + }; + +#ifndef HAVE_PCRE + fprintf(stderr, "warning: Built without PCRE, datatypes not 
checked.\n"); +#endif + + SordIter* i = sord_begin(model); + for (; !sord_iter_end(i); sord_iter_next(i)) { + SordQuad quad; + sord_iter_get(i, quad); + + const SordNode* subj = quad[SORD_SUBJECT]; + const SordNode* pred = quad[SORD_PREDICATE]; + const SordNode* obj = quad[SORD_OBJECT]; + + bool is_Property = sord_ask( + model, pred, uris.rdf_type, uris.rdf_Property, 0); + bool is_OntologyProperty = sord_ask( + model, pred, uris.rdf_type, uris.owl_OntologyProperty, 0); + bool is_ObjectProperty = sord_ask( + model, pred, uris.rdf_type, uris.owl_ObjectProperty, 0); + bool is_FunctionalProperty = sord_ask( + model, pred, uris.rdf_type, uris.owl_FunctionalProperty, 0); + bool is_InverseFunctionalProperty = sord_ask( + model, pred, uris.rdf_type, uris.owl_InverseFunctionalProperty, 0); + bool is_DatatypeProperty = sord_ask( + model, pred, uris.rdf_type, uris.owl_DatatypeProperty, 0); + bool is_AnnotationProperty = sord_ask( + model, pred, uris.rdf_type, uris.owl_AnnotationProperty, 0); + + if (!is_Property && !is_OntologyProperty && !is_ObjectProperty && + !is_FunctionalProperty && !is_InverseFunctionalProperty && + !is_DatatypeProperty && !is_AnnotationProperty) { + error("Use of undefined property", quad); + } + + if (is_DatatypeProperty && + sord_node_get_type(obj) != SORD_LITERAL) { + error("Datatype property with non-literal value", quad); + } + + if (is_ObjectProperty && + sord_node_get_type(obj) == SORD_LITERAL) { + error("Object property with literal value", quad); + } + + if (is_FunctionalProperty && + sord_count(model, subj, pred, NULL, NULL) > 1) { + error("Functional property with several objects", quad); + } + + if (is_InverseFunctionalProperty && + sord_count(model, NULL, pred, obj, NULL) > 1) { + error("Inverse functional property with several subjects", quad); + } + + if (sord_node_equals(pred, uris.rdf_type) && + !sord_ask(model, obj, uris.rdf_type, uris.rdfs_Class, NULL) && + !sord_ask(model, obj, uris.rdf_type, uris.owl_Class, NULL)) { + error("Type 
is not a rdfs:Class or owl:Class", quad); + } + + if (sord_node_get_type(obj) == SORD_LITERAL && + !literal_is_valid(model, &uris, obj, sord_node_get_datatype(obj))) { + error("Literal does not match datatype", quad); + } + + SordIter* r = sord_search(model, pred, uris.rdfs_range, NULL, NULL); + if (r) { + const SordNode* range = sord_iter_get_node(r, SORD_OBJECT); + if (!check_type(model, &uris, obj, range)) { + error("Object not in property range", quad); + fprintf(stderr, "note: Range is <%s>\n", + sord_node_get_string(range)); + } + } + + SordIter* d = sord_search(model, pred, uris.rdfs_domain, NULL, NULL); + if (d) { + const SordNode* domain = sord_iter_get_node(d, SORD_OBJECT); + if (!check_type(model, &uris, subj, domain)) { + error("Subject not in property domain", quad); + fprintf(stderr, "note: Domain is <%s>\n", + sord_node_get_string(domain)); + } + } + } + sord_iter_free(i); + + printf("Found %d errors among %d files\n", n_errors, argc - 1); + return 0; +} diff --git a/src/sordi.c b/src/sordi.c index a02472d..9e4ed3a 100644 --- a/src/sordi.c +++ b/src/sordi.c @@ -154,8 +154,8 @@ main(int argc, char** argv) } } - SerdURI base_uri = SERD_URI_NULL; - SerdNode base_uri_node = SERD_NODE_NULL; + SerdURI base_uri = SERD_URI_NULL; + SerdNode base_uri_node = SERD_NODE_NULL; if (a < argc) { // Base URI given on command line base_uri_node = serd_node_new_uri_from_string( (const uint8_t*)argv[a], NULL, &base_uri); @@ -51,6 +51,7 @@ def configure(conf): autowaf.check_pkg(conf, 'serd-0', uselib_store='SERD', atleast_version='0.14.0', mandatory=True) + autowaf.check_pkg(conf, 'libpcre', uselib_store='PCRE', mandatory=False) conf.env['BUILD_TESTS'] = Options.options.build_tests conf.env['BUILD_UTILS'] = True @@ -198,16 +199,17 @@ def build(bld): defines = defines) autowaf.use_lib(bld, obj, 'SERD') - # Command line utility + # Command line utilities if bld.env['BUILD_UTILS']: - obj = bld(features = 'c cprogram', - source = 'src/sordi.c', - includes = ['.', './src'], - 
use = 'libsord', - target = 'sordi', - install_path = '${BINDIR}', - defines = defines) - autowaf.use_lib(bld, obj, 'SERD') + for i in ['sordi', 'sord_validate']: + obj = bld(features = 'c cprogram', + source = 'src/%s.c' % i, + includes = ['.', './src'], + use = 'libsord', + target = i, + install_path = '${BINDIR}', + defines = defines) + autowaf.use_lib(bld, obj, 'SERD PCRE') # Documentation autowaf.build_dox(bld, 'SORD', SORD_VERSION, top, out) |