From 178a5a788b59d429879ffbf393fb92993b25ea2b Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 9 Sep 2012 07:09:58 +0000 Subject: Implement better data type validation in sord_validate conformant with the XSD and OWL specifications. Fix memory leaks in sord_validate. git-svn-id: http://svn.drobilla.net/sord/trunk@261 3d64ff67-21c5-427c-a301-fe4f08042e5a --- NEWS | 7 +- doc/sord_validate.1 | 39 +++++++-- doc/sordi.1 | 2 +- src/sord_validate.c | 226 ++++++++++++++++++++++++++++++++++++++++++---------- wscript | 2 +- 5 files changed, 224 insertions(+), 52 deletions(-) diff --git a/NEWS b/NEWS index 4f2e445..1dcd191 100644 --- a/NEWS +++ b/NEWS @@ -1,8 +1,11 @@ -sord (0.10.1) unstable; +sord (0.10.3) unstable; + * Implement better data type validation in sord_validate conformant with + the XSD and OWL specifications + * Fix memory leaks in sord_validate * Install sord_validate man page - -- David Robillard Sat, 25 Aug 2012 15:23:44 -0400 + -- David Robillard Sun, 09 Sep 2012 01:41:31 -0400 sord (0.10.0) stable; diff --git a/doc/sord_validate.1 b/doc/sord_validate.1 index 602707f..9b8668a 100644 --- a/doc/sord_validate.1 +++ b/doc/sord_validate.1 @@ -6,18 +6,41 @@ .SH SYNOPSIS sord_validate [OPTION]... INPUT... +.SH OPTIONS +.TP +\fB\-h\fR +Print the command line options. + +.TP +\fB\-l\fR +Print errors on a single line. + +.TP +\fB\-v\fR +Display version information and exit. + +.SH DESCRIPTION This is a simple validator which checks that all used properties are actually defined, and that the domain and range of properties is explicitly correct. Note that an "error" from this program does not necessarily mean data is invalid, since it is not required to explicitly list types in RDF, however it -is a good idea to do so. If data type definitions are available with an -xsd:pattern property, literals with that datatype will be checked against the -xsd:pattern (a regular expresssion) to ensure they are valid. 
- This program does not retrieve any data from the web or magical places on the -file system. It only processes files passed directly on the command line. -This means you must pass all used vocabularies to get a useful result. - +is a good idea to do so. + +This program never retrieves data from the web or magical places on the file +system; it only processes files passed directly on the command line. This +means you must pass all used vocabularies to get a useful result. + +If an appropriate schema is available, literals are checked against datatype +definitions (both the explicit datatype of the literal itself as well as any +types implied by the corresponding property). Three XML Schema Datatypes (XSD) +constraints are currently supported: regular expressions (xsd:pattern), and +inclusive range (xsd:minInclusive and xsd:maxInclusive). Given an +appropriate schema, this is enough to validate against most of the standard XSD +datatypes. + +.SH EXAMPLES +sord_validate `find ~/schemas/ -name '*.ttl'` data.ttl + .SH AUTHOR sord_validate was written by David Robillard diff --git a/doc/sordi.1 b/doc/sordi.1 index 8168999..6c53755 100644 --- a/doc/sordi.1 +++ b/doc/sordi.1 @@ -8,7 +8,7 @@ sordi [OPTION]... INPUT BASE_URI .SH OPTIONS .TP -\fB\-h\fR, \fB\-\-help\fR +\fB\-h\fR Print the command line options. 
.TP diff --git a/src/sord_validate.c b/src/sord_validate.c index 9f8c176..2418b87 100644 --- a/src/sord_validate.c +++ b/src/sord_validate.c @@ -51,7 +51,11 @@ typedef struct { SordNode* owl_OntologyProperty; SordNode* owl_Thing; SordNode* owl_equivalentClass; + SordNode* owl_onDatatype; + SordNode* owl_withRestrictions; SordNode* rdf_Property; + SordNode* rdf_first; + SordNode* rdf_rest; SordNode* rdf_type; SordNode* rdfs_Class; SordNode* rdfs_Literal; @@ -59,11 +63,15 @@ typedef struct { SordNode* rdfs_domain; SordNode* rdfs_range; SordNode* rdfs_subClassOf; + SordNode* xsd_decimal; + SordNode* xsd_maxInclusive; + SordNode* xsd_minInclusive; SordNode* xsd_pattern; SordNode* xsd_string; } URIs; int n_errors = 0; +int n_restrictions = 0; bool one_line_errors = false; static int @@ -84,7 +92,9 @@ print_usage(const char* name, bool error) FILE* const os = error ? stderr : stdout; fprintf(os, "Usage: %s [OPTION]... INPUT...\n", name); fprintf(os, "Validate RDF data\n\n"); + fprintf(os, " -h Display this help and exit\n"); fprintf(os, " -l Print errors on a single line.\n"); + fprintf(os, " -v Display version information and exit\n"); fprintf(os, "Validate RDF data. This is a simple validator which checks\n" "that all used properties are actually defined. 
It does not do\n" @@ -119,25 +129,26 @@ error(const char* msg, const SordQuad quad) } static bool -is_subclass_of(SordModel* model, - const URIs* uris, - const SordNode* klass, - const SordNode* super) +is_descendant_of(SordModel* model, + const URIs* uris, + const SordNode* child, + const SordNode* parent, + const SordNode* pred) { - if (!klass) { + if (!child) { return false; - } else if (sord_node_equals(klass, super) || - sord_ask(model, klass, uris->owl_equivalentClass, super, NULL)) { + } else if (sord_node_equals(child, parent) || + sord_ask(model, child, uris->owl_equivalentClass, parent, NULL)) { return true; } - SordIter* i = sord_search(model, klass, uris->rdfs_subClassOf, NULL, NULL); + SordIter* i = sord_search(model, child, pred, NULL, NULL); for (; !sord_iter_end(i); sord_iter_next(i)) { const SordNode* o = sord_iter_get_node(i, SORD_OBJECT); - if (sord_node_equals(klass, o)) { - continue; // Class is explicitly subClassOf itself + if (sord_node_equals(child, o)) { + continue; // Weird class is explicitly a descendent of itself } - if (is_subclass_of(model, uris, o, super)) { + if (is_descendant_of(model, uris, o, parent, pred)) { sord_iter_free(i); return true; } @@ -148,27 +159,122 @@ is_subclass_of(SordModel* model, } static bool -regexp_match(const char* pat, const char* str) +regexp_match(const uint8_t* pat, const char* str) { #ifdef HAVE_PCRE + // Append a $ to the pattern so we only match if the entire string matches + const size_t len = strlen((const char*)pat); + char* const regx = malloc(len + 2); + memcpy(regx, pat, len); + regx[len] = '$'; + regx[len + 1] = '\0'; + const char* err; int erroffset; - pcre* re = pcre_compile(pat, PCRE_ANCHORED, &err, &erroffset, NULL); + pcre* re = pcre_compile(regx, PCRE_ANCHORED, &err, &erroffset, NULL); + free(regx); if (!re) { - fprintf(stderr, "Error in regexp \"%s\" at offset %d (%s)\n", + fprintf(stderr, "Error in pattern `%s' at offset %d (%s)\n", pat, erroffset, err); return false; } - int st = 
pcre_exec(re, NULL, str, strlen(str), 0, 0, NULL, 0); - if (st < 0) { - fprintf(stderr, "Error %d executing regexp \"%s\"\n", st, pat); - return false; - } + const bool ret = pcre_exec(re, NULL, str, strlen(str), 0, 0, NULL, 0) >= 0; + pcre_free(re); + return ret; #endif // HAVE_PCRE return true; } +static bool +check_restriction(SordModel* model, + const URIs* uris, + const SordNode* literal, + const SordNode* type, + const SordNode* restriction) +{ + size_t len = 0; + const char* str = (const char*)sord_node_get_string_counted(literal, &len); + ++n_restrictions; + + // Check xsd:pattern + SordIter* p = sord_search(model, restriction, uris->xsd_pattern, 0, 0); + if (p) { + const SordNode* pat = sord_iter_get_node(p, SORD_OBJECT); + const bool good = regexp_match(sord_node_get_string(pat), str); + if (!good) { + fprintf(stderr, "`%s' does not match <%s> pattern `%s'\n", + sord_node_get_string(literal), + sord_node_get_string(type), + sord_node_get_string(pat)); + } + + sord_iter_free(p); + return good; + } + + /* We'll do some comparison tricks for xsd:decimal types, where + lexicographical comparison would be incorrect. Note that if the + literal's type is a descendant of xsd:decimal, we'll end up checking it + against the xsd:decimal pattern so there's no need to validate digits + here. At worst we'll get a false positive but it will fail later. 
*/ + const bool is_decimal = is_descendant_of( + model, uris, type, uris->xsd_decimal, uris->owl_onDatatype); + + // Check xsd:minInclusive + SordIter* l = sord_search(model, restriction, uris->xsd_minInclusive, 0, 0); + if (l) { + const SordNode* lower = sord_iter_get_node(l, SORD_OBJECT); + size_t lower_len = 0; + const char* lower_str = (const char*)sord_node_get_string_counted(lower, &lower_len); + bool good = false; + if (!is_decimal || len == lower_len) { + // Not decimal, or equal lengths, strcmp + good = (strcmp(str, lower_str) >= 0); + } else { + // Decimal with different length, only good if longer than the min + good = (len > lower_len); + } + if (!good) { + fprintf(stderr, "`%s' is not >= <%s> minimum `%s'\n", + sord_node_get_string(literal), + sord_node_get_string(type), + sord_node_get_string(lower)); + } + + sord_iter_free(l); + return good; + } + + // Check xsd:maxInclusive + SordIter* u = sord_search(model, restriction, uris->xsd_maxInclusive, 0, 0); + if (u) { + const SordNode* upper = sord_iter_get_node(u, SORD_OBJECT); + size_t upper_len = 0; + const char* upper_str = (const char*)sord_node_get_string_counted(upper, &upper_len); + bool good = false; + if (!is_decimal || len == upper_len) { + // Not decimal, or equal lengths, strcmp + good = (strcmp(str, upper_str) <= 0); + } else { + // Decimal with different length, only good if shorter than the max + good = (len < upper_len); + } + if (!good) { + fprintf(stderr, "`%s' is not <= <%s> maximum `%s'\n", + sord_node_get_string(literal), + sord_node_get_string(type), + sord_node_get_string(upper)); + } + + sord_iter_free(u); + return good; + } + + --n_restrictions; + return true; // Unknown restriction, be quietly tolerant +} + static bool literal_is_valid(SordModel* model, const URIs* uris, @@ -179,22 +285,47 @@ literal_is_valid(SordModel* model, return true; } - SordIter* p = sord_search(model, type, uris->xsd_pattern, 0, 0); - const SordNode* pattern = sord_iter_get_node(p, SORD_OBJECT); - if 
(!pattern) { - fprintf(stderr, "warning: No pattern for datatype <%s>\n", - sord_node_get_string(type)); - return true; + // Find restrictions list + SordIter* rs = sord_search(model, type, uris->owl_withRestrictions, 0, 0); + if (sord_iter_end(rs)) { + return true; // No restrictions } - if (regexp_match((const char*)sord_node_get_string(pattern), - (const char*)sord_node_get_string(literal))) { - return true; + + // Walk list, checking each restriction + const SordNode* head = sord_iter_get_node(rs, SORD_OBJECT); + while (head) { + SordIter* f = sord_search(model, head, uris->rdf_first, 0, 0); + if (!f) { + break; // Reached end of restrictions list without failure + } + + // Check this restriction + const bool good = check_restriction( + model, uris, literal, type, sord_iter_get_node(f, SORD_OBJECT)); + sord_iter_free(f); + + if (!good) { + sord_iter_free(rs); + return false; // Failed, literal is invalid + } + + // Seek to next list node + SordIter* n = sord_search(model, head, uris->rdf_rest, 0, 0); + head = n ? 
sord_iter_get_node(n, SORD_OBJECT) : NULL; + sord_iter_free(n); } - fprintf(stderr, "Literal \"%s\" does not match <%s> pattern \"%s\"\n", - sord_node_get_string(literal), - sord_node_get_string(type), - sord_node_get_string(pattern)); - return false; + + sord_iter_free(rs); + + SordIter* s = sord_search(model, type, uris->owl_onDatatype, 0, 0); + if (s) { + const SordNode* super = sord_iter_get_node(s, SORD_OBJECT); + const bool good = literal_is_valid(model, uris, literal, super); + sord_iter_free(s); + return good; // Match iff literal also matches supertype + } + + return true; // Matches top level type } static bool @@ -213,9 +344,7 @@ check_type(SordModel* model, sord_node_equals(type, uris->xsd_string)) { return true; } else { - const SordNode* datatype = sord_node_get_datatype(node); - return is_subclass_of(model, uris, datatype, type) || - literal_is_valid(model, uris, node, type); + return literal_is_valid(model, uris, node, type); } } else if (sord_node_get_type(node) == SORD_URI) { if (sord_node_equals(type, uris->foaf_Document)) { @@ -223,9 +352,10 @@ check_type(SordModel* model, } else { SordIter* t = sord_search(model, node, uris->rdf_type, NULL, NULL); for (; !sord_iter_end(t); sord_iter_next(t)) { - if (is_subclass_of(model, uris, - sord_iter_get_node(t, SORD_OBJECT), - type)) { + if (is_descendant_of(model, uris, + sord_iter_get_node(t, SORD_OBJECT), + type, + uris->rdfs_subClassOf)) { sord_iter_free(t); return true; } @@ -285,7 +415,10 @@ main(int argc, char** argv) } serd_node_free(&base_uri_node); + free(in_path); } + serd_reader_free(reader); + serd_env_free(env); #define URI(prefix, suffix) \ uris.prefix##_##suffix = sord_new_uri(world, NS_##prefix #suffix) @@ -301,7 +434,11 @@ main(int argc, char** argv) URI(owl, OntologyProperty); URI(owl, Thing); URI(owl, equivalentClass); + URI(owl, onDatatype); + URI(owl, withRestrictions); URI(rdf, Property); + URI(rdf, first); + URI(rdf, rest); URI(rdf, type); URI(rdfs, Class); URI(rdfs, Literal); @@ 
-309,6 +446,9 @@ main(int argc, char** argv) URI(rdfs, domain); URI(rdfs, range); URI(rdfs, subClassOf); + URI(xsd, decimal); + URI(xsd, maxInclusive); + URI(xsd, minInclusive); URI(xsd, pattern); URI(xsd, string); @@ -385,6 +525,7 @@ main(int argc, char** argv) fprintf(stderr, "note: Range is <%s>\n", sord_node_get_string(range)); } + sord_iter_free(r); } SordIter* d = sord_search(model, pred, uris.rdfs_domain, NULL, NULL); @@ -395,10 +536,15 @@ main(int argc, char** argv) fprintf(stderr, "note: Domain is <%s>\n", sord_node_get_string(domain)); } + sord_iter_free(d); } } sord_iter_free(i); - printf("Found %d errors among %d files\n", n_errors, argc - 1); + printf("Found %d errors among %d files (checked %d restrictions)\n", + n_errors, argc - 1, n_restrictions); + + sord_free(model); + sord_world_free(world); return 0; } diff --git a/wscript b/wscript index d186dfa..3391831 100644 --- a/wscript +++ b/wscript @@ -7,7 +7,7 @@ from waflib.extras import autowaf as autowaf import waflib.Logs as Logs, waflib.Options as Options # Version of this package (even if built as a child) -SORD_VERSION = '0.10.1' +SORD_VERSION = '0.10.3' SORD_MAJOR_VERSION = '0' # Library version (UNIX style major, minor, micro) -- cgit v1.2.1