diff options
author | David Robillard <d@drobilla.net> | 2020-12-19 22:07:41 +0100 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2021-03-08 23:34:56 -0500 |
commit | 00456f395e50b43a9f26ff1479b9ef8b06a4d219 (patch) | |
tree | 31c22b3bd12d8e1bb6597c53fade859f6ca37c79 /subprojects/rerex | |
parent | ec0edd399d3ef493c4ddb0e821a56b53a19dbc96 (diff) | |
download | serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.gz serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.bz2 serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.zip |
Add rerex from git@gitlab.com:drobilla/rerex 2420851
Diffstat (limited to 'subprojects/rerex')
-rw-r--r-- | subprojects/rerex/.clang-format | 21 | ||||
-rw-r--r-- | subprojects/rerex/.clang-tidy | 6 | ||||
-rw-r--r-- | subprojects/rerex/.gitignore | 1 | ||||
-rw-r--r-- | subprojects/rerex/.gitlab-ci.yml | 165 | ||||
-rw-r--r-- | subprojects/rerex/CONTRIBUTING.md | 32 | ||||
-rw-r--r-- | subprojects/rerex/COPYING | 13 | ||||
-rw-r--r-- | subprojects/rerex/NEWS | 5 | ||||
-rw-r--r-- | subprojects/rerex/README.md | 158 | ||||
-rw-r--r-- | subprojects/rerex/include/rerex/rerex.h | 112 | ||||
-rw-r--r-- | subprojects/rerex/meson.build | 121 | ||||
-rw-r--r-- | subprojects/rerex/meson/meson.build | 204 | ||||
-rw-r--r-- | subprojects/rerex/meson_options.txt | 5 | ||||
-rw-r--r-- | subprojects/rerex/src/.clang-tidy | 8 | ||||
-rw-r--r-- | subprojects/rerex/src/rerex.c | 765 | ||||
-rw-r--r-- | subprojects/rerex/test/test_match.c | 238 | ||||
-rw-r--r-- | subprojects/rerex/test/test_syntax.c | 99 | ||||
-rw-r--r-- | subprojects/rerex/test/test_xsd.c | 522 |
17 files changed, 2475 insertions, 0 deletions
diff --git a/subprojects/rerex/.clang-format b/subprojects/rerex/.clang-format new file mode 100644 index 00000000..fcdbab81 --- /dev/null +++ b/subprojects/rerex/.clang-format @@ -0,0 +1,21 @@ +--- +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: true +AlignEscapedNewlinesLeft: true +BasedOnStyle: Mozilla +BraceWrapping: + AfterEnum: false + AfterExternBlock: false + AfterFunction: true + AfterStruct: false + AfterUnion: false +BreakBeforeBraces: Custom +Cpp11BracedListStyle: true +IndentCaseLabels: false +IndentPPDirectives: AfterHash +KeepEmptyLinesAtTheStartOfBlocks: false +SpacesInContainerLiterals: false +StatementMacros: + - REREX_API + - REREX_CONST_API +... diff --git a/subprojects/rerex/.clang-tidy b/subprojects/rerex/.clang-tidy new file mode 100644 index 00000000..3a0d4d70 --- /dev/null +++ b/subprojects/rerex/.clang-tidy @@ -0,0 +1,6 @@ +Checks: > + *, + -llvmlibc*, +WarningsAsErrors: '*' +HeaderFilterRegex: '.*' +FormatStyle: file diff --git a/subprojects/rerex/.gitignore b/subprojects/rerex/.gitignore new file mode 100644 index 00000000..84c048a7 --- /dev/null +++ b/subprojects/rerex/.gitignore @@ -0,0 +1 @@ +/build/ diff --git a/subprojects/rerex/.gitlab-ci.yml b/subprojects/rerex/.gitlab-ci.yml new file mode 100644 index 00000000..5e7cb7bb --- /dev/null +++ b/subprojects/rerex/.gitlab-ci.yml @@ -0,0 +1,165 @@ +stages: + - build + - deploy + +.build_template: &build_definition + stage: build + + +arm32_dbg: + <<: *build_definition + image: lv2plugin/debian-arm32 + script: + - meson . build --cross-file=/usr/share/meson/cross/arm-linux-gnueabihf.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + +arm32_rel: + <<: *build_definition + image: lv2plugin/debian-arm32 + script: + - meson . build --cross-file=/usr/share/meson/cross/arm-linux-gnueabihf.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +arm64_dbg: + <<: *build_definition + image: lv2plugin/debian-arm64 + script: + - meson . build --cross-file=/usr/share/meson/cross/aarch64-linux-gnu.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + +arm64_rel: + <<: *build_definition + image: lv2plugin/debian-arm64 + script: + - meson . build --cross-file=/usr/share/meson/cross/aarch64-linux-gnu.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +x32_dbg: + <<: *build_definition + image: lv2plugin/debian-x32 + script: + - meson . build --cross-file=/usr/share/meson/cross/i686-linux-gnu.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true + - ninja -C build test + +x32_rel: + <<: *build_definition + image: lv2plugin/debian-x32 + script: + - meson . build --cross-file=/usr/share/meson/cross/i686-linux-gnu.ini -Dbuildtype=release -Dstrict=true -Dwerror=true + - ninja -C build test + + +x64_dbg: + <<: *build_definition + image: lv2plugin/debian-x64 + script: + - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true -Db_coverage=true + - ninja -C build test + - ninja -C build coverage-html + artifacts: + paths: + - build/meson-logs/coveragereport + +x64_rel: + <<: *build_definition + image: lv2plugin/debian-x64 + script: + - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +x64_static: + <<: *build_definition + image: lv2plugin/debian-x64 + script: + - meson . build -Ddefault_library=static -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +x64_sanitize: + <<: *build_definition + image: lv2plugin/debian-x64-clang + script: + - meson . build -Db_lundef=false -Dbuildtype=plain -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + variables: + CC: "clang" + CFLAGS: "-fno-sanitize-recover=all -fsanitize=address -fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=unsigned-integer-overflow -fsanitize=implicit-conversion -fsanitize=local-bounds -fsanitize=nullability" + LDFLAGS: "-fno-sanitize-recover=all -fsanitize=address -fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=unsigned-integer-overflow -fsanitize=implicit-conversion -fsanitize=local-bounds -fsanitize=nullability" + + +mingw32_dbg: + <<: *build_definition + image: lv2plugin/debian-mingw32 + script: + - meson . build --cross-file=/usr/share/meson/cross/i686-w64-mingw32.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + +mingw32_rel: + <<: *build_definition + image: lv2plugin/debian-mingw32 + script: + - meson . build --cross-file=/usr/share/meson/cross/i686-w64-mingw32.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +mingw64_dbg: + <<: *build_definition + image: lv2plugin/debian-mingw64 + script: + - meson . build --cross-file=/usr/share/meson/cross/x86_64-w64-mingw32.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + +mingw64_rel: + <<: *build_definition + image: lv2plugin/debian-mingw64 + script: + - meson . build --cross-file=/usr/share/meson/cross/x86_64-w64-mingw32.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +mac_dbg: + <<: *build_definition + tags: [macos] + script: + - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + +mac_rel: + <<: *build_definition + tags: [macos] + script: + - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +win_dbg: + <<: *build_definition + tags: [windows,meson] + script: + - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + +win_rel: + <<: *build_definition + tags: [windows,meson] + script: + - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true + - ninja -C build test + + +pages: + stage: deploy + script: + - mkdir -p .public + - mv build/meson-logs/coveragereport/ .public/coverage + - mv .public public + dependencies: + - x64_dbg + artifacts: + paths: + - public + only: + - master diff --git a/subprojects/rerex/CONTRIBUTING.md b/subprojects/rerex/CONTRIBUTING.md new file mode 100644 index 00000000..c942308a --- /dev/null +++ b/subprojects/rerex/CONTRIBUTING.md @@ -0,0 +1,32 @@ +Contributing to Rerex +===================== + +Feel free to tinker with Rerex, I tried to make the code as easy to follow as I +could while preserving the minimalist spirit of it. Implementing character +sets and escapes and so on have made things a little bit hairier than a +textbook example, but hopefully it is still easy to digest. + +If you want to make a contribution, feel free to send patches, but note that I +would like to preserve the cleanliness avoid bloating things too much, so I may +not accept patches for features that are too complicated. Bug fixes or other +improvements to the code are, of course, most welcome. + +That said, this is, of course, free software, so you're free to take it and do +as you please. I'd be happy to chat about it, these are just guidelines for +what I feel would be appropriate for inclusion in the project. + +Happy hacking. + +Tooling +------- + +All of the code is machine formatted, the style is defined by the included +`clang-format` configuration. You can use `ninja clang-format` to format +everything. + +Configuring with `-Dstrict=true` will enable the ultra-strict warnings that are +run on CI (nearly all of them). Configuring with `-Dwerror=true` will make +them errors. + +A `clang-tidy` configuration is included for more intensive static checking, +you can run all the checks with `ninja clang-tidy`. diff --git a/subprojects/rerex/COPYING b/subprojects/rerex/COPYING new file mode 100644 index 00000000..0ba77822 --- /dev/null +++ b/subprojects/rerex/COPYING @@ -0,0 +1,13 @@ +Copyright 2020 David Robillard <d@drobilla.net> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\ No newline at end of file diff --git a/subprojects/rerex/NEWS b/subprojects/rerex/NEWS new file mode 100644 index 00000000..41f780a0 --- /dev/null +++ b/subprojects/rerex/NEWS @@ -0,0 +1,5 @@ +rerex (0.0.1) unstable; + + * Initial release + + -- David Robillard <d@drobilla.net> Wed, 09 Dec 2020 18:01:50 +0000 diff --git a/subprojects/rerex/README.md b/subprojects/rerex/README.md new file mode 100644 index 00000000..a2571cee --- /dev/null +++ b/subprojects/rerex/README.md @@ -0,0 +1,158 @@ +Rerex +===== + +Rerex (REally REgular eXpressions) is a simple and efficient NFA-based regular +expression implementation in C99. + +Rerex parses a regular expression into an NFA, then simulates execution using +Thompson's algorithm, so matching is linear in the size of the input. For +details on the theory behind this approach, see [Programming Techniques: +Regular expression search algorithm], the classic paper by Ken Thompson. +[Regular Expression Matching Can Be Simple And Fast] is also a good read, which +compares this approach with those used in most modern regular expression +implementations. The matching algorithm in Rerex is effectively the same as +the one described there, though with a slightly different implementation. +Parsing and construction are completely different, using a recursive-descent +parser which constructs the automata on the fly. + +This is a minimalist implementation with a limited feature set which is mainly +suitable for basic applications like checking markup or programming language +syntax. It does, however, strive to be clear and high-quality code which is a +bit more fully-featured than a simple example, and suitable for real-world use. +For example, character set expressions are supported, and meaningful errors are +reported with the offset of the problematic character. + +The generated automata are not always optimal, though there are a few simple +optimizations that avoid generating redundant states. Though there is room for +improvement there (at the cost of additional complexity), matching performs +relatively well. + +Implementation +-------------- + +The implementation is clean and warning-free C99, tested and known to work with +at least clang, GCC, and MSVC on POSIX, MacOS, and Windows. It weighs in at +under 600 [SLOC][], which can be compiled to about 5k on x86. + +Additionally, a simple test suite is included which covers 100% of the code. +It is run continuously on all of the above mentioned platforms on x64, as well +as in Linux on 32 and 64 bit ARM via emulation. + +Supported Grammar +----------------- + +Rerex supports a basic subset of classic regular expression syntax that is +supported by almost all implementations. The grammar is written here in [XML +EBNF notation][] with terminals in uppercase: + + DOT ::= '.' + OPERATOR ::= '*' | '+' | '?' + SPECIAL ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}' + ESCAPABLE ::= SPECIAL | '-' + ESCAPE ::= '\' SPECIAL + CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL + ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']') + Range ::= ELEMENT | ELEMENT '-' ELEMENT + Set ::= Range | Range Set + Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']' + Factor ::= Atom | Atom OPERATOR + Term ::= Factor | Factor Term + Expr ::= Term | Term '|' Expr + +Notable Limitations +------------------- + +Rerex is not meant to replace fully-featured regular expression libraries, so +the feature set is somewhat limited. The most glaring omissions include: + + - Only supports printable ASCII input + - Only supports anchored matching (no back references or group extraction) + - Only reads from a null-terminated string (not, for example, files) + - No support for counted replication with `{}` + +Should I Use This? +------------------ + +If you just want to understand how regular expressions work, and have some +clean code that's easy to understand and tinker with, sure. + +If you need a minimal regular expression matching implementation in portable C +for basic ASCII tasks, maybe. + +If you need a fully-featured regular expression implementation for +international text with group extraction and other more advanced features, +probably not. + +In any case, if you do, letting me know or linking to this page would be +appreciated. + +Dependencies +------------ + +None, except the C standard library. + +Building +-------- + +Rerex consists only of a single header and configuration-free source file, so +building it with any compiler manually should be straightforward. It should, +in theory, work just about anywhere. + +A [Meson][] build definition is included which can be used to do a proper +system installation with a `pkg-config` file, generate IDE projects, run the +tests, and so on. For example, the library and tests can be built and run like +so: + + meson setup -Dtests=true build + cd build + ninja test + +See the [Meson documentation][] for more details on using Meson. + +Usage +----- + +Rerex has a small API based around two objects: a _pattern_, which is an +immutable compiled representation of a regular expression, and a _matcher_ +which can be used to match strings against a pattern. See [the +header](include/rerex/rerex.h) for details, there is no external documentation +aside from this README. + +This simple example function uses everything in the API to provide a high-level +string-based match function: + +```c +bool matches(const char* regexp, const char* string) +{ + RerexPattern* pattern = NULL; + size_t end = 0; + RerexStatus st = rerex_compile(regexp, &end, &pattern); + + if (st) { + fprintf(stderr, "(string):%zu: error: %s\n", end, rerex_strerror(st)); + return false; + } + + RerexMatcher* matcher = rerex_new_matcher(pattern); + bool matches = rerex_match(matcher, string); + + rerex_free_matcher(matcher); + rerex_free_pattern(pattern); + + return matches; +} +``` + + -- David Robillard <d@drobilla.net> + +[XML EBNF notation]: http://www.w3.org/TR/REC-xml/#sec-notation + +[SLOC]: https://en.wikipedia.org/wiki/Source_lines_of_code + +[Meson]: https://mesonbuild.com/ + +[Meson documentation]: https://mesonbuild.com/Quick-guide.html + +[Programming Techniques: Regular expression search algorithm]: https://dl.acm.org/doi/10.1145/363347.363387 + +[Regular Expression Matching Can Be Simple And Fast]: https://swtch.com/~rsc/regexp/regexp1.html diff --git a/subprojects/rerex/include/rerex/rerex.h b/subprojects/rerex/include/rerex/rerex.h new file mode 100644 index 00000000..507a3a96 --- /dev/null +++ b/subprojects/rerex/include/rerex/rerex.h @@ -0,0 +1,112 @@ +/* + Copyright 2020 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef REREX_REREX_H +#define REREX_REREX_H + +#include <stdbool.h> +#include <stddef.h> + +#if defined(_WIN32) && !defined(REREX_STATIC) && defined(REREX_INTERNAL) +# define REREX_API __declspec(dllexport) +#elif defined(_WIN32) && !defined(REREX_STATIC) +# define REREX_API __declspec(dllimport) +#elif defined(__GNUC__) +# define REREX_API __attribute__((visibility("default"))) +#else +# define REREX_API +#endif + +#if defined(__GNUC__) +# define REREX_CONST_API __attribute__((const)) REREX_API +#else +# define REREX_CONST_API REREX_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/// Return status code +typedef enum { + REREX_SUCCESS, + REREX_EXPECTED_CHAR, + REREX_EXPECTED_ELEMENT, + REREX_EXPECTED_RBRACKET, + REREX_EXPECTED_RPAREN, + REREX_EXPECTED_SPECIAL, + REREX_UNEXPECTED_SPECIAL, + REREX_UNEXPECTED_END, + REREX_UNORDERED_RANGE, +} RerexStatus; + +/// Pattern that represents a compiled valid regular expression +typedef struct RerexPatternImpl RerexPattern; + +/// Matcher that can be used to match strings against a pattern +typedef struct RerexMatcherImpl RerexMatcher; + +/// Return a human-readable description of `status` +REREX_CONST_API +const char* +rerex_strerror(RerexStatus status); + +/** + Build a regular expression from a pattern string. + + The `pattern` must be a null-terminated string in supported regular + expression syntax. Upon return, `end` is set to the offset of the last + character processed. On success, `REREX_SUCCESS` is returned, and `out` is + pointed to a newly allocated pattern which must be freed with + rerex_free_pattern(). On error, `out` is unchanged, an error code is + returned, and `end` can be used to determine the position of the error in + the pattern. +*/ +REREX_API +RerexStatus +rerex_compile(const char* pattern, size_t* end, RerexPattern** out); + +/** + Allocate a new matcher for matching against a pattern. + + The returned matcher can be used to match several strings against a single + pattern using rerex_match(). It is newly allocated and must be freed with + rerex_free_matcher(). +*/ +REREX_API +RerexMatcher* +rerex_new_matcher(const RerexPattern* regexp); + +/// Return true if `string` matches the pattern of `matcher` +REREX_API +bool +rerex_match(RerexMatcher* matcher, const char* string); + +/// Free a matcher allocated with rerex_new_matcher() +REREX_API +void +rerex_free_matcher(RerexMatcher* matcher); + +/// Free a pattern allocated with rerex_compile() +REREX_API +void +rerex_free_pattern(RerexPattern* expression); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // REREX_REREX_H diff --git a/subprojects/rerex/meson.build b/subprojects/rerex/meson.build new file mode 100644 index 00000000..af64fac9 --- /dev/null +++ b/subprojects/rerex/meson.build @@ -0,0 +1,121 @@ +project('rerex', ['c'], + version: '0.0.1', + license: 'ISC', + meson_version: '>= 0.49.0', + default_options: [ + 'b_ndebug=if-release', + 'buildtype=release', + 'c_std=c99', + 'default_library=shared', + 'warning_level=3', + ]) + +major_version = meson.project_version().split('.')[0] +version_suffix = '-@0@'.format(major_version) +versioned_name = 'rerex' + version_suffix + +# Load build tools +pkg = import('pkgconfig') +cc = meson.get_compiler('c') + +# Set ultra strict warnings for developers, if requested +c_warnings = [] +c_suppressions = [] +if get_option('strict') + if not meson.is_subproject() + subdir('meson') + add_project_arguments(all_c_warnings, language: ['c']) + endif + + if cc.get_id() == 'gcc' + c_suppressions = [ + '-Wno-aggregate-return', + '-Wno-switch-default', + ] + elif cc.get_id() == 'msvc' + c_suppressions = [ + '/wd4706', # assignment within conditional expression + '/wd4711', # function selected for automatic inline expansion + '/wd5045', # will insert Spectre mitigation + ] + else + c_suppressions = [] + endif +endif + +rerex_c_args = cc.get_supported_arguments(c_suppressions) + +if cc.get_id() == 'msvc' + # Build as C++ + add_project_arguments(['/TP'], language: ['c']) + + # Suppress warnings in system headers + add_project_arguments(['/experimental:external', + '/external:W0', + '/external:anglebrackets'], + language: ['c']) +endif + +# Determine library type and the flags needed to build it +if get_option('default_library') == 'both' + if host_machine.system() == 'windows' + error('default_library=both is not supported on Windows') + endif + + library_type = 'both_libraries' + library_args = ['-DREREX_INTERNAL'] +elif get_option('default_library') == 'shared' + library_type = 'shared_library' + library_args = ['-DREREX_INTERNAL'] +else + library_type = 'static_library' + library_args = ['-DREREX_INTERNAL', '-DREREX_STATIC'] +endif + +# Build shared and/or static library/libraries +librerex = build_target( + versioned_name, + ['src/rerex.c'], + version: meson.project_version(), + include_directories: include_directories(['include']), + c_args: rerex_c_args + library_args, + gnu_symbol_visibility: 'hidden', + install: true, + target_type: library_type) + +# Generage pkg-config file +pkg.generate( + librerex, + name: 'Rerex', + filebase: versioned_name, + subdirs: [versioned_name], + version: meson.project_version(), + description: 'A simple and efficient regular expression implementation') + +# Install header to a versioned include directory +install_headers(['include/rerex/rerex.h'], subdir: versioned_name / 'rerex') + +rerex_dep = declare_dependency( + link_with: librerex, + include_directories: include_directories(['include'])) + +# Build and run tests +if get_option('tests') + foreach name : ['syntax', 'match', 'xsd'] + full_name = 'test_@0@'.format(name) + test(full_name, + executable(full_name, + 'test/@0@.c'.format(full_name), + c_args: rerex_c_args, + include_directories: include_directories(['include']), + dependencies: rerex_dep)) + endforeach +endif + +if meson.version().version_compare('>=0.53.0') + summary('Tests', get_option('tests'), bool_yn: true) + + summary('Install prefix', get_option('prefix')) + summary('Headers', get_option('prefix') / get_option('includedir')) + summary('Libraries', get_option('prefix') / get_option('libdir')) +endif diff --git a/subprojects/rerex/meson/meson.build b/subprojects/rerex/meson/meson.build new file mode 100644 index 00000000..5dd0de38 --- /dev/null +++ b/subprojects/rerex/meson/meson.build @@ -0,0 +1,204 @@ +# General code to enable approximately all warnings. +# +# This is trivial for clang and MSVC, but GCC does not have such an option, and +# has several esoteric warnings, so we need to enable everything we want +# explicitly. We enable everything that does not require a value argument, +# except for warnings that are only relevant for very old languages (earlier +# than C99 or C++11) or non-standard extensions. +# +# Omitted common warnings: +# +# Wabi= +# Waggregate-return +# Walloc-size-larger-than=BYTES +# Walloca-larger-than=BYTES +# Wframe-larger-than=BYTES +# Wlarger-than=BYTES +# Wstack-usage=BYTES +# Wsystem-headers +# Wtraditional +# Wtraditional-conversion +# Wtrampolines +# Wvla-larger-than=BYTES +# +# Omitted C warnings: +# +# Wc90-c99-compat +# Wdeclaration-after-statement +# Wtraditional +# Wtraditional-conversion +# +# Omitted C++ warnings: +# +# Wnamespaces +# Wtemplates + +gcc_common_warnings = [ + '-Walloc-zero', + '-Walloca', + '-Wanalyzer-too-complex', + '-Warith-conversion', + '-Warray-bounds=2', + '-Wattribute-alias=2', + '-Wcast-align=strict', + '-Wcast-qual', + '-Wconversion', + '-Wdate-time', + '-Wdisabled-optimization', + '-Wdouble-promotion', + '-Wduplicated-branches', + '-Wduplicated-cond', + '-Wfloat-equal', + '-Wformat-overflow=2', + '-Wformat-signedness', + '-Wformat-truncation=2', + '-Wformat=2', + '-Wimplicit-fallthrough=2', + '-Winit-self', + '-Winline', + '-Winvalid-pch', + '-Wlogical-op', + '-Wmissing-declarations', + '-Wmissing-include-dirs', + '-Wmultichar', + '-Wnormalized=nfc', + '-Wnull-dereference', + '-Wpacked', + '-Wpadded', + '-Wredundant-decls', + '-Wscalar-storage-order', + '-Wshadow', + '-Wshift-overflow=2', + '-Wsizeof-array-argument', + '-Wstack-protector', + '-Wstrict-aliasing=3', + '-Wstrict-overflow=5', + '-Wstringop-overflow=3', + '-Wsuggest-attribute=cold', + '-Wsuggest-attribute=const', + '-Wsuggest-attribute=format', + '-Wsuggest-attribute=malloc', + '-Wsuggest-attribute=noreturn', + '-Wsuggest-attribute=pure', + '-Wswitch-default', + '-Wswitch-enum', + '-Wsync-nand', + '-Wundef', + '-Wunused-const-variable=2', + '-Wunused-macros', + '-Wvarargs', + '-Wvector-operation-performance', + '-Wvla', + '-Wwrite-strings', +] + +gcc_c_warnings = [ + '-Wbad-function-cast', + '-Wc++-compat', + '-Wc99-c11-compat', + '-Wdesignated-init', + '-Wdiscarded-array-qualifiers', + '-Wdiscarded-qualifiers', + '-Wincompatible-pointer-types', + '-Wjump-misses-init', + '-Wmissing-prototypes', + '-Wnested-externs', + '-Wold-style-definition', + '-Wstrict-prototypes', + '-Wunsuffixed-float-constants', +] + +# Set all_c_warnings for the current C compiler +if is_variable('cc') and not is_variable('all_c_warnings') + if cc.get_id() == 'clang' + all_c_warnings = ['-Weverything'] + elif cc.get_id() == 'gcc' + all_c_warnings = gcc_common_warnings + [ + '-Wbad-function-cast', + '-Wc++-compat', + '-Wc99-c11-compat', + '-Wdesignated-init', + '-Wdiscarded-array-qualifiers', + '-Wdiscarded-qualifiers', + '-Wincompatible-pointer-types', + '-Wjump-misses-init', + '-Wmissing-prototypes', + '-Wnested-externs', + '-Wold-style-definition', + '-Wstrict-prototypes', + '-Wunsuffixed-float-constants', + ] + elif cc.get_id() == 'msvc' + all_c_warnings = ['/Wall'] + else + all_c_warnings = [] + endif + + all_c_warnings = cc.get_supported_arguments(all_c_warnings) + +endif + +# Set all_cpp_warnings for the current C++ compiler +if is_variable('cpp') and not is_variable('all_cpp_warnings') + if cpp.get_id() == 'clang' + all_cpp_warnings = [ + '-Weverything', + '-Wno-c++98-compat', + '-Wno-c++98-compat-pedantic' + ] + elif cpp.get_id() == 'gcc' + all_cpp_warnings = gcc_common_warnings + [ + '-Wabi-tag', + '-Waligned-new=all', + '-Wcatch-value=3', + '-Wcomma-subscript', + '-Wconditionally-supported', + '-Wctor-dtor-privacy', + '-Wdeprecated-copy-dtor', + '-Weffc++', + '-Wextra-semi', + '-Wmismatched-tags', + '-Wmultiple-inheritance', + '-Wnoexcept', + '-Wnoexcept-type', + '-Wnon-virtual-dtor', + '-Wold-style-cast', + '-Woverloaded-virtual', + '-Wplacement-new=2', + '-Wredundant-tags', + '-Wregister', + '-Wsign-promo', + '-Wstrict-null-sentinel', + '-Wsuggest-final-methods', + '-Wsuggest-final-types', + '-Wsuggest-override', + '-Wuseless-cast', + '-Wvirtual-inheritance', + '-Wvolatile', + '-Wzero-as-null-pointer-constant', + ] + elif cpp.get_id() == 'msvc' + all_cpp_warnings = ['/Wall'] + else + all_cpp_warnings = [] + endif + + all_cpp_warnings = cpp.get_supported_arguments(all_cpp_warnings) + +endif + +# Set all_objc_warnings for the current Objective C compiler +if is_variable('objcc') and not is_variable('all_objc_warnings') + all_objc_warnings = [] + if objcc.get_id() == 'clang' + all_objc_warnings = ['-Weverything'] + elif objc.get_id() == 'gcc' + all_objc_warnings = gcc_common_warnings + [ + '-Wno-direct-ivar-access', + ] + else + all_objc_warnings = [] + endif + + all_objc_warnings = objcc.get_supported_arguments(all_objc_warnings) +endif diff --git a/subprojects/rerex/meson_options.txt b/subprojects/rerex/meson_options.txt new file mode 100644 index 00000000..f2b59781 --- /dev/null +++ b/subprojects/rerex/meson_options.txt @@ -0,0 +1,5 @@ +option('strict', type: 'boolean', value: false, + description: 'Enable ultra-strict warnings') + +option('tests', type: 'boolean', value: true, + description: 'Build tests') diff --git a/subprojects/rerex/src/.clang-tidy b/subprojects/rerex/src/.clang-tidy new file mode 100644 index 00000000..420ce833 --- /dev/null +++ b/subprojects/rerex/src/.clang-tidy @@ -0,0 +1,8 @@ +Checks: > + *, + -hicpp-multiway-paths-covered, + -llvmlibc-*, + -misc-no-recursion, +WarningsAsErrors: '*' +HeaderFilterRegex: '.*' +FormatStyle: file diff --git a/subprojects/rerex/src/rerex.c b/subprojects/rerex/src/rerex.c new file mode 100644 index 00000000..49970a45 --- /dev/null +++ b/subprojects/rerex/src/rerex.c @@ -0,0 +1,765 @@ +/* + Copyright 2020 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "rerex/rerex.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +static const char cmin = 0x20; // Inclusive minimum normal character +static const char cmax = 0x7E; // Inclusive maximum normal character + +const char* +rerex_strerror(const RerexStatus status) +{ + switch (status) { + case REREX_SUCCESS: + return "Success"; + case REREX_EXPECTED_CHAR: + return "Expected a regular character"; + case REREX_EXPECTED_ELEMENT: + return "Expected a character in a set"; + case REREX_EXPECTED_RBRACKET: + return "Expected ']'"; + case REREX_EXPECTED_RPAREN: + return "Expected ')'"; + case REREX_EXPECTED_SPECIAL: + return "Expected a special character (one of \"()*+-?[]^|\")"; + case REREX_UNEXPECTED_SPECIAL: + return "Unexpected special character"; + case REREX_UNEXPECTED_END: + return "Unexpected end of input"; + case REREX_UNORDERED_RANGE: + return "Range is out of order"; + } + + return "Unknown error"; +} + +/* State */ + +#define NO_STATE 0 + +// The ID for a state, which is an index into the state array +typedef size_t StateIndex; + +// A code point (currently only 8-bit ASCII but we use the space anyway) +typedef int Codepoint; + +// Special type for states with only epsilon out arcs +typedef enum { + REREX_MATCH = 0xE000, ///< Matching state, no out arcs + REREX_SPLIT = 0xE001, ///< Splitting state, one or two out arcs +} StateType; + +/* A state in an NFA. + + A state in Thompson's NFA can have either a single character-labeled + transition to another state, or up to two unlabeled epsilon transitions to + other states. There is both a minimum and maximum label for supporting + character ranges. So, either `min` and `max` are ASCII characters that are + the label of an arc to next1 (and next2 is null), or `min` is a special + StateType and next1 and/or next2 may be set to successor states. +*/ +typedef struct { + StateIndex next1; ///< Head of first out arc (or NULL) + StateIndex next2; ///< Head of second out arc (or NULL) + Codepoint min; ///< Special type, or inclusive min label for next1 + Codepoint max; ///< Inclusive max label for next2 +} State; + +// Create a match (end) state with no successors +static State +match_state(void) +{ + const State s = {NO_STATE, NO_STATE, REREX_MATCH, 0}; + return s; +} + +// Create a split state with at most two successors +static State +split_state(const StateIndex next1, const StateIndex next2) +{ + const State s = {next1, next2, REREX_SPLIT, 0}; + return s; +} + +// Create a labeled state with one successor reached by a character arc +static State +range_state(const char min, const char max, const StateIndex next) +{ + const State s = {next, NO_STATE, min, max}; + return s; +} + +/* Array of states. + + States are stored in a flat array to reduce memory fragmentation, and for + easy memory management since the automata graph may be cyclic. This simple + implementation calls realloc() for every state, which isn't terribly + efficient, but works well enough. Note that state addresses therefore change + during compilation, so states are generally referred to by their index, and + not by pointer. Conveniently, using indices is also useful during matching + for storing auxiliary information about states. +*/ +typedef struct { + State* states; + size_t n_states; +} StateArray; + +// Append a new state to the end of the state array +static StateIndex +add_state(StateArray* const array, const State state) +{ + const size_t new_n_states = array->n_states + 1; + const size_t new_size = new_n_states * sizeof(State); + + array->states = (State*)realloc(array->states, new_size); + array->states[array->n_states] = state; + + return array->n_states++; +} + +/* Automata. + * + * This is a lightweiht description of an NFA fragment. The states are stored + * elsewhere, this is just used to provide a facade to conceptually work with + * automata for high-level operations. + */ +typedef struct { + StateIndex start; + StateIndex end; +} Automata; + +// Simple utility function for making an automata in an expression +static Automata +make_automata(const StateIndex start, const StateIndex end) +{ + Automata result = {start, end}; + return result; +} + +// Return whether `nfa` has only two simple states (used for optimizations) +static inline bool +is_trivial(const StateArray* const states, const Automata nfa) +{ + return (states->states[nfa.start].min < REREX_MATCH && + states->states[nfa.start].next1 == nfa.end); +} + +// Kleene's Star of an NFA +static Automata +star(StateArray* const states, const Automata nfa) +{ + const StateIndex end = add_state(states, match_state()); + const StateIndex start = add_state(states, split_state(nfa.start, end)); + + states->states[nfa.end] = split_state(nfa.start, end); + + return make_automata(start, end); +} + +// Zero-or-one of an NFA +static Automata +question(StateArray* const states, const Automata nfa) +{ + const StateIndex start = add_state(states, split_state(nfa.start, nfa.end)); + + return make_automata(start, nfa.end); +} + +// One-or-more of an NFA +static Automata +plus(StateArray* const states, const Automata nfa) +{ + const StateIndex end = add_state(states, match_state()); + + states->states[nfa.end] = split_state(nfa.start, end); + + return make_automata(nfa.start, end); +} + +// Concatenation of two NFAs +static Automata +concatenate(StateArray* const states, const Automata a, const Automata b) +{ + if (is_trivial(states, a)) { + // Optimization: link a's start directly to b's start (drop a's end) + states->states[a.start].next1 = b.start; + } else { + states->states[a.end] = split_state(b.start, NO_STATE); + } + + return make_automata(a.start, b.end); +} + +// Alternation (OR) of two NFAs +static Automata +alternate(StateArray* const states, const Automata a, const Automata b) +{ + const StateIndex split = add_state(states, split_state(a.start, b.start)); + + if (is_trivial(states, a)) { + // Optimization: link a's start directly to b's end (drop a's end) + states->states[a.start].next1 = b.end; + return make_automata(split, b.end); + } + + if (is_trivial(states, b)) { + // Optimization: link b's start directly to a's end (drop b's end) + states->states[b.start].next1 = a.end; + return make_automata(split, a.end); + } + + const StateIndex end = add_state(states, match_state()); + + states->states[a.end] = split_state(end, NO_STATE); + states->states[b.end] = split_state(end, NO_STATE); + + return make_automata(split, end); +} + +/* Parser input. + * + * The parser reads from a string one character at a time, though it would be + * simple to change this to read from any stream. All reading is done by three + * operations: peek, peekahead, and eat. + */ +typedef struct { + const char* const str; + size_t offset; +} Input; + +// Return the next character in the input without consuming it +static inline char +peek(Input* const input) +{ + return input->str[input->offset]; +} + +// Return the next-next character in the input without consuming any +static inline char +peekahead(Input* const input) +{ + // Unfortunately we need 2-char lookahead for the ambiguity of '-' in sets + return input->str[input->offset + 1]; +} + +// Consume and return the next character in the input +static inline char +eat(Input* const input) +{ + return input->str[input->offset++]; +} + +// Forward declaration for read_expr because it is called recursively +static RerexStatus +read_expr(Input* input, StateArray* states, Automata* out); + +// DOT ::= '.' +// OPERATOR ::= '*' | '+' | '?' +// SPECIAL ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}' +static bool +is_special(const char c) +{ + switch (c) { + case '(': + case ')': + case '*': + case '+': + case '.': + case '?': + case '[': + case ']': + case '^': + case '{': + case '|': + case '}': + return true; + default: + break; + } + + return false; +} + +// DOT ::= '.' +static RerexStatus +read_dot(Input* const input, StateArray* const states, Automata* const out) +{ + assert(peek(input) == '.'); + eat(input); + + const StateIndex end = add_state(states, match_state()); + const StateIndex start = add_state(states, range_state(cmin, cmax, end)); + + *out = make_automata(start, end); + + return REREX_SUCCESS; +} + +// ESCAPE ::= '\' SPECIAL +static RerexStatus +read_escape(Input* const input, char* const out) +{ + assert(peek(input) == '\\'); + eat(input); + + const char c = peek(input); + if (is_special(c) || c == '-') { + *out = eat(input); + return REREX_SUCCESS; + } + + return REREX_EXPECTED_SPECIAL; +} + +// CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL +static RerexStatus +read_char(Input* const input, char* const out) +{ + const char c = peek(input); + + switch (c) { + case '\0': + return REREX_UNEXPECTED_END; + case '\\': + return read_escape(input, out); + default: + break; + } + + if (is_special(c)) { + return REREX_UNEXPECTED_SPECIAL; + } + + if (c >= cmin && c <= cmax) { + *out = eat(input); + return REREX_SUCCESS; + } + + return REREX_EXPECTED_CHAR; +} + +// ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']') +static RerexStatus +read_element(Input* const input, char* const out) +{ + const char c = peek(input); + + switch (c) { + case '\0': + return REREX_UNEXPECTED_END; + case ']': + return REREX_UNEXPECTED_SPECIAL; + case '\\': + eat(input); + if (peek(input) != ']') { + return REREX_EXPECTED_RBRACKET; + } + + *out = eat(input); + return REREX_SUCCESS; + default: + break; + } + + if (c >= cmin && c <= cmax) { + *out = eat(input); + return REREX_SUCCESS; + } + + return REREX_EXPECTED_ELEMENT; +} + +// Range ::= ELEMENT | ELEMENT '-' ELEMENT +static RerexStatus +read_range(Input* const input, + StateArray* const states, + const bool negated, + Automata* const out) +{ + RerexStatus st = REREX_SUCCESS; + char min = 0; + char max = 0; + + // Read the first (or only) character + if ((st = read_element(input, &min))) { + return st; + } + + // Handle '-' which is a bit hairy because it may or may not be special + if (peek(input) == '-') { + switch (peekahead(input)) { + case ']': + // Weird case like [a-] where '-' is a character + max = min; + break; + default: + // Normal range like [a-z] (note that '[' isn't special here) + eat(input); + if ((st = read_element(input, &max))) { + return st; + } + break; + } + } else { + // Single character element + max = min; + } + + if (max < min) { + return REREX_UNORDERED_RANGE; + } + + const StateIndex end = add_state(states, match_state()); + if (negated) { + const char emin = (char)(min - 1); + const char emax = (char)(max + 1); + const StateIndex low = add_state(states, range_state(cmin, emin, end)); + const StateIndex high = add_state(states, range_state(emax, cmax, end)); + const StateIndex fork = add_state(states, split_state(low, high)); + + *out = make_automata(fork, end); + } else { + const StateIndex start = add_state(states, range_state(min, max, end)); + + *out = make_automata(start, end); + } + + return st; +} + +// Set ::= Range | Range Set +static RerexStatus +read_set(Input* const input, StateArray* const states, Automata* const out) +{ + RerexStatus st = REREX_SUCCESS; + bool negated = false; + + if (peek(input) == '^') { + eat(input); + negated = true; + } + + Automata nfa = {NO_STATE, NO_STATE}; + if ((st = read_range(input, states, negated, &nfa))) { + return st; + } + + while (peek(input) != ']') { + Automata range_nfa = {NO_STATE, NO_STATE}; + if ((st = read_range(input, states, negated, &range_nfa))) { + return st; + } + + nfa = alternate(states, nfa, range_nfa); + } + + *out = nfa; + return st; +} + +// Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']' +static RerexStatus +read_atom(Input* const input, StateArray* const states, Automata* const out) +{ + RerexStatus st = REREX_SUCCESS; + + switch (peek(input)) { + case '(': + eat(input); + if ((st = read_expr(input, states, out))) { + return st; + } + + if (peek(input) != ')') { + return REREX_EXPECTED_RPAREN; + } + + eat(input); + return st; + + case '.': + return read_dot(input, states, out); + + case '[': + eat(input); + if ((st = read_set(input, states, out))) { + return st; + } + + eat(input); + return st; + + default: + break; + } + + char c = 0; + if ((st = read_char(input, &c))) { + return st; + } + + const StateIndex end = add_state(states, match_state()); + const StateIndex start = add_state(states, range_state(c, c, end)); + + *out = make_automata(start, end); + + return st; +} + +// OPERATOR ::= '*' | '+' | '?' +// Factor ::= Atom | Atom OPERATOR +static RerexStatus +read_factor(Input* const input, StateArray* const states, Automata* const out) +{ + RerexStatus st = REREX_SUCCESS; + Automata atom_nfa = {NO_STATE, NO_STATE}; + + if (!(st = read_atom(input, states, &atom_nfa))) { + const char c = peek(input); + switch (c) { + case '*': + eat(input); + *out = star(states, atom_nfa); + break; + case '+': + eat(input); + *out = plus(states, atom_nfa); + break; + case '?': + eat(input); + *out = question(states, atom_nfa); + break; + default: + *out = atom_nfa; + break; + } + } + + return st; +} + +// Term ::= Factor | Factor Term +static RerexStatus +read_term(Input* const input, StateArray* const states, Automata* const out) +{ + RerexStatus st = REREX_SUCCESS; + Automata factor_nfa = {NO_STATE, NO_STATE}; + Automata term_nfa = {NO_STATE, NO_STATE}; + + if (!(st = read_factor(input, states, &factor_nfa))) { + switch (peek(input)) { + case '\0': + case ')': + case '|': + *out = factor_nfa; + break; + default: + if (!(st = read_term(input, states, &term_nfa))) { + *out = concatenate(states, factor_nfa, term_nfa); + } + break; + } + } + + return st; +} + +// Expr ::= Term | Term '|' Expr +static RerexStatus +read_expr(Input* const input, StateArray* const states, Automata* const out) +{ + RerexStatus st = REREX_SUCCESS; + Automata term_nfa = {NO_STATE, NO_STATE}; + Automata expr_nfa = {NO_STATE, NO_STATE}; + + if ((st = read_term(input, states, &term_nfa))) { + return st; + } + + if (peek(input) == '|') { + eat(input); + if (!(st = read_expr(input, states, &expr_nfa))) { + *out = alternate(states, term_nfa, expr_nfa); + } + } else { + *out = term_nfa; + } + + return st; +} + +/* Pattern. + * + * A pattern is simply an array of states and an index to the start state. The + * end state(s) are known because they have type REREX_MATCH. A pattern is + * immutable after construction, the matcher does not modify it. + */ +struct RerexPatternImpl { + StateArray states; + StateIndex start; +}; + +void +rerex_free_pattern(RerexPattern* const regexp) +{ + free(regexp->states.states); + free(regexp); +} + +RerexStatus +rerex_compile(const char* const pattern, + size_t* const end, + RerexPattern** const out) +{ + Input input = {pattern, 0}; + Automata nfa = {NO_STATE, NO_STATE}; + StateArray states = {NULL, 0}; + + // Add null state so that no actual state has NO_STATE as an ID + add_state(&states, split_state(NO_STATE, NO_STATE)); + + const RerexStatus st = read_expr(&input, &states, &nfa); + if (st) { + free(states.states); + } else { + *out = (RerexPattern*)malloc(sizeof(RerexPattern)); + (*out)->states = states; + (*out)->start = nfa.start; + } + + *end = input.offset; + return st; +} + +/* Matcher */ + +typedef struct { + StateIndex* indices; // Array of state indices + size_t n_indices; // Number of elements in indices +} IndexList; + +/* Matcher. + * + * The matcher tracks active states by keeping two lists of indices: one for + * the current iteration, and one for the next. A separate array, keyed by + * state index, stores the number of the last iteration the state was entered + * in. This makes it simple and fast to check if a state has already been + * entered in the current iteration, avoiding the need to search the active + * list for every entered state. + */ +struct RerexMatcherImpl { + const RerexPattern* regexp; // Pattern to match against + IndexList active[2]; // Two lists of active states + size_t* last_active; // Last iteration a state was active +}; + +RerexMatcher* +rerex_new_matcher(const RerexPattern* const regexp) +{ + const size_t n_states = regexp->states.n_states; + RerexMatcher* const m = (RerexMatcher*)calloc(1, sizeof(RerexMatcher)); + + m->regexp = regexp; + m->active[0].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex)); + m->active[1].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex)); + m->last_active = (size_t*)calloc(n_states, sizeof(size_t)); + + return m; +} + +void +rerex_free_matcher(RerexMatcher* const matcher) +{ + free(matcher->last_active); + free(matcher->active[1].indices); + free(matcher->active[0].indices); + free(matcher); +} + +// Add `s` and any epsilon successors to the active list +static void +enter_state(RerexMatcher* const matcher, + const size_t step, + IndexList* const list, + const StateIndex s) +{ + const StateArray* const states = &matcher->regexp->states; + + if (s && matcher->last_active[s] != step) { + matcher->last_active[s] = step; + + const State* const state = &states->states[s]; + if (state->min == REREX_SPLIT) { + enter_state(matcher, step, list, state->next1); + enter_state(matcher, step, list, state->next2); + } else { + list->indices[list->n_indices++] = s; + } + } +} + +// Run `matcher` and return true if `string` matches the expression +bool +rerex_match(RerexMatcher* const matcher, const char* const string) +{ + const StateArray* const states = &matcher->regexp->states; + + // Reset matcher to a consistent initial state + matcher->active[0].n_indices = 0; + matcher->active[1].n_indices = 0; + for (size_t i = 0; i < states->n_states; ++i) { + matcher->last_active[i] = SIZE_MAX; + } + + // Enter start state + enter_state(matcher, 0, &matcher->active[0], matcher->regexp->start); + + // Tick the matcher for every input character + bool phase = 0; + for (size_t i = 0; string[i]; ++i) { + const char c = string[i]; + IndexList* const list = &matcher->active[phase]; + IndexList* const next_list = &matcher->active[!phase]; + + // Add successor states to the next iteration's list + next_list->n_indices = 0; + for (size_t j = 0; j < list->n_indices; ++j) { + const State* const state = &states->states[list->indices[j]]; + if (state->min <= c && c <= state->max) { + enter_state(matcher, i + 1, next_list, state->next1); + } + } + + // Flip phase to swap active lists + phase = !phase; + } + + // Check if match state is entered in the end + IndexList* const active = &matcher->active[phase]; + for (size_t i = 0; i < active->n_indices; ++i) { + if (states->states[active->indices[i]].min == REREX_MATCH) { + return true; + } + } + + return false; +} diff --git a/subprojects/rerex/test/test_match.c b/subprojects/rerex/test/test_match.c new file mode 100644 index 00000000..fa270e83 --- /dev/null +++ b/subprojects/rerex/test/test_match.c @@ -0,0 +1,238 @@ +/* + Copyright 2020 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#undef NDEBUG + +#include "rerex/rerex.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +typedef struct { + uintptr_t match; ///< Boolean, true if text should match + const char* pattern; ///< Regular expression + const char* text; ///< Text to match against pattern +} MatchTestCase; + +static const MatchTestCase match_tests[] = { + {1, "\\(", "("}, + {1, "\\)", ")"}, + {1, "\\*", "*"}, + {1, "\\+", "+"}, + {1, "\\-", "-"}, + {1, "\\.", "."}, + {1, "\\?", "?"}, + {1, "\\[", "["}, + {1, "\\]", "]"}, + {1, "\\^", "^"}, + {1, "\\|", "|"}, + {0, ".", ""}, + {1, ".", "a"}, + {0, ".", "aa"}, + {0, "..", ""}, + {0, "..", "a"}, + {1, "..", "aa"}, + {1, ".*", ""}, + {1, ".*", "a"}, + {1, ".*", "aa"}, + {0, ".+", ""}, + {1, ".+", "a"}, + {1, ".+", "aa"}, + {1, ".?", ""}, + {1, ".?", "a"}, + {0, ".?", "aa"}, + {1, "a*", ""}, + {1, "a*", "a"}, + {1, "a*", "aa"}, + {0, "a*", "b"}, + {0, "a+", ""}, + {1, "a+", "a"}, + {1, "a+", "aa"}, + {0, "a+", "b"}, + {1, "a?", ""}, + {1, "a?", "a"}, + {0, "a?", "aa"}, + {0, "a?", "b"}, + {0, "[.]", "a"}, + {1, "[.]", "."}, + {0, "[\\]]", "a"}, + {1, "[\\]]", "]"}, + {0, "[b]", "a"}, + {1, "[b]", "b"}, + {0, "[b]", "c"}, + {0, "[bc]", "a"}, + {1, "[bc]", "b"}, + {1, "[bc]", "c"}, + {0, "[bc]", "d"}, + {0, "[bcd]", "a"}, + {1, "[bcd]", "b"}, + {1, "[bcd]", "c"}, + {1, "[bcd]", "d"}, + {0, "[bcd]", "e"}, + {0, "[b-d]", "a"}, + {1, "[b-d]", "b"}, + {1, "[b-d]", "d"}, + {0, "[b-d]", "e"}, + {1, "[^b-d]", "a"}, + {0, "[^b-d]", "b"}, + {0, "[^b-d]", "d"}, + {1, "[^b-d]", "e"}, + {0, "[^ -/]", "\t"}, + {1, "[^ -/]", "0"}, + {1, "[^{-~]", "z"}, + {0, "[^{-~]", "~"}, + {0, "[A-Za-z]", "5"}, + {1, "[A-Za-z]", "m"}, + {1, "[A-Za-z]", "M"}, + {0, "[A-Za-z]", "~"}, + {0, "[+-]", "*"}, + {1, "[+-]", "+"}, + {0, "[+-]", ","}, + {1, "[+-]", "-"}, + {0, "[+-]", "."}, + {1, "[b-d]*", ""}, + {0, "[b-d]*", "a"}, + {1, "[b-d]*", "b"}, + {1, "[b-d]*", "c"}, + {1, "[b-d]*", "cc"}, + {1, "[b-d]*", "d"}, + {0, "[b-d]*", "e"}, + {0, "[b-d]+", ""}, + {0, "[b-d]+", "a"}, + {1, "[b-d]+", "b"}, + {1, "[b-d]+", "c"}, + {1, "[b-d]+", "cc"}, + {1, "[b-d]+", "d"}, + {0, "[b-d]+", "e"}, + {1, "[b-d]?", ""}, + {0, "[b-d]?", "a"}, + {1, "[b-d]?", "b"}, + {1, "[b-d]?", "c"}, + {0, "[b-d]?", "cc"}, + {1, "[b-d]?", "d"}, + {0, "[b-d]?", "e"}, + {1, "h(e|a)llo", "hello"}, + {1, "h(e|a)llo", "hallo"}, + {1, "h(e|a)+llo", "haello"}, + {1, "h(e|a)*llo", "hllo"}, + {1, "h(e|a)?llo", "hllo"}, + {1, "h(e|a)?llo", "hello"}, + {1, "h(e|a)*llo*", "haeeeallooo"}, + {1, "(ab|a)(bc|c)", "abc"}, + {0, "(ab|a)(bc|c)", "acb"}, + {1, "(ab)c|abc", "abc"}, + {0, "(ab)c|abc", "ab"}, + {1, "(a*)(b?)(b+)", "aaabbbb"}, + {0, "(a*)(b?)(b+)", "aaaa"}, + {1, "((a|a)|a)", "a"}, + {0, "((a|a)|a)", "aa"}, + {1, "(a*)(a|aa)", "aaaa"}, + {0, "(a*)(a|aa)", "b"}, + {1, "a(b)|c(d)|a(e)f", "aef"}, + {0, "a(b)|c(d)|a(e)f", "adf"}, + {1, "(a|b)c|a(b|c)", "ac"}, + {0, "(a|b)c|a(b|c)", "acc"}, + {1, "(a|b)c|a(b|c)", "ab"}, + {0, "(a|b)c|a(b|c)", "acb"}, + {1, "(a|b)*c|(a|ab)*c", "abc"}, + {0, "(a|b)*c|(a|ab)*c", "bbbcabbbc"}, + {1, "a?(ab|ba)ab", "abab"}, + {0, "a?(ab|ba)ab", "aaabab"}, + {1, "(aa|aaa)*|(a|aaaaa)", "aa"}, + {1, "(a)(b)(c)", "abc"}, + {1, "((((((((((x))))))))))", "x"}, + {1, "((((((((((x))))))))))*", "xx"}, + {1, "a?(ab|ba)*", "ababababababababababababababababa"}, + {1, "a*a*a*a*a*b", "aaaaaaaab"}, + {1, "abc", "abc"}, + {1, "ab*c", "abc"}, + {1, "ab*bc", "abbc"}, + {1, "ab*bc", "abbbbc"}, + {1, "ab+bc", "abbc"}, + {1, "ab+bc", "abbbbc"}, + {1, "ab?bc", "abbc"}, + {1, "ab?bc", "abc"}, + {1, "ab|cd", "ab"}, + {1, "(a)b(c)", "abc"}, + {1, "a*", "aaa"}, + {1, "(a+|b)*", "ab"}, + {1, "(a+|b)+", "ab"}, + {1, "a|b|c|d|e", "e"}, + {1, "(a|b|c|d|e)f", "ef"}, + {1, "abcd*efg", "abcdefg"}, + {1, "(ab|ab*)bc", "abc"}, + {1, "(ab|a)b*c", "abc"}, + {1, "((a)(b)c)(d)", "abcd"}, + {1, "(a|ab)(c|bcd)", "abcd"}, + {1, "(a|ab)(bcd|c)", "abcd"}, + {1, "(ab|a)(c|bcd)", "abcd"}, + {1, "(ab|a)(bcd|c)", "abcd"}, + {1, "((a|ab)(c|bcd))(d*)", "abcd"}, + {1, "((a|ab)(bcd|c))(d*)", "abcd"}, + {1, "((ab|a)(c|bcd))(d*)", "abcd"}, + {1, "((ab|a)(bcd|c))(d*)", "abcd"}, + {1, "(a|ab)((c|bcd)(d*))", "abcd"}, + {1, "(a|ab)((bcd|c)(d*))", "abcd"}, + {1, "(ab|a)((c|bcd)(d*))", "abcd"}, + {1, "(ab|a)((bcd|c)(d*))", "abcd"}, + {1, "(a*)(b|abc)", "abc"}, + {1, "(a*)(abc|b)", "abc"}, + {1, "((a*)(b|abc))(c*)", "abc"}, + {1, "((a*)(abc|b))(c*)", "abc"}, + {1, "(a*)((b|abc))(c*)", "abc"}, + {1, "(a*)((abc|b)(c*))", "abc"}, + {1, "(a*)(b|abc)", "abc"}, + {1, "(a*)(abc|b)", "abc"}, + {1, "((a*)(b|abc))(c*)", "abc"}, + {1, "((a*)(abc|b))(c*)", "abc"}, + {1, "(a*)((b|abc)(c*))", "abc"}, + {1, "(a*)((abc|b)(c*))", "abc"}, + {1, "(a|ab)", "ab"}, + {1, "(ab|a)", "ab"}, + {1, "(a|ab)(b*)", "ab"}, + {1, "(ab|a)(b*)", "ab"}, + {1, "(a|b)*c|(a|ab)*c", "abc"}, +}; + +int +main(void) +{ + const size_t n_tests = sizeof(match_tests) / sizeof(*match_tests); + + for (size_t i = 0; i < n_tests; ++i) { + const char* const regexp = match_tests[i].pattern; + const char* const text = match_tests[i].text; + const bool should_match = match_tests[i].match; + + RerexPattern* pattern = NULL; + size_t end = 0; + const RerexStatus st = rerex_compile(regexp, &end, &pattern); + + assert(!st); + + RerexMatcher* const matcher = rerex_new_matcher(pattern); + const bool matches = rerex_match(matcher, text); + + assert(matches == should_match); + + rerex_free_matcher(matcher); + rerex_free_pattern(pattern); + } + + return 0; +} diff --git a/subprojects/rerex/test/test_syntax.c b/subprojects/rerex/test/test_syntax.c new file mode 100644 index 00000000..9cff8855 --- /dev/null +++ b/subprojects/rerex/test/test_syntax.c @@ -0,0 +1,99 @@ +/* + Copyright 2020 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +// Tests that regex syntax errors are reported correctly + +#undef NDEBUG + +#include "rerex/rerex.h" + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +typedef struct { + RerexStatus status; + unsigned offset; + const char* pattern; +} BadSyntaxTestCase; + +static const BadSyntaxTestCase syntax_tests[] = { + {REREX_EXPECTED_CHAR, 1, "a\b"}, + {REREX_EXPECTED_CHAR, 1, "a\x7F"}, + {REREX_EXPECTED_ELEMENT, 1, "[\b]"}, + {REREX_EXPECTED_ELEMENT, 1, "[\x7F]"}, + {REREX_EXPECTED_ELEMENT, 2, "[a\b]"}, + {REREX_EXPECTED_ELEMENT, 2, "[a\x7F]"}, + {REREX_EXPECTED_ELEMENT, 3, "[a-\b]"}, + {REREX_EXPECTED_ELEMENT, 3, "[a-\x7F]"}, + {REREX_EXPECTED_RBRACKET, 2, "[\\n]"}, + {REREX_EXPECTED_RPAREN, 2, "(a"}, + {REREX_EXPECTED_SPECIAL, 1, "\\n"}, + {REREX_UNEXPECTED_END, 1, "("}, + {REREX_UNEXPECTED_END, 1, "["}, + {REREX_UNEXPECTED_END, 2, "[a"}, + {REREX_UNEXPECTED_END, 3, "(a|"}, + {REREX_UNEXPECTED_END, 3, "[a-"}, + {REREX_UNEXPECTED_END, 4, "[a-z"}, + {REREX_UNEXPECTED_END, 4, "[a-z"}, + {REREX_UNEXPECTED_SPECIAL, 0, "{"}, + {REREX_UNEXPECTED_SPECIAL, 0, "}"}, + {REREX_UNEXPECTED_SPECIAL, 0, "?"}, + {REREX_UNEXPECTED_SPECIAL, 1, "[]]"}, + {REREX_UNEXPECTED_SPECIAL, 2, "a|?"}, + {REREX_UNEXPECTED_SPECIAL, 3, "(a|?)"}, + {REREX_UNEXPECTED_SPECIAL, 3, "[[]]"}, + {REREX_UNEXPECTED_SPECIAL, 3, "[a]]"}, + {REREX_UNEXPECTED_SPECIAL, 4, "[A-]]"}, + {REREX_UNEXPECTED_SPECIAL, 4, "[a[]]"}, + {REREX_UNEXPECTED_SPECIAL, 5, "[A-[]]"}, + {REREX_UNORDERED_RANGE, 4, "[z-a]"}, +}; + +static void +test_status(void) +{ + assert(!strcmp(rerex_strerror((RerexStatus)INT32_MAX), "Unknown error")); +} + +static void +test_syntax(void) +{ + const size_t n_tests = sizeof(syntax_tests) / sizeof(*syntax_tests); + + for (size_t i = 0; i < n_tests; ++i) { + const char* const regexp = syntax_tests[i].pattern; + const size_t offset = syntax_tests[i].offset; + const RerexStatus status = syntax_tests[i].status; + + RerexPattern* pattern = NULL; + size_t end = 0; + const RerexStatus st = rerex_compile(regexp, &end, &pattern); + + assert(st == status); + assert(!pattern); + assert(strcmp(rerex_strerror(st), rerex_strerror(REREX_SUCCESS))); + assert(end == offset); + } +} + +int +main(void) +{ + test_status(); + test_syntax(); + return 0; +} diff --git a/subprojects/rerex/test/test_xsd.c b/subprojects/rerex/test/test_xsd.c new file mode 100644 index 00000000..81104a99 --- /dev/null +++ b/subprojects/rerex/test/test_xsd.c @@ -0,0 +1,522 @@ +/* + Copyright 2020 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +/* + Tests more realistic patterns and using a matcher multiple times. + + The patterns here were written by myself and may have bugs. XSD datatypes + unfortunately don't have canonical patterns, though sometimes they + infuriatingly say that they are constrained by a pattern... which is not + given. XML is terrible, don't use it, but the XSD datatypes are more + generally useful. Some patterns here slightly fuzzy (for example, with + dates), so matching does not necessary mean the value itself is valid. + + These don't have much to do with Rerex itself, but it's something I needed it + for, and having some more realistic tests is nice. These also test reusing a + matcher, unlike the basic match tests. +*/ + +#undef NDEBUG + +#include "rerex/rerex.h" + +#include <assert.h> +#include <stddef.h> + +static void +test_pattern(const char* const regexp, + const char* const matching[], + const char* const nonmatching[]) +{ + RerexPattern* pattern = NULL; + size_t end = 0; + const RerexStatus st = rerex_compile(regexp, &end, &pattern); + + assert(!st); + + RerexMatcher* const matcher = rerex_new_matcher(pattern); + + for (const char* const* m = matching; *m; ++m) { + assert(rerex_match(matcher, *m)); + } + + for (const char* const* n = nonmatching; *n; ++n) { + assert(!rerex_match(matcher, *n)); + } + + rerex_free_matcher(matcher); + rerex_free_pattern(pattern); +} + +static void +test_base64Binary(void) +{ + static const char* const regexp = + "(([A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *)*" // + "(([A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/])|" // + "([A-Za-z0-9+/] *[A-Za-z0-9+/] *[AEIMQUYcgkosw048] *=)|" // + "([A-Za-z0-9+/] *[AQgw] *= *=)))?"; + + static const char* const good[] = { + "0FB8", + "0fb8", + "0 FB8 0F+9", + "0F+40A8=", + "0F+40A==", + "", + NULL, + }; + + static const char* const bad[] = { + " 0FB8", + "0FB8 ", + " 0FB8 ", + "FB8", + "==0F", + "0F+40A9=", + "0F+40B==", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_boolean(void) +{ + static const char* const regexp = "(true|false|0|1)"; + static const char* const good[] = {"true", "false", "0", "1", NULL}; + static const char* const bad[] = {"TRUE", "T", "", NULL}; + + test_pattern(regexp, good, bad); +} + +static void +test_date(void) +{ + static const char* const regexp = // + "-?[0-9][0-9][0-9][0-9][0-9]*" // + "-(0[1-9]|1[0-2])" // + "-(0[1-9]|[12][0-9]|3[01])" // + "(Z|[-+][0-2][0-9]:[0-5][0-9])?"; + + static const char* const good[] = { + "2004-04-12", + "-0045-01-01", + "12004-04-12", + "2004-04-12-05:00", + "2004-04-12Z", + "2001-10-26", + "2001-10-26+02:00", + "2001-10-26Z", + "2001-10-26+00:00", + "-2001-10-26", + "-20000-04-01", + NULL, + }; + + static const char* const bad[] = { + "99-04-12", + "2004-4-2", + "2004/04/02", + "04-12-2004", + // "2004-04-31", // Not quite that clever... + "2001-10", + "2001-10-32", + "2001-13-26+02:00", + "01-10-26", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_decimal(void) +{ + static const char* const regexp = + "[+-]?(([0-9]+[.]?[0-9]*)|([0-9]*[.]?[0-9]+))"; + + static const char* const good[] = { + "3.0", + "-3.0", + "+3.5", + "3", + ".3", + "3.", + "0", + "-.3", + "0003.0", + "3.0000", + "-456", + NULL, + }; + + static const char* const bad[] = {"3,5", ".", "", NULL}; + + test_pattern(regexp, good, bad); +} + +// Tests both xsd:float and xsd:double, which are lexically identical +static void +test_float(void) +{ + static const char* const regexp = "-?INF|NaN|[+-]?(([0-9]+[.]?[0-9]*)|([0-" + "9]*[.]?[0-9]+))([eE][-+]?[0-9]+)?"; + + static const char* const good[] = { + "-3E2", + "4268.22752E11", + "+24.3e-3", + "12", + "+3.5", + "INF", + "-INF", + "-0", + "NaN", + NULL, + }; + + static const char* const bad[] = { + "-3E2.4", + "12E", + "+INF", + "NAN", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_gDay(void) +{ + static const char* const regexp = + "---(0[1-9]|[12][0-9]|3[01])(Z|[-+][0-2][0-9]:[0-5][0-9])?"; + + static const char* const good[] = { + "---02", + "---01", + "---01Z", + "---01+02:00", + "---01-04:00", + "---15", + "---31", + NULL, + }; + + static const char* const bad[] = { + "02", + "---2", + "---32", + "--30-", + "---35", + "---5", + "15", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_gMonth(void) +{ + static const char* const regexp = + "--(0[1-9]|1[0-2])(Z|[-+][0-2][0-9]:[0-5][0-9])?"; + + static const char* const good[] = { + "--04", + "--04-05:00", + "--05", + "--11Z", + "--11+02:00", + "--11-04:00", + "--02", + NULL, + }; + + static const char* const bad[] = { + "2004-04", + "04", + "--4", + "--13", + "-01-", + "--13", + "--1", + "01", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_gMonthDay(void) +{ + static const char* const regexp = // + "--(0[1-9]|1[0-2])" // + "-(0[1-9]|[12][0-9]|3[01])" // + "(Z|[-+][0-2][0-9]:[0-5][0-9])?"; + + static const char* const good[] = { + "--04-12", + "--04-12Z", + "--05-01", + "--11-01Z", + "--11-01+02:00", + "--11-01-04:00", + "--11-15", + "--02-29", + NULL, + }; + + static const char* const bad[] = { + "04-12", + // "--04-31", Not quite that clever... + "--4-6", + "-01-30-", + "--01-35", + "--1-5", + "01-15", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_gYear(void) +{ + static const char* const regexp = + "-?[0-9][0-9][0-9][0-9][0-9]*(Z|[-+][0-2][0-9]:[0-5][0-9])?"; + + static const char* const good[] = { + "2004", + "2004-05:00", + "12004", + "0922", + "-0045", + "2001+02:00", + "2001Z", + "2001+00:00", + "-2001", + "-20000", + NULL, + }; + + static const char* const bad[] = { + "99", + "922", + "01", + "2001-12", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_gYearMonth(void) +{ + static const char* const regexp = // + "-?[0-9][0-9][0-9][0-9][0-9]*" // + "-(0[1-9]|1[0-2])" // + "(Z|[-+][0-2][0-9]:[0-5][0-9])?"; + + static const char* const good[] = { + "2001-10", + "2001-10+02:00", + "2001-10Z", + "2001-10+00:00", + "-2001-10", + "-20000-04", + "2004-04-05:00", + NULL, + }; + + static const char* const bad[] = { + "2001", + "2001-13", + "2001-13-26+02:00", + "01-10", + "99-04", + "2004", + "2004-4", + "2004-13", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_hexBinary(void) +{ + static const char* const regexp = "([0-9A-Fa-f][0-9A-Fa-f])*"; + static const char* const good[] = {"0FB8", "0fb8", "", NULL}; + static const char* const bad[] = {"F", "FB8", NULL}; + + test_pattern(regexp, good, bad); +} + +static void +test_integer(void) +{ + static const char* const regexp = "[-+]?[0-9]+"; + static const char* const good[] = {"122", "00122", "0", "-3", "+3", NULL}; + static const char* const bad[] = {"3.", "3.0", "A", "", NULL}; + + test_pattern(regexp, good, bad); +} + +static void +test_language(void) +{ + static const char* const regexp = // + "[a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?" // + "[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?" // + "(-[a-zA-Z0-9][a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?" // + "[a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?)*"; + + static const char* const good[] = { + "en", + "en-GB", + "en-US", + "fr", + "fr-FR", + "fr-CA", + "de", + "zh", + "ja", + "ko", + "i-navajo", + "x-Newspeak", + "any-value-with-short-parts", + NULL, + }; + + static const char* const bad[] = { + "longerThan8", + "even-longerThan8", + "longererThan8-first", + "last-longererThan8", + "middle-longererThan8-CA", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +static void +test_nonNegativeInteger(void) +{ + static const char* const regexp = "[+]?[0-9]+"; + static const char* const good[] = {"+3", "122", "0", "0012", "+123", NULL}; + static const char* const bad[] = {"-3", "3.0", "", NULL}; + + test_pattern(regexp, good, bad); +} + +static void +test_nonPositiveInteger(void) +{ + static const char* const regexp = "(0|-[0-9]+)"; + static const char* const good[] = {"-3", "-0", "-00122", NULL}; + static const char* const bad[] = {"122", "+3", "3.", "3.0", "", NULL}; + + test_pattern(regexp, good, bad); +} + +static void +test_positiveInteger(void) +{ + static const char* const regexp = "[+]?[0-9]*[1-9]+[0-9]*"; + static const char* const good[] = {"122", "+3", "00122", NULL}; + static const char* const bad[] = {"0", "-3", "3.0", "", NULL}; + + test_pattern(regexp, good, bad); +} + +static void +test_time(void) +{ + static const char* const regexp = + "(([0-1][0-9])|(2[0-4])):[0-5][0-9]:[0-5][0-9](.[0-9]+)?(Z|[-+][0-2][0-" + "9]:[0-5][0-9])?"; + + static const char* const good[] = { + "13:20:00", + "13:20:30.5555", + "13:20:00-05:00", + "13:20:00Z", + "00:00:00", + "24:00:00", + "21:32:52", + "21:32:52+02:00", + "19:32:52Z", + "19:32:52+00:00", + "21:32:52.12679", + NULL, + }; + + static const char* const bad[] = { + "5:20:00", + "13:20", + "13:20.5:00", + "13:65:00", + "21:32", + "25:25:10", + "-10:00:00", + "1:20:10", + "", + NULL, + }; + + test_pattern(regexp, good, bad); +} + +int +main(void) +{ + test_base64Binary(); + test_boolean(); + test_date(); + test_decimal(); + test_float(); + test_gDay(); + test_gMonth(); + test_gMonthDay(); + test_gYear(); + test_gYearMonth(); + test_hexBinary(); + test_integer(); + test_language(); + test_nonNegativeInteger(); + test_nonPositiveInteger(); + test_positiveInteger(); + test_time(); + + return 0; +} |