aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--subprojects/rerex/.clang-format21
-rw-r--r--subprojects/rerex/.clang-tidy6
-rw-r--r--subprojects/rerex/.gitignore1
-rw-r--r--subprojects/rerex/.gitlab-ci.yml165
-rw-r--r--subprojects/rerex/CONTRIBUTING.md32
-rw-r--r--subprojects/rerex/COPYING13
-rw-r--r--subprojects/rerex/NEWS5
-rw-r--r--subprojects/rerex/README.md158
-rw-r--r--subprojects/rerex/include/rerex/rerex.h112
-rw-r--r--subprojects/rerex/meson.build121
-rw-r--r--subprojects/rerex/meson/meson.build204
-rw-r--r--subprojects/rerex/meson_options.txt5
-rw-r--r--subprojects/rerex/src/.clang-tidy8
-rw-r--r--subprojects/rerex/src/rerex.c765
-rw-r--r--subprojects/rerex/test/test_match.c238
-rw-r--r--subprojects/rerex/test/test_syntax.c99
-rw-r--r--subprojects/rerex/test/test_xsd.c522
17 files changed, 2475 insertions, 0 deletions
diff --git a/subprojects/rerex/.clang-format b/subprojects/rerex/.clang-format
new file mode 100644
index 00000000..fcdbab81
--- /dev/null
+++ b/subprojects/rerex/.clang-format
@@ -0,0 +1,21 @@
+---
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: true
+AlignEscapedNewlinesLeft: true
+BasedOnStyle: Mozilla
+BraceWrapping:
+ AfterEnum: false
+ AfterExternBlock: false
+ AfterFunction: true
+ AfterStruct: false
+ AfterUnion: false
+BreakBeforeBraces: Custom
+Cpp11BracedListStyle: true
+IndentCaseLabels: false
+IndentPPDirectives: AfterHash
+KeepEmptyLinesAtTheStartOfBlocks: false
+SpacesInContainerLiterals: false
+StatementMacros:
+ - REREX_API
+ - REREX_CONST_API
+...
diff --git a/subprojects/rerex/.clang-tidy b/subprojects/rerex/.clang-tidy
new file mode 100644
index 00000000..3a0d4d70
--- /dev/null
+++ b/subprojects/rerex/.clang-tidy
@@ -0,0 +1,6 @@
+Checks: >
+ *,
+ -llvmlibc*,
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+FormatStyle: file
diff --git a/subprojects/rerex/.gitignore b/subprojects/rerex/.gitignore
new file mode 100644
index 00000000..84c048a7
--- /dev/null
+++ b/subprojects/rerex/.gitignore
@@ -0,0 +1 @@
+/build/
diff --git a/subprojects/rerex/.gitlab-ci.yml b/subprojects/rerex/.gitlab-ci.yml
new file mode 100644
index 00000000..5e7cb7bb
--- /dev/null
+++ b/subprojects/rerex/.gitlab-ci.yml
@@ -0,0 +1,165 @@
+stages:
+ - build
+ - deploy
+
+.build_template: &build_definition
+ stage: build
+
+
+arm32_dbg:
+ <<: *build_definition
+ image: lv2plugin/debian-arm32
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/arm-linux-gnueabihf.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+arm32_rel:
+ <<: *build_definition
+ image: lv2plugin/debian-arm32
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/arm-linux-gnueabihf.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+arm64_dbg:
+ <<: *build_definition
+ image: lv2plugin/debian-arm64
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/aarch64-linux-gnu.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+arm64_rel:
+ <<: *build_definition
+ image: lv2plugin/debian-arm64
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/aarch64-linux-gnu.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+x32_dbg:
+ <<: *build_definition
+ image: lv2plugin/debian-x32
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/i686-linux-gnu.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true
+ - ninja -C build test
+
+x32_rel:
+ <<: *build_definition
+ image: lv2plugin/debian-x32
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/i686-linux-gnu.ini -Dbuildtype=release -Dstrict=true -Dwerror=true
+ - ninja -C build test
+
+
+x64_dbg:
+ <<: *build_definition
+ image: lv2plugin/debian-x64
+ script:
+ - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true -Db_coverage=true
+ - ninja -C build test
+ - ninja -C build coverage-html
+ artifacts:
+ paths:
+ - build/meson-logs/coveragereport
+
+x64_rel:
+ <<: *build_definition
+ image: lv2plugin/debian-x64
+ script:
+ - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+x64_static:
+ <<: *build_definition
+ image: lv2plugin/debian-x64
+ script:
+ - meson . build -Ddefault_library=static -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+x64_sanitize:
+ <<: *build_definition
+ image: lv2plugin/debian-x64-clang
+ script:
+ - meson . build -Db_lundef=false -Dbuildtype=plain -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+ variables:
+ CC: "clang"
+ CFLAGS: "-fno-sanitize-recover=all -fsanitize=address -fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=unsigned-integer-overflow -fsanitize=implicit-conversion -fsanitize=local-bounds -fsanitize=nullability"
+ LDFLAGS: "-fno-sanitize-recover=all -fsanitize=address -fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=unsigned-integer-overflow -fsanitize=implicit-conversion -fsanitize=local-bounds -fsanitize=nullability"
+
+
+mingw32_dbg:
+ <<: *build_definition
+ image: lv2plugin/debian-mingw32
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/i686-w64-mingw32.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+mingw32_rel:
+ <<: *build_definition
+ image: lv2plugin/debian-mingw32
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/i686-w64-mingw32.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+mingw64_dbg:
+ <<: *build_definition
+ image: lv2plugin/debian-mingw64
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/x86_64-w64-mingw32.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+mingw64_rel:
+ <<: *build_definition
+ image: lv2plugin/debian-mingw64
+ script:
+ - meson . build --cross-file=/usr/share/meson/cross/x86_64-w64-mingw32.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+mac_dbg:
+ <<: *build_definition
+ tags: [macos]
+ script:
+ - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+mac_rel:
+ <<: *build_definition
+ tags: [macos]
+ script:
+ - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+win_dbg:
+ <<: *build_definition
+ tags: [windows,meson]
+ script:
+ - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+win_rel:
+ <<: *build_definition
+ tags: [windows,meson]
+ script:
+ - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+ - ninja -C build test
+
+
+pages:
+ stage: deploy
+ script:
+ - mkdir -p .public
+ - mv build/meson-logs/coveragereport/ .public/coverage
+ - mv .public public
+ dependencies:
+ - x64_dbg
+ artifacts:
+ paths:
+ - public
+ only:
+ - master
diff --git a/subprojects/rerex/CONTRIBUTING.md b/subprojects/rerex/CONTRIBUTING.md
new file mode 100644
index 00000000..c942308a
--- /dev/null
+++ b/subprojects/rerex/CONTRIBUTING.md
@@ -0,0 +1,32 @@
+Contributing to Rerex
+=====================
+
+Feel free to tinker with Rerex, I tried to make the code as easy to follow as I
+could while preserving the minimalist spirit of it. Implementing character
+sets and escapes and so on have made things a little bit hairier than a
+textbook example, but hopefully it is still easy to digest.
+
+If you want to make a contribution, feel free to send patches, but note that I
+would like to preserve the cleanliness avoid bloating things too much, so I may
+not accept patches for features that are too complicated. Bug fixes or other
+improvements to the code are, of course, most welcome.
+
+That said, this is, of course, free software, so you're free to take it and do
+as you please. I'd be happy to chat about it, these are just guidelines for
+what I feel would be appropriate for inclusion in the project.
+
+Happy hacking.
+
+Tooling
+-------
+
+All of the code is machine formatted, the style is defined by the included
+`clang-format` configuration. You can use `ninja clang-format` to format
+everything.
+
+Configuring with `-Dstrict=true` will enable the ultra-strict warnings that are
+run on CI (nearly all of them). Configuring with `-Dwerror=true` will make
+them errors.
+
+A `clang-tidy` configuration is included for more intensive static checking,
+you can run all the checks with `ninja clang-tidy`.
diff --git a/subprojects/rerex/COPYING b/subprojects/rerex/COPYING
new file mode 100644
index 00000000..0ba77822
--- /dev/null
+++ b/subprojects/rerex/COPYING
@@ -0,0 +1,13 @@
+Copyright 2020 David Robillard <d@drobilla.net>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. \ No newline at end of file
diff --git a/subprojects/rerex/NEWS b/subprojects/rerex/NEWS
new file mode 100644
index 00000000..41f780a0
--- /dev/null
+++ b/subprojects/rerex/NEWS
@@ -0,0 +1,5 @@
+rerex (0.0.1) unstable;
+
+ * Initial release
+
+ -- David Robillard <d@drobilla.net> Wed, 09 Dec 2020 18:01:50 +0000
diff --git a/subprojects/rerex/README.md b/subprojects/rerex/README.md
new file mode 100644
index 00000000..a2571cee
--- /dev/null
+++ b/subprojects/rerex/README.md
@@ -0,0 +1,158 @@
+Rerex
+=====
+
+Rerex (REally REgular eXpressions) is a simple and efficient NFA-based regular
+expression implementation in C99.
+
+Rerex parses a regular expression into an NFA, then simulates execution using
+Thompson's algorithm, so matching is linear in the size of the input. For
+details on the theory behind this approach, see [Programming Techniques:
+Regular expression search algorithm], the classic paper by Ken Thompson.
+[Regular Expression Matching Can Be Simple And Fast] is also a good read, which
+compares this approach with those used in most modern regular expression
+implementations. The matching algorithm in Rerex is effectively the same as
+the one described there, though with a slightly different implementation.
+Parsing and construction are completely different, using a recursive-descent
+parser which constructs the automata on the fly.
+
+This is a minimalist implementation with a limited feature set which is mainly
+suitable for basic applications like checking markup or programming language
+syntax. It does, however, strive to be clear and high-quality code which is a
+bit more fully-featured than a simple example, and suitable for real-world use.
+For example, character set expressions are supported, and meaningful errors are
+reported with the offset of the problematic character.
+
+The generated automata are not always optimal, though there are a few simple
+optimizations that avoid generating redundant states. Though there is room for
+improvement there (at the cost of additional complexity), matching performs
+relatively well.
+
+Implementation
+--------------
+
+The implementation is clean and warning-free C99, tested and known to work with
+at least clang, GCC, and MSVC on POSIX, MacOS, and Windows. It weighs in at
+under 600 [SLOC][], which can be compiled to about 5k on x86.
+
+Additionally, a simple test suite is included which covers 100% of the code.
+It is run continuously on all of the above mentioned platforms on x64, as well
+as in Linux on 32 and 64 bit ARM via emulation.
+
+Supported Grammar
+-----------------
+
+Rerex supports a basic subset of classic regular expression syntax that is
+supported by almost all implementations. The grammar is written here in [XML
+EBNF notation][] with terminals in uppercase:
+
+ DOT ::= '.'
+ OPERATOR ::= '*' | '+' | '?'
+ SPECIAL ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}'
+ ESCAPABLE ::= SPECIAL | '-'
+ ESCAPE ::= '\' SPECIAL
+ CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL
+ ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']')
+ Range ::= ELEMENT | ELEMENT '-' ELEMENT
+ Set ::= Range | Range Set
+ Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']'
+ Factor ::= Atom | Atom OPERATOR
+ Term ::= Factor | Factor Term
+ Expr ::= Term | Term '|' Expr
+
+Notable Limitations
+-------------------
+
+Rerex is not meant to replace fully-featured regular expression libraries, so
+the feature set is somewhat limited. The most glaring omissions include:
+
+ - Only supports printable ASCII input
+ - Only supports anchored matching (no back references or group extraction)
+ - Only reads from a null-terminated string (not, for example, files)
+ - No support for counted replication with `{}`
+
+Should I Use This?
+------------------
+
+If you just want to understand how regular expressions work, and have some
+clean code that's easy to understand and tinker with, sure.
+
+If you need a minimal regular expression matching implementation in portable C
+for basic ASCII tasks, maybe.
+
+If you need a fully-featured regular expression implementation for
+international text with group extraction and other more advanced features,
+probably not.
+
+In any case, if you do, letting me know or linking to this page would be
+appreciated.
+
+Dependencies
+------------
+
+None, except the C standard library.
+
+Building
+--------
+
+Rerex consists only of a single header and configuration-free source file, so
+building it with any compiler manually should be straightforward. It should,
+in theory, work just about anywhere.
+
+A [Meson][] build definition is included which can be used to do a proper
+system installation with a `pkg-config` file, generate IDE projects, run the
+tests, and so on. For example, the library and tests can be built and run like
+so:
+
+ meson setup -Dtests=true build
+ cd build
+ ninja test
+
+See the [Meson documentation][] for more details on using Meson.
+
+Usage
+-----
+
+Rerex has a small API based around two objects: a _pattern_, which is an
+immutable compiled representation of a regular expression, and a _matcher_
+which can be used to match strings against a pattern. See [the
+header](include/rerex/rerex.h) for details, there is no external documentation
+aside from this README.
+
+This simple example function uses everything in the API to provide a high-level
+string-based match function:
+
+```c
+bool matches(const char* regexp, const char* string)
+{
+ RerexPattern* pattern = NULL;
+ size_t end = 0;
+ RerexStatus st = rerex_compile(regexp, &end, &pattern);
+
+ if (st) {
+ fprintf(stderr, "(string):%zu: error: %s\n", end, rerex_strerror(st));
+ return false;
+ }
+
+ RerexMatcher* matcher = rerex_new_matcher(pattern);
+ bool matches = rerex_match(matcher, string);
+
+ rerex_free_matcher(matcher);
+ rerex_free_pattern(pattern);
+
+ return matches;
+}
+```
+
+ -- David Robillard <d@drobilla.net>
+
+[XML EBNF notation]: http://www.w3.org/TR/REC-xml/#sec-notation
+
+[SLOC]: https://en.wikipedia.org/wiki/Source_lines_of_code
+
+[Meson]: https://mesonbuild.com/
+
+[Meson documentation]: https://mesonbuild.com/Quick-guide.html
+
+[Programming Techniques: Regular expression search algorithm]: https://dl.acm.org/doi/10.1145/363347.363387
+
+[Regular Expression Matching Can Be Simple And Fast]: https://swtch.com/~rsc/regexp/regexp1.html
diff --git a/subprojects/rerex/include/rerex/rerex.h b/subprojects/rerex/include/rerex/rerex.h
new file mode 100644
index 00000000..507a3a96
--- /dev/null
+++ b/subprojects/rerex/include/rerex/rerex.h
@@ -0,0 +1,112 @@
+/*
+ Copyright 2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef REREX_REREX_H
+#define REREX_REREX_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#if defined(_WIN32) && !defined(REREX_STATIC) && defined(REREX_INTERNAL)
+# define REREX_API __declspec(dllexport)
+#elif defined(_WIN32) && !defined(REREX_STATIC)
+# define REREX_API __declspec(dllimport)
+#elif defined(__GNUC__)
+# define REREX_API __attribute__((visibility("default")))
+#else
+# define REREX_API
+#endif
+
+#if defined(__GNUC__)
+# define REREX_CONST_API __attribute__((const)) REREX_API
+#else
+# define REREX_CONST_API REREX_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// Return status code
+typedef enum {
+ REREX_SUCCESS,
+ REREX_EXPECTED_CHAR,
+ REREX_EXPECTED_ELEMENT,
+ REREX_EXPECTED_RBRACKET,
+ REREX_EXPECTED_RPAREN,
+ REREX_EXPECTED_SPECIAL,
+ REREX_UNEXPECTED_SPECIAL,
+ REREX_UNEXPECTED_END,
+ REREX_UNORDERED_RANGE,
+} RerexStatus;
+
+/// Pattern that represents a compiled valid regular expression
+typedef struct RerexPatternImpl RerexPattern;
+
+/// Matcher that can be used to match strings against a pattern
+typedef struct RerexMatcherImpl RerexMatcher;
+
+/// Return a human-readable description of `status`
+REREX_CONST_API
+const char*
+rerex_strerror(RerexStatus status);
+
+/**
+ Build a regular expression from a pattern string.
+
+ The `pattern` must be a null-terminated string in supported regular
+ expression syntax. Upon return, `end` is set to the offset of the last
+ character processed. On success, `REREX_SUCCESS` is returned, and `out` is
+ pointed to a newly allocated pattern which must be freed with
+ rerex_free_pattern(). On error, `out` is unchanged, an error code is
+ returned, and `end` can be used to determine the position of the error in
+ the pattern.
+*/
+REREX_API
+RerexStatus
+rerex_compile(const char* pattern, size_t* end, RerexPattern** out);
+
+/**
+ Allocate a new matcher for matching against a pattern.
+
+ The returned matcher can be used to match several strings against a single
+ pattern using rerex_match(). It is newly allocated and must be freed with
+ rerex_free_matcher().
+*/
+REREX_API
+RerexMatcher*
+rerex_new_matcher(const RerexPattern* regexp);
+
+/// Return true if `string` matches the pattern of `matcher`
+REREX_API
+bool
+rerex_match(RerexMatcher* matcher, const char* string);
+
+/// Free a matcher allocated with rerex_new_matcher()
+REREX_API
+void
+rerex_free_matcher(RerexMatcher* matcher);
+
+/// Free a pattern allocated with rerex_compile()
+REREX_API
+void
+rerex_free_pattern(RerexPattern* expression);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // REREX_REREX_H
diff --git a/subprojects/rerex/meson.build b/subprojects/rerex/meson.build
new file mode 100644
index 00000000..af64fac9
--- /dev/null
+++ b/subprojects/rerex/meson.build
@@ -0,0 +1,121 @@
+project('rerex', ['c'],
+ version: '0.0.1',
+ license: 'ISC',
+ meson_version: '>= 0.49.0',
+ default_options: [
+ 'b_ndebug=if-release',
+ 'buildtype=release',
+ 'c_std=c99',
+ 'default_library=shared',
+ 'warning_level=3',
+ ])
+
+major_version = meson.project_version().split('.')[0]
+version_suffix = '-@0@'.format(major_version)
+versioned_name = 'rerex' + version_suffix
+
+# Load build tools
+pkg = import('pkgconfig')
+cc = meson.get_compiler('c')
+
+# Set ultra strict warnings for developers, if requested
+c_warnings = []
+c_suppressions = []
+if get_option('strict')
+ if not meson.is_subproject()
+ subdir('meson')
+ add_project_arguments(all_c_warnings, language: ['c'])
+ endif
+
+ if cc.get_id() == 'gcc'
+ c_suppressions = [
+ '-Wno-aggregate-return',
+ '-Wno-switch-default',
+ ]
+ elif cc.get_id() == 'msvc'
+ c_suppressions = [
+ '/wd4706', # assignment within conditional expression
+ '/wd4711', # function selected for automatic inline expansion
+ '/wd5045', # will insert Spectre mitigation
+ ]
+ else
+ c_suppressions = []
+ endif
+endif
+
+rerex_c_args = cc.get_supported_arguments(c_suppressions)
+
+if cc.get_id() == 'msvc'
+ # Build as C++
+ add_project_arguments(['/TP'], language: ['c'])
+
+ # Suppress warnings in system headers
+ add_project_arguments(['/experimental:external',
+ '/external:W0',
+ '/external:anglebrackets'],
+ language: ['c'])
+endif
+
+# Determine library type and the flags needed to build it
+if get_option('default_library') == 'both'
+ if host_machine.system() == 'windows'
+ error('default_library=both is not supported on Windows')
+ endif
+
+ library_type = 'both_libraries'
+ library_args = ['-DREREX_INTERNAL']
+elif get_option('default_library') == 'shared'
+ library_type = 'shared_library'
+ library_args = ['-DREREX_INTERNAL']
+else
+ library_type = 'static_library'
+ library_args = ['-DREREX_INTERNAL', '-DREREX_STATIC']
+endif
+
+# Build shared and/or static library/libraries
+librerex = build_target(
+ versioned_name,
+ ['src/rerex.c'],
+ version: meson.project_version(),
+ include_directories: include_directories(['include']),
+ c_args: rerex_c_args + library_args,
+ gnu_symbol_visibility: 'hidden',
+ install: true,
+ target_type: library_type)
+
+# Generage pkg-config file
+pkg.generate(
+ librerex,
+ name: 'Rerex',
+ filebase: versioned_name,
+ subdirs: [versioned_name],
+ version: meson.project_version(),
+ description: 'A simple and efficient regular expression implementation')
+
+# Install header to a versioned include directory
+install_headers(['include/rerex/rerex.h'], subdir: versioned_name / 'rerex')
+
+rerex_dep = declare_dependency(
+ link_with: librerex,
+ include_directories: include_directories(['include']))
+
+# Build and run tests
+if get_option('tests')
+ foreach name : ['syntax', 'match', 'xsd']
+ full_name = 'test_@0@'.format(name)
+ test(full_name,
+ executable(full_name,
+ 'test/@0@.c'.format(full_name),
+ c_args: rerex_c_args,
+ include_directories: include_directories(['include']),
+ dependencies: rerex_dep))
+ endforeach
+endif
+
+if meson.version().version_compare('>=0.53.0')
+ summary('Tests', get_option('tests'), bool_yn: true)
+
+ summary('Install prefix', get_option('prefix'))
+ summary('Headers', get_option('prefix') / get_option('includedir'))
+ summary('Libraries', get_option('prefix') / get_option('libdir'))
+endif
diff --git a/subprojects/rerex/meson/meson.build b/subprojects/rerex/meson/meson.build
new file mode 100644
index 00000000..5dd0de38
--- /dev/null
+++ b/subprojects/rerex/meson/meson.build
@@ -0,0 +1,204 @@
+# General code to enable approximately all warnings.
+#
+# This is trivial for clang and MSVC, but GCC does not have such an option, and
+# has several esoteric warnings, so we need to enable everything we want
+# explicitly. We enable everything that does not require a value argument,
+# except for warnings that are only relevant for very old languages (earlier
+# than C99 or C++11) or non-standard extensions.
+#
+# Omitted common warnings:
+#
+# Wabi=
+# Waggregate-return
+# Walloc-size-larger-than=BYTES
+# Walloca-larger-than=BYTES
+# Wframe-larger-than=BYTES
+# Wlarger-than=BYTES
+# Wstack-usage=BYTES
+# Wsystem-headers
+# Wtraditional
+# Wtraditional-conversion
+# Wtrampolines
+# Wvla-larger-than=BYTES
+#
+# Omitted C warnings:
+#
+# Wc90-c99-compat
+# Wdeclaration-after-statement
+# Wtraditional
+# Wtraditional-conversion
+#
+# Omitted C++ warnings:
+#
+# Wnamespaces
+# Wtemplates
+
+gcc_common_warnings = [
+ '-Walloc-zero',
+ '-Walloca',
+ '-Wanalyzer-too-complex',
+ '-Warith-conversion',
+ '-Warray-bounds=2',
+ '-Wattribute-alias=2',
+ '-Wcast-align=strict',
+ '-Wcast-qual',
+ '-Wconversion',
+ '-Wdate-time',
+ '-Wdisabled-optimization',
+ '-Wdouble-promotion',
+ '-Wduplicated-branches',
+ '-Wduplicated-cond',
+ '-Wfloat-equal',
+ '-Wformat-overflow=2',
+ '-Wformat-signedness',
+ '-Wformat-truncation=2',
+ '-Wformat=2',
+ '-Wimplicit-fallthrough=2',
+ '-Winit-self',
+ '-Winline',
+ '-Winvalid-pch',
+ '-Wlogical-op',
+ '-Wmissing-declarations',
+ '-Wmissing-include-dirs',
+ '-Wmultichar',
+ '-Wnormalized=nfc',
+ '-Wnull-dereference',
+ '-Wpacked',
+ '-Wpadded',
+ '-Wredundant-decls',
+ '-Wscalar-storage-order',
+ '-Wshadow',
+ '-Wshift-overflow=2',
+ '-Wsizeof-array-argument',
+ '-Wstack-protector',
+ '-Wstrict-aliasing=3',
+ '-Wstrict-overflow=5',
+ '-Wstringop-overflow=3',
+ '-Wsuggest-attribute=cold',
+ '-Wsuggest-attribute=const',
+ '-Wsuggest-attribute=format',
+ '-Wsuggest-attribute=malloc',
+ '-Wsuggest-attribute=noreturn',
+ '-Wsuggest-attribute=pure',
+ '-Wswitch-default',
+ '-Wswitch-enum',
+ '-Wsync-nand',
+ '-Wundef',
+ '-Wunused-const-variable=2',
+ '-Wunused-macros',
+ '-Wvarargs',
+ '-Wvector-operation-performance',
+ '-Wvla',
+ '-Wwrite-strings',
+]
+
+gcc_c_warnings = [
+ '-Wbad-function-cast',
+ '-Wc++-compat',
+ '-Wc99-c11-compat',
+ '-Wdesignated-init',
+ '-Wdiscarded-array-qualifiers',
+ '-Wdiscarded-qualifiers',
+ '-Wincompatible-pointer-types',
+ '-Wjump-misses-init',
+ '-Wmissing-prototypes',
+ '-Wnested-externs',
+ '-Wold-style-definition',
+ '-Wstrict-prototypes',
+ '-Wunsuffixed-float-constants',
+]
+
+# Set all_c_warnings for the current C compiler
+if is_variable('cc') and not is_variable('all_c_warnings')
+ if cc.get_id() == 'clang'
+ all_c_warnings = ['-Weverything']
+ elif cc.get_id() == 'gcc'
+ all_c_warnings = gcc_common_warnings + [
+ '-Wbad-function-cast',
+ '-Wc++-compat',
+ '-Wc99-c11-compat',
+ '-Wdesignated-init',
+ '-Wdiscarded-array-qualifiers',
+ '-Wdiscarded-qualifiers',
+ '-Wincompatible-pointer-types',
+ '-Wjump-misses-init',
+ '-Wmissing-prototypes',
+ '-Wnested-externs',
+ '-Wold-style-definition',
+ '-Wstrict-prototypes',
+ '-Wunsuffixed-float-constants',
+ ]
+ elif cc.get_id() == 'msvc'
+ all_c_warnings = ['/Wall']
+ else
+ all_c_warnings = []
+ endif
+
+ all_c_warnings = cc.get_supported_arguments(all_c_warnings)
+
+endif
+
+# Set all_cpp_warnings for the current C++ compiler
+if is_variable('cpp') and not is_variable('all_cpp_warnings')
+ if cpp.get_id() == 'clang'
+ all_cpp_warnings = [
+ '-Weverything',
+ '-Wno-c++98-compat',
+ '-Wno-c++98-compat-pedantic'
+ ]
+ elif cpp.get_id() == 'gcc'
+ all_cpp_warnings = gcc_common_warnings + [
+ '-Wabi-tag',
+ '-Waligned-new=all',
+ '-Wcatch-value=3',
+ '-Wcomma-subscript',
+ '-Wconditionally-supported',
+ '-Wctor-dtor-privacy',
+ '-Wdeprecated-copy-dtor',
+ '-Weffc++',
+ '-Wextra-semi',
+ '-Wmismatched-tags',
+ '-Wmultiple-inheritance',
+ '-Wnoexcept',
+ '-Wnoexcept-type',
+ '-Wnon-virtual-dtor',
+ '-Wold-style-cast',
+ '-Woverloaded-virtual',
+ '-Wplacement-new=2',
+ '-Wredundant-tags',
+ '-Wregister',
+ '-Wsign-promo',
+ '-Wstrict-null-sentinel',
+ '-Wsuggest-final-methods',
+ '-Wsuggest-final-types',
+ '-Wsuggest-override',
+ '-Wuseless-cast',
+ '-Wvirtual-inheritance',
+ '-Wvolatile',
+ '-Wzero-as-null-pointer-constant',
+ ]
+ elif cpp.get_id() == 'msvc'
+ all_cpp_warnings = ['/Wall']
+ else
+ all_cpp_warnings = []
+ endif
+
+ all_cpp_warnings = cpp.get_supported_arguments(all_cpp_warnings)
+
+endif
+
+# Set all_objc_warnings for the current Objective C compiler
+if is_variable('objcc') and not is_variable('all_objc_warnings')
+ all_objc_warnings = []
+ if objcc.get_id() == 'clang'
+ all_objc_warnings = ['-Weverything']
+ elif objc.get_id() == 'gcc'
+ all_objc_warnings = gcc_common_warnings + [
+ '-Wno-direct-ivar-access',
+ ]
+ else
+ all_objc_warnings = []
+ endif
+
+ all_objc_warnings = objcc.get_supported_arguments(all_objc_warnings)
+endif
diff --git a/subprojects/rerex/meson_options.txt b/subprojects/rerex/meson_options.txt
new file mode 100644
index 00000000..f2b59781
--- /dev/null
+++ b/subprojects/rerex/meson_options.txt
@@ -0,0 +1,5 @@
+option('strict', type: 'boolean', value: false,
+ description: 'Enable ultra-strict warnings')
+
+option('tests', type: 'boolean', value: true,
+ description: 'Build tests')
diff --git a/subprojects/rerex/src/.clang-tidy b/subprojects/rerex/src/.clang-tidy
new file mode 100644
index 00000000..420ce833
--- /dev/null
+++ b/subprojects/rerex/src/.clang-tidy
@@ -0,0 +1,8 @@
+Checks: >
+ *,
+ -hicpp-multiway-paths-covered,
+ -llvmlibc-*,
+ -misc-no-recursion,
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+FormatStyle: file
diff --git a/subprojects/rerex/src/rerex.c b/subprojects/rerex/src/rerex.c
new file mode 100644
index 00000000..49970a45
--- /dev/null
+++ b/subprojects/rerex/src/rerex.c
@@ -0,0 +1,765 @@
+/*
+ Copyright 2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+static const char cmin = 0x20; // Inclusive minimum normal character
+static const char cmax = 0x7E; // Inclusive maximum normal character
+
+const char*
+rerex_strerror(const RerexStatus status)
+{
+ switch (status) {
+ case REREX_SUCCESS:
+ return "Success";
+ case REREX_EXPECTED_CHAR:
+ return "Expected a regular character";
+ case REREX_EXPECTED_ELEMENT:
+ return "Expected a character in a set";
+ case REREX_EXPECTED_RBRACKET:
+ return "Expected ']'";
+ case REREX_EXPECTED_RPAREN:
+ return "Expected ')'";
+ case REREX_EXPECTED_SPECIAL:
+ return "Expected a special character (one of \"()*+-?[]^|\")";
+ case REREX_UNEXPECTED_SPECIAL:
+ return "Unexpected special character";
+ case REREX_UNEXPECTED_END:
+ return "Unexpected end of input";
+ case REREX_UNORDERED_RANGE:
+ return "Range is out of order";
+ }
+
+ return "Unknown error";
+}
+
+/* State */
+
+#define NO_STATE 0
+
+// The ID for a state, which is an index into the state array
+typedef size_t StateIndex;
+
+// A code point (currently only 8-bit ASCII but we use the space anyway)
+typedef int Codepoint;
+
+// Special type for states with only epsilon out arcs
+typedef enum {
+ REREX_MATCH = 0xE000, ///< Matching state, no out arcs
+ REREX_SPLIT = 0xE001, ///< Splitting state, one or two out arcs
+} StateType;
+
+/* A state in an NFA.
+
+ A state in Thompson's NFA can have either a single character-labeled
+ transition to another state, or up to two unlabeled epsilon transitions to
+ other states. There is both a minimum and maximum label for supporting
+ character ranges. So, either `min` and `max` are ASCII characters that are
+ the label of an arc to next1 (and next2 is null), or `min` is a special
+ StateType and next1 and/or next2 may be set to successor states.
+*/
+typedef struct {
+ StateIndex next1; ///< Head of first out arc (or NULL)
+ StateIndex next2; ///< Head of second out arc (or NULL)
+ Codepoint min; ///< Special type, or inclusive min label for next1
+ Codepoint max; ///< Inclusive max label for next2
+} State;
+
+// Create a match (end) state with no successors
+static State
+match_state(void)
+{
+ const State s = {NO_STATE, NO_STATE, REREX_MATCH, 0};
+ return s;
+}
+
+// Create a split state with at most two successors
+static State
+split_state(const StateIndex next1, const StateIndex next2)
+{
+ const State s = {next1, next2, REREX_SPLIT, 0};
+ return s;
+}
+
+// Create a labeled state with one successor reached by a character arc
+static State
+range_state(const char min, const char max, const StateIndex next)
+{
+ const State s = {next, NO_STATE, min, max};
+ return s;
+}
+
+/* Array of states.
+
+ States are stored in a flat array to reduce memory fragmentation, and for
+ easy memory management since the automata graph may be cyclic. This simple
+ implementation calls realloc() for every state, which isn't terribly
+ efficient, but works well enough. Note that state addresses therefore change
+ during compilation, so states are generally referred to by their index, and
+ not by pointer. Conveniently, using indices is also useful during matching
+ for storing auxiliary information about states.
+*/
+typedef struct {
+ State* states;
+ size_t n_states;
+} StateArray;
+
+// Append a new state to the end of the state array
+static StateIndex
+add_state(StateArray* const array, const State state)
+{
+ const size_t new_n_states = array->n_states + 1;
+ const size_t new_size = new_n_states * sizeof(State);
+
+ array->states = (State*)realloc(array->states, new_size);
+ array->states[array->n_states] = state;
+
+ return array->n_states++;
+}
+
+/* Automata.
+ *
+ * This is a lightweiht description of an NFA fragment. The states are stored
+ * elsewhere, this is just used to provide a facade to conceptually work with
+ * automata for high-level operations.
+ */
+typedef struct {
+ StateIndex start;
+ StateIndex end;
+} Automata;
+
+// Simple utility function for making an automata in an expression
+static Automata
+make_automata(const StateIndex start, const StateIndex end)
+{
+ Automata result = {start, end};
+ return result;
+}
+
+// Return whether `nfa` has only two simple states (used for optimizations)
+static inline bool
+is_trivial(const StateArray* const states, const Automata nfa)
+{
+ return (states->states[nfa.start].min < REREX_MATCH &&
+ states->states[nfa.start].next1 == nfa.end);
+}
+
+// Kleene's Star of an NFA
+static Automata
+star(StateArray* const states, const Automata nfa)
+{
+ const StateIndex end = add_state(states, match_state());
+ const StateIndex start = add_state(states, split_state(nfa.start, end));
+
+ states->states[nfa.end] = split_state(nfa.start, end);
+
+ return make_automata(start, end);
+}
+
+// Zero-or-one of an NFA
+static Automata
+question(StateArray* const states, const Automata nfa)
+{
+ const StateIndex start = add_state(states, split_state(nfa.start, nfa.end));
+
+ return make_automata(start, nfa.end);
+}
+
+// One-or-more of an NFA
+static Automata
+plus(StateArray* const states, const Automata nfa)
+{
+ const StateIndex end = add_state(states, match_state());
+
+ states->states[nfa.end] = split_state(nfa.start, end);
+
+ return make_automata(nfa.start, end);
+}
+
+// Concatenation of two NFAs
+static Automata
+concatenate(StateArray* const states, const Automata a, const Automata b)
+{
+ if (is_trivial(states, a)) {
+ // Optimization: link a's start directly to b's start (drop a's end)
+ states->states[a.start].next1 = b.start;
+ } else {
+ states->states[a.end] = split_state(b.start, NO_STATE);
+ }
+
+ return make_automata(a.start, b.end);
+}
+
+// Alternation (OR) of two NFAs
+static Automata
+alternate(StateArray* const states, const Automata a, const Automata b)
+{
+ const StateIndex split = add_state(states, split_state(a.start, b.start));
+
+ if (is_trivial(states, a)) {
+ // Optimization: link a's start directly to b's end (drop a's end)
+ states->states[a.start].next1 = b.end;
+ return make_automata(split, b.end);
+ }
+
+ if (is_trivial(states, b)) {
+ // Optimization: link b's start directly to a's end (drop b's end)
+ states->states[b.start].next1 = a.end;
+ return make_automata(split, a.end);
+ }
+
+ const StateIndex end = add_state(states, match_state());
+
+ states->states[a.end] = split_state(end, NO_STATE);
+ states->states[b.end] = split_state(end, NO_STATE);
+
+ return make_automata(split, end);
+}
+
+/* Parser input.
+ *
+ * The parser reads from a string one character at a time, though it would be
+ * simple to change this to read from any stream. All reading is done by three
+ * operations: peek, peekahead, and eat.
+ */
+typedef struct {
+ const char* const str;
+ size_t offset;
+} Input;
+
+// Return the next character in the input without consuming it
+static inline char
+peek(Input* const input)
+{
+ return input->str[input->offset];
+}
+
+// Return the next-next character in the input without consuming any
+static inline char
+peekahead(Input* const input)
+{
+ // Unfortunately we need 2-char lookahead for the ambiguity of '-' in sets
+ return input->str[input->offset + 1];
+}
+
+// Consume and return the next character in the input
+static inline char
+eat(Input* const input)
+{
+ return input->str[input->offset++];
+}
+
+// Forward declaration for read_expr because it is called recursively
+static RerexStatus
+read_expr(Input* input, StateArray* states, Automata* out);
+
+// DOT ::= '.'
+// OPERATOR ::= '*' | '+' | '?'
+// SPECIAL ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}'
+static bool
+is_special(const char c)
+{
+ switch (c) {
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case '.':
+ case '?':
+ case '[':
+ case ']':
+ case '^':
+ case '{':
+ case '|':
+ case '}':
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+// DOT ::= '.'
+static RerexStatus
+read_dot(Input* const input, StateArray* const states, Automata* const out)
+{
+ assert(peek(input) == '.');
+ eat(input);
+
+ const StateIndex end = add_state(states, match_state());
+ const StateIndex start = add_state(states, range_state(cmin, cmax, end));
+
+ *out = make_automata(start, end);
+
+ return REREX_SUCCESS;
+}
+
+// ESCAPE ::= '\' SPECIAL
+static RerexStatus
+read_escape(Input* const input, char* const out)
+{
+ assert(peek(input) == '\\');
+ eat(input);
+
+ const char c = peek(input);
+ if (is_special(c) || c == '-') {
+ *out = eat(input);
+ return REREX_SUCCESS;
+ }
+
+ return REREX_EXPECTED_SPECIAL;
+}
+
+// CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL
+static RerexStatus
+read_char(Input* const input, char* const out)
+{
+ const char c = peek(input);
+
+ switch (c) {
+ case '\0':
+ return REREX_UNEXPECTED_END;
+ case '\\':
+ return read_escape(input, out);
+ default:
+ break;
+ }
+
+ if (is_special(c)) {
+ return REREX_UNEXPECTED_SPECIAL;
+ }
+
+ if (c >= cmin && c <= cmax) {
+ *out = eat(input);
+ return REREX_SUCCESS;
+ }
+
+ return REREX_EXPECTED_CHAR;
+}
+
+// ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']')
+static RerexStatus
+read_element(Input* const input, char* const out)
+{
+ const char c = peek(input);
+
+ switch (c) {
+ case '\0':
+ return REREX_UNEXPECTED_END;
+ case ']':
+ return REREX_UNEXPECTED_SPECIAL;
+ case '\\':
+ eat(input);
+ if (peek(input) != ']') {
+ return REREX_EXPECTED_RBRACKET;
+ }
+
+ *out = eat(input);
+ return REREX_SUCCESS;
+ default:
+ break;
+ }
+
+ if (c >= cmin && c <= cmax) {
+ *out = eat(input);
+ return REREX_SUCCESS;
+ }
+
+ return REREX_EXPECTED_ELEMENT;
+}
+
+// Range ::= ELEMENT | ELEMENT '-' ELEMENT
+static RerexStatus
+read_range(Input* const input,
+ StateArray* const states,
+ const bool negated,
+ Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ char min = 0;
+ char max = 0;
+
+ // Read the first (or only) character
+ if ((st = read_element(input, &min))) {
+ return st;
+ }
+
+ // Handle '-' which is a bit hairy because it may or may not be special
+ if (peek(input) == '-') {
+ switch (peekahead(input)) {
+ case ']':
+ // Weird case like [a-] where '-' is a character
+ max = min;
+ break;
+ default:
+ // Normal range like [a-z] (note that '[' isn't special here)
+ eat(input);
+ if ((st = read_element(input, &max))) {
+ return st;
+ }
+ break;
+ }
+ } else {
+ // Single character element
+ max = min;
+ }
+
+ if (max < min) {
+ return REREX_UNORDERED_RANGE;
+ }
+
+ const StateIndex end = add_state(states, match_state());
+ if (negated) {
+ const char emin = (char)(min - 1);
+ const char emax = (char)(max + 1);
+ const StateIndex low = add_state(states, range_state(cmin, emin, end));
+ const StateIndex high = add_state(states, range_state(emax, cmax, end));
+ const StateIndex fork = add_state(states, split_state(low, high));
+
+ *out = make_automata(fork, end);
+ } else {
+ const StateIndex start = add_state(states, range_state(min, max, end));
+
+ *out = make_automata(start, end);
+ }
+
+ return st;
+}
+
+// Set ::= Range | Range Set
+static RerexStatus
+read_set(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ bool negated = false;
+
+ if (peek(input) == '^') {
+ eat(input);
+ negated = true;
+ }
+
+ Automata nfa = {NO_STATE, NO_STATE};
+ if ((st = read_range(input, states, negated, &nfa))) {
+ return st;
+ }
+
+ while (peek(input) != ']') {
+ Automata range_nfa = {NO_STATE, NO_STATE};
+ if ((st = read_range(input, states, negated, &range_nfa))) {
+ return st;
+ }
+
+ nfa = alternate(states, nfa, range_nfa);
+ }
+
+ *out = nfa;
+ return st;
+}
+
+// Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']'
+static RerexStatus
+read_atom(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+
+ switch (peek(input)) {
+ case '(':
+ eat(input);
+ if ((st = read_expr(input, states, out))) {
+ return st;
+ }
+
+ if (peek(input) != ')') {
+ return REREX_EXPECTED_RPAREN;
+ }
+
+ eat(input);
+ return st;
+
+ case '.':
+ return read_dot(input, states, out);
+
+ case '[':
+ eat(input);
+ if ((st = read_set(input, states, out))) {
+ return st;
+ }
+
+ eat(input);
+ return st;
+
+ default:
+ break;
+ }
+
+ char c = 0;
+ if ((st = read_char(input, &c))) {
+ return st;
+ }
+
+ const StateIndex end = add_state(states, match_state());
+ const StateIndex start = add_state(states, range_state(c, c, end));
+
+ *out = make_automata(start, end);
+
+ return st;
+}
+
+// OPERATOR ::= '*' | '+' | '?'
+// Factor ::= Atom | Atom OPERATOR
+static RerexStatus
+read_factor(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ Automata atom_nfa = {NO_STATE, NO_STATE};
+
+ if (!(st = read_atom(input, states, &atom_nfa))) {
+ const char c = peek(input);
+ switch (c) {
+ case '*':
+ eat(input);
+ *out = star(states, atom_nfa);
+ break;
+ case '+':
+ eat(input);
+ *out = plus(states, atom_nfa);
+ break;
+ case '?':
+ eat(input);
+ *out = question(states, atom_nfa);
+ break;
+ default:
+ *out = atom_nfa;
+ break;
+ }
+ }
+
+ return st;
+}
+
+// Term ::= Factor | Factor Term
+static RerexStatus
+read_term(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ Automata factor_nfa = {NO_STATE, NO_STATE};
+ Automata term_nfa = {NO_STATE, NO_STATE};
+
+ if (!(st = read_factor(input, states, &factor_nfa))) {
+ switch (peek(input)) {
+ case '\0':
+ case ')':
+ case '|':
+ *out = factor_nfa;
+ break;
+ default:
+ if (!(st = read_term(input, states, &term_nfa))) {
+ *out = concatenate(states, factor_nfa, term_nfa);
+ }
+ break;
+ }
+ }
+
+ return st;
+}
+
+// Expr ::= Term | Term '|' Expr
+static RerexStatus
+read_expr(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ Automata term_nfa = {NO_STATE, NO_STATE};
+ Automata expr_nfa = {NO_STATE, NO_STATE};
+
+ if ((st = read_term(input, states, &term_nfa))) {
+ return st;
+ }
+
+ if (peek(input) == '|') {
+ eat(input);
+ if (!(st = read_expr(input, states, &expr_nfa))) {
+ *out = alternate(states, term_nfa, expr_nfa);
+ }
+ } else {
+ *out = term_nfa;
+ }
+
+ return st;
+}
+
+/* Pattern.
+ *
+ * A pattern is simply an array of states and an index to the start state. The
+ * end state(s) are known because they have type REREX_MATCH. A pattern is
+ * immutable after construction, the matcher does not modify it.
+ */
+struct RerexPatternImpl {
+ StateArray states;
+ StateIndex start;
+};
+
+void
+rerex_free_pattern(RerexPattern* const regexp)
+{
+ free(regexp->states.states);
+ free(regexp);
+}
+
+RerexStatus
+rerex_compile(const char* const pattern,
+ size_t* const end,
+ RerexPattern** const out)
+{
+ Input input = {pattern, 0};
+ Automata nfa = {NO_STATE, NO_STATE};
+ StateArray states = {NULL, 0};
+
+ // Add null state so that no actual state has NO_STATE as an ID
+ add_state(&states, split_state(NO_STATE, NO_STATE));
+
+ const RerexStatus st = read_expr(&input, &states, &nfa);
+ if (st) {
+ free(states.states);
+ } else {
+ *out = (RerexPattern*)malloc(sizeof(RerexPattern));
+ (*out)->states = states;
+ (*out)->start = nfa.start;
+ }
+
+ *end = input.offset;
+ return st;
+}
+
+/* Matcher */
+
+typedef struct {
+ StateIndex* indices; // Array of state indices
+ size_t n_indices; // Number of elements in indices
+} IndexList;
+
+/* Matcher.
+ *
+ * The matcher tracks active states by keeping two lists of indices: one for
+ * the current iteration, and one for the next. A separate array, keyed by
+ * state index, stores the number of the last iteration the state was entered
+ * in. This makes it simple and fast to check if a state has already been
+ * entered in the current iteration, avoiding the need to search the active
+ * list for every entered state.
+ */
+struct RerexMatcherImpl {
+ const RerexPattern* regexp; // Pattern to match against
+ IndexList active[2]; // Two lists of active states
+ size_t* last_active; // Last iteration a state was active
+};
+
+RerexMatcher*
+rerex_new_matcher(const RerexPattern* const regexp)
+{
+ const size_t n_states = regexp->states.n_states;
+ RerexMatcher* const m = (RerexMatcher*)calloc(1, sizeof(RerexMatcher));
+
+ m->regexp = regexp;
+ m->active[0].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+ m->active[1].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+ m->last_active = (size_t*)calloc(n_states, sizeof(size_t));
+
+ return m;
+}
+
+void
+rerex_free_matcher(RerexMatcher* const matcher)
+{
+ free(matcher->last_active);
+ free(matcher->active[1].indices);
+ free(matcher->active[0].indices);
+ free(matcher);
+}
+
+// Add `s` and any epsilon successors to the active list
+static void
+enter_state(RerexMatcher* const matcher,
+ const size_t step,
+ IndexList* const list,
+ const StateIndex s)
+{
+ const StateArray* const states = &matcher->regexp->states;
+
+ if (s && matcher->last_active[s] != step) {
+ matcher->last_active[s] = step;
+
+ const State* const state = &states->states[s];
+ if (state->min == REREX_SPLIT) {
+ enter_state(matcher, step, list, state->next1);
+ enter_state(matcher, step, list, state->next2);
+ } else {
+ list->indices[list->n_indices++] = s;
+ }
+ }
+}
+
+// Run `matcher` and return true if `string` matches the expression
+bool
+rerex_match(RerexMatcher* const matcher, const char* const string)
+{
+ const StateArray* const states = &matcher->regexp->states;
+
+ // Reset matcher to a consistent initial state
+ matcher->active[0].n_indices = 0;
+ matcher->active[1].n_indices = 0;
+ for (size_t i = 0; i < states->n_states; ++i) {
+ matcher->last_active[i] = SIZE_MAX;
+ }
+
+ // Enter start state
+ enter_state(matcher, 0, &matcher->active[0], matcher->regexp->start);
+
+ // Tick the matcher for every input character
+ bool phase = 0;
+ for (size_t i = 0; string[i]; ++i) {
+ const char c = string[i];
+ IndexList* const list = &matcher->active[phase];
+ IndexList* const next_list = &matcher->active[!phase];
+
+ // Add successor states to the next iteration's list
+ next_list->n_indices = 0;
+ for (size_t j = 0; j < list->n_indices; ++j) {
+ const State* const state = &states->states[list->indices[j]];
+ if (state->min <= c && c <= state->max) {
+ enter_state(matcher, i + 1, next_list, state->next1);
+ }
+ }
+
+ // Flip phase to swap active lists
+ phase = !phase;
+ }
+
+ // Check if match state is entered in the end
+ IndexList* const active = &matcher->active[phase];
+ for (size_t i = 0; i < active->n_indices; ++i) {
+ if (states->states[active->indices[i]].min == REREX_MATCH) {
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/subprojects/rerex/test/test_match.c b/subprojects/rerex/test/test_match.c
new file mode 100644
index 00000000..fa270e83
--- /dev/null
+++ b/subprojects/rerex/test/test_match.c
@@ -0,0 +1,238 @@
+/*
+ Copyright 2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#undef NDEBUG
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct {
+ uintptr_t match; ///< Boolean, true if text should match
+ const char* pattern; ///< Regular expression
+ const char* text; ///< Text to match against pattern
+} MatchTestCase;
+
+static const MatchTestCase match_tests[] = {
+ {1, "\\(", "("},
+ {1, "\\)", ")"},
+ {1, "\\*", "*"},
+ {1, "\\+", "+"},
+ {1, "\\-", "-"},
+ {1, "\\.", "."},
+ {1, "\\?", "?"},
+ {1, "\\[", "["},
+ {1, "\\]", "]"},
+ {1, "\\^", "^"},
+ {1, "\\|", "|"},
+ {0, ".", ""},
+ {1, ".", "a"},
+ {0, ".", "aa"},
+ {0, "..", ""},
+ {0, "..", "a"},
+ {1, "..", "aa"},
+ {1, ".*", ""},
+ {1, ".*", "a"},
+ {1, ".*", "aa"},
+ {0, ".+", ""},
+ {1, ".+", "a"},
+ {1, ".+", "aa"},
+ {1, ".?", ""},
+ {1, ".?", "a"},
+ {0, ".?", "aa"},
+ {1, "a*", ""},
+ {1, "a*", "a"},
+ {1, "a*", "aa"},
+ {0, "a*", "b"},
+ {0, "a+", ""},
+ {1, "a+", "a"},
+ {1, "a+", "aa"},
+ {0, "a+", "b"},
+ {1, "a?", ""},
+ {1, "a?", "a"},
+ {0, "a?", "aa"},
+ {0, "a?", "b"},
+ {0, "[.]", "a"},
+ {1, "[.]", "."},
+ {0, "[\\]]", "a"},
+ {1, "[\\]]", "]"},
+ {0, "[b]", "a"},
+ {1, "[b]", "b"},
+ {0, "[b]", "c"},
+ {0, "[bc]", "a"},
+ {1, "[bc]", "b"},
+ {1, "[bc]", "c"},
+ {0, "[bc]", "d"},
+ {0, "[bcd]", "a"},
+ {1, "[bcd]", "b"},
+ {1, "[bcd]", "c"},
+ {1, "[bcd]", "d"},
+ {0, "[bcd]", "e"},
+ {0, "[b-d]", "a"},
+ {1, "[b-d]", "b"},
+ {1, "[b-d]", "d"},
+ {0, "[b-d]", "e"},
+ {1, "[^b-d]", "a"},
+ {0, "[^b-d]", "b"},
+ {0, "[^b-d]", "d"},
+ {1, "[^b-d]", "e"},
+ {0, "[^ -/]", "\t"},
+ {1, "[^ -/]", "0"},
+ {1, "[^{-~]", "z"},
+ {0, "[^{-~]", "~"},
+ {0, "[A-Za-z]", "5"},
+ {1, "[A-Za-z]", "m"},
+ {1, "[A-Za-z]", "M"},
+ {0, "[A-Za-z]", "~"},
+ {0, "[+-]", "*"},
+ {1, "[+-]", "+"},
+ {0, "[+-]", ","},
+ {1, "[+-]", "-"},
+ {0, "[+-]", "."},
+ {1, "[b-d]*", ""},
+ {0, "[b-d]*", "a"},
+ {1, "[b-d]*", "b"},
+ {1, "[b-d]*", "c"},
+ {1, "[b-d]*", "cc"},
+ {1, "[b-d]*", "d"},
+ {0, "[b-d]*", "e"},
+ {0, "[b-d]+", ""},
+ {0, "[b-d]+", "a"},
+ {1, "[b-d]+", "b"},
+ {1, "[b-d]+", "c"},
+ {1, "[b-d]+", "cc"},
+ {1, "[b-d]+", "d"},
+ {0, "[b-d]+", "e"},
+ {1, "[b-d]?", ""},
+ {0, "[b-d]?", "a"},
+ {1, "[b-d]?", "b"},
+ {1, "[b-d]?", "c"},
+ {0, "[b-d]?", "cc"},
+ {1, "[b-d]?", "d"},
+ {0, "[b-d]?", "e"},
+ {1, "h(e|a)llo", "hello"},
+ {1, "h(e|a)llo", "hallo"},
+ {1, "h(e|a)+llo", "haello"},
+ {1, "h(e|a)*llo", "hllo"},
+ {1, "h(e|a)?llo", "hllo"},
+ {1, "h(e|a)?llo", "hello"},
+ {1, "h(e|a)*llo*", "haeeeallooo"},
+ {1, "(ab|a)(bc|c)", "abc"},
+ {0, "(ab|a)(bc|c)", "acb"},
+ {1, "(ab)c|abc", "abc"},
+ {0, "(ab)c|abc", "ab"},
+ {1, "(a*)(b?)(b+)", "aaabbbb"},
+ {0, "(a*)(b?)(b+)", "aaaa"},
+ {1, "((a|a)|a)", "a"},
+ {0, "((a|a)|a)", "aa"},
+ {1, "(a*)(a|aa)", "aaaa"},
+ {0, "(a*)(a|aa)", "b"},
+ {1, "a(b)|c(d)|a(e)f", "aef"},
+ {0, "a(b)|c(d)|a(e)f", "adf"},
+ {1, "(a|b)c|a(b|c)", "ac"},
+ {0, "(a|b)c|a(b|c)", "acc"},
+ {1, "(a|b)c|a(b|c)", "ab"},
+ {0, "(a|b)c|a(b|c)", "acb"},
+ {1, "(a|b)*c|(a|ab)*c", "abc"},
+ {0, "(a|b)*c|(a|ab)*c", "bbbcabbbc"},
+ {1, "a?(ab|ba)ab", "abab"},
+ {0, "a?(ab|ba)ab", "aaabab"},
+ {1, "(aa|aaa)*|(a|aaaaa)", "aa"},
+ {1, "(a)(b)(c)", "abc"},
+ {1, "((((((((((x))))))))))", "x"},
+ {1, "((((((((((x))))))))))*", "xx"},
+ {1, "a?(ab|ba)*", "ababababababababababababababababa"},
+ {1, "a*a*a*a*a*b", "aaaaaaaab"},
+ {1, "abc", "abc"},
+ {1, "ab*c", "abc"},
+ {1, "ab*bc", "abbc"},
+ {1, "ab*bc", "abbbbc"},
+ {1, "ab+bc", "abbc"},
+ {1, "ab+bc", "abbbbc"},
+ {1, "ab?bc", "abbc"},
+ {1, "ab?bc", "abc"},
+ {1, "ab|cd", "ab"},
+ {1, "(a)b(c)", "abc"},
+ {1, "a*", "aaa"},
+ {1, "(a+|b)*", "ab"},
+ {1, "(a+|b)+", "ab"},
+ {1, "a|b|c|d|e", "e"},
+ {1, "(a|b|c|d|e)f", "ef"},
+ {1, "abcd*efg", "abcdefg"},
+ {1, "(ab|ab*)bc", "abc"},
+ {1, "(ab|a)b*c", "abc"},
+ {1, "((a)(b)c)(d)", "abcd"},
+ {1, "(a|ab)(c|bcd)", "abcd"},
+ {1, "(a|ab)(bcd|c)", "abcd"},
+ {1, "(ab|a)(c|bcd)", "abcd"},
+ {1, "(ab|a)(bcd|c)", "abcd"},
+ {1, "((a|ab)(c|bcd))(d*)", "abcd"},
+ {1, "((a|ab)(bcd|c))(d*)", "abcd"},
+ {1, "((ab|a)(c|bcd))(d*)", "abcd"},
+ {1, "((ab|a)(bcd|c))(d*)", "abcd"},
+ {1, "(a|ab)((c|bcd)(d*))", "abcd"},
+ {1, "(a|ab)((bcd|c)(d*))", "abcd"},
+ {1, "(ab|a)((c|bcd)(d*))", "abcd"},
+ {1, "(ab|a)((bcd|c)(d*))", "abcd"},
+ {1, "(a*)(b|abc)", "abc"},
+ {1, "(a*)(abc|b)", "abc"},
+ {1, "((a*)(b|abc))(c*)", "abc"},
+ {1, "((a*)(abc|b))(c*)", "abc"},
+ {1, "(a*)((b|abc))(c*)", "abc"},
+ {1, "(a*)((abc|b)(c*))", "abc"},
+ {1, "(a*)(b|abc)", "abc"},
+ {1, "(a*)(abc|b)", "abc"},
+ {1, "((a*)(b|abc))(c*)", "abc"},
+ {1, "((a*)(abc|b))(c*)", "abc"},
+ {1, "(a*)((b|abc)(c*))", "abc"},
+ {1, "(a*)((abc|b)(c*))", "abc"},
+ {1, "(a|ab)", "ab"},
+ {1, "(ab|a)", "ab"},
+ {1, "(a|ab)(b*)", "ab"},
+ {1, "(ab|a)(b*)", "ab"},
+ {1, "(a|b)*c|(a|ab)*c", "abc"},
+};
+
+int
+main(void)
+{
+ const size_t n_tests = sizeof(match_tests) / sizeof(*match_tests);
+
+ for (size_t i = 0; i < n_tests; ++i) {
+ const char* const regexp = match_tests[i].pattern;
+ const char* const text = match_tests[i].text;
+ const bool should_match = match_tests[i].match;
+
+ RerexPattern* pattern = NULL;
+ size_t end = 0;
+ const RerexStatus st = rerex_compile(regexp, &end, &pattern);
+
+ assert(!st);
+
+ RerexMatcher* const matcher = rerex_new_matcher(pattern);
+ const bool matches = rerex_match(matcher, text);
+
+ assert(matches == should_match);
+
+ rerex_free_matcher(matcher);
+ rerex_free_pattern(pattern);
+ }
+
+ return 0;
+}
diff --git a/subprojects/rerex/test/test_syntax.c b/subprojects/rerex/test/test_syntax.c
new file mode 100644
index 00000000..9cff8855
--- /dev/null
+++ b/subprojects/rerex/test/test_syntax.c
@@ -0,0 +1,99 @@
+/*
+ Copyright 2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+// Tests that regex syntax errors are reported correctly
+
+#undef NDEBUG
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef struct {
+ RerexStatus status;
+ unsigned offset;
+ const char* pattern;
+} BadSyntaxTestCase;
+
+static const BadSyntaxTestCase syntax_tests[] = {
+ {REREX_EXPECTED_CHAR, 1, "a\b"},
+ {REREX_EXPECTED_CHAR, 1, "a\x7F"},
+ {REREX_EXPECTED_ELEMENT, 1, "[\b]"},
+ {REREX_EXPECTED_ELEMENT, 1, "[\x7F]"},
+ {REREX_EXPECTED_ELEMENT, 2, "[a\b]"},
+ {REREX_EXPECTED_ELEMENT, 2, "[a\x7F]"},
+ {REREX_EXPECTED_ELEMENT, 3, "[a-\b]"},
+ {REREX_EXPECTED_ELEMENT, 3, "[a-\x7F]"},
+ {REREX_EXPECTED_RBRACKET, 2, "[\\n]"},
+ {REREX_EXPECTED_RPAREN, 2, "(a"},
+ {REREX_EXPECTED_SPECIAL, 1, "\\n"},
+ {REREX_UNEXPECTED_END, 1, "("},
+ {REREX_UNEXPECTED_END, 1, "["},
+ {REREX_UNEXPECTED_END, 2, "[a"},
+ {REREX_UNEXPECTED_END, 3, "(a|"},
+ {REREX_UNEXPECTED_END, 3, "[a-"},
+ {REREX_UNEXPECTED_END, 4, "[a-z"},
+ {REREX_UNEXPECTED_END, 4, "[a-z"},
+ {REREX_UNEXPECTED_SPECIAL, 0, "{"},
+ {REREX_UNEXPECTED_SPECIAL, 0, "}"},
+ {REREX_UNEXPECTED_SPECIAL, 0, "?"},
+ {REREX_UNEXPECTED_SPECIAL, 1, "[]]"},
+ {REREX_UNEXPECTED_SPECIAL, 2, "a|?"},
+ {REREX_UNEXPECTED_SPECIAL, 3, "(a|?)"},
+ {REREX_UNEXPECTED_SPECIAL, 3, "[[]]"},
+ {REREX_UNEXPECTED_SPECIAL, 3, "[a]]"},
+ {REREX_UNEXPECTED_SPECIAL, 4, "[A-]]"},
+ {REREX_UNEXPECTED_SPECIAL, 4, "[a[]]"},
+ {REREX_UNEXPECTED_SPECIAL, 5, "[A-[]]"},
+ {REREX_UNORDERED_RANGE, 4, "[z-a]"},
+};
+
+static void
+test_status(void)
+{
+ assert(!strcmp(rerex_strerror((RerexStatus)INT32_MAX), "Unknown error"));
+}
+
+static void
+test_syntax(void)
+{
+ const size_t n_tests = sizeof(syntax_tests) / sizeof(*syntax_tests);
+
+ for (size_t i = 0; i < n_tests; ++i) {
+ const char* const regexp = syntax_tests[i].pattern;
+ const size_t offset = syntax_tests[i].offset;
+ const RerexStatus status = syntax_tests[i].status;
+
+ RerexPattern* pattern = NULL;
+ size_t end = 0;
+ const RerexStatus st = rerex_compile(regexp, &end, &pattern);
+
+ assert(st == status);
+ assert(!pattern);
+ assert(strcmp(rerex_strerror(st), rerex_strerror(REREX_SUCCESS)));
+ assert(end == offset);
+ }
+}
+
+int
+main(void)
+{
+ test_status();
+ test_syntax();
+ return 0;
+}
diff --git a/subprojects/rerex/test/test_xsd.c b/subprojects/rerex/test/test_xsd.c
new file mode 100644
index 00000000..81104a99
--- /dev/null
+++ b/subprojects/rerex/test/test_xsd.c
@@ -0,0 +1,522 @@
+/*
+ Copyright 2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+/*
+ Tests more realistic patterns and using a matcher multiple times.
+
+ The patterns here were written by myself and may have bugs. XSD datatypes
+ unfortunately don't have canonical patterns, though sometimes they
+ infuriatingly say that they are constrained by a pattern... which is not
+ given. XML is terrible, don't use it, but the XSD datatypes are more
+ generally useful. Some patterns here slightly fuzzy (for example, with
+ dates), so matching does not necessary mean the value itself is valid.
+
+ These don't have much to do with Rerex itself, but it's something I needed it
+ for, and having some more realistic tests is nice. These also test reusing a
+ matcher, unlike the basic match tests.
+*/
+
+#undef NDEBUG
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stddef.h>
+
+static void
+test_pattern(const char* const regexp,
+ const char* const matching[],
+ const char* const nonmatching[])
+{
+ RerexPattern* pattern = NULL;
+ size_t end = 0;
+ const RerexStatus st = rerex_compile(regexp, &end, &pattern);
+
+ assert(!st);
+
+ RerexMatcher* const matcher = rerex_new_matcher(pattern);
+
+ for (const char* const* m = matching; *m; ++m) {
+ assert(rerex_match(matcher, *m));
+ }
+
+ for (const char* const* n = nonmatching; *n; ++n) {
+ assert(!rerex_match(matcher, *n));
+ }
+
+ rerex_free_matcher(matcher);
+ rerex_free_pattern(pattern);
+}
+
+static void
+test_base64Binary(void)
+{
+ static const char* const regexp =
+ "(([A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *)*" //
+ "(([A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/])|" //
+ "([A-Za-z0-9+/] *[A-Za-z0-9+/] *[AEIMQUYcgkosw048] *=)|" //
+ "([A-Za-z0-9+/] *[AQgw] *= *=)))?";
+
+ static const char* const good[] = {
+ "0FB8",
+ "0fb8",
+ "0 FB8 0F+9",
+ "0F+40A8=",
+ "0F+40A==",
+ "",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ " 0FB8",
+ "0FB8 ",
+ " 0FB8 ",
+ "FB8",
+ "==0F",
+ "0F+40A9=",
+ "0F+40B==",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_boolean(void)
+{
+ static const char* const regexp = "(true|false|0|1)";
+ static const char* const good[] = {"true", "false", "0", "1", NULL};
+ static const char* const bad[] = {"TRUE", "T", "", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_date(void)
+{
+ static const char* const regexp = //
+ "-?[0-9][0-9][0-9][0-9][0-9]*" //
+ "-(0[1-9]|1[0-2])" //
+ "-(0[1-9]|[12][0-9]|3[01])" //
+ "(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "2004-04-12",
+ "-0045-01-01",
+ "12004-04-12",
+ "2004-04-12-05:00",
+ "2004-04-12Z",
+ "2001-10-26",
+ "2001-10-26+02:00",
+ "2001-10-26Z",
+ "2001-10-26+00:00",
+ "-2001-10-26",
+ "-20000-04-01",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "99-04-12",
+ "2004-4-2",
+ "2004/04/02",
+ "04-12-2004",
+ // "2004-04-31", // Not quite that clever...
+ "2001-10",
+ "2001-10-32",
+ "2001-13-26+02:00",
+ "01-10-26",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_decimal(void)
+{
+ static const char* const regexp =
+ "[+-]?(([0-9]+[.]?[0-9]*)|([0-9]*[.]?[0-9]+))";
+
+ static const char* const good[] = {
+ "3.0",
+ "-3.0",
+ "+3.5",
+ "3",
+ ".3",
+ "3.",
+ "0",
+ "-.3",
+ "0003.0",
+ "3.0000",
+ "-456",
+ NULL,
+ };
+
+ static const char* const bad[] = {"3,5", ".", "", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+// Tests both xsd:float and xsd:double, which are lexically identical
+static void
+test_float(void)
+{
+ static const char* const regexp = "-?INF|NaN|[+-]?(([0-9]+[.]?[0-9]*)|([0-"
+ "9]*[.]?[0-9]+))([eE][-+]?[0-9]+)?";
+
+ static const char* const good[] = {
+ "-3E2",
+ "4268.22752E11",
+ "+24.3e-3",
+ "12",
+ "+3.5",
+ "INF",
+ "-INF",
+ "-0",
+ "NaN",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "-3E2.4",
+ "12E",
+ "+INF",
+ "NAN",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_gDay(void)
+{
+ static const char* const regexp =
+ "---(0[1-9]|[12][0-9]|3[01])(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "---02",
+ "---01",
+ "---01Z",
+ "---01+02:00",
+ "---01-04:00",
+ "---15",
+ "---31",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "02",
+ "---2",
+ "---32",
+ "--30-",
+ "---35",
+ "---5",
+ "15",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_gMonth(void)
+{
+ static const char* const regexp =
+ "--(0[1-9]|1[0-2])(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "--04",
+ "--04-05:00",
+ "--05",
+ "--11Z",
+ "--11+02:00",
+ "--11-04:00",
+ "--02",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "2004-04",
+ "04",
+ "--4",
+ "--13",
+ "-01-",
+ "--13",
+ "--1",
+ "01",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_gMonthDay(void)
+{
+ static const char* const regexp = //
+ "--(0[1-9]|1[0-2])" //
+ "-(0[1-9]|[12][0-9]|3[01])" //
+ "(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "--04-12",
+ "--04-12Z",
+ "--05-01",
+ "--11-01Z",
+ "--11-01+02:00",
+ "--11-01-04:00",
+ "--11-15",
+ "--02-29",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "04-12",
+ // "--04-31", Not quite that clever...
+ "--4-6",
+ "-01-30-",
+ "--01-35",
+ "--1-5",
+ "01-15",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_gYear(void)
+{
+ static const char* const regexp =
+ "-?[0-9][0-9][0-9][0-9][0-9]*(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "2004",
+ "2004-05:00",
+ "12004",
+ "0922",
+ "-0045",
+ "2001+02:00",
+ "2001Z",
+ "2001+00:00",
+ "-2001",
+ "-20000",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "99",
+ "922",
+ "01",
+ "2001-12",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_gYearMonth(void)
+{
+ static const char* const regexp = //
+ "-?[0-9][0-9][0-9][0-9][0-9]*" //
+ "-(0[1-9]|1[0-2])" //
+ "(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "2001-10",
+ "2001-10+02:00",
+ "2001-10Z",
+ "2001-10+00:00",
+ "-2001-10",
+ "-20000-04",
+ "2004-04-05:00",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "2001",
+ "2001-13",
+ "2001-13-26+02:00",
+ "01-10",
+ "99-04",
+ "2004",
+ "2004-4",
+ "2004-13",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_hexBinary(void)
+{
+ static const char* const regexp = "([0-9A-Fa-f][0-9A-Fa-f])*";
+ static const char* const good[] = {"0FB8", "0fb8", "", NULL};
+ static const char* const bad[] = {"F", "FB8", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_integer(void)
+{
+ static const char* const regexp = "[-+]?[0-9]+";
+ static const char* const good[] = {"122", "00122", "0", "-3", "+3", NULL};
+ static const char* const bad[] = {"3.", "3.0", "A", "", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_language(void)
+{
+ static const char* const regexp = //
+ "[a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?" //
+ "[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?" //
+ "(-[a-zA-Z0-9][a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?" //
+ "[a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?)*";
+
+ static const char* const good[] = {
+ "en",
+ "en-GB",
+ "en-US",
+ "fr",
+ "fr-FR",
+ "fr-CA",
+ "de",
+ "zh",
+ "ja",
+ "ko",
+ "i-navajo",
+ "x-Newspeak",
+ "any-value-with-short-parts",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "longerThan8",
+ "even-longerThan8",
+ "longererThan8-first",
+ "last-longererThan8",
+ "middle-longererThan8-CA",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_nonNegativeInteger(void)
+{
+ static const char* const regexp = "[+]?[0-9]+";
+ static const char* const good[] = {"+3", "122", "0", "0012", "+123", NULL};
+ static const char* const bad[] = {"-3", "3.0", "", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_nonPositiveInteger(void)
+{
+ static const char* const regexp = "(0|-[0-9]+)";
+ static const char* const good[] = {"-3", "-0", "-00122", NULL};
+ static const char* const bad[] = {"122", "+3", "3.", "3.0", "", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_positiveInteger(void)
+{
+ static const char* const regexp = "[+]?[0-9]*[1-9]+[0-9]*";
+ static const char* const good[] = {"122", "+3", "00122", NULL};
+ static const char* const bad[] = {"0", "-3", "3.0", "", NULL};
+
+ test_pattern(regexp, good, bad);
+}
+
+static void
+test_time(void)
+{
+ static const char* const regexp =
+ "(([0-1][0-9])|(2[0-4])):[0-5][0-9]:[0-5][0-9](.[0-9]+)?(Z|[-+][0-2][0-"
+ "9]:[0-5][0-9])?";
+
+ static const char* const good[] = {
+ "13:20:00",
+ "13:20:30.5555",
+ "13:20:00-05:00",
+ "13:20:00Z",
+ "00:00:00",
+ "24:00:00",
+ "21:32:52",
+ "21:32:52+02:00",
+ "19:32:52Z",
+ "19:32:52+00:00",
+ "21:32:52.12679",
+ NULL,
+ };
+
+ static const char* const bad[] = {
+ "5:20:00",
+ "13:20",
+ "13:20.5:00",
+ "13:65:00",
+ "21:32",
+ "25:25:10",
+ "-10:00:00",
+ "1:20:10",
+ "",
+ NULL,
+ };
+
+ test_pattern(regexp, good, bad);
+}
+
+int
+main(void)
+{
+ test_base64Binary();
+ test_boolean();
+ test_date();
+ test_decimal();
+ test_float();
+ test_gDay();
+ test_gMonth();
+ test_gMonthDay();
+ test_gYear();
+ test_gYearMonth();
+ test_hexBinary();
+ test_integer();
+ test_language();
+ test_nonNegativeInteger();
+ test_nonPositiveInteger();
+ test_positiveInteger();
+ test_time();
+
+ return 0;
+}