aboutsummaryrefslogtreecommitdiffstats
path: root/subprojects/rerex/src
diff options
context:
space:
mode:
Diffstat (limited to 'subprojects/rerex/src')
-rw-r--r--subprojects/rerex/src/.clang-tidy8
-rw-r--r--subprojects/rerex/src/rerex.c765
2 files changed, 773 insertions, 0 deletions
diff --git a/subprojects/rerex/src/.clang-tidy b/subprojects/rerex/src/.clang-tidy
new file mode 100644
index 00000000..420ce833
--- /dev/null
+++ b/subprojects/rerex/src/.clang-tidy
@@ -0,0 +1,8 @@
+Checks: >
+ *,
+ -hicpp-multiway-paths-covered,
+ -llvmlibc-*,
+ -misc-no-recursion,
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+FormatStyle: file
diff --git a/subprojects/rerex/src/rerex.c b/subprojects/rerex/src/rerex.c
new file mode 100644
index 00000000..49970a45
--- /dev/null
+++ b/subprojects/rerex/src/rerex.c
@@ -0,0 +1,765 @@
+/*
+ Copyright 2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+static const char cmin = 0x20; // Inclusive minimum normal character
+static const char cmax = 0x7E; // Inclusive maximum normal character
+
+const char*
+rerex_strerror(const RerexStatus status)
+{
+ switch (status) {
+ case REREX_SUCCESS:
+ return "Success";
+ case REREX_EXPECTED_CHAR:
+ return "Expected a regular character";
+ case REREX_EXPECTED_ELEMENT:
+ return "Expected a character in a set";
+ case REREX_EXPECTED_RBRACKET:
+ return "Expected ']'";
+ case REREX_EXPECTED_RPAREN:
+ return "Expected ')'";
+ case REREX_EXPECTED_SPECIAL:
+ return "Expected a special character (one of \"()*+-?[]^|\")";
+ case REREX_UNEXPECTED_SPECIAL:
+ return "Unexpected special character";
+ case REREX_UNEXPECTED_END:
+ return "Unexpected end of input";
+ case REREX_UNORDERED_RANGE:
+ return "Range is out of order";
+ }
+
+ return "Unknown error";
+}
+
+/* State */
+
+#define NO_STATE 0
+
+// The ID for a state, which is an index into the state array
+typedef size_t StateIndex;
+
+// A code point (currently only 8-bit ASCII but we use the space anyway)
+typedef int Codepoint;
+
+// Special type for states with only epsilon out arcs
+typedef enum {
+ REREX_MATCH = 0xE000, ///< Matching state, no out arcs
+ REREX_SPLIT = 0xE001, ///< Splitting state, one or two out arcs
+} StateType;
+
+/* A state in an NFA.
+
+ A state in Thompson's NFA can have either a single character-labeled
+ transition to another state, or up to two unlabeled epsilon transitions to
+ other states. There is both a minimum and maximum label for supporting
+ character ranges. So, either `min` and `max` are ASCII characters that are
+ the label of an arc to next1 (and next2 is null), or `min` is a special
+ StateType and next1 and/or next2 may be set to successor states.
+*/
+typedef struct {
+ StateIndex next1; ///< Head of first out arc (or NULL)
+ StateIndex next2; ///< Head of second out arc (or NULL)
+ Codepoint min; ///< Special type, or inclusive min label for next1
+ Codepoint max; ///< Inclusive max label for next2
+} State;
+
+// Create a match (end) state with no successors
+static State
+match_state(void)
+{
+ const State s = {NO_STATE, NO_STATE, REREX_MATCH, 0};
+ return s;
+}
+
+// Create a split state with at most two successors
+static State
+split_state(const StateIndex next1, const StateIndex next2)
+{
+ const State s = {next1, next2, REREX_SPLIT, 0};
+ return s;
+}
+
+// Create a labeled state with one successor reached by a character arc
+static State
+range_state(const char min, const char max, const StateIndex next)
+{
+ const State s = {next, NO_STATE, min, max};
+ return s;
+}
+
+/* Array of states.
+
+ States are stored in a flat array to reduce memory fragmentation, and for
+ easy memory management since the automata graph may be cyclic. This simple
+ implementation calls realloc() for every state, which isn't terribly
+ efficient, but works well enough. Note that state addresses therefore change
+ during compilation, so states are generally referred to by their index, and
+ not by pointer. Conveniently, using indices is also useful during matching
+ for storing auxiliary information about states.
+*/
+typedef struct {
+ State* states;
+ size_t n_states;
+} StateArray;
+
+// Append a new state to the end of the state array
+static StateIndex
+add_state(StateArray* const array, const State state)
+{
+ const size_t new_n_states = array->n_states + 1;
+ const size_t new_size = new_n_states * sizeof(State);
+
+ array->states = (State*)realloc(array->states, new_size);
+ array->states[array->n_states] = state;
+
+ return array->n_states++;
+}
+
+/* Automata.
+ *
+ * This is a lightweiht description of an NFA fragment. The states are stored
+ * elsewhere, this is just used to provide a facade to conceptually work with
+ * automata for high-level operations.
+ */
+typedef struct {
+ StateIndex start;
+ StateIndex end;
+} Automata;
+
+// Simple utility function for making an automata in an expression
+static Automata
+make_automata(const StateIndex start, const StateIndex end)
+{
+ Automata result = {start, end};
+ return result;
+}
+
+// Return whether `nfa` has only two simple states (used for optimizations)
+static inline bool
+is_trivial(const StateArray* const states, const Automata nfa)
+{
+ return (states->states[nfa.start].min < REREX_MATCH &&
+ states->states[nfa.start].next1 == nfa.end);
+}
+
+// Kleene's Star of an NFA
+static Automata
+star(StateArray* const states, const Automata nfa)
+{
+ const StateIndex end = add_state(states, match_state());
+ const StateIndex start = add_state(states, split_state(nfa.start, end));
+
+ states->states[nfa.end] = split_state(nfa.start, end);
+
+ return make_automata(start, end);
+}
+
+// Zero-or-one of an NFA
+static Automata
+question(StateArray* const states, const Automata nfa)
+{
+ const StateIndex start = add_state(states, split_state(nfa.start, nfa.end));
+
+ return make_automata(start, nfa.end);
+}
+
+// One-or-more of an NFA
+static Automata
+plus(StateArray* const states, const Automata nfa)
+{
+ const StateIndex end = add_state(states, match_state());
+
+ states->states[nfa.end] = split_state(nfa.start, end);
+
+ return make_automata(nfa.start, end);
+}
+
+// Concatenation of two NFAs
+static Automata
+concatenate(StateArray* const states, const Automata a, const Automata b)
+{
+ if (is_trivial(states, a)) {
+ // Optimization: link a's start directly to b's start (drop a's end)
+ states->states[a.start].next1 = b.start;
+ } else {
+ states->states[a.end] = split_state(b.start, NO_STATE);
+ }
+
+ return make_automata(a.start, b.end);
+}
+
+// Alternation (OR) of two NFAs
+static Automata
+alternate(StateArray* const states, const Automata a, const Automata b)
+{
+ const StateIndex split = add_state(states, split_state(a.start, b.start));
+
+ if (is_trivial(states, a)) {
+ // Optimization: link a's start directly to b's end (drop a's end)
+ states->states[a.start].next1 = b.end;
+ return make_automata(split, b.end);
+ }
+
+ if (is_trivial(states, b)) {
+ // Optimization: link b's start directly to a's end (drop b's end)
+ states->states[b.start].next1 = a.end;
+ return make_automata(split, a.end);
+ }
+
+ const StateIndex end = add_state(states, match_state());
+
+ states->states[a.end] = split_state(end, NO_STATE);
+ states->states[b.end] = split_state(end, NO_STATE);
+
+ return make_automata(split, end);
+}
+
+/* Parser input.
+ *
+ * The parser reads from a string one character at a time, though it would be
+ * simple to change this to read from any stream. All reading is done by three
+ * operations: peek, peekahead, and eat.
+ */
+typedef struct {
+ const char* const str;
+ size_t offset;
+} Input;
+
+// Return the next character in the input without consuming it
+static inline char
+peek(Input* const input)
+{
+ return input->str[input->offset];
+}
+
+// Return the next-next character in the input without consuming any
+static inline char
+peekahead(Input* const input)
+{
+ // Unfortunately we need 2-char lookahead for the ambiguity of '-' in sets
+ return input->str[input->offset + 1];
+}
+
+// Consume and return the next character in the input
+static inline char
+eat(Input* const input)
+{
+ return input->str[input->offset++];
+}
+
+// Forward declaration for read_expr because it is called recursively
+static RerexStatus
+read_expr(Input* input, StateArray* states, Automata* out);
+
+// DOT ::= '.'
+// OPERATOR ::= '*' | '+' | '?'
+// SPECIAL ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}'
+static bool
+is_special(const char c)
+{
+ switch (c) {
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case '.':
+ case '?':
+ case '[':
+ case ']':
+ case '^':
+ case '{':
+ case '|':
+ case '}':
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+// DOT ::= '.'
+static RerexStatus
+read_dot(Input* const input, StateArray* const states, Automata* const out)
+{
+ assert(peek(input) == '.');
+ eat(input);
+
+ const StateIndex end = add_state(states, match_state());
+ const StateIndex start = add_state(states, range_state(cmin, cmax, end));
+
+ *out = make_automata(start, end);
+
+ return REREX_SUCCESS;
+}
+
+// ESCAPE ::= '\' SPECIAL
+static RerexStatus
+read_escape(Input* const input, char* const out)
+{
+ assert(peek(input) == '\\');
+ eat(input);
+
+ const char c = peek(input);
+ if (is_special(c) || c == '-') {
+ *out = eat(input);
+ return REREX_SUCCESS;
+ }
+
+ return REREX_EXPECTED_SPECIAL;
+}
+
+// CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL
+static RerexStatus
+read_char(Input* const input, char* const out)
+{
+ const char c = peek(input);
+
+ switch (c) {
+ case '\0':
+ return REREX_UNEXPECTED_END;
+ case '\\':
+ return read_escape(input, out);
+ default:
+ break;
+ }
+
+ if (is_special(c)) {
+ return REREX_UNEXPECTED_SPECIAL;
+ }
+
+ if (c >= cmin && c <= cmax) {
+ *out = eat(input);
+ return REREX_SUCCESS;
+ }
+
+ return REREX_EXPECTED_CHAR;
+}
+
+// ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']')
+static RerexStatus
+read_element(Input* const input, char* const out)
+{
+ const char c = peek(input);
+
+ switch (c) {
+ case '\0':
+ return REREX_UNEXPECTED_END;
+ case ']':
+ return REREX_UNEXPECTED_SPECIAL;
+ case '\\':
+ eat(input);
+ if (peek(input) != ']') {
+ return REREX_EXPECTED_RBRACKET;
+ }
+
+ *out = eat(input);
+ return REREX_SUCCESS;
+ default:
+ break;
+ }
+
+ if (c >= cmin && c <= cmax) {
+ *out = eat(input);
+ return REREX_SUCCESS;
+ }
+
+ return REREX_EXPECTED_ELEMENT;
+}
+
+// Range ::= ELEMENT | ELEMENT '-' ELEMENT
+static RerexStatus
+read_range(Input* const input,
+ StateArray* const states,
+ const bool negated,
+ Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ char min = 0;
+ char max = 0;
+
+ // Read the first (or only) character
+ if ((st = read_element(input, &min))) {
+ return st;
+ }
+
+ // Handle '-' which is a bit hairy because it may or may not be special
+ if (peek(input) == '-') {
+ switch (peekahead(input)) {
+ case ']':
+ // Weird case like [a-] where '-' is a character
+ max = min;
+ break;
+ default:
+ // Normal range like [a-z] (note that '[' isn't special here)
+ eat(input);
+ if ((st = read_element(input, &max))) {
+ return st;
+ }
+ break;
+ }
+ } else {
+ // Single character element
+ max = min;
+ }
+
+ if (max < min) {
+ return REREX_UNORDERED_RANGE;
+ }
+
+ const StateIndex end = add_state(states, match_state());
+ if (negated) {
+ const char emin = (char)(min - 1);
+ const char emax = (char)(max + 1);
+ const StateIndex low = add_state(states, range_state(cmin, emin, end));
+ const StateIndex high = add_state(states, range_state(emax, cmax, end));
+ const StateIndex fork = add_state(states, split_state(low, high));
+
+ *out = make_automata(fork, end);
+ } else {
+ const StateIndex start = add_state(states, range_state(min, max, end));
+
+ *out = make_automata(start, end);
+ }
+
+ return st;
+}
+
+// Set ::= Range | Range Set
+static RerexStatus
+read_set(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ bool negated = false;
+
+ if (peek(input) == '^') {
+ eat(input);
+ negated = true;
+ }
+
+ Automata nfa = {NO_STATE, NO_STATE};
+ if ((st = read_range(input, states, negated, &nfa))) {
+ return st;
+ }
+
+ while (peek(input) != ']') {
+ Automata range_nfa = {NO_STATE, NO_STATE};
+ if ((st = read_range(input, states, negated, &range_nfa))) {
+ return st;
+ }
+
+ nfa = alternate(states, nfa, range_nfa);
+ }
+
+ *out = nfa;
+ return st;
+}
+
+// Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']'
+static RerexStatus
+read_atom(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+
+ switch (peek(input)) {
+ case '(':
+ eat(input);
+ if ((st = read_expr(input, states, out))) {
+ return st;
+ }
+
+ if (peek(input) != ')') {
+ return REREX_EXPECTED_RPAREN;
+ }
+
+ eat(input);
+ return st;
+
+ case '.':
+ return read_dot(input, states, out);
+
+ case '[':
+ eat(input);
+ if ((st = read_set(input, states, out))) {
+ return st;
+ }
+
+ eat(input);
+ return st;
+
+ default:
+ break;
+ }
+
+ char c = 0;
+ if ((st = read_char(input, &c))) {
+ return st;
+ }
+
+ const StateIndex end = add_state(states, match_state());
+ const StateIndex start = add_state(states, range_state(c, c, end));
+
+ *out = make_automata(start, end);
+
+ return st;
+}
+
+// OPERATOR ::= '*' | '+' | '?'
+// Factor ::= Atom | Atom OPERATOR
+static RerexStatus
+read_factor(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ Automata atom_nfa = {NO_STATE, NO_STATE};
+
+ if (!(st = read_atom(input, states, &atom_nfa))) {
+ const char c = peek(input);
+ switch (c) {
+ case '*':
+ eat(input);
+ *out = star(states, atom_nfa);
+ break;
+ case '+':
+ eat(input);
+ *out = plus(states, atom_nfa);
+ break;
+ case '?':
+ eat(input);
+ *out = question(states, atom_nfa);
+ break;
+ default:
+ *out = atom_nfa;
+ break;
+ }
+ }
+
+ return st;
+}
+
+// Term ::= Factor | Factor Term
+static RerexStatus
+read_term(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ Automata factor_nfa = {NO_STATE, NO_STATE};
+ Automata term_nfa = {NO_STATE, NO_STATE};
+
+ if (!(st = read_factor(input, states, &factor_nfa))) {
+ switch (peek(input)) {
+ case '\0':
+ case ')':
+ case '|':
+ *out = factor_nfa;
+ break;
+ default:
+ if (!(st = read_term(input, states, &term_nfa))) {
+ *out = concatenate(states, factor_nfa, term_nfa);
+ }
+ break;
+ }
+ }
+
+ return st;
+}
+
+// Expr ::= Term | Term '|' Expr
+static RerexStatus
+read_expr(Input* const input, StateArray* const states, Automata* const out)
+{
+ RerexStatus st = REREX_SUCCESS;
+ Automata term_nfa = {NO_STATE, NO_STATE};
+ Automata expr_nfa = {NO_STATE, NO_STATE};
+
+ if ((st = read_term(input, states, &term_nfa))) {
+ return st;
+ }
+
+ if (peek(input) == '|') {
+ eat(input);
+ if (!(st = read_expr(input, states, &expr_nfa))) {
+ *out = alternate(states, term_nfa, expr_nfa);
+ }
+ } else {
+ *out = term_nfa;
+ }
+
+ return st;
+}
+
+/* Pattern.
+ *
+ * A pattern is simply an array of states and an index to the start state. The
+ * end state(s) are known because they have type REREX_MATCH. A pattern is
+ * immutable after construction, the matcher does not modify it.
+ */
+struct RerexPatternImpl {
+ StateArray states;
+ StateIndex start;
+};
+
+void
+rerex_free_pattern(RerexPattern* const regexp)
+{
+ free(regexp->states.states);
+ free(regexp);
+}
+
+RerexStatus
+rerex_compile(const char* const pattern,
+ size_t* const end,
+ RerexPattern** const out)
+{
+ Input input = {pattern, 0};
+ Automata nfa = {NO_STATE, NO_STATE};
+ StateArray states = {NULL, 0};
+
+ // Add null state so that no actual state has NO_STATE as an ID
+ add_state(&states, split_state(NO_STATE, NO_STATE));
+
+ const RerexStatus st = read_expr(&input, &states, &nfa);
+ if (st) {
+ free(states.states);
+ } else {
+ *out = (RerexPattern*)malloc(sizeof(RerexPattern));
+ (*out)->states = states;
+ (*out)->start = nfa.start;
+ }
+
+ *end = input.offset;
+ return st;
+}
+
+/* Matcher */
+
+typedef struct {
+ StateIndex* indices; // Array of state indices
+ size_t n_indices; // Number of elements in indices
+} IndexList;
+
+/* Matcher.
+ *
+ * The matcher tracks active states by keeping two lists of indices: one for
+ * the current iteration, and one for the next. A separate array, keyed by
+ * state index, stores the number of the last iteration the state was entered
+ * in. This makes it simple and fast to check if a state has already been
+ * entered in the current iteration, avoiding the need to search the active
+ * list for every entered state.
+ */
+struct RerexMatcherImpl {
+ const RerexPattern* regexp; // Pattern to match against
+ IndexList active[2]; // Two lists of active states
+ size_t* last_active; // Last iteration a state was active
+};
+
+RerexMatcher*
+rerex_new_matcher(const RerexPattern* const regexp)
+{
+ const size_t n_states = regexp->states.n_states;
+ RerexMatcher* const m = (RerexMatcher*)calloc(1, sizeof(RerexMatcher));
+
+ m->regexp = regexp;
+ m->active[0].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+ m->active[1].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+ m->last_active = (size_t*)calloc(n_states, sizeof(size_t));
+
+ return m;
+}
+
+void
+rerex_free_matcher(RerexMatcher* const matcher)
+{
+ free(matcher->last_active);
+ free(matcher->active[1].indices);
+ free(matcher->active[0].indices);
+ free(matcher);
+}
+
+// Add `s` and any epsilon successors to the active list
+static void
+enter_state(RerexMatcher* const matcher,
+ const size_t step,
+ IndexList* const list,
+ const StateIndex s)
+{
+ const StateArray* const states = &matcher->regexp->states;
+
+ if (s && matcher->last_active[s] != step) {
+ matcher->last_active[s] = step;
+
+ const State* const state = &states->states[s];
+ if (state->min == REREX_SPLIT) {
+ enter_state(matcher, step, list, state->next1);
+ enter_state(matcher, step, list, state->next2);
+ } else {
+ list->indices[list->n_indices++] = s;
+ }
+ }
+}
+
+// Run `matcher` and return true if `string` matches the expression
+bool
+rerex_match(RerexMatcher* const matcher, const char* const string)
+{
+ const StateArray* const states = &matcher->regexp->states;
+
+ // Reset matcher to a consistent initial state
+ matcher->active[0].n_indices = 0;
+ matcher->active[1].n_indices = 0;
+ for (size_t i = 0; i < states->n_states; ++i) {
+ matcher->last_active[i] = SIZE_MAX;
+ }
+
+ // Enter start state
+ enter_state(matcher, 0, &matcher->active[0], matcher->regexp->start);
+
+ // Tick the matcher for every input character
+ bool phase = 0;
+ for (size_t i = 0; string[i]; ++i) {
+ const char c = string[i];
+ IndexList* const list = &matcher->active[phase];
+ IndexList* const next_list = &matcher->active[!phase];
+
+ // Add successor states to the next iteration's list
+ next_list->n_indices = 0;
+ for (size_t j = 0; j < list->n_indices; ++j) {
+ const State* const state = &states->states[list->indices[j]];
+ if (state->min <= c && c <= state->max) {
+ enter_state(matcher, i + 1, next_list, state->next1);
+ }
+ }
+
+ // Flip phase to swap active lists
+ phase = !phase;
+ }
+
+ // Check if match state is entered in the end
+ IndexList* const active = &matcher->active[phase];
+ for (size_t i = 0; i < active->n_indices; ++i) {
+ if (states->states[active->indices[i]].min == REREX_MATCH) {
+ return true;
+ }
+ }
+
+ return false;
+}