/* Copyright 2020 David Robillard Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "rerex/rerex.h" #include #include #include #include #include static const char cmin = 0x20; // Inclusive minimum normal character static const char cmax = 0x7E; // Inclusive maximum normal character const char* rerex_strerror(const RerexStatus status) { switch (status) { case REREX_SUCCESS: return "Success"; case REREX_EXPECTED_CHAR: return "Expected a regular character"; case REREX_EXPECTED_ELEMENT: return "Expected a character in a set"; case REREX_EXPECTED_RBRACKET: return "Expected ']'"; case REREX_EXPECTED_RPAREN: return "Expected ')'"; case REREX_EXPECTED_SPECIAL: return "Expected a special character (one of \"()*+-?[]^|\")"; case REREX_UNEXPECTED_SPECIAL: return "Unexpected special character"; case REREX_UNEXPECTED_END: return "Unexpected end of input"; case REREX_UNORDERED_RANGE: return "Range is out of order"; } return "Unknown error"; } /* State */ #define NO_STATE 0 // The ID for a state, which is an index into the state array typedef size_t StateIndex; // A code point (currently only 8-bit ASCII but we use the space anyway) typedef int Codepoint; // Special type for states with only epsilon out arcs typedef enum { REREX_MATCH = 0xE000, ///< Matching state, no out arcs REREX_SPLIT = 0xE001, ///< Splitting state, one or two out arcs } StateType; /* A state in an NFA. A state in Thompson's NFA can have either a single character-labeled transition to another state, or up to two unlabeled epsilon transitions to other states. There is both a minimum and maximum label for supporting character ranges. So, either `min` and `max` are ASCII characters that are the label of an arc to next1 (and next2 is null), or `min` is a special StateType and next1 and/or next2 may be set to successor states. */ typedef struct { StateIndex next1; ///< Head of first out arc (or NULL) StateIndex next2; ///< Head of second out arc (or NULL) Codepoint min; ///< Special type, or inclusive min label for next1 Codepoint max; ///< Inclusive max label for next2 } State; // Create a match (end) state with no successors static State match_state(void) { const State s = {NO_STATE, NO_STATE, REREX_MATCH, 0}; return s; } // Create a split state with at most two successors static State split_state(const StateIndex next1, const StateIndex next2) { const State s = {next1, next2, REREX_SPLIT, 0}; return s; } // Create a labeled state with one successor reached by a character arc static State range_state(const char min, const char max, const StateIndex next) { const State s = {next, NO_STATE, min, max}; return s; } /* Array of states. States are stored in a flat array to reduce memory fragmentation, and for easy memory management since the automata graph may be cyclic. This simple implementation calls realloc() for every state, which isn't terribly efficient, but works well enough. Note that state addresses therefore change during compilation, so states are generally referred to by their index, and not by pointer. Conveniently, using indices is also useful during matching for storing auxiliary information about states. */ typedef struct { State* states; size_t n_states; } StateArray; // Append a new state to the end of the state array static StateIndex add_state(StateArray* const array, const State state) { const size_t new_n_states = array->n_states + 1; const size_t new_size = new_n_states * sizeof(State); array->states = (State*)realloc(array->states, new_size); array->states[array->n_states] = state; return array->n_states++; } /* Automata. * * This is a lightweiht description of an NFA fragment. The states are stored * elsewhere, this is just used to provide a facade to conceptually work with * automata for high-level operations. */ typedef struct { StateIndex start; StateIndex end; } Automata; // Simple utility function for making an automata in an expression static Automata make_automata(const StateIndex start, const StateIndex end) { Automata result = {start, end}; return result; } // Return whether `nfa` has only two simple states (used for optimizations) static inline bool is_trivial(const StateArray* const states, const Automata nfa) { return (states->states[nfa.start].min < REREX_MATCH && states->states[nfa.start].next1 == nfa.end); } // Kleene's Star of an NFA static Automata star(StateArray* const states, const Automata nfa) { const StateIndex end = add_state(states, match_state()); const StateIndex start = add_state(states, split_state(nfa.start, end)); states->states[nfa.end] = split_state(nfa.start, end); return make_automata(start, end); } // Zero-or-one of an NFA static Automata question(StateArray* const states, const Automata nfa) { const StateIndex start = add_state(states, split_state(nfa.start, nfa.end)); return make_automata(start, nfa.end); } // One-or-more of an NFA static Automata plus(StateArray* const states, const Automata nfa) { const StateIndex end = add_state(states, match_state()); states->states[nfa.end] = split_state(nfa.start, end); return make_automata(nfa.start, end); } // Concatenation of two NFAs static Automata concatenate(StateArray* const states, const Automata a, const Automata b) { if (is_trivial(states, a)) { // Optimization: link a's start directly to b's start (drop a's end) states->states[a.start].next1 = b.start; } else { states->states[a.end] = split_state(b.start, NO_STATE); } return make_automata(a.start, b.end); } // Alternation (OR) of two NFAs static Automata alternate(StateArray* const states, const Automata a, const Automata b) { const StateIndex split = add_state(states, split_state(a.start, b.start)); if (is_trivial(states, a)) { // Optimization: link a's start directly to b's end (drop a's end) states->states[a.start].next1 = b.end; return make_automata(split, b.end); } if (is_trivial(states, b)) { // Optimization: link b's start directly to a's end (drop b's end) states->states[b.start].next1 = a.end; return make_automata(split, a.end); } const StateIndex end = add_state(states, match_state()); states->states[a.end] = split_state(end, NO_STATE); states->states[b.end] = split_state(end, NO_STATE); return make_automata(split, end); } /* Parser input. * * The parser reads from a string one character at a time, though it would be * simple to change this to read from any stream. All reading is done by three * operations: peek, peekahead, and eat. */ typedef struct { const char* const str; size_t offset; } Input; // Return the next character in the input without consuming it static inline char peek(Input* const input) { return input->str[input->offset]; } // Return the next-next character in the input without consuming any static inline char peekahead(Input* const input) { // Unfortunately we need 2-char lookahead for the ambiguity of '-' in sets return input->str[input->offset + 1]; } // Consume and return the next character in the input static inline char eat(Input* const input) { return input->str[input->offset++]; } // Forward declaration for read_expr because it is called recursively static RerexStatus read_expr(Input* input, StateArray* states, Automata* out); // DOT ::= '.' // OPERATOR ::= '*' | '+' | '?' // SPECIAL ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}' static bool is_special(const char c) { switch (c) { case '(': case ')': case '*': case '+': case '.': case '?': case '[': case ']': case '^': case '{': case '|': case '}': return true; default: break; } return false; } // DOT ::= '.' static RerexStatus read_dot(Input* const input, StateArray* const states, Automata* const out) { assert(peek(input) == '.'); eat(input); const StateIndex end = add_state(states, match_state()); const StateIndex start = add_state(states, range_state(cmin, cmax, end)); *out = make_automata(start, end); return REREX_SUCCESS; } // ESCAPE ::= '\' SPECIAL static RerexStatus read_escape(Input* const input, char* const out) { assert(peek(input) == '\\'); eat(input); const char c = peek(input); if (is_special(c) || c == '-') { *out = eat(input); return REREX_SUCCESS; } return REREX_EXPECTED_SPECIAL; } // CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL static RerexStatus read_char(Input* const input, char* const out) { const char c = peek(input); switch (c) { case '\0': return REREX_UNEXPECTED_END; case '\\': return read_escape(input, out); default: break; } if (is_special(c)) { return REREX_UNEXPECTED_SPECIAL; } if (c >= cmin && c <= cmax) { *out = eat(input); return REREX_SUCCESS; } return REREX_EXPECTED_CHAR; } // ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']') static RerexStatus read_element(Input* const input, char* const out) { const char c = peek(input); switch (c) { case '\0': return REREX_UNEXPECTED_END; case ']': return REREX_UNEXPECTED_SPECIAL; case '\\': eat(input); if (peek(input) != ']') { return REREX_EXPECTED_RBRACKET; } *out = eat(input); return REREX_SUCCESS; default: break; } if (c >= cmin && c <= cmax) { *out = eat(input); return REREX_SUCCESS; } return REREX_EXPECTED_ELEMENT; } // Range ::= ELEMENT | ELEMENT '-' ELEMENT static RerexStatus read_range(Input* const input, StateArray* const states, const bool negated, Automata* const out) { RerexStatus st = REREX_SUCCESS; char min = 0; char max = 0; // Read the first (or only) character if ((st = read_element(input, &min))) { return st; } // Handle '-' which is a bit hairy because it may or may not be special if (peek(input) == '-') { switch (peekahead(input)) { case ']': // Weird case like [a-] where '-' is a character max = min; break; default: // Normal range like [a-z] (note that '[' isn't special here) eat(input); if ((st = read_element(input, &max))) { return st; } break; } } else { // Single character element max = min; } if (max < min) { return REREX_UNORDERED_RANGE; } const StateIndex end = add_state(states, match_state()); if (negated) { const char emin = (char)(min - 1); const char emax = (char)(max + 1); const StateIndex low = add_state(states, range_state(cmin, emin, end)); const StateIndex high = add_state(states, range_state(emax, cmax, end)); const StateIndex fork = add_state(states, split_state(low, high)); *out = make_automata(fork, end); } else { const StateIndex start = add_state(states, range_state(min, max, end)); *out = make_automata(start, end); } return st; } // Set ::= Range | Range Set static RerexStatus read_set(Input* const input, StateArray* const states, Automata* const out) { RerexStatus st = REREX_SUCCESS; bool negated = false; if (peek(input) == '^') { eat(input); negated = true; } Automata nfa = {NO_STATE, NO_STATE}; if ((st = read_range(input, states, negated, &nfa))) { return st; } while (peek(input) != ']') { Automata range_nfa = {NO_STATE, NO_STATE}; if ((st = read_range(input, states, negated, &range_nfa))) { return st; } nfa = alternate(states, nfa, range_nfa); } *out = nfa; return st; } // Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']' static RerexStatus read_atom(Input* const input, StateArray* const states, Automata* const out) { RerexStatus st = REREX_SUCCESS; switch (peek(input)) { case '(': eat(input); if ((st = read_expr(input, states, out))) { return st; } if (peek(input) != ')') { return REREX_EXPECTED_RPAREN; } eat(input); return st; case '.': return read_dot(input, states, out); case '[': eat(input); if ((st = read_set(input, states, out))) { return st; } eat(input); return st; default: break; } char c = 0; if ((st = read_char(input, &c))) { return st; } const StateIndex end = add_state(states, match_state()); const StateIndex start = add_state(states, range_state(c, c, end)); *out = make_automata(start, end); return st; } // OPERATOR ::= '*' | '+' | '?' // Factor ::= Atom | Atom OPERATOR static RerexStatus read_factor(Input* const input, StateArray* const states, Automata* const out) { RerexStatus st = REREX_SUCCESS; Automata atom_nfa = {NO_STATE, NO_STATE}; if (!(st = read_atom(input, states, &atom_nfa))) { const char c = peek(input); switch (c) { case '*': eat(input); *out = star(states, atom_nfa); break; case '+': eat(input); *out = plus(states, atom_nfa); break; case '?': eat(input); *out = question(states, atom_nfa); break; default: *out = atom_nfa; break; } } return st; } // Term ::= Factor | Factor Term static RerexStatus read_term(Input* const input, StateArray* const states, Automata* const out) { RerexStatus st = REREX_SUCCESS; Automata factor_nfa = {NO_STATE, NO_STATE}; Automata term_nfa = {NO_STATE, NO_STATE}; if (!(st = read_factor(input, states, &factor_nfa))) { switch (peek(input)) { case '\0': case ')': case '|': *out = factor_nfa; break; default: if (!(st = read_term(input, states, &term_nfa))) { *out = concatenate(states, factor_nfa, term_nfa); } break; } } return st; } // Expr ::= Term | Term '|' Expr static RerexStatus read_expr(Input* const input, StateArray* const states, Automata* const out) { RerexStatus st = REREX_SUCCESS; Automata term_nfa = {NO_STATE, NO_STATE}; Automata expr_nfa = {NO_STATE, NO_STATE}; if ((st = read_term(input, states, &term_nfa))) { return st; } if (peek(input) == '|') { eat(input); if (!(st = read_expr(input, states, &expr_nfa))) { *out = alternate(states, term_nfa, expr_nfa); } } else { *out = term_nfa; } return st; } /* Pattern. * * A pattern is simply an array of states and an index to the start state. The * end state(s) are known because they have type REREX_MATCH. A pattern is * immutable after construction, the matcher does not modify it. */ struct RerexPatternImpl { StateArray states; StateIndex start; }; void rerex_free_pattern(RerexPattern* const regexp) { free(regexp->states.states); free(regexp); } RerexStatus rerex_compile(const char* const pattern, size_t* const end, RerexPattern** const out) { Input input = {pattern, 0}; Automata nfa = {NO_STATE, NO_STATE}; StateArray states = {NULL, 0}; // Add null state so that no actual state has NO_STATE as an ID add_state(&states, split_state(NO_STATE, NO_STATE)); const RerexStatus st = read_expr(&input, &states, &nfa); if (st) { free(states.states); } else { *out = (RerexPattern*)malloc(sizeof(RerexPattern)); (*out)->states = states; (*out)->start = nfa.start; } *end = input.offset; return st; } /* Matcher */ typedef struct { StateIndex* indices; // Array of state indices size_t n_indices; // Number of elements in indices } IndexList; /* Matcher. * * The matcher tracks active states by keeping two lists of indices: one for * the current iteration, and one for the next. A separate array, keyed by * state index, stores the number of the last iteration the state was entered * in. This makes it simple and fast to check if a state has already been * entered in the current iteration, avoiding the need to search the active * list for every entered state. */ struct RerexMatcherImpl { const RerexPattern* regexp; // Pattern to match against IndexList active[2]; // Two lists of active states size_t* last_active; // Last iteration a state was active }; RerexMatcher* rerex_new_matcher(const RerexPattern* const regexp) { const size_t n_states = regexp->states.n_states; RerexMatcher* const m = (RerexMatcher*)calloc(1, sizeof(RerexMatcher)); m->regexp = regexp; m->active[0].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex)); m->active[1].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex)); m->last_active = (size_t*)calloc(n_states, sizeof(size_t)); return m; } void rerex_free_matcher(RerexMatcher* const matcher) { free(matcher->last_active); free(matcher->active[1].indices); free(matcher->active[0].indices); free(matcher); } // Add `s` and any epsilon successors to the active list static void enter_state(RerexMatcher* const matcher, const size_t step, IndexList* const list, const StateIndex s) { const StateArray* const states = &matcher->regexp->states; if (s && matcher->last_active[s] != step) { matcher->last_active[s] = step; const State* const state = &states->states[s]; if (state->min == REREX_SPLIT) { enter_state(matcher, step, list, state->next1); enter_state(matcher, step, list, state->next2); } else { list->indices[list->n_indices++] = s; } } } // Run `matcher` and return true if `string` matches the expression bool rerex_match(RerexMatcher* const matcher, const char* const string) { const StateArray* const states = &matcher->regexp->states; // Reset matcher to a consistent initial state matcher->active[0].n_indices = 0; matcher->active[1].n_indices = 0; for (size_t i = 0; i < states->n_states; ++i) { matcher->last_active[i] = SIZE_MAX; } // Enter start state enter_state(matcher, 0, &matcher->active[0], matcher->regexp->start); // Tick the matcher for every input character bool phase = 0; for (size_t i = 0; string[i]; ++i) { const char c = string[i]; IndexList* const list = &matcher->active[phase]; IndexList* const next_list = &matcher->active[!phase]; // Add successor states to the next iteration's list next_list->n_indices = 0; for (size_t j = 0; j < list->n_indices; ++j) { const State* const state = &states->states[list->indices[j]]; if (state->min <= c && c <= state->max) { enter_state(matcher, i + 1, next_list, state->next1); } } // Flip phase to swap active lists phase = !phase; } // Check if match state is entered in the end IndexList* const active = &matcher->active[phase]; for (size_t i = 0; i < active->n_indices; ++i) { if (states->states[active->indices[i]].min == REREX_MATCH) { return true; } } return false; }