From d6ea7e7c4172902086eab521ac7e74fab2009efd Mon Sep 17 00:00:00 2001 From: TizenOpenSource Date: Mon, 22 Jan 2024 15:17:04 +0900 Subject: Imported Upstream version 3.1 --- lib/lex.h | 9 +- lib/lex.re | 74 ++--- lib/parse.ypp | 89 +++--- lib/regcomp.cc | 147 ++++----- lib/regcomp_dfa_multipass.h | 271 ++++++++++++++++ lib/regcomp_dfa_regless.h | 249 --------------- lib/regex.h | 173 +++++----- lib/regex_impl.h | 380 +++++++++------------- lib/regexec.cc | 71 ++--- lib/regexec_dfa.cc | 109 +++---- lib/regexec_dfa_multipass.cc | 306 ++++++++++++++++++ lib/regexec_dfa_regless.cc | 312 ------------------ lib/regexec_nfa_leftmost.cc | 37 +-- lib/regexec_nfa_leftmost_trie.cc | 44 ++- lib/regexec_nfa_posix.cc | 64 ++-- lib/regexec_nfa_posix_backward.cc | 632 ------------------------------------- lib/regexec_nfa_posix_kuklewicz.cc | 333 ------------------- lib/regexec_nfa_posix_trie.cc | 175 +++++----- lib/regfree.cc | 60 ++-- lib/regoff_trie.h | 40 +-- lib/stubs.cc | 4 +- lib/test.cc | 498 +++++++++++++---------------- lib/test_helper.h | 15 +- lib/test_helper.re | 54 ++-- 24 files changed, 1446 insertions(+), 2700 deletions(-) create mode 100644 lib/regcomp_dfa_multipass.h delete mode 100644 lib/regcomp_dfa_regless.h create mode 100644 lib/regexec_dfa_multipass.cc delete mode 100644 lib/regexec_dfa_regless.cc delete mode 100644 lib/regexec_nfa_posix_backward.cc delete mode 100644 lib/regexec_nfa_posix_kuklewicz.cc (limited to 'lib') diff --git a/lib/lex.h b/lib/lex.h index a9ec531f..ceaef0c7 100644 --- a/lib/lex.h +++ b/lib/lex.h @@ -1,15 +1,16 @@ #ifndef _RE2C_LIB_LEX_ #define _RE2C_LIB_LEX_ +#include "lib/parse.h" #include "src/msg/location.h" -#include "src/regexp/re.h" +#include "src/regexp/regexp.h" #include "src/parse/ast.h" namespace re2c { -int lex(const char *&pattern); -const AST *parse(const char *pattern); -extern const AST *regexp; +int lex(YYSTYPE* yylval, const uint8_t*& pattern, Ast& ast); +const AstNode* parse(const char* pattern, Ast& ast); +extern const AstNode* regexp; } // namespace re2c diff --git a/lib/lex.re b/lib/lex.re index ebd0024f..e03bceb0 100644 --- a/lib/lex.re +++ b/lib/lex.re @@ -1,39 +1,34 @@ +#include #include -#include "src/util/c99_stdint.h" - #include "src/encoding/enc.h" #include "src/msg/msg.h" #include "src/parse/ast.h" -#include "src/parse/unescape.h" #include "src/util/range.h" -#include "src/util/s_to_n32_unsafe.h" +#include "src/util/string_utils.h" #include "parse.h" #include "lib/lex.h" -extern YYSTYPE yylval; namespace re2c { -static int32_t lex_cls_chr(const char *&, uint32_t &); +static int32_t lex_cls_chr(const uint8_t*&, uint32_t&); /*!re2c re2c:flags:tags = 1; re2c:yyfill:enable = 0; re2c:define:YYCURSOR = cur; re2c:define:YYMARKER = mar; - re2c:define:YYCTYPE = char; + re2c:define:YYCTYPE = uint8_t; nil = "\x00"; num = [0-9]+; */ -int lex(const char *&cur) -{ - /*!stags:re2c format = "const char *@@;"; */ - const char *mar, *x, *y; - std::vector cls; +int lex(YYSTYPE* yylval, const uint8_t*& cur, Ast& ast) { + /*!stags:re2c format = "const uint8_t* @@;"; */ + const uint8_t* mar, *x, *y; bool neg = false; uint32_t l, u; @@ -53,33 +48,31 @@ int lex(const char *&cur) "[" { goto cls; } "{" @x num "}" { - if (!s_to_u32_unsafe(x, cur - 1, yylval.bounds.min)) goto err_cnt; - yylval.bounds.max = yylval.bounds.min; + if (!s_to_u32_unsafe(x, cur - 1, yylval->bounds.min)) goto err_cnt; + yylval->bounds.max = yylval->bounds.min; return TOKEN_COUNT; } "{" @x num "," @y num "}" { - if (!s_to_u32_unsafe(x, y - 1, yylval.bounds.min) - || !s_to_u32_unsafe(y, cur - 1, yylval.bounds.max)) goto err_cnt; + if (!s_to_u32_unsafe(x, y - 1, yylval->bounds.min) + || !s_to_u32_unsafe(y, cur - 1, yylval->bounds.max)) goto err_cnt; return TOKEN_COUNT; } "{" @x num ",}" { - if (!s_to_u32_unsafe(x, cur - 2, yylval.bounds.min)) goto err_cnt; - yylval.bounds.max = AST::MANY; + if (!s_to_u32_unsafe(x, cur - 2, yylval->bounds.min)) goto err_cnt; + yylval->bounds.max = Ast::MANY; return TOKEN_COUNT; } "." { - yylval.regexp = ast_dot(NOWHERE); + yylval->regexp = ast.dot(NOWHERE); return TOKEN_REGEXP; } [^] \ nil { - ASTChar c = {static_cast(cur[-1]), NOWHERE}; - std::vector *str = new std::vector; - str->push_back(c); - yylval.regexp = ast_str(NOWHERE, str, false); + ast.temp_chars.push_back({cur[-1], NOWHERE}); + yylval->regexp = ast.str(NOWHERE, false); return TOKEN_REGEXP; } */ @@ -92,13 +85,11 @@ cls: */ add: if (l > u) goto err; - cls.push_back(ASTRange(l, u, NOWHERE)); + ast.temp_ranges.push_back(AstRange(l, u, NOWHERE)); /*!local:re2c "" { goto cls; } "]" { - std::vector *p = new std::vector; - p->swap(cls); - yylval.regexp = ast_cls(NOWHERE, p, neg); + yylval->regexp = ast.cls(NOWHERE, neg); return TOKEN_REGEXP; } */ @@ -112,9 +103,8 @@ err_cnt: return TOKEN_ERROR; } -int32_t lex_cls_chr(const char *&cur, uint32_t &c) -{ - const char *mar, *p = cur; +int32_t lex_cls_chr(const uint8_t*& cur, uint32_t& c) { + const uint8_t* mar, *p = cur; /*!local:re2c * { return 1; } "[." { error("collating characters not supported"); return 1; } @@ -123,18 +113,18 @@ int32_t lex_cls_chr(const char *&cur, uint32_t &c) "\\x"[0-9a-fA-F]{2} { c = unesc_hex(p, cur); return 0; } - "\\" { c = static_cast('\\'); return 0; } - "\\a" { c = static_cast('\a'); return 0; } - "\\b" { c = static_cast('\b'); return 0; } - "\\f" { c = static_cast('\f'); return 0; } - "\\n" { c = static_cast('\n'); return 0; } - "\\r" { c = static_cast('\r'); return 0; } - "\\t" { c = static_cast('\t'); return 0; } - "\\v" { c = static_cast('\v'); return 0; } - "\\\\" { c = static_cast('\\'); return 0; } - "\\]" { c = static_cast(']'); return 0; } - - [^] \ nil { c = static_cast(cur[-1]); return 0; } + "\\" { c = '\\'_u8; return 0; } + "\\a" { c = '\a'_u8; return 0; } + "\\b" { c = '\b'_u8; return 0; } + "\\f" { c = '\f'_u8; return 0; } + "\\n" { c = '\n'_u8; return 0; } + "\\r" { c = '\r'_u8; return 0; } + "\\t" { c = '\t'_u8; return 0; } + "\\v" { c = '\v'_u8; return 0; } + "\\\\" { c = '\\'_u8; return 0; } + "\\]" { c = ']'_u8; return 0; } + + [^] \ nil { c = cur[-1]; return 0; } */ } diff --git a/lib/parse.ypp b/lib/parse.ypp index 37bb54b1..bfdee3e9 100644 --- a/lib/parse.ypp +++ b/lib/parse.ypp @@ -1,3 +1,11 @@ +%code requires { +/* pull in types to populate YYSTYPE: */ +#include "src/parse/ast.h" +namespace re2c { + struct AstNode; +} +} + %{ #include @@ -5,41 +13,36 @@ #include "src/parse/ast.h" #include "src/util/attribute.h" -#include "src/util/c99_stdint.h" +#include #include "lib/lex.h" -// disable certain GCC and/or Clang warnings, as we have no control over -// autogenerated code (Clang also understands '#pragma GCC') #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic ignored "-Wconversion" -#pragma GCC diagnostic ignored "-Wsign-conversion" -#pragma GCC diagnostic ignored "-Wunused-macros" -#pragma GCC diagnostic ignored "-Wmissing-variable-declarations" -#pragma GCC diagnostic ignored "-Wunreachable-code" -#pragma GCC diagnostic ignored "-Wunreachable-code-break" +#include "src/util/nowarn_in_bison.h" using namespace re2c; - -extern "C" { - -int yylex(const char *&pattern); -void yyerror(const char *pattern, const char*) RE2C_ATTR((noreturn)); - -} // extern "C" - %} -%lex-param {const char *&pattern} -%parse-param {const char *&pattern} +%define api.pure full +%lex-param {const uint8_t*& pattern} +%lex-param {re2c::Ast& ast} +%parse-param {const uint8_t*& pattern} +%parse-param {re2c::Ast& ast} %start regexp %union { - const re2c::AST *regexp; - re2c::ASTBounds bounds; + const re2c::AstNode* regexp; + re2c::AstBounds bounds; }; +%{ +extern "C" { + static int yylex(YYSTYPE* yylval, const uint8_t*& pattern, Ast& ast); + static void yyerror(const uint8_t* pattern, Ast&, const char* msg) RE2C_ATTR((noreturn)); +} + +%} + %token TOKEN_COUNT %token TOKEN_ERROR %token TOKEN_REGEXP @@ -53,26 +56,26 @@ regexp: expr { regexp = $$; }; expr : term -| expr '|' term { $$ = ast_alt($1, $3); } +| expr '|' term { $$ = ast.alt($1, $3); } ; term : factor -| factor term { $$ = ast_cat($1, $2); } // in POSIX concatenation is right-associative +| factor term { $$ = ast.cat($1, $2); } // in POSIX concatenation is right-associative ; factor : primary -| primary '*' { $$ = ast_iter($1, 0, AST::MANY); } -| primary '+' { $$ = ast_iter($1, 1, AST::MANY); } -| primary '?' { $$ = ast_iter($1, 0, 1); } -| primary TOKEN_COUNT { $$ = ast_iter($1, $2.min, $2.max); } +| primary '*' { $$ = ast.iter($1, 0, Ast::MANY); } +| primary '+' { $$ = ast.iter($1, 1, Ast::MANY); } +| primary '?' { $$ = ast.iter($1, 0, 1); } +| primary TOKEN_COUNT { $$ = ast.iter($1, $2.min, $2.max); } ; primary : TOKEN_REGEXP -| '(' ')' { $$ = ast_cap(ast_nil(NOWHERE)); } -| '(' expr ')' { $$ = ast_cap($2); } +| '(' ')' { $$ = ast.cap(ast.nil(NOWHERE), CAPTURE); } +| '(' expr ')' { $$ = ast.cap($2, CAPTURE); } ; %% @@ -80,25 +83,21 @@ primary #pragma GCC diagnostic pop extern "C" { - -void yyerror(const char *pattern, const char *msg) -{ - fprintf(stderr, "%s (on RE %s)", msg, pattern); - exit(1); + static void yyerror(const uint8_t* pattern, Ast&, const char* msg) { + fprintf(stderr, "%s (on regexp %s)", msg, pattern); + exit(1); + } + + static int yylex(YYSTYPE* yylval, const uint8_t*& pattern, Ast& ast) { + return lex(yylval, pattern, ast); + } } -int yylex(const char *&pattern) -{ - return lex(pattern); -} - -} // extern "C" - namespace re2c { -const AST *parse(const char *pattern) -{ - yyparse(pattern); +const AstNode* parse(const char* pattern, Ast& ast) { + const uint8_t *p = reinterpret_cast(pattern); + yyparse(p, ast); return regexp; } diff --git a/lib/regcomp.cc b/lib/regcomp.cc index a87b012b..7bb0f073 100644 --- a/lib/regcomp.cc +++ b/lib/regcomp.cc @@ -1,140 +1,124 @@ #include -#include "src/util/c99_stdint.h" +#include #include #include #include #include "lib/lex.h" -#include "lib/regcomp_dfa_regless.h" +#include "lib/regcomp_dfa_multipass.h" #include "lib/regex.h" #include "lib/regex_impl.h" #include "lib/regoff_trie.h" -#include "src/debug/debug.h" #include "src/dfa/dfa.h" #include "src/msg/location.h" #include "src/msg/msg.h" #include "src/nfa/nfa.h" #include "src/options/opt.h" #include "src/parse/ast.h" -#include "src/regexp/re.h" +#include "src/regexp/regexp.h" #include "src/regexp/rule.h" +#include "src/util/check.h" #include "src/util/range.h" - namespace re2c { -int lex(const char *pattern); -const AST *regexp; +int lex(YYSTYPE* yylval, const char* pattern); +const AstNode* regexp; } // namespace re2c using namespace re2c; using namespace re2c::libre2c; -int regcomp(regex_t *preg, const char *pattern, int cflags) -{ +// redefine CHECK_RET, as it needs to return `int` rather than `Ret` +#undef CHECK_RET +#define CHECK_RET(x) do { if (x != Ret::OK) return 1; } while(0) + +int regcomp(regex_t* preg, const char* pattern, int cflags) { conopt_t globopts; - globopts.nested_negative_tags = !(cflags & (REG_NFA | REG_REGLESS)); - globopts.FFlag = true; - globopts.backward = cflags & REG_BACKWARD; - globopts.stadfa = cflags & REG_STADFA; + globopts.nested_negative_tags = !(cflags & (REG_NFA | REG_MULTIPASS)); + globopts.flex_syntax = true; Msg msg; Opt opts(globopts, msg); - opts.set_subhistories((cflags & REG_SUBHIST) != 0); - opts.set_autotags((cflags & REG_AUTOTAGS) != 0); - opts.set_posix_syntax(true); - opts.set_posix_semantics((cflags & REG_LEFTMOST) == 0); - const opt_t *opt = opts.snapshot(); + opts.set_tags_history((cflags & REG_SUBHIST) != 0); + opts.set_tags_automatic((cflags & REG_AUTOTAGS) != 0); + opts.set_tags_posix_syntax(true); + opts.set_tags_posix_semantics((cflags & REG_LEFTMOST) == 0); + const opt_t* opt; + CHECK_RET(opts.snapshot(&opt)); preg->flags = cflags; - const AST *a = parse(pattern); + AstAllocator ast_alc; + OutAllocator out_alc; // unused but required by AST constructor + Ast ast(ast_alc, out_alc); + const AstNode* a = parse(pattern, ast); - preg->rmgr = new RangeMgr; - - ASTRule ar(a, new SemAct(NOWHERE)); - std::vector arv; - arv.push_back(ar); - RESpec re(arv, opt, msg, *preg->rmgr); + std::vector arv{AstRule{a, ast.sem_act(NOWHERE, nullptr, nullptr, false)}}; + RESpec re(opt, msg); + CHECK_RET(re.init(arv)); find_fixed_tags(re); insert_default_tags(re); - size_t nfa_size, nfa_depth; - compute_size_and_depth(re.res, &nfa_size, &nfa_depth); - if (nfa_depth > MAX_NFA_DEPTH || nfa_size > MAX_NFA_STATES) return 1; - - nfa_t *nfa = new nfa_t(re, nfa_size); - - nfa_t *nfa0 = NULL; - if (cflags & REG_BACKWARD) { - conopt_t globopts0; - globopts0.FFlag = true; - Opt opts0(globopts0, msg); - const opt_t *opt0 = opts0.snapshot(); - RESpec re0(arv, opt0, msg, *preg->rmgr); - nfa0 = new nfa_t(re0, nfa_size); - delete opt0; + if ((cflags & REG_NFA) == 0) { + preg->char2class = new size_t[256]; + split_charset(re); + for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) { + for (; j < re.charset[i]; ++j) { + preg->char2class[j] = i - 1; + } + } } - dfa_t *dfa = NULL; - rldfa_t *rldfa = NULL; + Tnfa* nfa = new Tnfa; + CHECK_RET(re_to_nfa(*nfa, std::move(re))); + + DCHECK(nfa->rules.size() == 1); + preg->re_nsub = nfa->rules[0].ncap + 1; + preg->re_ntag = nfa->tags.size(); if (cflags & REG_NFA) { + preg->nfa = nfa; if ((cflags & REG_TRIE) && (cflags & REG_LEFTMOST)) { - preg->simctx = new lzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); + preg->simctx = new lzsimctx_t(*nfa, preg->re_nsub, cflags); } else if (cflags & REG_TRIE) { - preg->simctx = new pzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); + preg->simctx = new pzsimctx_t(*nfa, preg->re_nsub, cflags); } else if (cflags & REG_LEFTMOST) { - preg->simctx = new lsimctx_t(*nfa, nfa0, preg->re_nsub, cflags); - } else if (cflags & REG_KUKLEWICZ) { - preg->simctx = new ksimctx_t(*nfa, nfa0, preg->re_nsub, cflags); + preg->simctx = new lsimctx_t(*nfa, preg->re_nsub, cflags); } else { - preg->simctx = new psimctx_t(*nfa, nfa0, preg->re_nsub, cflags); - } - } else { - preg->char2class = new size_t[256]; - split_charset(re); - for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) { - for (; j < re.charset[i]; ++j) { - preg->char2class[j] = i - 1; - } + preg->simctx = new psimctx_t(*nfa, preg->re_nsub, cflags); } + } else if (cflags & REG_MULTIPASS) { + preg->mptdfa = new MpTdfa(std::move(*nfa), opt, cflags); + delete nfa; + opt = nullptr; // transfer options ownership to RLDFA - dfa = new dfa_t(*nfa, Rule::NONE, Rule::NONE); - if (cflags & REG_REGLESS) { - DASSERT((cflags & REG_STADFA) == 0); - rldfa = new rldfa_t(*nfa, *dfa, opt, cflags); - opt = NULL; // transfer options ownership to RLDFA - } else { - determinization(*nfa, *dfa, opt, msg, ""); - cutoff_dead_rules(*dfa, opt, "", msg); - insert_fallback_tags(opt, *dfa); - compact_and_optimize_tags(opt, *dfa); + if (cflags & REG_SUBHIST) { + preg->regtrie = new regoff_trie_t(preg->mptdfa->tags.size()); } + } else { + Tdfa* dfa = new Tdfa(*new DfaAllocator(), nfa->charset.size(), Rule::NONE, Rule::NONE); + CHECK_RET(determinization(std::move(*nfa), *dfa, opt, msg, "")); + preg->dfa = dfa; + delete nfa; + + cutoff_dead_rules(*dfa, opt, "", msg); + insert_fallback_tags(*dfa); + compact_and_optimize_tags(opt, *dfa); if (cflags & REG_TSTRING) { // T-string does not need intermediate storage for tag values. } else if (cflags & REG_SUBHIST) { - const size_t nlists = (cflags & REG_REGLESS) - ? dfa->tags.size() : static_cast(dfa->maxtagver + 1); - preg->regtrie = new regoff_trie_t(nlists); + preg->regtrie = new regoff_trie_t(static_cast(dfa->maxtagver + 1)); } else { preg->regs = new regoff_t[dfa->maxtagver + 1]; } } - preg->nfa = nfa; - preg->dfa = dfa; - preg->rldfa = rldfa; - - DASSERT(nfa->rules.size() == 1); - preg->re_nsub = nfa->rules[0].ncap + 1; - preg->re_ntag = nfa->tags.size(); - if (cflags & REG_TSTRING) { - // T-string is stored in RE (reallocated on each regtstring() call if - // needed), and the user gets an immutable view of it (const ref). - tstring_t &ts = preg->tstring; + // T-string is stored in the regexp (reallocated on each regtstring() call if needed), and + // the user gets an immutable view of it (const ref). + tstring_t& ts = preg->tstring; ts.capacity = 256; ts.string = new tchar_t[ts.capacity]; ts.length = 0; @@ -147,3 +131,6 @@ int regcomp(regex_t *preg, const char *pattern, int cflags) delete opt; return 0; } + +#undef CHECK_RET + diff --git a/lib/regcomp_dfa_multipass.h b/lib/regcomp_dfa_multipass.h new file mode 100644 index 00000000..af384edc --- /dev/null +++ b/lib/regcomp_dfa_multipass.h @@ -0,0 +1,271 @@ +#ifndef _RE2C_LIB_REGCOMP_DFA_MULTIPASS_ +#define _RE2C_LIB_REGCOMP_DFA_MULTIPASS_ + +#include + +#include "regex.h" +#include "src/dfa/dfa.h" +#include "src/dfa/determinization.h" +#include "src/nfa/nfa.h" +#include "src/options/opt.h" +#include "src/util/check.h" + +namespace re2c { +namespace libre2c { + +static constexpr regoff_t NORESULT = std::numeric_limits::max(); +static constexpr uint32_t NOCONF = std::numeric_limits::max(); + +// A backlink connects target and origin states of a TDFA transition at the level of TNFA +// configurations. This allows one to follow TNFA path backwards from the final state to the initial +// one. +struct MpTdfaBacklink { + // Index of configuration in the origin TDFA state. + uint32_t conf; + // T-string fragment corresponding to tagged path from the origin configuration. + tchar_t* tfrag; + // Length of t-string fragment. + size_t tfrag_size; +}; + +// Multi-pass TDFA arc (transition). There are no registers and register operations, tag actions are +// hidden in tag histories (stored in backlinks). +struct MpTdfaArc { + // Target TDFA state where this arc goes to. + size_t state; + // Array of backlinks (one per TNFA path that reaches the target state). + MpTdfaBacklink* backlinks; +}; + +// Multi-pass TDFA state. +struct MpTdfaState { + // Outgoing arcs from this state. Each arc has target state and backlinks. + MpTdfaArc* arcs; + // Final link stores final configuration index (if any) and its history. + MpTdfaBacklink finlink; + + MpTdfaState(size_t nchars, const MpTdfaBacklink& link); + ~MpTdfaState(); + FORBID_COPY(MpTdfaState); +}; + +// Multi-pass TDFA. +struct MpTdfa { + const opt_t* opts; + const int flags; + std::vector tags; + + DfaAllocator alc; + + // RLDFA own states with backlinks in them. + std::vector states; + + // Array of submatch values (used during matching). + mutable regoff_t* result; + // Log stores a sequence of backlink arrays on the matching TDFA path. This allows one to unwind + // tag history back and get submatch values in the absence of registers. Backlink arrays (rather + // than single backlinks) are needed because there is a set of active TNFA paths on the way + // forward, and it is unknown until the final state which of them will match. + mutable std::vector log; + + MpTdfa(Tnfa&& nfa, const opt_t* opts, int flags); + ~MpTdfa(); + FORBID_COPY(MpTdfa); +}; + +static inline tchar_t encode_tag(size_t tag) { + // Tags in the t-string are indexed from 1 rather than 0 (so that negative tags can be + // represented by negating tag index). + tag += 1; + // Two extra tags for the outermost capture that wraps the whole regexp. + tag += 2; + // T-string characters store either symbols or tags. Symbols use the lower half of the range (so + // they don't need to be translated), and tags use the upper half. + tag += TAG_BASE; + + return static_cast(tag); +} + +template +static inline void get_tstring_fragment(history_t& history, + DfaAllocator& alc, + hidx_t hidx, + std::vector& tfrag, + MpTdfaBacklink& link, + bool tstring) { + tfrag.clear(); + for (int32_t i = hidx; i != HROOT; ) { + const typename history_t::node_t& n = history.node(i); + const size_t t = n.info.idx; + const bool negative = n.info.neg; + i = n.pred; + + if (tstring) { + // For t-string construction add only positive tags. The idea is to get the cheapest + // possible representation of parse results, and adding negative tags slows it down + // quite a bit. + if (!negative) tfrag.push_back(encode_tag(t)); + } else { + // For offset construction add both positive and negative tags. Shift negative tags by + // TAG_BASE to make them distinguishable from positive tags. Do not expand nested + // negative tags here: it makes regexec() slower, because it cannot skip all nested tags + // with one check when a tag has already been set. + tfrag.push_back(static_cast(t + (negative ? TAG_BASE : 0))); + } + } + + std::reverse(tfrag.begin(), tfrag.end()); + + link.tfrag_size = tfrag.size(); + link.tfrag = alc.template alloct(tfrag.size()); + memcpy(link.tfrag, tfrag.data(), tfrag.size() * sizeof(tchar_t)); +} + +template +static MpTdfaBacklink* construct_backlinks(const ctx_t& ctx, + MpTdfa& mptdfa, + std::vector& tfrag, + const std::vector>& uniq_orig) { + if (ctx.target == Tdfa::NIL) return nullptr; + + const bool tstring = mptdfa.flags & REG_TSTRING; + const std::vector& uo = uniq_orig[ctx.target]; + uint32_t nbacklinks = *std::max_element(uo.begin(), uo.end()) + 1; + MpTdfaBacklink* links = mptdfa.alc.alloct(nbacklinks); + + for (size_t j = 0, k; j < nbacklinks; ++j) { + for (k = 0; k < ctx.state.size() && uo[k] != j; ++k); + const typename ctx_t::conf_t& x = ctx.state[k]; + MpTdfaBacklink& l = links[j]; + l.conf = uniq_orig[ctx.origin][x.origin]; + get_tstring_fragment(ctx.history, mptdfa.alc, x.ttran, tfrag, l, tstring); + } + + return links; +} + +template +static void determinization_multipass(Tnfa&& nfa, MpTdfa& mptdfa) { + Msg msg; + // Determinization context lifetime must cover regexec, as some of the data stored in the + // context is used during matching. + Tdfa dfa(mptdfa.alc, nfa.charset.size(), Rule::NONE, Rule::NONE); + ctx_t ctx(std::move(nfa), dfa, mptdfa.opts, msg, ""); + + std::vector tfrag; + // Per-state array of mappings from configuration index to a unique origin index. This is needed + // to compress identical backlinks into one. Note that configurations with identical origin also + // have identical transition tags (because those are the lookahead tags in the origin + // configuration). + std::vector > uniq_orig; + + // Construct initial TDFA state. + const uint32_t INITIAL_TAGS = init_tag_versions(ctx); + const clos_t c0(ctx.nfa_root, 0, INITIAL_TAGS, HROOT, HROOT); + ctx.reach.push_back(c0); + closure(ctx); + find_state_multipass(ctx, mptdfa, tfrag, uniq_orig); + + // Iterate while new states are added: for each alphabet symbol build tagged epsilon-closure of + // all reachable NFA states, then find identical or mappable TDFA state, or add a new one. + for (uint32_t i = 0; i < ctx.kernels.size(); ++i) { + ctx.origin = i; + + for (uint32_t c = 0; c < dfa.nchars; ++c) { + reach_on_symbol(ctx, c); + closure(ctx); + find_state_multipass(ctx, mptdfa, tfrag, uniq_orig); + + // Multi-pass TDFA stores backlinks instead of tag actions. + mptdfa.states[ctx.origin]->arcs[c].backlinks = + construct_backlinks(ctx, mptdfa, tfrag, uniq_orig); + } + } + + mptdfa.tags = std::move(ctx.tags); +} + +template +static void find_state_multipass(ctx_t& ctx, + MpTdfa& mptdfa, + std::vector& tfrag, + std::vector>& uniq_orig) { + const bool tstring = mptdfa.flags & REG_TSTRING; + + // Find or add the new state in the existing set of states. + const bool is_new = do_find_state(ctx); + + if (is_new) { + const typename ctx_t::confset_t& state = ctx.state; + + // Map configuration index to a unique origin index. + uniq_orig.push_back(std::vector(state.size(), UINT32_MAX)); + std::vector& uo = uniq_orig.back(); + for (uint32_t i = 0, j = 0; j < state.size(); ++j) { + if (uo[j] != UINT32_MAX) continue; + for (size_t k = j + 1; k < state.size(); ++k) { + if (state[j].origin == state[k].origin) { + DCHECK(state[j].ttran == state[k].ttran); // TDFA(1) property + uo[k] = i; + } + } + uo[j] = i++; + } + + // Check if the new TDFA state is final. See note [at most one final item per closure]. + MpTdfaBacklink finlink = {NOCONF, nullptr, 0}; + for (uint32_t i = 0; i < state.size(); ++i) { + if (state[i].state->kind == TnfaState::Kind::FIN) { + finlink.conf = uo[i]; + get_tstring_fragment( + ctx.history, mptdfa.alc, state[i].thist, tfrag, finlink, tstring); + break; + } + } + + // Create a new multipass-DFA state. + MpTdfaState* s = new MpTdfaState(ctx.dfa.nchars, finlink); + mptdfa.states.push_back(s); + } + + if (ctx.origin != Tdfa::NIL) { + mptdfa.states[ctx.origin]->arcs[ctx.symbol].state = ctx.target; + } +} + +inline MpTdfaState::MpTdfaState(size_t nchars, const MpTdfaBacklink& link) + : arcs(new MpTdfaArc[nchars]), finlink() { + finlink.conf = link.conf; + finlink.tfrag = link.tfrag; + finlink.tfrag_size = link.tfrag_size; +} + +inline MpTdfaState::~MpTdfaState() { + delete[] arcs; +} + +inline MpTdfa::MpTdfa(Tnfa&& nfa, const opt_t* opts, int flags) + : opts(opts), + flags(flags), + tags(), + alc(), + states(), + result(new regoff_t[nfa.tags.size()]), + log() { + if (opts->tags_posix_semantics) { + determinization_multipass(std::move(nfa), *this); + } else { + determinization_multipass(std::move(nfa), *this); + } +} + +inline MpTdfa::~MpTdfa() { + for (size_t i = 0; i < states.size(); ++i) { + delete states[i]; + } +} + +} // namespace libre2c +} // namespace re2c + +#endif // _RE2C_LIB_REGCOMP_DFA_MULTIPASS_ diff --git a/lib/regcomp_dfa_regless.h b/lib/regcomp_dfa_regless.h deleted file mode 100644 index 916aa8f6..00000000 --- a/lib/regcomp_dfa_regless.h +++ /dev/null @@ -1,249 +0,0 @@ -#ifndef _RE2C_LIB_REGCOMP_DFA_REGLESS_ -#define _RE2C_LIB_REGCOMP_DFA_REGLESS_ - -#include - -#include "regex.h" -#include "src/dfa/dfa.h" -#include "src/dfa/determinization.h" -#include "src/nfa/nfa.h" -#include "src/options/opt.h" - - -namespace re2c { -namespace libre2c { - -static const regoff_t NORESULT = std::numeric_limits::max(); -static const uint32_t NOCONF = std::numeric_limits::max(); - -// A backlink connects target and origin states of a TDFA transition at the -// level of TNFA configurations. This allows to follow TNFA path backwards from -// the final state to the initial one. -struct rldfa_backlink_t { - // Index of configuration in the origin TDFA state. - uint32_t conf; - // Index of tag history corresponding to the origin configuration. - hidx_t hidx; - // T-string fragment corresponding to tagged path from the origin configuration. - tchar_t *tfrag; - // Length of t-string fragment. - size_t tfrag_size; -}; - -// Registerless TDFA arc (transition). There are no registers and register -// operations, tag actions are hidden in tag histories (stored in backlinks). -struct rldfa_arc_t { - // Target TDFA state where this arc goes to. - size_t state; - // Array of backlinks (one per TNFA path that reaches the target state). - rldfa_backlink_t *backlinks; -}; - -// Registerless TDFA state. -struct rldfa_state_t { - // Outgoing arcs from this state. Each arc has target state and backlinks. - rldfa_arc_t *arcs; - // Final link stores final configuration index (if any) and its history. - rldfa_backlink_t finlink; - - rldfa_state_t(size_t nchars, const rldfa_backlink_t &link); - ~rldfa_state_t(); - FORBID_COPY(rldfa_state_t); -}; - -// Registerless TDFA. -struct rldfa_t { - const opt_t *opts; - const int flags; - - // Determinization context, specialized either for leftmost greedy policy or - // for POSIX policy. Context lifetime should cover regexec, as it needs some - // of the data stored in it (TDFA, tag history, etc.). - void *ctx; - - // RLDFA own states with backlinks in them. - std::vector states; - - // Array of submatch values (used during matching). - mutable regoff_t *result; - // Log stores a sequence of backlink arrays on the matching TDFA path. This - // allows to unwind tag history back and get submatch values in the absence - // of registers. Backlink arrays (rather than single backlinks) are needed - // because there is a set of active TNFA paths on the way forward, and it is - // unknown until the final state which of them will match. - mutable std::vector log; - - rldfa_t(const nfa_t &nfa, dfa_t &dfa, const opt_t *opts, int flags); - ~rldfa_t(); - FORBID_COPY(rldfa_t); -}; - -static inline tchar_t encode_tag(size_t tag) -{ - // Tags in the t-string are indexed from 1 rather than 0 (so that - // negative tags can be represented by negating tag index). - tag += 1; - // Two extra tags for the outermost capture that wraps the whole RE. - tag += 2; - // T-string characters store either symbols or tags. Symbols use the - // lower half of the range (so they don't need to be translated), - // and tags use the upper half. - tag += TAG_BASE; - - return static_cast(tag); -} - -template -static inline void get_tstring_fragment(determ_context_t &ctx, - hidx_t hidx, std::vector &tfrag, rldfa_backlink_t &link, bool tstring) -{ - tfrag.clear(); - for (int32_t i = hidx; i != HROOT; ) { - const typename history_t::node_t &n = ctx.history.node(i); - const size_t t = n.info.idx; - const bool negative = n.info.neg; - i = n.pred; - - if (tstring) { - // For t-string construction add only positive tags. The idea is to - // get the cheapest possible representation of parse results, and - // adding negative tags slows it down quite a bit. - if (!negative) tfrag.push_back(encode_tag(t)); - } else { - // For offset construction add both positive and negative tags. - // Shift negative tags by TAG_BASE to make them distinguishable from - // positive tags. Do not expand nested negative tags here: it makes - // regexec() slower, because it cannot skip all nested tags with one - // check when a tag has already been set. - tfrag.push_back(static_cast(t + (negative ? TAG_BASE : 0))); - } - } - - std::reverse(tfrag.begin(), tfrag.end()); - - link.tfrag_size = tfrag.size(); - link.tfrag = ctx.dc_allocator.template alloct(tfrag.size()); - memcpy(link.tfrag, tfrag.data(), tfrag.size() * sizeof(tchar_t)); -} - -template -static void determinization_regless(const nfa_t &nfa, dfa_t &dfa, rldfa_t &rldfa) -{ - Msg msg; - // Determinization context lifetime must cover regexec, as some of the data - // stored in the context is used during matching. - ctx_t &ctx = *new ctx_t(rldfa.opts, msg, "", nfa, dfa); - rldfa.ctx = &ctx; - allocator_t &alc = ctx.dc_allocator; - const bool tstring = rldfa.flags & REG_TSTRING; - std::vector tfrag; - - // Construct initial TDFA state. - const uint32_t INITIAL_TAGS = init_tag_versions(ctx); - const clos_t c0(ctx.nfa.root, 0, INITIAL_TAGS, HROOT, HROOT); - ctx.reach.push_back(c0); - closure(ctx); - find_state_regless(ctx, rldfa, tfrag); - - // Iterate while new states are added: for each alphabet symbol build tagged - // epsilon-closure of all reachable NFA states, then find identical or - // mappable TDFA state, or add a new one. - for (uint32_t i = 0; i < ctx.dc_kernels.size(); ++i) { - ctx.dc_origin = i; - - for (uint32_t c = 0; c < ctx.dfa.nchars; ++c) { - reach_on_symbol(ctx, c); - closure(ctx); - find_state_regless(ctx, rldfa, tfrag); - - // Unlike TDFA, RLDFA stores backlinks instead of tag actions. - rldfa_backlink_t *links = alc.alloct(ctx.state.size()); - for (size_t j = 0; j < ctx.state.size(); ++j) { - const typename ctx_t::conf_t &x = ctx.state[j]; - rldfa_backlink_t &l = links[j]; - l.conf = x.origin; - l.hidx = x.ttran; - get_tstring_fragment(ctx, x.ttran, tfrag, l, tstring); - } - rldfa.states[ctx.dc_origin]->arcs[c].backlinks = links; - } - } -} - -template -static void find_state_regless(ctx_t &ctx, rldfa_t &rldfa, std::vector &tfrag) -{ - const bool tstring = rldfa.flags & REG_TSTRING; - dfa_t &dfa = ctx.dfa; - - // Find or add the new state in the existing set of states. - const bool is_new = do_find_state(ctx); - - if (is_new) { - // Check if the new TDFA state is final. - // See note [at most one final item per closure]. - rldfa_backlink_t finlink = {NOCONF, HROOT, NULL, 0}; - for (uint32_t i = 0; i < ctx.state.size(); ++i) { - const typename ctx_t::conf_t &x = ctx.state[i]; - if (x.state->type == nfa_state_t::FIN) { - finlink.conf = i; - finlink.hidx = x.thist; - get_tstring_fragment(ctx, x.thist, tfrag, finlink, tstring); - break; - } - } - - // Create a new regless-DFA state. - rldfa_state_t *s = new rldfa_state_t(dfa.nchars, finlink); - rldfa.states.push_back(s); - } - - if (ctx.dc_origin != dfa_t::NIL) { - rldfa.states[ctx.dc_origin]->arcs[ctx.dc_symbol].state = ctx.dc_target; - } - - DDUMP_DFA_RAW(ctx, is_new); - DDUMP_DFA_TREE(is_new); -} - -inline rldfa_state_t::rldfa_state_t(size_t nchars, const rldfa_backlink_t &link) - : arcs(new rldfa_arc_t[nchars]) - , finlink() -{ - finlink.conf = link.conf; - finlink.hidx = link.hidx; - finlink.tfrag = link.tfrag; - finlink.tfrag_size = link.tfrag_size; -} - -inline rldfa_state_t::~rldfa_state_t() -{ - delete[] arcs; -} - -inline rldfa_t::rldfa_t(const nfa_t &nfa, dfa_t &dfa, const opt_t *opts, int flags) - : opts(opts) - , flags(flags) - , ctx(NULL) - , states() - , result(new regoff_t[nfa.tags.size()]) - , log() -{ - if (opts->posix_semantics) { - determinization_regless(nfa, dfa, *this); - } else { - determinization_regless(nfa, dfa, *this); - } -} - -inline rldfa_t::~rldfa_t() -{ - for (size_t i = 0; i < states.size(); ++i) { - delete states[i]; - } -} - -} // namespace libre2c -} // namespace re2c - -#endif // _RE2C_LIB_REGCOMP_DFA_REGLESS_ diff --git a/lib/regex.h b/lib/regex.h index 1f33ec04..9d053957 100644 --- a/lib/regex.h +++ b/lib/regex.h @@ -5,151 +5,144 @@ #include #include - namespace re2c { -struct nfa_t; -struct dfa_t; + +struct Tnfa; +struct Tdfa; class RangeMgr; + } // namespace re2c namespace re2c { namespace libre2c { -struct rldfa_t; + +struct MpTdfa; struct regoff_trie_t; + } // namespace libre2c } // namespace re2c -typedef ptrdiff_t regoff_t; +using regoff_t = ptrdiff_t; -// regmatch_t stores an offset pair representing a capturing group. -// No match is represented with (-1,-1). +// regmatch_t stores an offset pair representing a capturing group. No match is represented with +// (-1,-1). struct regmatch_t { regoff_t rm_so; regoff_t rm_eo; }; -// subhistory_t stores subhistory of a capturing group: an array of offset pairs -// corresponding to all positions in the input string where this capturing group -// has matched. The length of the array depends on how many times the group has -// matched, which depends on the RE and the input string. +// subhistory_t stores subhistory of a capturing group: an array of offset pairs corresponding to +// all positions in the input string where this capturing group has matched. The length of the array +// depends on how many times the group has matched, which depends on the regexp and the string. struct subhistory_t { size_t size; - regmatch_t *offs; + regmatch_t* offs; }; -// T-string chars are 16 bits. -// This is aligned with the Java implementation by Angelo Borsotti. -typedef uint16_t tchar_t; +// T-string chars are 16 bits. This is aligned with the Java implementation by Angelo Borsotti. +using tchar_t = uint16_t; -// A t-string is a flattened representation a parse tree where characters of the -// input string are interleaved with tags. Input characters occupy the lower -// byte of the 16-bit value, leaving the higher byte as zero. Tags are stored as -// numeric values shifted to the upper half of the 16-bit range. +// A t-string is a flattened representation a parse tree where characters of the input string are +// interleaved with tags. Input characters occupy the lower byte of the 16-bit value, leaving the +// higher byte as zero. Tags are stored as numeric values shifted to the upper half of the 16-bit +// range. struct tstring_t { size_t capacity; size_t length; - tchar_t *string; + tchar_t* string; }; // Tags use the upper half of the t-string charater value range. -static const tchar_t TAG_BASE = 1 << (8 * sizeof(tchar_t) - 1); +static constexpr tchar_t TAG_BASE = 1 << (8 * sizeof(tchar_t) - 1); // standard flags -static const int REG_EXTENDED = 1u << 0; -static const int REG_ICASE = 1u << 1; -static const int REG_NOSUB = 1u << 2; -static const int REG_NEWLINE = 1u << 3; -static const int REG_NOTBOL = 1u << 4; -static const int REG_NOTEOL = 1u << 5; +static constexpr int REG_EXTENDED = 1u << 0; +static constexpr int REG_ICASE = 1u << 1; +static constexpr int REG_NOSUB = 1u << 2; +static constexpr int REG_NEWLINE = 1u << 3; +static constexpr int REG_NOTBOL = 1u << 4; +static constexpr int REG_NOTEOL = 1u << 5; // extensions -static const int REG_NFA = 1u << 6; -static const int REG_LEFTMOST = 1u << 7; -static const int REG_TRIE = 1u << 8; -static const int REG_GTOP = 1u << 9; -static const int REG_SLOWPREC = 1u << 10; -static const int REG_BACKWARD = 1u << 11; -static const int REG_KUKLEWICZ = 1u << 12; -static const int REG_STADFA = 1u << 13; -static const int REG_REGLESS = 1u << 14; -static const int REG_SUBHIST = 1u << 15; -static const int REG_TSTRING = 1u << 16; -static const int REG_AUTOTAGS = 1u << 17; +static constexpr int REG_NFA = 1u << 6; +static constexpr int REG_LEFTMOST = 1u << 7; +static constexpr int REG_TRIE = 1u << 8; +static constexpr int REG_SLOWPREC = 1u << 9; +static constexpr int REG_MULTIPASS = 1u << 10; +static constexpr int REG_SUBHIST = 1u << 11; +static constexpr int REG_TSTRING = 1u << 12; +static constexpr int REG_AUTOTAGS = 1u << 13; + +// Deprecated, keep for backward compatibility. +static constexpr int REG_REGLESS = REG_MULTIPASS; // old name for REG_MULTIPASS struct regex_t { size_t re_nsub; size_t re_ntag; - re2c::RangeMgr *rmgr; - const re2c::nfa_t *nfa; - const re2c::dfa_t *dfa; - const re2c::libre2c::rldfa_t *rldfa; - void *simctx; - size_t *char2class; + const re2c::Tnfa* nfa; + const re2c::Tdfa* dfa; + const re2c::libre2c::MpTdfa* mptdfa; + void* simctx; + size_t* char2class; int flags; union { - regoff_t *regs; - re2c::libre2c::regoff_trie_t *regtrie; + regoff_t* regs; + re2c::libre2c::regoff_trie_t* regtrie; }; - // Allow storing submatch results in RE (this is needed for Java bindings). + // Allow storing submatch results in the regexp (this is needed for Java bindings). union { - regmatch_t *pmatch; - subhistory_t *psubhist; + regmatch_t* pmatch; + subhistory_t* psubhist; mutable tstring_t tstring; }; }; -static const int REG_NOMATCH = INT_MAX; +static constexpr int REG_NOMATCH = INT_MAX; // standard functions -int regcomp(regex_t *preg, const char *pattern, int cflags); -size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size); -int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], - int eflags); -void regfree(regex_t *preg); - -// The regparse() function returns parse results as an array of nmatch size, -// where each element is an array of offset pairs for the corresponding -// capturing group. If a group has matched repeatedly at different parts of the -// input string, then its array will contain multiple offset pairs; otherwise, a -// single pair. This differs from the standard POSIX API, where only the last -// offset pair for each group is returned. +int regcomp(regex_t* preg, const char* pattern, int cflags); +size_t regerror(int errcode, const regex_t* preg, char* errbuf, size_t errbuf_size); +int regexec( + const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags); +void regfree(regex_t* preg); + +// The regparse() function returns parse results as an array of nmatch size, where each element is +// an array of offset pairs for the corresponding capturing group. If a group has matched repeatedly +// at different parts of the input string, then its array will contain multiple offset pairs; +// otherwise, a single pair. This differs from the standard POSIX API, where only the last offset +// pair for each group is returned. // // regparse() can be used if REG_SUBHIST flag has been passed to regcomp(). // -// The allocation of memory for the parse results is done by the library (the -// user cannot know how much memory will be needed). The ownership of parse -// results is transferred to the user, who is expected to deallocate memory -// allocated by regparse() with regfreesub(). -subhistory_t *regparse(const regex_t *preg, const char *string, size_t nmatch); +// The allocation of memory for the parse results is done by the library (the user cannot know how +// much memory will be needed). The ownership of parse results is transferred to the user, who is +// expected to deallocate memory allocated by regparse() with regfreesub(). +subhistory_t* regparse(const regex_t* preg, const char* string, size_t nmatch); -// regfreesub() deallocates memory allocated with regparse(). It must be called -// every time after regparse() is called. -void regfreesub(subhistory_t *history); +// regfreesub() deallocates memory allocated with regparse(). It must be called every time after +// regparse() is called. +void regfreesub(subhistory_t* history); // The regtstring() function constructs a t-string. // // It can be used if REG_TSTRING flag has been passed to regcomp(). // -// The goal of this function is to construct parse results in the cheapest -// possible way. There is no standard representation of a parse tree, and any -// nontrivial representation incurs significant overhead on construction (the -// algorithm spends more time constructing parse results than doing the actual -// parsing). The t-string representation requires very little effort to -// construct, provided that the t-string fragments corresponding to the tagged -// paths in the epsilon-closure are prepared in advance at regcomp() time. The -// fragments are stored as arrays of tags, so that they can be easily copied to -// the resulting t-string. Importantly, this does not require unwinding of tag -// history stored in a trie form, which is rather slow. +// The goal of this function is to construct parse results in the cheapest possible way. There is no +// standard representation of a parse tree, and any nontrivial representation incurs significant +// overhead on construction (the algorithm spends more time constructing parse results than doing +// the actual parsing). The t-string representation requires very little effort to construct, +// provided that the t-string fragments corresponding to the tagged paths in the epsilon-closure are +// prepared in advance at regcomp() time. The fragments are stored as arrays of tags, so that they +// can be easily copied to the resulting t-string. Importantly, this does not require unwinding of +// tag history stored in a trie form, which is rather slow. // -// T-string allocation is done on the library side, and the t-string is stored -// in the regexp structure. The user gets an immutable view of it and is not -// expected to free memory after calling regtstring(). Moreover, the library -// reuses the same memory on each regtstring() call and reallocates it only if -// the new t-string does not fit into the old memory area. So each call to +// T-string allocation is done on the library side, and the t-string is stored in the regexp +// structure. The user gets an immutable view of it and is not expected to free memory after calling +// regtstring(). Moreover, the library reuses the same memory on each regtstring() call and +// reallocates it only if the new t-string does not fit into the old memory area. So each call to // regtstring() invalidates the previous results returned from it. // -// The regtstring() function is primarily intended for benchmarking the speed -// of a parsing algorithm. -const tstring_t *regtstring(const regex_t *preg, const char *string); - +// The regtstring() function is primarily intended for benchmarking the speed of a parsing +// algorithm. +const tstring_t* regtstring(const regex_t* preg, const char* string); #endif // _RE2C_LIB_REGEX_ diff --git a/lib/regex_impl.h b/lib/regex_impl.h index 7ecfa21b..83ce046d 100644 --- a/lib/regex_impl.h +++ b/lib/regex_impl.h @@ -11,54 +11,52 @@ #include "src/dfa/dfa.h" #include "src/dfa/determinization.h" #include "src/nfa/nfa.h" - +#include "src/util/check.h" namespace re2c { namespace libre2c { -typedef std::vector tag_path_t; +using tag_path_t = std::vector; -struct conf_t -{ - nfa_state_t *state; +struct conf_t { + TnfaState* state; uint32_t origin; int32_t thist; - inline conf_t(): state(NULL), origin(0), thist(HROOT) {} - inline conf_t(nfa_state_t *s, uint32_t o, int32_t h) + inline conf_t(): state(nullptr), origin(0), thist(HROOT) {} + inline conf_t(TnfaState* s, uint32_t o, int32_t h) : state(s), origin(o), thist(h) {} - inline conf_t(const conf_t &c, nfa_state_t *s) + inline conf_t(const conf_t& c, TnfaState* s) : state(s), origin(c.origin), thist(c.thist) {} - inline conf_t(const conf_t &c, nfa_state_t *s, int32_t h) + inline conf_t(const conf_t& c, TnfaState* s, int32_t h) : state(s), origin(c.origin), thist(h) {} }; -struct ran_or_fin_t -{ - inline bool operator()(const conf_t &c); +struct ran_or_fin_t { + inline bool operator()(const conf_t& c); }; -typedef std::vector confset_t; -typedef confset_t::iterator confiter_t; -typedef confset_t::const_iterator cconfiter_t; -typedef confset_t::const_reverse_iterator rcconfiter_t; +using confset_t = std::vector; +using confiter_t = confset_t::iterator; +using cconfiter_t = confset_t::const_iterator; +using rcconfiter_t = confset_t::const_reverse_iterator; template -struct simctx_t -{ - typedef libre2c::conf_t conf_t; - typedef std::vector confset_t; - typedef confset_t::iterator confiter_t; - typedef confset_t::const_iterator cconfiter_t; - typedef confset_t::reverse_iterator rconfiter_t; - typedef confset_t::const_reverse_iterator rcconfiter_t; - typedef history_type_t history_t; - - const nfa_t &nfa; - const nfa_t *nfa0; +struct simctx_t { + using conf_t = libre2c::conf_t; + using confset_t = std::vector; + using confiter_t = confset_t::iterator; + using cconfiter_t = confset_t::const_iterator; + using rconfiter_t = confset_t::reverse_iterator; + using rcconfiter_t = confset_t::const_reverse_iterator; + using history_t = history_type_t; + + const Tnfa& nfa; const size_t nsub; const int flags; + const std::vector& tags; + history_t history; int32_t hidx; @@ -66,18 +64,18 @@ struct simctx_t size_t rule; - const char *cursor; - const char *marker; + const char* cursor; + const char* marker; - regoff_t *offsets1; - regoff_t *offsets2; - regoff_t *offsets3; - bool *done; + regoff_t* offsets1; + regoff_t* offsets2; + regoff_t* offsets3; + bool* done; - int32_t *newprectbl; - int32_t *oldprectbl; + int32_t* newprectbl; + int32_t* oldprectbl; size_t oldprecdim; - histleaf_t *histlevel; + histleaf_t* histlevel; std::vector sortcores; std::vector fincount; std::vector worklist; @@ -85,21 +83,17 @@ struct simctx_t confset_t reach; confset_t state; - std::vector gor1_topsort; - std::vector gor1_linear; - std::vector gtop_heap_storage; - cmp_gtop_t gtop_cmp; - gtop_heap_t gtop_heap; - closure_stats_t dc_clstats; - - simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags); + std::vector gor1_topsort; + std::vector gor1_linear; + closure_stats_t clstats; + + simctx_t(const Tnfa& nfa, size_t re_nsub, int flags); ~simctx_t(); FORBID_COPY(simctx_t); }; // tag history for lazy disambiguation (both POSIX and leftmost greedy) -struct zhistory_t -{ +struct zhistory_t { struct node_t { tag_info_t info; hidx_t pred; @@ -110,119 +104,86 @@ struct zhistory_t : info(info), pred(pred), orig(orig), step(step) {} }; - struct cache_entry_t - { + struct cache_entry_t { int32_t prec1; int32_t prec2; int32_t prec; }; - typedef std::map cache_t; + using cache_t = std::map; std::vector nodes; cache_t cache; inline zhistory_t(): nodes(), cache() { init(); } inline void init(); - inline node_t &node(hidx_t i) { return nodes[static_cast(i)]; } - inline const node_t &node(hidx_t i) const { return nodes[static_cast(i)]; } - template inline hidx_t link(ctx_t &ctx - , const typename ctx_t::conf_t &conf); - template static int32_t precedence(ctx_t &ctx - , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y - , int32_t &prec1, int32_t &prec2); + inline node_t& node(hidx_t i) { return nodes[static_cast(i)]; } + inline const node_t& node(hidx_t i) const { return nodes[static_cast(i)]; } + + template inline hidx_t link(ctx_t& ctx, const typename ctx_t::conf_t& conf); + template static int32_t precedence(ctx_t& ctx, + const typename ctx_t::conf_t& x, + const typename ctx_t::conf_t& y, + int32_t& prec1, + int32_t& prec2); FORBID_COPY(zhistory_t); }; -// tag history for Kuklewicz disambiguation (POSIX semantics) -struct khistory_t -{ - struct node_t { - tag_info_t info; - hidx_t pred; - - inline node_t(tag_info_t info, hidx_t pred) - : info(info), pred(pred) {} - }; - - std::vector nodes; - std::vector path1; - std::vector path2; - - inline khistory_t(): nodes(), path1(), path2() { init(); } - inline void init(); - inline node_t &node(hidx_t i) { return nodes[static_cast(i)]; } - inline const node_t &node(hidx_t i) const { return nodes[static_cast(i)]; } - template inline hidx_t link(ctx_t &ctx - , const typename ctx_t::conf_t &conf); - template static int32_t precedence(ctx_t &ctx - , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y - , int32_t &prec1, int32_t &prec2); - FORBID_COPY(khistory_t); -}; - -typedef simctx_t psimctx_t; -typedef simctx_t lsimctx_t; -typedef simctx_t pzsimctx_t; -typedef simctx_t lzsimctx_t; -typedef simctx_t ksimctx_t; +using psimctx_t = simctx_t; +using lsimctx_t = simctx_t; +using pzsimctx_t = simctx_t; +using lzsimctx_t = simctx_t; // regexec functions -typedef int (regexec_t)(const regex_t*, const char*, size_t, regmatch_t[], int); -template regexec_t regexec_dfa; -template regexec_t regexec_dfa_regless; +using regexec_t = int (const regex_t*, const char*, size_t, regmatch_t[], int); +regexec_t regexec_dfa; +template regexec_t regexec_dfa_multipass; regexec_t regexec_nfa_posix; regexec_t regexec_nfa_posix_trie; -regexec_t regexec_nfa_posix_backward; -regexec_t regexec_nfa_posix_kuklewicz; regexec_t regexec_nfa_leftmost; regexec_t regexec_nfa_leftmost_trie; // regparse functions (non-standard) -typedef subhistory_t*(regparse_t)(const regex_t*, const char*, size_t); -template regparse_t regparse_dfa; -template regparse_t regparse_dfa_regless; +using regparse_t = subhistory_t* (const regex_t*, const char*, size_t); +regparse_t regparse_dfa; +template regparse_t regparse_dfa_multipass; // regtstring function (non-standard) template -const tstring_t *regtstring_dfa_regless(const regex_t*, const char*); +const tstring_t* regtstring_dfa_multipass(const regex_t*, const char*); template -simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags) - : nfa(nfa) - , nfa0(nfa0) - , nsub(2 * (re_nsub - 1)) - , flags(flags) - , history() - , hidx(HROOT) - , step(0) - , rule(Rule::NONE) - , cursor(NULL) - , marker(NULL) - , offsets1(NULL) - , offsets2(NULL) - , offsets3(NULL) - , done(NULL) - , newprectbl(NULL) - , oldprectbl(NULL) - , oldprecdim(0) - , histlevel(NULL) - , sortcores() - , fincount() - , worklist() - , stateiters() - , reach() - , state() - , gor1_topsort() - , gor1_linear() - , gtop_heap_storage() - , gtop_cmp() - , gtop_heap(gtop_cmp, gtop_heap_storage) - , dc_clstats() -{ +simctx_t::simctx_t(const Tnfa& nfa, size_t re_nsub, int flags) + : nfa(nfa), + nsub(2 * (re_nsub - 1)), + flags(flags), + tags(nfa.tags), + history(), + hidx(HROOT), + step(0), + rule(Rule::NONE), + cursor(nullptr), + marker(nullptr), + offsets1(nullptr), + offsets2(nullptr), + offsets3(nullptr), + done(nullptr), + newprectbl(nullptr), + oldprectbl(nullptr), + oldprecdim(0), + histlevel(nullptr), + sortcores(), + fincount(), + worklist(), + stateiters(), + reach(), + state(), + gor1_topsort(), + gor1_linear(), + clstats() { const size_t - ntags = nfa.tags.size(), - nstates = nfa.size, - ncores = nfa.ncores; + ntags = nfa.tags.size(), + nstates = nfa.nstates, + ncores = nfa.ncores; state.reserve(nstates); reach.reserve(nstates); @@ -235,7 +196,7 @@ simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu offsets2 = new regoff_t[ntags * ncores]; } if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) { - const size_t dim = (flags & REG_KUKLEWICZ) ? ntags : ncores; + const size_t dim = ncores; newprectbl = new int32_t[ncores * dim]; oldprectbl = new int32_t[ncores * dim]; histlevel = new histleaf_t[ncores]; @@ -243,22 +204,13 @@ simctx_t::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu fincount.resize(ncores + 1); worklist.reserve(nstates); } - if (flags & REG_KUKLEWICZ) { - stateiters.reserve(ncores); - } - if (flags & REG_GTOP) { - gtop_heap_storage.reserve(nstates); - } - else { - gor1_topsort.reserve(nstates); - gor1_linear.reserve(nstates); - } + gor1_topsort.reserve(nstates); + gor1_linear.reserve(nstates); } template -simctx_t::~simctx_t() -{ +simctx_t::~simctx_t() { delete[] done; delete[] offsets3; if (!(flags & REG_TRIE)) { @@ -270,17 +222,10 @@ simctx_t::~simctx_t() delete[] oldprectbl; delete[] histlevel; } - if (flags & REG_BACKWARD) { - delete &nfa0->charset; - delete &nfa0->rules; - delete &nfa0->tags; - delete nfa0; - } } template -void init(simctx_t &ctx, const char *string) -{ +void init(simctx_t& ctx, const char* string) { ctx.reach.clear(); ctx.state.clear(); ctx.history.init(); @@ -289,33 +234,30 @@ void init(simctx_t &ctx, const char *string) ctx.rule = Rule::NONE; ctx.cursor = ctx.marker = string; ctx.sortcores.clear(); - DASSERT(ctx.worklist.empty()); - DASSERT(ctx.gor1_topsort.empty()); - DASSERT(ctx.gor1_linear.empty()); - DASSERT(ctx.gtop_heap.empty()); + DCHECK(ctx.worklist.empty()); + DCHECK(ctx.gor1_topsort.empty()); + DCHECK(ctx.gor1_linear.empty()); } -static inline regoff_t *offs_addr(regmatch_t pmatch[], size_t t) -{ - regmatch_t *m = &pmatch[t / 2 + 1]; +static inline regoff_t* offs_addr(regmatch_t pmatch[], size_t t) { + regmatch_t* m = &pmatch[t / 2 + 1]; return t % 2 == 0 ? &m->rm_so : &m->rm_eo; } -struct getoff_nfa_t -{ - const regoff_t *offsets; - inline regoff_t operator()(size_t idx) const { return offsets[idx]; } +struct getoff_nfa_t { + const regoff_t* offsets; + inline regoff_t operator()(size_t idx) const { + return offsets[idx]; + } }; -struct getoff_dfa_t -{ - const dfa_t *dfa; - const regoff_t *regs; +struct getoff_dfa_t { + const Tdfa* dfa; + const regoff_t* regs; const regoff_t len; - regoff_t operator()(size_t idx) const - { - const Tag &tag = dfa->tags[idx]; + regoff_t operator()(size_t idx) const { + const Tag& tag = dfa->tags[idx]; regoff_t off; if (!fixed(tag)) { off = regs[dfa->finvers[idx]]; @@ -330,24 +272,26 @@ struct getoff_dfa_t }; template -void tags_to_submatch(const std::vector &tags, size_t nmatch, - regmatch_t pmatch[], regoff_t len, const getoff_t &getoff) -{ +void tags_to_submatch(const std::vector& tags, + size_t nmatch, + regmatch_t pmatch[], + regoff_t len, + const getoff_t& getoff) { const size_t ntags = tags.size(); - regmatch_t *m = pmatch, *e = pmatch + nmatch; + regmatch_t* m = pmatch, *e = pmatch + nmatch; m->rm_so = 0; m->rm_eo = len; ++m; for (size_t t = 0; t < ntags && m < e; t += 2) { - const Tag &tag = tags[t]; + const Tag& tag = tags[t]; if (!fictive(tag)) { const regoff_t so = getoff(t); const regoff_t eo = getoff(t + 1); for (size_t j = tag.lsub; j <= tag.hsub && m < e; j += 2, ++m) { - DASSERT(m - 1 == &pmatch[j / 2]); + DCHECK(m - 1 == &pmatch[j / 2]); m->rm_so = so; m->rm_eo = eo; } @@ -356,36 +300,35 @@ void tags_to_submatch(const std::vector &tags, size_t nmatch, } template -int finalize(const simctx_t &ctx, const char *string, size_t nmatch, - regmatch_t pmatch[]) -{ +int finalize(const simctx_t& ctx, + const char* string, + size_t nmatch, + regmatch_t pmatch[]) { if (ctx.rule == Rule::NONE) { return REG_NOMATCH; } - const std::vector &tags = ctx.nfa.tags; + const std::vector& tags = ctx.nfa.tags; const size_t ntags = tags.size(); - bool *done = ctx.done; + bool* done = ctx.done; memset(done, 0, ntags * sizeof(bool)); for (int32_t i = ctx.hidx; i != HROOT; ) { - const typename history_t::node_t &n = ctx.history.node(i); + const typename history_t::node_t& n = ctx.history.node(i); i = n.pred; const size_t t = n.info.idx; // If already updated, skip. if (done[t]) continue; - // Update positive tag. if (!n.info.neg) { + // Update positive tag. done[t] = true; ctx.offsets3[t] = static_cast(n.step); - } - - // Update negative tag together with its sibling and nested tags (if any). - else { - const Tag &tag = tags[t]; + } else { + // Update negative tag together with its sibling and nested tags (if any). + const Tag& tag = tags[t]; for (size_t l = tag.lnest; l < tag.hnest; ++l) { if (!done[l]) { done[l] = true; @@ -402,20 +345,18 @@ int finalize(const simctx_t &ctx, const char *string, size_t nmatch, } template -void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id) -{ - regoff_t *o; - const std::vector &tags = ctx.nfa.tags; +void update_offsets(simctx_t& ctx, const conf_t& c, uint32_t id) { + regoff_t* o; + const std::vector& tags = ctx.nfa.tags; const size_t ntags = tags.size(); - nfa_state_t *s = c.state; - bool *done = ctx.done; + TnfaState* s = c.state; + bool* done = ctx.done; - if (s->type == nfa_state_t::FIN) { + if (s->kind == TnfaState::Kind::FIN) { ctx.marker = ctx.cursor; ctx.rule = 0; o = ctx.offsets3; - } - else { + } else { o = ctx.offsets1 + id * ntags; } @@ -423,22 +364,20 @@ void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id) memset(done, 0, ntags * sizeof(bool)); for (int32_t i = c.thist; i != HROOT; ) { - const typename history_t::node_t &n = ctx.history.node(i); + const typename history_t::node_t& n = ctx.history.node(i); i = n.pred; const size_t t = n.info.idx; // If already updated, skip. if (done[t]) continue; - // Update positive tag. if (!n.info.neg) { + // Update positive tag. done[t] = true; o[t] = static_cast(ctx.step); - } - - // Update negative tag together with its sibling and nested tags (if any). - else { - const Tag &tag = tags[t]; + } else { + // Update negative tag together with its sibling and nested tags (if any). + const Tag& tag = tags[t]; for (size_t l = tag.lnest; l < tag.hnest; ++l) { if (!done[l]) { done[l] = true; @@ -449,38 +388,21 @@ void update_offsets(simctx_t &ctx, const conf_t &c, uint32_t id) } } -bool ran_or_fin_t::operator()(const conf_t &c) -{ - return c.state->type == nfa_state_t::RAN - || c.state->type == nfa_state_t::FIN; +bool ran_or_fin_t::operator()(const conf_t& c) { + return c.state->kind == TnfaState::Kind::RAN + || c.state->kind == TnfaState::Kind::FIN; } -void zhistory_t::init() -{ +void zhistory_t::init() { nodes.clear(); nodes.push_back(node_t(NOINFO, -1, 0, 0)); cache.clear(); } -void khistory_t::init() -{ - nodes.clear(); - nodes.push_back(node_t(NOINFO, -1)); -} - -template -hidx_t zhistory_t::link(ctx_t &ctx, const typename ctx_t::conf_t &conf) -{ - const int32_t i = static_cast(nodes.size()); - nodes.push_back(node_t(conf.state->tag.info, conf.thist, conf.origin, ctx.step)); - return i; -} - template -hidx_t khistory_t::link(ctx_t &/* ctx */, const typename ctx_t::conf_t &conf) -{ +hidx_t zhistory_t::link(ctx_t& ctx, const typename ctx_t::conf_t& conf) { const int32_t i = static_cast(nodes.size()); - nodes.push_back(node_t(conf.state->tag.info, conf.thist)); + nodes.push_back(node_t(conf.state->tag, conf.thist, conf.origin, ctx.step)); return i; } diff --git a/lib/regexec.cc b/lib/regexec.cc index 662e3ac5..ba62ed64 100644 --- a/lib/regexec.cc +++ b/lib/regexec.cc @@ -1,34 +1,26 @@ -#include #include #include "lib/regex.h" #include "lib/regex_impl.h" #include "src/msg/location.h" - using namespace re2c; using namespace re2c::libre2c; -int regexec(const regex_t *re, const char *string, size_t nmatch, - regmatch_t pmatch[], int eflags) -{ +int regexec(const regex_t* re, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags) { const int cflags = re->flags; if (!(cflags & REG_NFA)) { // DFA-based algorithms - if (cflags & REG_REGLESS) { - // Registerless TDFA. + if (cflags & REG_MULTIPASS) { + // Multi-pass TDFA. if (cflags & REG_LEFTMOST) { - return regexec_dfa_regless(re, string, nmatch, pmatch, eflags); + return regexec_dfa_multipass(re, string, nmatch, pmatch, eflags); } else { - return regexec_dfa_regless(re, string, nmatch, pmatch, eflags); + return regexec_dfa_multipass(re, string, nmatch, pmatch, eflags); } } else { // TDFA with registers and register operations on transitions. - if (cflags & REG_STADFA) { - return regexec_dfa(re, string, nmatch, pmatch, eflags); - } else { - return regexec_dfa(re, string, nmatch, pmatch, eflags); - } + return regexec_dfa(re, string, nmatch, pmatch, eflags); } } else { // NFA-based algorithms @@ -41,11 +33,7 @@ int regexec(const regex_t *re, const char *string, size_t nmatch, } } else { // POSIX disambiguation - if (cflags & REG_BACKWARD) { - return regexec_nfa_posix_backward(re, string, nmatch, pmatch, eflags); - } else if (cflags & REG_KUKLEWICZ) { - return regexec_nfa_posix_kuklewicz(re, string, nmatch, pmatch, eflags); - } else if (cflags & REG_TRIE) { + if (cflags & REG_TRIE) { return regexec_nfa_posix_trie(re, string, nmatch, pmatch, eflags); } else { return regexec_nfa_posix(re, string, nmatch, pmatch, eflags); @@ -54,56 +42,49 @@ int regexec(const regex_t *re, const char *string, size_t nmatch, } } -subhistory_t *regparse(const regex_t *re, const char *string, size_t nmatch) -{ +subhistory_t* regparse(const regex_t* re, const char* string, size_t nmatch) { const int cflags = re->flags; - assert(cflags & REG_SUBHIST); + CHECK(cflags & REG_SUBHIST); if (!(cflags & REG_NFA)) { // DFA-based algorithms - if (cflags & REG_REGLESS) { - // Registerless TDFA. + if (cflags & REG_MULTIPASS) { + // Multi-pass TDFA. if (cflags & REG_LEFTMOST) { - return regparse_dfa_regless(re, string, nmatch); + return regparse_dfa_multipass(re, string, nmatch); } else { - return regparse_dfa_regless(re, string, nmatch); + return regparse_dfa_multipass(re, string, nmatch); } } else { // TDFA with registers and register operations on transitions. - if (cflags & REG_STADFA) { - return regparse_dfa(re, string, nmatch); - } else { - return regparse_dfa(re, string, nmatch); - } + return regparse_dfa(re, string, nmatch); } } else { // NFA-based algorithms (not implemented yet). - assert(false); - return NULL; + UNREACHABLE(); + return nullptr; } } -const tstring_t *regtstring(const regex_t *re, const char *string) -{ +const tstring_t* regtstring(const regex_t* re, const char* string) { const int cflags = re->flags; - assert(cflags & REG_TSTRING); + CHECK(cflags & REG_TSTRING); if (!(cflags & REG_NFA)) { // DFA-based algorithms - if (cflags & REG_REGLESS) { - // Registerless TDFA. + if (cflags & REG_MULTIPASS) { + // Multi-pass TDFA. if (cflags & REG_LEFTMOST) { - return regtstring_dfa_regless(re, string); + return regtstring_dfa_multipass(re, string); } else { - return regtstring_dfa_regless(re, string); + return regtstring_dfa_multipass(re, string); } } else { // TDFA with registers is not suited to tstring construction. - assert(false); - return NULL; + UNREACHABLE(); + return nullptr; } } else { // NFA-based algorithms (not implemented yet). - assert(false); - return NULL; + UNREACHABLE(); + return nullptr; } } - diff --git a/lib/regexec_dfa.cc b/lib/regexec_dfa.cc index 6eb4660b..161571d7 100644 --- a/lib/regexec_dfa.cc +++ b/lib/regexec_dfa.cc @@ -1,4 +1,3 @@ -#include #include #include #include @@ -6,40 +5,33 @@ #include "lib/regex.h" #include "lib/regex_impl.h" #include "lib/regoff_trie.h" -#include "src/debug/debug.h" #include "src/dfa/dfa.h" #include "src/dfa/tcmd.h" #include "src/regexp/rule.h" #include "src/regexp/tag.h" - +#include "src/util/check.h" namespace re2c { namespace libre2c { -static void apply_regops(regoff_t *regs, const tcmd_t *cmd, regoff_t pos) -{ - for (const tcmd_t *p = cmd; p; p = p->next) { +static void apply_regops(regoff_t* regs, const tcmd_t* cmd, regoff_t pos) { + for (const tcmd_t* p = cmd; p; p = p->next) { if (tcmd_t::iscopy(p)) { regs[p->lhs] = regs[p->rhs]; - } - else { - DASSERT (tcmd_t::isset(p)); + } else { + DCHECK(tcmd_t::isset(p)); regs[p->lhs] = *p->history == TAGVER_BOTTOM ? -1 : pos; } } } -template -int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int /* eflags */) -{ - const dfa_t *dfa = preg->dfa; - regoff_t *regs = preg->regs; +int regexec_dfa( + const regex_t* preg, const char* string, size_t nmatch,regmatch_t pmatch[], int /*eflags*/) { + const Tdfa* dfa = preg->dfa; + regoff_t* regs = preg->regs; size_t i = 0; - const char *p = string, *q = p; - const dfa_state_t *s, *x = NULL; - - if (!stadfa) apply_regops(regs, dfa->tcmd0, 0); + const char* p = string, *q = p; + const TdfaState* s, *x = nullptr; for (;;) { s = dfa->states[i]; @@ -52,14 +44,12 @@ int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, x = s; } - if (stadfa) apply_regops(regs, s->stacmd, p - string - 2); - - if (i == dfa_t::NIL || c == 0) break; + if (i == Tdfa::NIL || c == 0) break; - if (!stadfa) apply_regops(regs, s->tcmd[j], p - string - 1); + apply_regops(regs, s->tcmd[j], p - string - 1); } - if (s->rule == Rule::NONE && x != NULL) { + if (s->rule == Rule::NONE && x != nullptr) { s = x; p = q; @@ -78,10 +68,8 @@ int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch, return 0; } -static void apply_regops_with_history(regoff_trie_t *regtrie, const tcmd_t *cmd, - regoff_t pos) -{ - for (const tcmd_t *p = cmd; p; p = p->next) { +static void apply_regops_with_history(regoff_trie_t* regtrie, const tcmd_t* cmd, regoff_t pos) { + for (const tcmd_t* p = cmd; p; p = p->next) { const size_t lhs = static_cast(p->lhs); const size_t rhs = static_cast(p->rhs); @@ -92,7 +80,7 @@ static void apply_regops_with_history(regoff_trie_t *regtrie, const tcmd_t *cmd, regtrie->set(lhs, *p->history == TAGVER_BOTTOM ? -1 : pos); } else { - const tagver_t *h = p->history, *h0; + const tagver_t* h = p->history, *h0; for (h0 = h; *h != TAGVER_ZERO; ++h); if (lhs != rhs) regtrie->copy(lhs, rhs); @@ -103,17 +91,14 @@ static void apply_regops_with_history(regoff_trie_t *regtrie, const tcmd_t *cmd, } } -template -subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatch) -{ - const dfa_t *dfa = preg->dfa; +subhistory_t* regparse_dfa(const regex_t* preg, const char* string, size_t nmatch) { + const Tdfa* dfa = preg->dfa; size_t i = 0; - const char *p = string, *q = p; - const dfa_state_t *s, *x = NULL; - regoff_trie_t *regtrie = preg->regtrie; + const char* p = string, *q = p; + const TdfaState* s, *x = nullptr; + regoff_trie_t* regtrie = preg->regtrie; regtrie->clear(); - if (!stadfa) apply_regops_with_history(regtrie, dfa->tcmd0, 0); for (;;) { s = dfa->states[i]; @@ -126,11 +111,9 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc x = s; } - if (stadfa) apply_regops_with_history(regtrie, s->stacmd, p - string - 2); + if (i == Tdfa::NIL || c == 0) break; - if (i == dfa_t::NIL || c == 0) break; - - if (!stadfa) apply_regops_with_history(regtrie, s->tcmd[j], p - string - 1); + apply_regops_with_history(regtrie, s->tcmd[j], p - string - 1); } regoff_t mlen; @@ -138,31 +121,29 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc // already in final state, apply final tags mlen = p - string - 1; apply_regops_with_history(regtrie, s->tcmd[dfa->nchars], mlen); - - } else if (x != NULL) { + } else if (x != nullptr) { // rollback to a final state, apply fallback tags s = x; p = q; mlen = p - string - 1; apply_regops_with_history(regtrie, s->tcmd[dfa->nchars + 1], mlen); - } else { // no final state on the way => no match - return NULL; + return nullptr; } - const std::vector &tags = dfa->tags; + const std::vector& tags = dfa->tags; const size_t ntags = tags.size(); - const tagver_t *finvers = dfa->finvers; + const tagver_t* finvers = dfa->finvers; - const regoff_trie_t::node_t *storage = regtrie->storage; - const size_t *count = regtrie->count; - const size_t *lists = regtrie->lists; + const regoff_trie_t::node_t* storage = regtrie->storage; + const size_t* count = regtrie->count; + const size_t* lists = regtrie->lists; // Find the total number of slots needed for all submatch elements. size_t rm_count = 1; for (size_t t = 0; t < ntags; t += 2) { - const Tag &tag = tags[t]; + const Tag& tag = tags[t]; if (!fictive(tag)) { rm_count += count[finvers[t]] * (1 + (tag.hsub - tag.lsub) / 2); } @@ -170,8 +151,8 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc // The amount of memory (in bytes) needed to store submatch information. size_t memsize = nmatch * sizeof(subhistory_t) + rm_count * sizeof(regmatch_t); - subhistory_t *h0 = (subhistory_t*) malloc(memsize), *h = h0, *lasth = h + nmatch; - regmatch_t *rm0 = (regmatch_t *)(h0 + nmatch), *rm = rm0; + subhistory_t* h0 = static_cast(malloc(memsize)), *h = h0, *lasth = h + nmatch; + regmatch_t* rm0 = reinterpret_cast(lasth), *rm = rm0; h->size = 1; h->offs = rm; @@ -181,28 +162,27 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc ++h; for (size_t t = 0; t < ntags && h < lasth; t += 2) { - const Tag &tag = tags[t]; + const Tag& tag = tags[t]; if (fictive(tag)) continue; - DASSERT(!fixed(tag)); + DCHECK(!fixed(tag)); const tagver_t f1 = finvers[t], f2 = finvers[t + 1]; const size_t so0 = lists[f1], sz_so = count[f1]; const size_t eo0 = lists[f2]; - assert(sz_so == count[f2]); + CHECK(sz_so == count[f2]); // Submatch groups corresponding to this tag pair: there may be more // than one capturing parenthesis per tag in regexp like (...(e)...). - for (size_t s = tag.lsub; s <= tag.hsub && h < lasth; s += 2, ++h) { - DASSERT(h - 1 == &h0[s / 2]); + for (size_t k = tag.lsub; k <= tag.hsub && h < lasth; k += 2, ++h) { + DCHECK(h - 1 == &h0[k / 2]); h->size = sz_so; h->offs = rm; rm += sz_so; for (size_t j = sz_so, so = so0, eo = eo0; j --> 0; ) { - const regoff_trie_t::node_t &n_so = storage[so], &n_eo = storage[eo]; - const regmatch_t m = {n_so.off, n_eo.off}; - h->offs[j] = m; + const regoff_trie_t::node_t& n_so = storage[so], &n_eo = storage[eo]; + h->offs[j] = {n_so.off, n_eo.off}; so = n_so.pred; eo = n_eo.pred; } @@ -212,12 +192,5 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc return h0; } -// explicit instantiation for automaton type -template int regexec_dfa(const regex_t *, const char *, size_t, regmatch_t[], int); -template int regexec_dfa(const regex_t *, const char *, size_t, regmatch_t[], int); -template subhistory_t *regparse_dfa(const regex_t *, const char *, size_t); -template subhistory_t *regparse_dfa(const regex_t *, const char *, size_t); - } // namespace libre2c } // namespace re2c - diff --git a/lib/regexec_dfa_multipass.cc b/lib/regexec_dfa_multipass.cc new file mode 100644 index 00000000..d1feee90 --- /dev/null +++ b/lib/regexec_dfa_multipass.cc @@ -0,0 +1,306 @@ +#include +#include +#include + +#include "lib/regcomp_dfa_multipass.h" +#include "lib/regex.h" +#include "lib/regex_impl.h" +#include "lib/regoff_trie.h" +#include "src/dfa/dfa.h" +#include "src/dfa/tcmd.h" +#include "src/regexp/rule.h" +#include "src/regexp/tag.h" +#include "src/util/check.h" + +namespace re2c { +namespace libre2c { + +// Match RLDFA on the input string, logging backlink arrays on the way. This is the forward pass +// that is common to all algorithms in this file. +static MpTdfaBacklink forward_pass(const regex_t* preg, const char* string, size_t* matchlen) { + const MpTdfa* mptdfa = preg->mptdfa; + const std::vector& states = mptdfa->states; + std::vector& log = mptdfa->log; + + const char* strptr = string, *finstrptr = strptr; + MpTdfaBacklink finlink = {NOCONF, nullptr, 0}; + + log.clear(); + for (size_t stidx = 0;;) { + const int32_t chr = *strptr++; + const size_t cls = preg->char2class[chr]; + + const MpTdfaState* state = states[stidx]; + const MpTdfaArc& arc = state->arcs[cls]; + stidx = arc.state; + + log.push_back(&arc.backlinks); + + if (state->finlink.conf != NOCONF) { + finlink = state->finlink; + finstrptr = strptr; + } + + if (stidx == Tdfa::NIL || chr == 0) break; + } + + *matchlen = static_cast(finstrptr - string) - 1; + return finlink; +} + +static inline void apply_tfrag(const tchar_t* tfrag, + size_t tfrag_size, + regoff_t* result, + size_t offset, + const std::vector& tags) { + for (size_t i = tfrag_size; i --> 0;) { + size_t t = tfrag[i]; + const bool negative = t >= TAG_BASE; + if (negative) t -= TAG_BASE; + + // If already updated, skip. + if (result[t] != NORESULT) continue; + + // Process fictive tags as well, as they may have nested negative tags. + if (!negative) { + // Update positive tag. + result[t] = static_cast(offset); + } else { + // Update negative tag together with its sibling and nested tags. + const Tag& tag = tags[t]; + for (size_t l = tag.lnest; l < tag.hnest; ++l) { + result[l] = -1; + } + } + } +} + +template +int regexec_dfa_multipass(const regex_t* preg, + const char* string, + size_t nmatch, + regmatch_t pmatch[], + int /* eflags */) { + // Forward pass. + size_t matchlen; + MpTdfaBacklink finlink = forward_pass(preg, string, &matchlen); + + // No final state has been encountered on the way; return no-match. + if (finlink.conf == NOCONF) return REG_NOMATCH; + + const MpTdfa* mptdfa = preg->mptdfa; + const std::vector& tags = mptdfa->tags; + const size_t ntags = tags.size(); + + // It is necessary to initialize tag offsets to prevent overwriting the latest tag value when + // unwinding tag history. + regoff_t* result = mptdfa->result; + std::fill(result, result + ntags, NORESULT); + + // Unwind tag history back from the final RLDFA state to the initial state. + std::vector& log = mptdfa->log; + MpTdfaBacklink link = finlink; + for (size_t offset = matchlen;;) { + apply_tfrag(link.tfrag, link.tfrag_size, result, offset, tags); + + if (offset == 0) break; + --offset; + + link = (*log[offset])[link.conf]; + } + + // Copy tag offsets to submatch results in the pmatch[] array. + regmatch_t* match = pmatch, *lastmatch = pmatch + nmatch; + match->rm_so = 0; + match->rm_eo = static_cast(matchlen); + ++match; + + for (size_t t = 0; t < ntags && match < lastmatch; t += 2) { + const Tag& tag = tags[t]; + if (fictive(tag)) continue; + + const regoff_t so = result[t], eo = result[t + 1]; + + // Submatch groups corresponding to this tag pair: there may be more than one capturing + // parenthesis per tag in regexp like (...(e)...). + for (size_t s = tag.lsub; s <= tag.hsub && match < lastmatch; s += 2, ++match) { + DCHECK(match - 1 == &pmatch[s / 2]); + match->rm_so = so; + match->rm_eo = eo; + } + } + + return 0; +} + +static inline void apply_tfrag_subhist(const tchar_t* tfrag, + size_t tfrag_size, + regoff_trie_t* regtrie, + size_t offset, + const std::vector& tags) { + for (size_t i = tfrag_size; i --> 0;) { + const size_t t = tfrag[i]; + + // Process fictive tags as well, as they may have nested negative tags. + if (t < TAG_BASE) { + // Update positive tag. + regtrie->add(t, static_cast(offset)); + } else { + // Update negative tag together with its sibling and nested tags. + const Tag& tag = tags[t - TAG_BASE]; + for (size_t l = tag.lnest; l < tag.hnest; ++l) { + regtrie->add(l, -1); + } + } + } +} + +template +subhistory_t* regparse_dfa_multipass(const regex_t* preg, const char* string, size_t nmatch) { + // Forward pass. + size_t matchlen; + MpTdfaBacklink finlink = forward_pass(preg, string, &matchlen); + + // No final state has been encountered on the way; return no-match. + if (finlink.conf == NOCONF) return nullptr; + + const MpTdfa* mptdfa = preg->mptdfa; + const std::vector& tags = mptdfa->tags; + const size_t ntags = tags.size(); + regoff_trie_t* regtrie = preg->regtrie; + + // Unwind tag history back from the final RLDFA state to the initial state. + std::vector& log = mptdfa->log; + MpTdfaBacklink link = finlink; + regtrie->clear(); + for (size_t offset = matchlen;;) { + apply_tfrag_subhist(link.tfrag, link.tfrag_size, regtrie, offset, tags); + + if (offset == 0) break; + --offset; + + link = (*log[offset])[link.conf]; + } + + const regoff_trie_t::node_t* storage = regtrie->storage; + const size_t* count = regtrie->count; + const size_t* lists = regtrie->lists; + + // Find the total number of slots needed for all submatch elements. + size_t rm_count = 1; + for (size_t t = 0; t < ntags; t += 2) { + const Tag& tag = tags[t]; + if (!fictive(tag)) { + rm_count += count[t] * (1 + (tag.hsub - tag.lsub) / 2); + } + } + // The amount of memory (in bytes) needed to store submatch information. + size_t memsize = nmatch * sizeof(subhistory_t) + rm_count * sizeof(regmatch_t); + + subhistory_t* h0 = static_cast(malloc(memsize)), *h = h0, *lasth = h + nmatch; + regmatch_t* rm0 = reinterpret_cast(lasth), *rm = rm0; + + h->size = 1; + h->offs = rm; + rm += 1; + h->offs[0].rm_so = 0; + h->offs[0].rm_eo = static_cast(matchlen); + ++h; + + for (size_t t = 0; t < ntags && h < lasth; t += 2) { + const Tag& tag = tags[t]; + if (fictive(tag)) continue; + + const size_t so0 = lists[t], sz_so = count[t]; + const size_t eo0 = lists[t + 1]; + CHECK(sz_so == count[t + 1]); + + // Submatch groups corresponding to this tag pair: there may be more than one capturing + // parenthesis per tag in regexp like (...(e)...). + for (size_t s = tag.lsub; s <= tag.hsub && h < lasth; s += 2, ++h) { + DCHECK(h - 1 == &h0[s / 2]); + + h->size = sz_so; + h->offs = rm; + rm += sz_so; + + for (size_t j = 0, so = so0, eo = eo0; j < sz_so; ++j) { + const regoff_trie_t::node_t& n_so = storage[so], &n_eo = storage[eo]; + h->offs[j] = {n_so.off, n_eo.off}; + so = n_so.pred; + eo = n_eo.pred; + } + } + } + + return h0; +} + +template +const tstring_t* regtstring_dfa_multipass(const regex_t* preg, const char* string) { + // Forward pass. + size_t matchlen; + MpTdfaBacklink finlink = forward_pass(preg, string, &matchlen); + + // No final state has been encountered on the way; return no-match. + if (finlink.conf == NOCONF) return nullptr; + + const MpTdfa* mptdfa = preg->mptdfa; + std::vector& log = mptdfa->log; + MpTdfaBacklink link = finlink; + + // Calculate the length of the resulting t-string. + size_t len = matchlen; + for (size_t offset = matchlen;;) { + len += link.tfrag_size; + + if (offset == 0) break; + --offset; + + link = (*log[offset])[link.conf]; + } + len += 2; // tags for the outermost capture that wraps the whole regexp + len += 1; // terminating NULL + + tstring_t* tstr = &preg->tstring; + if (tstr->capacity <= len) { + tstr->capacity = len * 2; + delete[] tstr->string; + tstr->string = new tchar_t[tstr->capacity]; + } + + tstr->length = len; + tchar_t* s = tstr->string + len; + + *--s = 0; // terminating NULL + *--s = TAG_BASE + 2; // outermost closing parenthesis + + link = finlink; + for (size_t offset = matchlen;;) { + s -= link.tfrag_size; + memcpy(s, link.tfrag, link.tfrag_size * sizeof(tchar_t)); + + if (offset == 0) break; + --offset; + + *--s = static_cast(string[offset]); + link = (*log[offset])[link.conf]; + } + + *--s = TAG_BASE + 1; // outermost opening parenthesis + + return tstr; +} + +// explicit instantiation for disambiguation policy (leftmost greedy / POSIX) +template int regexec_dfa_multipass( + const regex_t*, const char*, size_t, regmatch_t[], int); +template int regexec_dfa_multipass( + const regex_t*, const char*, size_t, regmatch_t[], int); +template subhistory_t* regparse_dfa_multipass(const regex_t*, const char*, size_t); +template subhistory_t* regparse_dfa_multipass(const regex_t*, const char*, size_t); +template const tstring_t* regtstring_dfa_multipass(const regex_t*, const char*); +template const tstring_t* regtstring_dfa_multipass(const regex_t*, const char*); + +} // namespace libre2c +} // namespace re2c diff --git a/lib/regexec_dfa_regless.cc b/lib/regexec_dfa_regless.cc deleted file mode 100644 index e1675d3e..00000000 --- a/lib/regexec_dfa_regless.cc +++ /dev/null @@ -1,312 +0,0 @@ -#include -#include -#include -#include - -#include "lib/regcomp_dfa_regless.h" -#include "lib/regex.h" -#include "lib/regex_impl.h" -#include "lib/regoff_trie.h" -#include "src/debug/debug.h" -#include "src/dfa/dfa.h" -#include "src/dfa/tcmd.h" -#include "src/regexp/rule.h" -#include "src/regexp/tag.h" - - -namespace re2c { -namespace libre2c { - -// Match RLDFA on the input string, logging backlink arrays on the way. -// This is the forward pass that is common to all algorithms in this file. -static rldfa_backlink_t forward_pass(const regex_t *preg, const char *string, - size_t *matchlen) -{ - const rldfa_t *rldfa = preg->rldfa; - const std::vector &states = rldfa->states; - std::vector &log = rldfa->log; - - const char *strptr = string, *finstrptr = strptr; - rldfa_backlink_t finlink = {NOCONF, HROOT, NULL, 0}; - - log.clear(); - for (size_t stidx = 0;;) { - const int32_t chr = *strptr++; - const size_t cls = preg->char2class[chr]; - - const rldfa_state_t *state = states[stidx]; - const rldfa_arc_t &arc = state->arcs[cls]; - stidx = arc.state; - - log.push_back(&arc.backlinks); - - if (state->finlink.conf != NOCONF) { - finlink = state->finlink; - finstrptr = strptr; - } - - if (stidx == dfa_t::NIL || chr == 0) break; - } - - *matchlen = static_cast(finstrptr - string) - 1; - return finlink; -} - -static inline void apply_tfrag(const tchar_t *tfrag, size_t tfrag_size, - regoff_t *result, size_t offset, const std::vector &tags) -{ - for (size_t i = tfrag_size; i --> 0;) { - size_t t = tfrag[i]; - const bool negative = t >= TAG_BASE; - if (negative) t -= TAG_BASE; - - // If already updated, skip. - if (result[t] != NORESULT) continue; - - // Process fictive tags as well, as they may have nested negative tags. - if (!negative) { - // Update positive tag. - result[t] = static_cast(offset); - } else { - // Update negative tag together with its sibling and nested tags. - const Tag &tag = tags[t]; - for (size_t l = tag.lnest; l < tag.hnest; ++l) { - result[l] = -1; - } - } - } -} - -template -int regexec_dfa_regless(const regex_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int /* eflags */) -{ - // Forward pass. - size_t matchlen; - rldfa_backlink_t finlink = forward_pass(preg, string, &matchlen); - - // No final state has been encountered on the way; return no-match. - if (finlink.conf == NOCONF) return REG_NOMATCH; - - const rldfa_t *rldfa = preg->rldfa; - ctx_t &ctx = *static_cast(rldfa->ctx); - const std::vector &tags = ctx.dfa.tags; - const size_t ntags = tags.size(); - - // It is necessary to initialize tag offsets to prevent overwriting the - // latest tag value when unwinding tag history. - regoff_t *result = rldfa->result; - std::fill(result, result + ntags, NORESULT); - - // Unwind tag history back from the final RLDFA state to the initial state. - std::vector &log = rldfa->log; - rldfa_backlink_t link = finlink; - for (size_t offset = matchlen;;) { - apply_tfrag(link.tfrag, link.tfrag_size, result, offset, tags); - - if (offset == 0) break; - --offset; - - link = (*log[offset])[link.conf]; - } - - // Copy tag offsets to submatch results in the pmatch[] array. - regmatch_t *match = pmatch, *lastmatch = pmatch + nmatch; - match->rm_so = 0; - match->rm_eo = static_cast(matchlen); - ++match; - - for (size_t t = 0; t < ntags && match < lastmatch; t += 2) { - const Tag &tag = tags[t]; - if (fictive(tag)) continue; - - const regoff_t so = result[t], eo = result[t + 1]; - - // Submatch groups corresponding to this tag pair: there may be more - // than one capturing parenthesis per tag in regexp like (...(e)...). - for (size_t s = tag.lsub; s <= tag.hsub && match < lastmatch; s += 2, ++match) { - DASSERT(match - 1 == &pmatch[s / 2]); - match->rm_so = so; - match->rm_eo = eo; - } - } - - return 0; -} - -static inline void apply_tfrag_subhist(const tchar_t *tfrag, size_t tfrag_size, - regoff_trie_t *regtrie, size_t offset, const std::vector &tags) -{ - for (size_t i = tfrag_size; i --> 0;) { - const size_t t = tfrag[i]; - - // Process fictive tags as well, as they may have nested negative tags. - if (t < TAG_BASE) { - // Update positive tag. - regtrie->add(t, static_cast(offset)); - } else { - // Update negative tag together with its sibling and nested tags. - const Tag &tag = tags[t - TAG_BASE]; - for (size_t l = tag.lnest; l < tag.hnest; ++l) { - regtrie->add(l, -1); - } - } - } -} - -template -subhistory_t *regparse_dfa_regless(const regex_t *preg, const char *string, size_t nmatch) -{ - // Forward pass. - size_t matchlen; - rldfa_backlink_t finlink = forward_pass(preg, string, &matchlen); - - // No final state has been encountered on the way; return no-match. - if (finlink.conf == NOCONF) return NULL; - - const rldfa_t *rldfa = preg->rldfa; - ctx_t &ctx = *static_cast(rldfa->ctx); - const std::vector &tags = ctx.dfa.tags; - const size_t ntags = tags.size(); - regoff_trie_t *regtrie = preg->regtrie; - - // Unwind tag history back from the final RLDFA state to the initial state. - std::vector &log = rldfa->log; - rldfa_backlink_t link = finlink; - regtrie->clear(); - for (size_t offset = matchlen;;) { - apply_tfrag_subhist(link.tfrag, link.tfrag_size, regtrie, offset, tags); - - if (offset == 0) break; - --offset; - - link = (*log[offset])[link.conf]; - } - - const regoff_trie_t::node_t *storage = regtrie->storage; - const size_t *count = regtrie->count; - const size_t *lists = regtrie->lists; - - // Find the total number of slots needed for all submatch elements. - size_t rm_count = 1; - for (size_t t = 0; t < ntags; t += 2) { - const Tag &tag = tags[t]; - if (!fictive(tag)) { - rm_count += count[t] * (1 + (tag.hsub - tag.lsub) / 2); - } - } - // The amount of memory (in bytes) needed to store submatch information. - size_t memsize = nmatch * sizeof(subhistory_t) + rm_count * sizeof(regmatch_t); - - subhistory_t *h0 = (subhistory_t*) malloc(memsize), *h = h0, *lasth = h + nmatch; - regmatch_t *rm0 = (regmatch_t *)(h0 + nmatch), *rm = rm0; - - h->size = 1; - h->offs = rm; - rm += 1; - h->offs[0].rm_so = 0; - h->offs[0].rm_eo = static_cast(matchlen); - ++h; - - for (size_t t = 0; t < ntags && h < lasth; t += 2) { - const Tag &tag = tags[t]; - if (fictive(tag)) continue; - - const size_t so0 = lists[t], sz_so = count[t]; - const size_t eo0 = lists[t + 1]; - assert(sz_so == count[t + 1]); - - // Submatch groups corresponding to this tag pair: there may be more - // than one capturing parenthesis per tag in regexp like (...(e)...). - for (size_t s = tag.lsub; s <= tag.hsub && h < lasth; s += 2, ++h) { - DASSERT(h - 1 == &h0[s / 2]); - - h->size = sz_so; - h->offs = rm; - rm += sz_so; - - for (size_t j = 0, so = so0, eo = eo0; j < sz_so; ++j) { - const regoff_trie_t::node_t &n_so = storage[so], &n_eo = storage[eo]; - const regmatch_t m = {n_so.off, n_eo.off}; - h->offs[j] = m; - so = n_so.pred; - eo = n_eo.pred; - } - } - } - - return h0; -} - -template -const tstring_t *regtstring_dfa_regless(const regex_t *preg, const char *string) -{ - // Forward pass. - size_t matchlen; - rldfa_backlink_t finlink = forward_pass(preg, string, &matchlen); - - // No final state has been encountered on the way; return no-match. - if (finlink.conf == NOCONF) return NULL; - - const rldfa_t *rldfa = preg->rldfa; - std::vector &log = rldfa->log; - rldfa_backlink_t link = finlink; - - // Calculate the length of the resulting t-string. - size_t len = matchlen; - for (size_t offset = matchlen;;) { - len += link.tfrag_size; - - if (offset == 0) break; - --offset; - - link = (*log[offset])[link.conf]; - } - len += 2; // tags for the outermost capture that wraps the whole RE - len += 1; // terminating NULL - - tstring_t *tstr = &preg->tstring; - if (tstr->capacity <= len) { - tstr->capacity = len * 2; - delete[] tstr->string; - tstr->string = new tchar_t[tstr->capacity]; - } - - tstr->length = len; - tchar_t *s = tstr->string + len; - - *--s = 0; // terminating NULL - *--s = TAG_BASE + 2; // outermost closing parenthesis - - link = finlink; - for (size_t offset = matchlen;;) { - s -= link.tfrag_size; - memcpy(s, link.tfrag, link.tfrag_size * sizeof(tchar_t)); - - if (offset == 0) break; - --offset; - - *--s = static_cast(string[offset]); - link = (*log[offset])[link.conf]; - } - - *--s = TAG_BASE + 1; // outermost opening parenthesis - - return tstr; -} - -// explicit instantiation for disambiguation policy (leftmost greedy / POSIX) -template int regexec_dfa_regless(const regex_t*, const char*, size_t, - regmatch_t[], int); -template int regexec_dfa_regless(const regex_t*, const char*, size_t, - regmatch_t[], int); -template subhistory_t* regparse_dfa_regless(const regex_t*, const char*, - size_t); -template subhistory_t* regparse_dfa_regless(const regex_t*, const char*, - size_t); -template const tstring_t* regtstring_dfa_regless(const regex_t*, const char*); -template const tstring_t* regtstring_dfa_regless(const regex_t*, const char*); - -} // namespace libre2c -} // namespace re2c - diff --git a/lib/regexec_nfa_leftmost.cc b/lib/regexec_nfa_leftmost.cc index 7c3af7b7..0245b37d 100644 --- a/lib/regexec_nfa_leftmost.cc +++ b/lib/regexec_nfa_leftmost.cc @@ -5,23 +5,21 @@ #include "lib/regex.h" #include "lib/regex_impl.h" -#include "src/debug/debug.h" #include "src/dfa/closure_leftmost.h" #include "src/dfa/tag_history.h" #include "src/nfa/nfa.h" #include "src/regexp/rule.h" +#include "src/util/check.h" #include "src/util/range.h" - namespace re2c { namespace libre2c { -static void reach_on_symbol(lsimctx_t &, uint32_t); +static void reach_on_symbol(lsimctx_t&, uint32_t); -int regexec_nfa_leftmost(const regex_t *preg, const char *string - , size_t nmatch, regmatch_t pmatch[], int) -{ - lsimctx_t &ctx = *static_cast(preg->simctx); +int regexec_nfa_leftmost( + const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int) { + lsimctx_t& ctx = *static_cast(preg->simctx); init(ctx, string); // root state can be non-core, so we pass zero as origin to avoid checks @@ -36,9 +34,9 @@ int regexec_nfa_leftmost(const regex_t *preg, const char *string } for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; - if (s->type == nfa_state_t::FIN) { + if (s->kind == TnfaState::Kind::FIN) { update_offsets(ctx, *i, NONCORE); } } @@ -53,30 +51,28 @@ int regexec_nfa_leftmost(const regex_t *preg, const char *string return 0; } -void reach_on_symbol(lsimctx_t &ctx, uint32_t sym) -{ - const confset_t &state = ctx.state; - confset_t &reach = ctx.reach; - DASSERT(reach.empty()); +void reach_on_symbol(lsimctx_t& ctx, uint32_t sym) { + const confset_t& state = ctx.state; + confset_t& reach = ctx.reach; + DCHECK(reach.empty()); // in reverse, so that future closure DFS has states in stack order uint32_t j = 0; for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { + if (s->kind == TnfaState::Kind::RAN) { + for (const Range* r = s->ran; r; r = r->next()) { if (r->lower() <= sym && sym < r->upper()) { - conf_t c(s->ran.out, j, HROOT); + conf_t c(s->out1, j, HROOT); reach.push_back(c); update_offsets(ctx, *i, j); ++j; break; } } - } - else if (s->type == nfa_state_t::FIN) { + } else if (s->kind == TnfaState::Kind::FIN) { update_offsets(ctx, *i, NONCORE); } } @@ -88,4 +84,3 @@ void reach_on_symbol(lsimctx_t &ctx, uint32_t sym) } // namespace libre2 } // namespace re2c - diff --git a/lib/regexec_nfa_leftmost_trie.cc b/lib/regexec_nfa_leftmost_trie.cc index 37d9cc56..f6f7e96c 100644 --- a/lib/regexec_nfa_leftmost_trie.cc +++ b/lib/regexec_nfa_leftmost_trie.cc @@ -4,23 +4,21 @@ #include "lib/regex.h" #include "lib/regex_impl.h" -#include "src/debug/debug.h" #include "src/dfa/closure_leftmost.h" #include "src/dfa/tag_history.h" #include "src/nfa/nfa.h" +#include "src/util/check.h" #include "src/util/range.h" - namespace re2c { namespace libre2c { -static void make_step(lzsimctx_t &, uint32_t); -static void make_final_step(lzsimctx_t &ctx); +static void make_step(lzsimctx_t&, uint32_t); +static void make_final_step(lzsimctx_t& ctx); -int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string - , size_t nmatch, regmatch_t pmatch[], int) -{ - lzsimctx_t &ctx = *static_cast(preg->simctx); +int regexec_nfa_leftmost_trie( + const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int) { + lzsimctx_t& ctx = *static_cast(preg->simctx); init(ctx, string); const conf_t c0(ctx.nfa.root, 0/* unused */, HROOT); @@ -37,31 +35,29 @@ int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string return finalize(ctx, string, nmatch, pmatch); } -void make_step(lzsimctx_t &ctx, uint32_t sym) -{ - const confset_t &state = ctx.state; - confset_t &reach = ctx.reach; +void make_step(lzsimctx_t& ctx, uint32_t sym) { + const confset_t& state = ctx.state; + confset_t& reach = ctx.reach; // in reverse, so that future closure DFS has states in stack order rcconfiter_t b = state.rbegin(), e = state.rend(), i; - DASSERT(reach.empty()); + DCHECK(reach.empty()); for (i = b; i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; // cleanup from previous closure s->clos = NOCLOS; - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { + if (s->kind == TnfaState::Kind::RAN) { + for (const Range* r = s->ran; r; r = r->next()) { if (r->lower() <= sym && sym < r->upper()) { - const conf_t c(s->ran.out, 0/* unused */, i->thist); + const conf_t c(s->out1, 0/* unused */, i->thist); reach.push_back(c); break; } } - } - else if (s->type == nfa_state_t::FIN) { + } else if (s->kind == TnfaState::Kind::FIN) { ctx.marker = ctx.cursor; ctx.hidx = i->thist; ctx.rule = 0; @@ -71,15 +67,14 @@ void make_step(lzsimctx_t &ctx, uint32_t sym) ++ctx.step; } -void make_final_step(lzsimctx_t &ctx) -{ +void make_final_step(lzsimctx_t& ctx) { for (confiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; - DASSERT(s->active == 0); + DCHECK(s->active == 0); - if (s->type == nfa_state_t::FIN) { + if (s->kind == TnfaState::Kind::FIN) { ctx.marker = ctx.cursor; ctx.hidx = i->thist; ctx.rule = 0; @@ -89,4 +84,3 @@ void make_final_step(lzsimctx_t &ctx) } // namespace libre2 } // namespace re2c - diff --git a/lib/regexec_nfa_posix.cc b/lib/regexec_nfa_posix.cc index e31b5005..6eeb2775 100644 --- a/lib/regexec_nfa_posix.cc +++ b/lib/regexec_nfa_posix.cc @@ -2,32 +2,32 @@ #include #include #include -#include #include "lib/regex.h" #include "lib/regex_impl.h" -#include "src/debug/debug.h" #include "src/dfa/closure_posix.h" #include "src/dfa/posix_precedence.h" #include "src/dfa/tag_history.h" #include "src/nfa/nfa.h" #include "src/regexp/rule.h" +#include "src/util/check.h" #include "src/util/range.h" - namespace re2c { namespace libre2c { -static void make_one_step(psimctx_t &, uint32_t); -static void make_final_step(psimctx_t &); +static void make_one_step(psimctx_t&, uint32_t); +static void make_final_step(psimctx_t&); // we *do* want these to be inlined -static inline void closure_posix(psimctx_t &ctx); - -int regexec_nfa_posix(const regex_t *preg, const char *string - , size_t nmatch, regmatch_t pmatch[], int /* eflags */) -{ - psimctx_t &ctx = *static_cast(preg->simctx); +static inline void closure_posix(psimctx_t& ctx); + +int regexec_nfa_posix(const regex_t* preg, + const char* string, + size_t nmatch, + regmatch_t pmatch[], + int /*eflags*/) { + psimctx_t& ctx = *static_cast(preg->simctx); init(ctx, string); // root state can be non-core, so we pass zero as origin to avoid checks @@ -51,35 +51,27 @@ int regexec_nfa_posix(const regex_t *preg, const char *string return 0; } -void closure_posix(psimctx_t &ctx) -{ +void closure_posix(psimctx_t& ctx) { ctx.history.init(); - - if (ctx.flags & REG_GTOP) { - closure_posix_gtop(ctx); - } - else { - closure_posix_gor1(ctx); - } + closure_posix_gor1(ctx); } -void make_one_step(psimctx_t &ctx, uint32_t sym) -{ - confset_t &state = ctx.state, &reach = ctx.reach; +void make_one_step(psimctx_t& ctx, uint32_t sym) { + confset_t& state = ctx.state, &reach = ctx.reach; uint32_t j = 0; reach.clear(); for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); + DCHECK(s->status == GOR_NOPASS && s->active == 0); - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { + if (s->kind == TnfaState::Kind::RAN) { + for (const Range* r = s->ran; r; r = r->next()) { if (r->lower() <= sym && sym < r->upper()) { - const conf_t c(s->ran.out, j, HROOT); + const conf_t c(s->out1, j, HROOT); reach.push_back(c); state[j] = *i; update_offsets(ctx, *i, j); @@ -87,8 +79,7 @@ void make_one_step(psimctx_t &ctx, uint32_t sym) break; } } - } - else if (s->type == nfa_state_t::FIN) { + } else if (s->kind == TnfaState::Kind::FIN) { update_offsets(ctx, *i, NONCORE); } } @@ -98,8 +89,7 @@ void make_one_step(psimctx_t &ctx, uint32_t sym) if (!(ctx.flags & REG_SLOWPREC)) { compute_prectable_complex(ctx); - } - else { + } else { compute_prectable_naive(ctx); } std::swap(ctx.newprectbl, ctx.oldprectbl); @@ -109,16 +99,15 @@ void make_one_step(psimctx_t &ctx, uint32_t sym) ++ctx.step; } -void make_final_step(psimctx_t &ctx) -{ +void make_final_step(psimctx_t& ctx) { for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); + DCHECK(s->status == GOR_NOPASS && s->active == 0); - if (s->type == nfa_state_t::FIN) { + if (s->kind == TnfaState::Kind::FIN) { update_offsets(ctx, *i, NONCORE); } } @@ -126,4 +115,3 @@ void make_final_step(psimctx_t &ctx) } // namespace libre2c } // namespace re2c - diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc deleted file mode 100644 index 3a953ac8..00000000 --- a/lib/regexec_nfa_posix_backward.cc +++ /dev/null @@ -1,632 +0,0 @@ -#include -#include -#include -#include - -#include "lib/regex.h" -#include "lib/regex_impl.h" -#include "src/debug/debug.h" -#include "src/dfa/closure_posix.h" -#include "src/dfa/tag_history.h" -#include "src/nfa/nfa.h" -#include "src/regexp/rule.h" -#include "src/regexp/tag.h" -#include "src/util/range.h" - - -namespace re2c { -namespace libre2c { - -/* - * This is a quick-and-dirty implementation of backward matching algorithm - * suggested by Cox. The algorithm is incorrect, but it is surprisingly - * close to correct, and we keep it around for research purposes (in case - * we need to confirm its incorrectness in the light of some future findings). - * - * The algorithm attempts to find POSIX longest match by going from right - * to left and maximizing the length of the latest iteration of each - * capturing group, similar to the way Laurikari algorithm minimizez or - * maximizes the most recent value of each tag. - * - * The algorithm also has to remember the match on the last iteration, so - * it has to carry around twice as much data for offssets. Offsets are - * bound to configurations: for each configuration, there is a "scratch" - * offset pair, an "accepted" offset pair and also a "backup" offset pair - * that is used only during closure initialization. And double all that, - * as we keep offsets of both last and most recent iterations. All this - * means a lot of copying, so unsurprisingly the algorithm is very slow - * if RE has many submatch groups. Some copying can be reduced, but not all: - * SSSP does not (in general) proceed in stack order, therefore the - * previous scanned configuration might not be related at all to the next. - * GOR1's topsort phase is DFS though, so it can be made to use a single - * scratch buffer for offsets (that is updated on DFS recursive enter and - * restored on DFS recursive return). - * - * The algorithm is incorrect because it sometimes compares ambiguous - * paths too late, when the difference between them has been overwritten - * with more recent offsets. This may happen when two conditions are met: - * - * (1) the most recent iteration of both paths matches the same substring, - * so that the offsets on this iteration are equal - * - * (2) paths are compared for the first time *after* the offsets have been - * updated and have become indistinguishable - * - * The algorithm would work if such paths had a join point somewhere - * before the problematic iteration, and were properly disambiguated at that - * point. The algorithm may fail to do so in two cases: - * - * (A) With bounded repetition, each repetition duplicated in is a separate - * automaton, and paths may not meet until the final join point (because - * at each step they end in different subautomata). This depends on how - * we unroll bounded repetition. An example of this is (aaaa|aaa|a){3,4} - * and paths X = (aaaa)(aaaa)(a)(a) and Y = (aaaa)(aaa)(aaa), provided - * that we unroll bounded repetition by making the last iteration - * optional and the first three non-optional (the normal way we unroll - * it). Note that the paths have different number of iterations (4 and 3) - * which is why they don't meet until the very end. - * - * (B) Even if there is a join point, due to a particular order of closure - * traversal paths may need to be compared *after* the join point (at - * the time when offsets are already overwritten with new ones, making - * the paths indistinguishable). This may happen if the order of closure - * traversal is not topological (for example, when we go in depth-first - * order, start from some unfortunate initial state and propagate the - * wrong path to other states, then start from another initial state and - * need to propagate improvement to all those states). - * - */ - -static void make_one_step_simple(psimctx_t &, uint32_t); -static void make_one_step(psimctx_t &, uint32_t); -static void make_final_step(psimctx_t &); -static void update_final_offsets(psimctx_t &ctx, const conf_t &c); -static void closure_simple(psimctx_t &ctx); -static void relax_gtop(psimctx_t &ctx, const psimctx_t::conf_t &c); -static void closure_posix_gtop(psimctx_t &ctx); -static bool scan(psimctx_t &ctx, nfa_state_t *q, bool all); -static bool relax_gor1(psimctx_t &ctx, const psimctx_t::conf_t &c); -static void closure_posix_gor1(psimctx_t &ctx); -static inline void closure_posix(psimctx_t &ctx); -static inline uint32_t index(const nfa_state_t *s, const nfa_t &nfa); -static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x, tag_info_t info); -static inline void accept_offsets(psimctx_t &ctx, const nfa_state_t *s); - -// debug -static int D = 0; -static void prtoff(psimctx_t &ctx, uint32_t x, bool newer); -static inline void prtoff4(psimctx_t &ctx, uint32_t x); -static inline void prtoff5(psimctx_t &ctx, uint32_t x); - -static regoff_t *offsets4 = NULL; -static regoff_t *offsets5 = NULL; -static regoff_t *offsets6 = NULL; - -int regexec_nfa_posix_backward(const regex_t *preg, const char *string - , size_t nmatch, regmatch_t pmatch[], int /* eflags */) -{ - psimctx_t &ctx = *static_cast(preg->simctx); - init(ctx, string); - - // 1st pass, forward: find longest match on a simple NFA - - ctx.reach.push_back(conf_t(ctx.nfa0->root, 0, 0)); - for (;;) { - closure_simple(ctx); - const uint32_t sym = static_cast(*ctx.cursor++); - if (ctx.state.empty() || sym == 0) break; - make_one_step_simple(ctx, sym); - } - - for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; - s->clos = NOCLOS; - if (s->type == nfa_state_t::FIN) { - ctx.marker = ctx.cursor; - ctx.rule = 0; - } - } - - if (ctx.rule == Rule::NONE) { - return REG_NOMATCH; - } - - // 2nd pass, backward: find submatches - - const uint32_t len = static_cast(ctx.marker - string) - 1; - init(ctx, string); - ctx.step = len; - ctx.cursor = ctx.marker = string + len; - - const std::vector &tags = ctx.nfa.tags; - const size_t sz = tags.size() * ctx.nfa.size * 2; - offsets4 = new regoff_t[sz]; - offsets5 = new regoff_t[sz]; - offsets6 = new regoff_t[sz]; - - std::fill(offsets4, offsets4 + sz, -2); - std::fill(offsets5, offsets5 + sz, -2); - - ctx.reach.push_back(conf_t(ctx.nfa.root, 0, 0)); - for (;;) { - closure_posix(ctx); - if (ctx.state.empty() || ctx.cursor == string) break; - const uint32_t sym = static_cast(*--ctx.cursor); - make_one_step(ctx, sym); - --ctx.step; - } - make_final_step(ctx); - - delete[] offsets6; - delete[] offsets5; - delete[] offsets4; - - if (ctx.rule == Rule::NONE) { - return REG_NOMATCH; - } - - const getoff_nfa_t fn = { ctx.offsets3 }; - tags_to_submatch(tags, nmatch, pmatch, static_cast(len), fn); - - return 0; -} - -static uint32_t index(const nfa_state_t *s, const nfa_t &nfa) -{ - return static_cast(s - nfa.states); -} - -void closure_simple(psimctx_t &ctx) -{ - psimctx_t::confset_t &state = ctx.state, &stack = ctx.reach; - state.clear(); - - for (; !stack.empty(); ) { - const psimctx_t::conf_t x = stack.back(); - stack.pop_back(); - nfa_state_t *n = x.state; - - if (n->clos != NOCLOS) continue; - - n->clos = static_cast(state.size()); - state.push_back(x); - - switch (n->type) { - case nfa_state_t::ALT: - stack.push_back(psimctx_t::conf_t(x, n->alt.out2)); - stack.push_back(psimctx_t::conf_t(x, n->alt.out1)); - break; - case nfa_state_t::TAG: - stack.push_back(psimctx_t::conf_t(x, n->tag.out, 0)); - break; - case nfa_state_t::RAN: - case nfa_state_t::FIN: - break; - } - } -} - -void make_one_step_simple(psimctx_t &ctx, uint32_t sym) -{ - const confset_t &state = ctx.state; - confset_t &reach = ctx.reach; - DASSERT(reach.empty()); - - for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) { - nfa_state_t *s = i->state; - s->clos = NOCLOS; - - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { - if (r->lower() <= sym && sym < r->upper()) { - conf_t c(s->ran.out, 0, 0); - reach.push_back(c); - break; - } - } - } - else if (s->type == nfa_state_t::FIN) { - ctx.marker = ctx.cursor; - ctx.rule = 0; - } - } - - ++ctx.step; -} - -void closure_posix(psimctx_t &ctx) -{ - if (ctx.flags & REG_GTOP) { - closure_posix_gtop(ctx); - } - else { - closure_posix_gor1(ctx); - } -} - -static int32_t precedence(psimctx_t &ctx, const conf_t &x, const conf_t &y) -{ - DASSERT(x.state == y.state); (void)y; - const uint32_t idx = index(x.state, ctx.nfa); - - const size_t ntags = ctx.nfa.tags.size(); - const regoff_t *ox = offsets5 + ntags * idx * 2; - const regoff_t *oy = offsets4 + ntags * idx * 2; - - prtoff5(ctx, idx); - prtoff4(ctx, idx); - if (D) fprintf(stderr, "prec %u: ", idx); - - for (size_t t = 0; t < ctx.nfa.tags.size(); t += 2) { - const regoff_t offx = ox[2 * (t + 1)]; - const regoff_t offy = oy[2 * (t + 1)]; - if (offx != offy) { - int cmp; - if (offx == -2) { - cmp = -1; - } - if (offy == -2) { - cmp = 1; - } - else { - cmp = offx > offy ? -1 : 1; - } - if (D) fprintf(stderr, "%d\n", cmp); - return cmp; - } - } - if (D) fprintf(stderr, "0\n"); - return 0; -} - -void closure_posix_gor1(psimctx_t &ctx) -{ - psimctx_t::confset_t &state = ctx.state, &reach = ctx.reach; - std::vector - &topsort = ctx.gor1_topsort, - &linear = ctx.gor1_linear; - const size_t ntags = ctx.nfa.tags.size(); - - state.clear(); - memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2); - - // Reverse order happens to result in less test errors, but both orders - // do not yield a correct algorithm (paths are still compared after - // their join point sometimes). - for (psimctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) { - regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2; - const regoff_t *oy = offsets6 + ntags * c->origin * 2; - memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); - - if (D) fprintf(stderr, "restoring offsets %u to %u\n" - , c->origin, index(c->state, ctx.nfa)); - - relax_gor1(ctx, *c); - } - - for (; !topsort.empty(); ) { - - // 1st pass: depth-first postorder traversal of admissible subgraph - for (; !topsort.empty(); ) { - nfa_state_t *q = topsort.back(); - if (q->status == GOR_LINEAR) { - topsort.pop_back(); - } - else { - q->status = GOR_TOPSORT; - if (!scan(ctx, q, false)) { - q->status = GOR_LINEAR; - topsort.pop_back(); - linear.push_back(q); - } - } - } - - // 2nd pass: linear scan of topologically ordered states - for (; !linear.empty(); ) { - nfa_state_t *q = linear.back(); - linear.pop_back(); - if (q->active) { - q->active = 0; - q->arcidx = 0; - scan(ctx, q, true); - } - q->status = GOR_NOPASS; - } - } -} - -bool scan(psimctx_t &ctx, nfa_state_t *q, bool all) -{ - bool any = false; - - typedef psimctx_t::conf_t conf_t; - const conf_t x = ctx.state[q->clos]; - - switch (q->type) { - case nfa_state_t::ALT: - if (q->arcidx == 0) { - copy_offs(ctx, q, q->alt.out1, NOINFO); - any |= relax_gor1(ctx, conf_t(x, q->alt.out1)); - ++q->arcidx; - } - if (q->arcidx == 1 && (!any || all)) { - copy_offs(ctx, q, q->alt.out2, NOINFO); - any |= relax_gor1(ctx, conf_t(x, q->alt.out2)); - ++q->arcidx; - } - break; - case nfa_state_t::TAG: - if (q->arcidx == 0) { - copy_offs(ctx, q, q->tag.out, q->tag.info); - any |= relax_gor1(ctx, conf_t(x, q->tag.out, 0)); - ++q->arcidx; - } - break; - case nfa_state_t::RAN: - case nfa_state_t::FIN: - break; - } - - return any; -} - -bool relax_gor1(psimctx_t &ctx, const psimctx_t::conf_t &x) -{ - psimctx_t::confset_t &state = ctx.state; - nfa_state_t *q = x.state; - const uint32_t idx = q->clos; - - if (q->status == GOR_TOPSORT) { - return false; - } - - if (idx == NOCLOS) { - q->clos = static_cast(state.size()); - state.push_back(x); - accept_offsets(ctx, q); - } - else if (q->indeg < 2 || precedence(ctx, x, state[idx]) < 0) { - state[idx] = x; - accept_offsets(ctx, q); - } - else { - return false; - } - - if (q->status == GOR_NOPASS) { - ctx.gor1_topsort.push_back(q); - q->arcidx = 0; - return true; - } - else { - q->active = 1; - return false; - } -} - -void closure_posix_gtop(psimctx_t &ctx) -{ - const psimctx_t::confset_t &reach = ctx.reach; - psimctx_t::confset_t &state = ctx.state; - gtop_heap_t &heap = ctx.gtop_heap; - const size_t ntags = ctx.nfa.tags.size(); - - state.clear(); - memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2); - for (psimctx_t::cconfiter_t c = reach.begin(); c != reach.end(); ++c) { - regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2; - const regoff_t *oy = offsets6 + ntags * c->origin * 2; - memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); - - if (D) fprintf(stderr, "restoring offsets %u to %u\n" - , c->origin, index(c->state, ctx.nfa)); - - relax_gtop(ctx, *c); - prtoff4(ctx, index(c->state, ctx.nfa)); - } - - for (; !heap.empty(); ) { - nfa_state_t *q = heap.top(); - heap.pop(); - q->active = 0; - - if (D) fprintf(stderr, "> %u ", index(q, ctx.nfa)); - prtoff4(ctx, index(q, ctx.nfa)); - - typedef psimctx_t::conf_t conf_t; - const conf_t x = ctx.state[q->clos]; - - switch (q->type) { - case nfa_state_t::ALT: - copy_offs(ctx, q, q->alt.out1, NOINFO); - relax_gtop(ctx, conf_t(x, q->alt.out1)); - copy_offs(ctx, q, q->alt.out2, NOINFO); - relax_gtop(ctx, conf_t(x, q->alt.out2)); - break; - case nfa_state_t::TAG: - copy_offs(ctx, q, q->tag.out, q->tag.info); - relax_gtop(ctx, conf_t(x, q->tag.out, 0)); - break; - case nfa_state_t::RAN: - case nfa_state_t::FIN: - break; - } - } -} - -void relax_gtop(psimctx_t &ctx, const psimctx_t::conf_t &c) -{ - psimctx_t::confset_t &state = ctx.state; - nfa_state_t *q = c.state; - const uint32_t idx = q->clos; - - if (idx == NOCLOS) { - q->clos = static_cast(state.size()); - state.push_back(c); - accept_offsets(ctx, q); - } - else if (q->indeg < 2 || precedence(ctx, c, state[idx]) < 0) { - state[idx] = c; - accept_offsets(ctx, q); - } - else { - return; - } - - if (!q->active) { - q->active = 1; - ctx.gtop_heap.push(q); - } -} - -void make_one_step(psimctx_t &ctx, uint32_t sym) -{ - confset_t &state = ctx.state, &reach = ctx.reach; - reach.clear(); - - if (D) fprintf(stderr, "\n--- step %u (sym %c)\n", ctx.step, sym); - - for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) { - nfa_state_t *s = i->state; - - s->clos = NOCLOS; - s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); - - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { - if (r->lower() <= sym && sym < r->upper()) { - const conf_t c(s->ran.out, index(s, ctx.nfa), HROOT); - reach.push_back(c); - break; - } - } - } - else if (s->type == nfa_state_t::FIN) { - update_final_offsets(ctx, *i); - } - } -} - -void make_final_step(psimctx_t &ctx) -{ - for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; - - s->clos = NOCLOS; - s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); - - if (s->type == nfa_state_t::FIN) { - update_final_offsets(ctx, *i); - } - } -} - -void update_final_offsets(psimctx_t &ctx, const conf_t &c) -{ - nfa_state_t *s = c.state; - DASSERT(s->type == nfa_state_t::FIN); - - const std::vector &tags = ctx.nfa.tags; - const size_t ntags = tags.size(); - - ctx.marker = ctx.cursor; - ctx.rule = 0; - - regoff_t *of = ctx.offsets3; - const regoff_t *ox = offsets4 + ntags * index(s, ctx.nfa) * 2; - - if (D) fprintf(stderr, "finally: "); - prtoff4(ctx, index(s, ctx.nfa)); - - for (size_t t = 0; t < ntags; t += 2) { - of[t] = ox[2 * t + 1]; - of[t + 1] = ox[2 * (t + 1) + 1]; - } -} - -static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x - , tag_info_t info) -{ - const std::vector &tags = ctx.nfa.tags; - const size_t ntags = tags.size(); - const uint32_t - xidx = index(x, ctx.nfa), - yidx = index(y, ctx.nfa); - - if (D) fprintf(stderr, "copying offsets %u to %u ", yidx, xidx); - prtoff4(ctx, yidx); - - regoff_t *ox = offsets5 + ntags * xidx * 2; - const regoff_t *oy = offsets4 + ntags * yidx * 2; - memcpy(ox, oy, ntags * sizeof(regoff_t) * 2); - - if (!(info == NOINFO)) { - const uint32_t t = info.idx; - - // update active tag, and set final tag if it's not set already - ox[2 * t] = info.neg ? -1 : static_cast(ctx.step); - if (ox[2 * t + 1] == -2) { - ox[2 * t + 1] = ox[2 * t]; - } - - // update nested negative tags (if any) - if (info.neg) { - const Tag &tag = tags[t]; - for (size_t l = tag.lnest; l < tag.hnest; ++l) { - ox[2 * l] = -1; - if (ox[2 * l + 1] == -2) { - ox[2 * l + 1] = -1; - } - } - } - - if (D) fprintf(stderr, "setting offset %u[%u] to %ld\n" - , xidx, t, (long)ox[2 * t]); - } -} - -void accept_offsets(psimctx_t &ctx, const nfa_state_t *s) -{ - const uint32_t idx = index(s, ctx.nfa); - const size_t ntags = ctx.nfa.tags.size(); - - const regoff_t *ox = offsets5 + ntags * idx * 2; - regoff_t *oy = offsets4 + ntags * idx * 2; - memcpy(oy, ox, ntags * sizeof(regoff_t) * 2); - - if (D) fprintf(stderr, "setting offsets %u to ", idx); - prtoff4(ctx, idx); -} - -static void prtoff(psimctx_t &ctx, uint32_t x, bool newer) -{ - if (!D) return; - - const size_t ntags = ctx.nfa.tags.size(); - const regoff_t *ox = (newer ? offsets5 : offsets4) + ntags * x * 2; - for (size_t t = 0; t < ntags; t += 2) { - fprintf(stderr, "(%ld,%ld)(%ld,%ld)" - , (long)ox[2 * t], (long)ox[2 * (t + 1)] - , (long)ox[2 * t + 1], (long)ox[2 * (t + 1) + 1]); - } - fprintf(stderr, "\n"); -} - -static void prtoff4(psimctx_t &ctx, uint32_t x) -{ - if (D) fprintf(stderr, "off4: "); - prtoff(ctx, x, false); -} - -static void prtoff5(psimctx_t &ctx, uint32_t x) -{ - if (D) fprintf(stderr, "off5: "); - prtoff(ctx, x, true); -} - -} // namespace libre2c -} // namespace re2c - diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc deleted file mode 100644 index c19db559..00000000 --- a/lib/regexec_nfa_posix_kuklewicz.cc +++ /dev/null @@ -1,333 +0,0 @@ -#include -#include -#include -#include - -#include "lib/regex.h" -#include "lib/regex_impl.h" -#include "src/debug/debug.h" -#include "src/dfa/closure_posix.h" -#include "src/dfa/tag_history.h" -#include "src/nfa/nfa.h" -#include "src/regexp/rule.h" -#include "src/regexp/tag.h" -#include "src/util/range.h" - - -/* note [POSIX orbit tags] - * - * POSIX disambiguation rules demand that earlier subexpressions match - * the longest possible prefix of the input string (without violating the - * whole match). To accommodate these rules, we resolve conflicts on orbit - * tags by comparison of tag subhistories on conflicting NFA paths. - * - * If one subhistory is a proper prefix of another subhistory, it is less; - * otherwise for the first pair of different offsets, if one offset is greater - * than the other, then the corresponding subhistory is less. - * - * It is possible to pre-compare two NFA paths corresponding to the same - * input string prefix and ending in the same NFA state; if paths are not - * equal, the result of this comparison will hold for any common suffix. - * - * It is also possible to pre-compare NFA paths that correspond to the same - * input prefix, but end in different NFA states. Such comparison is incorrect - * unless subhistories start at the same offset; but if it is incorrect, we - * will never use its result (tags with higher priority will also disagree). - * - * Therefore instead of keeping the whole history of offsets we calculate - * the relative order of any pair of subhistories on each step. - * - * This part of the algorithm was invented by Christopher Kuklewicz. - */ - -namespace re2c { - -// specialization that doesn't sort initial closure like Okui-Suzuki -template<> void init_gor1(libre2c::ksimctx_t &ctx); - -namespace libre2c { - -static const int32_t DELIM = 0x7fffFFFF; - -static void make_one_step(ksimctx_t &, uint32_t); -static void make_final_step(ksimctx_t &); -static void compute_orders(ksimctx_t &ctx); - -// we *do* want these to be inlined -static inline void closure_posix(ksimctx_t &ctx); -static inline size_t boundary_tag(size_t tag); -static inline int32_t subhistory_list(const khistory_t &history, std::vector &path, hidx_t idx, size_t tag); -static inline void last_subhistory(const khistory_t &history, std::vector &path, hidx_t idx, size_t tag); -static inline int32_t compare_last_subhistories(khistory_t &history, hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t); - -int regexec_nfa_posix_kuklewicz(const regex_t *preg, const char *string - , size_t nmatch, regmatch_t pmatch[], int /* eflags */) -{ - ksimctx_t &ctx = *static_cast(preg->simctx); - init(ctx, string); - if (ctx.nfa.tags.size() > 0) { - ctx.oldprectbl[0] = 0; - } - - // root state can be non-core, so we pass zero as origin to avoid checks - const conf_t c0(ctx.nfa.root, 0, HROOT); - ctx.reach.push_back(c0); - for (;;) { - closure_posix(ctx); - const uint32_t sym = static_cast(*ctx.cursor++); - if (ctx.state.empty() || sym == 0) break; - make_one_step(ctx, sym); - } - make_final_step(ctx); - - if (ctx.rule == Rule::NONE) { - return REG_NOMATCH; - } - - const getoff_nfa_t fn = { ctx.offsets3 }; - tags_to_submatch(ctx.nfa.tags, nmatch, pmatch, ctx.marker - string - 1, fn); - - return 0; -} - -void closure_posix(ksimctx_t &ctx) -{ - if (ctx.flags & REG_GTOP) { - closure_posix_gtop(ctx); - } - else { - closure_posix_gor1(ctx); - } -} - -void make_one_step(ksimctx_t &ctx, uint32_t sym) -{ - confset_t &state = ctx.state, &reach = ctx.reach; - uint32_t j = 0; - reach.clear(); - - for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) { - nfa_state_t *s = i->state; - - s->clos = NOCLOS; - s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); - - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { - if (r->lower() <= sym && sym < r->upper()) { - const conf_t c(s->ran.out, j, HROOT); - reach.push_back(c); - state[j] = *i; - update_offsets(ctx, *i, j); - ++j; - break; - } - } - } - else if (s->type == nfa_state_t::FIN) { - update_offsets(ctx, *i, NONCORE); - } - } - - state.resize(j); - std::swap(ctx.offsets1, ctx.offsets2); - - compute_orders(ctx); - std::swap(ctx.newprectbl, ctx.oldprectbl); - ctx.oldprecdim = j; - - ctx.history.init(); - ++ctx.step; -} - -void make_final_step(ksimctx_t &ctx) -{ - for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; - - s->clos = NOCLOS; - s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); - - if (s->type == nfa_state_t::FIN) { - update_offsets(ctx, *i, NONCORE); - } - } -} - -struct cmp_posix_t -{ - ksimctx_t &ctx; - size_t tag; - inline bool operator()(cconfiter_t x, cconfiter_t y) const - { - const int32_t - ox = ctx.oldprectbl[tag * ctx.oldprecdim + x->origin], - oy = ctx.oldprectbl[tag * ctx.oldprecdim + y->origin]; - // comparison result is inverted, because orders are used as offsets - return compare_last_subhistories(ctx.history - , x->thist, y->thist, ox, oy, tag) > 0; - } -}; - -void compute_orders(ksimctx_t &ctx) -{ - const confset_t &state = ctx.state; - const size_t ntags = ctx.nfa.tags.size(); - const size_t newdim = state.size(); - cconfiter_t b = state.begin(), e = state.end(), c; - std::vector &iters = ctx.stateiters; - - if (newdim == 0) return; - - iters.clear(); - iters.reserve(state.size()); - for (c = b; c != e; ++c) { - iters.push_back(c); - } - - for (size_t t = 1; t < ntags; t += 2) { - cmp_posix_t cmp = {ctx, t}; - std::sort(iters.begin(), iters.end(), cmp); - - int32_t m = 0, *tbl = ctx.newprectbl + t * newdim; - for (size_t i = 0; i < newdim; ++m) { - *(tbl + (iters[i] - b)) = m; - for (; ++i < newdim && cmp(iters[i - 1], iters[i]) == 0; ) { - *(tbl + (iters[i] - b)) = m; - } - } - } -} - -int32_t compare_last_subhistories(khistory_t &history - , hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t) -{ - if (ox > oy) return -1; - if (ox < oy) return 1; - if (x == y) return 0; - - std::vector &p1 = history.path1, &p2 = history.path2; - - last_subhistory(history, p1, x, t); - last_subhistory(history, p2, y, t); - - std::vector::const_reverse_iterator - i1 = p1.rbegin(), e1 = p1.rend(), - i2 = p2.rbegin(), e2 = p2.rend(); - for (;;) { - if (i1 == e1 && i2 == e2) break; - if (i1 == e1) return -1; - if (i2 == e2) return 1; - if (*i1 > *i2) return -1; - if (*i1 < *i2) return 1; - ++i1; ++i2; - } - - return 0; -} - -void last_subhistory(const khistory_t &history - , std::vector &path, hidx_t idx, size_t tag) -{ - path.clear(); - hidx_t i = idx; - const size_t bound = boundary_tag(tag); - for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) { - if (history.node(i).info.idx == tag) { - path.push_back(history.node(i).info.neg ? -1 : 1); - } - } -} - -template -int32_t khistory_t::precedence(ctx_t &ctx - , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y - , int32_t &/*prec1*/, int32_t &/*prec2*/) -{ - // History consists of multiple subhistories (each containing either a - // single negative tag, or one or more positive tags (exactly one for - // non-orbit subhistories). Because of the shortest-path algorithm earlier - // subhistories do not necessarily coincide, so comparing only the last - // pair of subhistories is not enough. See note [POSIX orbit tags]. - - const size_t ntags = ctx.nfa.tags.size(); - for (size_t t = 1; t < ntags; t += 2) { - const int32_t - ox = ctx.oldprectbl[t * ctx.oldprecdim + x.origin], - oy = ctx.oldprectbl[t * ctx.oldprecdim + y.origin]; - - if (ox > oy) return -1; - if (ox < oy) return 1; - if (x.thist == y.thist) continue; - - std::vector &p1 = ctx.history.path1, &p2 = ctx.history.path2; - int32_t n1, n2; - (void)(n1 = subhistory_list(ctx.history, p1, x.thist, t)); - (void)(n2 = subhistory_list(ctx.history, p2, y.thist, t)); - DASSERT(n1 == n2); - - std::vector::const_reverse_iterator - i1 = p1.rbegin(), e1 = p1.rend(), - i2 = p2.rbegin(), e2 = p2.rend(); - for (;;) { - if (i1 == e1 && i2 == e2) break; - DASSERT(i1 != e1 && i2 != e2); - const int32_t v1 = *i1++, v2 = *i2++; - if (v1 == DELIM && v2 == DELIM) continue; - if (v1 == DELIM) return -1; - if (v2 == DELIM) return 1; - if (v1 > v2) return -1; - if (v1 < v2) return 1; - } - } - return 0; -} - -// returns all subhistories of the given tag as one sequence -// (individual subhistories are separated by delimiter) -int32_t subhistory_list(const khistory_t &history, - std::vector &path, hidx_t idx, size_t tag) -{ - path.clear(); - int32_t nsub = 0; - hidx_t i = idx; - - const size_t bound = boundary_tag(tag); - path.push_back(DELIM); - for (;;) { - for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) { - if (history.node(i).info.idx == tag) { - path.push_back(history.node(i).info.neg ? -1 : 1); - } - } - if (i == HROOT) break; - ++nsub; - path.push_back(DELIM); - for (; i != HROOT && history.node(i).info.idx != tag; i = history.node(i).pred); - } - - return nsub; -} - -size_t boundary_tag(size_t tag) -{ - // for start tags, return itself; for end tags, return start tag - // (start tags have even numbers, end tags have odd numbers) - return tag & ~1u; -} - -} // namespace libre2c - -template<> void init_gor1(libre2c::ksimctx_t &ctx) -{ - ctx.state.clear(); - libre2c::ksimctx_t::cconfiter_t c = ctx.reach.begin(), e = ctx.reach.end(); - for (; c != e; ++c) { - relax_gor1(ctx, *c); - } -} - -} // namespace re2c - diff --git a/lib/regexec_nfa_posix_trie.cc b/lib/regexec_nfa_posix_trie.cc index beb05cd9..bf296289 100644 --- a/lib/regexec_nfa_posix_trie.cc +++ b/lib/regexec_nfa_posix_trie.cc @@ -4,64 +4,57 @@ #include "lib/regex.h" #include "lib/regex_impl.h" -#include "src/debug/debug.h" #include "src/dfa/closure_posix.h" #include "src/dfa/tag_history.h" #include "src/nfa/nfa.h" +#include "src/util/check.h" #include "src/util/range.h" - namespace re2c { -// specialization that doesn't sort initial closure like Okui-Suzuki -// (there is no cheap way to sort it in the lazy-disambiguation algorithm) -template<> void init_gor1(libre2c::pzsimctx_t &ctx); +// Specialization that doesn't sort initial closure like Okui-Suzuki (there is no cheap way to sort +// it in the lazy-disambiguation algorithm). +template<> void init_gor1(libre2c::pzsimctx_t& ctx); namespace libre2c { -/* note [lazy computation and caching of precedence] - * - * Eagerly computing precedence values on each step for each pair of closure - * states is a waste of time: most of these values are not needed, because - * RE may be unambigous, or the given input string may be unambigous, or even - * if there is ambiguity, it may take only a few comparisons to resolve. All - * the rest is wasted effort. - * - * We can avoid it by delaying precedence computation until necessary, and - * then unwinding all the steps backwards, computing precedence for each step - * and caching the computed values (so that the same pair of histories is not - * compared twice). It is still the same incremental comparison as with - * precedence matrices: we compare step by step, going from the fork frame to - * the join frame. The result of comparison on each step is folded to a triple - * of numbers and recorded in cache. It is important that we do record each - * step, not just the final step, because the next pair of ambiguous histories - * may unwind to the same pair of prefixes that was compared before. - * - * For all this to work, it is necessary that we are able to store all history - * until the end, because at any step we might need to unwind an arbitrary - * number of steps back. We also need to address individual subhistories - * efficiently in order to use them as keys in the cache. All this is achieved - * by storing history in the form of a trie and addressing individual - * histories by indices in that trie. We also use trie to compute the final - * tag values (instead of storing tags in registers at each step). - */ - -static void make_step(pzsimctx_t &, uint32_t); -static void make_final_step(pzsimctx_t &); -template static int32_t zprecedence1(ctx_t &ctx - , hidx_t idx1, hidx_t idx2, int32_t &prec1, int32_t &prec2); -template static int32_t zprecedence2(ctx_t &ctx - , hidx_t idx1, hidx_t idx2, int32_t &prec1, int32_t &prec2); +// note [lazy computation and caching of precedence] +// +// Eagerly computing precedence values on each step for each pair of closure states is a waste of +// time: most of these values are not needed, because regexp may be unambigous, or the given input +// string may be unambigous, or even if there is ambiguity, it may take only a few comparisons to +// resolve. All the rest is wasted effort. +// +// We can avoid it by delaying precedence computation until necessary, and then unwinding all the +// steps backwards, computing precedence for each step and caching the computed values (so that the +// same pair of histories is not compared twice). It is still the same incremental comparison as +// with precedence matrices: we compare step by step, going from the fork frame to the join frame. +// The result of comparison on each step is folded to a triple of numbers and recorded in cache. It +// is important that we do record each step, not just the final step, because the next pair of +// ambiguous histories may unwind to the same pair of prefixes that was compared before. +// +// For all this to work, it is necessary that we are able to store all history until the end, +// because at any step we might need to unwind an arbitrary number of steps back. We also need to +// address individual subhistories efficiently in order to use them as keys in the cache. All this +// is achieved by storing history in the form of a trie and addressing individual histories by +// indices in that trie. We also use trie to compute the final tag values (instead of storing tags +// in registers at each step). + +static void make_step(pzsimctx_t&, uint32_t); +static void make_final_step(pzsimctx_t&); +template static int32_t zprecedence1( + ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2); +template static int32_t zprecedence2( + ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2); // we *do* want this to be inlined -static inline uint32_t get_step(const zhistory_t &hist, int32_t idx); -static inline uint32_t get_orig(const zhistory_t &hist, int32_t idx); -static inline void closure_posix(pzsimctx_t &ctx); - -int regexec_nfa_posix_trie(const regex_t *preg, const char *string - , size_t nmatch, regmatch_t pmatch[], int) -{ - pzsimctx_t &ctx = *static_cast(preg->simctx); +static inline uint32_t get_step(const zhistory_t& hist, int32_t idx); +static inline uint32_t get_orig(const zhistory_t& hist, int32_t idx); +static inline void closure_posix(pzsimctx_t& ctx); + +int regexec_nfa_posix_trie( + const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int) { + pzsimctx_t& ctx = *static_cast(preg->simctx); init(ctx, string); const conf_t c0(ctx.nfa.root, 0, HROOT); @@ -78,41 +71,33 @@ int regexec_nfa_posix_trie(const regex_t *preg, const char *string return finalize(ctx, string, nmatch, pmatch); } -void closure_posix(pzsimctx_t &ctx) -{ - if (ctx.flags & REG_GTOP) { - closure_posix_gtop(ctx); - } - else { - closure_posix_gor1(ctx); - } +void closure_posix(pzsimctx_t& ctx) { + closure_posix_gor1(ctx); } -void make_step(pzsimctx_t &ctx, uint32_t sym) -{ - const confset_t &state = ctx.state; - confset_t &reach = ctx.reach; +void make_step(pzsimctx_t& ctx, uint32_t sym) { + const confset_t& state = ctx.state; + confset_t& reach = ctx.reach; cconfiter_t b = state.begin(), e = state.end(), i; reach.clear(); uint32_t j = 0; for (i = b; i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); + DCHECK(s->status == GOR_NOPASS && s->active == 0); - if (s->type == nfa_state_t::RAN) { - for (const Range *r = s->ran.ran; r; r = r->next()) { + if (s->kind == TnfaState::Kind::RAN) { + for (const Range* r = s->ran; r; r = r->next()) { if (r->lower() <= sym && sym < r->upper()) { - const conf_t c(s->ran.out, j++, i->thist); + const conf_t c(s->out1, j++, i->thist); reach.push_back(c); break; } } - } - else if (s->type == nfa_state_t::FIN) { + } else if (s->kind == TnfaState::Kind::FIN) { ctx.marker = ctx.cursor; ctx.hidx = i->thist; ctx.rule = 0; @@ -122,16 +107,15 @@ void make_step(pzsimctx_t &ctx, uint32_t sym) ++ctx.step; } -void make_final_step(pzsimctx_t &ctx) -{ +void make_final_step(pzsimctx_t& ctx) { for (confiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) { - nfa_state_t *s = i->state; + TnfaState* s = i->state; s->clos = NOCLOS; s->arcidx = 0; - DASSERT(s->status == GOR_NOPASS && s->active == 0); + DCHECK(s->status == GOR_NOPASS && s->active == 0); - if (s->type == nfa_state_t::FIN) { + if (s->kind == TnfaState::Kind::FIN) { ctx.marker = ctx.cursor; ctx.hidx = i->thist; ctx.rule = 0; @@ -140,19 +124,18 @@ void make_final_step(pzsimctx_t &ctx) } template -int32_t zhistory_t::precedence(ctx_t &ctx - , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y - , int32_t &prec1, int32_t &prec2) -{ +int32_t zhistory_t::precedence(ctx_t& ctx, + const typename ctx_t::conf_t& x, + const typename ctx_t::conf_t& y, + int32_t& prec1, + int32_t& prec2) { return zprecedence1(ctx, x.thist, y.thist, prec1, prec2); } // see note [lazy computation and caching of precedence] template -int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2 - , int32_t &prec1, int32_t &prec2) -{ - zhistory_t::cache_t &cache = ctx.history.cache; +int32_t zprecedence1(ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2) { + zhistory_t::cache_t& cache = ctx.history.cache; int32_t prec = 0; // use the same cache entry for (x, y) and (y, x) @@ -164,7 +147,7 @@ int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2 zhistory_t::cache_t::const_iterator i = cache.find(key); if (i != cache.end()) { // use previously computed precedence values from cache - const zhistory_t::cache_entry_t &val = i->second; + const zhistory_t::cache_entry_t& val = i->second; prec1 = val.prec1; prec2 = val.prec2; prec = val.prec; @@ -172,8 +155,7 @@ int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2 std::swap(prec1, prec2); prec = -prec; } - } - else { + } else { // compute precedence values and put them into cache prec = zprecedence2(ctx, idx1, idx2, prec1, prec2); zhistory_t::cache_entry_t val = {prec1, prec2, prec}; @@ -189,16 +171,14 @@ int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2 // see note [lazy computation and caching of precedence] template -int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2 - , int32_t &prec1, int32_t &prec2) -{ +int32_t zprecedence2(ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2) { if (idx1 == idx2) { prec1 = prec2 = MAX_RHO; return 0; } - const std::vector &tags = ctx.nfa.tags; - zhistory_t &hist = ctx.history; + const std::vector& tags = ctx.nfa.tags; + zhistory_t& hist = ctx.history; int32_t prec = 0; prec1 = prec2 = MAX_RHO; @@ -215,14 +195,13 @@ int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2 tag_info_t info1 = NOINFO, info2 = NOINFO; for (; i1 != i2 && (s1 >= s || s2 >= s);) { if (s1 >= s && (i1 > i2 || s2 < s)) { - const zhistory_t::node_t &n = hist.node(i1); + const zhistory_t::node_t& n = hist.node(i1); info1 = n.info; prec1 = std::min(prec1, tags[info1.idx].height); i1 = n.pred; s1 = get_step(hist, i1); - } - else { - const zhistory_t::node_t &n = hist.node(i2); + } else { + const zhistory_t::node_t& n = hist.node(i2); info2 = n.info; prec2 = std::min(prec2, tags[info2.idx].height); i2 = n.pred; @@ -235,8 +214,7 @@ int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2 prec = zprecedence1(ctx, i1, i2, p1, p2); prec1 = std::min(prec1, p1); prec2 = std::min(prec2, p2); - } - else if (i1 != HROOT) { + } else if (i1 != HROOT) { const int32_t h = tags[hist.node(i1).info.idx].height; prec1 = std::min(prec1, h); prec2 = std::min(prec2, h); @@ -247,24 +225,20 @@ int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2 if (prec1 < prec2) return 1; // leftmost precedence - return !fork_frame ? prec - : leftprec(info1, info2, i1 == idx1, i2 == idx2); + return !fork_frame ? prec : leftprec(info1, info2, i1 == idx1, i2 == idx2); } -uint32_t get_step(const zhistory_t &hist, int32_t idx) -{ +uint32_t get_step(const zhistory_t& hist, int32_t idx) { return idx == HROOT ? 0 : hist.node(idx).step; } -uint32_t get_orig(const zhistory_t &hist, int32_t idx) -{ +uint32_t get_orig(const zhistory_t& hist, int32_t idx) { return idx == HROOT ? 0 : hist.node(idx).orig; } } // namespace libre2c -template<> void init_gor1(libre2c::pzsimctx_t &ctx) -{ +template<> void init_gor1(libre2c::pzsimctx_t& ctx) { ctx.state.clear(); libre2c::pzsimctx_t::cconfiter_t c = ctx.reach.begin(), e = ctx.reach.end(); for (; c != e; ++c) { @@ -273,4 +247,3 @@ template<> void init_gor1(libre2c::pzsimctx_t &ctx) } } // namespace re2c - diff --git a/lib/regfree.cc b/lib/regfree.cc index 005b961a..e837ba61 100644 --- a/lib/regfree.cc +++ b/lib/regfree.cc @@ -3,7 +3,7 @@ #include #include -#include "lib/regcomp_dfa_regless.h" +#include "lib/regcomp_dfa_multipass.h" #include "lib/regex.h" #include "lib/regex_impl.h" #include "lib/regoff_trie.h" @@ -14,22 +14,12 @@ #include "src/parse/ast.h" #include "src/regexp/rule.h" #include "src/regexp/tag.h" -#include "src/util/free_list.h" #include "src/util/range.h" - using namespace re2c; using namespace re2c::libre2c; -void regfree(regex_t *preg) -{ - delete preg->rmgr; - - delete &preg->nfa->charset; - delete &preg->nfa->rules; - delete &preg->nfa->tags; - delete preg->nfa; - +void regfree(regex_t* preg) { if (preg->flags & REG_TSTRING) { delete[] preg->tstring.string; } else if (preg->flags & REG_SUBHIST) { @@ -39,49 +29,39 @@ void regfree(regex_t *preg) } if (preg->flags & REG_NFA) { + delete preg->nfa; if ((preg->flags & REG_TRIE) && (preg->flags & REG_LEFTMOST)) { delete static_cast(preg->simctx); } else if (preg->flags & REG_TRIE) { delete static_cast(preg->simctx); } else if (preg->flags & REG_LEFTMOST) { delete static_cast(preg->simctx); - } else if (preg->flags & REG_KUKLEWICZ) { - delete static_cast(preg->simctx); } else { delete static_cast(preg->simctx); } } else { - delete[] preg->char2class; - delete[] preg->dfa->finvers; - - delete &preg->dfa->mtagvers; - delete &preg->dfa->tcpool; - delete preg->dfa; - - if (preg->flags & REG_REGLESS) { - if (preg->flags & REG_LEFTMOST) { - delete static_cast(preg->rldfa->ctx); - } else { - delete static_cast(preg->rldfa->ctx); + if (preg->flags & REG_MULTIPASS) { + delete preg->mptdfa->opts; + delete[] preg->mptdfa->result; + delete preg->mptdfa; + if (preg->flags & REG_SUBHIST) { + delete preg->regtrie; } - delete preg->rldfa->opts; - delete[] preg->rldfa->result; - delete preg->rldfa; - } - if (preg->flags & REG_TSTRING) { - // t-string construction does not use this - } else if (preg->flags & REG_SUBHIST) { - delete preg->regtrie; } else { - delete[] preg->regs; + delete &preg->dfa->dfa_alc; + delete preg->dfa; + if (preg->flags & REG_TSTRING) { + // t-string construction does not use this + } else if (preg->flags & REG_SUBHIST) { + delete preg->regtrie; + } else { + delete[] preg->regs; + } } + delete[] preg->char2class; } - - AST::flist.clear(); - RangeSuffix::freeList.clear(); } -void regfreesub(subhistory_t *history) -{ +void regfreesub(subhistory_t* history) { free(history); } diff --git a/lib/regoff_trie.h b/lib/regoff_trie.h index 2e463a59..d50cc649 100644 --- a/lib/regoff_trie.h +++ b/lib/regoff_trie.h @@ -2,15 +2,14 @@ #define _RE2C_LIB_REGOFF_TRIE_ #include -#include "regex.h" #include #include #include -#include "src/debug/debug.h" +#include "regex.h" +#include "src/util/check.h" #include "src/util/forbid_copy.h" - namespace re2c { namespace libre2c { @@ -21,26 +20,24 @@ struct regoff_trie_t { }; size_t nlists; - size_t *lists; - size_t *count; + size_t* lists; + size_t* count; size_t size; size_t capacity; - node_t *storage; + node_t* storage; explicit regoff_trie_t(size_t nlists) - : nlists(nlists) - , lists(new size_t[nlists]) - , count(new size_t[nlists]) - , size(0) - , capacity(nlists * 2) - , storage(new node_t[capacity]) - { + : nlists(nlists), + lists(new size_t[nlists]), + count(new size_t[nlists]), + size(0), + capacity(nlists * 2), + storage(new node_t[capacity]) { clear(); } - ~regoff_trie_t() - { + ~regoff_trie_t() { delete[] lists; delete[] count; delete[] storage; @@ -53,22 +50,20 @@ struct regoff_trie_t { } inline void grow(size_t new_capacity) { - DASSERT(new_capacity > capacity); - node_t *new_storage = new node_t[new_capacity]; + DCHECK(new_capacity > capacity); + node_t* new_storage = new node_t[new_capacity]; memcpy(new_storage, storage, capacity * sizeof(node_t)); delete[] storage; storage = new_storage; capacity = new_capacity; } - inline void copy(size_t lhs, size_t rhs) - { + inline void copy(size_t lhs, size_t rhs) { lists[lhs] = lists[rhs]; count[lhs] = count[rhs]; } - inline void set(size_t lhs, regoff_t off) - { + inline void set(size_t lhs, regoff_t off) { if (count[lhs] == 0) { add(lhs, off); } else { @@ -79,8 +74,7 @@ struct regoff_trie_t { } } - inline void add(size_t lhs, regoff_t off) - { + inline void add(size_t lhs, regoff_t off) { if (size >= capacity) { grow(capacity * 2); } diff --git a/lib/stubs.cc b/lib/stubs.cc index 0dae4b9e..4784bb7c 100644 --- a/lib/stubs.cc +++ b/lib/stubs.cc @@ -1,2 +1,2 @@ -extern const char *help; -const char *help = ""; +extern const char* help; +const char* help = ""; diff --git a/lib/test.cc b/lib/test.cc index 72c3d0c8..5cdbf24a 100644 --- a/lib/test.cc +++ b/lib/test.cc @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -8,12 +8,12 @@ #include "lib/regex.h" #include "lib/test_helper.h" -#include "src/util/c99_stdint.h" +#include "src/util/check.h" - -static int test(int flags, const char *pattern, const char *string, - const char *expected = NULL) -{ +static int test(int flags, + const char* pattern, + const char* string, + const char* expected = nullptr) { std::vector > submatch; if (!parse_submatch(expected, submatch)) { fprintf(stderr, "failed to parse submatch string: %s\n", expected); @@ -24,26 +24,26 @@ static int test(int flags, const char *pattern, const char *string, static uint32_t total = 0; static uint32_t failed = 0; - const char *prefix = flags & REG_NFA ? "NFA" : "DFA"; + const char* prefix = flags & REG_NFA ? "NFA" : "DFA"; bool with_hist = flags & REG_SUBHIST; regex_t re; - regmatch_t *pmatch = with_hist ? NULL : new regmatch_t[nmatch]; - subhistory_t *psubhist = NULL; + regmatch_t* pmatch = with_hist ? nullptr : new regmatch_t[nmatch]; + subhistory_t* psubhist = nullptr; int result; result = regcomp(&re, pattern, flags); if (result != 0) { - fprintf(stderr, "%s: regcomp() failed for RE %s\n", prefix, pattern); + fprintf(stderr, "%s: regcomp() failed for regexp %s\n", prefix, pattern); goto end; } // run multiple times to ensure everything gets cleaned up properly - static const uint32_t NRUNS = 2; + static constexpr uint32_t NRUNS = 2; for (uint32_t i = 0; i < NRUNS; ++i) { if (with_hist) { - // With subhistories every regexec() call may allocate offsets. - // Clean them up at the beginning of the loop, so that the offsets - // remain available for inspection after the last iteration. + // With subhistories every regexec() call may allocate offsets. Clean them up at the + // beginning of the loop, so that the offsets remain available for inspection after the + // last iteration. regfreesub(psubhist); psubhist = regparse(&re, string, nmatch); result = psubhist ? 0 : 1; @@ -55,62 +55,67 @@ static int test(int flags, const char *pattern, const char *string, // failure was expected => it's a success result = 0; } else if (nmatch > 0) { - fprintf(stderr, "%s: regexec() failed for RE %s and string %s\n" - , prefix, pattern, string); + fprintf(stderr, + "%s: regexec() failed for regexp %s and string %s\n", + prefix, pattern, string); goto end; } } else if (nmatch == 0) { // regexed must have failed, something is wrong result = REG_NOMATCH; - fprintf(stderr, "%s: regexec() didn't fail while it should" - " for RE %s and string %s\n", prefix, pattern, string); + fprintf(stderr, + "%s: regexec() didn't fail while it should for regexp %s and string %s\n", + prefix, pattern, string); goto end; } } - assert(nmatch == 0 || nmatch == re.re_nsub); + CHECK(nmatch == 0 || nmatch == re.re_nsub); for (uint32_t i = 0; i < nmatch; ++i) { - const std::vector &expect_history = submatch[i]; + const std::vector& expect_history = submatch[i]; const size_t expect_size = expect_history.size() / 2; if (!with_hist) { // Check only the last pair of offsets for each capturing group. const regoff_t expect_so = expect_history[2 * expect_size - 2]; const regoff_t expect_eo = expect_history[2 * expect_size - 1]; - const regmatch_t &actual = pmatch[i]; + const regmatch_t& actual = pmatch[i]; if (expect_so != actual.rm_so || expect_eo != actual.rm_eo) { result = 1; - fprintf(stderr, "%s: incorrect submatch for RE %s and string %s:\n" - "\tpmatch[%u].rm_so = %ld (expected %ld)\n" - "\tpmatch[%u].rm_eo = %ld (expected %ld)\n", - prefix, pattern, string, - i, (long)actual.rm_so, (long)expect_so, - i, (long)actual.rm_eo, (long)expect_eo); + fprintf(stderr, + "%s: incorrect submatch for regexp %s and string %s:\n" + "\tpmatch[%u].rm_so = %td (expected %td)\n" + "\tpmatch[%u].rm_eo = %td (expected %td)\n", + prefix, pattern, string, + i, actual.rm_so, expect_so, + i, actual.rm_eo, expect_eo); goto end; } } else { // Check the full history of each capturing group. - const subhistory_t &actual_history = psubhist[i]; + const subhistory_t& actual_history = psubhist[i]; if (actual_history.size != expect_size) { result = 1; - fprintf(stderr, "%s: wrong subhistory-%u length for RE %s and " - "string %s: expected %lu, have %lu\n", - prefix, i, pattern, string, expect_size, actual_history.size); + fprintf(stderr, + "%s: wrong subhistory-%u length for regexp %s and string %s: " + "expected %zd, have %zd\n", + prefix, i, pattern, string, expect_size, actual_history.size); goto end; } for (uint32_t j = 0; j < actual_history.size; ++j) { const regoff_t expect_so = expect_history[2 * j]; const regoff_t expect_eo = expect_history[2 * j + 1]; - const regmatch_t &actual = actual_history.offs[j]; + const regmatch_t& actual = actual_history.offs[j]; if (expect_so != actual.rm_so || expect_eo != actual.rm_eo) { result = 1; - fprintf(stderr, "%s: incorrect submatch for RE %s and string %s:\n" - "\tpmatch[%u][%u].rm_so = %ld (expected %ld)\n" - "\tpmatch[%u][%u].rm_eo = %ld (expected %ld)\n", - prefix, pattern, string, - i, j, (long)actual.rm_so, (long)expect_so, - i, j, (long)actual.rm_eo, (long)expect_eo); + fprintf(stderr, + "%s: incorrect submatch for regexp %s and string %s:\n" + "\tpmatch[%u][%u].rm_so = %td (expected %td)\n" + "\tpmatch[%u][%u].rm_eo = %td (expected %td)\n", + prefix, pattern, string, + i, j, actual.rm_so, expect_so, + i, j, actual.rm_eo, expect_eo); goto end; } } @@ -134,27 +139,25 @@ end: return result; } -static int test_tstring(const char *pattern, const char *string, const char *expected) -{ +static int test_tstring(const char* pattern, const char* string, const char* expected) { regex_t re; - const tstring_t *tstr; + const tstring_t* tstr = nullptr; std::ostringstream s; int result; - result = regcomp(&re, pattern, REG_REGLESS | REG_TSTRING); + result = regcomp(&re, pattern, REG_MULTIPASS | REG_TSTRING); if (result != 0) { - fprintf(stderr, "[t-string] regcomp() failed for RE %s\n", pattern); + fprintf(stderr, "[t-string] regcomp() failed for regexp %s\n", pattern); goto end; } // run multiple times to ensure everything gets cleaned up properly - static const uint32_t NRUNS = 2; + static constexpr uint32_t NRUNS = 2; for (uint32_t i = 0; i < NRUNS; ++i) { tstr = regtstring(&re, string); - if (tstr == NULL) { + if (tstr == nullptr) { result = 1; - fprintf(stderr, "regtstring() failed for RE %s and string %s\n", - pattern, string); + fprintf(stderr, "regtstring() failed for regexp %s and string %s\n", pattern, string); goto end; } } @@ -163,7 +166,7 @@ static int test_tstring(const char *pattern, const char *string, const char *exp for (size_t i = 0; i < tstr->length; ++i) { const tchar_t c = tstr->string[i]; if (c < TAG_BASE) { - assert(c < 0xff); // expect 1-byte characters + CHECK(c < 0xff); // expect 1-byte characters s << static_cast(c); } else { s << c - TAG_BASE; @@ -173,8 +176,9 @@ static int test_tstring(const char *pattern, const char *string, const char *exp if (strcmp(expected, s.str().c_str()) != 0) { result = 1; - fprintf(stderr, "incorrect t-string for RE %s and string %s:\n" - "\texpect: %s\n\tactual: %s\n", pattern, string, expected, s.str().c_str()); + fprintf(stderr, + "incorrect t-string for regexp %s and string %s:\n\texpect: %s\n\tactual: %s\n", + pattern, string, expected, s.str().c_str()); } end: @@ -182,70 +186,60 @@ end: return result; } -static int test_all_posix(int f) -{ +static int test_all_posix(int f) { int e = 0; e |= test(f, "(a+(c+))|(b+(d+))", "ac", "(0,2),(0,2),(1,2),(?,?),(?,?)"); e |= test(f, "(aaaa|aaa|a)+", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)"); e |= test(f, "(aaaa|aaa|a){3,}", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)"); - if (!(f & REG_BACKWARD)) { - // expected failures for Cox algorithm + e |= test(f, "(aaaa|aaa|a){3,4}", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)"); - e |= test(f, "(aaaa|aaa|a){3,4}", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)"); + e |= test(f, "(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa", + "(0,15),(0,6)(6,12)(12,14)(14,15)"); - e |= test(f, "(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa", - "(0,15),(0,6)(6,12)(12,14)(14,15)"); + e |= test(f, "(aaaaa?a?|aa?){1,4}", "aaaaaaaaaaaaaaa", "(0,15),(0,6)(6,12)(12,14)(14,15)"); - e |= test(f, "(aaaaa?a?|aa?){1,4}", "aaaaaaaaaaaaaaa", - "(0,15),(0,6)(6,12)(12,14)(14,15)"); + e |= test(f, "(((a){3,4}a?)()a|aa?){1,4}", "aaaaaaaaaaaaaaa", + "(0,15),(0,6)(6,12)(12,14)(14,15),(0,5)(6,11)(?,?)(?,?)," + "(0,1)(1,2)(2,3)(3,4)(6,7)(7,8)(8,9)(9,10)(?,?)(?,?),(5,5)(11,11)(?,?)(?,?)"); - e |= test(f, "(((a){3,4}a?)()a|aa?){1,4}", "aaaaaaaaaaaaaaa", - "(0,15),(0,6)(6,12)(12,14)(14,15),(0,5)(6,11)(?,?)(?,?)," - "(0,1)(1,2)(2,3)(3,4)(6,7)(7,8)(8,9)(9,10)(?,?)(?,?),(5,5)(11,11)(?,?)(?,?)"); + e |= test(f, "(((aaaa?|a?)){1,4})+", "aaaaaaaaaa", + "(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10)"); - e |= test(f, "(((aaaa?|a?)){1,4})+", "aaaaaaaaaa", - "(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10)"); + e |= test(f, "((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa", + "(0,14),(0,14),(0,6)(6,12)(12,13)(13,14),(0,6)(6,12)(12,13)(13,14)," + "(0,1)(1,2)(2,3)(6,7)(7,8)(8,9)(?,?)(?,?),(3,4)(9,10)(?,?)(?,?)," + "(?,?)(?,?)(?,?)(?,?),(4,5)(10,11)(?,?)(?,?),(?,?)(?,?)(?,?)(?,?)"); - e |= test(f, "((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa", - "(0,14),(0,14),(0,6)(6,12)(12,13)(13,14),(0,6)(6,12)(12,13)(13,14)," - "(0,1)(1,2)(2,3)(6,7)(7,8)(8,9)(?,?)(?,?),(3,4)(9,10)(?,?)(?,?)," - "(?,?)(?,?)(?,?)(?,?),(4,5)(10,11)(?,?)(?,?),(?,?)(?,?)(?,?)(?,?)"); - } - if (!((f & REG_BACKWARD) && (f & REG_GTOP))) { - // expected failures for Cox algorithm + e |= test(f, "(((((a){3,3}a?|a?)){0,4})?)*", "aaaaaaaaaa", + "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10)," + "(0,4)(4,8)(8,9)(9,10),(0,1)(1,2)(2,3)(4,5)(5,6)(6,7)(?,?)(?,?)"); - e |= test(f, "(((((a){3,3}a?|a?)){0,4})?)*", "aaaaaaaaaa", - "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10)," - "(0,4)(4,8)(8,9)(9,10),(0,1)(1,2)(2,3)(4,5)(5,6)(6,7)(?,?)(?,?)"); + e |= test(f, "(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa", + "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10)," + "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(?,?)(?,?),(?,?),(?,?),(?,?),(?,?)"); + + e |= test(f, "((a?|a?(a?a?)((())+)+))*", "aaaaaa", + "(0,6),(0,3)(3,6),(0,3)(3,6),(1,3)(4,6),(3,3)(6,6),(3,3)(6,6),(3,3)(6,6)"); - e |= test(f, "(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa", - "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10)," - "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(?,?)(?,?),(?,?),(?,?),(?,?),(?,?)"); - } - if (!((f & REG_BACKWARD) && !(f & REG_GTOP))) { - // expected failures for Cox algorithm - e |= test(f, "((a?|a?(a?a?)((())+)+))*", "aaaaaa", - "(0,6),(0,3)(3,6),(0,3)(3,6),(1,3)(4,6),(3,3)(6,6),(3,3)(6,6),(3,3)(6,6)"); - } e |= test(f, "(((a?a)){2,3})*", "aaaa", "(0,4),(0,4),(0,2)(2,4),(0,2)(2,4)"); e |= test(f, "(((aa?a)((aaa)+))+)+", "aaaaaaaaaa", - "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)"); + "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)"); e |= test(f, "(((aaa?)((aaa)+))+)+", "aaaaaaaaaa", - "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)"); + "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)"); e |= test(f, "(((aaa?)((aaa){1,3})){1,2})*", "aaaaaaaaaa", - "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)"); + "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)"); e |= test(f, "((((aaa?)(aaa){1,3})){1,2})*", "aaaaaaaaaa", - "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10)"); + "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10)"); e |= test(f, "((((aaa?)((a){3,3}){1,3})){1,2})*", "aaaaaaaaaa", - "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10)," - "(2,3)(3,4)(4,5)(7,8)(8,9)(9,10)"); + "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10)," + "(2,3)(3,4)(4,5)(7,8)(8,9)(9,10)"); e |= test(f, "(a|aa)*", "", "(0,0),(?,?)"); e |= test(f, "(a|aa)*", "a", "(0,1),(0,1)"); @@ -279,25 +273,19 @@ static int test_all_posix(int f) e |= test(f, "((y?){2,3}){2}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)"); - e |= test(f, "((y?){2,3}){2,3}", "yyyyy", - "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)"); + e |= test(f, "((y?){2,3}){2,3}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)"); e |= test(f, "((y?){2,3})*", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)"); - e |= test(f, "((y?){3}){2}", "yyyyy", - "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); + e |= test(f, "((y?){3}){2}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); - e |= test(f, "((y?){3}){2,3}", "yyyyy", - "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); + e |= test(f, "((y?){3}){2,3}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); - e |= test(f, "((y?){3})*", "yyyyy", - "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); + e |= test(f, "((y?){3})*", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); - e |= test(f, "((y?)*){2}", "yyyyy", - "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); + e |= test(f, "((y?)*){2}", "yyyyy", "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); - e |= test(f, "((y?)*){2,3}", "yyyyy", - "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); + e |= test(f, "((y?)*){2,3}", "yyyyy", "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)"); e |= test(f, "((y?)*)*", "yyyyy", "(0,5),(0,5),(0,1)(1,2)(2,3)(3,4)(4,5)"); @@ -306,12 +294,8 @@ static int test_all_posix(int f) e |= test(f, "((a)|(b))*", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)"); e |= test(f, "((a?)|(b?))*", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)"); e |= test(f, "((a?)|(b?))*", "ba", "(0,2),(0,1)(1,2),(?,?)(1,2),(0,1)(?,?)"); - - if (!((f & REG_BACKWARD) && !(f & REG_GTOP))) { - e |= test(f, "((a?)|(b?)){2,3}", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)"); - } - e |= test(f, "((a?)|(b?)){3}", "ab", - "(0,2),(0,1)(1,2)(2,2),(0,1)(?,?)(2,2),(?,?)(1,2)(?,?)"); + e |= test(f, "((a?)|(b?)){2,3}", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)"); + e |= test(f, "((a?)|(b?)){3}", "ab", "(0,2),(0,1)(1,2)(2,2),(0,1)(?,?)(2,2),(?,?)(1,2)(?,?)"); e |= test(f, "y{3}", "yyy", "(0,3)"); e |= test(f, "y{0,2}", "", "(0,0)"); @@ -377,16 +361,16 @@ static int test_all_posix(int f) e |= test(f, "((((((x))))))", "x", "(0,1),(0,1),(0,1),(0,1),(0,1),(0,1),(0,1)"); e |= test(f, "((((((x))))))*", "xx", - "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)"); + "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)"); e |= test(f, "a?(ab|ba)*", - "abababababababababababababababababababab" - "ababababababababababababababababababababa", - "(0,81)," - "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)" - "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)" - "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)" - "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)"); + "abababababababababababababababababababab" + "ababababababababababababababababababababa", + "(0,81)," + "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)" + "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)" + "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)" + "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)"); e |= test(f, "a*a*a*a*a*b", "aaaaaaaaab", "(0,10)"); e |= test(f, "a[b]+bc", "abbc", "(0,4)"); @@ -425,8 +409,7 @@ static int test_all_posix(int f) // categorize e |= test(f, "(a*)(ab)*(b*)", "abc", "(0,2),(0,1),(?,?),(1,2)"); - e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba", - "(0,3),(0,2),(0,0),(0,2),(2,3),(2,2),(2,3)"); + e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba", "(0,3),(0,2),(0,0),(0,2),(2,3),(2,2),(2,3)"); e |= test(f, "(...?.?)*", "xxxxxx", "(0,6),(0,4)(4,6)"); e |= test(f, "(a|ab)(bc|c)", "abcabc", "(0,3),(0,2),(2,3)"); @@ -437,8 +420,7 @@ static int test_all_posix(int f) e |= test(f, "(a(b)?)+", "aba", "(0,3),(0,2)(2,3),(1,2)(?,?)"); e |= test(f, ".*(.*)", "ab", "(0,2),(2,2)"); - e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab", - "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)"); + e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab", "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)"); // forcedassoc e |= test(f, "(a|ab)(c|bcd)", "abcd", "(0,4),(0,1),(1,4)"); @@ -520,8 +502,7 @@ static int test_all_posix(int f) e |= test(f, "((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3)"); e |= test(f, "((..)*(...)*)*", "xxx", "(0,3),(0,3),(?,?),(0,3)"); - e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx", - "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)"); + e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)"); // nullsubexpr e |= test(f, "(a*)*", "a", "(0,1),(0,1)"); @@ -596,19 +577,16 @@ static int test_all_posix(int f) e |= test(f, "(aa*|aaa*)(aa*|aaa*)", "aaaaaa", "(0,6),(0,5),(5,6)"); e |= test(f, "(aa*|aaa*){2}", "aaaaaa", "(0,6),(0,5)(5,6)"); e |= test(f, "((aa)*|(aaa)*)((aa)*|(aaa)*)", "aaaaaa", - "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)"); + "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)"); e |= test(f, "((aa)*|(aaa)*){2}", "aaaaaa", - "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)"); - e |= test(f, "(aa)*|(aaa)*", "aaaaaa", - "(0,6),(0,2)(2,4)(4,6),(?,?)"); - - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,3)(3,4)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,4)(4,6)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", - "(0,8),(0,3)(3,7)(7,8)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", - "(0,10),(0,4)(4,8)(8,10)"); + "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)"); + e |= test(f, "(aa)*|(aaa)*", "aaaaaa", "(0,6),(0,2)(2,4)(4,6),(?,?)"); + + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,3)(3,4)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,4)(4,6)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", "(0,8),(0,3)(3,7)(7,8)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", "(0,10),(0,4)(4,8)(8,10)"); e |= test(f, "(y){2}", ""); e |= test(f, "(y){2}", "y"); @@ -675,66 +653,62 @@ static int test_all_posix(int f) e |= test(f, "((..)|(.))", "aa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aa", - "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)"); + "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aa"); e |= test(f, "((..)|(.)){1}", "aa", "(0,2),(0,2),(0,2),(?,?)"); - e |= test(f, "((..)|(.)){2}", "aa", - "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)"); + e |= test(f, "((..)|(.)){2}", "aa", "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)"); e |= test(f, "((..)|(.)){3}", "aa"); e |= test(f, "((..)|(.))*", "aa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))", "aaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaa", - "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)"); + "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaa", - "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)"); - e |= test(f, "((..)|(.)){1}", "aaa", - "(0,2),(0,2),(0,2),(?,?)"); - e |= test(f, "((..)|(.)){2}", "aaa", - "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); + "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)"); + e |= test(f, "((..)|(.)){1}", "aaa", "(0,2),(0,2),(0,2),(?,?)"); + e |= test(f, "((..)|(.)){2}", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); e |= test(f, "((..)|(.)){3}", "aaa", - "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)"); - e |= test(f, "((..)|(.))*", "aaa", - "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); + "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)"); + e |= test(f, "((..)|(.))*", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); e |= test(f, "((..)|(.))", "aaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaaa", - "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); + "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaa", - "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)"); + "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)"); e |= test(f, "((..)|(.)){1}", "aaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.)){2}", "aaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.)){3}", "aaaa", - "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)"); + "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)"); e |= test(f, "((..)|(.))*", "aaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.))", "aaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaaaa", - "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); + "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaa", - "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)"); + "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)"); e |= test(f, "((..)|(.)){1}", "aaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.)){2}", "aaaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.)){3}", "aaaaa", - "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); + "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); e |= test(f, "((..)|(.))*", "aaaaa", - "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); + "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); e |= test(f, "((..)|(.))", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaaaaa", - "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); + "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaaa", - "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)"); + "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)"); e |= test(f, "((..)|(.)){1}", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.)){2}", "aaaaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.)){3}", "aaaaaa", - "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); + "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); e |= test(f, "((..)|(.))*", "aaaaaa", - "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); + "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); e |= test(f, "X(.?){0,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){1,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); @@ -744,8 +718,7 @@ static int test_all_posix(int f) e |= test(f, "X(.?){5,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){6,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){7,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); - e |= test(f, "X(.?){8,}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){8,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); e |= test(f, "X(.?){0,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){1,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){2,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); @@ -754,8 +727,7 @@ static int test_all_posix(int f) e |= test(f, "X(.?){5,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){6,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){7,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); - e |= test(f, "X(.?){8,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){8,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); e |= test(f, "(a|ab|c|bcd){0,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)"); e |= test(f, "(a|ab|c|bcd){1,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)"); @@ -798,59 +770,53 @@ static int test_all_posix(int f) if (!(f & REG_NFA)) { e |= test(f, "((a?){1,300})*", - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", - "(0,50),(0,50)," - "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)" - "(10,11)(11,12)(12,13)(13,14)(14,15)(15,16)(16,17)(17,18)(18,19)(19,20)" - "(20,21)(21,22)(22,23)(23,24)(24,25)(25,26)(26,27)(27,28)(28,29)(29,30)" - "(30,31)(31,32)(32,33)(33,34)(34,35)(35,36)(36,37)(37,38)(38,39)(39,40)" - "(40,41)(41,42)(42,43)(43,44)(44,45)(45,46)(46,47)(47,48)(48,49)(49,50)"); - } - else if (!(f & (REG_SLOWPREC | REG_KUKLEWICZ))) { + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "(0,50),(0,50)," + "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)" + "(10,11)(11,12)(12,13)(13,14)(14,15)(15,16)(16,17)(17,18)(18,19)(19,20)" + "(20,21)(21,22)(22,23)(23,24)(24,25)(25,26)(26,27)(27,28)(28,29)(29,30)" + "(30,31)(31,32)(32,33)(33,34)(34,35)(35,36)(36,37)(37,38)(38,39)(39,40)" + "(40,41)(41,42)(42,43)(43,44)(44,45)(45,46)(46,47)(47,48)(48,49)(49,50)"); + } else if (!(f & REG_SLOWPREC)) { e |= test(f, "((a?){1,1000})*", "aaaa", "(0,4),(0,4),(3,4)"); e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "", - "(0,0),(0,0),(0,0),(0,0),(0,0),(?,?),(0,0),(0,0)"); - - if (!((f & REG_BACKWARD) && !(f & REG_GTOP))) { - // expected failures for Cox algorithm + "(0,0),(0,0),(0,0),(0,0),(0,0),(?,?),(0,0),(0,0)"); - e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa", - "(0,3),(0,3),(0,3),(0,3),(0,3),(?,?),(0,3),(2,3)"); + e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa", + "(0,3),(0,3),(0,3),(0,3),(0,3),(?,?),(0,3),(2,3)"); - e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa", - "(0,5),(0,5),(0,5),(0,5),(0,5),(?,?),(0,5),(4,5)"); - } + e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa", + "(0,5),(0,5),(0,5),(0,5),(0,5),(?,?),(0,5),(4,5)"); } e |= test(f, "((((a?)+)|(aa))+)", "aaa", - "(0,3),(0,3),(0,3),(0,3),(0,1)(1,2)(2,3),(?,?)"); + "(0,3),(0,3),(0,3),(0,3),(0,1)(1,2)(2,3),(?,?)"); e |= test(f, "(((aa)|((a?)+))+)", "aaa", - "(0,3),(0,3),(0,3),(?,?),(0,3),(0,1)(1,2)(2,3)"); + "(0,3),(0,3),(0,3),(?,?),(0,3),(0,1)(1,2)(2,3)"); e |= test(f, "((a?){1,2}|(a)*)*", "aaaa", "(0,4),(0,4),(?,?),(0,1)(1,2)(2,3)(3,4)"); e |= test(f, "(((a?){2,3}|(a)*))*", "aaaaa", - "(0,5),(0,5),(0,5),(?,?),(0,1)(1,2)(2,3)(3,4)(4,5)"); + "(0,5),(0,5),(0,5),(?,?),(0,1)(1,2)(2,3)(3,4)(4,5)"); e |= test(f, "(((a?)|(a?a?))+)", "aa", "(0,2),(0,2),(0,2),(?,?),(0,2)"); e |= test(f, "((((a)*))*|((((a))*))+)*", "aa", - "(0,2),(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?),(?,?)"); + "(0,2),(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?),(?,?)"); e |= test(f, "(((a)*)*|((a)*)+)*", "aa", "(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?)"); e |= test(f, "(((a)+)|(((a)+)?))+", "aa", - "(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?)"); + "(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?)"); e |= test(f, "(a*|(a)*)*", "aa", "(0,2),(0,2),(?,?)"); return e; } -static int test_all_leftmost(int f) -{ +static int test_all_leftmost(int f) { int e = 0; e |= test(f, "a", "a", "(0,1)"); @@ -902,16 +868,16 @@ static int test_all_leftmost(int f) e |= test(f, "((((((x))))))", "x", "(0,1),(0,1),(0,1),(0,1),(0,1),(0,1),(0,1)"); e |= test(f, "((((((x))))))*", "xx", - "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)"); + "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)"); e |= test(f, "a?(ab|ba)*", - "abababababababababababababababababababab" - "ababababababababababababababababababababa", - "(0,81)," - "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)" - "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)" - "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)" - "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)"); + "abababababababababababababababababababab" + "ababababababababababababababababababababa", + "(0,81)," + "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)" + "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)" + "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)" + "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)"); e |= test(f, "a*a*a*a*a*b", "aaaaaaaaab", "(0,10)"); e |= test(f, "a[b]+bc", "abbc", "(0,4)"); @@ -950,8 +916,7 @@ static int test_all_leftmost(int f) // categorize e |= test(f, "(a*)(ab)*(b*)", "abc", "(0,2),(0,1),(?,?),(1,2)"); - e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba", - "(0,3),(0,1),(0,1),(?,?),(1,3),(1,2),(2,3)"); + e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba", "(0,3),(0,1),(0,1),(?,?),(1,3),(1,2),(2,3)"); e |= test(f, "(...?.?)*", "xxxxxx", "(0,6),(0,4)(4,6)"); e |= test(f, "(a|ab)(bc|c)", "abcabc", "(0,3),(0,1),(1,3)"); @@ -962,8 +927,7 @@ static int test_all_leftmost(int f) e |= test(f, "(a(b)?)+", "aba", "(0,3),(0,2)(2,3),(1,2)(?,?)"); e |= test(f, ".*(.*)", "ab", "(0,2),(2,2)"); - e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab", - "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)"); + e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab", "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)"); // forcedassoc e |= test(f, "(a|ab)(c|bcd)", "abcd", "(0,4),(0,1),(1,4)"); @@ -1042,8 +1006,7 @@ static int test_all_leftmost(int f) e |= test(f, "((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3)"); e |= test(f, "((..)*(...)*)*", "xxx", "(0,3),(0,3),(?,?),(0,3)"); - e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx", - "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)"); + e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)"); // nullsubexpr e |= test(f, "(a*)*", "a", "(0,1),(0,1)"); @@ -1121,17 +1084,16 @@ static int test_all_leftmost(int f) e |= test(f, "(aa*|aaa*)(aa*|aaa*)", "aaaaaa", "(0,6),(0,5),(5,6)"); e |= test(f, "(aa*|aaa*){2}", "aaaaaa", "(0,6),(0,5)(5,6)"); e |= test(f, "((aa)*|(aaa)*)((aa)*|(aaa)*)", "aaaaaa", - "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)"); + "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)"); e |= test(f, "((aa)*|(aaa)*){2}", "aaaaaa", - "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)"); + "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)"); e |= test(f, "(aa)*|(aaa)*", "aaaaaa", "(0,6),(0,2)(2,4)(4,6),(?,?)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,2)(2,4)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,1)(1,5)(5,6)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", "(0,8),(0,2)(2,6)(6,8)"); - e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", - "(0,10),(0,1)(1,5)(5,9)(9,10)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,2)(2,4)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,1)(1,5)(5,6)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", "(0,8),(0,2)(2,6)(6,8)"); + e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", "(0,10),(0,1)(1,5)(5,9)(9,10)"); e |= test(f, "(y){2}", ""); e |= test(f, "(y){2}", "y"); @@ -1197,65 +1159,62 @@ static int test_all_leftmost(int f) e |= test(f, "((..)|(.))", "aa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aa", - "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)"); + "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aa"); e |= test(f, "((..)|(.)){1}", "aa", "(0,2),(0,2),(0,2),(?,?)"); - e |= test(f, "((..)|(.)){2}", "aa", - "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)"); + e |= test(f, "((..)|(.)){2}", "aa", "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)"); e |= test(f, "((..)|(.)){3}", "aa"); e |= test(f, "((..)|(.))*", "aa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))", "aaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaa", - "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)"); + "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaa", - "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)"); + "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)"); e |= test(f, "((..)|(.)){1}", "aaa", "(0,2),(0,2),(0,2),(?,?)"); - e |= test(f, "((..)|(.)){2}", "aaa", - "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); + e |= test(f, "((..)|(.)){2}", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); e |= test(f, "((..)|(.)){3}", "aaa", - "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)"); - e |= test(f, "((..)|(.))*", "aaa", - "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); + "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)"); + e |= test(f, "((..)|(.))*", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)"); e |= test(f, "((..)|(.))", "aaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaaa", - "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); + "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaa", - "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)"); + "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)"); e |= test(f, "((..)|(.)){1}", "aaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.)){2}", "aaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.)){3}", "aaaa", - "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)"); + "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)"); e |= test(f, "((..)|(.))*", "aaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.))", "aaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaaaa", - "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); + "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaa", - "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)"); + "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)"); e |= test(f, "((..)|(.)){1}", "aaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.)){2}", "aaaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.)){3}", "aaaaa", - "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); + "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); e |= test(f, "((..)|(.))*", "aaaaa", - "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); + "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)"); e |= test(f, "((..)|(.))", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))", "aaaaaa", - "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); + "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)"); e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaaa", - "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)"); + "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)"); e |= test(f, "((..)|(.)){1}", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)"); e |= test(f, "((..)|(.)){2}", "aaaaaa", - "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); + "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)"); e |= test(f, "((..)|(.)){3}", "aaaaaa", - "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); + "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); e |= test(f, "((..)|(.))*", "aaaaaa", - "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); + "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)"); e |= test(f, "X(.?){0,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){1,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); @@ -1266,26 +1225,16 @@ static int test_all_leftmost(int f) e |= test(f, "X(.?){6,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); e |= test(f, "X(.?){7,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)"); - e |= test(f, "X(.?){8,}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){0,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){1,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){2,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){3,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){4,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){5,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){6,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){7,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); - e |= test(f, "X(.?){8,8}Y", "X1234567Y", - "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){8,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){0,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){1,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){2,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){3,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){4,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){5,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){6,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){7,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); + e |= test(f, "X(.?){8,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)"); e |= test(f, "(a|ab|c|bcd){0,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)"); e |= test(f, "(a|ab|c|bcd){1,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)"); @@ -1327,20 +1276,15 @@ static int test_all_leftmost(int f) e |= test(f, "(ab|a)(bcd|c)(d|.*)", "abcd", "(0,4),(0,2),(2,3),(3,4)"); // other - if (!(f & REG_STADFA)) { - // TODO: Find out why this test takes a long time on staDFA. - - std::string s = "(0,4),(0,4),(0,1)(1,2)(2,3)(3,4)"; - for (size_t i = 0; i < 1000 - 4; ++i) s += "(4,4)"; + std::string s = "(0,4),(0,4),(0,1)(1,2)(2,3)(3,4)"; + for (size_t i = 0; i < 1000 - 4; ++i) s += "(4,4)"; - e |= test(f, "((a?){1,1000})*", "aaaa", s.c_str()); - } + e |= test(f, "((a?){1,1000})*", "aaaa", s.c_str()); return e; } -static int test_all_tstring() -{ +static int test_all_tstring() { int e = 0; e |= test_tstring("a", "a", "1 a 2 "); @@ -1360,41 +1304,27 @@ static int test_all_tstring() return e; } -int main() -{ +int main() { int e = 0; e |= test_all_posix(0); - e |= test_all_posix(REG_STADFA); - e |= test_all_posix(REG_REGLESS); + e |= test_all_posix(REG_MULTIPASS); e |= test_all_posix(REG_SUBHIST); - e |= test_all_posix(REG_SUBHIST | REG_REGLESS); + e |= test_all_posix(REG_SUBHIST | REG_MULTIPASS); e |= test_all_leftmost(REG_LEFTMOST); - e |= test_all_leftmost(REG_LEFTMOST | REG_STADFA); - e |= test_all_leftmost(REG_LEFTMOST | REG_REGLESS); + e |= test_all_leftmost(REG_LEFTMOST | REG_MULTIPASS); e |= test_all_leftmost(REG_LEFTMOST | REG_SUBHIST); - e |= test_all_leftmost(REG_LEFTMOST | REG_SUBHIST | REG_REGLESS); + e |= test_all_leftmost(REG_LEFTMOST | REG_SUBHIST | REG_MULTIPASS); e |= test_all_posix(REG_NFA); - e |= test_all_posix(REG_NFA | REG_GTOP); - - e |= test_all_posix(REG_NFA | REG_KUKLEWICZ); - e |= test_all_posix(REG_NFA | REG_KUKLEWICZ | REG_GTOP); - e |= test_all_posix(REG_NFA | REG_TRIE); - e |= test_all_posix(REG_NFA | REG_TRIE | REG_GTOP); - e |= test_all_posix(REG_NFA | REG_SLOWPREC); e |= test_all_leftmost(REG_NFA | REG_LEFTMOST); e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE); - e |= test_all_posix(REG_NFA | REG_BACKWARD); - e |= test_all_posix(REG_NFA | REG_BACKWARD | REG_GTOP); - e |= test_all_tstring(); return e; } - diff --git a/lib/test_helper.h b/lib/test_helper.h index 47dacf56..568892bf 100644 --- a/lib/test_helper.h +++ b/lib/test_helper.h @@ -6,15 +6,14 @@ // Parse test submatch result string. // -// Expected format: comma-separated lists of offset pairs in parentheses, each -// list contains as many pairs as there are repetitions for the corresponding -// capturing group. +// Expected format: comma-separated lists of offset pairs in parentheses, each list contains as many +// pairs as there are repetitions for the corresponding capturing group. // -// Example for RE "(a(b)?)*" and string "aba": "(0,3),(0,2)(2,3),(1,2)(?,?)". +// Example for regexp "(a(b)?)*" and string "aba": "(0,3),(0,2)(2,3),(1,2)(?,?)". // -// String representation of result is used instead of vectors and initializer -// lists because C++ compilers take too long to process that many vectors, and -// the compilation gets very slow. -bool parse_submatch(const char *s, std::vector > &result); +// String representation of result is used instead of vectors and initializer lists because C++ +// compilers take too long to process that many vectors, and the compilation gets very slow. +// +bool parse_submatch(const char* s, std::vector >& result); #endif // _RE2C_LIB_TEST_HELPER_ diff --git a/lib/test_helper.re b/lib/test_helper.re index 9ae6cc23..40b2d9c4 100644 --- a/lib/test_helper.re +++ b/lib/test_helper.re @@ -1,8 +1,6 @@ #include "lib/test_helper.h" - -static inline regoff_t parse_offset(const char *s, const char *e) -{ +static inline regoff_t parse_offset(const char* s, const char* e) { regoff_t n = 0; for (; s < e; ++s) { n = n * 10 + (*s - '0'); @@ -10,39 +8,37 @@ static inline regoff_t parse_offset(const char *s, const char *e) return n; } -bool parse_submatch(const char *s, std::vector > &result) -{ +bool parse_submatch(const char* s, std::vector >& result) { result.clear(); if (!s) return true; - std::vector *v; - const char *n1, *n2, *m1, *m2, *YYMARKER, *YYCURSOR = s; + std::vector* v; + const char* n1, *n2, *m1, *m2, *YYMARKER, *YYCURSOR = s; /*!stags:re2c format = "const char *@@;"; */ next: result.push_back(std::vector()); v = &result.back(); -loop: - /*!re2c - re2c:define:YYCTYPE = char; - re2c:yyfill:enable = 0; - re2c:flags:tags = 1; - - * { return false; } - "\x00" { return true; } - "," { goto next; } - - "(?,?)" { - v->push_back(-1); - v->push_back(-1); - goto loop; - } - - "(" @n1 [0-9]+ @n2 "," @m1 [0-9]+ @m2 ")" { - v->push_back(parse_offset(n1, n2)); - v->push_back(parse_offset(m1, m2)); - goto loop; - } - */ +loop: /*!re2c + re2c:define:YYCTYPE = char; + re2c:yyfill:enable = 0; + re2c:flags:tags = 1; + + * { return false; } + "\x00" { return true; } + "," { goto next; } + + "(?,?)" { + v->push_back(-1); + v->push_back(-1); + goto loop; + } + + "(" @n1 [0-9]+ @n2 "," @m1 [0-9]+ @m2 ")" { + v->push_back(parse_offset(n1, n2)); + v->push_back(parse_offset(m1, m2)); + goto loop; + } +*/ } -- cgit v1.2.3