summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/lex.h9
-rw-r--r--lib/lex.re74
-rw-r--r--lib/parse.ypp89
-rw-r--r--lib/regcomp.cc147
-rw-r--r--lib/regcomp_dfa_multipass.h271
-rw-r--r--lib/regcomp_dfa_regless.h249
-rw-r--r--lib/regex.h173
-rw-r--r--lib/regex_impl.h380
-rw-r--r--lib/regexec.cc71
-rw-r--r--lib/regexec_dfa.cc109
-rw-r--r--lib/regexec_dfa_multipass.cc (renamed from lib/regexec_dfa_regless.cc)168
-rw-r--r--lib/regexec_nfa_leftmost.cc37
-rw-r--r--lib/regexec_nfa_leftmost_trie.cc44
-rw-r--r--lib/regexec_nfa_posix.cc64
-rw-r--r--lib/regexec_nfa_posix_backward.cc632
-rw-r--r--lib/regexec_nfa_posix_kuklewicz.cc333
-rw-r--r--lib/regexec_nfa_posix_trie.cc175
-rw-r--r--lib/regfree.cc60
-rw-r--r--lib/regoff_trie.h40
-rw-r--r--lib/stubs.cc4
-rw-r--r--lib/test.cc498
-rw-r--r--lib/test_helper.h15
-rw-r--r--lib/test_helper.re54
23 files changed, 1221 insertions, 2475 deletions
diff --git a/lib/lex.h b/lib/lex.h
index a9ec531f..ceaef0c7 100644
--- a/lib/lex.h
+++ b/lib/lex.h
@@ -1,15 +1,16 @@
#ifndef _RE2C_LIB_LEX_
#define _RE2C_LIB_LEX_
+#include "lib/parse.h"
#include "src/msg/location.h"
-#include "src/regexp/re.h"
+#include "src/regexp/regexp.h"
#include "src/parse/ast.h"
namespace re2c {
-int lex(const char *&pattern);
-const AST *parse(const char *pattern);
-extern const AST *regexp;
+int lex(YYSTYPE* yylval, const uint8_t*& pattern, Ast& ast);
+const AstNode* parse(const char* pattern, Ast& ast);
+extern const AstNode* regexp;
} // namespace re2c
diff --git a/lib/lex.re b/lib/lex.re
index ebd0024f..e03bceb0 100644
--- a/lib/lex.re
+++ b/lib/lex.re
@@ -1,39 +1,34 @@
+#include <stdint.h>
#include <stdio.h>
-#include "src/util/c99_stdint.h"
-
#include "src/encoding/enc.h"
#include "src/msg/msg.h"
#include "src/parse/ast.h"
-#include "src/parse/unescape.h"
#include "src/util/range.h"
-#include "src/util/s_to_n32_unsafe.h"
+#include "src/util/string_utils.h"
#include "parse.h"
#include "lib/lex.h"
-extern YYSTYPE yylval;
namespace re2c {
-static int32_t lex_cls_chr(const char *&, uint32_t &);
+static int32_t lex_cls_chr(const uint8_t*&, uint32_t&);
/*!re2c
re2c:flags:tags = 1;
re2c:yyfill:enable = 0;
re2c:define:YYCURSOR = cur;
re2c:define:YYMARKER = mar;
- re2c:define:YYCTYPE = char;
+ re2c:define:YYCTYPE = uint8_t;
nil = "\x00";
num = [0-9]+;
*/
-int lex(const char *&cur)
-{
- /*!stags:re2c format = "const char *@@;"; */
- const char *mar, *x, *y;
- std::vector<ASTRange> cls;
+int lex(YYSTYPE* yylval, const uint8_t*& cur, Ast& ast) {
+ /*!stags:re2c format = "const uint8_t* @@;"; */
+ const uint8_t* mar, *x, *y;
bool neg = false;
uint32_t l, u;
@@ -53,33 +48,31 @@ int lex(const char *&cur)
"[" { goto cls; }
"{" @x num "}" {
- if (!s_to_u32_unsafe(x, cur - 1, yylval.bounds.min)) goto err_cnt;
- yylval.bounds.max = yylval.bounds.min;
+ if (!s_to_u32_unsafe(x, cur - 1, yylval->bounds.min)) goto err_cnt;
+ yylval->bounds.max = yylval->bounds.min;
return TOKEN_COUNT;
}
"{" @x num "," @y num "}" {
- if (!s_to_u32_unsafe(x, y - 1, yylval.bounds.min)
- || !s_to_u32_unsafe(y, cur - 1, yylval.bounds.max)) goto err_cnt;
+ if (!s_to_u32_unsafe(x, y - 1, yylval->bounds.min)
+ || !s_to_u32_unsafe(y, cur - 1, yylval->bounds.max)) goto err_cnt;
return TOKEN_COUNT;
}
"{" @x num ",}" {
- if (!s_to_u32_unsafe(x, cur - 2, yylval.bounds.min)) goto err_cnt;
- yylval.bounds.max = AST::MANY;
+ if (!s_to_u32_unsafe(x, cur - 2, yylval->bounds.min)) goto err_cnt;
+ yylval->bounds.max = Ast::MANY;
return TOKEN_COUNT;
}
"." {
- yylval.regexp = ast_dot(NOWHERE);
+ yylval->regexp = ast.dot(NOWHERE);
return TOKEN_REGEXP;
}
[^] \ nil {
- ASTChar c = {static_cast<uint32_t>(cur[-1]), NOWHERE};
- std::vector<ASTChar> *str = new std::vector<ASTChar>;
- str->push_back(c);
- yylval.regexp = ast_str(NOWHERE, str, false);
+ ast.temp_chars.push_back({cur[-1], NOWHERE});
+ yylval->regexp = ast.str(NOWHERE, false);
return TOKEN_REGEXP;
}
*/
@@ -92,13 +85,11 @@ cls:
*/
add:
if (l > u) goto err;
- cls.push_back(ASTRange(l, u, NOWHERE));
+ ast.temp_ranges.push_back(AstRange(l, u, NOWHERE));
/*!local:re2c
"" { goto cls; }
"]" {
- std::vector<ASTRange> *p = new std::vector<ASTRange>;
- p->swap(cls);
- yylval.regexp = ast_cls(NOWHERE, p, neg);
+ yylval->regexp = ast.cls(NOWHERE, neg);
return TOKEN_REGEXP;
}
*/
@@ -112,9 +103,8 @@ err_cnt:
return TOKEN_ERROR;
}
-int32_t lex_cls_chr(const char *&cur, uint32_t &c)
-{
- const char *mar, *p = cur;
+int32_t lex_cls_chr(const uint8_t*& cur, uint32_t& c) {
+ const uint8_t* mar, *p = cur;
/*!local:re2c
* { return 1; }
"[." { error("collating characters not supported"); return 1; }
@@ -123,18 +113,18 @@ int32_t lex_cls_chr(const char *&cur, uint32_t &c)
"\\x"[0-9a-fA-F]{2} { c = unesc_hex(p, cur); return 0; }
- "\\" { c = static_cast<uint8_t>('\\'); return 0; }
- "\\a" { c = static_cast<uint8_t>('\a'); return 0; }
- "\\b" { c = static_cast<uint8_t>('\b'); return 0; }
- "\\f" { c = static_cast<uint8_t>('\f'); return 0; }
- "\\n" { c = static_cast<uint8_t>('\n'); return 0; }
- "\\r" { c = static_cast<uint8_t>('\r'); return 0; }
- "\\t" { c = static_cast<uint8_t>('\t'); return 0; }
- "\\v" { c = static_cast<uint8_t>('\v'); return 0; }
- "\\\\" { c = static_cast<uint8_t>('\\'); return 0; }
- "\\]" { c = static_cast<uint8_t>(']'); return 0; }
-
- [^] \ nil { c = static_cast<uint8_t>(cur[-1]); return 0; }
+ "\\" { c = '\\'_u8; return 0; }
+ "\\a" { c = '\a'_u8; return 0; }
+ "\\b" { c = '\b'_u8; return 0; }
+ "\\f" { c = '\f'_u8; return 0; }
+ "\\n" { c = '\n'_u8; return 0; }
+ "\\r" { c = '\r'_u8; return 0; }
+ "\\t" { c = '\t'_u8; return 0; }
+ "\\v" { c = '\v'_u8; return 0; }
+ "\\\\" { c = '\\'_u8; return 0; }
+ "\\]" { c = ']'_u8; return 0; }
+
+ [^] \ nil { c = cur[-1]; return 0; }
*/
}
diff --git a/lib/parse.ypp b/lib/parse.ypp
index 37bb54b1..bfdee3e9 100644
--- a/lib/parse.ypp
+++ b/lib/parse.ypp
@@ -1,3 +1,11 @@
+%code requires {
+/* pull in types to populate YYSTYPE: */
+#include "src/parse/ast.h"
+namespace re2c {
+ struct AstNode;
+}
+}
+
%{
#include <stdio.h>
@@ -5,41 +13,36 @@
#include "src/parse/ast.h"
#include "src/util/attribute.h"
-#include "src/util/c99_stdint.h"
+#include <stdint.h>
#include "lib/lex.h"
-// disable certain GCC and/or Clang warnings, as we have no control over
-// autogenerated code (Clang also understands '#pragma GCC')
#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpragmas"
-#pragma GCC diagnostic ignored "-Wconversion"
-#pragma GCC diagnostic ignored "-Wsign-conversion"
-#pragma GCC diagnostic ignored "-Wunused-macros"
-#pragma GCC diagnostic ignored "-Wmissing-variable-declarations"
-#pragma GCC diagnostic ignored "-Wunreachable-code"
-#pragma GCC diagnostic ignored "-Wunreachable-code-break"
+#include "src/util/nowarn_in_bison.h"
using namespace re2c;
-
-extern "C" {
-
-int yylex(const char *&pattern);
-void yyerror(const char *pattern, const char*) RE2C_ATTR((noreturn));
-
-} // extern "C"
-
%}
-%lex-param {const char *&pattern}
-%parse-param {const char *&pattern}
+%define api.pure full
+%lex-param {const uint8_t*& pattern}
+%lex-param {re2c::Ast& ast}
+%parse-param {const uint8_t*& pattern}
+%parse-param {re2c::Ast& ast}
%start regexp
%union {
- const re2c::AST *regexp;
- re2c::ASTBounds bounds;
+ const re2c::AstNode* regexp;
+ re2c::AstBounds bounds;
};
+%{
+extern "C" {
+ static int yylex(YYSTYPE* yylval, const uint8_t*& pattern, Ast& ast);
+ static void yyerror(const uint8_t* pattern, Ast&, const char* msg) RE2C_ATTR((noreturn));
+}
+
+%}
+
%token TOKEN_COUNT
%token TOKEN_ERROR
%token TOKEN_REGEXP
@@ -53,26 +56,26 @@ regexp: expr { regexp = $$; };
expr
: term
-| expr '|' term { $$ = ast_alt($1, $3); }
+| expr '|' term { $$ = ast.alt($1, $3); }
;
term
: factor
-| factor term { $$ = ast_cat($1, $2); } // in POSIX concatenation is right-associative
+| factor term { $$ = ast.cat($1, $2); } // in POSIX concatenation is right-associative
;
factor
: primary
-| primary '*' { $$ = ast_iter($1, 0, AST::MANY); }
-| primary '+' { $$ = ast_iter($1, 1, AST::MANY); }
-| primary '?' { $$ = ast_iter($1, 0, 1); }
-| primary TOKEN_COUNT { $$ = ast_iter($1, $2.min, $2.max); }
+| primary '*' { $$ = ast.iter($1, 0, Ast::MANY); }
+| primary '+' { $$ = ast.iter($1, 1, Ast::MANY); }
+| primary '?' { $$ = ast.iter($1, 0, 1); }
+| primary TOKEN_COUNT { $$ = ast.iter($1, $2.min, $2.max); }
;
primary
: TOKEN_REGEXP
-| '(' ')' { $$ = ast_cap(ast_nil(NOWHERE)); }
-| '(' expr ')' { $$ = ast_cap($2); }
+| '(' ')' { $$ = ast.cap(ast.nil(NOWHERE), CAPTURE); }
+| '(' expr ')' { $$ = ast.cap($2, CAPTURE); }
;
%%
@@ -80,25 +83,21 @@ primary
#pragma GCC diagnostic pop
extern "C" {
-
-void yyerror(const char *pattern, const char *msg)
-{
- fprintf(stderr, "%s (on RE %s)", msg, pattern);
- exit(1);
+ static void yyerror(const uint8_t* pattern, Ast&, const char* msg) {
+ fprintf(stderr, "%s (on regexp %s)", msg, pattern);
+ exit(1);
+ }
+
+ static int yylex(YYSTYPE* yylval, const uint8_t*& pattern, Ast& ast) {
+ return lex(yylval, pattern, ast);
+ }
}
-int yylex(const char *&pattern)
-{
- return lex(pattern);
-}
-
-} // extern "C"
-
namespace re2c {
-const AST *parse(const char *pattern)
-{
- yyparse(pattern);
+const AstNode* parse(const char* pattern, Ast& ast) {
+ const uint8_t *p = reinterpret_cast<const uint8_t*>(pattern);
+ yyparse(p, ast);
return regexp;
}
diff --git a/lib/regcomp.cc b/lib/regcomp.cc
index a87b012b..7bb0f073 100644
--- a/lib/regcomp.cc
+++ b/lib/regcomp.cc
@@ -1,140 +1,124 @@
#include <stddef.h>
-#include "src/util/c99_stdint.h"
+#include <stdint.h>
#include <algorithm>
#include <valarray>
#include <vector>
#include "lib/lex.h"
-#include "lib/regcomp_dfa_regless.h"
+#include "lib/regcomp_dfa_multipass.h"
#include "lib/regex.h"
#include "lib/regex_impl.h"
#include "lib/regoff_trie.h"
-#include "src/debug/debug.h"
#include "src/dfa/dfa.h"
#include "src/msg/location.h"
#include "src/msg/msg.h"
#include "src/nfa/nfa.h"
#include "src/options/opt.h"
#include "src/parse/ast.h"
-#include "src/regexp/re.h"
+#include "src/regexp/regexp.h"
#include "src/regexp/rule.h"
+#include "src/util/check.h"
#include "src/util/range.h"
-
namespace re2c {
-int lex(const char *pattern);
-const AST *regexp;
+int lex(YYSTYPE* yylval, const char* pattern);
+const AstNode* regexp;
} // namespace re2c
using namespace re2c;
using namespace re2c::libre2c;
-int regcomp(regex_t *preg, const char *pattern, int cflags)
-{
+// redefine CHECK_RET, as it needs to return `int` rather than `Ret`
+#undef CHECK_RET
+#define CHECK_RET(x) do { if (x != Ret::OK) return 1; } while(0)
+
+int regcomp(regex_t* preg, const char* pattern, int cflags) {
conopt_t globopts;
- globopts.nested_negative_tags = !(cflags & (REG_NFA | REG_REGLESS));
- globopts.FFlag = true;
- globopts.backward = cflags & REG_BACKWARD;
- globopts.stadfa = cflags & REG_STADFA;
+ globopts.nested_negative_tags = !(cflags & (REG_NFA | REG_MULTIPASS));
+ globopts.flex_syntax = true;
Msg msg;
Opt opts(globopts, msg);
- opts.set_subhistories((cflags & REG_SUBHIST) != 0);
- opts.set_autotags((cflags & REG_AUTOTAGS) != 0);
- opts.set_posix_syntax(true);
- opts.set_posix_semantics((cflags & REG_LEFTMOST) == 0);
- const opt_t *opt = opts.snapshot();
+ opts.set_tags_history((cflags & REG_SUBHIST) != 0);
+ opts.set_tags_automatic((cflags & REG_AUTOTAGS) != 0);
+ opts.set_tags_posix_syntax(true);
+ opts.set_tags_posix_semantics((cflags & REG_LEFTMOST) == 0);
+ const opt_t* opt;
+ CHECK_RET(opts.snapshot(&opt));
preg->flags = cflags;
- const AST *a = parse(pattern);
+ AstAllocator ast_alc;
+ OutAllocator out_alc; // unused but required by AST constructor
+ Ast ast(ast_alc, out_alc);
+ const AstNode* a = parse(pattern, ast);
- preg->rmgr = new RangeMgr;
-
- ASTRule ar(a, new SemAct(NOWHERE));
- std::vector<ASTRule> arv;
- arv.push_back(ar);
- RESpec re(arv, opt, msg, *preg->rmgr);
+ std::vector<AstRule> arv{AstRule{a, ast.sem_act(NOWHERE, nullptr, nullptr, false)}};
+ RESpec re(opt, msg);
+ CHECK_RET(re.init(arv));
find_fixed_tags(re);
insert_default_tags(re);
- size_t nfa_size, nfa_depth;
- compute_size_and_depth(re.res, &nfa_size, &nfa_depth);
- if (nfa_depth > MAX_NFA_DEPTH || nfa_size > MAX_NFA_STATES) return 1;
-
- nfa_t *nfa = new nfa_t(re, nfa_size);
-
- nfa_t *nfa0 = NULL;
- if (cflags & REG_BACKWARD) {
- conopt_t globopts0;
- globopts0.FFlag = true;
- Opt opts0(globopts0, msg);
- const opt_t *opt0 = opts0.snapshot();
- RESpec re0(arv, opt0, msg, *preg->rmgr);
- nfa0 = new nfa_t(re0, nfa_size);
- delete opt0;
+ if ((cflags & REG_NFA) == 0) {
+ preg->char2class = new size_t[256];
+ split_charset(re);
+ for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) {
+ for (; j < re.charset[i]; ++j) {
+ preg->char2class[j] = i - 1;
+ }
+ }
}
- dfa_t *dfa = NULL;
- rldfa_t *rldfa = NULL;
+ Tnfa* nfa = new Tnfa;
+ CHECK_RET(re_to_nfa(*nfa, std::move(re)));
+
+ DCHECK(nfa->rules.size() == 1);
+ preg->re_nsub = nfa->rules[0].ncap + 1;
+ preg->re_ntag = nfa->tags.size();
if (cflags & REG_NFA) {
+ preg->nfa = nfa;
if ((cflags & REG_TRIE) && (cflags & REG_LEFTMOST)) {
- preg->simctx = new lzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
+ preg->simctx = new lzsimctx_t(*nfa, preg->re_nsub, cflags);
} else if (cflags & REG_TRIE) {
- preg->simctx = new pzsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
+ preg->simctx = new pzsimctx_t(*nfa, preg->re_nsub, cflags);
} else if (cflags & REG_LEFTMOST) {
- preg->simctx = new lsimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
- } else if (cflags & REG_KUKLEWICZ) {
- preg->simctx = new ksimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
+ preg->simctx = new lsimctx_t(*nfa, preg->re_nsub, cflags);
} else {
- preg->simctx = new psimctx_t(*nfa, nfa0, preg->re_nsub, cflags);
- }
- } else {
- preg->char2class = new size_t[256];
- split_charset(re);
- for (uint32_t i = 1, j = 0; i < re.charset.size(); ++i) {
- for (; j < re.charset[i]; ++j) {
- preg->char2class[j] = i - 1;
- }
+ preg->simctx = new psimctx_t(*nfa, preg->re_nsub, cflags);
}
+ } else if (cflags & REG_MULTIPASS) {
+ preg->mptdfa = new MpTdfa(std::move(*nfa), opt, cflags);
+ delete nfa;
+ opt = nullptr; // transfer options ownership to RLDFA
- dfa = new dfa_t(*nfa, Rule::NONE, Rule::NONE);
- if (cflags & REG_REGLESS) {
- DASSERT((cflags & REG_STADFA) == 0);
- rldfa = new rldfa_t(*nfa, *dfa, opt, cflags);
- opt = NULL; // transfer options ownership to RLDFA
- } else {
- determinization(*nfa, *dfa, opt, msg, "");
- cutoff_dead_rules(*dfa, opt, "", msg);
- insert_fallback_tags(opt, *dfa);
- compact_and_optimize_tags(opt, *dfa);
+ if (cflags & REG_SUBHIST) {
+ preg->regtrie = new regoff_trie_t(preg->mptdfa->tags.size());
}
+ } else {
+ Tdfa* dfa = new Tdfa(*new DfaAllocator(), nfa->charset.size(), Rule::NONE, Rule::NONE);
+ CHECK_RET(determinization(std::move(*nfa), *dfa, opt, msg, ""));
+ preg->dfa = dfa;
+ delete nfa;
+
+ cutoff_dead_rules(*dfa, opt, "", msg);
+ insert_fallback_tags(*dfa);
+ compact_and_optimize_tags(opt, *dfa);
if (cflags & REG_TSTRING) {
// T-string does not need intermediate storage for tag values.
} else if (cflags & REG_SUBHIST) {
- const size_t nlists = (cflags & REG_REGLESS)
- ? dfa->tags.size() : static_cast<size_t>(dfa->maxtagver + 1);
- preg->regtrie = new regoff_trie_t(nlists);
+ preg->regtrie = new regoff_trie_t(static_cast<size_t>(dfa->maxtagver + 1));
} else {
preg->regs = new regoff_t[dfa->maxtagver + 1];
}
}
- preg->nfa = nfa;
- preg->dfa = dfa;
- preg->rldfa = rldfa;
-
- DASSERT(nfa->rules.size() == 1);
- preg->re_nsub = nfa->rules[0].ncap + 1;
- preg->re_ntag = nfa->tags.size();
-
if (cflags & REG_TSTRING) {
- // T-string is stored in RE (reallocated on each regtstring() call if
- // needed), and the user gets an immutable view of it (const ref).
- tstring_t &ts = preg->tstring;
+ // T-string is stored in the regexp (reallocated on each regtstring() call if needed), and
+ // the user gets an immutable view of it (const ref).
+ tstring_t& ts = preg->tstring;
ts.capacity = 256;
ts.string = new tchar_t[ts.capacity];
ts.length = 0;
@@ -147,3 +131,6 @@ int regcomp(regex_t *preg, const char *pattern, int cflags)
delete opt;
return 0;
}
+
+#undef CHECK_RET
+
diff --git a/lib/regcomp_dfa_multipass.h b/lib/regcomp_dfa_multipass.h
new file mode 100644
index 00000000..af384edc
--- /dev/null
+++ b/lib/regcomp_dfa_multipass.h
@@ -0,0 +1,271 @@
+#ifndef _RE2C_LIB_REGCOMP_DFA_MULTIPASS_
+#define _RE2C_LIB_REGCOMP_DFA_MULTIPASS_
+
+#include <string.h>
+
+#include "regex.h"
+#include "src/dfa/dfa.h"
+#include "src/dfa/determinization.h"
+#include "src/nfa/nfa.h"
+#include "src/options/opt.h"
+#include "src/util/check.h"
+
+namespace re2c {
+namespace libre2c {
+
+static constexpr regoff_t NORESULT = std::numeric_limits<regoff_t>::max();
+static constexpr uint32_t NOCONF = std::numeric_limits<uint32_t>::max();
+
+// A backlink connects target and origin states of a TDFA transition at the level of TNFA
+// configurations. This allows one to follow TNFA path backwards from the final state to the initial
+// one.
+struct MpTdfaBacklink {
+ // Index of configuration in the origin TDFA state.
+ uint32_t conf;
+ // T-string fragment corresponding to tagged path from the origin configuration.
+ tchar_t* tfrag;
+ // Length of t-string fragment.
+ size_t tfrag_size;
+};
+
+// Multi-pass TDFA arc (transition). There are no registers and register operations, tag actions are
+// hidden in tag histories (stored in backlinks).
+struct MpTdfaArc {
+ // Target TDFA state where this arc goes to.
+ size_t state;
+ // Array of backlinks (one per TNFA path that reaches the target state).
+ MpTdfaBacklink* backlinks;
+};
+
+// Multi-pass TDFA state.
+struct MpTdfaState {
+ // Outgoing arcs from this state. Each arc has target state and backlinks.
+ MpTdfaArc* arcs;
+ // Final link stores final configuration index (if any) and its history.
+ MpTdfaBacklink finlink;
+
+ MpTdfaState(size_t nchars, const MpTdfaBacklink& link);
+ ~MpTdfaState();
+ FORBID_COPY(MpTdfaState);
+};
+
+// Multi-pass TDFA.
+struct MpTdfa {
+ const opt_t* opts;
+ const int flags;
+ std::vector<Tag> tags;
+
+ DfaAllocator alc;
+
+ // RLDFA own states with backlinks in them.
+ std::vector<MpTdfaState*> states;
+
+ // Array of submatch values (used during matching).
+ mutable regoff_t* result;
+ // Log stores a sequence of backlink arrays on the matching TDFA path. This allows one to unwind
+ // tag history back and get submatch values in the absence of registers. Backlink arrays (rather
+ // than single backlinks) are needed because there is a set of active TNFA paths on the way
+ // forward, and it is unknown until the final state which of them will match.
+ mutable std::vector<const MpTdfaBacklink* const*> log;
+
+ MpTdfa(Tnfa&& nfa, const opt_t* opts, int flags);
+ ~MpTdfa();
+ FORBID_COPY(MpTdfa);
+};
+
+static inline tchar_t encode_tag(size_t tag) {
+ // Tags in the t-string are indexed from 1 rather than 0 (so that negative tags can be
+ // represented by negating tag index).
+ tag += 1;
+ // Two extra tags for the outermost capture that wraps the whole regexp.
+ tag += 2;
+ // T-string characters store either symbols or tags. Symbols use the lower half of the range (so
+ // they don't need to be translated), and tags use the upper half.
+ tag += TAG_BASE;
+
+ return static_cast<tchar_t>(tag);
+}
+
+template<typename history_t>
+static inline void get_tstring_fragment(history_t& history,
+ DfaAllocator& alc,
+ hidx_t hidx,
+ std::vector<tchar_t>& tfrag,
+ MpTdfaBacklink& link,
+ bool tstring) {
+ tfrag.clear();
+ for (int32_t i = hidx; i != HROOT; ) {
+ const typename history_t::node_t& n = history.node(i);
+ const size_t t = n.info.idx;
+ const bool negative = n.info.neg;
+ i = n.pred;
+
+ if (tstring) {
+ // For t-string construction add only positive tags. The idea is to get the cheapest
+ // possible representation of parse results, and adding negative tags slows it down
+ // quite a bit.
+ if (!negative) tfrag.push_back(encode_tag(t));
+ } else {
+ // For offset construction add both positive and negative tags. Shift negative tags by
+ // TAG_BASE to make them distinguishable from positive tags. Do not expand nested
+ // negative tags here: it makes regexec() slower, because it cannot skip all nested tags
+ // with one check when a tag has already been set.
+ tfrag.push_back(static_cast<tchar_t>(t + (negative ? TAG_BASE : 0)));
+ }
+ }
+
+ std::reverse(tfrag.begin(), tfrag.end());
+
+ link.tfrag_size = tfrag.size();
+ link.tfrag = alc.template alloct<tchar_t>(tfrag.size());
+ memcpy(link.tfrag, tfrag.data(), tfrag.size() * sizeof(tchar_t));
+}
+
+template<typename ctx_t>
+static MpTdfaBacklink* construct_backlinks(const ctx_t& ctx,
+ MpTdfa& mptdfa,
+ std::vector<tchar_t>& tfrag,
+ const std::vector<std::vector<uint32_t>>& uniq_orig) {
+ if (ctx.target == Tdfa::NIL) return nullptr;
+
+ const bool tstring = mptdfa.flags & REG_TSTRING;
+ const std::vector<uint32_t>& uo = uniq_orig[ctx.target];
+ uint32_t nbacklinks = *std::max_element(uo.begin(), uo.end()) + 1;
+ MpTdfaBacklink* links = mptdfa.alc.alloct<MpTdfaBacklink>(nbacklinks);
+
+ for (size_t j = 0, k; j < nbacklinks; ++j) {
+ for (k = 0; k < ctx.state.size() && uo[k] != j; ++k);
+ const typename ctx_t::conf_t& x = ctx.state[k];
+ MpTdfaBacklink& l = links[j];
+ l.conf = uniq_orig[ctx.origin][x.origin];
+ get_tstring_fragment(ctx.history, mptdfa.alc, x.ttran, tfrag, l, tstring);
+ }
+
+ return links;
+}
+
+template<typename ctx_t>
+static void determinization_multipass(Tnfa&& nfa, MpTdfa& mptdfa) {
+ Msg msg;
+ // Determinization context lifetime must cover regexec, as some of the data stored in the
+ // context is used during matching.
+ Tdfa dfa(mptdfa.alc, nfa.charset.size(), Rule::NONE, Rule::NONE);
+ ctx_t ctx(std::move(nfa), dfa, mptdfa.opts, msg, "");
+
+ std::vector<tchar_t> tfrag;
+ // Per-state array of mappings from configuration index to a unique origin index. This is needed
+ // to compress identical backlinks into one. Note that configurations with identical origin also
+ // have identical transition tags (because those are the lookahead tags in the origin
+ // configuration).
+ std::vector<std::vector<uint32_t> > uniq_orig;
+
+ // Construct initial TDFA state.
+ const uint32_t INITIAL_TAGS = init_tag_versions(ctx);
+ const clos_t c0(ctx.nfa_root, 0, INITIAL_TAGS, HROOT, HROOT);
+ ctx.reach.push_back(c0);
+ closure(ctx);
+ find_state_multipass(ctx, mptdfa, tfrag, uniq_orig);
+
+ // Iterate while new states are added: for each alphabet symbol build tagged epsilon-closure of
+ // all reachable NFA states, then find identical or mappable TDFA state, or add a new one.
+ for (uint32_t i = 0; i < ctx.kernels.size(); ++i) {
+ ctx.origin = i;
+
+ for (uint32_t c = 0; c < dfa.nchars; ++c) {
+ reach_on_symbol(ctx, c);
+ closure(ctx);
+ find_state_multipass(ctx, mptdfa, tfrag, uniq_orig);
+
+ // Multi-pass TDFA stores backlinks instead of tag actions.
+ mptdfa.states[ctx.origin]->arcs[c].backlinks =
+ construct_backlinks(ctx, mptdfa, tfrag, uniq_orig);
+ }
+ }
+
+ mptdfa.tags = std::move(ctx.tags);
+}
+
+template<typename ctx_t>
+static void find_state_multipass(ctx_t& ctx,
+ MpTdfa& mptdfa,
+ std::vector<tchar_t>& tfrag,
+ std::vector<std::vector<uint32_t>>& uniq_orig) {
+ const bool tstring = mptdfa.flags & REG_TSTRING;
+
+ // Find or add the new state in the existing set of states.
+ const bool is_new = do_find_state<ctx_t, true>(ctx);
+
+ if (is_new) {
+ const typename ctx_t::confset_t& state = ctx.state;
+
+ // Map configuration index to a unique origin index.
+ uniq_orig.push_back(std::vector<uint32_t>(state.size(), UINT32_MAX));
+ std::vector<uint32_t>& uo = uniq_orig.back();
+ for (uint32_t i = 0, j = 0; j < state.size(); ++j) {
+ if (uo[j] != UINT32_MAX) continue;
+ for (size_t k = j + 1; k < state.size(); ++k) {
+ if (state[j].origin == state[k].origin) {
+ DCHECK(state[j].ttran == state[k].ttran); // TDFA(1) property
+ uo[k] = i;
+ }
+ }
+ uo[j] = i++;
+ }
+
+ // Check if the new TDFA state is final. See note [at most one final item per closure].
+ MpTdfaBacklink finlink = {NOCONF, nullptr, 0};
+ for (uint32_t i = 0; i < state.size(); ++i) {
+ if (state[i].state->kind == TnfaState::Kind::FIN) {
+ finlink.conf = uo[i];
+ get_tstring_fragment(
+ ctx.history, mptdfa.alc, state[i].thist, tfrag, finlink, tstring);
+ break;
+ }
+ }
+
+ // Create a new multipass-DFA state.
+ MpTdfaState* s = new MpTdfaState(ctx.dfa.nchars, finlink);
+ mptdfa.states.push_back(s);
+ }
+
+ if (ctx.origin != Tdfa::NIL) {
+ mptdfa.states[ctx.origin]->arcs[ctx.symbol].state = ctx.target;
+ }
+}
+
+inline MpTdfaState::MpTdfaState(size_t nchars, const MpTdfaBacklink& link)
+ : arcs(new MpTdfaArc[nchars]), finlink() {
+ finlink.conf = link.conf;
+ finlink.tfrag = link.tfrag;
+ finlink.tfrag_size = link.tfrag_size;
+}
+
+inline MpTdfaState::~MpTdfaState() {
+ delete[] arcs;
+}
+
+inline MpTdfa::MpTdfa(Tnfa&& nfa, const opt_t* opts, int flags)
+ : opts(opts),
+ flags(flags),
+ tags(),
+ alc(),
+ states(),
+ result(new regoff_t[nfa.tags.size()]),
+ log() {
+ if (opts->tags_posix_semantics) {
+ determinization_multipass<pdetctx_t>(std::move(nfa), *this);
+ } else {
+ determinization_multipass<ldetctx_t>(std::move(nfa), *this);
+ }
+}
+
+inline MpTdfa::~MpTdfa() {
+ for (size_t i = 0; i < states.size(); ++i) {
+ delete states[i];
+ }
+}
+
+} // namespace libre2c
+} // namespace re2c
+
+#endif // _RE2C_LIB_REGCOMP_DFA_MULTIPASS_
diff --git a/lib/regcomp_dfa_regless.h b/lib/regcomp_dfa_regless.h
deleted file mode 100644
index 916aa8f6..00000000
--- a/lib/regcomp_dfa_regless.h
+++ /dev/null
@@ -1,249 +0,0 @@
-#ifndef _RE2C_LIB_REGCOMP_DFA_REGLESS_
-#define _RE2C_LIB_REGCOMP_DFA_REGLESS_
-
-#include <string.h>
-
-#include "regex.h"
-#include "src/dfa/dfa.h"
-#include "src/dfa/determinization.h"
-#include "src/nfa/nfa.h"
-#include "src/options/opt.h"
-
-
-namespace re2c {
-namespace libre2c {
-
-static const regoff_t NORESULT = std::numeric_limits<regoff_t>::max();
-static const uint32_t NOCONF = std::numeric_limits<uint32_t>::max();
-
-// A backlink connects target and origin states of a TDFA transition at the
-// level of TNFA configurations. This allows to follow TNFA path backwards from
-// the final state to the initial one.
-struct rldfa_backlink_t {
- // Index of configuration in the origin TDFA state.
- uint32_t conf;
- // Index of tag history corresponding to the origin configuration.
- hidx_t hidx;
- // T-string fragment corresponding to tagged path from the origin configuration.
- tchar_t *tfrag;
- // Length of t-string fragment.
- size_t tfrag_size;
-};
-
-// Registerless TDFA arc (transition). There are no registers and register
-// operations, tag actions are hidden in tag histories (stored in backlinks).
-struct rldfa_arc_t {
- // Target TDFA state where this arc goes to.
- size_t state;
- // Array of backlinks (one per TNFA path that reaches the target state).
- rldfa_backlink_t *backlinks;
-};
-
-// Registerless TDFA state.
-struct rldfa_state_t {
- // Outgoing arcs from this state. Each arc has target state and backlinks.
- rldfa_arc_t *arcs;
- // Final link stores final configuration index (if any) and its history.
- rldfa_backlink_t finlink;
-
- rldfa_state_t(size_t nchars, const rldfa_backlink_t &link);
- ~rldfa_state_t();
- FORBID_COPY(rldfa_state_t);
-};
-
-// Registerless TDFA.
-struct rldfa_t {
- const opt_t *opts;
- const int flags;
-
- // Determinization context, specialized either for leftmost greedy policy or
- // for POSIX policy. Context lifetime should cover regexec, as it needs some
- // of the data stored in it (TDFA, tag history, etc.).
- void *ctx;
-
- // RLDFA own states with backlinks in them.
- std::vector<rldfa_state_t*> states;
-
- // Array of submatch values (used during matching).
- mutable regoff_t *result;
- // Log stores a sequence of backlink arrays on the matching TDFA path. This
- // allows to unwind tag history back and get submatch values in the absence
- // of registers. Backlink arrays (rather than single backlinks) are needed
- // because there is a set of active TNFA paths on the way forward, and it is
- // unknown until the final state which of them will match.
- mutable std::vector<const rldfa_backlink_t* const*> log;
-
- rldfa_t(const nfa_t &nfa, dfa_t &dfa, const opt_t *opts, int flags);
- ~rldfa_t();
- FORBID_COPY(rldfa_t);
-};
-
-static inline tchar_t encode_tag(size_t tag)
-{
- // Tags in the t-string are indexed from 1 rather than 0 (so that
- // negative tags can be represented by negating tag index).
- tag += 1;
- // Two extra tags for the outermost capture that wraps the whole RE.
- tag += 2;
- // T-string characters store either symbols or tags. Symbols use the
- // lower half of the range (so they don't need to be translated),
- // and tags use the upper half.
- tag += TAG_BASE;
-
- return static_cast<tchar_t>(tag);
-}
-
-template<typename history_t>
-static inline void get_tstring_fragment(determ_context_t<history_t> &ctx,
- hidx_t hidx, std::vector<tchar_t> &tfrag, rldfa_backlink_t &link, bool tstring)
-{
- tfrag.clear();
- for (int32_t i = hidx; i != HROOT; ) {
- const typename history_t::node_t &n = ctx.history.node(i);
- const size_t t = n.info.idx;
- const bool negative = n.info.neg;
- i = n.pred;
-
- if (tstring) {
- // For t-string construction add only positive tags. The idea is to
- // get the cheapest possible representation of parse results, and
- // adding negative tags slows it down quite a bit.
- if (!negative) tfrag.push_back(encode_tag(t));
- } else {
- // For offset construction add both positive and negative tags.
- // Shift negative tags by TAG_BASE to make them distinguishable from
- // positive tags. Do not expand nested negative tags here: it makes
- // regexec() slower, because it cannot skip all nested tags with one
- // check when a tag has already been set.
- tfrag.push_back(static_cast<tchar_t>(t + (negative ? TAG_BASE : 0)));
- }
- }
-
- std::reverse(tfrag.begin(), tfrag.end());
-
- link.tfrag_size = tfrag.size();
- link.tfrag = ctx.dc_allocator.template alloct<tchar_t>(tfrag.size());
- memcpy(link.tfrag, tfrag.data(), tfrag.size() * sizeof(tchar_t));
-}
-
-template<typename ctx_t>
-static void determinization_regless(const nfa_t &nfa, dfa_t &dfa, rldfa_t &rldfa)
-{
- Msg msg;
- // Determinization context lifetime must cover regexec, as some of the data
- // stored in the context is used during matching.
- ctx_t &ctx = *new ctx_t(rldfa.opts, msg, "", nfa, dfa);
- rldfa.ctx = &ctx;
- allocator_t &alc = ctx.dc_allocator;
- const bool tstring = rldfa.flags & REG_TSTRING;
- std::vector<tchar_t> tfrag;
-
- // Construct initial TDFA state.
- const uint32_t INITIAL_TAGS = init_tag_versions(ctx);
- const clos_t c0(ctx.nfa.root, 0, INITIAL_TAGS, HROOT, HROOT);
- ctx.reach.push_back(c0);
- closure(ctx);
- find_state_regless(ctx, rldfa, tfrag);
-
- // Iterate while new states are added: for each alphabet symbol build tagged
- // epsilon-closure of all reachable NFA states, then find identical or
- // mappable TDFA state, or add a new one.
- for (uint32_t i = 0; i < ctx.dc_kernels.size(); ++i) {
- ctx.dc_origin = i;
-
- for (uint32_t c = 0; c < ctx.dfa.nchars; ++c) {
- reach_on_symbol(ctx, c);
- closure(ctx);
- find_state_regless(ctx, rldfa, tfrag);
-
- // Unlike TDFA, RLDFA stores backlinks instead of tag actions.
- rldfa_backlink_t *links = alc.alloct<rldfa_backlink_t>(ctx.state.size());
- for (size_t j = 0; j < ctx.state.size(); ++j) {
- const typename ctx_t::conf_t &x = ctx.state[j];
- rldfa_backlink_t &l = links[j];
- l.conf = x.origin;
- l.hidx = x.ttran;
- get_tstring_fragment(ctx, x.ttran, tfrag, l, tstring);
- }
- rldfa.states[ctx.dc_origin]->arcs[c].backlinks = links;
- }
- }
-}
-
-template<typename ctx_t>
-static void find_state_regless(ctx_t &ctx, rldfa_t &rldfa, std::vector<tchar_t> &tfrag)
-{
- const bool tstring = rldfa.flags & REG_TSTRING;
- dfa_t &dfa = ctx.dfa;
-
- // Find or add the new state in the existing set of states.
- const bool is_new = do_find_state<ctx_t, false, true>(ctx);
-
- if (is_new) {
- // Check if the new TDFA state is final.
- // See note [at most one final item per closure].
- rldfa_backlink_t finlink = {NOCONF, HROOT, NULL, 0};
- for (uint32_t i = 0; i < ctx.state.size(); ++i) {
- const typename ctx_t::conf_t &x = ctx.state[i];
- if (x.state->type == nfa_state_t::FIN) {
- finlink.conf = i;
- finlink.hidx = x.thist;
- get_tstring_fragment(ctx, x.thist, tfrag, finlink, tstring);
- break;
- }
- }
-
- // Create a new regless-DFA state.
- rldfa_state_t *s = new rldfa_state_t(dfa.nchars, finlink);
- rldfa.states.push_back(s);
- }
-
- if (ctx.dc_origin != dfa_t::NIL) {
- rldfa.states[ctx.dc_origin]->arcs[ctx.dc_symbol].state = ctx.dc_target;
- }
-
- DDUMP_DFA_RAW(ctx, is_new);
- DDUMP_DFA_TREE(is_new);
-}
-
-inline rldfa_state_t::rldfa_state_t(size_t nchars, const rldfa_backlink_t &link)
- : arcs(new rldfa_arc_t[nchars])
- , finlink()
-{
- finlink.conf = link.conf;
- finlink.hidx = link.hidx;
- finlink.tfrag = link.tfrag;
- finlink.tfrag_size = link.tfrag_size;
-}
-
-inline rldfa_state_t::~rldfa_state_t()
-{
- delete[] arcs;
-}
-
-inline rldfa_t::rldfa_t(const nfa_t &nfa, dfa_t &dfa, const opt_t *opts, int flags)
- : opts(opts)
- , flags(flags)
- , ctx(NULL)
- , states()
- , result(new regoff_t[nfa.tags.size()])
- , log()
-{
- if (opts->posix_semantics) {
- determinization_regless<pdetctx_t>(nfa, dfa, *this);
- } else {
- determinization_regless<ldetctx_t>(nfa, dfa, *this);
- }
-}
-
-inline rldfa_t::~rldfa_t()
-{
- for (size_t i = 0; i < states.size(); ++i) {
- delete states[i];
- }
-}
-
-} // namespace libre2c
-} // namespace re2c
-
-#endif // _RE2C_LIB_REGCOMP_DFA_REGLESS_
diff --git a/lib/regex.h b/lib/regex.h
index 1f33ec04..9d053957 100644
--- a/lib/regex.h
+++ b/lib/regex.h
@@ -5,151 +5,144 @@
#include <stdint.h>
#include <limits.h>
-
namespace re2c {
-struct nfa_t;
-struct dfa_t;
+
+struct Tnfa;
+struct Tdfa;
class RangeMgr;
+
} // namespace re2c
namespace re2c {
namespace libre2c {
-struct rldfa_t;
+
+struct MpTdfa;
struct regoff_trie_t;
+
} // namespace libre2c
} // namespace re2c
-typedef ptrdiff_t regoff_t;
+using regoff_t = ptrdiff_t;
-// regmatch_t stores an offset pair representing a capturing group.
-// No match is represented with (-1,-1).
+// regmatch_t stores an offset pair representing a capturing group. No match is represented with
+// (-1,-1).
struct regmatch_t {
regoff_t rm_so;
regoff_t rm_eo;
};
-// subhistory_t stores subhistory of a capturing group: an array of offset pairs
-// corresponding to all positions in the input string where this capturing group
-// has matched. The length of the array depends on how many times the group has
-// matched, which depends on the RE and the input string.
+// subhistory_t stores subhistory of a capturing group: an array of offset pairs corresponding to
+// all positions in the input string where this capturing group has matched. The length of the array
+// depends on how many times the group has matched, which depends on the regexp and the string.
struct subhistory_t {
size_t size;
- regmatch_t *offs;
+ regmatch_t* offs;
};
-// T-string chars are 16 bits.
-// This is aligned with the Java implementation by Angelo Borsotti.
-typedef uint16_t tchar_t;
+// T-string chars are 16 bits. This is aligned with the Java implementation by Angelo Borsotti.
+using tchar_t = uint16_t;
-// A t-string is a flattened representation a parse tree where characters of the
-// input string are interleaved with tags. Input characters occupy the lower
-// byte of the 16-bit value, leaving the higher byte as zero. Tags are stored as
-// numeric values shifted to the upper half of the 16-bit range.
+// A t-string is a flattened representation a parse tree where characters of the input string are
+// interleaved with tags. Input characters occupy the lower byte of the 16-bit value, leaving the
+// higher byte as zero. Tags are stored as numeric values shifted to the upper half of the 16-bit
+// range.
struct tstring_t {
size_t capacity;
size_t length;
- tchar_t *string;
+ tchar_t* string;
};
// Tags use the upper half of the t-string charater value range.
-static const tchar_t TAG_BASE = 1 << (8 * sizeof(tchar_t) - 1);
+static constexpr tchar_t TAG_BASE = 1 << (8 * sizeof(tchar_t) - 1);
// standard flags
-static const int REG_EXTENDED = 1u << 0;
-static const int REG_ICASE = 1u << 1;
-static const int REG_NOSUB = 1u << 2;
-static const int REG_NEWLINE = 1u << 3;
-static const int REG_NOTBOL = 1u << 4;
-static const int REG_NOTEOL = 1u << 5;
+static constexpr int REG_EXTENDED = 1u << 0;
+static constexpr int REG_ICASE = 1u << 1;
+static constexpr int REG_NOSUB = 1u << 2;
+static constexpr int REG_NEWLINE = 1u << 3;
+static constexpr int REG_NOTBOL = 1u << 4;
+static constexpr int REG_NOTEOL = 1u << 5;
// extensions
-static const int REG_NFA = 1u << 6;
-static const int REG_LEFTMOST = 1u << 7;
-static const int REG_TRIE = 1u << 8;
-static const int REG_GTOP = 1u << 9;
-static const int REG_SLOWPREC = 1u << 10;
-static const int REG_BACKWARD = 1u << 11;
-static const int REG_KUKLEWICZ = 1u << 12;
-static const int REG_STADFA = 1u << 13;
-static const int REG_REGLESS = 1u << 14;
-static const int REG_SUBHIST = 1u << 15;
-static const int REG_TSTRING = 1u << 16;
-static const int REG_AUTOTAGS = 1u << 17;
+static constexpr int REG_NFA = 1u << 6;
+static constexpr int REG_LEFTMOST = 1u << 7;
+static constexpr int REG_TRIE = 1u << 8;
+static constexpr int REG_SLOWPREC = 1u << 9;
+static constexpr int REG_MULTIPASS = 1u << 10;
+static constexpr int REG_SUBHIST = 1u << 11;
+static constexpr int REG_TSTRING = 1u << 12;
+static constexpr int REG_AUTOTAGS = 1u << 13;
+
+// Deprecated, keep for backward compatibility.
+static constexpr int REG_REGLESS = REG_MULTIPASS; // old name for REG_MULTIPASS
struct regex_t {
size_t re_nsub;
size_t re_ntag;
- re2c::RangeMgr *rmgr;
- const re2c::nfa_t *nfa;
- const re2c::dfa_t *dfa;
- const re2c::libre2c::rldfa_t *rldfa;
- void *simctx;
- size_t *char2class;
+ const re2c::Tnfa* nfa;
+ const re2c::Tdfa* dfa;
+ const re2c::libre2c::MpTdfa* mptdfa;
+ void* simctx;
+ size_t* char2class;
int flags;
union {
- regoff_t *regs;
- re2c::libre2c::regoff_trie_t *regtrie;
+ regoff_t* regs;
+ re2c::libre2c::regoff_trie_t* regtrie;
};
- // Allow storing submatch results in RE (this is needed for Java bindings).
+ // Allow storing submatch results in the regexp (this is needed for Java bindings).
union {
- regmatch_t *pmatch;
- subhistory_t *psubhist;
+ regmatch_t* pmatch;
+ subhistory_t* psubhist;
mutable tstring_t tstring;
};
};
-static const int REG_NOMATCH = INT_MAX;
+static constexpr int REG_NOMATCH = INT_MAX;
// standard functions
-int regcomp(regex_t *preg, const char *pattern, int cflags);
-size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);
-int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[],
- int eflags);
-void regfree(regex_t *preg);
-
-// The regparse() function returns parse results as an array of nmatch size,
-// where each element is an array of offset pairs for the corresponding
-// capturing group. If a group has matched repeatedly at different parts of the
-// input string, then its array will contain multiple offset pairs; otherwise, a
-// single pair. This differs from the standard POSIX API, where only the last
-// offset pair for each group is returned.
+int regcomp(regex_t* preg, const char* pattern, int cflags);
+size_t regerror(int errcode, const regex_t* preg, char* errbuf, size_t errbuf_size);
+int regexec(
+ const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags);
+void regfree(regex_t* preg);
+
+// The regparse() function returns parse results as an array of nmatch size, where each element is
+// an array of offset pairs for the corresponding capturing group. If a group has matched repeatedly
+// at different parts of the input string, then its array will contain multiple offset pairs;
+// otherwise, a single pair. This differs from the standard POSIX API, where only the last offset
+// pair for each group is returned.
//
// regparse() can be used if REG_SUBHIST flag has been passed to regcomp().
//
-// The allocation of memory for the parse results is done by the library (the
-// user cannot know how much memory will be needed). The ownership of parse
-// results is transferred to the user, who is expected to deallocate memory
-// allocated by regparse() with regfreesub().
-subhistory_t *regparse(const regex_t *preg, const char *string, size_t nmatch);
+// The allocation of memory for the parse results is done by the library (the user cannot know how
+// much memory will be needed). The ownership of parse results is transferred to the user, who is
+// expected to deallocate memory allocated by regparse() with regfreesub().
+subhistory_t* regparse(const regex_t* preg, const char* string, size_t nmatch);
-// regfreesub() deallocates memory allocated with regparse(). It must be called
-// every time after regparse() is called.
-void regfreesub(subhistory_t *history);
+// regfreesub() deallocates memory allocated with regparse(). It must be called every time after
+// regparse() is called.
+void regfreesub(subhistory_t* history);
// The regtstring() function constructs a t-string.
//
// It can be used if REG_TSTRING flag has been passed to regcomp().
//
-// The goal of this function is to construct parse results in the cheapest
-// possible way. There is no standard representation of a parse tree, and any
-// nontrivial representation incurs significant overhead on construction (the
-// algorithm spends more time constructing parse results than doing the actual
-// parsing). The t-string representation requires very little effort to
-// construct, provided that the t-string fragments corresponding to the tagged
-// paths in the epsilon-closure are prepared in advance at regcomp() time. The
-// fragments are stored as arrays of tags, so that they can be easily copied to
-// the resulting t-string. Importantly, this does not require unwinding of tag
-// history stored in a trie form, which is rather slow.
+// The goal of this function is to construct parse results in the cheapest possible way. There is no
+// standard representation of a parse tree, and any nontrivial representation incurs significant
+// overhead on construction (the algorithm spends more time constructing parse results than doing
+// the actual parsing). The t-string representation requires very little effort to construct,
+// provided that the t-string fragments corresponding to the tagged paths in the epsilon-closure are
+// prepared in advance at regcomp() time. The fragments are stored as arrays of tags, so that they
+// can be easily copied to the resulting t-string. Importantly, this does not require unwinding of
+// tag history stored in a trie form, which is rather slow.
//
-// T-string allocation is done on the library side, and the t-string is stored
-// in the regexp structure. The user gets an immutable view of it and is not
-// expected to free memory after calling regtstring(). Moreover, the library
-// reuses the same memory on each regtstring() call and reallocates it only if
-// the new t-string does not fit into the old memory area. So each call to
+// T-string allocation is done on the library side, and the t-string is stored in the regexp
+// structure. The user gets an immutable view of it and is not expected to free memory after calling
+// regtstring(). Moreover, the library reuses the same memory on each regtstring() call and
+// reallocates it only if the new t-string does not fit into the old memory area. So each call to
// regtstring() invalidates the previous results returned from it.
//
-// The regtstring() function is primarily intended for benchmarking the speed
-// of a parsing algorithm.
-const tstring_t *regtstring(const regex_t *preg, const char *string);
-
+// The regtstring() function is primarily intended for benchmarking the speed of a parsing
+// algorithm.
+const tstring_t* regtstring(const regex_t* preg, const char* string);
#endif // _RE2C_LIB_REGEX_
diff --git a/lib/regex_impl.h b/lib/regex_impl.h
index 7ecfa21b..83ce046d 100644
--- a/lib/regex_impl.h
+++ b/lib/regex_impl.h
@@ -11,54 +11,52 @@
#include "src/dfa/dfa.h"
#include "src/dfa/determinization.h"
#include "src/nfa/nfa.h"
-
+#include "src/util/check.h"
namespace re2c {
namespace libre2c {
-typedef std::vector<tag_info_t> tag_path_t;
+using tag_path_t = std::vector<tag_info_t>;
-struct conf_t
-{
- nfa_state_t *state;
+struct conf_t {
+ TnfaState* state;
uint32_t origin;
int32_t thist;
- inline conf_t(): state(NULL), origin(0), thist(HROOT) {}
- inline conf_t(nfa_state_t *s, uint32_t o, int32_t h)
+ inline conf_t(): state(nullptr), origin(0), thist(HROOT) {}
+ inline conf_t(TnfaState* s, uint32_t o, int32_t h)
: state(s), origin(o), thist(h) {}
- inline conf_t(const conf_t &c, nfa_state_t *s)
+ inline conf_t(const conf_t& c, TnfaState* s)
: state(s), origin(c.origin), thist(c.thist) {}
- inline conf_t(const conf_t &c, nfa_state_t *s, int32_t h)
+ inline conf_t(const conf_t& c, TnfaState* s, int32_t h)
: state(s), origin(c.origin), thist(h) {}
};
-struct ran_or_fin_t
-{
- inline bool operator()(const conf_t &c);
+struct ran_or_fin_t {
+ inline bool operator()(const conf_t& c);
};
-typedef std::vector<conf_t> confset_t;
-typedef confset_t::iterator confiter_t;
-typedef confset_t::const_iterator cconfiter_t;
-typedef confset_t::const_reverse_iterator rcconfiter_t;
+using confset_t = std::vector<conf_t>;
+using confiter_t = confset_t::iterator;
+using cconfiter_t = confset_t::const_iterator;
+using rcconfiter_t = confset_t::const_reverse_iterator;
template<typename history_type_t>
-struct simctx_t
-{
- typedef libre2c::conf_t conf_t;
- typedef std::vector<conf_t> confset_t;
- typedef confset_t::iterator confiter_t;
- typedef confset_t::const_iterator cconfiter_t;
- typedef confset_t::reverse_iterator rconfiter_t;
- typedef confset_t::const_reverse_iterator rcconfiter_t;
- typedef history_type_t history_t;
-
- const nfa_t &nfa;
- const nfa_t *nfa0;
+struct simctx_t {
+ using conf_t = libre2c::conf_t;
+ using confset_t = std::vector<conf_t>;
+ using confiter_t = confset_t::iterator;
+ using cconfiter_t = confset_t::const_iterator;
+ using rconfiter_t = confset_t::reverse_iterator;
+ using rcconfiter_t = confset_t::const_reverse_iterator;
+ using history_t = history_type_t;
+
+ const Tnfa& nfa;
const size_t nsub;
const int flags;
+ const std::vector<Tag>& tags;
+
history_t history;
int32_t hidx;
@@ -66,18 +64,18 @@ struct simctx_t
size_t rule;
- const char *cursor;
- const char *marker;
+ const char* cursor;
+ const char* marker;
- regoff_t *offsets1;
- regoff_t *offsets2;
- regoff_t *offsets3;
- bool *done;
+ regoff_t* offsets1;
+ regoff_t* offsets2;
+ regoff_t* offsets3;
+ bool* done;
- int32_t *newprectbl;
- int32_t *oldprectbl;
+ int32_t* newprectbl;
+ int32_t* oldprectbl;
size_t oldprecdim;
- histleaf_t *histlevel;
+ histleaf_t* histlevel;
std::vector<uint32_t> sortcores;
std::vector<uint32_t> fincount;
std::vector<int32_t> worklist;
@@ -85,21 +83,17 @@ struct simctx_t
confset_t reach;
confset_t state;
- std::vector<nfa_state_t*> gor1_topsort;
- std::vector<nfa_state_t*> gor1_linear;
- std::vector<nfa_state_t*> gtop_heap_storage;
- cmp_gtop_t gtop_cmp;
- gtop_heap_t gtop_heap;
- closure_stats_t dc_clstats;
-
- simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags);
+ std::vector<TnfaState*> gor1_topsort;
+ std::vector<TnfaState*> gor1_linear;
+ closure_stats_t clstats;
+
+ simctx_t(const Tnfa& nfa, size_t re_nsub, int flags);
~simctx_t();
FORBID_COPY(simctx_t);
};
// tag history for lazy disambiguation (both POSIX and leftmost greedy)
-struct zhistory_t
-{
+struct zhistory_t {
struct node_t {
tag_info_t info;
hidx_t pred;
@@ -110,119 +104,86 @@ struct zhistory_t
: info(info), pred(pred), orig(orig), step(step) {}
};
- struct cache_entry_t
- {
+ struct cache_entry_t {
int32_t prec1;
int32_t prec2;
int32_t prec;
};
- typedef std::map<uint64_t, cache_entry_t> cache_t;
+ using cache_t = std::map<uint64_t, cache_entry_t>;
std::vector<node_t> nodes;
cache_t cache;
inline zhistory_t(): nodes(), cache() { init(); }
inline void init();
- inline node_t &node(hidx_t i) { return nodes[static_cast<uint32_t>(i)]; }
- inline const node_t &node(hidx_t i) const { return nodes[static_cast<uint32_t>(i)]; }
- template<typename ctx_t> inline hidx_t link(ctx_t &ctx
- , const typename ctx_t::conf_t &conf);
- template<typename ctx_t> static int32_t precedence(ctx_t &ctx
- , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y
- , int32_t &prec1, int32_t &prec2);
+ inline node_t& node(hidx_t i) { return nodes[static_cast<uint32_t>(i)]; }
+ inline const node_t& node(hidx_t i) const { return nodes[static_cast<uint32_t>(i)]; }
+
+ template<typename ctx_t> inline hidx_t link(ctx_t& ctx, const typename ctx_t::conf_t& conf);
+ template<typename ctx_t> static int32_t precedence(ctx_t& ctx,
+ const typename ctx_t::conf_t& x,
+ const typename ctx_t::conf_t& y,
+ int32_t& prec1,
+ int32_t& prec2);
FORBID_COPY(zhistory_t);
};
-// tag history for Kuklewicz disambiguation (POSIX semantics)
-struct khistory_t
-{
- struct node_t {
- tag_info_t info;
- hidx_t pred;
-
- inline node_t(tag_info_t info, hidx_t pred)
- : info(info), pred(pred) {}
- };
-
- std::vector<node_t> nodes;
- std::vector<int32_t> path1;
- std::vector<int32_t> path2;
-
- inline khistory_t(): nodes(), path1(), path2() { init(); }
- inline void init();
- inline node_t &node(hidx_t i) { return nodes[static_cast<uint32_t>(i)]; }
- inline const node_t &node(hidx_t i) const { return nodes[static_cast<uint32_t>(i)]; }
- template<typename ctx_t> inline hidx_t link(ctx_t &ctx
- , const typename ctx_t::conf_t &conf);
- template<typename ctx_t> static int32_t precedence(ctx_t &ctx
- , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y
- , int32_t &prec1, int32_t &prec2);
- FORBID_COPY(khistory_t);
-};
-
-typedef simctx_t<phistory_t> psimctx_t;
-typedef simctx_t<lhistory_t> lsimctx_t;
-typedef simctx_t<zhistory_t> pzsimctx_t;
-typedef simctx_t<zhistory_t> lzsimctx_t;
-typedef simctx_t<khistory_t> ksimctx_t;
+using psimctx_t = simctx_t<phistory_t>;
+using lsimctx_t = simctx_t<lhistory_t>;
+using pzsimctx_t = simctx_t<zhistory_t>;
+using lzsimctx_t = simctx_t<zhistory_t>;
// regexec functions
-typedef int (regexec_t)(const regex_t*, const char*, size_t, regmatch_t[], int);
-template<bool stadfa> regexec_t regexec_dfa;
-template<typename ctx_t> regexec_t regexec_dfa_regless;
+using regexec_t = int (const regex_t*, const char*, size_t, regmatch_t[], int);
+regexec_t regexec_dfa;
+template<typename ctx_t> regexec_t regexec_dfa_multipass;
regexec_t regexec_nfa_posix;
regexec_t regexec_nfa_posix_trie;
-regexec_t regexec_nfa_posix_backward;
-regexec_t regexec_nfa_posix_kuklewicz;
regexec_t regexec_nfa_leftmost;
regexec_t regexec_nfa_leftmost_trie;
// regparse functions (non-standard)
-typedef subhistory_t*(regparse_t)(const regex_t*, const char*, size_t);
-template<bool stadfa> regparse_t regparse_dfa;
-template<typename ctx_t> regparse_t regparse_dfa_regless;
+using regparse_t = subhistory_t* (const regex_t*, const char*, size_t);
+regparse_t regparse_dfa;
+template<typename ctx_t> regparse_t regparse_dfa_multipass;
// regtstring function (non-standard)
template<typename ctx_t>
-const tstring_t *regtstring_dfa_regless(const regex_t*, const char*);
+const tstring_t* regtstring_dfa_multipass(const regex_t*, const char*);
template<typename history_t>
-simctx_t<history_t>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsub, int flags)
- : nfa(nfa)
- , nfa0(nfa0)
- , nsub(2 * (re_nsub - 1))
- , flags(flags)
- , history()
- , hidx(HROOT)
- , step(0)
- , rule(Rule::NONE)
- , cursor(NULL)
- , marker(NULL)
- , offsets1(NULL)
- , offsets2(NULL)
- , offsets3(NULL)
- , done(NULL)
- , newprectbl(NULL)
- , oldprectbl(NULL)
- , oldprecdim(0)
- , histlevel(NULL)
- , sortcores()
- , fincount()
- , worklist()
- , stateiters()
- , reach()
- , state()
- , gor1_topsort()
- , gor1_linear()
- , gtop_heap_storage()
- , gtop_cmp()
- , gtop_heap(gtop_cmp, gtop_heap_storage)
- , dc_clstats()
-{
+simctx_t<history_t>::simctx_t(const Tnfa& nfa, size_t re_nsub, int flags)
+ : nfa(nfa),
+ nsub(2 * (re_nsub - 1)),
+ flags(flags),
+ tags(nfa.tags),
+ history(),
+ hidx(HROOT),
+ step(0),
+ rule(Rule::NONE),
+ cursor(nullptr),
+ marker(nullptr),
+ offsets1(nullptr),
+ offsets2(nullptr),
+ offsets3(nullptr),
+ done(nullptr),
+ newprectbl(nullptr),
+ oldprectbl(nullptr),
+ oldprecdim(0),
+ histlevel(nullptr),
+ sortcores(),
+ fincount(),
+ worklist(),
+ stateiters(),
+ reach(),
+ state(),
+ gor1_topsort(),
+ gor1_linear(),
+ clstats() {
const size_t
- ntags = nfa.tags.size(),
- nstates = nfa.size,
- ncores = nfa.ncores;
+ ntags = nfa.tags.size(),
+ nstates = nfa.nstates,
+ ncores = nfa.ncores;
state.reserve(nstates);
reach.reserve(nstates);
@@ -235,7 +196,7 @@ simctx_t<history_t>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu
offsets2 = new regoff_t[ntags * ncores];
}
if (!(flags & REG_LEFTMOST) && !(flags & REG_TRIE)) {
- const size_t dim = (flags & REG_KUKLEWICZ) ? ntags : ncores;
+ const size_t dim = ncores;
newprectbl = new int32_t[ncores * dim];
oldprectbl = new int32_t[ncores * dim];
histlevel = new histleaf_t[ncores];
@@ -243,22 +204,13 @@ simctx_t<history_t>::simctx_t(const nfa_t &nfa, const nfa_t *nfa0, size_t re_nsu
fincount.resize(ncores + 1);
worklist.reserve(nstates);
}
- if (flags & REG_KUKLEWICZ) {
- stateiters.reserve(ncores);
- }
- if (flags & REG_GTOP) {
- gtop_heap_storage.reserve(nstates);
- }
- else {
- gor1_topsort.reserve(nstates);
- gor1_linear.reserve(nstates);
- }
+ gor1_topsort.reserve(nstates);
+ gor1_linear.reserve(nstates);
}
template<typename history_t>
-simctx_t<history_t>::~simctx_t()
-{
+simctx_t<history_t>::~simctx_t() {
delete[] done;
delete[] offsets3;
if (!(flags & REG_TRIE)) {
@@ -270,17 +222,10 @@ simctx_t<history_t>::~simctx_t()
delete[] oldprectbl;
delete[] histlevel;
}
- if (flags & REG_BACKWARD) {
- delete &nfa0->charset;
- delete &nfa0->rules;
- delete &nfa0->tags;
- delete nfa0;
- }
}
template<typename history_t>
-void init(simctx_t<history_t> &ctx, const char *string)
-{
+void init(simctx_t<history_t>& ctx, const char* string) {
ctx.reach.clear();
ctx.state.clear();
ctx.history.init();
@@ -289,33 +234,30 @@ void init(simctx_t<history_t> &ctx, const char *string)
ctx.rule = Rule::NONE;
ctx.cursor = ctx.marker = string;
ctx.sortcores.clear();
- DASSERT(ctx.worklist.empty());
- DASSERT(ctx.gor1_topsort.empty());
- DASSERT(ctx.gor1_linear.empty());
- DASSERT(ctx.gtop_heap.empty());
+ DCHECK(ctx.worklist.empty());
+ DCHECK(ctx.gor1_topsort.empty());
+ DCHECK(ctx.gor1_linear.empty());
}
-static inline regoff_t *offs_addr(regmatch_t pmatch[], size_t t)
-{
- regmatch_t *m = &pmatch[t / 2 + 1];
+static inline regoff_t* offs_addr(regmatch_t pmatch[], size_t t) {
+ regmatch_t* m = &pmatch[t / 2 + 1];
return t % 2 == 0 ? &m->rm_so : &m->rm_eo;
}
-struct getoff_nfa_t
-{
- const regoff_t *offsets;
- inline regoff_t operator()(size_t idx) const { return offsets[idx]; }
+struct getoff_nfa_t {
+ const regoff_t* offsets;
+ inline regoff_t operator()(size_t idx) const {
+ return offsets[idx];
+ }
};
-struct getoff_dfa_t
-{
- const dfa_t *dfa;
- const regoff_t *regs;
+struct getoff_dfa_t {
+ const Tdfa* dfa;
+ const regoff_t* regs;
const regoff_t len;
- regoff_t operator()(size_t idx) const
- {
- const Tag &tag = dfa->tags[idx];
+ regoff_t operator()(size_t idx) const {
+ const Tag& tag = dfa->tags[idx];
regoff_t off;
if (!fixed(tag)) {
off = regs[dfa->finvers[idx]];
@@ -330,24 +272,26 @@ struct getoff_dfa_t
};
template<typename getoff_t>
-void tags_to_submatch(const std::vector<Tag> &tags, size_t nmatch,
- regmatch_t pmatch[], regoff_t len, const getoff_t &getoff)
-{
+void tags_to_submatch(const std::vector<Tag>& tags,
+ size_t nmatch,
+ regmatch_t pmatch[],
+ regoff_t len,
+ const getoff_t& getoff) {
const size_t ntags = tags.size();
- regmatch_t *m = pmatch, *e = pmatch + nmatch;
+ regmatch_t* m = pmatch, *e = pmatch + nmatch;
m->rm_so = 0;
m->rm_eo = len;
++m;
for (size_t t = 0; t < ntags && m < e; t += 2) {
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
if (!fictive(tag)) {
const regoff_t so = getoff(t);
const regoff_t eo = getoff(t + 1);
for (size_t j = tag.lsub; j <= tag.hsub && m < e; j += 2, ++m) {
- DASSERT(m - 1 == &pmatch[j / 2]);
+ DCHECK(m - 1 == &pmatch[j / 2]);
m->rm_so = so;
m->rm_eo = eo;
}
@@ -356,36 +300,35 @@ void tags_to_submatch(const std::vector<Tag> &tags, size_t nmatch,
}
template<typename history_t>
-int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
- regmatch_t pmatch[])
-{
+int finalize(const simctx_t<history_t>& ctx,
+ const char* string,
+ size_t nmatch,
+ regmatch_t pmatch[]) {
if (ctx.rule == Rule::NONE) {
return REG_NOMATCH;
}
- const std::vector<Tag> &tags = ctx.nfa.tags;
+ const std::vector<Tag>& tags = ctx.nfa.tags;
const size_t ntags = tags.size();
- bool *done = ctx.done;
+ bool* done = ctx.done;
memset(done, 0, ntags * sizeof(bool));
for (int32_t i = ctx.hidx; i != HROOT; ) {
- const typename history_t::node_t &n = ctx.history.node(i);
+ const typename history_t::node_t& n = ctx.history.node(i);
i = n.pred;
const size_t t = n.info.idx;
// If already updated, skip.
if (done[t]) continue;
- // Update positive tag.
if (!n.info.neg) {
+ // Update positive tag.
done[t] = true;
ctx.offsets3[t] = static_cast<regoff_t>(n.step);
- }
-
- // Update negative tag together with its sibling and nested tags (if any).
- else {
- const Tag &tag = tags[t];
+ } else {
+ // Update negative tag together with its sibling and nested tags (if any).
+ const Tag& tag = tags[t];
for (size_t l = tag.lnest; l < tag.hnest; ++l) {
if (!done[l]) {
done[l] = true;
@@ -402,20 +345,18 @@ int finalize(const simctx_t<history_t> &ctx, const char *string, size_t nmatch,
}
template<typename history_t>
-void update_offsets(simctx_t<history_t> &ctx, const conf_t &c, uint32_t id)
-{
- regoff_t *o;
- const std::vector<Tag> &tags = ctx.nfa.tags;
+void update_offsets(simctx_t<history_t>& ctx, const conf_t& c, uint32_t id) {
+ regoff_t* o;
+ const std::vector<Tag>& tags = ctx.nfa.tags;
const size_t ntags = tags.size();
- nfa_state_t *s = c.state;
- bool *done = ctx.done;
+ TnfaState* s = c.state;
+ bool* done = ctx.done;
- if (s->type == nfa_state_t::FIN) {
+ if (s->kind == TnfaState::Kind::FIN) {
ctx.marker = ctx.cursor;
ctx.rule = 0;
o = ctx.offsets3;
- }
- else {
+ } else {
o = ctx.offsets1 + id * ntags;
}
@@ -423,22 +364,20 @@ void update_offsets(simctx_t<history_t> &ctx, const conf_t &c, uint32_t id)
memset(done, 0, ntags * sizeof(bool));
for (int32_t i = c.thist; i != HROOT; ) {
- const typename history_t::node_t &n = ctx.history.node(i);
+ const typename history_t::node_t& n = ctx.history.node(i);
i = n.pred;
const size_t t = n.info.idx;
// If already updated, skip.
if (done[t]) continue;
- // Update positive tag.
if (!n.info.neg) {
+ // Update positive tag.
done[t] = true;
o[t] = static_cast<regoff_t>(ctx.step);
- }
-
- // Update negative tag together with its sibling and nested tags (if any).
- else {
- const Tag &tag = tags[t];
+ } else {
+ // Update negative tag together with its sibling and nested tags (if any).
+ const Tag& tag = tags[t];
for (size_t l = tag.lnest; l < tag.hnest; ++l) {
if (!done[l]) {
done[l] = true;
@@ -449,38 +388,21 @@ void update_offsets(simctx_t<history_t> &ctx, const conf_t &c, uint32_t id)
}
}
-bool ran_or_fin_t::operator()(const conf_t &c)
-{
- return c.state->type == nfa_state_t::RAN
- || c.state->type == nfa_state_t::FIN;
+bool ran_or_fin_t::operator()(const conf_t& c) {
+ return c.state->kind == TnfaState::Kind::RAN
+ || c.state->kind == TnfaState::Kind::FIN;
}
-void zhistory_t::init()
-{
+void zhistory_t::init() {
nodes.clear();
nodes.push_back(node_t(NOINFO, -1, 0, 0));
cache.clear();
}
-void khistory_t::init()
-{
- nodes.clear();
- nodes.push_back(node_t(NOINFO, -1));
-}
-
-template<typename ctx_t>
-hidx_t zhistory_t::link(ctx_t &ctx, const typename ctx_t::conf_t &conf)
-{
- const int32_t i = static_cast<int32_t>(nodes.size());
- nodes.push_back(node_t(conf.state->tag.info, conf.thist, conf.origin, ctx.step));
- return i;
-}
-
template<typename ctx_t>
-hidx_t khistory_t::link(ctx_t &/* ctx */, const typename ctx_t::conf_t &conf)
-{
+hidx_t zhistory_t::link(ctx_t& ctx, const typename ctx_t::conf_t& conf) {
const int32_t i = static_cast<int32_t>(nodes.size());
- nodes.push_back(node_t(conf.state->tag.info, conf.thist));
+ nodes.push_back(node_t(conf.state->tag, conf.thist, conf.origin, ctx.step));
return i;
}
diff --git a/lib/regexec.cc b/lib/regexec.cc
index 662e3ac5..ba62ed64 100644
--- a/lib/regexec.cc
+++ b/lib/regexec.cc
@@ -1,34 +1,26 @@
-#include <assert.h>
#include <stddef.h>
#include "lib/regex.h"
#include "lib/regex_impl.h"
#include "src/msg/location.h"
-
using namespace re2c;
using namespace re2c::libre2c;
-int regexec(const regex_t *re, const char *string, size_t nmatch,
- regmatch_t pmatch[], int eflags)
-{
+int regexec(const regex_t* re, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags) {
const int cflags = re->flags;
if (!(cflags & REG_NFA)) {
// DFA-based algorithms
- if (cflags & REG_REGLESS) {
- // Registerless TDFA.
+ if (cflags & REG_MULTIPASS) {
+ // Multi-pass TDFA.
if (cflags & REG_LEFTMOST) {
- return regexec_dfa_regless<ldetctx_t>(re, string, nmatch, pmatch, eflags);
+ return regexec_dfa_multipass<ldetctx_t>(re, string, nmatch, pmatch, eflags);
} else {
- return regexec_dfa_regless<pdetctx_t>(re, string, nmatch, pmatch, eflags);
+ return regexec_dfa_multipass<pdetctx_t>(re, string, nmatch, pmatch, eflags);
}
} else {
// TDFA with registers and register operations on transitions.
- if (cflags & REG_STADFA) {
- return regexec_dfa<true>(re, string, nmatch, pmatch, eflags);
- } else {
- return regexec_dfa<false>(re, string, nmatch, pmatch, eflags);
- }
+ return regexec_dfa(re, string, nmatch, pmatch, eflags);
}
} else {
// NFA-based algorithms
@@ -41,11 +33,7 @@ int regexec(const regex_t *re, const char *string, size_t nmatch,
}
} else {
// POSIX disambiguation
- if (cflags & REG_BACKWARD) {
- return regexec_nfa_posix_backward(re, string, nmatch, pmatch, eflags);
- } else if (cflags & REG_KUKLEWICZ) {
- return regexec_nfa_posix_kuklewicz(re, string, nmatch, pmatch, eflags);
- } else if (cflags & REG_TRIE) {
+ if (cflags & REG_TRIE) {
return regexec_nfa_posix_trie(re, string, nmatch, pmatch, eflags);
} else {
return regexec_nfa_posix(re, string, nmatch, pmatch, eflags);
@@ -54,56 +42,49 @@ int regexec(const regex_t *re, const char *string, size_t nmatch,
}
}
-subhistory_t *regparse(const regex_t *re, const char *string, size_t nmatch)
-{
+subhistory_t* regparse(const regex_t* re, const char* string, size_t nmatch) {
const int cflags = re->flags;
- assert(cflags & REG_SUBHIST);
+ CHECK(cflags & REG_SUBHIST);
if (!(cflags & REG_NFA)) {
// DFA-based algorithms
- if (cflags & REG_REGLESS) {
- // Registerless TDFA.
+ if (cflags & REG_MULTIPASS) {
+ // Multi-pass TDFA.
if (cflags & REG_LEFTMOST) {
- return regparse_dfa_regless<ldetctx_t>(re, string, nmatch);
+ return regparse_dfa_multipass<ldetctx_t>(re, string, nmatch);
} else {
- return regparse_dfa_regless<pdetctx_t>(re, string, nmatch);
+ return regparse_dfa_multipass<pdetctx_t>(re, string, nmatch);
}
} else {
// TDFA with registers and register operations on transitions.
- if (cflags & REG_STADFA) {
- return regparse_dfa<true>(re, string, nmatch);
- } else {
- return regparse_dfa<false>(re, string, nmatch);
- }
+ return regparse_dfa(re, string, nmatch);
}
} else {
// NFA-based algorithms (not implemented yet).
- assert(false);
- return NULL;
+ UNREACHABLE();
+ return nullptr;
}
}
-const tstring_t *regtstring(const regex_t *re, const char *string)
-{
+const tstring_t* regtstring(const regex_t* re, const char* string) {
const int cflags = re->flags;
- assert(cflags & REG_TSTRING);
+ CHECK(cflags & REG_TSTRING);
if (!(cflags & REG_NFA)) {
// DFA-based algorithms
- if (cflags & REG_REGLESS) {
- // Registerless TDFA.
+ if (cflags & REG_MULTIPASS) {
+ // Multi-pass TDFA.
if (cflags & REG_LEFTMOST) {
- return regtstring_dfa_regless<ldetctx_t>(re, string);
+ return regtstring_dfa_multipass<ldetctx_t>(re, string);
} else {
- return regtstring_dfa_regless<pdetctx_t>(re, string);
+ return regtstring_dfa_multipass<pdetctx_t>(re, string);
}
} else {
// TDFA with registers is not suited to tstring construction.
- assert(false);
- return NULL;
+ UNREACHABLE();
+ return nullptr;
}
} else {
// NFA-based algorithms (not implemented yet).
- assert(false);
- return NULL;
+ UNREACHABLE();
+ return nullptr;
}
}
-
diff --git a/lib/regexec_dfa.cc b/lib/regexec_dfa.cc
index 6eb4660b..161571d7 100644
--- a/lib/regexec_dfa.cc
+++ b/lib/regexec_dfa.cc
@@ -1,4 +1,3 @@
-#include <assert.h>
#include <stddef.h>
#include <valarray>
#include <vector>
@@ -6,40 +5,33 @@
#include "lib/regex.h"
#include "lib/regex_impl.h"
#include "lib/regoff_trie.h"
-#include "src/debug/debug.h"
#include "src/dfa/dfa.h"
#include "src/dfa/tcmd.h"
#include "src/regexp/rule.h"
#include "src/regexp/tag.h"
-
+#include "src/util/check.h"
namespace re2c {
namespace libre2c {
-static void apply_regops(regoff_t *regs, const tcmd_t *cmd, regoff_t pos)
-{
- for (const tcmd_t *p = cmd; p; p = p->next) {
+static void apply_regops(regoff_t* regs, const tcmd_t* cmd, regoff_t pos) {
+ for (const tcmd_t* p = cmd; p; p = p->next) {
if (tcmd_t::iscopy(p)) {
regs[p->lhs] = regs[p->rhs];
- }
- else {
- DASSERT (tcmd_t::isset(p));
+ } else {
+ DCHECK(tcmd_t::isset(p));
regs[p->lhs] = *p->history == TAGVER_BOTTOM ? -1 : pos;
}
}
}
-template<bool stadfa>
-int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch,
- regmatch_t pmatch[], int /* eflags */)
-{
- const dfa_t *dfa = preg->dfa;
- regoff_t *regs = preg->regs;
+int regexec_dfa(
+ const regex_t* preg, const char* string, size_t nmatch,regmatch_t pmatch[], int /*eflags*/) {
+ const Tdfa* dfa = preg->dfa;
+ regoff_t* regs = preg->regs;
size_t i = 0;
- const char *p = string, *q = p;
- const dfa_state_t *s, *x = NULL;
-
- if (!stadfa) apply_regops(regs, dfa->tcmd0, 0);
+ const char* p = string, *q = p;
+ const TdfaState* s, *x = nullptr;
for (;;) {
s = dfa->states[i];
@@ -52,14 +44,12 @@ int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch,
x = s;
}
- if (stadfa) apply_regops(regs, s->stacmd, p - string - 2);
-
- if (i == dfa_t::NIL || c == 0) break;
+ if (i == Tdfa::NIL || c == 0) break;
- if (!stadfa) apply_regops(regs, s->tcmd[j], p - string - 1);
+ apply_regops(regs, s->tcmd[j], p - string - 1);
}
- if (s->rule == Rule::NONE && x != NULL) {
+ if (s->rule == Rule::NONE && x != nullptr) {
s = x;
p = q;
@@ -78,10 +68,8 @@ int regexec_dfa(const regex_t *preg, const char *string, size_t nmatch,
return 0;
}
-static void apply_regops_with_history(regoff_trie_t *regtrie, const tcmd_t *cmd,
- regoff_t pos)
-{
- for (const tcmd_t *p = cmd; p; p = p->next) {
+static void apply_regops_with_history(regoff_trie_t* regtrie, const tcmd_t* cmd, regoff_t pos) {
+ for (const tcmd_t* p = cmd; p; p = p->next) {
const size_t lhs = static_cast<size_t>(p->lhs);
const size_t rhs = static_cast<size_t>(p->rhs);
@@ -92,7 +80,7 @@ static void apply_regops_with_history(regoff_trie_t *regtrie, const tcmd_t *cmd,
regtrie->set(lhs, *p->history == TAGVER_BOTTOM ? -1 : pos);
} else {
- const tagver_t *h = p->history, *h0;
+ const tagver_t* h = p->history, *h0;
for (h0 = h; *h != TAGVER_ZERO; ++h);
if (lhs != rhs) regtrie->copy(lhs, rhs);
@@ -103,17 +91,14 @@ static void apply_regops_with_history(regoff_trie_t *regtrie, const tcmd_t *cmd,
}
}
-template<bool stadfa>
-subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatch)
-{
- const dfa_t *dfa = preg->dfa;
+subhistory_t* regparse_dfa(const regex_t* preg, const char* string, size_t nmatch) {
+ const Tdfa* dfa = preg->dfa;
size_t i = 0;
- const char *p = string, *q = p;
- const dfa_state_t *s, *x = NULL;
- regoff_trie_t *regtrie = preg->regtrie;
+ const char* p = string, *q = p;
+ const TdfaState* s, *x = nullptr;
+ regoff_trie_t* regtrie = preg->regtrie;
regtrie->clear();
- if (!stadfa) apply_regops_with_history(regtrie, dfa->tcmd0, 0);
for (;;) {
s = dfa->states[i];
@@ -126,11 +111,9 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc
x = s;
}
- if (stadfa) apply_regops_with_history(regtrie, s->stacmd, p - string - 2);
+ if (i == Tdfa::NIL || c == 0) break;
- if (i == dfa_t::NIL || c == 0) break;
-
- if (!stadfa) apply_regops_with_history(regtrie, s->tcmd[j], p - string - 1);
+ apply_regops_with_history(regtrie, s->tcmd[j], p - string - 1);
}
regoff_t mlen;
@@ -138,31 +121,29 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc
// already in final state, apply final tags
mlen = p - string - 1;
apply_regops_with_history(regtrie, s->tcmd[dfa->nchars], mlen);
-
- } else if (x != NULL) {
+ } else if (x != nullptr) {
// rollback to a final state, apply fallback tags
s = x;
p = q;
mlen = p - string - 1;
apply_regops_with_history(regtrie, s->tcmd[dfa->nchars + 1], mlen);
-
} else {
// no final state on the way => no match
- return NULL;
+ return nullptr;
}
- const std::vector<Tag> &tags = dfa->tags;
+ const std::vector<Tag>& tags = dfa->tags;
const size_t ntags = tags.size();
- const tagver_t *finvers = dfa->finvers;
+ const tagver_t* finvers = dfa->finvers;
- const regoff_trie_t::node_t *storage = regtrie->storage;
- const size_t *count = regtrie->count;
- const size_t *lists = regtrie->lists;
+ const regoff_trie_t::node_t* storage = regtrie->storage;
+ const size_t* count = regtrie->count;
+ const size_t* lists = regtrie->lists;
// Find the total number of slots needed for all submatch elements.
size_t rm_count = 1;
for (size_t t = 0; t < ntags; t += 2) {
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
if (!fictive(tag)) {
rm_count += count[finvers[t]] * (1 + (tag.hsub - tag.lsub) / 2);
}
@@ -170,8 +151,8 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc
// The amount of memory (in bytes) needed to store submatch information.
size_t memsize = nmatch * sizeof(subhistory_t) + rm_count * sizeof(regmatch_t);
- subhistory_t *h0 = (subhistory_t*) malloc(memsize), *h = h0, *lasth = h + nmatch;
- regmatch_t *rm0 = (regmatch_t *)(h0 + nmatch), *rm = rm0;
+ subhistory_t* h0 = static_cast<subhistory_t*>(malloc(memsize)), *h = h0, *lasth = h + nmatch;
+ regmatch_t* rm0 = reinterpret_cast<regmatch_t*>(lasth), *rm = rm0;
h->size = 1;
h->offs = rm;
@@ -181,28 +162,27 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc
++h;
for (size_t t = 0; t < ntags && h < lasth; t += 2) {
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
if (fictive(tag)) continue;
- DASSERT(!fixed(tag));
+ DCHECK(!fixed(tag));
const tagver_t f1 = finvers[t], f2 = finvers[t + 1];
const size_t so0 = lists[f1], sz_so = count[f1];
const size_t eo0 = lists[f2];
- assert(sz_so == count[f2]);
+ CHECK(sz_so == count[f2]);
// Submatch groups corresponding to this tag pair: there may be more
// than one capturing parenthesis per tag in regexp like (...(e)...).
- for (size_t s = tag.lsub; s <= tag.hsub && h < lasth; s += 2, ++h) {
- DASSERT(h - 1 == &h0[s / 2]);
+ for (size_t k = tag.lsub; k <= tag.hsub && h < lasth; k += 2, ++h) {
+ DCHECK(h - 1 == &h0[k / 2]);
h->size = sz_so;
h->offs = rm;
rm += sz_so;
for (size_t j = sz_so, so = so0, eo = eo0; j --> 0; ) {
- const regoff_trie_t::node_t &n_so = storage[so], &n_eo = storage[eo];
- const regmatch_t m = {n_so.off, n_eo.off};
- h->offs[j] = m;
+ const regoff_trie_t::node_t& n_so = storage[so], &n_eo = storage[eo];
+ h->offs[j] = {n_so.off, n_eo.off};
so = n_so.pred;
eo = n_eo.pred;
}
@@ -212,12 +192,5 @@ subhistory_t *regparse_dfa(const regex_t *preg, const char *string, size_t nmatc
return h0;
}
-// explicit instantiation for automaton type
-template int regexec_dfa<true>(const regex_t *, const char *, size_t, regmatch_t[], int);
-template int regexec_dfa<false>(const regex_t *, const char *, size_t, regmatch_t[], int);
-template subhistory_t *regparse_dfa<true>(const regex_t *, const char *, size_t);
-template subhistory_t *regparse_dfa<false>(const regex_t *, const char *, size_t);
-
} // namespace libre2c
} // namespace re2c
-
diff --git a/lib/regexec_dfa_regless.cc b/lib/regexec_dfa_multipass.cc
index e1675d3e..d1feee90 100644
--- a/lib/regexec_dfa_regless.cc
+++ b/lib/regexec_dfa_multipass.cc
@@ -1,41 +1,37 @@
-#include <assert.h>
#include <stddef.h>
#include <valarray>
#include <vector>
-#include "lib/regcomp_dfa_regless.h"
+#include "lib/regcomp_dfa_multipass.h"
#include "lib/regex.h"
#include "lib/regex_impl.h"
#include "lib/regoff_trie.h"
-#include "src/debug/debug.h"
#include "src/dfa/dfa.h"
#include "src/dfa/tcmd.h"
#include "src/regexp/rule.h"
#include "src/regexp/tag.h"
-
+#include "src/util/check.h"
namespace re2c {
namespace libre2c {
-// Match RLDFA on the input string, logging backlink arrays on the way.
-// This is the forward pass that is common to all algorithms in this file.
-static rldfa_backlink_t forward_pass(const regex_t *preg, const char *string,
- size_t *matchlen)
-{
- const rldfa_t *rldfa = preg->rldfa;
- const std::vector<rldfa_state_t*> &states = rldfa->states;
- std::vector<const rldfa_backlink_t* const*> &log = rldfa->log;
+// Match RLDFA on the input string, logging backlink arrays on the way. This is the forward pass
+// that is common to all algorithms in this file.
+static MpTdfaBacklink forward_pass(const regex_t* preg, const char* string, size_t* matchlen) {
+ const MpTdfa* mptdfa = preg->mptdfa;
+ const std::vector<MpTdfaState*>& states = mptdfa->states;
+ std::vector<const MpTdfaBacklink* const*>& log = mptdfa->log;
- const char *strptr = string, *finstrptr = strptr;
- rldfa_backlink_t finlink = {NOCONF, HROOT, NULL, 0};
+ const char* strptr = string, *finstrptr = strptr;
+ MpTdfaBacklink finlink = {NOCONF, nullptr, 0};
log.clear();
for (size_t stidx = 0;;) {
const int32_t chr = *strptr++;
const size_t cls = preg->char2class[chr];
- const rldfa_state_t *state = states[stidx];
- const rldfa_arc_t &arc = state->arcs[cls];
+ const MpTdfaState* state = states[stidx];
+ const MpTdfaArc& arc = state->arcs[cls];
stidx = arc.state;
log.push_back(&arc.backlinks);
@@ -45,16 +41,18 @@ static rldfa_backlink_t forward_pass(const regex_t *preg, const char *string,
finstrptr = strptr;
}
- if (stidx == dfa_t::NIL || chr == 0) break;
+ if (stidx == Tdfa::NIL || chr == 0) break;
}
*matchlen = static_cast<size_t>(finstrptr - string) - 1;
return finlink;
}
-static inline void apply_tfrag(const tchar_t *tfrag, size_t tfrag_size,
- regoff_t *result, size_t offset, const std::vector<Tag> &tags)
-{
+static inline void apply_tfrag(const tchar_t* tfrag,
+ size_t tfrag_size,
+ regoff_t* result,
+ size_t offset,
+ const std::vector<Tag>& tags) {
for (size_t i = tfrag_size; i --> 0;) {
size_t t = tfrag[i];
const bool negative = t >= TAG_BASE;
@@ -69,7 +67,7 @@ static inline void apply_tfrag(const tchar_t *tfrag, size_t tfrag_size,
result[t] = static_cast<regoff_t>(offset);
} else {
// Update negative tag together with its sibling and nested tags.
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
for (size_t l = tag.lnest; l < tag.hnest; ++l) {
result[l] = -1;
}
@@ -78,29 +76,30 @@ static inline void apply_tfrag(const tchar_t *tfrag, size_t tfrag_size,
}
template<typename ctx_t>
-int regexec_dfa_regless(const regex_t *preg, const char *string, size_t nmatch,
- regmatch_t pmatch[], int /* eflags */)
-{
+int regexec_dfa_multipass(const regex_t* preg,
+ const char* string,
+ size_t nmatch,
+ regmatch_t pmatch[],
+ int /* eflags */) {
// Forward pass.
size_t matchlen;
- rldfa_backlink_t finlink = forward_pass(preg, string, &matchlen);
+ MpTdfaBacklink finlink = forward_pass(preg, string, &matchlen);
// No final state has been encountered on the way; return no-match.
if (finlink.conf == NOCONF) return REG_NOMATCH;
- const rldfa_t *rldfa = preg->rldfa;
- ctx_t &ctx = *static_cast<ctx_t*>(rldfa->ctx);
- const std::vector<Tag> &tags = ctx.dfa.tags;
+ const MpTdfa* mptdfa = preg->mptdfa;
+ const std::vector<Tag>& tags = mptdfa->tags;
const size_t ntags = tags.size();
- // It is necessary to initialize tag offsets to prevent overwriting the
- // latest tag value when unwinding tag history.
- regoff_t *result = rldfa->result;
+ // It is necessary to initialize tag offsets to prevent overwriting the latest tag value when
+ // unwinding tag history.
+ regoff_t* result = mptdfa->result;
std::fill(result, result + ntags, NORESULT);
// Unwind tag history back from the final RLDFA state to the initial state.
- std::vector<const rldfa_backlink_t* const*> &log = rldfa->log;
- rldfa_backlink_t link = finlink;
+ std::vector<const MpTdfaBacklink* const*>& log = mptdfa->log;
+ MpTdfaBacklink link = finlink;
for (size_t offset = matchlen;;) {
apply_tfrag(link.tfrag, link.tfrag_size, result, offset, tags);
@@ -111,21 +110,21 @@ int regexec_dfa_regless(const regex_t *preg, const char *string, size_t nmatch,
}
// Copy tag offsets to submatch results in the pmatch[] array.
- regmatch_t *match = pmatch, *lastmatch = pmatch + nmatch;
+ regmatch_t* match = pmatch, *lastmatch = pmatch + nmatch;
match->rm_so = 0;
match->rm_eo = static_cast<regoff_t>(matchlen);
++match;
for (size_t t = 0; t < ntags && match < lastmatch; t += 2) {
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
if (fictive(tag)) continue;
const regoff_t so = result[t], eo = result[t + 1];
- // Submatch groups corresponding to this tag pair: there may be more
- // than one capturing parenthesis per tag in regexp like (...(e)...).
+ // Submatch groups corresponding to this tag pair: there may be more than one capturing
+ // parenthesis per tag in regexp like (...(e)...).
for (size_t s = tag.lsub; s <= tag.hsub && match < lastmatch; s += 2, ++match) {
- DASSERT(match - 1 == &pmatch[s / 2]);
+ DCHECK(match - 1 == &pmatch[s / 2]);
match->rm_so = so;
match->rm_eo = eo;
}
@@ -134,9 +133,11 @@ int regexec_dfa_regless(const regex_t *preg, const char *string, size_t nmatch,
return 0;
}
-static inline void apply_tfrag_subhist(const tchar_t *tfrag, size_t tfrag_size,
- regoff_trie_t *regtrie, size_t offset, const std::vector<Tag> &tags)
-{
+static inline void apply_tfrag_subhist(const tchar_t* tfrag,
+ size_t tfrag_size,
+ regoff_trie_t* regtrie,
+ size_t offset,
+ const std::vector<Tag>& tags) {
for (size_t i = tfrag_size; i --> 0;) {
const size_t t = tfrag[i];
@@ -146,7 +147,7 @@ static inline void apply_tfrag_subhist(const tchar_t *tfrag, size_t tfrag_size,
regtrie->add(t, static_cast<regoff_t>(offset));
} else {
// Update negative tag together with its sibling and nested tags.
- const Tag &tag = tags[t - TAG_BASE];
+ const Tag& tag = tags[t - TAG_BASE];
for (size_t l = tag.lnest; l < tag.hnest; ++l) {
regtrie->add(l, -1);
}
@@ -155,24 +156,22 @@ static inline void apply_tfrag_subhist(const tchar_t *tfrag, size_t tfrag_size,
}
template<typename ctx_t>
-subhistory_t *regparse_dfa_regless(const regex_t *preg, const char *string, size_t nmatch)
-{
+subhistory_t* regparse_dfa_multipass(const regex_t* preg, const char* string, size_t nmatch) {
// Forward pass.
size_t matchlen;
- rldfa_backlink_t finlink = forward_pass(preg, string, &matchlen);
+ MpTdfaBacklink finlink = forward_pass(preg, string, &matchlen);
// No final state has been encountered on the way; return no-match.
- if (finlink.conf == NOCONF) return NULL;
+ if (finlink.conf == NOCONF) return nullptr;
- const rldfa_t *rldfa = preg->rldfa;
- ctx_t &ctx = *static_cast<ctx_t*>(rldfa->ctx);
- const std::vector<Tag> &tags = ctx.dfa.tags;
+ const MpTdfa* mptdfa = preg->mptdfa;
+ const std::vector<Tag>& tags = mptdfa->tags;
const size_t ntags = tags.size();
- regoff_trie_t *regtrie = preg->regtrie;
+ regoff_trie_t* regtrie = preg->regtrie;
// Unwind tag history back from the final RLDFA state to the initial state.
- std::vector<const rldfa_backlink_t* const*> &log = rldfa->log;
- rldfa_backlink_t link = finlink;
+ std::vector<const MpTdfaBacklink* const*>& log = mptdfa->log;
+ MpTdfaBacklink link = finlink;
regtrie->clear();
for (size_t offset = matchlen;;) {
apply_tfrag_subhist(link.tfrag, link.tfrag_size, regtrie, offset, tags);
@@ -183,14 +182,14 @@ subhistory_t *regparse_dfa_regless(const regex_t *preg, const char *string, size
link = (*log[offset])[link.conf];
}
- const regoff_trie_t::node_t *storage = regtrie->storage;
- const size_t *count = regtrie->count;
- const size_t *lists = regtrie->lists;
+ const regoff_trie_t::node_t* storage = regtrie->storage;
+ const size_t* count = regtrie->count;
+ const size_t* lists = regtrie->lists;
// Find the total number of slots needed for all submatch elements.
size_t rm_count = 1;
for (size_t t = 0; t < ntags; t += 2) {
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
if (!fictive(tag)) {
rm_count += count[t] * (1 + (tag.hsub - tag.lsub) / 2);
}
@@ -198,8 +197,8 @@ subhistory_t *regparse_dfa_regless(const regex_t *preg, const char *string, size
// The amount of memory (in bytes) needed to store submatch information.
size_t memsize = nmatch * sizeof(subhistory_t) + rm_count * sizeof(regmatch_t);
- subhistory_t *h0 = (subhistory_t*) malloc(memsize), *h = h0, *lasth = h + nmatch;
- regmatch_t *rm0 = (regmatch_t *)(h0 + nmatch), *rm = rm0;
+ subhistory_t* h0 = static_cast<subhistory_t*>(malloc(memsize)), *h = h0, *lasth = h + nmatch;
+ regmatch_t* rm0 = reinterpret_cast<regmatch_t*>(lasth), *rm = rm0;
h->size = 1;
h->offs = rm;
@@ -209,26 +208,25 @@ subhistory_t *regparse_dfa_regless(const regex_t *preg, const char *string, size
++h;
for (size_t t = 0; t < ntags && h < lasth; t += 2) {
- const Tag &tag = tags[t];
+ const Tag& tag = tags[t];
if (fictive(tag)) continue;
const size_t so0 = lists[t], sz_so = count[t];
const size_t eo0 = lists[t + 1];
- assert(sz_so == count[t + 1]);
+ CHECK(sz_so == count[t + 1]);
- // Submatch groups corresponding to this tag pair: there may be more
- // than one capturing parenthesis per tag in regexp like (...(e)...).
+ // Submatch groups corresponding to this tag pair: there may be more than one capturing
+ // parenthesis per tag in regexp like (...(e)...).
for (size_t s = tag.lsub; s <= tag.hsub && h < lasth; s += 2, ++h) {
- DASSERT(h - 1 == &h0[s / 2]);
+ DCHECK(h - 1 == &h0[s / 2]);
h->size = sz_so;
h->offs = rm;
rm += sz_so;
for (size_t j = 0, so = so0, eo = eo0; j < sz_so; ++j) {
- const regoff_trie_t::node_t &n_so = storage[so], &n_eo = storage[eo];
- const regmatch_t m = {n_so.off, n_eo.off};
- h->offs[j] = m;
+ const regoff_trie_t::node_t& n_so = storage[so], &n_eo = storage[eo];
+ h->offs[j] = {n_so.off, n_eo.off};
so = n_so.pred;
eo = n_eo.pred;
}
@@ -239,18 +237,17 @@ subhistory_t *regparse_dfa_regless(const regex_t *preg, const char *string, size
}
template<typename ctx_t>
-const tstring_t *regtstring_dfa_regless(const regex_t *preg, const char *string)
-{
+const tstring_t* regtstring_dfa_multipass(const regex_t* preg, const char* string) {
// Forward pass.
size_t matchlen;
- rldfa_backlink_t finlink = forward_pass(preg, string, &matchlen);
+ MpTdfaBacklink finlink = forward_pass(preg, string, &matchlen);
// No final state has been encountered on the way; return no-match.
- if (finlink.conf == NOCONF) return NULL;
+ if (finlink.conf == NOCONF) return nullptr;
- const rldfa_t *rldfa = preg->rldfa;
- std::vector<const rldfa_backlink_t* const*> &log = rldfa->log;
- rldfa_backlink_t link = finlink;
+ const MpTdfa* mptdfa = preg->mptdfa;
+ std::vector<const MpTdfaBacklink* const*>& log = mptdfa->log;
+ MpTdfaBacklink link = finlink;
// Calculate the length of the resulting t-string.
size_t len = matchlen;
@@ -262,10 +259,10 @@ const tstring_t *regtstring_dfa_regless(const regex_t *preg, const char *string)
link = (*log[offset])[link.conf];
}
- len += 2; // tags for the outermost capture that wraps the whole RE
+ len += 2; // tags for the outermost capture that wraps the whole regexp
len += 1; // terminating NULL
- tstring_t *tstr = &preg->tstring;
+ tstring_t* tstr = &preg->tstring;
if (tstr->capacity <= len) {
tstr->capacity = len * 2;
delete[] tstr->string;
@@ -273,7 +270,7 @@ const tstring_t *regtstring_dfa_regless(const regex_t *preg, const char *string)
}
tstr->length = len;
- tchar_t *s = tstr->string + len;
+ tchar_t* s = tstr->string + len;
*--s = 0; // terminating NULL
*--s = TAG_BASE + 2; // outermost closing parenthesis
@@ -296,17 +293,14 @@ const tstring_t *regtstring_dfa_regless(const regex_t *preg, const char *string)
}
// explicit instantiation for disambiguation policy (leftmost greedy / POSIX)
-template int regexec_dfa_regless<ldetctx_t>(const regex_t*, const char*, size_t,
- regmatch_t[], int);
-template int regexec_dfa_regless<pdetctx_t>(const regex_t*, const char*, size_t,
- regmatch_t[], int);
-template subhistory_t* regparse_dfa_regless<ldetctx_t>(const regex_t*, const char*,
- size_t);
-template subhistory_t* regparse_dfa_regless<pdetctx_t>(const regex_t*, const char*,
- size_t);
-template const tstring_t* regtstring_dfa_regless<ldetctx_t>(const regex_t*, const char*);
-template const tstring_t* regtstring_dfa_regless<pdetctx_t>(const regex_t*, const char*);
+template int regexec_dfa_multipass<ldetctx_t>(
+ const regex_t*, const char*, size_t, regmatch_t[], int);
+template int regexec_dfa_multipass<pdetctx_t>(
+ const regex_t*, const char*, size_t, regmatch_t[], int);
+template subhistory_t* regparse_dfa_multipass<ldetctx_t>(const regex_t*, const char*, size_t);
+template subhistory_t* regparse_dfa_multipass<pdetctx_t>(const regex_t*, const char*, size_t);
+template const tstring_t* regtstring_dfa_multipass<ldetctx_t>(const regex_t*, const char*);
+template const tstring_t* regtstring_dfa_multipass<pdetctx_t>(const regex_t*, const char*);
} // namespace libre2c
} // namespace re2c
-
diff --git a/lib/regexec_nfa_leftmost.cc b/lib/regexec_nfa_leftmost.cc
index 7c3af7b7..0245b37d 100644
--- a/lib/regexec_nfa_leftmost.cc
+++ b/lib/regexec_nfa_leftmost.cc
@@ -5,23 +5,21 @@
#include "lib/regex.h"
#include "lib/regex_impl.h"
-#include "src/debug/debug.h"
#include "src/dfa/closure_leftmost.h"
#include "src/dfa/tag_history.h"
#include "src/nfa/nfa.h"
#include "src/regexp/rule.h"
+#include "src/util/check.h"
#include "src/util/range.h"
-
namespace re2c {
namespace libre2c {
-static void reach_on_symbol(lsimctx_t &, uint32_t);
+static void reach_on_symbol(lsimctx_t&, uint32_t);
-int regexec_nfa_leftmost(const regex_t *preg, const char *string
- , size_t nmatch, regmatch_t pmatch[], int)
-{
- lsimctx_t &ctx = *static_cast<lsimctx_t*>(preg->simctx);
+int regexec_nfa_leftmost(
+ const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int) {
+ lsimctx_t& ctx = *static_cast<lsimctx_t*>(preg->simctx);
init(ctx, string);
// root state can be non-core, so we pass zero as origin to avoid checks
@@ -36,9 +34,9 @@ int regexec_nfa_leftmost(const regex_t *preg, const char *string
}
for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
- if (s->type == nfa_state_t::FIN) {
+ if (s->kind == TnfaState::Kind::FIN) {
update_offsets(ctx, *i, NONCORE);
}
}
@@ -53,30 +51,28 @@ int regexec_nfa_leftmost(const regex_t *preg, const char *string
return 0;
}
-void reach_on_symbol(lsimctx_t &ctx, uint32_t sym)
-{
- const confset_t &state = ctx.state;
- confset_t &reach = ctx.reach;
- DASSERT(reach.empty());
+void reach_on_symbol(lsimctx_t& ctx, uint32_t sym) {
+ const confset_t& state = ctx.state;
+ confset_t& reach = ctx.reach;
+ DCHECK(reach.empty());
// in reverse, so that future closure DFS has states in stack order
uint32_t j = 0;
for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
+ if (s->kind == TnfaState::Kind::RAN) {
+ for (const Range* r = s->ran; r; r = r->next()) {
if (r->lower() <= sym && sym < r->upper()) {
- conf_t c(s->ran.out, j, HROOT);
+ conf_t c(s->out1, j, HROOT);
reach.push_back(c);
update_offsets(ctx, *i, j);
++j;
break;
}
}
- }
- else if (s->type == nfa_state_t::FIN) {
+ } else if (s->kind == TnfaState::Kind::FIN) {
update_offsets(ctx, *i, NONCORE);
}
}
@@ -88,4 +84,3 @@ void reach_on_symbol(lsimctx_t &ctx, uint32_t sym)
} // namespace libre2
} // namespace re2c
-
diff --git a/lib/regexec_nfa_leftmost_trie.cc b/lib/regexec_nfa_leftmost_trie.cc
index 37d9cc56..f6f7e96c 100644
--- a/lib/regexec_nfa_leftmost_trie.cc
+++ b/lib/regexec_nfa_leftmost_trie.cc
@@ -4,23 +4,21 @@
#include "lib/regex.h"
#include "lib/regex_impl.h"
-#include "src/debug/debug.h"
#include "src/dfa/closure_leftmost.h"
#include "src/dfa/tag_history.h"
#include "src/nfa/nfa.h"
+#include "src/util/check.h"
#include "src/util/range.h"
-
namespace re2c {
namespace libre2c {
-static void make_step(lzsimctx_t &, uint32_t);
-static void make_final_step(lzsimctx_t &ctx);
+static void make_step(lzsimctx_t&, uint32_t);
+static void make_final_step(lzsimctx_t& ctx);
-int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string
- , size_t nmatch, regmatch_t pmatch[], int)
-{
- lzsimctx_t &ctx = *static_cast<lzsimctx_t*>(preg->simctx);
+int regexec_nfa_leftmost_trie(
+ const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int) {
+ lzsimctx_t& ctx = *static_cast<lzsimctx_t*>(preg->simctx);
init(ctx, string);
const conf_t c0(ctx.nfa.root, 0/* unused */, HROOT);
@@ -37,31 +35,29 @@ int regexec_nfa_leftmost_trie(const regex_t *preg, const char *string
return finalize(ctx, string, nmatch, pmatch);
}
-void make_step(lzsimctx_t &ctx, uint32_t sym)
-{
- const confset_t &state = ctx.state;
- confset_t &reach = ctx.reach;
+void make_step(lzsimctx_t& ctx, uint32_t sym) {
+ const confset_t& state = ctx.state;
+ confset_t& reach = ctx.reach;
// in reverse, so that future closure DFS has states in stack order
rcconfiter_t b = state.rbegin(), e = state.rend(), i;
- DASSERT(reach.empty());
+ DCHECK(reach.empty());
for (i = b; i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
// cleanup from previous closure
s->clos = NOCLOS;
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
+ if (s->kind == TnfaState::Kind::RAN) {
+ for (const Range* r = s->ran; r; r = r->next()) {
if (r->lower() <= sym && sym < r->upper()) {
- const conf_t c(s->ran.out, 0/* unused */, i->thist);
+ const conf_t c(s->out1, 0/* unused */, i->thist);
reach.push_back(c);
break;
}
}
- }
- else if (s->type == nfa_state_t::FIN) {
+ } else if (s->kind == TnfaState::Kind::FIN) {
ctx.marker = ctx.cursor;
ctx.hidx = i->thist;
ctx.rule = 0;
@@ -71,15 +67,14 @@ void make_step(lzsimctx_t &ctx, uint32_t sym)
++ctx.step;
}
-void make_final_step(lzsimctx_t &ctx)
-{
+void make_final_step(lzsimctx_t& ctx) {
for (confiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
- DASSERT(s->active == 0);
+ DCHECK(s->active == 0);
- if (s->type == nfa_state_t::FIN) {
+ if (s->kind == TnfaState::Kind::FIN) {
ctx.marker = ctx.cursor;
ctx.hidx = i->thist;
ctx.rule = 0;
@@ -89,4 +84,3 @@ void make_final_step(lzsimctx_t &ctx)
} // namespace libre2
} // namespace re2c
-
diff --git a/lib/regexec_nfa_posix.cc b/lib/regexec_nfa_posix.cc
index e31b5005..6eeb2775 100644
--- a/lib/regexec_nfa_posix.cc
+++ b/lib/regexec_nfa_posix.cc
@@ -2,32 +2,32 @@
#include <algorithm>
#include <utility>
#include <vector>
-#include <assert.h>
#include "lib/regex.h"
#include "lib/regex_impl.h"
-#include "src/debug/debug.h"
#include "src/dfa/closure_posix.h"
#include "src/dfa/posix_precedence.h"
#include "src/dfa/tag_history.h"
#include "src/nfa/nfa.h"
#include "src/regexp/rule.h"
+#include "src/util/check.h"
#include "src/util/range.h"
-
namespace re2c {
namespace libre2c {
-static void make_one_step(psimctx_t &, uint32_t);
-static void make_final_step(psimctx_t &);
+static void make_one_step(psimctx_t&, uint32_t);
+static void make_final_step(psimctx_t&);
// we *do* want these to be inlined
-static inline void closure_posix(psimctx_t &ctx);
-
-int regexec_nfa_posix(const regex_t *preg, const char *string
- , size_t nmatch, regmatch_t pmatch[], int /* eflags */)
-{
- psimctx_t &ctx = *static_cast<psimctx_t*>(preg->simctx);
+static inline void closure_posix(psimctx_t& ctx);
+
+int regexec_nfa_posix(const regex_t* preg,
+ const char* string,
+ size_t nmatch,
+ regmatch_t pmatch[],
+ int /*eflags*/) {
+ psimctx_t& ctx = *static_cast<psimctx_t*>(preg->simctx);
init(ctx, string);
// root state can be non-core, so we pass zero as origin to avoid checks
@@ -51,35 +51,27 @@ int regexec_nfa_posix(const regex_t *preg, const char *string
return 0;
}
-void closure_posix(psimctx_t &ctx)
-{
+void closure_posix(psimctx_t& ctx) {
ctx.history.init();
-
- if (ctx.flags & REG_GTOP) {
- closure_posix_gtop(ctx);
- }
- else {
- closure_posix_gor1(ctx);
- }
+ closure_posix_gor1(ctx);
}
-void make_one_step(psimctx_t &ctx, uint32_t sym)
-{
- confset_t &state = ctx.state, &reach = ctx.reach;
+void make_one_step(psimctx_t& ctx, uint32_t sym) {
+ confset_t& state = ctx.state, &reach = ctx.reach;
uint32_t j = 0;
reach.clear();
for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
+ DCHECK(s->status == GOR_NOPASS && s->active == 0);
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
+ if (s->kind == TnfaState::Kind::RAN) {
+ for (const Range* r = s->ran; r; r = r->next()) {
if (r->lower() <= sym && sym < r->upper()) {
- const conf_t c(s->ran.out, j, HROOT);
+ const conf_t c(s->out1, j, HROOT);
reach.push_back(c);
state[j] = *i;
update_offsets(ctx, *i, j);
@@ -87,8 +79,7 @@ void make_one_step(psimctx_t &ctx, uint32_t sym)
break;
}
}
- }
- else if (s->type == nfa_state_t::FIN) {
+ } else if (s->kind == TnfaState::Kind::FIN) {
update_offsets(ctx, *i, NONCORE);
}
}
@@ -98,8 +89,7 @@ void make_one_step(psimctx_t &ctx, uint32_t sym)
if (!(ctx.flags & REG_SLOWPREC)) {
compute_prectable_complex(ctx);
- }
- else {
+ } else {
compute_prectable_naive(ctx);
}
std::swap(ctx.newprectbl, ctx.oldprectbl);
@@ -109,16 +99,15 @@ void make_one_step(psimctx_t &ctx, uint32_t sym)
++ctx.step;
}
-void make_final_step(psimctx_t &ctx)
-{
+void make_final_step(psimctx_t& ctx) {
for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
+ DCHECK(s->status == GOR_NOPASS && s->active == 0);
- if (s->type == nfa_state_t::FIN) {
+ if (s->kind == TnfaState::Kind::FIN) {
update_offsets(ctx, *i, NONCORE);
}
}
@@ -126,4 +115,3 @@ void make_final_step(psimctx_t &ctx)
} // namespace libre2c
} // namespace re2c
-
diff --git a/lib/regexec_nfa_posix_backward.cc b/lib/regexec_nfa_posix_backward.cc
deleted file mode 100644
index 3a953ac8..00000000
--- a/lib/regexec_nfa_posix_backward.cc
+++ /dev/null
@@ -1,632 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <algorithm>
-#include <vector>
-
-#include "lib/regex.h"
-#include "lib/regex_impl.h"
-#include "src/debug/debug.h"
-#include "src/dfa/closure_posix.h"
-#include "src/dfa/tag_history.h"
-#include "src/nfa/nfa.h"
-#include "src/regexp/rule.h"
-#include "src/regexp/tag.h"
-#include "src/util/range.h"
-
-
-namespace re2c {
-namespace libre2c {
-
-/*
- * This is a quick-and-dirty implementation of backward matching algorithm
- * suggested by Cox. The algorithm is incorrect, but it is surprisingly
- * close to correct, and we keep it around for research purposes (in case
- * we need to confirm its incorrectness in the light of some future findings).
- *
- * The algorithm attempts to find POSIX longest match by going from right
- * to left and maximizing the length of the latest iteration of each
- * capturing group, similar to the way Laurikari algorithm minimizez or
- * maximizes the most recent value of each tag.
- *
- * The algorithm also has to remember the match on the last iteration, so
- * it has to carry around twice as much data for offssets. Offsets are
- * bound to configurations: for each configuration, there is a "scratch"
- * offset pair, an "accepted" offset pair and also a "backup" offset pair
- * that is used only during closure initialization. And double all that,
- * as we keep offsets of both last and most recent iterations. All this
- * means a lot of copying, so unsurprisingly the algorithm is very slow
- * if RE has many submatch groups. Some copying can be reduced, but not all:
- * SSSP does not (in general) proceed in stack order, therefore the
- * previous scanned configuration might not be related at all to the next.
- * GOR1's topsort phase is DFS though, so it can be made to use a single
- * scratch buffer for offsets (that is updated on DFS recursive enter and
- * restored on DFS recursive return).
- *
- * The algorithm is incorrect because it sometimes compares ambiguous
- * paths too late, when the difference between them has been overwritten
- * with more recent offsets. This may happen when two conditions are met:
- *
- * (1) the most recent iteration of both paths matches the same substring,
- * so that the offsets on this iteration are equal
- *
- * (2) paths are compared for the first time *after* the offsets have been
- * updated and have become indistinguishable
- *
- * The algorithm would work if such paths had a join point somewhere
- * before the problematic iteration, and were properly disambiguated at that
- * point. The algorithm may fail to do so in two cases:
- *
- * (A) With bounded repetition, each repetition duplicated in is a separate
- * automaton, and paths may not meet until the final join point (because
- * at each step they end in different subautomata). This depends on how
- * we unroll bounded repetition. An example of this is (aaaa|aaa|a){3,4}
- * and paths X = (aaaa)(aaaa)(a)(a) and Y = (aaaa)(aaa)(aaa), provided
- * that we unroll bounded repetition by making the last iteration
- * optional and the first three non-optional (the normal way we unroll
- * it). Note that the paths have different number of iterations (4 and 3)
- * which is why they don't meet until the very end.
- *
- * (B) Even if there is a join point, due to a particular order of closure
- * traversal paths may need to be compared *after* the join point (at
- * the time when offsets are already overwritten with new ones, making
- * the paths indistinguishable). This may happen if the order of closure
- * traversal is not topological (for example, when we go in depth-first
- * order, start from some unfortunate initial state and propagate the
- * wrong path to other states, then start from another initial state and
- * need to propagate improvement to all those states).
- *
- */
-
-static void make_one_step_simple(psimctx_t &, uint32_t);
-static void make_one_step(psimctx_t &, uint32_t);
-static void make_final_step(psimctx_t &);
-static void update_final_offsets(psimctx_t &ctx, const conf_t &c);
-static void closure_simple(psimctx_t &ctx);
-static void relax_gtop(psimctx_t &ctx, const psimctx_t::conf_t &c);
-static void closure_posix_gtop(psimctx_t &ctx);
-static bool scan(psimctx_t &ctx, nfa_state_t *q, bool all);
-static bool relax_gor1(psimctx_t &ctx, const psimctx_t::conf_t &c);
-static void closure_posix_gor1(psimctx_t &ctx);
-static inline void closure_posix(psimctx_t &ctx);
-static inline uint32_t index(const nfa_state_t *s, const nfa_t &nfa);
-static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x, tag_info_t info);
-static inline void accept_offsets(psimctx_t &ctx, const nfa_state_t *s);
-
-// debug
-static int D = 0;
-static void prtoff(psimctx_t &ctx, uint32_t x, bool newer);
-static inline void prtoff4(psimctx_t &ctx, uint32_t x);
-static inline void prtoff5(psimctx_t &ctx, uint32_t x);
-
-static regoff_t *offsets4 = NULL;
-static regoff_t *offsets5 = NULL;
-static regoff_t *offsets6 = NULL;
-
-int regexec_nfa_posix_backward(const regex_t *preg, const char *string
- , size_t nmatch, regmatch_t pmatch[], int /* eflags */)
-{
- psimctx_t &ctx = *static_cast<psimctx_t*>(preg->simctx);
- init(ctx, string);
-
- // 1st pass, forward: find longest match on a simple NFA
-
- ctx.reach.push_back(conf_t(ctx.nfa0->root, 0, 0));
- for (;;) {
- closure_simple(ctx);
- const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
- if (ctx.state.empty() || sym == 0) break;
- make_one_step_simple(ctx, sym);
- }
-
- for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
- s->clos = NOCLOS;
- if (s->type == nfa_state_t::FIN) {
- ctx.marker = ctx.cursor;
- ctx.rule = 0;
- }
- }
-
- if (ctx.rule == Rule::NONE) {
- return REG_NOMATCH;
- }
-
- // 2nd pass, backward: find submatches
-
- const uint32_t len = static_cast<uint32_t>(ctx.marker - string) - 1;
- init(ctx, string);
- ctx.step = len;
- ctx.cursor = ctx.marker = string + len;
-
- const std::vector<Tag> &tags = ctx.nfa.tags;
- const size_t sz = tags.size() * ctx.nfa.size * 2;
- offsets4 = new regoff_t[sz];
- offsets5 = new regoff_t[sz];
- offsets6 = new regoff_t[sz];
-
- std::fill(offsets4, offsets4 + sz, -2);
- std::fill(offsets5, offsets5 + sz, -2);
-
- ctx.reach.push_back(conf_t(ctx.nfa.root, 0, 0));
- for (;;) {
- closure_posix(ctx);
- if (ctx.state.empty() || ctx.cursor == string) break;
- const uint32_t sym = static_cast<uint8_t>(*--ctx.cursor);
- make_one_step(ctx, sym);
- --ctx.step;
- }
- make_final_step(ctx);
-
- delete[] offsets6;
- delete[] offsets5;
- delete[] offsets4;
-
- if (ctx.rule == Rule::NONE) {
- return REG_NOMATCH;
- }
-
- const getoff_nfa_t fn = { ctx.offsets3 };
- tags_to_submatch(tags, nmatch, pmatch, static_cast<regoff_t>(len), fn);
-
- return 0;
-}
-
-static uint32_t index(const nfa_state_t *s, const nfa_t &nfa)
-{
- return static_cast<uint32_t>(s - nfa.states);
-}
-
-void closure_simple(psimctx_t &ctx)
-{
- psimctx_t::confset_t &state = ctx.state, &stack = ctx.reach;
- state.clear();
-
- for (; !stack.empty(); ) {
- const psimctx_t::conf_t x = stack.back();
- stack.pop_back();
- nfa_state_t *n = x.state;
-
- if (n->clos != NOCLOS) continue;
-
- n->clos = static_cast<uint32_t>(state.size());
- state.push_back(x);
-
- switch (n->type) {
- case nfa_state_t::ALT:
- stack.push_back(psimctx_t::conf_t(x, n->alt.out2));
- stack.push_back(psimctx_t::conf_t(x, n->alt.out1));
- break;
- case nfa_state_t::TAG:
- stack.push_back(psimctx_t::conf_t(x, n->tag.out, 0));
- break;
- case nfa_state_t::RAN:
- case nfa_state_t::FIN:
- break;
- }
- }
-}
-
-void make_one_step_simple(psimctx_t &ctx, uint32_t sym)
-{
- const confset_t &state = ctx.state;
- confset_t &reach = ctx.reach;
- DASSERT(reach.empty());
-
- for (rcconfiter_t i = state.rbegin(), e = state.rend(); i != e; ++i) {
- nfa_state_t *s = i->state;
- s->clos = NOCLOS;
-
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
- if (r->lower() <= sym && sym < r->upper()) {
- conf_t c(s->ran.out, 0, 0);
- reach.push_back(c);
- break;
- }
- }
- }
- else if (s->type == nfa_state_t::FIN) {
- ctx.marker = ctx.cursor;
- ctx.rule = 0;
- }
- }
-
- ++ctx.step;
-}
-
-void closure_posix(psimctx_t &ctx)
-{
- if (ctx.flags & REG_GTOP) {
- closure_posix_gtop(ctx);
- }
- else {
- closure_posix_gor1(ctx);
- }
-}
-
-static int32_t precedence(psimctx_t &ctx, const conf_t &x, const conf_t &y)
-{
- DASSERT(x.state == y.state); (void)y;
- const uint32_t idx = index(x.state, ctx.nfa);
-
- const size_t ntags = ctx.nfa.tags.size();
- const regoff_t *ox = offsets5 + ntags * idx * 2;
- const regoff_t *oy = offsets4 + ntags * idx * 2;
-
- prtoff5(ctx, idx);
- prtoff4(ctx, idx);
- if (D) fprintf(stderr, "prec %u: ", idx);
-
- for (size_t t = 0; t < ctx.nfa.tags.size(); t += 2) {
- const regoff_t offx = ox[2 * (t + 1)];
- const regoff_t offy = oy[2 * (t + 1)];
- if (offx != offy) {
- int cmp;
- if (offx == -2) {
- cmp = -1;
- }
- if (offy == -2) {
- cmp = 1;
- }
- else {
- cmp = offx > offy ? -1 : 1;
- }
- if (D) fprintf(stderr, "%d\n", cmp);
- return cmp;
- }
- }
- if (D) fprintf(stderr, "0\n");
- return 0;
-}
-
-void closure_posix_gor1(psimctx_t &ctx)
-{
- psimctx_t::confset_t &state = ctx.state, &reach = ctx.reach;
- std::vector<nfa_state_t*>
- &topsort = ctx.gor1_topsort,
- &linear = ctx.gor1_linear;
- const size_t ntags = ctx.nfa.tags.size();
-
- state.clear();
- memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2);
-
- // Reverse order happens to result in less test errors, but both orders
- // do not yield a correct algorithm (paths are still compared after
- // their join point sometimes).
- for (psimctx_t::rcconfiter_t c = reach.rbegin(); c != reach.rend(); ++c) {
- regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2;
- const regoff_t *oy = offsets6 + ntags * c->origin * 2;
- memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
-
- if (D) fprintf(stderr, "restoring offsets %u to %u\n"
- , c->origin, index(c->state, ctx.nfa));
-
- relax_gor1(ctx, *c);
- }
-
- for (; !topsort.empty(); ) {
-
- // 1st pass: depth-first postorder traversal of admissible subgraph
- for (; !topsort.empty(); ) {
- nfa_state_t *q = topsort.back();
- if (q->status == GOR_LINEAR) {
- topsort.pop_back();
- }
- else {
- q->status = GOR_TOPSORT;
- if (!scan(ctx, q, false)) {
- q->status = GOR_LINEAR;
- topsort.pop_back();
- linear.push_back(q);
- }
- }
- }
-
- // 2nd pass: linear scan of topologically ordered states
- for (; !linear.empty(); ) {
- nfa_state_t *q = linear.back();
- linear.pop_back();
- if (q->active) {
- q->active = 0;
- q->arcidx = 0;
- scan(ctx, q, true);
- }
- q->status = GOR_NOPASS;
- }
- }
-}
-
-bool scan(psimctx_t &ctx, nfa_state_t *q, bool all)
-{
- bool any = false;
-
- typedef psimctx_t::conf_t conf_t;
- const conf_t x = ctx.state[q->clos];
-
- switch (q->type) {
- case nfa_state_t::ALT:
- if (q->arcidx == 0) {
- copy_offs(ctx, q, q->alt.out1, NOINFO);
- any |= relax_gor1(ctx, conf_t(x, q->alt.out1));
- ++q->arcidx;
- }
- if (q->arcidx == 1 && (!any || all)) {
- copy_offs(ctx, q, q->alt.out2, NOINFO);
- any |= relax_gor1(ctx, conf_t(x, q->alt.out2));
- ++q->arcidx;
- }
- break;
- case nfa_state_t::TAG:
- if (q->arcidx == 0) {
- copy_offs(ctx, q, q->tag.out, q->tag.info);
- any |= relax_gor1(ctx, conf_t(x, q->tag.out, 0));
- ++q->arcidx;
- }
- break;
- case nfa_state_t::RAN:
- case nfa_state_t::FIN:
- break;
- }
-
- return any;
-}
-
-bool relax_gor1(psimctx_t &ctx, const psimctx_t::conf_t &x)
-{
- psimctx_t::confset_t &state = ctx.state;
- nfa_state_t *q = x.state;
- const uint32_t idx = q->clos;
-
- if (q->status == GOR_TOPSORT) {
- return false;
- }
-
- if (idx == NOCLOS) {
- q->clos = static_cast<uint32_t>(state.size());
- state.push_back(x);
- accept_offsets(ctx, q);
- }
- else if (q->indeg < 2 || precedence(ctx, x, state[idx]) < 0) {
- state[idx] = x;
- accept_offsets(ctx, q);
- }
- else {
- return false;
- }
-
- if (q->status == GOR_NOPASS) {
- ctx.gor1_topsort.push_back(q);
- q->arcidx = 0;
- return true;
- }
- else {
- q->active = 1;
- return false;
- }
-}
-
-void closure_posix_gtop(psimctx_t &ctx)
-{
- const psimctx_t::confset_t &reach = ctx.reach;
- psimctx_t::confset_t &state = ctx.state;
- gtop_heap_t &heap = ctx.gtop_heap;
- const size_t ntags = ctx.nfa.tags.size();
-
- state.clear();
- memcpy(offsets6, offsets4, ctx.nfa.size * ntags * sizeof(regoff_t) * 2);
- for (psimctx_t::cconfiter_t c = reach.begin(); c != reach.end(); ++c) {
- regoff_t *ox = offsets5 + ntags * index(c->state, ctx.nfa) * 2;
- const regoff_t *oy = offsets6 + ntags * c->origin * 2;
- memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
-
- if (D) fprintf(stderr, "restoring offsets %u to %u\n"
- , c->origin, index(c->state, ctx.nfa));
-
- relax_gtop(ctx, *c);
- prtoff4(ctx, index(c->state, ctx.nfa));
- }
-
- for (; !heap.empty(); ) {
- nfa_state_t *q = heap.top();
- heap.pop();
- q->active = 0;
-
- if (D) fprintf(stderr, "> %u ", index(q, ctx.nfa));
- prtoff4(ctx, index(q, ctx.nfa));
-
- typedef psimctx_t::conf_t conf_t;
- const conf_t x = ctx.state[q->clos];
-
- switch (q->type) {
- case nfa_state_t::ALT:
- copy_offs(ctx, q, q->alt.out1, NOINFO);
- relax_gtop(ctx, conf_t(x, q->alt.out1));
- copy_offs(ctx, q, q->alt.out2, NOINFO);
- relax_gtop(ctx, conf_t(x, q->alt.out2));
- break;
- case nfa_state_t::TAG:
- copy_offs(ctx, q, q->tag.out, q->tag.info);
- relax_gtop(ctx, conf_t(x, q->tag.out, 0));
- break;
- case nfa_state_t::RAN:
- case nfa_state_t::FIN:
- break;
- }
- }
-}
-
-void relax_gtop(psimctx_t &ctx, const psimctx_t::conf_t &c)
-{
- psimctx_t::confset_t &state = ctx.state;
- nfa_state_t *q = c.state;
- const uint32_t idx = q->clos;
-
- if (idx == NOCLOS) {
- q->clos = static_cast<uint32_t>(state.size());
- state.push_back(c);
- accept_offsets(ctx, q);
- }
- else if (q->indeg < 2 || precedence(ctx, c, state[idx]) < 0) {
- state[idx] = c;
- accept_offsets(ctx, q);
- }
- else {
- return;
- }
-
- if (!q->active) {
- q->active = 1;
- ctx.gtop_heap.push(q);
- }
-}
-
-void make_one_step(psimctx_t &ctx, uint32_t sym)
-{
- confset_t &state = ctx.state, &reach = ctx.reach;
- reach.clear();
-
- if (D) fprintf(stderr, "\n--- step %u (sym %c)\n", ctx.step, sym);
-
- for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
-
- s->clos = NOCLOS;
- s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
-
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
- if (r->lower() <= sym && sym < r->upper()) {
- const conf_t c(s->ran.out, index(s, ctx.nfa), HROOT);
- reach.push_back(c);
- break;
- }
- }
- }
- else if (s->type == nfa_state_t::FIN) {
- update_final_offsets(ctx, *i);
- }
- }
-}
-
-void make_final_step(psimctx_t &ctx)
-{
- for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
-
- s->clos = NOCLOS;
- s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
-
- if (s->type == nfa_state_t::FIN) {
- update_final_offsets(ctx, *i);
- }
- }
-}
-
-void update_final_offsets(psimctx_t &ctx, const conf_t &c)
-{
- nfa_state_t *s = c.state;
- DASSERT(s->type == nfa_state_t::FIN);
-
- const std::vector<Tag> &tags = ctx.nfa.tags;
- const size_t ntags = tags.size();
-
- ctx.marker = ctx.cursor;
- ctx.rule = 0;
-
- regoff_t *of = ctx.offsets3;
- const regoff_t *ox = offsets4 + ntags * index(s, ctx.nfa) * 2;
-
- if (D) fprintf(stderr, "finally: ");
- prtoff4(ctx, index(s, ctx.nfa));
-
- for (size_t t = 0; t < ntags; t += 2) {
- of[t] = ox[2 * t + 1];
- of[t + 1] = ox[2 * (t + 1) + 1];
- }
-}
-
-static void copy_offs(psimctx_t &ctx, const nfa_state_t *y, const nfa_state_t *x
- , tag_info_t info)
-{
- const std::vector<Tag> &tags = ctx.nfa.tags;
- const size_t ntags = tags.size();
- const uint32_t
- xidx = index(x, ctx.nfa),
- yidx = index(y, ctx.nfa);
-
- if (D) fprintf(stderr, "copying offsets %u to %u ", yidx, xidx);
- prtoff4(ctx, yidx);
-
- regoff_t *ox = offsets5 + ntags * xidx * 2;
- const regoff_t *oy = offsets4 + ntags * yidx * 2;
- memcpy(ox, oy, ntags * sizeof(regoff_t) * 2);
-
- if (!(info == NOINFO)) {
- const uint32_t t = info.idx;
-
- // update active tag, and set final tag if it's not set already
- ox[2 * t] = info.neg ? -1 : static_cast<regoff_t>(ctx.step);
- if (ox[2 * t + 1] == -2) {
- ox[2 * t + 1] = ox[2 * t];
- }
-
- // update nested negative tags (if any)
- if (info.neg) {
- const Tag &tag = tags[t];
- for (size_t l = tag.lnest; l < tag.hnest; ++l) {
- ox[2 * l] = -1;
- if (ox[2 * l + 1] == -2) {
- ox[2 * l + 1] = -1;
- }
- }
- }
-
- if (D) fprintf(stderr, "setting offset %u[%u] to %ld\n"
- , xidx, t, (long)ox[2 * t]);
- }
-}
-
-void accept_offsets(psimctx_t &ctx, const nfa_state_t *s)
-{
- const uint32_t idx = index(s, ctx.nfa);
- const size_t ntags = ctx.nfa.tags.size();
-
- const regoff_t *ox = offsets5 + ntags * idx * 2;
- regoff_t *oy = offsets4 + ntags * idx * 2;
- memcpy(oy, ox, ntags * sizeof(regoff_t) * 2);
-
- if (D) fprintf(stderr, "setting offsets %u to ", idx);
- prtoff4(ctx, idx);
-}
-
-static void prtoff(psimctx_t &ctx, uint32_t x, bool newer)
-{
- if (!D) return;
-
- const size_t ntags = ctx.nfa.tags.size();
- const regoff_t *ox = (newer ? offsets5 : offsets4) + ntags * x * 2;
- for (size_t t = 0; t < ntags; t += 2) {
- fprintf(stderr, "(%ld,%ld)(%ld,%ld)"
- , (long)ox[2 * t], (long)ox[2 * (t + 1)]
- , (long)ox[2 * t + 1], (long)ox[2 * (t + 1) + 1]);
- }
- fprintf(stderr, "\n");
-}
-
-static void prtoff4(psimctx_t &ctx, uint32_t x)
-{
- if (D) fprintf(stderr, "off4: ");
- prtoff(ctx, x, false);
-}
-
-static void prtoff5(psimctx_t &ctx, uint32_t x)
-{
- if (D) fprintf(stderr, "off5: ");
- prtoff(ctx, x, true);
-}
-
-} // namespace libre2c
-} // namespace re2c
-
diff --git a/lib/regexec_nfa_posix_kuklewicz.cc b/lib/regexec_nfa_posix_kuklewicz.cc
deleted file mode 100644
index c19db559..00000000
--- a/lib/regexec_nfa_posix_kuklewicz.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-#include <stdio.h>
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "lib/regex.h"
-#include "lib/regex_impl.h"
-#include "src/debug/debug.h"
-#include "src/dfa/closure_posix.h"
-#include "src/dfa/tag_history.h"
-#include "src/nfa/nfa.h"
-#include "src/regexp/rule.h"
-#include "src/regexp/tag.h"
-#include "src/util/range.h"
-
-
-/* note [POSIX orbit tags]
- *
- * POSIX disambiguation rules demand that earlier subexpressions match
- * the longest possible prefix of the input string (without violating the
- * whole match). To accommodate these rules, we resolve conflicts on orbit
- * tags by comparison of tag subhistories on conflicting NFA paths.
- *
- * If one subhistory is a proper prefix of another subhistory, it is less;
- * otherwise for the first pair of different offsets, if one offset is greater
- * than the other, then the corresponding subhistory is less.
- *
- * It is possible to pre-compare two NFA paths corresponding to the same
- * input string prefix and ending in the same NFA state; if paths are not
- * equal, the result of this comparison will hold for any common suffix.
- *
- * It is also possible to pre-compare NFA paths that correspond to the same
- * input prefix, but end in different NFA states. Such comparison is incorrect
- * unless subhistories start at the same offset; but if it is incorrect, we
- * will never use its result (tags with higher priority will also disagree).
- *
- * Therefore instead of keeping the whole history of offsets we calculate
- * the relative order of any pair of subhistories on each step.
- *
- * This part of the algorithm was invented by Christopher Kuklewicz.
- */
-
-namespace re2c {
-
-// specialization that doesn't sort initial closure like Okui-Suzuki
-template<> void init_gor1<libre2c::ksimctx_t>(libre2c::ksimctx_t &ctx);
-
-namespace libre2c {
-
-static const int32_t DELIM = 0x7fffFFFF;
-
-static void make_one_step(ksimctx_t &, uint32_t);
-static void make_final_step(ksimctx_t &);
-static void compute_orders(ksimctx_t &ctx);
-
-// we *do* want these to be inlined
-static inline void closure_posix(ksimctx_t &ctx);
-static inline size_t boundary_tag(size_t tag);
-static inline int32_t subhistory_list(const khistory_t &history, std::vector<int32_t> &path, hidx_t idx, size_t tag);
-static inline void last_subhistory(const khistory_t &history, std::vector<tagver_t> &path, hidx_t idx, size_t tag);
-static inline int32_t compare_last_subhistories(khistory_t &history, hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t);
-
-int regexec_nfa_posix_kuklewicz(const regex_t *preg, const char *string
- , size_t nmatch, regmatch_t pmatch[], int /* eflags */)
-{
- ksimctx_t &ctx = *static_cast<ksimctx_t*>(preg->simctx);
- init(ctx, string);
- if (ctx.nfa.tags.size() > 0) {
- ctx.oldprectbl[0] = 0;
- }
-
- // root state can be non-core, so we pass zero as origin to avoid checks
- const conf_t c0(ctx.nfa.root, 0, HROOT);
- ctx.reach.push_back(c0);
- for (;;) {
- closure_posix(ctx);
- const uint32_t sym = static_cast<uint8_t>(*ctx.cursor++);
- if (ctx.state.empty() || sym == 0) break;
- make_one_step(ctx, sym);
- }
- make_final_step(ctx);
-
- if (ctx.rule == Rule::NONE) {
- return REG_NOMATCH;
- }
-
- const getoff_nfa_t fn = { ctx.offsets3 };
- tags_to_submatch(ctx.nfa.tags, nmatch, pmatch, ctx.marker - string - 1, fn);
-
- return 0;
-}
-
-void closure_posix(ksimctx_t &ctx)
-{
- if (ctx.flags & REG_GTOP) {
- closure_posix_gtop(ctx);
- }
- else {
- closure_posix_gor1(ctx);
- }
-}
-
-void make_one_step(ksimctx_t &ctx, uint32_t sym)
-{
- confset_t &state = ctx.state, &reach = ctx.reach;
- uint32_t j = 0;
- reach.clear();
-
- for (cconfiter_t i = state.begin(), e = state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
-
- s->clos = NOCLOS;
- s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
-
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
- if (r->lower() <= sym && sym < r->upper()) {
- const conf_t c(s->ran.out, j, HROOT);
- reach.push_back(c);
- state[j] = *i;
- update_offsets(ctx, *i, j);
- ++j;
- break;
- }
- }
- }
- else if (s->type == nfa_state_t::FIN) {
- update_offsets(ctx, *i, NONCORE);
- }
- }
-
- state.resize(j);
- std::swap(ctx.offsets1, ctx.offsets2);
-
- compute_orders(ctx);
- std::swap(ctx.newprectbl, ctx.oldprectbl);
- ctx.oldprecdim = j;
-
- ctx.history.init();
- ++ctx.step;
-}
-
-void make_final_step(ksimctx_t &ctx)
-{
- for (cconfiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
-
- s->clos = NOCLOS;
- s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
-
- if (s->type == nfa_state_t::FIN) {
- update_offsets(ctx, *i, NONCORE);
- }
- }
-}
-
-struct cmp_posix_t
-{
- ksimctx_t &ctx;
- size_t tag;
- inline bool operator()(cconfiter_t x, cconfiter_t y) const
- {
- const int32_t
- ox = ctx.oldprectbl[tag * ctx.oldprecdim + x->origin],
- oy = ctx.oldprectbl[tag * ctx.oldprecdim + y->origin];
- // comparison result is inverted, because orders are used as offsets
- return compare_last_subhistories(ctx.history
- , x->thist, y->thist, ox, oy, tag) > 0;
- }
-};
-
-void compute_orders(ksimctx_t &ctx)
-{
- const confset_t &state = ctx.state;
- const size_t ntags = ctx.nfa.tags.size();
- const size_t newdim = state.size();
- cconfiter_t b = state.begin(), e = state.end(), c;
- std::vector<cconfiter_t> &iters = ctx.stateiters;
-
- if (newdim == 0) return;
-
- iters.clear();
- iters.reserve(state.size());
- for (c = b; c != e; ++c) {
- iters.push_back(c);
- }
-
- for (size_t t = 1; t < ntags; t += 2) {
- cmp_posix_t cmp = {ctx, t};
- std::sort(iters.begin(), iters.end(), cmp);
-
- int32_t m = 0, *tbl = ctx.newprectbl + t * newdim;
- for (size_t i = 0; i < newdim; ++m) {
- *(tbl + (iters[i] - b)) = m;
- for (; ++i < newdim && cmp(iters[i - 1], iters[i]) == 0; ) {
- *(tbl + (iters[i] - b)) = m;
- }
- }
- }
-}
-
-int32_t compare_last_subhistories(khistory_t &history
- , hidx_t x, hidx_t y, int32_t ox, int32_t oy, size_t t)
-{
- if (ox > oy) return -1;
- if (ox < oy) return 1;
- if (x == y) return 0;
-
- std::vector<int32_t> &p1 = history.path1, &p2 = history.path2;
-
- last_subhistory(history, p1, x, t);
- last_subhistory(history, p2, y, t);
-
- std::vector<int32_t>::const_reverse_iterator
- i1 = p1.rbegin(), e1 = p1.rend(),
- i2 = p2.rbegin(), e2 = p2.rend();
- for (;;) {
- if (i1 == e1 && i2 == e2) break;
- if (i1 == e1) return -1;
- if (i2 == e2) return 1;
- if (*i1 > *i2) return -1;
- if (*i1 < *i2) return 1;
- ++i1; ++i2;
- }
-
- return 0;
-}
-
-void last_subhistory(const khistory_t &history
- , std::vector<tagver_t> &path, hidx_t idx, size_t tag)
-{
- path.clear();
- hidx_t i = idx;
- const size_t bound = boundary_tag(tag);
- for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) {
- if (history.node(i).info.idx == tag) {
- path.push_back(history.node(i).info.neg ? -1 : 1);
- }
- }
-}
-
-template<typename ctx_t>
-int32_t khistory_t::precedence(ctx_t &ctx
- , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y
- , int32_t &/*prec1*/, int32_t &/*prec2*/)
-{
- // History consists of multiple subhistories (each containing either a
- // single negative tag, or one or more positive tags (exactly one for
- // non-orbit subhistories). Because of the shortest-path algorithm earlier
- // subhistories do not necessarily coincide, so comparing only the last
- // pair of subhistories is not enough. See note [POSIX orbit tags].
-
- const size_t ntags = ctx.nfa.tags.size();
- for (size_t t = 1; t < ntags; t += 2) {
- const int32_t
- ox = ctx.oldprectbl[t * ctx.oldprecdim + x.origin],
- oy = ctx.oldprectbl[t * ctx.oldprecdim + y.origin];
-
- if (ox > oy) return -1;
- if (ox < oy) return 1;
- if (x.thist == y.thist) continue;
-
- std::vector<int32_t> &p1 = ctx.history.path1, &p2 = ctx.history.path2;
- int32_t n1, n2;
- (void)(n1 = subhistory_list(ctx.history, p1, x.thist, t));
- (void)(n2 = subhistory_list(ctx.history, p2, y.thist, t));
- DASSERT(n1 == n2);
-
- std::vector<int32_t>::const_reverse_iterator
- i1 = p1.rbegin(), e1 = p1.rend(),
- i2 = p2.rbegin(), e2 = p2.rend();
- for (;;) {
- if (i1 == e1 && i2 == e2) break;
- DASSERT(i1 != e1 && i2 != e2);
- const int32_t v1 = *i1++, v2 = *i2++;
- if (v1 == DELIM && v2 == DELIM) continue;
- if (v1 == DELIM) return -1;
- if (v2 == DELIM) return 1;
- if (v1 > v2) return -1;
- if (v1 < v2) return 1;
- }
- }
- return 0;
-}
-
-// returns all subhistories of the given tag as one sequence
-// (individual subhistories are separated by delimiter)
-int32_t subhistory_list(const khistory_t &history,
- std::vector<int32_t> &path, hidx_t idx, size_t tag)
-{
- path.clear();
- int32_t nsub = 0;
- hidx_t i = idx;
-
- const size_t bound = boundary_tag(tag);
- path.push_back(DELIM);
- for (;;) {
- for (; i != HROOT && history.node(i).info.idx >= bound; i = history.node(i).pred) {
- if (history.node(i).info.idx == tag) {
- path.push_back(history.node(i).info.neg ? -1 : 1);
- }
- }
- if (i == HROOT) break;
- ++nsub;
- path.push_back(DELIM);
- for (; i != HROOT && history.node(i).info.idx != tag; i = history.node(i).pred);
- }
-
- return nsub;
-}
-
-size_t boundary_tag(size_t tag)
-{
- // for start tags, return itself; for end tags, return start tag
- // (start tags have even numbers, end tags have odd numbers)
- return tag & ~1u;
-}
-
-} // namespace libre2c
-
-template<> void init_gor1<libre2c::ksimctx_t>(libre2c::ksimctx_t &ctx)
-{
- ctx.state.clear();
- libre2c::ksimctx_t::cconfiter_t c = ctx.reach.begin(), e = ctx.reach.end();
- for (; c != e; ++c) {
- relax_gor1(ctx, *c);
- }
-}
-
-} // namespace re2c
-
diff --git a/lib/regexec_nfa_posix_trie.cc b/lib/regexec_nfa_posix_trie.cc
index beb05cd9..bf296289 100644
--- a/lib/regexec_nfa_posix_trie.cc
+++ b/lib/regexec_nfa_posix_trie.cc
@@ -4,64 +4,57 @@
#include "lib/regex.h"
#include "lib/regex_impl.h"
-#include "src/debug/debug.h"
#include "src/dfa/closure_posix.h"
#include "src/dfa/tag_history.h"
#include "src/nfa/nfa.h"
+#include "src/util/check.h"
#include "src/util/range.h"
-
namespace re2c {
-// specialization that doesn't sort initial closure like Okui-Suzuki
-// (there is no cheap way to sort it in the lazy-disambiguation algorithm)
-template<> void init_gor1<libre2c::pzsimctx_t>(libre2c::pzsimctx_t &ctx);
+// Specialization that doesn't sort initial closure like Okui-Suzuki (there is no cheap way to sort
+// it in the lazy-disambiguation algorithm).
+template<> void init_gor1<libre2c::pzsimctx_t>(libre2c::pzsimctx_t& ctx);
namespace libre2c {
-/* note [lazy computation and caching of precedence]
- *
- * Eagerly computing precedence values on each step for each pair of closure
- * states is a waste of time: most of these values are not needed, because
- * RE may be unambigous, or the given input string may be unambigous, or even
- * if there is ambiguity, it may take only a few comparisons to resolve. All
- * the rest is wasted effort.
- *
- * We can avoid it by delaying precedence computation until necessary, and
- * then unwinding all the steps backwards, computing precedence for each step
- * and caching the computed values (so that the same pair of histories is not
- * compared twice). It is still the same incremental comparison as with
- * precedence matrices: we compare step by step, going from the fork frame to
- * the join frame. The result of comparison on each step is folded to a triple
- * of numbers and recorded in cache. It is important that we do record each
- * step, not just the final step, because the next pair of ambiguous histories
- * may unwind to the same pair of prefixes that was compared before.
- *
- * For all this to work, it is necessary that we are able to store all history
- * until the end, because at any step we might need to unwind an arbitrary
- * number of steps back. We also need to address individual subhistories
- * efficiently in order to use them as keys in the cache. All this is achieved
- * by storing history in the form of a trie and addressing individual
- * histories by indices in that trie. We also use trie to compute the final
- * tag values (instead of storing tags in registers at each step).
- */
-
-static void make_step(pzsimctx_t &, uint32_t);
-static void make_final_step(pzsimctx_t &);
-template<typename ctx_t> static int32_t zprecedence1(ctx_t &ctx
- , hidx_t idx1, hidx_t idx2, int32_t &prec1, int32_t &prec2);
-template<typename ctx_t> static int32_t zprecedence2(ctx_t &ctx
- , hidx_t idx1, hidx_t idx2, int32_t &prec1, int32_t &prec2);
+// note [lazy computation and caching of precedence]
+//
+// Eagerly computing precedence values on each step for each pair of closure states is a waste of
+// time: most of these values are not needed, because regexp may be unambigous, or the given input
+// string may be unambigous, or even if there is ambiguity, it may take only a few comparisons to
+// resolve. All the rest is wasted effort.
+//
+// We can avoid it by delaying precedence computation until necessary, and then unwinding all the
+// steps backwards, computing precedence for each step and caching the computed values (so that the
+// same pair of histories is not compared twice). It is still the same incremental comparison as
+// with precedence matrices: we compare step by step, going from the fork frame to the join frame.
+// The result of comparison on each step is folded to a triple of numbers and recorded in cache. It
+// is important that we do record each step, not just the final step, because the next pair of
+// ambiguous histories may unwind to the same pair of prefixes that was compared before.
+//
+// For all this to work, it is necessary that we are able to store all history until the end,
+// because at any step we might need to unwind an arbitrary number of steps back. We also need to
+// address individual subhistories efficiently in order to use them as keys in the cache. All this
+// is achieved by storing history in the form of a trie and addressing individual histories by
+// indices in that trie. We also use trie to compute the final tag values (instead of storing tags
+// in registers at each step).
+
+static void make_step(pzsimctx_t&, uint32_t);
+static void make_final_step(pzsimctx_t&);
+template<typename ctx_t> static int32_t zprecedence1(
+ ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2);
+template<typename ctx_t> static int32_t zprecedence2(
+ ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2);
// we *do* want this to be inlined
-static inline uint32_t get_step(const zhistory_t &hist, int32_t idx);
-static inline uint32_t get_orig(const zhistory_t &hist, int32_t idx);
-static inline void closure_posix(pzsimctx_t &ctx);
-
-int regexec_nfa_posix_trie(const regex_t *preg, const char *string
- , size_t nmatch, regmatch_t pmatch[], int)
-{
- pzsimctx_t &ctx = *static_cast<pzsimctx_t*>(preg->simctx);
+static inline uint32_t get_step(const zhistory_t& hist, int32_t idx);
+static inline uint32_t get_orig(const zhistory_t& hist, int32_t idx);
+static inline void closure_posix(pzsimctx_t& ctx);
+
+int regexec_nfa_posix_trie(
+ const regex_t* preg, const char* string, size_t nmatch, regmatch_t pmatch[], int) {
+ pzsimctx_t& ctx = *static_cast<pzsimctx_t*>(preg->simctx);
init(ctx, string);
const conf_t c0(ctx.nfa.root, 0, HROOT);
@@ -78,41 +71,33 @@ int regexec_nfa_posix_trie(const regex_t *preg, const char *string
return finalize(ctx, string, nmatch, pmatch);
}
-void closure_posix(pzsimctx_t &ctx)
-{
- if (ctx.flags & REG_GTOP) {
- closure_posix_gtop(ctx);
- }
- else {
- closure_posix_gor1(ctx);
- }
+void closure_posix(pzsimctx_t& ctx) {
+ closure_posix_gor1(ctx);
}
-void make_step(pzsimctx_t &ctx, uint32_t sym)
-{
- const confset_t &state = ctx.state;
- confset_t &reach = ctx.reach;
+void make_step(pzsimctx_t& ctx, uint32_t sym) {
+ const confset_t& state = ctx.state;
+ confset_t& reach = ctx.reach;
cconfiter_t b = state.begin(), e = state.end(), i;
reach.clear();
uint32_t j = 0;
for (i = b; i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
+ DCHECK(s->status == GOR_NOPASS && s->active == 0);
- if (s->type == nfa_state_t::RAN) {
- for (const Range *r = s->ran.ran; r; r = r->next()) {
+ if (s->kind == TnfaState::Kind::RAN) {
+ for (const Range* r = s->ran; r; r = r->next()) {
if (r->lower() <= sym && sym < r->upper()) {
- const conf_t c(s->ran.out, j++, i->thist);
+ const conf_t c(s->out1, j++, i->thist);
reach.push_back(c);
break;
}
}
- }
- else if (s->type == nfa_state_t::FIN) {
+ } else if (s->kind == TnfaState::Kind::FIN) {
ctx.marker = ctx.cursor;
ctx.hidx = i->thist;
ctx.rule = 0;
@@ -122,16 +107,15 @@ void make_step(pzsimctx_t &ctx, uint32_t sym)
++ctx.step;
}
-void make_final_step(pzsimctx_t &ctx)
-{
+void make_final_step(pzsimctx_t& ctx) {
for (confiter_t i = ctx.state.begin(), e = ctx.state.end(); i != e; ++i) {
- nfa_state_t *s = i->state;
+ TnfaState* s = i->state;
s->clos = NOCLOS;
s->arcidx = 0;
- DASSERT(s->status == GOR_NOPASS && s->active == 0);
+ DCHECK(s->status == GOR_NOPASS && s->active == 0);
- if (s->type == nfa_state_t::FIN) {
+ if (s->kind == TnfaState::Kind::FIN) {
ctx.marker = ctx.cursor;
ctx.hidx = i->thist;
ctx.rule = 0;
@@ -140,19 +124,18 @@ void make_final_step(pzsimctx_t &ctx)
}
template<typename ctx_t>
-int32_t zhistory_t::precedence(ctx_t &ctx
- , const typename ctx_t::conf_t &x, const typename ctx_t::conf_t &y
- , int32_t &prec1, int32_t &prec2)
-{
+int32_t zhistory_t::precedence(ctx_t& ctx,
+ const typename ctx_t::conf_t& x,
+ const typename ctx_t::conf_t& y,
+ int32_t& prec1,
+ int32_t& prec2) {
return zprecedence1(ctx, x.thist, y.thist, prec1, prec2);
}
// see note [lazy computation and caching of precedence]
template<typename ctx_t>
-int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2
- , int32_t &prec1, int32_t &prec2)
-{
- zhistory_t::cache_t &cache = ctx.history.cache;
+int32_t zprecedence1(ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2) {
+ zhistory_t::cache_t& cache = ctx.history.cache;
int32_t prec = 0;
// use the same cache entry for (x, y) and (y, x)
@@ -164,7 +147,7 @@ int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2
zhistory_t::cache_t::const_iterator i = cache.find(key);
if (i != cache.end()) {
// use previously computed precedence values from cache
- const zhistory_t::cache_entry_t &val = i->second;
+ const zhistory_t::cache_entry_t& val = i->second;
prec1 = val.prec1;
prec2 = val.prec2;
prec = val.prec;
@@ -172,8 +155,7 @@ int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2
std::swap(prec1, prec2);
prec = -prec;
}
- }
- else {
+ } else {
// compute precedence values and put them into cache
prec = zprecedence2(ctx, idx1, idx2, prec1, prec2);
zhistory_t::cache_entry_t val = {prec1, prec2, prec};
@@ -189,16 +171,14 @@ int32_t zprecedence1(ctx_t &ctx, hidx_t idx1, hidx_t idx2
// see note [lazy computation and caching of precedence]
template<typename ctx_t>
-int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2
- , int32_t &prec1, int32_t &prec2)
-{
+int32_t zprecedence2(ctx_t& ctx, hidx_t idx1, hidx_t idx2, int32_t& prec1, int32_t& prec2) {
if (idx1 == idx2) {
prec1 = prec2 = MAX_RHO;
return 0;
}
- const std::vector<Tag> &tags = ctx.nfa.tags;
- zhistory_t &hist = ctx.history;
+ const std::vector<Tag>& tags = ctx.nfa.tags;
+ zhistory_t& hist = ctx.history;
int32_t prec = 0;
prec1 = prec2 = MAX_RHO;
@@ -215,14 +195,13 @@ int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2
tag_info_t info1 = NOINFO, info2 = NOINFO;
for (; i1 != i2 && (s1 >= s || s2 >= s);) {
if (s1 >= s && (i1 > i2 || s2 < s)) {
- const zhistory_t::node_t &n = hist.node(i1);
+ const zhistory_t::node_t& n = hist.node(i1);
info1 = n.info;
prec1 = std::min(prec1, tags[info1.idx].height);
i1 = n.pred;
s1 = get_step(hist, i1);
- }
- else {
- const zhistory_t::node_t &n = hist.node(i2);
+ } else {
+ const zhistory_t::node_t& n = hist.node(i2);
info2 = n.info;
prec2 = std::min(prec2, tags[info2.idx].height);
i2 = n.pred;
@@ -235,8 +214,7 @@ int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2
prec = zprecedence1(ctx, i1, i2, p1, p2);
prec1 = std::min(prec1, p1);
prec2 = std::min(prec2, p2);
- }
- else if (i1 != HROOT) {
+ } else if (i1 != HROOT) {
const int32_t h = tags[hist.node(i1).info.idx].height;
prec1 = std::min(prec1, h);
prec2 = std::min(prec2, h);
@@ -247,24 +225,20 @@ int32_t zprecedence2(ctx_t &ctx, hidx_t idx1, hidx_t idx2
if (prec1 < prec2) return 1;
// leftmost precedence
- return !fork_frame ? prec
- : leftprec(info1, info2, i1 == idx1, i2 == idx2);
+ return !fork_frame ? prec : leftprec(info1, info2, i1 == idx1, i2 == idx2);
}
-uint32_t get_step(const zhistory_t &hist, int32_t idx)
-{
+uint32_t get_step(const zhistory_t& hist, int32_t idx) {
return idx == HROOT ? 0 : hist.node(idx).step;
}
-uint32_t get_orig(const zhistory_t &hist, int32_t idx)
-{
+uint32_t get_orig(const zhistory_t& hist, int32_t idx) {
return idx == HROOT ? 0 : hist.node(idx).orig;
}
} // namespace libre2c
-template<> void init_gor1<libre2c::pzsimctx_t>(libre2c::pzsimctx_t &ctx)
-{
+template<> void init_gor1<libre2c::pzsimctx_t>(libre2c::pzsimctx_t& ctx) {
ctx.state.clear();
libre2c::pzsimctx_t::cconfiter_t c = ctx.reach.begin(), e = ctx.reach.end();
for (; c != e; ++c) {
@@ -273,4 +247,3 @@ template<> void init_gor1<libre2c::pzsimctx_t>(libre2c::pzsimctx_t &ctx)
}
} // namespace re2c
-
diff --git a/lib/regfree.cc b/lib/regfree.cc
index 005b961a..e837ba61 100644
--- a/lib/regfree.cc
+++ b/lib/regfree.cc
@@ -3,7 +3,7 @@
#include <valarray>
#include <vector>
-#include "lib/regcomp_dfa_regless.h"
+#include "lib/regcomp_dfa_multipass.h"
#include "lib/regex.h"
#include "lib/regex_impl.h"
#include "lib/regoff_trie.h"
@@ -14,22 +14,12 @@
#include "src/parse/ast.h"
#include "src/regexp/rule.h"
#include "src/regexp/tag.h"
-#include "src/util/free_list.h"
#include "src/util/range.h"
-
using namespace re2c;
using namespace re2c::libre2c;
-void regfree(regex_t *preg)
-{
- delete preg->rmgr;
-
- delete &preg->nfa->charset;
- delete &preg->nfa->rules;
- delete &preg->nfa->tags;
- delete preg->nfa;
-
+void regfree(regex_t* preg) {
if (preg->flags & REG_TSTRING) {
delete[] preg->tstring.string;
} else if (preg->flags & REG_SUBHIST) {
@@ -39,49 +29,39 @@ void regfree(regex_t *preg)
}
if (preg->flags & REG_NFA) {
+ delete preg->nfa;
if ((preg->flags & REG_TRIE) && (preg->flags & REG_LEFTMOST)) {
delete static_cast<lzsimctx_t*>(preg->simctx);
} else if (preg->flags & REG_TRIE) {
delete static_cast<pzsimctx_t*>(preg->simctx);
} else if (preg->flags & REG_LEFTMOST) {
delete static_cast<lsimctx_t*>(preg->simctx);
- } else if (preg->flags & REG_KUKLEWICZ) {
- delete static_cast<ksimctx_t*>(preg->simctx);
} else {
delete static_cast<psimctx_t*>(preg->simctx);
}
} else {
- delete[] preg->char2class;
- delete[] preg->dfa->finvers;
-
- delete &preg->dfa->mtagvers;
- delete &preg->dfa->tcpool;
- delete preg->dfa;
-
- if (preg->flags & REG_REGLESS) {
- if (preg->flags & REG_LEFTMOST) {
- delete static_cast<ldetctx_t*>(preg->rldfa->ctx);
- } else {
- delete static_cast<pdetctx_t*>(preg->rldfa->ctx);
+ if (preg->flags & REG_MULTIPASS) {
+ delete preg->mptdfa->opts;
+ delete[] preg->mptdfa->result;
+ delete preg->mptdfa;
+ if (preg->flags & REG_SUBHIST) {
+ delete preg->regtrie;
}
- delete preg->rldfa->opts;
- delete[] preg->rldfa->result;
- delete preg->rldfa;
- }
- if (preg->flags & REG_TSTRING) {
- // t-string construction does not use this
- } else if (preg->flags & REG_SUBHIST) {
- delete preg->regtrie;
} else {
- delete[] preg->regs;
+ delete &preg->dfa->dfa_alc;
+ delete preg->dfa;
+ if (preg->flags & REG_TSTRING) {
+ // t-string construction does not use this
+ } else if (preg->flags & REG_SUBHIST) {
+ delete preg->regtrie;
+ } else {
+ delete[] preg->regs;
+ }
}
+ delete[] preg->char2class;
}
-
- AST::flist.clear();
- RangeSuffix::freeList.clear();
}
-void regfreesub(subhistory_t *history)
-{
+void regfreesub(subhistory_t* history) {
free(history);
}
diff --git a/lib/regoff_trie.h b/lib/regoff_trie.h
index 2e463a59..d50cc649 100644
--- a/lib/regoff_trie.h
+++ b/lib/regoff_trie.h
@@ -2,15 +2,14 @@
#define _RE2C_LIB_REGOFF_TRIE_
#include <algorithm>
-#include "regex.h"
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
-#include "src/debug/debug.h"
+#include "regex.h"
+#include "src/util/check.h"
#include "src/util/forbid_copy.h"
-
namespace re2c {
namespace libre2c {
@@ -21,26 +20,24 @@ struct regoff_trie_t {
};
size_t nlists;
- size_t *lists;
- size_t *count;
+ size_t* lists;
+ size_t* count;
size_t size;
size_t capacity;
- node_t *storage;
+ node_t* storage;
explicit regoff_trie_t(size_t nlists)
- : nlists(nlists)
- , lists(new size_t[nlists])
- , count(new size_t[nlists])
- , size(0)
- , capacity(nlists * 2)
- , storage(new node_t[capacity])
- {
+ : nlists(nlists),
+ lists(new size_t[nlists]),
+ count(new size_t[nlists]),
+ size(0),
+ capacity(nlists * 2),
+ storage(new node_t[capacity]) {
clear();
}
- ~regoff_trie_t()
- {
+ ~regoff_trie_t() {
delete[] lists;
delete[] count;
delete[] storage;
@@ -53,22 +50,20 @@ struct regoff_trie_t {
}
inline void grow(size_t new_capacity) {
- DASSERT(new_capacity > capacity);
- node_t *new_storage = new node_t[new_capacity];
+ DCHECK(new_capacity > capacity);
+ node_t* new_storage = new node_t[new_capacity];
memcpy(new_storage, storage, capacity * sizeof(node_t));
delete[] storage;
storage = new_storage;
capacity = new_capacity;
}
- inline void copy(size_t lhs, size_t rhs)
- {
+ inline void copy(size_t lhs, size_t rhs) {
lists[lhs] = lists[rhs];
count[lhs] = count[rhs];
}
- inline void set(size_t lhs, regoff_t off)
- {
+ inline void set(size_t lhs, regoff_t off) {
if (count[lhs] == 0) {
add(lhs, off);
} else {
@@ -79,8 +74,7 @@ struct regoff_trie_t {
}
}
- inline void add(size_t lhs, regoff_t off)
- {
+ inline void add(size_t lhs, regoff_t off) {
if (size >= capacity) {
grow(capacity * 2);
}
diff --git a/lib/stubs.cc b/lib/stubs.cc
index 0dae4b9e..4784bb7c 100644
--- a/lib/stubs.cc
+++ b/lib/stubs.cc
@@ -1,2 +1,2 @@
-extern const char *help;
-const char *help = "";
+extern const char* help;
+const char* help = "";
diff --git a/lib/test.cc b/lib/test.cc
index 72c3d0c8..5cdbf24a 100644
--- a/lib/test.cc
+++ b/lib/test.cc
@@ -1,4 +1,4 @@
-#include <assert.h>
+#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -8,12 +8,12 @@
#include "lib/regex.h"
#include "lib/test_helper.h"
-#include "src/util/c99_stdint.h"
+#include "src/util/check.h"
-
-static int test(int flags, const char *pattern, const char *string,
- const char *expected = NULL)
-{
+static int test(int flags,
+ const char* pattern,
+ const char* string,
+ const char* expected = nullptr) {
std::vector<std::vector<regoff_t> > submatch;
if (!parse_submatch(expected, submatch)) {
fprintf(stderr, "failed to parse submatch string: %s\n", expected);
@@ -24,26 +24,26 @@ static int test(int flags, const char *pattern, const char *string,
static uint32_t total = 0;
static uint32_t failed = 0;
- const char *prefix = flags & REG_NFA ? "NFA" : "DFA";
+ const char* prefix = flags & REG_NFA ? "NFA" : "DFA";
bool with_hist = flags & REG_SUBHIST;
regex_t re;
- regmatch_t *pmatch = with_hist ? NULL : new regmatch_t[nmatch];
- subhistory_t *psubhist = NULL;
+ regmatch_t* pmatch = with_hist ? nullptr : new regmatch_t[nmatch];
+ subhistory_t* psubhist = nullptr;
int result;
result = regcomp(&re, pattern, flags);
if (result != 0) {
- fprintf(stderr, "%s: regcomp() failed for RE %s\n", prefix, pattern);
+ fprintf(stderr, "%s: regcomp() failed for regexp %s\n", prefix, pattern);
goto end;
}
// run multiple times to ensure everything gets cleaned up properly
- static const uint32_t NRUNS = 2;
+ static constexpr uint32_t NRUNS = 2;
for (uint32_t i = 0; i < NRUNS; ++i) {
if (with_hist) {
- // With subhistories every regexec() call may allocate offsets.
- // Clean them up at the beginning of the loop, so that the offsets
- // remain available for inspection after the last iteration.
+ // With subhistories every regexec() call may allocate offsets. Clean them up at the
+ // beginning of the loop, so that the offsets remain available for inspection after the
+ // last iteration.
regfreesub(psubhist);
psubhist = regparse(&re, string, nmatch);
result = psubhist ? 0 : 1;
@@ -55,62 +55,67 @@ static int test(int flags, const char *pattern, const char *string,
// failure was expected => it's a success
result = 0;
} else if (nmatch > 0) {
- fprintf(stderr, "%s: regexec() failed for RE %s and string %s\n"
- , prefix, pattern, string);
+ fprintf(stderr,
+ "%s: regexec() failed for regexp %s and string %s\n",
+ prefix, pattern, string);
goto end;
}
} else if (nmatch == 0) {
// regexed must have failed, something is wrong
result = REG_NOMATCH;
- fprintf(stderr, "%s: regexec() didn't fail while it should"
- " for RE %s and string %s\n", prefix, pattern, string);
+ fprintf(stderr,
+ "%s: regexec() didn't fail while it should for regexp %s and string %s\n",
+ prefix, pattern, string);
goto end;
}
}
- assert(nmatch == 0 || nmatch == re.re_nsub);
+ CHECK(nmatch == 0 || nmatch == re.re_nsub);
for (uint32_t i = 0; i < nmatch; ++i) {
- const std::vector<regoff_t> &expect_history = submatch[i];
+ const std::vector<regoff_t>& expect_history = submatch[i];
const size_t expect_size = expect_history.size() / 2;
if (!with_hist) {
// Check only the last pair of offsets for each capturing group.
const regoff_t expect_so = expect_history[2 * expect_size - 2];
const regoff_t expect_eo = expect_history[2 * expect_size - 1];
- const regmatch_t &actual = pmatch[i];
+ const regmatch_t& actual = pmatch[i];
if (expect_so != actual.rm_so || expect_eo != actual.rm_eo) {
result = 1;
- fprintf(stderr, "%s: incorrect submatch for RE %s and string %s:\n"
- "\tpmatch[%u].rm_so = %ld (expected %ld)\n"
- "\tpmatch[%u].rm_eo = %ld (expected %ld)\n",
- prefix, pattern, string,
- i, (long)actual.rm_so, (long)expect_so,
- i, (long)actual.rm_eo, (long)expect_eo);
+ fprintf(stderr,
+ "%s: incorrect submatch for regexp %s and string %s:\n"
+ "\tpmatch[%u].rm_so = %td (expected %td)\n"
+ "\tpmatch[%u].rm_eo = %td (expected %td)\n",
+ prefix, pattern, string,
+ i, actual.rm_so, expect_so,
+ i, actual.rm_eo, expect_eo);
goto end;
}
} else {
// Check the full history of each capturing group.
- const subhistory_t &actual_history = psubhist[i];
+ const subhistory_t& actual_history = psubhist[i];
if (actual_history.size != expect_size) {
result = 1;
- fprintf(stderr, "%s: wrong subhistory-%u length for RE %s and "
- "string %s: expected %lu, have %lu\n",
- prefix, i, pattern, string, expect_size, actual_history.size);
+ fprintf(stderr,
+ "%s: wrong subhistory-%u length for regexp %s and string %s: "
+ "expected %zd, have %zd\n",
+ prefix, i, pattern, string, expect_size, actual_history.size);
goto end;
}
for (uint32_t j = 0; j < actual_history.size; ++j) {
const regoff_t expect_so = expect_history[2 * j];
const regoff_t expect_eo = expect_history[2 * j + 1];
- const regmatch_t &actual = actual_history.offs[j];
+ const regmatch_t& actual = actual_history.offs[j];
if (expect_so != actual.rm_so || expect_eo != actual.rm_eo) {
result = 1;
- fprintf(stderr, "%s: incorrect submatch for RE %s and string %s:\n"
- "\tpmatch[%u][%u].rm_so = %ld (expected %ld)\n"
- "\tpmatch[%u][%u].rm_eo = %ld (expected %ld)\n",
- prefix, pattern, string,
- i, j, (long)actual.rm_so, (long)expect_so,
- i, j, (long)actual.rm_eo, (long)expect_eo);
+ fprintf(stderr,
+ "%s: incorrect submatch for regexp %s and string %s:\n"
+ "\tpmatch[%u][%u].rm_so = %td (expected %td)\n"
+ "\tpmatch[%u][%u].rm_eo = %td (expected %td)\n",
+ prefix, pattern, string,
+ i, j, actual.rm_so, expect_so,
+ i, j, actual.rm_eo, expect_eo);
goto end;
}
}
@@ -134,27 +139,25 @@ end:
return result;
}
-static int test_tstring(const char *pattern, const char *string, const char *expected)
-{
+static int test_tstring(const char* pattern, const char* string, const char* expected) {
regex_t re;
- const tstring_t *tstr;
+ const tstring_t* tstr = nullptr;
std::ostringstream s;
int result;
- result = regcomp(&re, pattern, REG_REGLESS | REG_TSTRING);
+ result = regcomp(&re, pattern, REG_MULTIPASS | REG_TSTRING);
if (result != 0) {
- fprintf(stderr, "[t-string] regcomp() failed for RE %s\n", pattern);
+ fprintf(stderr, "[t-string] regcomp() failed for regexp %s\n", pattern);
goto end;
}
// run multiple times to ensure everything gets cleaned up properly
- static const uint32_t NRUNS = 2;
+ static constexpr uint32_t NRUNS = 2;
for (uint32_t i = 0; i < NRUNS; ++i) {
tstr = regtstring(&re, string);
- if (tstr == NULL) {
+ if (tstr == nullptr) {
result = 1;
- fprintf(stderr, "regtstring() failed for RE %s and string %s\n",
- pattern, string);
+ fprintf(stderr, "regtstring() failed for regexp %s and string %s\n", pattern, string);
goto end;
}
}
@@ -163,7 +166,7 @@ static int test_tstring(const char *pattern, const char *string, const char *exp
for (size_t i = 0; i < tstr->length; ++i) {
const tchar_t c = tstr->string[i];
if (c < TAG_BASE) {
- assert(c < 0xff); // expect 1-byte characters
+ CHECK(c < 0xff); // expect 1-byte characters
s << static_cast<char>(c);
} else {
s << c - TAG_BASE;
@@ -173,8 +176,9 @@ static int test_tstring(const char *pattern, const char *string, const char *exp
if (strcmp(expected, s.str().c_str()) != 0) {
result = 1;
- fprintf(stderr, "incorrect t-string for RE %s and string %s:\n"
- "\texpect: %s\n\tactual: %s\n", pattern, string, expected, s.str().c_str());
+ fprintf(stderr,
+ "incorrect t-string for regexp %s and string %s:\n\texpect: %s\n\tactual: %s\n",
+ pattern, string, expected, s.str().c_str());
}
end:
@@ -182,70 +186,60 @@ end:
return result;
}
-static int test_all_posix(int f)
-{
+static int test_all_posix(int f) {
int e = 0;
e |= test(f, "(a+(c+))|(b+(d+))", "ac", "(0,2),(0,2),(1,2),(?,?),(?,?)");
e |= test(f, "(aaaa|aaa|a)+", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)");
e |= test(f, "(aaaa|aaa|a){3,}", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)");
- if (!(f & REG_BACKWARD)) {
- // expected failures for Cox algorithm
+ e |= test(f, "(aaaa|aaa|a){3,4}", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)");
- e |= test(f, "(aaaa|aaa|a){3,4}", "aaaaaaaaaa", "(0,10),(0,4)(4,8)(8,9)(9,10)");
+ e |= test(f, "(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa",
+ "(0,15),(0,6)(6,12)(12,14)(14,15)");
- e |= test(f, "(aaaaaa|aaaaa|aaaa|aa|a){3,4}", "aaaaaaaaaaaaaaa",
- "(0,15),(0,6)(6,12)(12,14)(14,15)");
+ e |= test(f, "(aaaaa?a?|aa?){1,4}", "aaaaaaaaaaaaaaa", "(0,15),(0,6)(6,12)(12,14)(14,15)");
- e |= test(f, "(aaaaa?a?|aa?){1,4}", "aaaaaaaaaaaaaaa",
- "(0,15),(0,6)(6,12)(12,14)(14,15)");
+ e |= test(f, "(((a){3,4}a?)()a|aa?){1,4}", "aaaaaaaaaaaaaaa",
+ "(0,15),(0,6)(6,12)(12,14)(14,15),(0,5)(6,11)(?,?)(?,?),"
+ "(0,1)(1,2)(2,3)(3,4)(6,7)(7,8)(8,9)(9,10)(?,?)(?,?),(5,5)(11,11)(?,?)(?,?)");
- e |= test(f, "(((a){3,4}a?)()a|aa?){1,4}", "aaaaaaaaaaaaaaa",
- "(0,15),(0,6)(6,12)(12,14)(14,15),(0,5)(6,11)(?,?)(?,?),"
- "(0,1)(1,2)(2,3)(3,4)(6,7)(7,8)(8,9)(9,10)(?,?)(?,?),(5,5)(11,11)(?,?)(?,?)");
+ e |= test(f, "(((aaaa?|a?)){1,4})+", "aaaaaaaaaa",
+ "(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10)");
- e |= test(f, "(((aaaa?|a?)){1,4})+", "aaaaaaaaaa",
- "(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10)");
+ e |= test(f, "((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa",
+ "(0,14),(0,14),(0,6)(6,12)(12,13)(13,14),(0,6)(6,12)(12,13)(13,14),"
+ "(0,1)(1,2)(2,3)(6,7)(7,8)(8,9)(?,?)(?,?),(3,4)(9,10)(?,?)(?,?),"
+ "(?,?)(?,?)(?,?)(?,?),(4,5)(10,11)(?,?)(?,?),(?,?)(?,?)(?,?)(?,?)");
- e |= test(f, "((((a){2,3}(()|a)(()|a?)a|a?)){2,5})*", "aaaaaaaaaaaaaa",
- "(0,14),(0,14),(0,6)(6,12)(12,13)(13,14),(0,6)(6,12)(12,13)(13,14),"
- "(0,1)(1,2)(2,3)(6,7)(7,8)(8,9)(?,?)(?,?),(3,4)(9,10)(?,?)(?,?),"
- "(?,?)(?,?)(?,?)(?,?),(4,5)(10,11)(?,?)(?,?),(?,?)(?,?)(?,?)(?,?)");
- }
- if (!((f & REG_BACKWARD) && (f & REG_GTOP))) {
- // expected failures for Cox algorithm
+ e |= test(f, "(((((a){3,3}a?|a?)){0,4})?)*", "aaaaaaaaaa",
+ "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),"
+ "(0,4)(4,8)(8,9)(9,10),(0,1)(1,2)(2,3)(4,5)(5,6)(6,7)(?,?)(?,?)");
- e |= test(f, "(((((a){3,3}a?|a?)){0,4})?)*", "aaaaaaaaaa",
- "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),"
- "(0,4)(4,8)(8,9)(9,10),(0,1)(1,2)(2,3)(4,5)(5,6)(6,7)(?,?)(?,?)");
+ e |= test(f, "(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa",
+ "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10),"
+ "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(?,?)(?,?),(?,?),(?,?),(?,?),(?,?)");
+
+ e |= test(f, "((a?|a?(a?a?)((())+)+))*", "aaaaaa",
+ "(0,6),(0,3)(3,6),(0,3)(3,6),(1,3)(4,6),(3,3)(6,6),(3,3)(6,6),(3,3)(6,6)");
- e |= test(f, "(((((a){3,4}|a?)){1,4}|((a)+(a|())){1,2}))*", "aaaaaaaaaa",
- "(0,10),(0,10),(0,10),(0,4)(4,8)(8,9)(9,10),(0,4)(4,8)(8,9)(9,10),"
- "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(?,?)(?,?),(?,?),(?,?),(?,?),(?,?)");
- }
- if (!((f & REG_BACKWARD) && !(f & REG_GTOP))) {
- // expected failures for Cox algorithm
- e |= test(f, "((a?|a?(a?a?)((())+)+))*", "aaaaaa",
- "(0,6),(0,3)(3,6),(0,3)(3,6),(1,3)(4,6),(3,3)(6,6),(3,3)(6,6),(3,3)(6,6)");
- }
e |= test(f, "(((a?a)){2,3})*", "aaaa", "(0,4),(0,4),(0,2)(2,4),(0,2)(2,4)");
e |= test(f, "(((aa?a)((aaa)+))+)+", "aaaaaaaaaa",
- "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)");
+ "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)");
e |= test(f, "(((aaa?)((aaa)+))+)+", "aaaaaaaaaa",
- "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)");
+ "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)");
e |= test(f, "(((aaa?)((aaa){1,3})){1,2})*", "aaaaaaaaaa",
- "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)");
+ "(0,10),(0,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),(2,5)(7,10)");
e |= test(f, "((((aaa?)(aaa){1,3})){1,2})*", "aaaaaaaaaa",
- "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10)");
+ "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10)");
e |= test(f, "((((aaa?)((a){3,3}){1,3})){1,2})*", "aaaaaaaaaa",
- "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),"
- "(2,3)(3,4)(4,5)(7,8)(8,9)(9,10)");
+ "(0,10),(0,10),(0,5)(5,10),(0,5)(5,10),(0,2)(5,7),(2,5)(7,10),"
+ "(2,3)(3,4)(4,5)(7,8)(8,9)(9,10)");
e |= test(f, "(a|aa)*", "", "(0,0),(?,?)");
e |= test(f, "(a|aa)*", "a", "(0,1),(0,1)");
@@ -279,25 +273,19 @@ static int test_all_posix(int f)
e |= test(f, "((y?){2,3}){2}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)");
- e |= test(f, "((y?){2,3}){2,3}", "yyyyy",
- "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)");
+ e |= test(f, "((y?){2,3}){2,3}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)");
e |= test(f, "((y?){2,3})*", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)");
- e |= test(f, "((y?){3}){2}", "yyyyy",
- "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
+ e |= test(f, "((y?){3}){2}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
- e |= test(f, "((y?){3}){2,3}", "yyyyy",
- "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
+ e |= test(f, "((y?){3}){2,3}", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
- e |= test(f, "((y?){3})*", "yyyyy",
- "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
+ e |= test(f, "((y?){3})*", "yyyyy", "(0,5),(0,3)(3,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
- e |= test(f, "((y?)*){2}", "yyyyy",
- "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
+ e |= test(f, "((y?)*){2}", "yyyyy", "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
- e |= test(f, "((y?)*){2,3}", "yyyyy",
- "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
+ e |= test(f, "((y?)*){2,3}", "yyyyy", "(0,5),(0,5)(5,5),(0,1)(1,2)(2,3)(3,4)(4,5)(5,5)");
e |= test(f, "((y?)*)*", "yyyyy", "(0,5),(0,5),(0,1)(1,2)(2,3)(3,4)(4,5)");
@@ -306,12 +294,8 @@ static int test_all_posix(int f)
e |= test(f, "((a)|(b))*", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)");
e |= test(f, "((a?)|(b?))*", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)");
e |= test(f, "((a?)|(b?))*", "ba", "(0,2),(0,1)(1,2),(?,?)(1,2),(0,1)(?,?)");
-
- if (!((f & REG_BACKWARD) && !(f & REG_GTOP))) {
- e |= test(f, "((a?)|(b?)){2,3}", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)");
- }
- e |= test(f, "((a?)|(b?)){3}", "ab",
- "(0,2),(0,1)(1,2)(2,2),(0,1)(?,?)(2,2),(?,?)(1,2)(?,?)");
+ e |= test(f, "((a?)|(b?)){2,3}", "ab", "(0,2),(0,1)(1,2),(0,1)(?,?),(?,?)(1,2)");
+ e |= test(f, "((a?)|(b?)){3}", "ab", "(0,2),(0,1)(1,2)(2,2),(0,1)(?,?)(2,2),(?,?)(1,2)(?,?)");
e |= test(f, "y{3}", "yyy", "(0,3)");
e |= test(f, "y{0,2}", "", "(0,0)");
@@ -377,16 +361,16 @@ static int test_all_posix(int f)
e |= test(f, "((((((x))))))", "x", "(0,1),(0,1),(0,1),(0,1),(0,1),(0,1),(0,1)");
e |= test(f, "((((((x))))))*", "xx",
- "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)");
+ "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)");
e |= test(f, "a?(ab|ba)*",
- "abababababababababababababababababababab"
- "ababababababababababababababababababababa",
- "(0,81),"
- "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)"
- "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)"
- "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)"
- "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)");
+ "abababababababababababababababababababab"
+ "ababababababababababababababababababababa",
+ "(0,81),"
+ "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)"
+ "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)"
+ "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)"
+ "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)");
e |= test(f, "a*a*a*a*a*b", "aaaaaaaaab", "(0,10)");
e |= test(f, "a[b]+bc", "abbc", "(0,4)");
@@ -425,8 +409,7 @@ static int test_all_posix(int f)
// categorize
e |= test(f, "(a*)(ab)*(b*)", "abc", "(0,2),(0,1),(?,?),(1,2)");
- e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba",
- "(0,3),(0,2),(0,0),(0,2),(2,3),(2,2),(2,3)");
+ e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba", "(0,3),(0,2),(0,0),(0,2),(2,3),(2,2),(2,3)");
e |= test(f, "(...?.?)*", "xxxxxx", "(0,6),(0,4)(4,6)");
e |= test(f, "(a|ab)(bc|c)", "abcabc", "(0,3),(0,2),(2,3)");
@@ -437,8 +420,7 @@ static int test_all_posix(int f)
e |= test(f, "(a(b)?)+", "aba", "(0,3),(0,2)(2,3),(1,2)(?,?)");
e |= test(f, ".*(.*)", "ab", "(0,2),(2,2)");
- e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab",
- "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)");
+ e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab", "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)");
// forcedassoc
e |= test(f, "(a|ab)(c|bcd)", "abcd", "(0,4),(0,1),(1,4)");
@@ -520,8 +502,7 @@ static int test_all_posix(int f)
e |= test(f, "((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3)");
e |= test(f, "((..)*(...)*)*", "xxx", "(0,3),(0,3),(?,?),(0,3)");
- e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx",
- "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)");
+ e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)");
// nullsubexpr
e |= test(f, "(a*)*", "a", "(0,1),(0,1)");
@@ -596,19 +577,16 @@ static int test_all_posix(int f)
e |= test(f, "(aa*|aaa*)(aa*|aaa*)", "aaaaaa", "(0,6),(0,5),(5,6)");
e |= test(f, "(aa*|aaa*){2}", "aaaaaa", "(0,6),(0,5)(5,6)");
e |= test(f, "((aa)*|(aaa)*)((aa)*|(aaa)*)", "aaaaaa",
- "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)");
+ "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)");
e |= test(f, "((aa)*|(aaa)*){2}", "aaaaaa",
- "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)");
- e |= test(f, "(aa)*|(aaa)*", "aaaaaa",
- "(0,6),(0,2)(2,4)(4,6),(?,?)");
-
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,3)(3,4)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,4)(4,6)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY",
- "(0,8),(0,3)(3,7)(7,8)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY",
- "(0,10),(0,4)(4,8)(8,10)");
+ "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)");
+ e |= test(f, "(aa)*|(aaa)*", "aaaaaa", "(0,6),(0,2)(2,4)(4,6),(?,?)");
+
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,3)(3,4)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,4)(4,6)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", "(0,8),(0,3)(3,7)(7,8)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", "(0,10),(0,4)(4,8)(8,10)");
e |= test(f, "(y){2}", "");
e |= test(f, "(y){2}", "y");
@@ -675,66 +653,62 @@ static int test_all_posix(int f)
e |= test(f, "((..)|(.))", "aa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aa",
- "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)");
+ "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aa");
e |= test(f, "((..)|(.)){1}", "aa", "(0,2),(0,2),(0,2),(?,?)");
- e |= test(f, "((..)|(.)){2}", "aa",
- "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)");
+ e |= test(f, "((..)|(.)){2}", "aa", "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)");
e |= test(f, "((..)|(.)){3}", "aa");
e |= test(f, "((..)|(.))*", "aa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))", "aaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaa",
- "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)");
+ "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaa",
- "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)");
- e |= test(f, "((..)|(.)){1}", "aaa",
- "(0,2),(0,2),(0,2),(?,?)");
- e |= test(f, "((..)|(.)){2}", "aaa",
- "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
+ "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)");
+ e |= test(f, "((..)|(.)){1}", "aaa", "(0,2),(0,2),(0,2),(?,?)");
+ e |= test(f, "((..)|(.)){2}", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
e |= test(f, "((..)|(.)){3}", "aaa",
- "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)");
- e |= test(f, "((..)|(.))*", "aaa",
- "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
+ "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)");
+ e |= test(f, "((..)|(.))*", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
e |= test(f, "((..)|(.))", "aaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
+ "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)");
+ "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)");
e |= test(f, "((..)|(.)){1}", "aaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.)){2}", "aaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.)){3}", "aaaa",
- "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)");
+ "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)");
e |= test(f, "((..)|(.))*", "aaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.))", "aaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
+ "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaa",
- "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)");
+ "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)");
e |= test(f, "((..)|(.)){1}", "aaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.)){2}", "aaaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.)){3}", "aaaaa",
- "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
+ "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
e |= test(f, "((..)|(.))*", "aaaaa",
- "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
+ "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
e |= test(f, "((..)|(.))", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaaaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
+ "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaaa",
- "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)");
+ "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)");
e |= test(f, "((..)|(.)){1}", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.)){2}", "aaaaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.)){3}", "aaaaaa",
- "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
+ "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
e |= test(f, "((..)|(.))*", "aaaaaa",
- "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
+ "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
e |= test(f, "X(.?){0,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){1,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
@@ -744,8 +718,7 @@ static int test_all_posix(int f)
e |= test(f, "X(.?){5,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){6,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){7,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
- e |= test(f, "X(.?){8,}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){8,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
e |= test(f, "X(.?){0,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){1,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){2,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
@@ -754,8 +727,7 @@ static int test_all_posix(int f)
e |= test(f, "X(.?){5,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){6,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){7,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
- e |= test(f, "X(.?){8,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){8,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
e |= test(f, "(a|ab|c|bcd){0,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)");
e |= test(f, "(a|ab|c|bcd){1,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)");
@@ -798,59 +770,53 @@ static int test_all_posix(int f)
if (!(f & REG_NFA)) {
e |= test(f, "((a?){1,300})*",
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
- "(0,50),(0,50),"
- "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)"
- "(10,11)(11,12)(12,13)(13,14)(14,15)(15,16)(16,17)(17,18)(18,19)(19,20)"
- "(20,21)(21,22)(22,23)(23,24)(24,25)(25,26)(26,27)(27,28)(28,29)(29,30)"
- "(30,31)(31,32)(32,33)(33,34)(34,35)(35,36)(36,37)(37,38)(38,39)(39,40)"
- "(40,41)(41,42)(42,43)(43,44)(44,45)(45,46)(46,47)(47,48)(48,49)(49,50)");
- }
- else if (!(f & (REG_SLOWPREC | REG_KUKLEWICZ))) {
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "(0,50),(0,50),"
+ "(0,1)(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,9)(9,10)"
+ "(10,11)(11,12)(12,13)(13,14)(14,15)(15,16)(16,17)(17,18)(18,19)(19,20)"
+ "(20,21)(21,22)(22,23)(23,24)(24,25)(25,26)(26,27)(27,28)(28,29)(29,30)"
+ "(30,31)(31,32)(32,33)(33,34)(34,35)(35,36)(36,37)(37,38)(38,39)(39,40)"
+ "(40,41)(41,42)(42,43)(43,44)(44,45)(45,46)(46,47)(47,48)(48,49)(49,50)");
+ } else if (!(f & REG_SLOWPREC)) {
e |= test(f, "((a?){1,1000})*", "aaaa", "(0,4),(0,4),(3,4)");
e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "",
- "(0,0),(0,0),(0,0),(0,0),(0,0),(?,?),(0,0),(0,0)");
-
- if (!((f & REG_BACKWARD) && !(f & REG_GTOP))) {
- // expected failures for Cox algorithm
+ "(0,0),(0,0),(0,0),(0,0),(0,0),(?,?),(0,0),(0,0)");
- e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa",
- "(0,3),(0,3),(0,3),(0,3),(0,3),(?,?),(0,3),(2,3)");
+ e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaa",
+ "(0,3),(0,3),(0,3),(0,3),(0,3),(?,?),(0,3),(2,3)");
- e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa",
- "(0,5),(0,5),(0,5),(0,5),(0,5),(?,?),(0,5),(4,5)");
- }
+ e |= test(f, "(((((aa)|((a?)*))*){0,10}){0,10}){0,10}", "aaaaa",
+ "(0,5),(0,5),(0,5),(0,5),(0,5),(?,?),(0,5),(4,5)");
}
e |= test(f, "((((a?)+)|(aa))+)", "aaa",
- "(0,3),(0,3),(0,3),(0,3),(0,1)(1,2)(2,3),(?,?)");
+ "(0,3),(0,3),(0,3),(0,3),(0,1)(1,2)(2,3),(?,?)");
e |= test(f, "(((aa)|((a?)+))+)", "aaa",
- "(0,3),(0,3),(0,3),(?,?),(0,3),(0,1)(1,2)(2,3)");
+ "(0,3),(0,3),(0,3),(?,?),(0,3),(0,1)(1,2)(2,3)");
e |= test(f, "((a?){1,2}|(a)*)*", "aaaa", "(0,4),(0,4),(?,?),(0,1)(1,2)(2,3)(3,4)");
e |= test(f, "(((a?){2,3}|(a)*))*", "aaaaa",
- "(0,5),(0,5),(0,5),(?,?),(0,1)(1,2)(2,3)(3,4)(4,5)");
+ "(0,5),(0,5),(0,5),(?,?),(0,1)(1,2)(2,3)(3,4)(4,5)");
e |= test(f, "(((a?)|(a?a?))+)", "aa", "(0,2),(0,2),(0,2),(?,?),(0,2)");
e |= test(f, "((((a)*))*|((((a))*))+)*", "aa",
- "(0,2),(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?),(?,?)");
+ "(0,2),(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?),(?,?)");
e |= test(f, "(((a)*)*|((a)*)+)*", "aa", "(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?)");
e |= test(f, "(((a)+)|(((a)+)?))+", "aa",
- "(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?)");
+ "(0,2),(0,2),(0,2),(0,1)(1,2),(?,?),(?,?),(?,?)");
e |= test(f, "(a*|(a)*)*", "aa", "(0,2),(0,2),(?,?)");
return e;
}
-static int test_all_leftmost(int f)
-{
+static int test_all_leftmost(int f) {
int e = 0;
e |= test(f, "a", "a", "(0,1)");
@@ -902,16 +868,16 @@ static int test_all_leftmost(int f)
e |= test(f, "((((((x))))))", "x", "(0,1),(0,1),(0,1),(0,1),(0,1),(0,1),(0,1)");
e |= test(f, "((((((x))))))*", "xx",
- "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)");
+ "(0,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2),(0,1)(1,2)");
e |= test(f, "a?(ab|ba)*",
- "abababababababababababababababababababab"
- "ababababababababababababababababababababa",
- "(0,81),"
- "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)"
- "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)"
- "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)"
- "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)");
+ "abababababababababababababababababababab"
+ "ababababababababababababababababababababa",
+ "(0,81),"
+ "(1,3)(3,5)(5,7)(7,9)(9,11)(11,13)(13,15)(15,17)(17,19)(19,21)(21,23)"
+ "(23,25)(25,27)(27,29)(29,31)(31,33)(33,35)(35,37)(37,39)(39,41)(41,43)"
+ "(43,45)(45,47)(47,49)(49,51)(51,53)(53,55)(55,57)(57,59)(59,61)(61,63)"
+ "(63,65)(65,67)(67,69)(69,71)(71,73)(73,75)(75,77)(77,79)(79,81)");
e |= test(f, "a*a*a*a*a*b", "aaaaaaaaab", "(0,10)");
e |= test(f, "a[b]+bc", "abbc", "(0,4)");
@@ -950,8 +916,7 @@ static int test_all_leftmost(int f)
// categorize
e |= test(f, "(a*)(ab)*(b*)", "abc", "(0,2),(0,1),(?,?),(1,2)");
- e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba",
- "(0,3),(0,1),(0,1),(?,?),(1,3),(1,2),(2,3)");
+ e |= test(f, "((a*)(ab)*)((b*)(a*))", "aba", "(0,3),(0,1),(0,1),(?,?),(1,3),(1,2),(2,3)");
e |= test(f, "(...?.?)*", "xxxxxx", "(0,6),(0,4)(4,6)");
e |= test(f, "(a|ab)(bc|c)", "abcabc", "(0,3),(0,1),(1,3)");
@@ -962,8 +927,7 @@ static int test_all_leftmost(int f)
e |= test(f, "(a(b)?)+", "aba", "(0,3),(0,2)(2,3),(1,2)(?,?)");
e |= test(f, ".*(.*)", "ab", "(0,2),(2,2)");
- e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab",
- "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)");
+ e |= test(f, "(a?)((ab)?)(b?)a?(ab)?b?", "abab", "(0,4),(0,1),(1,1),(?,?),(1,2),(?,?)");
// forcedassoc
e |= test(f, "(a|ab)(c|bcd)", "abcd", "(0,4),(0,1),(1,4)");
@@ -1042,8 +1006,7 @@ static int test_all_leftmost(int f)
e |= test(f, "((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3)");
e |= test(f, "((..)*(...)*)*", "xxx", "(0,3),(0,3),(?,?),(0,3)");
- e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx",
- "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)");
+ e |= test(f, "((..)*(...)*)((..)*(...)*)", "xxx", "(0,3),(0,3),(?,?),(0,3),(3,3),(?,?),(?,?)");
// nullsubexpr
e |= test(f, "(a*)*", "a", "(0,1),(0,1)");
@@ -1121,17 +1084,16 @@ static int test_all_leftmost(int f)
e |= test(f, "(aa*|aaa*)(aa*|aaa*)", "aaaaaa", "(0,6),(0,5),(5,6)");
e |= test(f, "(aa*|aaa*){2}", "aaaaaa", "(0,6),(0,5)(5,6)");
e |= test(f, "((aa)*|(aaa)*)((aa)*|(aaa)*)", "aaaaaa",
- "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)");
+ "(0,6),(0,6),(0,2)(2,4)(4,6),(?,?),(6,6),(?,?),(?,?)");
e |= test(f, "((aa)*|(aaa)*){2}", "aaaaaa",
- "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)");
+ "(0,6),(0,6)(6,6),(0,2)(2,4)(4,6)(?,?),(?,?)(?,?)");
e |= test(f, "(aa)*|(aaa)*", "aaaaaa", "(0,6),(0,2)(2,4)(4,6),(?,?)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,2)(2,4)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,1)(1,5)(5,6)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", "(0,8),(0,2)(2,6)(6,8)");
- e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY",
- "(0,10),(0,1)(1,5)(5,9)(9,10)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XY", "(0,2),(0,1)(1,2)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabY", "(0,4),(0,2)(2,4)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababY", "(0,6),(0,1)(1,5)(5,6)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XabababY", "(0,8),(0,2)(2,6)(6,8)");
+ e |= test(f, "(X|Xa|Xab|Xaba|abab|baba|bY|Y)*", "XababababY", "(0,10),(0,1)(1,5)(5,9)(9,10)");
e |= test(f, "(y){2}", "");
e |= test(f, "(y){2}", "y");
@@ -1197,65 +1159,62 @@ static int test_all_leftmost(int f)
e |= test(f, "((..)|(.))", "aa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aa",
- "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)");
+ "(0,2),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aa");
e |= test(f, "((..)|(.)){1}", "aa", "(0,2),(0,2),(0,2),(?,?)");
- e |= test(f, "((..)|(.)){2}", "aa",
- "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)");
+ e |= test(f, "((..)|(.)){2}", "aa", "(0,2),(0,1)(1,2),(?,?)(?,?),(0,1)(1,2)");
e |= test(f, "((..)|(.)){3}", "aa");
e |= test(f, "((..)|(.))*", "aa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))", "aaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaa",
- "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)");
+ "(0,3),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaa",
- "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)");
+ "(0,3),(0,1),(?,?),(0,1),(1,2),(?,?),(1,2),(2,3),(?,?),(2,3)");
e |= test(f, "((..)|(.)){1}", "aaa", "(0,2),(0,2),(0,2),(?,?)");
- e |= test(f, "((..)|(.)){2}", "aaa",
- "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
+ e |= test(f, "((..)|(.)){2}", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
e |= test(f, "((..)|(.)){3}", "aaa",
- "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)");
- e |= test(f, "((..)|(.))*", "aaa",
- "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
+ "(0,3),(0,1)(1,2)(2,3),(?,?)(?,?)(?,?),(0,1)(1,2)(2,3)");
+ e |= test(f, "((..)|(.))*", "aaa", "(0,3),(0,2)(2,3),(0,2)(?,?),(?,?)(2,3)");
e |= test(f, "((..)|(.))", "aaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
+ "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)");
+ "(0,4),(0,2),(0,2),(?,?),(2,3),(?,?),(2,3),(3,4),(?,?),(3,4)");
e |= test(f, "((..)|(.)){1}", "aaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.)){2}", "aaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.)){3}", "aaaa",
- "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)");
+ "(0,4),(0,2)(2,3)(3,4),(0,2)(?,?)(?,?),(?,?)(2,3)(3,4)");
e |= test(f, "((..)|(.))*", "aaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.))", "aaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
+ "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaa",
- "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)");
+ "(0,5),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,5),(?,?),(4,5)");
e |= test(f, "((..)|(.)){1}", "aaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.)){2}", "aaaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.)){3}", "aaaaa",
- "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
+ "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
e |= test(f, "((..)|(.))*", "aaaaa",
- "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
+ "(0,5),(0,2)(2,4)(4,5),(0,2)(2,4)(?,?),(?,?)(?,?)(4,5)");
e |= test(f, "((..)|(.))", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))", "aaaaaa",
- "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
+ "(0,4),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?)");
e |= test(f, "((..)|(.))((..)|(.))((..)|(.))", "aaaaaa",
- "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)");
+ "(0,6),(0,2),(0,2),(?,?),(2,4),(2,4),(?,?),(4,6),(4,6),(?,?)");
e |= test(f, "((..)|(.)){1}", "aaaaaa", "(0,2),(0,2),(0,2),(?,?)");
e |= test(f, "((..)|(.)){2}", "aaaaaa",
- "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
+ "(0,4),(0,2)(2,4),(0,2)(2,4),(?,?)(?,?)");
e |= test(f, "((..)|(.)){3}", "aaaaaa",
- "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
+ "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
e |= test(f, "((..)|(.))*", "aaaaaa",
- "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
+ "(0,6),(0,2)(2,4)(4,6),(0,2)(2,4)(4,6),(?,?)(?,?)(?,?)");
e |= test(f, "X(.?){0,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){1,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
@@ -1266,26 +1225,16 @@ static int test_all_leftmost(int f)
e |= test(f, "X(.?){6,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
e |= test(f, "X(.?){7,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)");
- e |= test(f, "X(.?){8,}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){0,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){1,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){2,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){3,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){4,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){5,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){6,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){7,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
- e |= test(f, "X(.?){8,8}Y", "X1234567Y",
- "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){8,}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){0,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){1,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){2,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){3,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){4,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){5,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){6,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){7,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
+ e |= test(f, "X(.?){8,8}Y", "X1234567Y", "(0,9),(1,2)(2,3)(3,4)(4,5)(5,6)(6,7)(7,8)(8,8)");
e |= test(f, "(a|ab|c|bcd){0,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)");
e |= test(f, "(a|ab|c|bcd){1,}(d*)", "ababcd", "(0,6),(0,2)(2,3)(3,6),(6,6)");
@@ -1327,20 +1276,15 @@ static int test_all_leftmost(int f)
e |= test(f, "(ab|a)(bcd|c)(d|.*)", "abcd", "(0,4),(0,2),(2,3),(3,4)");
// other
- if (!(f & REG_STADFA)) {
- // TODO: Find out why this test takes a long time on staDFA.
-
- std::string s = "(0,4),(0,4),(0,1)(1,2)(2,3)(3,4)";
- for (size_t i = 0; i < 1000 - 4; ++i) s += "(4,4)";
+ std::string s = "(0,4),(0,4),(0,1)(1,2)(2,3)(3,4)";
+ for (size_t i = 0; i < 1000 - 4; ++i) s += "(4,4)";
- e |= test(f, "((a?){1,1000})*", "aaaa", s.c_str());
- }
+ e |= test(f, "((a?){1,1000})*", "aaaa", s.c_str());
return e;
}
-static int test_all_tstring()
-{
+static int test_all_tstring() {
int e = 0;
e |= test_tstring("a", "a", "1 a 2 ");
@@ -1360,41 +1304,27 @@ static int test_all_tstring()
return e;
}
-int main()
-{
+int main() {
int e = 0;
e |= test_all_posix(0);
- e |= test_all_posix(REG_STADFA);
- e |= test_all_posix(REG_REGLESS);
+ e |= test_all_posix(REG_MULTIPASS);
e |= test_all_posix(REG_SUBHIST);
- e |= test_all_posix(REG_SUBHIST | REG_REGLESS);
+ e |= test_all_posix(REG_SUBHIST | REG_MULTIPASS);
e |= test_all_leftmost(REG_LEFTMOST);
- e |= test_all_leftmost(REG_LEFTMOST | REG_STADFA);
- e |= test_all_leftmost(REG_LEFTMOST | REG_REGLESS);
+ e |= test_all_leftmost(REG_LEFTMOST | REG_MULTIPASS);
e |= test_all_leftmost(REG_LEFTMOST | REG_SUBHIST);
- e |= test_all_leftmost(REG_LEFTMOST | REG_SUBHIST | REG_REGLESS);
+ e |= test_all_leftmost(REG_LEFTMOST | REG_SUBHIST | REG_MULTIPASS);
e |= test_all_posix(REG_NFA);
- e |= test_all_posix(REG_NFA | REG_GTOP);
-
- e |= test_all_posix(REG_NFA | REG_KUKLEWICZ);
- e |= test_all_posix(REG_NFA | REG_KUKLEWICZ | REG_GTOP);
-
e |= test_all_posix(REG_NFA | REG_TRIE);
- e |= test_all_posix(REG_NFA | REG_TRIE | REG_GTOP);
-
e |= test_all_posix(REG_NFA | REG_SLOWPREC);
e |= test_all_leftmost(REG_NFA | REG_LEFTMOST);
e |= test_all_leftmost(REG_NFA | REG_LEFTMOST | REG_TRIE);
- e |= test_all_posix(REG_NFA | REG_BACKWARD);
- e |= test_all_posix(REG_NFA | REG_BACKWARD | REG_GTOP);
-
e |= test_all_tstring();
return e;
}
-
diff --git a/lib/test_helper.h b/lib/test_helper.h
index 47dacf56..568892bf 100644
--- a/lib/test_helper.h
+++ b/lib/test_helper.h
@@ -6,15 +6,14 @@
// Parse test submatch result string.
//
-// Expected format: comma-separated lists of offset pairs in parentheses, each
-// list contains as many pairs as there are repetitions for the corresponding
-// capturing group.
+// Expected format: comma-separated lists of offset pairs in parentheses, each list contains as many
+// pairs as there are repetitions for the corresponding capturing group.
//
-// Example for RE "(a(b)?)*" and string "aba": "(0,3),(0,2)(2,3),(1,2)(?,?)".
+// Example for regexp "(a(b)?)*" and string "aba": "(0,3),(0,2)(2,3),(1,2)(?,?)".
//
-// String representation of result is used instead of vectors and initializer
-// lists because C++ compilers take too long to process that many vectors, and
-// the compilation gets very slow.
-bool parse_submatch(const char *s, std::vector<std::vector<regoff_t> > &result);
+// String representation of result is used instead of vectors and initializer lists because C++
+// compilers take too long to process that many vectors, and the compilation gets very slow.
+//
+bool parse_submatch(const char* s, std::vector<std::vector<regoff_t> >& result);
#endif // _RE2C_LIB_TEST_HELPER_
diff --git a/lib/test_helper.re b/lib/test_helper.re
index 9ae6cc23..40b2d9c4 100644
--- a/lib/test_helper.re
+++ b/lib/test_helper.re
@@ -1,8 +1,6 @@
#include "lib/test_helper.h"
-
-static inline regoff_t parse_offset(const char *s, const char *e)
-{
+static inline regoff_t parse_offset(const char* s, const char* e) {
regoff_t n = 0;
for (; s < e; ++s) {
n = n * 10 + (*s - '0');
@@ -10,39 +8,37 @@ static inline regoff_t parse_offset(const char *s, const char *e)
return n;
}
-bool parse_submatch(const char *s, std::vector<std::vector<regoff_t> > &result)
-{
+bool parse_submatch(const char* s, std::vector<std::vector<regoff_t> >& result) {
result.clear();
if (!s) return true;
- std::vector<regoff_t> *v;
- const char *n1, *n2, *m1, *m2, *YYMARKER, *YYCURSOR = s;
+ std::vector<regoff_t>* v;
+ const char* n1, *n2, *m1, *m2, *YYMARKER, *YYCURSOR = s;
/*!stags:re2c format = "const char *@@;"; */
next:
result.push_back(std::vector<regoff_t>());
v = &result.back();
-loop:
- /*!re2c
- re2c:define:YYCTYPE = char;
- re2c:yyfill:enable = 0;
- re2c:flags:tags = 1;
-
- * { return false; }
- "\x00" { return true; }
- "," { goto next; }
-
- "(?,?)" {
- v->push_back(-1);
- v->push_back(-1);
- goto loop;
- }
-
- "(" @n1 [0-9]+ @n2 "," @m1 [0-9]+ @m2 ")" {
- v->push_back(parse_offset(n1, n2));
- v->push_back(parse_offset(m1, m2));
- goto loop;
- }
- */
+loop: /*!re2c
+ re2c:define:YYCTYPE = char;
+ re2c:yyfill:enable = 0;
+ re2c:flags:tags = 1;
+
+ * { return false; }
+ "\x00" { return true; }
+ "," { goto next; }
+
+ "(?,?)" {
+ v->push_back(-1);
+ v->push_back(-1);
+ goto loop;
+ }
+
+ "(" @n1 [0-9]+ @n2 "," @m1 [0-9]+ @m2 ")" {
+ v->push_back(parse_offset(n1, n2));
+ v->push_back(parse_offset(m1, m2));
+ goto loop;
+ }
+*/
}