diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2018-02-20 15:37:38 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2018-02-20 15:37:38 +0900 |
commit | a3a5ffe62bc766304e422d5ce04d22638cf6eacd (patch) | |
tree | 2c260ddbe400507216779578de476cfef3736225 | |
parent | 4214b4091520393b9c06dca78eb304cfbcee2c04 (diff) | |
download | re2-a3a5ffe62bc766304e422d5ce04d22638cf6eacd.tar.gz re2-a3a5ffe62bc766304e422d5ce04d22638cf6eacd.tar.bz2 re2-a3a5ffe62bc766304e422d5ce04d22638cf6eacd.zip |
Imported Upstream version 20180101upstream/20180101
Change-Id: I927f049290b4c0f3c9ffc5da1d2deda29d583f8a
Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r-- | CMakeLists.txt | 7 | ||||
-rwxr-xr-x | kokoro/windows-cmake.bat | 4 | ||||
-rw-r--r-- | re2/dfa.cc | 2 | ||||
-rw-r--r-- | re2/fuzzing/re2_fuzzer.cc | 2 | ||||
-rw-r--r-- | re2/parse.cc | 335 | ||||
-rw-r--r-- | re2/regexp.h | 3 | ||||
-rw-r--r-- | re2/testing/dfa_test.cc | 4 | ||||
-rw-r--r-- | re2/testing/parse_test.cc | 12 | ||||
-rw-r--r-- | re2/testing/simplify_test.cc | 2 |
9 files changed, 239 insertions, 132 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 884873b..f87c4ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,8 +22,8 @@ option(RE2_BUILD_TESTING "enable testing for RE2" ON) set(EXTRA_TARGET_LINK_LIBRARIES) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - if(MSVC_VERSION LESS 1800) - message(FATAL_ERROR "you need Visual Studio 2013 or later") + if(MSVC_VERSION LESS 1900) + message(FATAL_ERROR "you need Visual Studio 2015 or later") endif() if(BUILD_SHARED_LIBS) # See http://www.kitware.com/blog/home/post/939 for details. @@ -33,6 +33,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX, # so we disable various warnings that aren't particularly helpful. add_compile_options(/wd4100 /wd4201 /wd4456 /wd4457 /wd4702 /wd4815) + # Without a byte order mark (BOM), Visual Studio assumes that the source + # file is encoded using the current user code page, so we specify UTF-8. + add_compile_options(/utf-8) elseif(CYGWIN OR MINGW) # See https://stackoverflow.com/questions/38139631 for details. add_compile_options(-std=gnu++11) diff --git a/kokoro/windows-cmake.bat b/kokoro/windows-cmake.bat index b6d2f6f..6b04b65 100755 --- a/kokoro/windows-cmake.bat +++ b/kokoro/windows-cmake.bat @@ -1,10 +1,10 @@ CD git/re2 || EXIT /B 1 -cmake -D CMAKE_BUILD_TYPE=Debug -G "Visual Studio 12 2013" -A x64 . || EXIT /B 1 +cmake -D CMAKE_BUILD_TYPE=Debug -G "Visual Studio 14 2015" -A x64 . || EXIT /B 1 cmake --build . --config Debug --clean-first || EXIT /B 1 ctest -C Debug --output-on-failure -E dfa^|exhaustive^|random || EXIT /B 1 -cmake -D CMAKE_BUILD_TYPE=Release -G "Visual Studio 12 2013" -A x64 . || EXIT /B 1 +cmake -D CMAKE_BUILD_TYPE=Release -G "Visual Studio 14 2015" -A x64 . || EXIT /B 1 cmake --build . --config Release --clean-first || EXIT /B 1 ctest -C Release --output-on-failure -E dfa^|exhaustive^|random || EXIT /B 1 @@ -126,7 +126,7 @@ class DFA { // into this state, along with kFlagMatch if this // is a matching state. -// Work around the bug affecting flexible array members in GCC 6.1 and 6.2. +// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932) #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1 std::atomic<State*> next_[0]; // Outgoing arrows from State, diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index e0aa96a..d3cd145 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -56,7 +56,7 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { // Entry point for libFuzzer. extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - if (size == 0 || size > 1024) + if (size == 0 || size > 512) return 0; // Crudely limit the use of \p and \P. diff --git a/re2/parse.cc b/re2/parse.cc index 2b46e3e..3374345 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -23,6 +23,7 @@ #include <algorithm> #include <map> #include <string> +#include <vector> #include "util/util.h" #include "util/logging.h" @@ -865,59 +866,180 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { } } +// In the context of factoring alternations, a Splice is: a factored prefix or +// merged character class computed by one iteration of one round of factoring; +// the span of subexpressions of the alternation to be "spliced" (i.e. removed +// and replaced); and, for a factored prefix, the number of suffixes after any +// factoring that might have subsequently been performed on them. For a merged +// character class, there are no suffixes, of course, so the field is ignored. +struct Splice { + Splice(Regexp* prefix, Regexp** sub, int nsub) + : prefix(prefix), + sub(sub), + nsub(nsub), + nsuffix(-1) {} + + Regexp* prefix; + Regexp** sub; + int nsub; + int nsuffix; +}; + +// Named so because it is used to implement an explicit stack, a Frame is: the +// span of subexpressions of the alternation to be factored; the current round +// of factoring; any Splices computed; and, for a factored prefix, an iterator +// to the next Splice to be factored (i.e. in another Frame) because suffixes. +struct Frame { + Frame(Regexp** sub, int nsub) + : sub(sub), + nsub(nsub), + round(0) {} + + Regexp** sub; + int nsub; + int round; + std::vector<Splice> splices; + std::vector<Splice>::iterator spliceiter; +}; + +// Bundled into a class for friend access to Regexp without needing to declare +// (or define) Splice in regexp.h. +class FactorAlternationImpl { + public: + static void Round1(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices); + static void Round2(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices); + static void Round3(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices); +}; + // Factors common prefixes from alternation. // For example, // ABC|ABD|AEF|BCX|BCY // simplifies to // A(B(C|D)|EF)|BC(X|Y) -// which the normal parse state routines will further simplify to +// and thence to // A(B[CD]|EF)|BC[XY] // // Rewrites sub to contain simplified list to alternate and returns // the new length of sub. Adjusts reference counts accordingly // (incoming sub[i] decremented, outgoing sub[i] incremented). +int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { + std::vector<Frame> stk; + stk.emplace_back(sub, nsub); + + for (;;) { + auto& sub = stk.back().sub; + auto& nsub = stk.back().nsub; + auto& round = stk.back().round; + auto& splices = stk.back().splices; + auto& spliceiter = stk.back().spliceiter; + + if (splices.empty()) { + // Advance to the next round of factoring. Note that this covers + // the initialised state: when splices is empty and round is 0. + round++; + } else if (spliceiter != splices.end()) { + // We have at least one more Splice to factor. Recurse logically. + stk.emplace_back(spliceiter->sub, spliceiter->nsub); + continue; + } else { + // We have no more Splices to factor. Apply them. + auto iter = splices.begin(); + int out = 0; + for (int i = 0; i < nsub; ) { + // Copy until we reach where the next Splice begins. + while (sub + i < iter->sub) + sub[out++] = sub[i++]; + switch (round) { + case 1: + case 2: { + // Assemble the Splice prefix and the suffixes. + Regexp* re[2]; + re[0] = iter->prefix; + re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags); + sub[out++] = Regexp::Concat(re, 2, flags); + i += iter->nsub; + break; + } + case 3: + // Just use the Splice prefix. + sub[out++] = iter->prefix; + i += iter->nsub; + break; + default: + LOG(DFATAL) << "unknown round: " << round; + break; + } + // If we are done, copy until the end of sub. + if (++iter == splices.end()) { + while (i < nsub) + sub[out++] = sub[i++]; + } + } + splices.clear(); + nsub = out; + // Advance to the next round of factoring. + round++; + } -// It's too much of a pain to write this code with an explicit stack, -// so instead we let the caller specify a maximum depth and -// don't simplify beyond that. There are around 15 words of local -// variables and parameters in the frame, so allowing 8 levels -// on a 64-bit machine is still less than a kilobyte of stack and -// probably enough benefit for practical uses. -const int kFactorAlternationMaxDepth = 8; - -int Regexp::FactorAlternation( - Regexp** sub, int n, - Regexp::ParseFlags altflags) { - return FactorAlternationRecursive(sub, n, altflags, - kFactorAlternationMaxDepth); -} - -int Regexp::FactorAlternationRecursive( - Regexp** sub, int n, - Regexp::ParseFlags altflags, - int maxdepth) { + switch (round) { + case 1: + FactorAlternationImpl::Round1(sub, nsub, flags, &splices); + break; + case 2: + FactorAlternationImpl::Round2(sub, nsub, flags, &splices); + break; + case 3: + FactorAlternationImpl::Round3(sub, nsub, flags, &splices); + break; + case 4: + if (stk.size() == 1) { + // We are at the top of the stack. Just return. + return nsub; + } else { + // Pop the stack and set the number of suffixes. + // (Note that references will be invalidated!) + int nsuffix = nsub; + stk.pop_back(); + stk.back().spliceiter->nsuffix = nsuffix; + ++stk.back().spliceiter; + continue; + } + default: + LOG(DFATAL) << "unknown round: " << round; + break; + } - if (maxdepth <= 0) - return n; + // Set spliceiter depending on whether we have Splices to factor. + if (splices.empty() || round == 3) { + spliceiter = splices.end(); + } else { + spliceiter = splices.begin(); + } + } +} +void FactorAlternationImpl::Round1(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices) { // Round 1: Factor out common literal prefixes. - Rune *rune = NULL; + int start = 0; + Rune* rune = NULL; int nrune = 0; Regexp::ParseFlags runeflags = Regexp::NoParseFlags; - int start = 0; - int out = 0; - for (int i = 0; i <= n; i++) { - // Invariant: what was in sub[0:start] has been Decref'ed - // and that space has been reused for sub[0:out] (out <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin - // with the string rune[0:nrune]. - + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // begin with rune[0:nrune]. Rune* rune_i = NULL; int nrune_i = 0; Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; - if (i < n) { - rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); + if (i < nsub) { + rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i); if (runeflags_i == runeflags) { int same = 0; while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) @@ -931,37 +1053,32 @@ int Regexp::FactorAlternationRecursive( } // Found end of a run with common leading literal string: - // sub[start:i] all begin with rune[0:nrune] but sub[i] - // does not even begin with rune[0]. - // - // Factor out common string and append factored expression to sub[0:out]. + // sub[start:i] all begin with rune[0:nrune], + // but sub[i] does not even begin with rune[0]. if (i == start) { // Nothing to do - first iteration. } else if (i == start+1) { // Just one: don't bother factoring. - sub[out++] = sub[start]; } else { - // Construct factored form: prefix(suffix1|suffix2|...) - Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... - x[0] = LiteralString(rune, nrune, runeflags); + Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); for (int j = start; j < i; j++) - RemoveLeadingString(sub[j], nrune); - int nn = FactorAlternationRecursive(sub + start, i - start, altflags, - maxdepth - 1); - x[1] = AlternateNoFactor(sub + start, nn, altflags); - sub[out++] = Concat(x, 2, altflags); + Regexp::RemoveLeadingString(sub[j], nrune); + splices->emplace_back(prefix, sub + start, i - start); } - // Prepare for next round (if there is one). - if (i < n) { + // Prepare for next iteration (if there is one). + if (i < nsub) { start = i; rune = rune_i; nrune = nrune_i; runeflags = runeflags_i; } } - n = out; +} +void FactorAlternationImpl::Round2(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices) { // Round 2: Factor out common simple prefixes, // just the first piece of each concatenation. // This will be good enough a lot of the time. @@ -970,19 +1087,15 @@ int Regexp::FactorAlternationRecursive( // are not safe to factor because that collapses their // distinct paths through the automaton, which affects // correctness in some cases. - start = 0; - out = 0; + int start = 0; Regexp* first = NULL; - for (int i = 0; i <= n; i++) { - // Invariant: what was in sub[0:start] has been Decref'ed - // and that space has been reused for sub[0:out] (out <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin with first. - + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // begin with first. Regexp* first_i = NULL; - if (i < n) { - first_i = LeadingRegexp(sub[i]); - if (first != NULL && Regexp::Equal(first, first_i) && + if (i < nsub) { + first_i = Regexp::LeadingRegexp(sub[i]); + if (first != NULL && // first must be an empty-width op // OR a char class, any char or any byte // OR a fixed repeat of a literal, char class, any char or any byte. @@ -1000,63 +1113,61 @@ int Regexp::FactorAlternationRecursive( (first->sub()[0]->op() == kRegexpLiteral || first->sub()[0]->op() == kRegexpCharClass || first->sub()[0]->op() == kRegexpAnyChar || - first->sub()[0]->op() == kRegexpAnyByte)))) { + first->sub()[0]->op() == kRegexpAnyByte))) && + Regexp::Equal(first, first_i)) continue; - } } // Found end of a run with common leading regexp: - // sub[start:i] all begin with first but sub[i] does not. - // - // Factor out common regexp and append factored expression to sub[0:out]. + // sub[start:i] all begin with first, + // but sub[i] does not. if (i == start) { // Nothing to do - first iteration. } else if (i == start+1) { // Just one: don't bother factoring. - sub[out++] = sub[start]; } else { - // Construct factored form: prefix(suffix1|suffix2|...) - Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... - x[0] = first->Incref(); + Regexp* prefix = first->Incref(); for (int j = start; j < i; j++) - sub[j] = RemoveLeadingRegexp(sub[j]); - int nn = FactorAlternationRecursive(sub + start, i - start, altflags, - maxdepth - 1); - x[1] = AlternateNoFactor(sub + start, nn, altflags); - sub[out++] = Concat(x, 2, altflags); + sub[j] = Regexp::RemoveLeadingRegexp(sub[j]); + splices->emplace_back(prefix, sub + start, i - start); } - // Prepare for next round (if there is one). - if (i < n) { + // Prepare for next iteration (if there is one). + if (i < nsub) { start = i; first = first_i; } } - n = out; - - // Round 3: Collapse runs of single literals into character classes. - start = 0; - out = 0; - for (int i = 0; i <= n; i++) { - // Invariant: what was in sub[0:start] has been Decref'ed - // and that space has been reused for sub[0:out] (out <= start). - // - // Invariant: sub[start:i] consists of regexps that are either - // literal runes or character classes. +} - if (i < n && - (sub[i]->op() == kRegexpLiteral || - sub[i]->op() == kRegexpCharClass)) - continue; +void FactorAlternationImpl::Round3(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector<Splice>* splices) { + // Round 3: Merge runs of literals and/or character classes. + int start = 0; + Regexp* first = NULL; + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // are either literals (i.e. runes) or character classes. + Regexp* first_i = NULL; + if (i < nsub) { + first_i = sub[i]; + if (first != NULL && + (first->op() == kRegexpLiteral || + first->op() == kRegexpCharClass) && + (first_i->op() == kRegexpLiteral || + first_i->op() == kRegexpCharClass)) + continue; + } - // sub[i] is not a char or char class; - // emit char class for sub[start:i]... + // Found end of a run of Literal/CharClass: + // sub[start:i] all are either one or the other, + // but sub[i] is not. if (i == start) { - // Nothing to do. + // Nothing to do - first iteration. } else if (i == start+1) { - sub[out++] = sub[start]; + // Just one: don't bother factoring. } else { - // Make new char class. CharClassBuilder ccb; for (int j = start; j < i; j++) { Regexp* re = sub[j]; @@ -1072,31 +1183,16 @@ int Regexp::FactorAlternationRecursive( } re->Decref(); } - sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); + Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); + splices->emplace_back(re, sub + start, i - start); } - // ... and then emit sub[i]. - if (i < n) - sub[out++] = sub[i]; - start = i+1; - } - n = out; - - // Round 4: Collapse runs of empty matches into single empty match. - start = 0; - out = 0; - for (int i = 0; i < n; i++) { - if (i + 1 < n && - sub[i]->op() == kRegexpEmptyMatch && - sub[i+1]->op() == kRegexpEmptyMatch) { - sub[i]->Decref(); - continue; + // Prepare for next iteration (if there is one). + if (i < nsub) { + start = i; + first = first_i; } - sub[out++] = sub[i]; } - n = out; - - return n; } // Collapse the regexps on top of the stack, down to the @@ -1294,8 +1390,7 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { // fullrune() takes int, not size_t. However, it just looks // at the leading byte and treats any length >= 4 the same. - if (fullrune(sp->data(), static_cast<int>(std::min(static_cast<size_t>(4), - sp->size())))) { + if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) { int n = chartorune(r, sp->data()); // Some copies of chartorune have a bug that accepts // encodings of values in (10FFFF, 1FFFFF] as valid. diff --git a/re2/regexp.h b/re2/regexp.h index fcc7c0f..2ca96cd 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -494,8 +494,7 @@ class Regexp { // Simplifies an alternation of literal strings by factoring out // common prefixes. static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); - static int FactorAlternationRecursive(Regexp** sub, int nsub, - ParseFlags flags, int maxdepth); + friend class FactorAlternationImpl; // Is a == b? Only efficient on regexps that have not been through // Simplify yet - the expansion of a kRegexpRepeat will make this diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc index 1db7769..7f0feea 100644 --- a/re2/testing/dfa_test.cc +++ b/re2/testing/dfa_test.cc @@ -79,7 +79,7 @@ TEST(SingleThreaded, BuildEntireDFA) { CHECK(re); for (int i = 17; i < 24; i++) { - int64_t limit = 1<<i; + int64_t limit = int64_t{1}<<i; int64_t usage; //int64_t progusage, dfamem; { @@ -120,7 +120,7 @@ static string DeBruijnString(int n) { CHECK_LT(n, static_cast<int>(8*sizeof(int))); CHECK_GT(n, 0); - std::vector<bool> did(1<<n); + std::vector<bool> did(size_t{1}<<n); for (int i = 0; i < 1<<n; i++) did[i] = false; diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index 1e6929d..b039e13 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -53,7 +53,7 @@ static Test tests[] = { { "a{2,3}?", "nrep{2,3 lit{a}}" }, { "a{2,}?", "nrep{2,-1 lit{a}}" }, { "", "emp{}" }, - { "|", "emp{}" }, // alt{emp{}emp{}} but got factored + { "|", "alt{emp{}emp{}}" }, { "|x|", "alt{emp{}lit{x}emp{}}" }, { ".", "dot{}" }, { "^", "bol{}" }, @@ -336,6 +336,16 @@ Test prefix_tests[] = { "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" }, { "^/a/bc|^/a/de", "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" }, + // In the past, factoring was limited to kFactorAlternationMaxDepth (8). + { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa", + "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" + "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" + "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" + "lit{a}}}}}}}}}}}}}}}}}}}" }, + { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones", + "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}" + "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}" + "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" }, }; // Test that prefix factoring works. diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc index 6bdb2af..43b9656 100644 --- a/re2/testing/simplify_test.cc +++ b/re2/testing/simplify_test.cc @@ -125,7 +125,7 @@ static Test tests[] = { // explicit (?:) in place of non-parenthesized empty strings, // to make them easier to spot for other parsers. { "(a|b|)", "([a-b]|(?:))" }, - { "(|)", "()" }, + { "(|)", "((?:)|(?:))" }, { "a()", "a()" }, { "(()|())", "(()|())" }, { "(a|)", "(a|(?:))" }, |