diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2022-02-08 14:56:22 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2022-02-08 14:56:22 +0900 |
commit | df68d8515fb94cafc61deb036ca88f915dbc1f0b (patch) | |
tree | 88de880fcd4072b47fd0f15a3fbddfde5c5360a0 | |
parent | cb86a7988f59860dfc815152d8823293140167b3 (diff) | |
download | re2-df68d8515fb94cafc61deb036ca88f915dbc1f0b.tar.gz re2-df68d8515fb94cafc61deb036ca88f915dbc1f0b.tar.bz2 re2-df68d8515fb94cafc61deb036ca88f915dbc1f0b.zip |
Imported Upstream version 20210801upstream/20210801
-rw-r--r-- | .github/workflows/ci-bazel.yml | 2 | ||||
-rw-r--r-- | .github/workflows/ci-cmake.yml | 2 | ||||
-rw-r--r-- | .github/workflows/ci.yml | 2 | ||||
-rw-r--r-- | SECURITY.md | 4 | ||||
-rw-r--r-- | WORKSPACE | 4 | ||||
-rw-r--r-- | re2/compile.cc | 8 | ||||
-rw-r--r-- | re2/dfa.cc | 8 | ||||
-rw-r--r-- | re2/fuzzing/re2_fuzzer.cc | 5 | ||||
-rw-r--r-- | re2/parse.cc | 18 | ||||
-rw-r--r-- | re2/prog.cc | 186 | ||||
-rw-r--r-- | re2/prog.h | 37 | ||||
-rw-r--r-- | re2/regexp.h | 4 | ||||
-rw-r--r-- | re2/testing/dfa_test.cc | 8 | ||||
-rw-r--r-- | re2/testing/regexp_benchmark.cc | 19 | ||||
-rw-r--r-- | re2/testing/required_prefix_test.cc | 76 |
15 files changed, 326 insertions, 57 deletions
diff --git a/.github/workflows/ci-bazel.yml b/.github/workflows/ci-bazel.yml index 534da03..681034d 100644 --- a/.github/workflows/ci-bazel.yml +++ b/.github/workflows/ci-bazel.yml @@ -1,7 +1,7 @@ name: CI (Bazel) on: push: - branches: [master] + branches: [main] jobs: build: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/ci-cmake.yml b/.github/workflows/ci-cmake.yml index 568312e..585c386 100644 --- a/.github/workflows/ci-cmake.yml +++ b/.github/workflows/ci-cmake.yml @@ -1,7 +1,7 @@ name: CI (CMake) on: push: - branches: [master] + branches: [main] jobs: build: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09d1687..474cf65 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,7 +1,7 @@ name: CI on: push: - branches: [master] + branches: [main] jobs: build: runs-on: ${{ matrix.os }} diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..39ba0e9 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,4 @@ +To report a security issue, please use https://g.co/vulnz. We use +https://g.co/vulnz for our intake, and do coordination and disclosure here on +GitHub (including using GitHub Security Advisory). The Google Security Team will +respond within 5 working days of your report on https://g.co/vulnz. @@ -10,6 +10,6 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "rules_cc", - strip_prefix = "rules_cc-master", - urls = ["https://github.com/bazelbuild/rules_cc/archive/master.zip"], + strip_prefix = "rules_cc-main", + urls = ["https://github.com/bazelbuild/rules_cc/archive/main.zip"], ) diff --git a/re2/compile.cc b/re2/compile.cc index f3624b9..61d801a 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -1176,12 +1176,8 @@ Prog* Compiler::Finish(Regexp* re) { if (!prog_->reversed()) { std::string prefix; bool prefix_foldcase; - if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) && - !prefix_foldcase) { - prog_->prefix_size_ = prefix.size(); - prog_->prefix_front_ = prefix.front(); - prog_->prefix_back_ = prefix.back(); - } + if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) + prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); } // Record remaining memory for DFA. @@ -56,6 +56,10 @@ namespace re2 { // Controls whether the DFA should bail out early if the NFA would be faster. static bool dfa_should_bail_when_slow = true; +void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) { + dfa_should_bail_when_slow = b; +} + // Changing this to true compiles in prints that trace execution of the DFA. // Generates a lot of output -- only useful for debugging. static const bool ExtraDebug = false; @@ -1966,10 +1970,6 @@ int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { return GetDFA(kind)->BuildAllStates(cb); } -void Prog::TEST_dfa_should_bail_when_slow(bool b) { - dfa_should_bail_when_slow = b; -} - // Computes min and max for matching string. // Won't return strings bigger than maxlen. bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index 8306f88..af1129f 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -12,6 +12,7 @@ #include "re2/prefilter.h" #include "re2/re2.h" +#include "re2/regexp.h" using re2::StringPiece; @@ -50,6 +51,10 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, if (backslash_p > 1) return; + // The default is 1000. Even 100 turned out to be too generous + // for fuzzing, empirically speaking, so let's try 10 instead. + re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10); + RE2 re(pattern, options); if (!re.ok()) return; diff --git a/re2/parse.cc b/re2/parse.cc index 3bba613..87ff2ca 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -44,12 +44,12 @@ namespace re2 { -// Reduce the maximum repeat count by an order of magnitude when fuzzing. -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -static const int kMaxRepeat = 100; -#else -static const int kMaxRepeat = 1000; -#endif +// Controls the maximum repeat count permitted by the parser. +static int maximum_repeat_count = 1000; + +void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { + maximum_repeat_count = i; +} // Regular expression parse state. // The list of parsed regexps so far is maintained as a vector of @@ -568,7 +568,9 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { bool Regexp::ParseState::PushRepetition(int min, int max, const StringPiece& s, bool nongreedy) { - if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) { + if ((max != -1 && max < min) || + min > maximum_repeat_count || + max > maximum_repeat_count) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -591,7 +593,7 @@ bool Regexp::ParseState::PushRepetition(int min, int max, stacktop_ = re; if (min >= 2 || max >= 2) { RepetitionWalker w; - if (w.Walk(stacktop_, kMaxRepeat) == 0) { + if (w.Walk(stacktop_, maximum_repeat_count) == 0) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; diff --git a/re2/prog.cc b/re2/prog.cc index ac9c085..396b46c 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -115,9 +115,8 @@ Prog::Prog() start_unanchored_(0), size_(0), bytemap_range_(0), + prefix_foldcase_(false), prefix_size_(0), - prefix_front_(-1), - prefix_back_(-1), list_count_(0), dfa_mem_(0), dfa_first_(NULL), @@ -127,6 +126,8 @@ Prog::Prog() Prog::~Prog() { DeleteDFA(dfa_longest_); DeleteDFA(dfa_first_); + if (prefix_foldcase_) + delete[] prefix_dfa_; } typedef SparseSet Workq; @@ -916,6 +917,183 @@ void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { } } +// The final state will always be this, which frees up a register for the hot +// loop and thus avoids the spilling that can occur when building with Clang. +static const size_t kShiftDFAFinal = 9; + +// This function takes the prefix as std::string (i.e. not const std::string& +// as normal) because it's going to clobber it, so a temporary is convenient. +static uint64_t* BuildShiftDFA(std::string prefix) { + // This constant is for convenience now and also for correctness later when + // we clobber the prefix, but still need to know how long it was initially. + const size_t size = prefix.size(); + + // Construct the NFA. + // The table is indexed by input byte; each element is a bitfield of states + // reachable by the input byte. Given a bitfield of the current states, the + // bitfield of states reachable from those is - for this specific purpose - + // always ((ncurr << 1) | 1). Intersecting the reachability bitfields gives + // the bitfield of the next states reached by stepping over the input byte. + // Credits for this technique: the Hyperscan paper by Geoff Langdale et al. + uint16_t nfa[256]{}; + for (size_t i = 0; i < size; ++i) { + uint8_t b = prefix[i]; + nfa[b] |= 1 << (i+1); + } + // This is the `\C*?` for unanchored search. + for (int b = 0; b < 256; ++b) + nfa[b] |= 1; + + // This maps from DFA state to NFA states; the reverse mapping is used when + // recording transitions and gets implemented with plain old linear search. + // The "Shift DFA" technique limits this to ten states when using uint64_t; + // to allow for the initial state, we use at most nine bytes of the prefix. + // That same limit is also why uint16_t is sufficient for the NFA bitfield. + uint16_t states[kShiftDFAFinal+1]{}; + states[0] = 1; + for (size_t dcurr = 0; dcurr < size; ++dcurr) { + uint8_t b = prefix[dcurr]; + uint16_t ncurr = states[dcurr]; + uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); + size_t dnext = dcurr+1; + if (dnext == size) + dnext = kShiftDFAFinal; + states[dnext] = nnext; + } + + // Sort and unique the bytes of the prefix to avoid repeating work while we + // record transitions. This clobbers the prefix, but it's no longer needed. + std::sort(prefix.begin(), prefix.end()); + prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end()); + + // Construct the DFA. + // The table is indexed by input byte; each element is effectively a packed + // array of uint6_t; each array value will be multiplied by six in order to + // avoid having to do so later in the hot loop as well as masking/shifting. + // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen. + uint64_t* dfa = new uint64_t[256]{}; + // Record a transition from each state for each of the bytes of the prefix. + // Note that all other input bytes go back to the initial state by default. + for (size_t dcurr = 0; dcurr < size; ++dcurr) { + for (uint8_t b : prefix) { + uint16_t ncurr = states[dcurr]; + uint16_t nnext = nfa[b] & ((ncurr << 1) | 1); + size_t dnext = 0; + while (states[dnext] != nnext) + ++dnext; + dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6); + // Convert ASCII letters to uppercase and record the extra transitions. + // Note that ASCII letters are guaranteed to be lowercase at this point + // because that's how the parser normalises them. #FunFact: 'k' and 's' + // match U+212A and U+017F, respectively, so they won't occur here when + // using UTF-8 encoding because the parser will emit character classes. + if ('a' <= b && b <= 'z') { + b -= 'a' - 'A'; + dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6); + } + } + } + // This lets the final state "saturate", which will matter for performance: + // in the hot loop, we check for a match only at the end of each iteration, + // so we must keep signalling the match until we get around to checking it. + for (int b = 0; b < 256; ++b) + dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6); + + return dfa; +} + +void Prog::ConfigurePrefixAccel(const std::string& prefix, + bool prefix_foldcase) { + prefix_foldcase_ = prefix_foldcase; + prefix_size_ = prefix.size(); + if (prefix_foldcase_) { + // Use PrefixAccel_ShiftDFA(). + // ... and no more than nine bytes of the prefix. (See above for details.) + prefix_size_ = std::min(prefix_size_, kShiftDFAFinal); + prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_)); + } else if (prefix_size_ != 1) { + // Use PrefixAccel_FrontAndBack(). + prefix_front_ = prefix.front(); + prefix_back_ = prefix.back(); + } else { + // Use memchr(3). + prefix_front_ = prefix.front(); + } +} + +const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) { + if (size < prefix_size_) + return NULL; + + uint64_t curr = 0; + + // At the time of writing, rough benchmarks on a Broadwell machine showed + // that this unroll factor (i.e. eight) achieves a speedup factor of two. + if (size >= 8) { + const uint8_t* p = reinterpret_cast<const uint8_t*>(data); + const uint8_t* endp = p + (size&~7); + do { + uint8_t b0 = p[0]; + uint8_t b1 = p[1]; + uint8_t b2 = p[2]; + uint8_t b3 = p[3]; + uint8_t b4 = p[4]; + uint8_t b5 = p[5]; + uint8_t b6 = p[6]; + uint8_t b7 = p[7]; + + uint64_t next0 = prefix_dfa_[b0]; + uint64_t next1 = prefix_dfa_[b1]; + uint64_t next2 = prefix_dfa_[b2]; + uint64_t next3 = prefix_dfa_[b3]; + uint64_t next4 = prefix_dfa_[b4]; + uint64_t next5 = prefix_dfa_[b5]; + uint64_t next6 = prefix_dfa_[b6]; + uint64_t next7 = prefix_dfa_[b7]; + + uint64_t curr0 = next0 >> (curr & 63); + uint64_t curr1 = next1 >> (curr0 & 63); + uint64_t curr2 = next2 >> (curr1 & 63); + uint64_t curr3 = next3 >> (curr2 & 63); + uint64_t curr4 = next4 >> (curr3 & 63); + uint64_t curr5 = next5 >> (curr4 & 63); + uint64_t curr6 = next6 >> (curr5 & 63); + uint64_t curr7 = next7 >> (curr6 & 63); + + if ((curr7 & 63) == kShiftDFAFinal * 6) { + // At the time of writing, using the same masking subexpressions from + // the preceding lines caused Clang to clutter the hot loop computing + // them - even though they aren't actually needed for shifting! Hence + // these rewritten conditions, which achieve a speedup factor of two. + if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_; + if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_; + if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_; + if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_; + if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_; + if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_; + if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_; + if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_; + } + + curr = curr7; + p += 8; + } while (p != endp); + data = p; + size = size&7; + } + + const uint8_t* p = reinterpret_cast<const uint8_t*>(data); + const uint8_t* endp = p + size; + while (p != endp) { + uint8_t b = *p++; + uint64_t next = prefix_dfa_[b]; + curr = next >> (curr & 63); + if ((curr & 63) == kShiftDFAFinal * 6) + return p-prefix_size_; + } + return NULL; +} + #if defined(__AVX2__) // Finds the least significant non-zero bit in n. static int FindLSBSet(uint32_t n) { @@ -958,7 +1136,7 @@ const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { const __m256i* endfp = fp + size/sizeof(__m256i); const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); - while (fp != endfp) { + do { const __m256i f_loadu = _mm256_loadu_si256(fp++); const __m256i b_loadu = _mm256_loadu_si256(bp++); const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); @@ -970,7 +1148,7 @@ const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { const int fb_ctz = FindLSBSet(fb_movemask); return reinterpret_cast<const char*>(fp-1) + fb_ctz; } - } + } while (fp != endfp); data = fp; size = size%sizeof(__m256i); } @@ -220,11 +220,23 @@ class Prog { // Accelerates to the first likely occurrence of the prefix. // Returns a pointer to the first byte or NULL if not found. const void* PrefixAccel(const void* data, size_t size) { - DCHECK_GE(prefix_size_, 1); - return prefix_size_ == 1 ? memchr(data, prefix_front_, size) - : PrefixAccel_FrontAndBack(data, size); + DCHECK(can_prefix_accel()); + if (prefix_foldcase_) { + return PrefixAccel_ShiftDFA(data, size); + } else if (prefix_size_ != 1) { + return PrefixAccel_FrontAndBack(data, size); + } else { + return memchr(data, prefix_front_, size); + } } + // Configures prefix accel using the analysis performed during compilation. + void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase); + + // An implementation of prefix accel that uses prefix_dfa_ to perform + // case-insensitive search. + const void* PrefixAccel_ShiftDFA(const void* data, size_t size); + // An implementation of prefix accel that looks for prefix_front_ and // prefix_back_ to return fewer false positives than memchr(3) alone. const void* PrefixAccel_FrontAndBack(const void* data, size_t size); @@ -298,10 +310,6 @@ class Prog { // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); - // Controls whether the DFA should bail out early if the NFA would be faster. - // FOR TESTING ONLY. - static void TEST_dfa_should_bail_when_slow(bool b); - // Compute bytemap. void ComputeByteMap(); @@ -390,6 +398,10 @@ class Prog { // Computes hints for ByteRange instructions in [begin, end). void ComputeHints(std::vector<Inst>* flat, int begin, int end); + // Controls whether the DFA should bail out early if the NFA would be faster. + // FOR TESTING ONLY. + static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b); + private: friend class Compiler; @@ -406,9 +418,16 @@ class Prog { int start_unanchored_; // unanchored entry point for program int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ + + bool prefix_foldcase_; // whether prefix is case-insensitive size_t prefix_size_; // size of prefix (0 if no prefix) - int prefix_front_; // first byte of prefix (-1 if no prefix) - int prefix_back_; // last byte of prefix (-1 if no prefix) + union { + uint64_t* prefix_dfa_; // "Shift DFA" for prefix + struct { + int prefix_front_; // first byte of prefix + int prefix_back_; // last byte of prefix + }; + }; int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode diff --git a/re2/regexp.h b/re2/regexp.h index 61882b5..2f40642 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -449,6 +449,10 @@ class Regexp { // regardless of the return value. bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); + // Controls the maximum repeat count permitted by the parser. + // FOR FUZZING ONLY. + static void FUZZING_ONLY_set_maximum_repeat_count(int i); + private: // Constructor allocates vectors as appropriate for operator. explicit Regexp(RegexpOp op, ParseFlags parse_flags); diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc index 9e15a41..842daaf 100644 --- a/re2/testing/dfa_test.cc +++ b/re2/testing/dfa_test.cc @@ -143,7 +143,7 @@ TEST(SingleThreaded, SearchDFA) { // NFA implementation instead. (The DFA loses its speed advantage // if it can't get a good cache hit rate.) // Tell the DFA to trudge along instead. - Prog::TEST_dfa_should_bail_when_slow(false); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false); state_cache_resets = 0; search_failures = 0; @@ -194,7 +194,7 @@ TEST(SingleThreaded, SearchDFA) { re->Decref(); // Reset to original behaviour. - Prog::TEST_dfa_should_bail_when_slow(true); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true); ASSERT_GT(state_cache_resets, 0); ASSERT_EQ(search_failures, 0); } @@ -218,7 +218,7 @@ static void DoSearch(Prog* prog, const StringPiece& match, } TEST(Multithreaded, SearchDFA) { - Prog::TEST_dfa_should_bail_when_slow(false); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false); state_cache_resets = 0; search_failures = 0; @@ -259,7 +259,7 @@ TEST(Multithreaded, SearchDFA) { re->Decref(); // Reset to original behaviour. - Prog::TEST_dfa_should_bail_when_slow(true); + Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true); ASSERT_GT(state_cache_resets, 0); ASSERT_EQ(search_failures, 0); } diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index acf6e88..3eeb098 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -181,10 +181,11 @@ void Search(benchmark::State& state, const char* regexp, SearchImpl* search) { state.SetBytesProcessed(state.iterations() * state.range(0)); } -// These two are easy because they start with an A, -// giving the search loop something to memchr for. +// These three are easy because they have prefixes, +// giving the search loop something to prefix accel. #define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" #define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" +#define EASY2 "(?i)" EASY0 // This is a little harder, since it starts with a character class // and thus can't be memchr'ed. Could look for ABC and work backward, @@ -228,6 +229,18 @@ BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs() #endif BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +void Search_Easy2_CachedDFA(benchmark::State& state) { Search(state, EASY2, SearchCachedDFA); } +void Search_Easy2_CachedNFA(benchmark::State& state) { Search(state, EASY2, SearchCachedNFA); } +void Search_Easy2_CachedPCRE(benchmark::State& state) { Search(state, EASY2, SearchCachedPCRE); } +void Search_Easy2_CachedRE2(benchmark::State& state) { Search(state, EASY2, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Easy2_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Easy2_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Easy2_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Easy2_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + void Search_Medium_CachedDFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedDFA); } void Search_Medium_CachedNFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedNFA); } void Search_Medium_CachedPCRE(benchmark::State& state) { Search(state, MEDIUM, SearchCachedPCRE); } @@ -953,6 +966,8 @@ Prog* GetCachedProg(const char* regexp) { CHECK(prog); cache[regexp] = prog; re->Decref(); + // We must call this here - while we have exclusive access. + prog->IsOnePass(); } return prog; } diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc index 023d242..60a11f8 100644 --- a/re2/testing/required_prefix_test.cc +++ b/re2/testing/required_prefix_test.cc @@ -131,23 +131,69 @@ TEST(RequiredPrefixForAccel, SimpleTests) { } } -TEST(PrefixAccel, BasicTest) { - Regexp* re = Regexp::Parse("abc\\d+", Regexp::LikePerl, NULL); +TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) { + Regexp* re; + std::string p; + bool f; + + // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'. + re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL); ASSERT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(0); - ASSERT_TRUE(prog != NULL); - for (int i = 0; i < 100; i++) { - std::string text(i, 'a'); - const char* p = reinterpret_cast<const char*>( - prog->PrefixAccel(text.data(), text.size())); - EXPECT_TRUE(p == NULL); - text.append("abc"); - p = reinterpret_cast<const char*>( - prog->PrefixAccel(text.data(), text.size())); - EXPECT_EQ(i, p-text.data()); - } - delete prog; + ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); + ASSERT_EQ(p, "klm"); + ASSERT_EQ(f, true); + re->Decref(); + + re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)); + ASSERT_EQ(p, "stu"); + ASSERT_EQ(f, true); + re->Decref(); + + // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'. + // This is because they match U+212A and U+017F, respectively, and + // so the parser ends up emitting character classes, not literals. + re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); + re->Decref(); + + re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)); re->Decref(); } +static const char* prefix_accel_tests[] = { + "aababc\\d+", + "(?i)AABABC\\d+", +}; + +TEST(PrefixAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) { + const char* pattern = prefix_accel_tests[i]; + Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + ASSERT_TRUE(prog->can_prefix_accel()); + for (int j = 0; j < 100; j++) { + std::string text(j, 'a'); + const char* p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_TRUE(p == NULL); + text.append("aababc"); + for (int k = 0; k < 100; k++) { + text.append(k, 'a'); + p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_EQ(j, p - text.data()); + } + } + delete prog; + re->Decref(); + } +} + } // namespace re2 |