diff options
-rw-r--r-- | .travis.yml | 12 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | re2/bitmap256.h | 7 | ||||
-rw-r--r-- | re2/bitstate.cc | 15 | ||||
-rw-r--r-- | re2/compile.cc | 6 | ||||
-rw-r--r-- | re2/dfa.cc | 30 | ||||
-rw-r--r-- | re2/fuzzing/re2_fuzzer.cc | 4 | ||||
-rw-r--r-- | re2/nfa.cc | 30 | ||||
-rw-r--r-- | re2/prog.cc | 58 | ||||
-rw-r--r-- | re2/prog.h | 13 | ||||
-rw-r--r-- | re2/re2.cc | 51 | ||||
-rw-r--r-- | re2/re2.h | 30 | ||||
-rw-r--r-- | re2/testing/backtrack.cc | 3 | ||||
-rw-r--r-- | re2/testing/re2_test.cc | 3 | ||||
-rw-r--r-- | re2/testing/regexp_benchmark.cc | 2 |
15 files changed, 138 insertions, 128 deletions
diff --git a/.travis.yml b/.travis.yml index d79541e..a3740ce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -169,6 +169,18 @@ jobs: - clang-9 env: - MATRIX_EVAL="CC=clang-9 CXX=clang++-9" + - os: linux + dist: xenial + addons: + apt: + sources: + - ubuntu-toolchain-r-test + - sourceline: 'deb https://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main' + key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' + packages: + - clang-10 + env: + - MATRIX_EVAL="CC=clang-10 CXX=clang++-10" - os: osx osx_image: xcode7.3 @@ -44,7 +44,7 @@ endif # ABI version # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html -SONAME=6 +SONAME=7 # To rebuild the Tables generated by Perl and Python scripts (requires Internet # access for Unicode data), uncomment the following line: diff --git a/re2/bitmap256.h b/re2/bitmap256.h index f649b4c..4899379 100644 --- a/re2/bitmap256.h +++ b/re2/bitmap256.h @@ -32,7 +32,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - return (words_[c / 64] & (1ULL << (c % 64))) != 0; + return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0; } // Sets the bit with index c. @@ -40,7 +40,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - words_[c / 64] |= (1ULL << (c % 64)); + words_[c / 64] |= (uint64_t{1} << (c % 64)); } // Finds the next non-zero bit with index >= c. @@ -51,7 +51,6 @@ class Bitmap256 { // Finds the least significant non-zero bit in n. static int FindLSBSet(uint64_t n) { DCHECK_NE(n, 0); - #if defined(__GNUC__) return __builtin_ctzll(n); #elif defined(_MSC_VER) && defined(_M_X64) @@ -89,7 +88,7 @@ int Bitmap256::FindNextSetBit(int c) const { // Check the word that contains the bit. Mask out any lower bits. int i = c / 64; - uint64_t word = words_[i] & (~0ULL << (c % 64)); + uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); if (word != 0) return (i * 64) + FindLSBSet(word); diff --git a/re2/bitstate.cc b/re2/bitstate.cc index f07c5b9..865b58e 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -63,11 +63,14 @@ class BitState { int nsubmatch_; // # of submatches to fill in // Search state - static const int VisitedBits = 32; - PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited + static constexpr int kVisitedBits = 64; + PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited PODArray<const char*> cap_; // capture registers PODArray<Job> job_; // stack of text positions to explore int njob_; // stack size + + BitState(const BitState&) = delete; + BitState& operator=(const BitState&) = delete; }; BitState::BitState(Prog* prog) @@ -87,9 +90,9 @@ BitState::BitState(Prog* prog) bool BitState::ShouldVisit(int id, const char* p) { int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) + static_cast<int>(p-text_.data()); - if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) + if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1)))) return false; - visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); + visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1)); return true; } @@ -304,8 +307,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // Allocate scratch space. int nvisited = prog_->list_count() * static_cast<int>(text.size()+1); - nvisited = (nvisited + VisitedBits-1) / VisitedBits; - visited_ = PODArray<uint32_t>(nvisited); + nvisited = (nvisited + kVisitedBits-1) / kVisitedBits; + visited_ = PODArray<uint64_t>(nvisited); memset(visited_.data(), 0, nvisited*sizeof visited_[0]); int ncap = 2*nsubmatch; diff --git a/re2/compile.cc b/re2/compile.cc index 1b7b66c..b3e5bdc 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -1089,8 +1089,6 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor) { - prog_->set_flags(flags); - if (flags & Regexp::Latin1) encoding_ = kEncodingLatin1; max_mem_ = max_mem; @@ -1111,14 +1109,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, // on the program.) if (m >= 1<<24) m = 1<<24; - // Inst imposes its own limit (currently bigger than 2^24 but be safe). if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_ninst_ = static_cast<int>(m); } - anchor_ = anchor; } @@ -1191,6 +1186,7 @@ Prog* Compiler::Finish() { prog_->Optimize(); prog_->Flatten(); prog_->ComputeByteMap(); + prog_->ComputeFirstByte(); // Record remaining memory for DFA. if (max_mem_ <= 0) { @@ -53,17 +53,6 @@ namespace re2 { -#if !defined(__linux__) /* only Linux seems to have memrchr */ -static void* memrchr(const void* s, int c, size_t n) { - const unsigned char* p = (const unsigned char*)s; - for (p += n; n > 0; n--) - if (*--p == c) - return (void*)p; - - return NULL; -} -#endif - // Controls whether the DFA should bail out early if the NFA would be faster. static bool dfa_should_bail_when_slow = true; @@ -361,6 +350,9 @@ class DFA { int64_t state_budget_; // Amount of memory remaining for new States. StateSet state_cache_; // All States computed so far. StartInfo start_[kMaxStart]; + + DFA(const DFA&) = delete; + DFA& operator=(const DFA&) = delete; }; // Shorthand for casting to uint8_t*. @@ -1388,22 +1380,14 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (ExtraDebug) fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); - if (have_first_byte && s == start) { + if (run_forward && have_first_byte && s == start) { // In start state, only way out is to find first_byte, // so use optimized assembly in memchr to skip ahead. // If first_byte isn't found, we can skip to the end // of the string. - if (run_forward) { - if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { - p = ep; - break; - } - } else { - if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) { - p = ep; - break; - } - p++; + if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { + p = ep; + break; } } diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index 12ad23a..8306f88 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -5,10 +5,10 @@ #include <fuzzer/FuzzedDataProvider.h> #include <stddef.h> #include <stdint.h> -#include <map> #include <memory> #include <queue> #include <string> +#include <vector> #include "re2/prefilter.h" #include "re2/re2.h" @@ -87,7 +87,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, // Don't waste time fuzzing high-fanout programs. // They can cause bug reports due to fuzzer timeouts. - std::map<int, int> histogram; + std::vector<int> histogram; int fanout = re.ProgramFanout(&histogram); if (fanout > 9) return; @@ -629,10 +629,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, return false; } -// Computes whether all successful matches have a common first byte, -// and if so, returns that byte. If not, returns -1. -int Prog::ComputeFirstByte() { - int b = -1; +void Prog::ComputeFirstByte() { SparseSet q(size()); q.insert(start()); for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { @@ -645,23 +642,27 @@ int Prog::ComputeFirstByte() { case kInstMatch: // The empty string matches: no first byte. - return -1; + first_byte_ = -1; + return; case kInstByteRange: if (!ip->last()) q.insert(id+1); - // Must match only a single byte - if (ip->lo() != ip->hi()) - return -1; - if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') - return -1; + // Must match only a single byte. + if (ip->lo() != ip->hi() || + (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')) { + first_byte_ = -1; + return; + } // If we haven't seen any bytes yet, record it; // otherwise must match the one we saw before. - if (b == -1) - b = ip->lo(); - else if (b != ip->lo()) - return -1; + if (first_byte_ == -1) { + first_byte_ = ip->lo(); + } else if (first_byte_ != ip->lo()) { + first_byte_ = -1; + return; + } break; case kInstNop: @@ -687,7 +688,6 @@ int Prog::ComputeFirstByte() { break; } } - return b; } bool diff --git a/re2/prog.cc b/re2/prog.cc index cc35917..9244319 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -110,7 +110,6 @@ Prog::Prog() size_(0), bytemap_range_(0), first_byte_(-1), - flags_(0), list_count_(0), dfa_mem_(0), dfa_first_(NULL), @@ -185,14 +184,31 @@ std::string Prog::DumpByteMap() { return map; } -int Prog::first_byte() { - std::call_once(first_byte_once_, [](Prog* prog) { - prog->first_byte_ = prog->ComputeFirstByte(); - }, this); - return first_byte_; -} +// Is ip a guaranteed match at end of text, perhaps after some capturing? +static bool IsMatch(Prog* prog, Prog::Inst* ip) { + for (;;) { + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); + return false; + + case kInstAlt: + case kInstAltMatch: + case kInstByteRange: + case kInstFail: + case kInstEmptyWidth: + return false; -static bool IsMatch(Prog*, Prog::Inst*); + case kInstCapture: + case kInstNop: + ip = prog->inst(ip->out()); + break; + + case kInstMatch: + return true; + } + } +} // Peep-hole optimizer. void Prog::Optimize() { @@ -258,32 +274,6 @@ void Prog::Optimize() { } } -// Is ip a guaranteed match at end of text, perhaps after some capturing? -static bool IsMatch(Prog* prog, Prog::Inst* ip) { - for (;;) { - switch (ip->opcode()) { - default: - LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); - return false; - - case kInstAlt: - case kInstAltMatch: - case kInstByteRange: - case kInstFail: - case kInstEmptyWidth: - return false; - - case kInstCapture: - case kInstNop: - ip = prog->inst(ip->out()); - break; - - case kInstMatch: - return true; - } - } -} - uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { int flags = 0; @@ -209,17 +209,13 @@ class Prog { uint16_t* list_heads() { return list_heads_.data(); } void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } int64_t dfa_mem() { return dfa_mem_; } - int flags() { return flags_; } - void set_flags(int flags) { flags_ = flags; } bool anchor_start() { return anchor_start_; } void set_anchor_start(bool b) { anchor_start_ = b; } bool anchor_end() { return anchor_end_; } void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } - - // Lazily computed. - int first_byte(); + int first_byte() { return first_byte_; } // Returns string representation of program for debugging. std::string Dump(); @@ -297,9 +293,8 @@ class Prog { // Compute bytemap. void ComputeByteMap(); - // Computes whether all matches must begin with the same first - // byte, and if so, returns that byte. If not, returns -1. - int ComputeFirstByte(); + // Computes whether all matches must begin with the same first byte. + void ComputeFirstByte(); // Run peep-hole optimizer on program. void Optimize(); @@ -403,7 +398,6 @@ class Prog { int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ int first_byte_; // required first byte for match, or -1 if none - int flags_; // regexp parse flags int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode @@ -419,7 +413,6 @@ class Prog { uint8_t bytemap_[256]; // map from input bytes to byte classes - std::once_flag first_byte_once_; std::once_flag dfa_first_once_; std::once_flag dfa_longest_once_; @@ -12,6 +12,9 @@ #include <assert.h> #include <ctype.h> #include <errno.h> +#ifdef _MSC_VER +#include <intrin.h> +#endif #include <stdint.h> #include <stdlib.h> #include <string.h> @@ -285,28 +288,54 @@ int RE2::ReverseProgramSize() const { return prog->size(); } -static int Fanout(Prog* prog, std::map<int, int>* histogram) { +// Finds the most significant non-zero bit in n. +static int FindMSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return 31 ^ __builtin_clz(n); +#elif defined(_MSC_VER) + unsigned long c; + _BitScanReverse(&c, n); + return static_cast<int>(c); +#else + int c = 0; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n >> shift; + if (word != 0) { + n = word; + c += shift; + } + } + return c; +#endif +} + +static int Fanout(Prog* prog, std::vector<int>* histogram) { SparseArray<int> fanout(prog->size()); prog->Fanout(&fanout); - histogram->clear(); + int data[32] = {}; + int size = 0; for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { - // TODO(junyer): Optimise this? - int bucket = 0; - while (1 << bucket < i->value()) { - bucket++; - } - (*histogram)[bucket]++; + if (i->value() == 0) + continue; + uint32_t value = i->value(); + int bucket = FindMSBSet(value); + bucket += value & (value-1) ? 1 : 0; + ++data[bucket]; + size = std::max(size, bucket+1); } - return histogram->rbegin()->first; + if (histogram != NULL) + histogram->assign(data, data+size); + return size-1; } -int RE2::ProgramFanout(std::map<int, int>* histogram) const { +int RE2::ProgramFanout(std::vector<int>* histogram) const { if (prog_ == NULL) return -1; return Fanout(prog_, histogram); } -int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const { +int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { if (prog_ == NULL) return -1; Prog* prog = ReverseProg(); @@ -30,6 +30,16 @@ // "(?i)hello" -- (?i) turns on case-insensitive matching // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible // +// The double backslashes are needed when writing C++ string literals. +// However, they should NOT be used when writing C++11 raw string literals: +// +// R"(hello (\w+) world)" -- \w matches a "word" character +// R"(version (\d+))" -- \d matches a digit +// R"(hello\s+world)" -- \s matches any whitespace character +// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary +// R"((?i)hello)" -- (?i) turns on case-insensitive matching +// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible +// // ----------------------------------------------------------------------- // MATCHING INTERFACE: // @@ -195,6 +205,7 @@ #include <map> #include <mutex> #include <string> +#include <vector> #if defined(__APPLE__) #include <TargetConditionals.h> @@ -291,11 +302,11 @@ class RE2 { int ProgramSize() const; int ReverseProgramSize() const; - // EXPERIMENTAL! SUBJECT TO CHANGE! - // Outputs the program fanout as a histogram bucketed by powers of 2. + // If histogram is not null, outputs the program fanout + // as a histogram bucketed by powers of 2. // Returns the number of the largest non-empty bucket. - int ProgramFanout(std::map<int, int>* histogram) const; - int ReverseProgramFanout(std::map<int, int>* histogram) const; + int ProgramFanout(std::vector<int>* histogram) const; + int ReverseProgramFanout(std::vector<int>* histogram) const; // Returns the underlying Regexp; not for general use. // Returns entire_regexp_ so that callers don't need @@ -630,17 +641,6 @@ class RE2 { Encoding encoding() const { return encoding_; } void set_encoding(Encoding encoding) { encoding_ = encoding; } - // Legacy interface to encoding. - // TODO(rsc): Remove once clients have been converted. - bool utf8() const { return encoding_ == EncodingUTF8; } - void set_utf8(bool b) { - if (b) { - encoding_ = EncodingUTF8; - } else { - encoding_ = EncodingLatin1; - } - } - bool posix_syntax() const { return posix_syntax_; } void set_posix_syntax(bool b) { posix_syntax_ = b; } diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index 1e888da..3bab1e7 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -82,6 +82,9 @@ class Backtracker { const char* cap_[64]; // capture registers uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked size_t nvisited_; // # of words in bitmap + + Backtracker(const Backtracker&) = delete; + Backtracker& operator=(const Backtracker&) = delete; }; Backtracker::Backtracker(Prog* prog) diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 4dba3c6..52e1f6f 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -12,6 +12,7 @@ #include <map> #include <string> #include <utility> +#include <vector> #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) #include <sys/mman.h> #include <unistd.h> /* for sysconf */ @@ -473,7 +474,7 @@ TEST(ProgramFanout, BigProgram) { RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); - std::map<int, int> histogram; + std::vector<int> histogram; // 3 is the largest non-empty bucket and has 1 element. ASSERT_EQ(3, re1.ProgramFanout(&histogram)); diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index c53ade0..d85784f 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -944,7 +944,7 @@ void SearchCachedDFA(benchmark::State& state, const char* regexp, bool expect_match) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); - Prog* prog = re->CompileToProg(1LL<<31); + Prog* prog = re->CompileToProg(int64_t{1}<<31); CHECK(prog); for (auto _ : state) { bool failed = false; |