summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.travis.yml12
-rw-r--r--Makefile2
-rw-r--r--re2/bitmap256.h7
-rw-r--r--re2/bitstate.cc15
-rw-r--r--re2/compile.cc6
-rw-r--r--re2/dfa.cc30
-rw-r--r--re2/fuzzing/re2_fuzzer.cc4
-rw-r--r--re2/nfa.cc30
-rw-r--r--re2/prog.cc58
-rw-r--r--re2/prog.h13
-rw-r--r--re2/re2.cc51
-rw-r--r--re2/re2.h30
-rw-r--r--re2/testing/backtrack.cc3
-rw-r--r--re2/testing/re2_test.cc3
-rw-r--r--re2/testing/regexp_benchmark.cc2
15 files changed, 138 insertions, 128 deletions
diff --git a/.travis.yml b/.travis.yml
index d79541e..a3740ce 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -169,6 +169,18 @@ jobs:
- clang-9
env:
- MATRIX_EVAL="CC=clang-9 CXX=clang++-9"
+ - os: linux
+ dist: xenial
+ addons:
+ apt:
+ sources:
+ - ubuntu-toolchain-r-test
+ - sourceline: 'deb https://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main'
+ key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
+ packages:
+ - clang-10
+ env:
+ - MATRIX_EVAL="CC=clang-10 CXX=clang++-10"
- os: osx
osx_image: xcode7.3
diff --git a/Makefile b/Makefile
index fd9cec6..a550717 100644
--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,7 @@ endif
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
-SONAME=6
+SONAME=7
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
# access for Unicode data), uncomment the following line:
diff --git a/re2/bitmap256.h b/re2/bitmap256.h
index f649b4c..4899379 100644
--- a/re2/bitmap256.h
+++ b/re2/bitmap256.h
@@ -32,7 +32,7 @@ class Bitmap256 {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
- return (words_[c / 64] & (1ULL << (c % 64))) != 0;
+ return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0;
}
// Sets the bit with index c.
@@ -40,7 +40,7 @@ class Bitmap256 {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
- words_[c / 64] |= (1ULL << (c % 64));
+ words_[c / 64] |= (uint64_t{1} << (c % 64));
}
// Finds the next non-zero bit with index >= c.
@@ -51,7 +51,6 @@ class Bitmap256 {
// Finds the least significant non-zero bit in n.
static int FindLSBSet(uint64_t n) {
DCHECK_NE(n, 0);
-
#if defined(__GNUC__)
return __builtin_ctzll(n);
#elif defined(_MSC_VER) && defined(_M_X64)
@@ -89,7 +88,7 @@ int Bitmap256::FindNextSetBit(int c) const {
// Check the word that contains the bit. Mask out any lower bits.
int i = c / 64;
- uint64_t word = words_[i] & (~0ULL << (c % 64));
+ uint64_t word = words_[i] & (~uint64_t{0} << (c % 64));
if (word != 0)
return (i * 64) + FindLSBSet(word);
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index f07c5b9..865b58e 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -63,11 +63,14 @@ class BitState {
int nsubmatch_; // # of submatches to fill in
// Search state
- static const int VisitedBits = 32;
- PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited
+ static constexpr int kVisitedBits = 64;
+ PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
PODArray<const char*> cap_; // capture registers
PODArray<Job> job_; // stack of text positions to explore
int njob_; // stack size
+
+ BitState(const BitState&) = delete;
+ BitState& operator=(const BitState&) = delete;
};
BitState::BitState(Prog* prog)
@@ -87,9 +90,9 @@ BitState::BitState(Prog* prog)
bool BitState::ShouldVisit(int id, const char* p) {
int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
static_cast<int>(p-text_.data());
- if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
+ if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
return false;
- visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
+ visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
return true;
}
@@ -304,8 +307,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// Allocate scratch space.
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
- nvisited = (nvisited + VisitedBits-1) / VisitedBits;
- visited_ = PODArray<uint32_t>(nvisited);
+ nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
+ visited_ = PODArray<uint64_t>(nvisited);
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
int ncap = 2*nsubmatch;
diff --git a/re2/compile.cc b/re2/compile.cc
index 1b7b66c..b3e5bdc 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -1089,8 +1089,6 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
RE2::Anchor anchor) {
- prog_->set_flags(flags);
-
if (flags & Regexp::Latin1)
encoding_ = kEncodingLatin1;
max_mem_ = max_mem;
@@ -1111,14 +1109,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
// on the program.)
if (m >= 1<<24)
m = 1<<24;
-
// Inst imposes its own limit (currently bigger than 2^24 but be safe).
if (m > Prog::Inst::kMaxInst)
m = Prog::Inst::kMaxInst;
-
max_ninst_ = static_cast<int>(m);
}
-
anchor_ = anchor;
}
@@ -1191,6 +1186,7 @@ Prog* Compiler::Finish() {
prog_->Optimize();
prog_->Flatten();
prog_->ComputeByteMap();
+ prog_->ComputeFirstByte();
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
diff --git a/re2/dfa.cc b/re2/dfa.cc
index a269870..0934f5a 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -53,17 +53,6 @@
namespace re2 {
-#if !defined(__linux__) /* only Linux seems to have memrchr */
-static void* memrchr(const void* s, int c, size_t n) {
- const unsigned char* p = (const unsigned char*)s;
- for (p += n; n > 0; n--)
- if (*--p == c)
- return (void*)p;
-
- return NULL;
-}
-#endif
-
// Controls whether the DFA should bail out early if the NFA would be faster.
static bool dfa_should_bail_when_slow = true;
@@ -361,6 +350,9 @@ class DFA {
int64_t state_budget_; // Amount of memory remaining for new States.
StateSet state_cache_; // All States computed so far.
StartInfo start_[kMaxStart];
+
+ DFA(const DFA&) = delete;
+ DFA& operator=(const DFA&) = delete;
};
// Shorthand for casting to uint8_t*.
@@ -1388,22 +1380,14 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
if (ExtraDebug)
fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
- if (have_first_byte && s == start) {
+ if (run_forward && have_first_byte && s == start) {
// In start state, only way out is to find first_byte,
// so use optimized assembly in memchr to skip ahead.
// If first_byte isn't found, we can skip to the end
// of the string.
- if (run_forward) {
- if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) {
- p = ep;
- break;
- }
- } else {
- if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) {
- p = ep;
- break;
- }
- p++;
+ if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) {
+ p = ep;
+ break;
}
}
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index 12ad23a..8306f88 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -5,10 +5,10 @@
#include <fuzzer/FuzzedDataProvider.h>
#include <stddef.h>
#include <stdint.h>
-#include <map>
#include <memory>
#include <queue>
#include <string>
+#include <vector>
#include "re2/prefilter.h"
#include "re2/re2.h"
@@ -87,7 +87,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
// Don't waste time fuzzing high-fanout programs.
// They can cause bug reports due to fuzzer timeouts.
- std::map<int, int> histogram;
+ std::vector<int> histogram;
int fanout = re.ProgramFanout(&histogram);
if (fanout > 9)
return;
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 77fb5fb..75ca306 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -629,10 +629,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
return false;
}
-// Computes whether all successful matches have a common first byte,
-// and if so, returns that byte. If not, returns -1.
-int Prog::ComputeFirstByte() {
- int b = -1;
+void Prog::ComputeFirstByte() {
SparseSet q(size());
q.insert(start());
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
@@ -645,23 +642,27 @@ int Prog::ComputeFirstByte() {
case kInstMatch:
// The empty string matches: no first byte.
- return -1;
+ first_byte_ = -1;
+ return;
case kInstByteRange:
if (!ip->last())
q.insert(id+1);
- // Must match only a single byte
- if (ip->lo() != ip->hi())
- return -1;
- if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
- return -1;
+ // Must match only a single byte.
+ if (ip->lo() != ip->hi() ||
+ (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')) {
+ first_byte_ = -1;
+ return;
+ }
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
- if (b == -1)
- b = ip->lo();
- else if (b != ip->lo())
- return -1;
+ if (first_byte_ == -1) {
+ first_byte_ = ip->lo();
+ } else if (first_byte_ != ip->lo()) {
+ first_byte_ = -1;
+ return;
+ }
break;
case kInstNop:
@@ -687,7 +688,6 @@ int Prog::ComputeFirstByte() {
break;
}
}
- return b;
}
bool
diff --git a/re2/prog.cc b/re2/prog.cc
index cc35917..9244319 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -110,7 +110,6 @@ Prog::Prog()
size_(0),
bytemap_range_(0),
first_byte_(-1),
- flags_(0),
list_count_(0),
dfa_mem_(0),
dfa_first_(NULL),
@@ -185,14 +184,31 @@ std::string Prog::DumpByteMap() {
return map;
}
-int Prog::first_byte() {
- std::call_once(first_byte_once_, [](Prog* prog) {
- prog->first_byte_ = prog->ComputeFirstByte();
- }, this);
- return first_byte_;
-}
+// Is ip a guaranteed match at end of text, perhaps after some capturing?
+static bool IsMatch(Prog* prog, Prog::Inst* ip) {
+ for (;;) {
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
+ return false;
+
+ case kInstAlt:
+ case kInstAltMatch:
+ case kInstByteRange:
+ case kInstFail:
+ case kInstEmptyWidth:
+ return false;
-static bool IsMatch(Prog*, Prog::Inst*);
+ case kInstCapture:
+ case kInstNop:
+ ip = prog->inst(ip->out());
+ break;
+
+ case kInstMatch:
+ return true;
+ }
+ }
+}
// Peep-hole optimizer.
void Prog::Optimize() {
@@ -258,32 +274,6 @@ void Prog::Optimize() {
}
}
-// Is ip a guaranteed match at end of text, perhaps after some capturing?
-static bool IsMatch(Prog* prog, Prog::Inst* ip) {
- for (;;) {
- switch (ip->opcode()) {
- default:
- LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
- return false;
-
- case kInstAlt:
- case kInstAltMatch:
- case kInstByteRange:
- case kInstFail:
- case kInstEmptyWidth:
- return false;
-
- case kInstCapture:
- case kInstNop:
- ip = prog->inst(ip->out());
- break;
-
- case kInstMatch:
- return true;
- }
- }
-}
-
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
diff --git a/re2/prog.h b/re2/prog.h
index 40a6ce4..4306672 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -209,17 +209,13 @@ class Prog {
uint16_t* list_heads() { return list_heads_.data(); }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
- int flags() { return flags_; }
- void set_flags(int flags) { flags_ = flags; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
-
- // Lazily computed.
- int first_byte();
+ int first_byte() { return first_byte_; }
// Returns string representation of program for debugging.
std::string Dump();
@@ -297,9 +293,8 @@ class Prog {
// Compute bytemap.
void ComputeByteMap();
- // Computes whether all matches must begin with the same first
- // byte, and if so, returns that byte. If not, returns -1.
- int ComputeFirstByte();
+ // Computes whether all matches must begin with the same first byte.
+ void ComputeFirstByte();
// Run peep-hole optimizer on program.
void Optimize();
@@ -403,7 +398,6 @@ class Prog {
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int first_byte_; // required first byte for match, or -1 if none
- int flags_; // regexp parse flags
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
@@ -419,7 +413,6 @@ class Prog {
uint8_t bytemap_[256]; // map from input bytes to byte classes
- std::once_flag first_byte_once_;
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;
diff --git a/re2/re2.cc b/re2/re2.cc
index eb8ab3e..d231a21 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -12,6 +12,9 @@
#include <assert.h>
#include <ctype.h>
#include <errno.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -285,28 +288,54 @@ int RE2::ReverseProgramSize() const {
return prog->size();
}
-static int Fanout(Prog* prog, std::map<int, int>* histogram) {
+// Finds the most significant non-zero bit in n.
+static int FindMSBSet(uint32_t n) {
+ DCHECK_NE(n, 0);
+#if defined(__GNUC__)
+ return 31 ^ __builtin_clz(n);
+#elif defined(_MSC_VER)
+ unsigned long c;
+ _BitScanReverse(&c, n);
+ return static_cast<int>(c);
+#else
+ int c = 0;
+ for (int shift = 1 << 4; shift != 0; shift >>= 1) {
+ uint32_t word = n >> shift;
+ if (word != 0) {
+ n = word;
+ c += shift;
+ }
+ }
+ return c;
+#endif
+}
+
+static int Fanout(Prog* prog, std::vector<int>* histogram) {
SparseArray<int> fanout(prog->size());
prog->Fanout(&fanout);
- histogram->clear();
+ int data[32] = {};
+ int size = 0;
for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
- // TODO(junyer): Optimise this?
- int bucket = 0;
- while (1 << bucket < i->value()) {
- bucket++;
- }
- (*histogram)[bucket]++;
+ if (i->value() == 0)
+ continue;
+ uint32_t value = i->value();
+ int bucket = FindMSBSet(value);
+ bucket += value & (value-1) ? 1 : 0;
+ ++data[bucket];
+ size = std::max(size, bucket+1);
}
- return histogram->rbegin()->first;
+ if (histogram != NULL)
+ histogram->assign(data, data+size);
+ return size-1;
}
-int RE2::ProgramFanout(std::map<int, int>* histogram) const {
+int RE2::ProgramFanout(std::vector<int>* histogram) const {
if (prog_ == NULL)
return -1;
return Fanout(prog_, histogram);
}
-int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
+int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
if (prog_ == NULL)
return -1;
Prog* prog = ReverseProg();
diff --git a/re2/re2.h b/re2/re2.h
index 44fb5af..32b2718 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -30,6 +30,16 @@
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
+// The double backslashes are needed when writing C++ string literals.
+// However, they should NOT be used when writing C++11 raw string literals:
+//
+// R"(hello (\w+) world)" -- \w matches a "word" character
+// R"(version (\d+))" -- \d matches a digit
+// R"(hello\s+world)" -- \s matches any whitespace character
+// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary
+// R"((?i)hello)" -- (?i) turns on case-insensitive matching
+// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible
+//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
@@ -195,6 +205,7 @@
#include <map>
#include <mutex>
#include <string>
+#include <vector>
#if defined(__APPLE__)
#include <TargetConditionals.h>
@@ -291,11 +302,11 @@ class RE2 {
int ProgramSize() const;
int ReverseProgramSize() const;
- // EXPERIMENTAL! SUBJECT TO CHANGE!
- // Outputs the program fanout as a histogram bucketed by powers of 2.
+ // If histogram is not null, outputs the program fanout
+ // as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
- int ProgramFanout(std::map<int, int>* histogram) const;
- int ReverseProgramFanout(std::map<int, int>* histogram) const;
+ int ProgramFanout(std::vector<int>* histogram) const;
+ int ReverseProgramFanout(std::vector<int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
@@ -630,17 +641,6 @@ class RE2 {
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
- // Legacy interface to encoding.
- // TODO(rsc): Remove once clients have been converted.
- bool utf8() const { return encoding_ == EncodingUTF8; }
- void set_utf8(bool b) {
- if (b) {
- encoding_ = EncodingUTF8;
- } else {
- encoding_ = EncodingLatin1;
- }
- }
-
bool posix_syntax() const { return posix_syntax_; }
void set_posix_syntax(bool b) { posix_syntax_ = b; }
diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc
index 1e888da..3bab1e7 100644
--- a/re2/testing/backtrack.cc
+++ b/re2/testing/backtrack.cc
@@ -82,6 +82,9 @@ class Backtracker {
const char* cap_[64]; // capture registers
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
size_t nvisited_; // # of words in bitmap
+
+ Backtracker(const Backtracker&) = delete;
+ Backtracker& operator=(const Backtracker&) = delete;
};
Backtracker::Backtracker(Prog* prog)
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 4dba3c6..52e1f6f 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -12,6 +12,7 @@
#include <map>
#include <string>
#include <utility>
+#include <vector>
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
#include <sys/mman.h>
#include <unistd.h> /* for sysconf */
@@ -473,7 +474,7 @@ TEST(ProgramFanout, BigProgram) {
RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)");
RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");
- std::map<int, int> histogram;
+ std::vector<int> histogram;
// 3 is the largest non-empty bucket and has 1 element.
ASSERT_EQ(3, re1.ProgramFanout(&histogram));
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index c53ade0..d85784f 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -944,7 +944,7 @@ void SearchCachedDFA(benchmark::State& state, const char* regexp,
bool expect_match) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
- Prog* prog = re->CompileToProg(1LL<<31);
+ Prog* prog = re->CompileToProg(int64_t{1}<<31);
CHECK(prog);
for (auto _ : state) {
bool failed = false;