From 6c0bc52e1c8854d1a96810f0fe9849d83a7c409c Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Mon, 21 Nov 2016 16:57:37 +0900 Subject: Imported Upstream version 20160401 Change-Id: I05a22e9add0c0dc9c7cc5d28b8ceedd442fca31c Signed-off-by: DongHun Kwak --- BUILD | 50 ++++++++-- CMakeLists.txt | 3 + Makefile | 12 ++- re2/bitstate.cc | 83 +++++++++------- re2/compile.cc | 4 + re2/dfa.cc | 83 ++++++++-------- re2/nfa.cc | 65 ++++++++----- re2/onepass.cc | 50 +++++++--- re2/prefilter.cc | 9 +- re2/prefilter.h | 3 - re2/prog.cc | 191 +++++++++++++++++++++++++++++++++++- re2/prog.h | 50 +++++++--- re2/re2.cc | 79 +++++++-------- re2/re2.h | 43 ++++----- re2/regexp.cc | 29 +++--- re2/regexp.h | 12 +-- re2/set.cc | 5 +- re2/set.h | 2 +- re2/testing/backtrack.cc | 32 +++++-- re2/testing/compile_test.cc | 208 +++++++++++++++++++++------------------- re2/testing/re2_test.cc | 5 + re2/testing/regexp_benchmark.cc | 45 +++++++++ re2/testing/search_test.cc | 9 +- re2/testing/set_test.cc | 56 +++++++++-- util/mutex.h | 21 ---- util/util.h | 1 + 26 files changed, 758 insertions(+), 392 deletions(-) diff --git a/BUILD b/BUILD index 2636631..1e2d19c 100644 --- a/BUILD +++ b/BUILD @@ -6,6 +6,8 @@ licenses(["notice"]) +exports_files(["LICENSE"]) + cc_library( name = "re2", srcs = [ @@ -97,25 +99,61 @@ cc_library( load("re2_test", "re2_test") re2_test("charclass_test") + re2_test("compile_test") + re2_test("filtered_re2_test") + re2_test("mimics_pcre_test") + re2_test("parse_test") + re2_test("possible_match_test") + re2_test("re2_arg_test") + re2_test("re2_test") + re2_test("regexp_test") + re2_test("required_prefix_test") + re2_test("search_test") + re2_test("set_test") + re2_test("simplify_test") + re2_test("string_generator_test") -re2_test("dfa_test", size="large") -re2_test("exhaustive1_test", size="large") -re2_test("exhaustive2_test", size="large") -re2_test("exhaustive3_test", size="large") -re2_test("exhaustive_test", size="large") -re2_test("random_test", size="large") +re2_test( + "dfa_test", + size = "large", +) + +re2_test( + "exhaustive1_test", + size = "large", +) + +re2_test( + "exhaustive2_test", + size = "large", +) + +re2_test( + "exhaustive3_test", + size = "large", +) + +re2_test( + "exhaustive_test", + size = "large", +) + +re2_test( + "random_test", + size = "large", +) # TODO: Add support for regexp_benchmark. diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ed718f..66cfdc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") if(MSVC_VERSION LESS 1800) message(FATAL_ERROR "you need Visual Studio 2013 or later") endif() + if(BUILD_SHARED_LIBS) + message(FATAL_ERROR "building a DLL is not currently supported") + endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") add_compile_options(-std=c++11) endif() diff --git a/Makefile b/Makefile index 92fbfc3..4479033 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ ifeq ($(shell uname),Darwin) SOEXT=dylib SOEXTVER=$(SONAME).$(SOEXT) SOEXTVER00=$(SONAME).0.0.$(SOEXT) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,@rpath/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(LDFLAGS) else ifeq ($(shell uname),SunOS) SOEXT=so SOEXTVER=$(SOEXT).$(SONAME) @@ -275,11 +275,15 @@ install: obj/libre2.a obj/so/libre2.$(SOEXT) testinstall: @mkdir -p obj cp testinstall.cc obj -ifneq ($(shell uname),Darwin) - (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -static -o testinstall) +ifeq ($(shell uname),Darwin) + @echo Skipping test for libre2.a on Darwin. +else ifeq ($(shell uname),SunOS) + @echo Skipping test for libre2.a on SunOS. +else + (cd obj && $(CXX) $(CXXFLAGS) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -l:libre2.a -o testinstall) obj/testinstall endif - (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall) + (cd obj && $(CXX) $(CXXFLAGS) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -o testinstall) LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall benchlog: obj/test/regexp_benchmark diff --git a/re2/bitstate.cc b/re2/bitstate.cc index 0cfa480..661a8ca 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -141,6 +141,7 @@ void BitState::Push(int id, const char* p, int arg) { // Return whether it succeeded. bool BitState::TrySearch(int id0, const char* p0) { bool matched = false; + bool inaltmatch = false; const char* end = text_.end(); njob_ = 0; Push(id0, p0, 0); @@ -159,6 +160,14 @@ bool BitState::TrySearch(int id0, const char* p0) { // would have, but we avoid the stack // manipulation. if (0) { + Next: + // If the Match of a non-greedy AltMatch failed, + // we stop ourselves from trying the ByteRange, + // which would steer us off the short circuit. + if (prog_->inst(id)->last() || inaltmatch) + continue; + id++; + CheckAndLoop: if (!ShouldVisit(id, p)) continue; @@ -176,66 +185,63 @@ bool BitState::TrySearch(int id0, const char* p0) { case kInstFail: continue; - case kInstAlt: - // Cannot just - // Push(ip->out1(), p, 0); - // Push(ip->out(), p, 0); - // If, during the processing of ip->out(), we encounter - // ip->out1() via another path, we want to process it then. - // Pushing it here will inhibit that. Instead, re-push - // ip with arg==1 as a reminder to push ip->out1() later. + case kInstAltMatch: switch (arg) { case 0: + inaltmatch = true; Push(id, p, 1); // come back when we're done + + // One opcode is ByteRange; the other leads to Match + // (possibly via Nop or Capture). + if (ip->greedy(prog_)) { + // out1 is the match + Push(ip->out1(), p, 0); + id = ip->out1(); + p = end; + goto CheckAndLoop; + } + // out is the match - non-greedy + Push(ip->out(), end, 0); id = ip->out(); goto CheckAndLoop; case 1: - // Finished ip->out(); try ip->out1(). - arg = 0; - id = ip->out1(); - goto CheckAndLoop; + inaltmatch = false; + continue; } - LOG(DFATAL) << "Bad arg in kInstAlt: " << arg; + LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg; continue; - case kInstAltMatch: - // One opcode is byte range; the other leads to match. - if (ip->greedy(prog_)) { - // out1 is the match - Push(ip->out1(), p, 0); - id = ip->out1(); - p = end; - goto CheckAndLoop; - } - // out is the match - non-greedy - Push(ip->out(), end, 0); - id = ip->out(); - goto CheckAndLoop; - case kInstByteRange: { int c = -1; if (p < end) c = *p & 0xFF; - if (ip->Matches(c)) { - id = ip->out(); - p++; - goto CheckAndLoop; - } - continue; + if (!ip->Matches(c)) + goto Next; + + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done + id = ip->out(); + p++; + goto CheckAndLoop; } case kInstCapture: switch (arg) { case 0: + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done + if (0 <= ip->cap() && ip->cap() < ncap_) { // Capture p to register, but save old value. Push(id, cap_[ip->cap()], 1); // come back when we're done cap_[ip->cap()] = p; } + // Continue on. id = ip->out(); goto CheckAndLoop; + case 1: // Finished ip->out(); restore the old value. cap_[ip->cap()] = p; @@ -246,17 +252,22 @@ bool BitState::TrySearch(int id0, const char* p0) { case kInstEmptyWidth: if (ip->empty() & ~Prog::EmptyFlags(context_, p)) - continue; + goto Next; + + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done id = ip->out(); goto CheckAndLoop; case kInstNop: + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done id = ip->out(); goto CheckAndLoop; case kInstMatch: { if (endmatch_ && p != text_.end()) - continue; + goto Next; // VLOG(0) << "Found match."; // We found a match. If the caller doesn't care @@ -285,7 +296,7 @@ bool BitState::TrySearch(int id0, const char* p0) { return true; // Otherwise, continue on in hope of a longer match. - continue; + goto Next; } } } diff --git a/re2/compile.cc b/re2/compile.cc index 9882fef..df859ab 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -541,6 +541,9 @@ bool Compiler::IsCachedRuneByteSuffix(int id) { } void Compiler::AddSuffix(int id) { + if (failed_) + return; + if (rune_range_.begin == 0) { rune_range_.begin = id; return; @@ -1234,6 +1237,7 @@ Prog* Compiler::Finish() { prog_->ComputeByteMap(); prog_->Optimize(); + prog_->Flatten(); // Record remaining memory for DFA. if (max_mem_ <= 0) { diff --git a/re2/dfa.cc b/re2/dfa.cc index 7c526c8..ccb6065 100644 --- a/re2/dfa.cc +++ b/re2/dfa.cc @@ -216,7 +216,7 @@ class DFA { // Converts a State into a Workq: the opposite of WorkqToCachedState. // L >= mutex_ - static void StateToWorkq(State* s, Workq* q); + void StateToWorkq(State* s, Workq* q); // Runs a State on a given byte, returning the next state. State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ @@ -640,31 +640,17 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { return FullMatchState; } // Fall through. - case kInstByteRange: // These are useful. - case kInstEmptyWidth: - case kInstMatch: - case kInstAlt: // Not useful, but necessary [*] - inst[n++] = *it; + default: + // Record iff id is the head of its list, which must + // be the case if id-1 is the last of *its* list. :) + if (prog_->inst(id-1)->last()) + inst[n++] = *it; if (ip->opcode() == kInstEmptyWidth) needflags |= ip->empty(); if (ip->opcode() == kInstMatch && !prog_->anchor_end()) sawmatch = true; break; - - default: // The rest are not. - break; } - - // [*] kInstAlt would seem useless to record in a state, since - // we've already followed both its arrows and saved all the - // interesting states we can reach from there. The problem - // is that one of the empty-width instructions might lead - // back to the same kInstAlt (if an empty-width operator is starred), - // producing a different evaluation order depending on whether - // we keep the kInstAlt to begin with. Sigh. - // A specific case that this affects is /(^|a)+/ matching "a". - // If we don't save the kInstAlt, we will match the whole "a" (0,1) - // but in fact the correct leftmost-first match is the leading "" (0,0). } DCHECK_LE(n, q->size()); if (n > 0 && inst[n-1] == Mark) @@ -797,12 +783,12 @@ void DFA::StateToWorkq(State* s, Workq* q) { if (s->inst_[i] == Mark) q->mark(); else - q->insert_new(s->inst_[i]); + // Explore from the head of the list. + AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask); } } -// Adds ip to the work queue, following empty arrows according to flag -// and expanding kInstAlt instructions (two-target gotos). +// Adds ip to the work queue, following empty arrows according to flag. void DFA::AddToQueue(Workq* q, int id, uint flag) { // Use astack_ to hold our stack of states yet to process. @@ -828,9 +814,8 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { continue; // If ip is already on the queue, nothing to do. - // Otherwise add it. We don't actually keep all the ones - // that get added -- for example, kInstAlt is ignored - // when on a work queue -- but adding all ip's here + // Otherwise add it. We don't actually keep all the + // ones that get added, but adding all of them here // increases the likelihood of q->contains(id), // reducing the amount of duplicated work. if (q->contains(id)) @@ -840,34 +825,40 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { // Process instruction. Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { - case kInstFail: // can't happen: discarded above + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); break; case kInstByteRange: // just save these on the queue case kInstMatch: + if (!ip->last()) + stk[nstk++] = id+1; break; case kInstCapture: // DFA treats captures as no-ops. case kInstNop: + if (!ip->last()) + stk[nstk++] = id+1; + + // If this instruction is the [00-FF]* loop at the beginning of + // a leftmost-longest unanchored search, separate with a Mark so + // that future threads (which will start farther to the right in + // the input string) are lower priority than current threads. + if (ip->opcode() == kInstNop && q->maxmark() > 0 && + id == prog_->start_unanchored() && id != prog_->start()) + stk[nstk++] = Mark; stk[nstk++] = ip->out(); break; - case kInstAlt: // two choices: expand both, in order case kInstAltMatch: - // Want to visit out then out1, so push on stack in reverse order. - // This instruction is the [00-FF]* loop at the beginning of - // a leftmost-longest unanchored search, separate out from out1 - // with a Mark, so that out1's threads (which will start farther - // to the right in the string being searched) are lower priority - // than the current ones. - stk[nstk++] = ip->out1(); - if (q->maxmark() > 0 && - id == prog_->start_unanchored() && id != prog_->start()) - stk[nstk++] = Mark; - stk[nstk++] = ip->out(); + DCHECK(!ip->last()); + stk[nstk++] = id+1; break; case kInstEmptyWidth: + if (!ip->last()) + stk[nstk++] = id+1; + // Continue on if we have all the right flag bits. if (ip->empty() & ~flag) break; @@ -924,10 +915,13 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, int id = *i; Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + case kInstFail: // never succeeds case kInstCapture: // already followed case kInstNop: // already followed - case kInstAlt: // already followed case kInstAltMatch: // already followed case kInstEmptyWidth: // already followed break; @@ -1499,8 +1493,13 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, v->clear(); for (int i = 0; i < s->ninst_; i++) { Prog::Inst* ip = prog_->inst(s->inst_[i]); - if (ip->opcode() == kInstMatch) - v->push_back(ip->match_id()); + for (;;) { + if (ip->opcode() == kInstMatch) + v->push_back(ip->match_id()); + if (ip->last()) + break; + ip++; + } } } if (DebugDFA) diff --git a/re2/nfa.cc b/re2/nfa.cc index bc8996c..5232e5f 100644 --- a/re2/nfa.cc +++ b/re2/nfa.cc @@ -230,25 +230,28 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag, break; case kInstAltMatch: + DCHECK(!ip->last()); + stk[nstk++] = AddState(id+1); + // Save state; will pick up at next byte. t = AllocThread(); t->id = id; CopyCapture(t->capture, capture); *tp = t; - // fall through - - case kInstAlt: - // Explore alternatives. - stk[nstk++] = AddState(ip->out1()); - stk[nstk++] = AddState(ip->out()); break; case kInstNop: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + // Continue on. stk[nstk++] = AddState(ip->out()); break; case kInstCapture: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + if ((j=ip->cap()) < ncapture_) { // Push a dummy whose only job is to restore capture[j] // once we finish exploring this possibility. @@ -262,6 +265,9 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag, case kInstMatch: case kInstByteRange: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + // Save state; will pick up at next byte. t = AllocThread(); t->id = id; @@ -272,6 +278,9 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag, break; case kInstEmptyWidth: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + // Continue on if we have all the right flag bits. if (ip->empty() & ~flag) break; @@ -542,14 +551,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, match_[1] = p; matched_ = true; break; - - case kInstEmptyWidth: - if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { - LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty(); - break; - } - id = ip->out(); - continue; } break; } @@ -640,10 +641,16 @@ int NFA::ComputeFirstByte() { break; case kInstMatch: + if (!ip->last()) + q.insert(id+1); + // The empty string matches: no first byte. return -1; case kInstByteRange: + if (!ip->last()) + q.insert(id+1); + // Must match only a single byte if (ip->lo() != ip->hi()) return -1; @@ -660,6 +667,9 @@ int NFA::ComputeFirstByte() { case kInstNop: case kInstCapture: case kInstEmptyWidth: + if (!ip->last()) + q.insert(id+1); + // Continue on. // Ignore ip->empty() flags for kInstEmptyWidth // in order to be as conservative as possible @@ -668,13 +678,9 @@ int NFA::ComputeFirstByte() { q.insert(ip->out()); break; - case kInstAlt: case kInstAltMatch: - // Explore alternatives. - if (ip->out()) - q.insert(ip->out()); - if (ip->out1()) - q.insert(ip->out1()); + DCHECK(!ip->last()); + q.insert(id+1); break; case kInstFail: @@ -723,31 +729,42 @@ void Prog::Fanout(SparseArray* fanout) { reachable.clear(); reachable.insert(i->index()); for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { - Prog::Inst* ip = inst(*j); + int id = *j; + Prog::Inst* ip = inst(id); switch (ip->opcode()) { default: LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()"; break; case kInstByteRange: + if (!ip->last()) + reachable.insert(id+1); + (*count)++; if (!fanout->has_index(ip->out())) { fanout->set_new(ip->out(), 0); } break; - case kInstAlt: case kInstAltMatch: - reachable.insert(ip->out1()); - // fall through + DCHECK(!ip->last()); + reachable.insert(id+1); + break; case kInstCapture: case kInstEmptyWidth: case kInstNop: + if (!ip->last()) + reachable.insert(id+1); + reachable.insert(ip->out()); break; case kInstMatch: + if (!ip->last()) + reachable.insert(id+1); + break; + case kInstFail: break; } diff --git a/re2/onepass.cc b/re2/onepass.cc index 73acdc8..977ed96 100644 --- a/re2/onepass.cc +++ b/re2/onepass.cc @@ -427,21 +427,30 @@ bool Prog::IsOnePass() { Prog::Inst* ip = inst(id); uint32 cond = stack[nstack].cond; switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + case kInstAltMatch: // TODO(rsc): Ignoring kInstAltMatch optimization. // Should implement it in this engine, but it's subtle. - // Fall through. - case kInstAlt: + DCHECK(!ip->last()); // If already on work queue, (1) is violated: bail out. - if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1())) + if (!AddQ(&workq, id+1)) goto fail; - stack[nstack].id = ip->out1(); - stack[nstack++].cond = cond; - stack[nstack].id = ip->out(); + stack[nstack].id = id+1; stack[nstack++].cond = cond; break; case kInstByteRange: { + if (!ip->last()) { + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, id+1)) + goto fail; + stack[nstack].id = id+1; + stack[nstack++].cond = cond; + } + int nextindex = nodebyid[ip->out()]; if (nextindex == -1) { if (nalloc >= maxnodes) { @@ -501,16 +510,21 @@ bool Prog::IsOnePass() { } case kInstCapture: - if (ip->cap() < kMaxCap) - cond |= (1 << kCapShift) << ip->cap(); - goto QueueEmpty; - case kInstEmptyWidth: - cond |= ip->empty(); - goto QueueEmpty; - case kInstNop: - QueueEmpty: + if (!ip->last()) { + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, id+1)) + goto fail; + stack[nstack].id = id+1; + stack[nstack++].cond = cond; + } + + if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap) + cond |= (1 << kCapShift) << ip->cap(); + if (ip->opcode() == kInstEmptyWidth) + cond |= ip->empty(); + // kInstCapture and kInstNop always proceed to ip->out(). // kInstEmptyWidth only sometimes proceeds to ip->out(), // but as a conservative approximation we assume it always does. @@ -531,6 +545,14 @@ bool Prog::IsOnePass() { break; case kInstMatch: + if (!ip->last()) { + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, id+1)) + goto fail; + stack[nstack].id = id+1; + stack[nstack++].cond = cond; + } + if (matched) { // (3) is violated if (Debug) { diff --git a/re2/prefilter.cc b/re2/prefilter.cc index 45e43c9..f171eec 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -15,8 +15,6 @@ static const int Trace = false; typedef set::iterator SSIter; typedef set::const_iterator ConstSSIter; -GLOBAL_MUTEX(alloc_id_mutex); -static int alloc_id = 100000; // Used for debugging. // Initializes a Prefilter, allocating subs_ as necessary. Prefilter::Prefilter(Op op) { op_ = op; @@ -24,15 +22,12 @@ Prefilter::Prefilter(Op op) { if (op_ == AND || op_ == OR) subs_ = new vector; - GLOBAL_MUTEX_LOCK(alloc_id_mutex); - alloc_id_ = alloc_id++; - GLOBAL_MUTEX_UNLOCK(alloc_id_mutex); - VLOG(10) << "alloc_id: " << alloc_id_; + VLOG(10) << "constructed: " << reinterpret_cast(this); } // Destroys a Prefilter. Prefilter::~Prefilter() { - VLOG(10) << "Deleted: " << alloc_id_; + VLOG(10) << "destructing: " << reinterpret_cast(this); if (subs_) { for (size_t i = 0; i < subs_->size(); i++) delete (*subs_)[i]; diff --git a/re2/prefilter.h b/re2/prefilter.h index 2bc1676..531c189 100644 --- a/re2/prefilter.h +++ b/re2/prefilter.h @@ -94,9 +94,6 @@ class Prefilter { // and -1 for duplicate nodes. int unique_id_; - // Used for debugging, helps in tracking memory leaks. - int alloc_id_; - DISALLOW_COPY_AND_ASSIGN(Prefilter); }; diff --git a/re2/prog.cc b/re2/prog.cc index cb7e6e0..b5f48ef 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -6,7 +6,6 @@ // Tested by compile_test.cc #include "util/util.h" -#include "util/sparse_set.h" #include "re2/prog.h" #include "re2/stringpiece.h" @@ -94,6 +93,7 @@ Prog::Prog() : anchor_start_(false), anchor_end_(false), reversed_(false), + did_flatten_(false), did_onepass_(false), start_(0), start_unanchored_(0), @@ -131,7 +131,6 @@ static inline void AddToQueue(Workq* q, int id) { static string ProgToString(Prog* prog, Workq* q) { string s; - for (Workq::iterator i = q->begin(); i != q->end(); ++i) { int id = *i; Prog::Inst* ip = prog->inst(id); @@ -143,6 +142,18 @@ static string ProgToString(Prog* prog, Workq* q) { return s; } +static string FlattenedProgToString(Prog* prog, int start) { + string s; + for (int id = start; id < prog->size(); id++) { + Prog::Inst* ip = prog->inst(id); + if (ip->last()) + StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str()); + else + StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str()); + } + return s; +} + string Prog::Dump() { string map; if (false) { // Debugging @@ -155,12 +166,18 @@ string Prog::Dump() { StringAppendF(&map, "\n"); } + if (did_flatten_) + return map + FlattenedProgToString(this, start_); + Workq q(size_); AddToQueue(&q, start_); return map + ProgToString(this, &q); } string Prog::DumpUnanchored() { + if (did_flatten_) + return FlattenedProgToString(this, start_unanchored_); + Workq q(size_); AddToQueue(&q, start_unanchored_); return ProgToString(this, &q); @@ -337,5 +354,173 @@ void Prog::ComputeByteMap() { } } -} // namespace re2 +void Prog::Flatten() { + if (did_flatten_) + return; + did_flatten_ = true; + + // Scratch structures. It's important that these are reused by EmitList() + // because we call it in a loop and it would thrash the heap otherwise. + SparseSet q(size()); + vector stk; + stk.reserve(size()); + + // First pass: Marks "roots". + // Builds the mapping from inst-ids to root-ids. + SparseArray rootmap(size()); + MarkRoots(&rootmap, &q, &stk); + + // Second pass: Emits "lists". Remaps outs to root-ids. + // Builds the mapping from root-ids to flat-ids. + vector flatmap(rootmap.size()); + vector flat; + flat.reserve(size()); + for (SparseArray::const_iterator i = rootmap.begin(); + i != rootmap.end(); + ++i) { + flatmap[i->value()] = flat.size(); + EmitList(i->index(), &rootmap, &flat, &q, &stk); + flat.back().set_last(); + } + + // Third pass: Remaps outs to flat-ids. + for (int id = 0; id < static_cast(flat.size()); id++) { + Inst* ip = &flat[id]; + if (ip->opcode() != kInstAltMatch) // handled in EmitList() + ip->set_out(flatmap[ip->out()]); + } + + // Remap start_unanchored and start. + if (start_unanchored() == 0) { + DCHECK_EQ(start(), 0); + } else if (start_unanchored() == start()) { + set_start_unanchored(flatmap[1]); + set_start(flatmap[1]); + } else { + set_start_unanchored(flatmap[1]); + set_start(flatmap[2]); + } + + // Finally, replace the old instructions with the new instructions. + size_ = flat.size(); + delete[] inst_; + inst_ = new Inst[size_]; + memmove(inst_, flat.data(), size_ * sizeof *inst_); +} + +void Prog::MarkRoots(SparseArray* rootmap, + SparseSet* q, vector* stk) { + // Mark the kInstFail instruction. + rootmap->set_new(0, rootmap->size()); + + // Mark the start_unanchored and start instructions. + if (!rootmap->has_index(start_unanchored())) + rootmap->set_new(start_unanchored(), rootmap->size()); + if (!rootmap->has_index(start())) + rootmap->set_new(start(), rootmap->size()); + + q->clear(); + stk->clear(); + stk->push_back(start_unanchored()); + while (!stk->empty()) { + int id = stk->back(); + stk->pop_back(); + Loop: + if (q->contains(id)) + continue; + q->insert_new(id); + + Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + + case kInstAltMatch: + case kInstAlt: + stk->push_back(ip->out1()); + id = ip->out(); + goto Loop; + + case kInstByteRange: + case kInstCapture: + case kInstEmptyWidth: + // Mark the out of this instruction. + if (!rootmap->has_index(ip->out())) + rootmap->set_new(ip->out(), rootmap->size()); + id = ip->out(); + goto Loop; + + case kInstNop: + id = ip->out(); + goto Loop; + case kInstMatch: + case kInstFail: + break; + } + } +} + +void Prog::EmitList(int root, SparseArray* rootmap, vector* flat, + SparseSet* q, vector* stk) { + q->clear(); + stk->clear(); + stk->push_back(root); + while (!stk->empty()) { + int id = stk->back(); + stk->pop_back(); + Loop: + if (q->contains(id)) + continue; + q->insert_new(id); + + if (id != root && rootmap->has_index(id)) { + // We reached another "tree" via epsilon transition. Emit a kInstNop + // instruction so that the Prog does not become quadratically larger. + flat->emplace_back(); + flat->back().set_opcode(kInstNop); + flat->back().set_out(rootmap->get_existing(id)); + continue; + } + + Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + + case kInstAltMatch: + flat->emplace_back(); + flat->back().set_opcode(kInstAltMatch); + flat->back().set_out(flat->size()); + flat->back().out1_ = flat->size()+1; + // Fall through. + + case kInstAlt: + stk->push_back(ip->out1()); + id = ip->out(); + goto Loop; + + case kInstByteRange: + case kInstCapture: + case kInstEmptyWidth: + flat->emplace_back(); + memmove(&flat->back(), ip, sizeof *ip); + flat->back().set_out(rootmap->get_existing(ip->out())); + break; + + case kInstNop: + id = ip->out(); + goto Loop; + + case kInstMatch: + case kInstFail: + flat->emplace_back(); + memmove(&flat->back(), ip, sizeof *ip); + break; + } + } +} + +} // namespace re2 diff --git a/re2/prog.h b/re2/prog.h index 45e9f0b..4a4017c 100644 --- a/re2/prog.h +++ b/re2/prog.h @@ -11,6 +11,7 @@ #include "util/util.h" #include "util/sparse_array.h" +#include "util/sparse_set.h" #include "re2/re2.h" namespace re2 { @@ -84,7 +85,11 @@ class Prog { // Single instruction in regexp program. class Inst { public: - Inst() : out_opcode_(0), out1_(0) { } + Inst() : out_opcode_(0), out1_(0) {} + + // Copyable. + Inst(const Inst&) = default; + Inst& operator=(const Inst&) = default; // Constructors per opcode void InitAlt(uint32 out, uint32 out1); @@ -98,17 +103,21 @@ class Prog { // Getters int id(Prog* p) { return static_cast(this - p->inst_); } InstOp opcode() { return static_cast(out_opcode_&7); } - int out() { return out_opcode_>>3; } - int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } + int last() { return (out_opcode_>>3)&1; } + int out() { return out_opcode_>>4; } + int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } - bool greedy(Prog *p) { + + bool greedy(Prog* p) { DCHECK_EQ(opcode(), kInstAltMatch); - return p->inst(out())->opcode() == kInstByteRange; + return p->inst(out())->opcode() == kInstByteRange || + (p->inst(out())->opcode() == kInstNop && + p->inst(p->inst(out())->out())->opcode() == kInstByteRange); } // Does this inst (an kInstByteRange) match c? @@ -123,23 +132,27 @@ class Prog { string Dump(); // Maximum instruction id. - // (Must fit in out_opcode_, and PatchList steals another bit.) + // (Must fit in out_opcode_. PatchList/last steal another bit.) static const int kMaxInst = (1<<28) - 1; private: void set_opcode(InstOp opcode) { - out_opcode_ = (out()<<3) | opcode; + out_opcode_ = (out()<<4) | (last()<<3) | opcode; + } + + void set_last() { + out_opcode_ = (out()<<4) | (1<<3) | opcode(); } void set_out(int out) { - out_opcode_ = (out<<3) | opcode(); + out_opcode_ = (out<<4) | (last()<<3) | opcode(); } void set_out_opcode(int out, InstOp opcode) { - out_opcode_ = (out<<3) | opcode; + out_opcode_ = (out<<4) | (last()<<3) | opcode; } - uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode + uint32 out_opcode_; // 28 bits of out, 1 bit for last, 3 (low) bits opcode union { // additional instruction arguments: uint32 out1_; // opcode == kInstAlt // alternate next instruction @@ -167,8 +180,6 @@ class Prog { friend class Compiler; friend struct PatchList; friend class Prog; - - DISALLOW_COPY_AND_ASSIGN(Inst); }; // Whether to anchor the search. @@ -339,6 +350,20 @@ class Prog { static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, Regexp* re); + // Flattens the Prog from "tree" form to "list" form. This is an in-place + // operation in the sense that the old instructions are lost. + void Flatten(); + + // Marks the "roots" in the Prog: the outs of kInstByteRange, kInstCapture + // and kInstEmptyWidth instructions. + void MarkRoots(SparseArray* rootmap, + SparseSet* q, vector* stk); + + // Emits one "list" via "tree" traversal from the given "root" instruction. + // The new instructions are appended to the given vector. + void EmitList(int root, SparseArray* rootmap, vector* flat, + SparseSet* q, vector* stk); + private: friend class Compiler; @@ -347,6 +372,7 @@ class Prog { bool anchor_start_; // regexp has explicit start anchor bool anchor_end_; // regexp has explicit end anchor bool reversed_; // whether program runs backward over input + bool did_flatten_; // has Flatten been called? bool did_onepass_; // has IsOnePass been called? int start_; // entry point for program diff --git a/re2/re2.cc b/re2/re2.cc index b3e582f..492ffaf 100644 --- a/re2/re2.cc +++ b/re2/re2.cc @@ -52,22 +52,11 @@ RE2::Options::Options(RE2::CannedOptions opt) one_line_(false) { } -// static empty things for use as const references. -// To avoid global constructors, initialized on demand. -GLOBAL_MUTEX(empty_mutex); -static const string *empty_string; -static const map *empty_named_groups; -static const map *empty_group_names; - -static void InitEmpty() { - GLOBAL_MUTEX_LOCK(empty_mutex); - if (empty_string == NULL) { - empty_string = new string; - empty_named_groups = new map; - empty_group_names = new map; - } - GLOBAL_MUTEX_UNLOCK(empty_mutex); -} +// static empty objects for use as const references. +// To avoid global constructors, allocated in RE2::Init(). +static const string* empty_string; +static const map* empty_named_groups; +static const map* empty_group_names; // Converts from Regexp error code to RE2 error code. // Maybe some day they will diverge. In any event, this @@ -174,19 +163,24 @@ int RE2::Options::ParseFlags() const { } void RE2::Init(const StringPiece& pattern, const Options& options) { - mutex_ = new Mutex; + static std::once_flag empty_once; + std::call_once(empty_once, []() { + empty_string = new string; + empty_named_groups = new map; + empty_group_names = new map; + }); + pattern_ = pattern.as_string(); options_.Copy(options); - InitEmpty(); - error_ = empty_string; - error_code_ = NoError; - suffix_regexp_ = NULL; entire_regexp_ = NULL; + suffix_regexp_ = NULL; prog_ = NULL; rprog_ = NULL; + error_ = empty_string; + error_code_ = NoError; + num_captures_ = -1; named_groups_ = NULL; group_names_ = NULL; - num_captures_ = -1; RegexpStatus status; entire_regexp_ = Regexp::Parse( @@ -194,14 +188,13 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { static_cast(options_.ParseFlags()), &status); if (entire_regexp_ == NULL) { - if (error_ == empty_string) - error_ = new string(status.Text()); if (options_.log_errors()) { LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " << status.Text(); } - error_arg_ = status.error_arg().as_string(); + error_ = new string(status.Text()); error_code_ = RegexpErrorToRE2(status.code()); + error_arg_ = status.error_arg().as_string(); return; } @@ -235,17 +228,15 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { // Returns rprog_, computing it if needed. re2::Prog* RE2::ReverseProg() const { - MutexLock l(mutex_); - if (rprog_ == NULL && error_ == empty_string) { + std::call_once(rprog_once_, [this]() { rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); if (rprog_ == NULL) { if (options_.log_errors()) LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; error_ = new string("pattern too large - reverse compile failed"); error_code_ = RE2::ErrorPatternTooLarge; - return NULL; } - } + }); return rprog_; } @@ -254,7 +245,6 @@ RE2::~RE2() { suffix_regexp_->Decref(); if (entire_regexp_) entire_regexp_->Decref(); - delete mutex_; delete prog_; delete rprog_; if (error_ != empty_string) @@ -291,37 +281,32 @@ int RE2::ProgramFanout(map* histogram) const { // Returns num_captures_, computing it if needed, or -1 if the // regexp wasn't valid on construction. int RE2::NumberOfCapturingGroups() const { - MutexLock l(mutex_); - if (suffix_regexp_ == NULL) - return -1; - if (num_captures_ == -1) - num_captures_ = suffix_regexp_->NumCaptures(); + std::call_once(num_captures_once_, [this]() { + if (suffix_regexp_ != NULL) + num_captures_ = suffix_regexp_->NumCaptures(); + }); return num_captures_; } // Returns named_groups_, computing it if needed. const map& RE2::NamedCapturingGroups() const { - MutexLock l(mutex_); - if (!ok()) - return *empty_named_groups; - if (named_groups_ == NULL) { - named_groups_ = suffix_regexp_->NamedCaptures(); + std::call_once(named_groups_once_, [this]() { + if (suffix_regexp_ != NULL) + named_groups_ = suffix_regexp_->NamedCaptures(); if (named_groups_ == NULL) named_groups_ = empty_named_groups; - } + }); return *named_groups_; } // Returns group_names_, computing it if needed. const map& RE2::CapturingGroupNames() const { - MutexLock l(mutex_); - if (!ok()) - return *empty_group_names; - if (group_names_ == NULL) { - group_names_ = suffix_regexp_->CaptureNames(); + std::call_once(group_names_once_, [this]() { + if (suffix_regexp_ != NULL) + group_names_ = suffix_regexp_->CaptureNames(); if (group_names_ == NULL) group_names_ = empty_group_names; - } + }); return *group_names_; } diff --git a/re2/re2.h b/re2/re2.h index 5b955d0..4a8c5c8 100644 --- a/re2/re2.h +++ b/re2/re2.h @@ -181,6 +181,7 @@ #include #include +#include #include #include "re2/stringpiece.h" #include "re2/variadic_function.h" @@ -193,25 +194,9 @@ namespace re2 { using std::string; using std::map; -class Mutex; class Prog; class Regexp; -// The following enum should be used only as a constructor argument to indicate -// that the variable has static storage class, and that the constructor should -// do nothing to its state. It indicates to the reader that it is legal to -// declare a static instance of the class, provided the constructor is given -// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a -// static variable that has a constructor or a destructor because invocation -// order is undefined. However, IF the type can be initialized by filling with -// zeroes (which the loader does for static variables), AND the type's -// destructor does nothing to the storage, then a constructor for static -// initialization can be declared as -// explicit MyClass(LinkerInitialized x) {} -// and invoked as -// static MyClass my_variable_name(LINKER_INITIALIZED); -enum LinkerInitialized { LINKER_INITIALIZED }; - // Interface for regular expression matching. Also corresponds to a // pre-compiled regular expression. An "RE2" object is safe for // concurrent use by multiple threads. @@ -723,21 +708,21 @@ class RE2 { re2::Prog* ReverseProg() const; - mutable Mutex* mutex_; - string pattern_; // string regular expression - Options options_; // option flags + string pattern_; // string regular expression + Options options_; // option flags string prefix_; // required prefix (before regexp_) bool prefix_foldcase_; // prefix is ASCII case-insensitive re2::Regexp* entire_regexp_; // parsed regular expression re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed re2::Prog* prog_; // compiled program for regexp - mutable re2::Prog* rprog_; // reverse program for regexp - bool is_one_pass_; // can use prog_->SearchOnePass? - mutable const string* error_; // Error indicator - // (or points to empty string) - mutable ErrorCode error_code_; // Error code - mutable string error_arg_; // Fragment of regexp showing error - mutable int num_captures_; // Number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? + + mutable re2::Prog* rprog_; // reverse program for regexp + mutable const string* error_; // Error indicator + // (or points to empty string) + mutable ErrorCode error_code_; // Error code + mutable string error_arg_; // Fragment of regexp showing error + mutable int num_captures_; // Number of capturing groups // Map from capture names to indices mutable const map* named_groups_; @@ -745,6 +730,12 @@ class RE2 { // Map from capture indices to names mutable const map* group_names_; + // Onces for lazy computations. + mutable std::once_flag rprog_once_; + mutable std::once_flag num_captures_once_; + mutable std::once_flag named_groups_once_; + mutable std::once_flag group_names_once_; + //DISALLOW_COPY_AND_ASSIGN(RE2); RE2(const RE2&); void operator=(const RE2&); diff --git a/re2/regexp.cc b/re2/regexp.cc index 99e72e5..7b1b174 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -60,30 +60,29 @@ bool Regexp::QuickDestroy() { return false; } -static map *ref_map; -GLOBAL_MUTEX(ref_mutex); +// Lazily allocated. +static Mutex* ref_mutex; +static map* ref_map; int Regexp::Ref() { if (ref_ < kMaxRef) return ref_; - GLOBAL_MUTEX_LOCK(ref_mutex); - int r = 0; - if (ref_map != NULL) { - r = (*ref_map)[this]; - } - GLOBAL_MUTEX_UNLOCK(ref_mutex); - return r; + MutexLock l(ref_mutex); + return (*ref_map)[this]; } // Increments reference count, returns object as convenience. Regexp* Regexp::Incref() { if (ref_ >= kMaxRef-1) { - // Store ref count in overflow map. - GLOBAL_MUTEX_LOCK(ref_mutex); - if (ref_map == NULL) { + static std::once_flag ref_once; + std::call_once(ref_once, []() { + ref_mutex = new Mutex; ref_map = new map; - } + }); + + // Store ref count in overflow map. + MutexLock l(ref_mutex); if (ref_ == kMaxRef) { // already overflowed (*ref_map)[this]++; @@ -92,7 +91,6 @@ Regexp* Regexp::Incref() { (*ref_map)[this] = kMaxRef; ref_ = kMaxRef; } - GLOBAL_MUTEX_UNLOCK(ref_mutex); return this; } @@ -104,7 +102,7 @@ Regexp* Regexp::Incref() { void Regexp::Decref() { if (ref_ == kMaxRef) { // Ref count is stored in overflow map. - GLOBAL_MUTEX_LOCK(ref_mutex); + MutexLock l(ref_mutex); int r = (*ref_map)[this] - 1; if (r < kMaxRef) { ref_ = static_cast(r); @@ -112,7 +110,6 @@ void Regexp::Decref() { } else { (*ref_map)[this] = r; } - GLOBAL_MUTEX_UNLOCK(ref_mutex); return; } ref_--; diff --git a/re2/regexp.h b/re2/regexp.h index 5f222b7..54720eb 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -185,10 +185,10 @@ class RegexpStatus { RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} ~RegexpStatus() { delete tmp_; } - void set_code(enum RegexpStatusCode code) { code_ = code; } + void set_code(RegexpStatusCode code) { code_ = code; } void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } - enum RegexpStatusCode code() const { return code_; } + RegexpStatusCode code() const { return code_; } const StringPiece& error_arg() const { return error_arg_; } bool ok() const { return code() == kRegexpSuccess; } @@ -197,14 +197,14 @@ class RegexpStatus { // Returns text equivalent of code, e.g.: // "Bad character class" - static string CodeText(enum RegexpStatusCode code); + static string CodeText(RegexpStatusCode code); // Returns text describing error, e.g.: // "Bad character class: [z-a]" string Text() const; private: - enum RegexpStatusCode code_; // Kind of error + RegexpStatusCode code_; // Kind of error StringPiece error_arg_; // Piece of regexp containing syntax error. string* tmp_; // Temporary storage, possibly where error_arg_ is. @@ -314,7 +314,7 @@ class Regexp { RegexpOp op() { return static_cast(op_); } int nsub() { return nsub_; } bool simple() { return simple_ != 0; } - enum ParseFlags parse_flags() { return static_cast(parse_flags_); } + ParseFlags parse_flags() { return static_cast(parse_flags_); } int Ref(); // For testing. Regexp** sub() { @@ -628,8 +628,6 @@ inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) return static_cast(~static_cast(a)); } - - } // namespace re2 #endif // RE2_REGEXP_H__ diff --git a/re2/set.cc b/re2/set.cc index a1a84ba..0e626a2 100644 --- a/re2/set.cc +++ b/re2/set.cc @@ -96,7 +96,8 @@ bool RE2::Set::Match(const StringPiece& text, vector* v) const { LOG(DFATAL) << "RE2::Set::Match without Compile"; return false; } - v->clear(); + if (v != NULL) + v->clear(); bool failed; bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch, NULL, &failed, v); @@ -105,7 +106,7 @@ bool RE2::Set::Match(const StringPiece& text, vector* v) const { if (ret == false) return false; - if (v->size() == 0) { + if (v != NULL && v->empty()) { LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set"; return false; } diff --git a/re2/set.h b/re2/set.h index e48d8cb..5bcdb70 100644 --- a/re2/set.h +++ b/re2/set.h @@ -36,7 +36,7 @@ class RE2::Set { bool Compile(); // Match returns true if text matches any of the regexps in the set. - // If so, it fills v with the indices of the matching regexps. + // If so, it fills v (if not NULL) with the indices of the matching regexps. bool Match(const StringPiece& text, vector* v) const; private: diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index 2141c53..95c14c2 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -55,10 +55,14 @@ class Backtracker { StringPiece* submatch, int nsubmatch); private: - // Explores from instruction ip at string position p looking for a match. + // Explores from instruction id at string position p looking for a match. // Returns true if found (so that caller can stop trying other possibilities). bool Visit(int id, const char* p); + // Tries instruction id at string position p. + // Returns true if a match is found. + bool Try(int id, const char* p); + // Search parameters Prog* prog_; // program being run StringPiece text_; // text being searched @@ -143,7 +147,7 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, return false; } -// Explores from instruction ip at string position p looking for a match. +// Explores from instruction id at string position p looking for a match. // Return true if found (so that caller can stop trying other possibilities). bool Backtracker::Visit(int id, const char* p) { // Check bitmap. If we've already explored from here, @@ -156,6 +160,20 @@ bool Backtracker::Visit(int id, const char* p) { return false; visited_[n/32] |= 1 << (n&31); + Prog::Inst* ip = prog_->inst(id); + if (Try(id, p)) { + if (longest_ && !ip->last()) + Visit(id+1, p); + return true; + } + if (!ip->last()) + return Visit(id+1, p); + return false; +} + +// Tries instruction id at string position p. +// Returns true if a match is found. +bool Backtracker::Try(int id, const char* p) { // Pick out byte at current position. If at end of string, // have to explore in hope of finishing a match. Use impossible byte -1. int c = -1; @@ -168,15 +186,9 @@ bool Backtracker::Visit(int id, const char* p) { LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); return false; // not reached - case kInstAlt: case kInstAltMatch: - // Try both possible next states: out is preferred to out1. - if (Visit(ip->out(), p)) { - if (longest_) - Visit(ip->out1(), p); - return true; - } - return Visit(ip->out1(), p); + // Ignored. + return false; case kInstByteRange: if (ip->Matches(c)) diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index dee90a3..05e7ea8 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -27,82 +27,89 @@ struct Test { static Test tests[] = { { "a", - "1. byte [61-61] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4. match! 0\n" }, { "ab", - "1. byte [61-61] -> 2\n" - "2. byte [62-62] -> 3\n" - "3. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4. byte [62-62] -> 5\n" + "5. match! 0\n" }, { "a|c", - "3. alt -> 1 | 2\n" - "1. byte [61-61] -> 4\n" - "2. byte [63-63] -> 4\n" - "4. match! 0\n" }, + "3+ byte [61-61] -> 5\n" + "4. byte [63-63] -> 5\n" + "5. match! 0\n" }, { "a|b", - "1. byte [61-62] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-62] -> 4\n" + "4. match! 0\n" }, { "[ab]", - "1. byte [61-62] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-62] -> 4\n" + "4. match! 0\n" }, { "a+", - "1. byte [61-61] -> 2\n" - "2. alt -> 1 | 3\n" - "3. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4+ nop -> 3\n" + "5. match! 0\n" }, { "a+?", - "1. byte [61-61] -> 2\n" - "2. alt -> 3 | 1\n" - "3. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4+ match! 0\n" + "5. nop -> 3\n" }, { "a*", - "2. alt -> 1 | 3\n" - "1. byte [61-61] -> 2\n" - "3. match! 0\n" }, + "3+ byte [61-61] -> 3\n" + "4. match! 0\n" }, { "a*?", - "2. alt -> 3 | 1\n" - "3. match! 0\n" - "1. byte [61-61] -> 2\n" }, + "3+ match! 0\n" + "4. byte [61-61] -> 3\n" }, { "a?", - "2. alt -> 1 | 3\n" - "1. byte [61-61] -> 3\n" - "3. match! 0\n" }, + "3+ byte [61-61] -> 5\n" + "4. nop -> 5\n" + "5. match! 0\n" }, { "a??", - "2. alt -> 3 | 1\n" - "3. match! 0\n" - "1. byte [61-61] -> 3\n" }, + "3+ nop -> 5\n" + "4. byte [61-61] -> 5\n" + "5. match! 0\n" }, { "a{4}", - "1. byte [61-61] -> 2\n" - "2. byte [61-61] -> 3\n" "3. byte [61-61] -> 4\n" "4. byte [61-61] -> 5\n" - "5. match! 0\n" }, + "5. byte [61-61] -> 6\n" + "6. byte [61-61] -> 7\n" + "7. match! 0\n" }, { "(a)", - "2. capture 2 -> 1\n" - "1. byte [61-61] -> 3\n" - "3. capture 3 -> 4\n" - "4. match! 0\n" }, + "3. capture 2 -> 4\n" + "4. byte [61-61] -> 5\n" + "5. capture 3 -> 6\n" + "6. match! 0\n" }, { "(?:a)", - "1. byte [61-61] -> 2\n" - "2. match! 0\n" }, + "3. byte [61-61] -> 4\n" + "4. match! 0\n" }, { "", - "2. match! 0\n" }, + "3. match! 0\n" }, { ".", - "3. alt -> 1 | 2\n" - "1. byte [00-09] -> 4\n" - "2. byte [0b-ff] -> 4\n" - "4. match! 0\n" }, + "3+ byte [00-09] -> 5\n" + "4. byte [0b-ff] -> 5\n" + "5. match! 0\n" }, { "[^ab]", - "5. alt -> 3 | 4\n" - "3. alt -> 1 | 2\n" - "4. byte [63-ff] -> 6\n" - "1. byte [00-09] -> 6\n" - "2. byte [0b-60] -> 6\n" + "3+ byte [00-09] -> 6\n" + "4+ byte [0b-60] -> 6\n" + "5. byte [63-ff] -> 6\n" "6. match! 0\n" }, { "[Aa]", - "1. byte/i [61-61] -> 2\n" - "2. match! 0\n" }, + "3. byte/i [61-61] -> 4\n" + "4. match! 0\n" }, + { "\\C+", + "3. byte [00-ff] -> 4\n" + "4+ altmatch -> 5 | 6\n" + "5+ nop -> 3\n" + "6. match! 0\n" }, + { "\\C*", + "3+ altmatch -> 4 | 5\n" + "4+ byte [00-ff] -> 3\n" + "5. match! 0\n" }, + { "\\C?", + "3+ byte [00-ff] -> 5\n" + "4. nop -> 5\n" + "5. match! 0\n" }, // Issue 20992936 { "[[-`]", - "1. byte [5b-60] -> 2\n" - "2. match! 0\n" }, + "3. byte [5b-60] -> 4\n" + "4. match! 0\n" }, }; TEST(TestRegexpCompileToProg, Simple) { @@ -172,6 +179,18 @@ TEST(TestCompile, ByteRanges) { re->Decref(); } +TEST(TestCompile, InsufficientMemory) { + Regexp* re = Regexp::Parse( + "^(?P[^\\s]+)\\s+(?P[^\\s]+)\\s+(?P.+)$", + Regexp::LikePerl, NULL); + EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(920); + // If the memory budget has been exhausted, compilation should fail + // and return NULL instead of trying to do anything with NoMatch(). + EXPECT_TRUE(prog == NULL); + re->Decref(); +} + static void Dump(StringPiece pattern, Regexp::ParseFlags flags, string* forward, string* reverse) { Regexp* re = Regexp::Parse(pattern, flags, NULL); @@ -201,60 +220,51 @@ TEST(TestCompile, Bug26705922) { string forward, reverse; Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse); - EXPECT_EQ("4. byte [f0-f0] -> 3\n" - "3. byte [90-90] -> 2\n" - "2. byte [80-80] -> 6\n" - "6. alt -> 1 | 5\n" - "1. byte [80-80] -> 7\n" - "5. byte [90-90] -> 7\n" - "7. match! 0\n", + EXPECT_EQ("3. byte [f0-f0] -> 4\n" + "4. byte [90-90] -> 5\n" + "5. byte [80-80] -> 6\n" + "6+ byte [80-80] -> 8\n" + "7. byte [90-90] -> 8\n" + "8. match! 0\n", forward); - EXPECT_EQ("6. alt -> 4 | 5\n" - "4. byte [80-80] -> 3\n" - "5. byte [90-90] -> 3\n" - "3. byte [80-80] -> 2\n" - "2. byte [90-90] -> 1\n" - "1. byte [f0-f0] -> 7\n" - "7. match! 0\n", + EXPECT_EQ("3+ byte [80-80] -> 5\n" + "4. byte [90-90] -> 5\n" + "5. byte [80-80] -> 6\n" + "6. byte [90-90] -> 7\n" + "7. byte [f0-f0] -> 8\n" + "8. match! 0\n", reverse); Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse); - EXPECT_EQ("6. alt -> 3 | 5\n" - "3. byte [e8-ef] -> 2\n" - "5. byte [f0-f0] -> 4\n" - "2. byte [80-bf] -> 1\n" - "4. byte [90-90] -> 2\n" - "1. byte [80-bf] -> 7\n" - "7. match! 0\n", + EXPECT_EQ("3+ byte [e8-ef] -> 5\n" + "4. byte [f0-f0] -> 8\n" + "5. byte [80-bf] -> 6\n" + "6. byte [80-bf] -> 7\n" + "7. match! 0\n" + "8. byte [90-90] -> 5\n", forward); - EXPECT_EQ("3. byte [80-bf] -> 2\n" - "2. byte [80-bf] -> 6\n" - "6. alt -> 1 | 5\n" - "1. byte [e8-ef] -> 7\n" - "5. byte [90-90] -> 4\n" + EXPECT_EQ("3. byte [80-bf] -> 4\n" + "4. byte [80-bf] -> 5\n" + "5+ byte [e8-ef] -> 7\n" + "6. byte [90-90] -> 8\n" "7. match! 0\n" - "4. byte [f0-f0] -> 7\n", + "8. byte [f0-f0] -> 7\n", reverse); Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse); - EXPECT_EQ("2. byte [80-bf] -> 8\n" - "8. alt -> 5 | 7\n" - "5. alt -> 1 | 4\n" - "7. byte [80-bf] -> 17\n" - "1. byte [c2-df] -> 18\n" - "4. byte [a0-bf] -> 3\n" - "17. alt -> 14 | 16\n" - "18. match! 0\n" - "3. byte [e0-e0] -> 18\n" - "14. alt -> 11 | 13\n" - "16. byte [80-8f] -> 15\n" - "11. alt -> 6 | 10\n" - "13. byte [80-bf] -> 12\n" - "15. byte [f4-f4] -> 18\n" - "6. byte [e1-ef] -> 18\n" - "10. byte [90-bf] -> 9\n" - "12. byte [f1-f3] -> 18\n" - "9. byte [f0-f0] -> 18\n", + EXPECT_EQ("3. byte [80-bf] -> 4\n" + "4+ byte [c2-df] -> 7\n" + "5+ byte [a0-bf] -> 8\n" + "6. byte [80-bf] -> 9\n" + "7. match! 0\n" + "8. byte [e0-e0] -> 7\n" + "9+ byte [e1-ef] -> 7\n" + "10+ byte [90-bf] -> 13\n" + "11+ byte [80-bf] -> 14\n" + "12. byte [80-8f] -> 15\n" + "13. byte [f0-f0] -> 7\n" + "14. byte [f1-f3] -> 7\n" + "15. byte [f4-f4] -> 7\n", reverse); } diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 5a32d92..81d5fef 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -879,11 +879,16 @@ TEST(RE2, FloatingPointFullMatchTypes) { // number, since C does not guarantee to get the correctly // rounded answer for strtod and strtof unless the input is // short. + // + // This is known to fail on Cygwin and MinGW due to a broken + // implementation of strtof(3). Sigh. +#if !defined(__CYGWIN__) && !defined(__MINGW32__) CHECK(RE2::FullMatch("0.1", "(.*)", &v)); CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); CHECK_EQ(v, 6700000000081920.1f) << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); +#endif } { double v; diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index 8d80b4c..8318e28 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -349,6 +349,51 @@ BENCHMARK_RANGE(Search_Success1_Cached_PCRE, 8, 16<<20)->ThreadRange(1, NumC BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success1_Cached_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs()); +// Benchmark: AltMatch optimisation (just to verify that it works) +// Note that OnePass doesn't implement it! + +void SearchAltMatch(int iters, int nbytes, SearchImpl* search) { + StopBenchmarkTiming(); + string s; + MakeText(&s, nbytes); + BenchmarkMemoryUsage(); + StartBenchmarkTiming(); + search(iters, "\\C*", s, Prog::kAnchored, true); + SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +} + +void Search_AltMatch_DFA(int i, int n) { SearchAltMatch(i, n, SearchDFA); } +void Search_AltMatch_NFA(int i, int n) { SearchAltMatch(i, n, SearchNFA); } +void Search_AltMatch_OnePass(int i, int n) { SearchAltMatch(i, n, SearchOnePass); } +void Search_AltMatch_BitState(int i, int n) { SearchAltMatch(i, n, SearchBitState); } +void Search_AltMatch_PCRE(int i, int n) { SearchAltMatch(i, n, SearchPCRE); } +void Search_AltMatch_RE2(int i, int n) { SearchAltMatch(i, n, SearchRE2); } + +BENCHMARK_RANGE(Search_AltMatch_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_OnePass, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_BitState, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_AltMatch_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_AltMatch_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_AltMatch_CachedDFA(int i, int n) { SearchAltMatch(i, n, SearchCachedDFA); } +void Search_AltMatch_CachedNFA(int i, int n) { SearchAltMatch(i, n, SearchCachedNFA); } +void Search_AltMatch_CachedOnePass(int i, int n) { SearchAltMatch(i, n, SearchCachedOnePass); } +void Search_AltMatch_CachedBitState(int i, int n) { SearchAltMatch(i, n, SearchCachedBitState); } +void Search_AltMatch_CachedPCRE(int i, int n) { SearchAltMatch(i, n, SearchCachedPCRE); } +void Search_AltMatch_CachedRE2(int i, int n) { SearchAltMatch(i, n, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_AltMatch_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_CachedOnePass, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_AltMatch_CachedBitState, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_AltMatch_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + // Benchmark: use regexp to find phone number. void SearchDigits(int iters, SearchImpl* search) { diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc index 3ab2ae3..08e5b4c 100644 --- a/re2/testing/search_test.cc +++ b/re2/testing/search_test.cc @@ -264,7 +264,7 @@ RegexpTest simple_tests[] = { { "[A-Z]+", "aAzZ" }, { "[^\\\\]+", "Aa\\" }, { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, - + // Anchoring. (^abc in aabcdef was a former bug) // The tester checks for a match in the text and // subpieces of the text with a byte removed on either side. @@ -297,7 +297,12 @@ RegexpTest simple_tests[] = { { "a", "a" }, { "ab*", "a" }, { "a\\C*", "a" }, - + { "a\\C+", "a" }, + { "a\\C?", "a" }, + { "a\\C*?", "a" }, + { "a\\C+?", "a" }, + { "a\\C??", "a" }, + // Former bugs. { "a\\C*|ba\\C", "baba" }, }; diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc index 4e267ae..c613d6a 100644 --- a/re2/testing/set_test.cc +++ b/re2/testing/set_test.cc @@ -18,21 +18,22 @@ TEST(Set, Unanchored) { CHECK_EQ(s.Add("foo", NULL), 0); CHECK_EQ(s.Add("(", NULL), -1); CHECK_EQ(s.Add("bar", NULL), 1); - CHECK_EQ(s.Compile(), true); + CHECK_EQ(s.Match("foobar", NULL), true); + CHECK_EQ(s.Match("fooba", NULL), true); + CHECK_EQ(s.Match("oobar", NULL), true); + vector v; CHECK_EQ(s.Match("foobar", &v), true); CHECK_EQ(v.size(), 2); CHECK_EQ(v[0], 0); CHECK_EQ(v[1], 1); - v.clear(); CHECK_EQ(s.Match("fooba", &v), true); CHECK_EQ(v.size(), 1); CHECK_EQ(v[0], 0); - v.clear(); CHECK_EQ(s.Match("oobar", &v), true); CHECK_EQ(v.size(), 1); CHECK_EQ(v[0], 1); @@ -44,27 +45,28 @@ TEST(Set, UnanchoredFactored) { CHECK_EQ(s.Add("foo", NULL), 0); CHECK_EQ(s.Add("(", NULL), -1); CHECK_EQ(s.Add("foobar", NULL), 1); - CHECK_EQ(s.Compile(), true); + CHECK_EQ(s.Match("foobar", NULL), true); + CHECK_EQ(s.Match("obarfoobaroo", NULL), true); + CHECK_EQ(s.Match("fooba", NULL), true); + CHECK_EQ(s.Match("oobar", NULL), false); + vector v; CHECK_EQ(s.Match("foobar", &v), true); CHECK_EQ(v.size(), 2); CHECK_EQ(v[0], 0); CHECK_EQ(v[1], 1); - v.clear(); CHECK_EQ(s.Match("obarfoobaroo", &v), true); CHECK_EQ(v.size(), 2); CHECK_EQ(v[0], 0); CHECK_EQ(v[1], 1); - v.clear(); CHECK_EQ(s.Match("fooba", &v), true); CHECK_EQ(v.size(), 1); CHECK_EQ(v[0], 0); - v.clear(); CHECK_EQ(s.Match("oobar", &v), false); CHECK_EQ(v.size(), 0); } @@ -75,6 +77,8 @@ TEST(Set, UnanchoredDollar) { CHECK_EQ(s.Add("foo$", NULL), 0); CHECK_EQ(s.Compile(), true); + CHECK_EQ(s.Match("foo", NULL), true); + vector v; CHECK_EQ(s.Match("foo", &v), true); CHECK_EQ(v.size(), 1); @@ -87,9 +91,14 @@ TEST(Set, Anchored) { CHECK_EQ(s.Add("foo", NULL), 0); CHECK_EQ(s.Add("(", NULL), -1); CHECK_EQ(s.Add("bar", NULL), 1); - CHECK_EQ(s.Compile(), true); + CHECK_EQ(s.Match("foobar", NULL), false); + CHECK_EQ(s.Match("fooba", NULL), false); + CHECK_EQ(s.Match("oobar", NULL), false); + CHECK_EQ(s.Match("foo", NULL), true); + CHECK_EQ(s.Match("bar", NULL), true); + vector v; CHECK_EQ(s.Match("foobar", &v), false); CHECK_EQ(v.size(), 0); @@ -114,11 +123,13 @@ TEST(Set, EmptyUnanchored) { CHECK_EQ(s.Compile(), true); + CHECK_EQ(s.Match("", NULL), false); + CHECK_EQ(s.Match("foobar", NULL), false); + vector v; CHECK_EQ(s.Match("", &v), false); CHECK_EQ(v.size(), 0); - v.clear(); CHECK_EQ(s.Match("foobar", &v), false); CHECK_EQ(v.size(), 0); } @@ -128,13 +139,38 @@ TEST(Set, EmptyAnchored) { CHECK_EQ(s.Compile(), true); + CHECK_EQ(s.Match("", NULL), false); + CHECK_EQ(s.Match("foobar", NULL), false); + vector v; CHECK_EQ(s.Match("", &v), false); CHECK_EQ(v.size(), 0); - v.clear(); CHECK_EQ(s.Match("foobar", &v), false); CHECK_EQ(v.size(), 0); } +TEST(Set, Prefix) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + CHECK_EQ(s.Add("/prefix/\\d*", NULL), 0); + CHECK_EQ(s.Compile(), true); + + CHECK_EQ(s.Match("/prefix", NULL), false); + CHECK_EQ(s.Match("/prefix/", NULL), true); + CHECK_EQ(s.Match("/prefix/42", NULL), true); + + vector v; + CHECK_EQ(s.Match("/prefix", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("/prefix/", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + CHECK_EQ(s.Match("/prefix/42", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); +} + } // namespace re2 diff --git a/util/mutex.h b/util/mutex.h index b479e48..8a13226 100644 --- a/util/mutex.h +++ b/util/mutex.h @@ -208,27 +208,6 @@ class WriterMutexLock { #define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name) #define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name) -// Provide safe way to declare and use global, linker-initialized mutex. Sigh. -#if HAVE_PTHREAD - -#define GLOBAL_MUTEX(name) \ - static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER -#define GLOBAL_MUTEX_LOCK(name) \ - pthread_mutex_lock(&(name)) -#define GLOBAL_MUTEX_UNLOCK(name) \ - pthread_mutex_unlock(&(name)) - -#else - -#define GLOBAL_MUTEX(name) \ - static Mutex name -#define GLOBAL_MUTEX_LOCK(name) \ - name.Lock() -#define GLOBAL_MUTEX_UNLOCK(name) \ - name.Unlock() - -#endif - } // namespace re2 #endif /* #define RE2_UTIL_MUTEX_H_ */ diff --git a/util/util.h b/util/util.h index 8c5d922..7dfab93 100644 --- a/util/util.h +++ b/util/util.h @@ -31,6 +31,7 @@ #include #include #include +#include // For std::call_once // Use std names. using std::set; -- cgit v1.2.3