summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:57:37 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:57:37 +0900
commit6c0bc52e1c8854d1a96810f0fe9849d83a7c409c (patch)
treefc11f96d6acda48ef0b297817c5f6110aa81351b
parent4dfe393027cfcfe844a8c8b97209a714904308c4 (diff)
downloadre2-6c0bc52e1c8854d1a96810f0fe9849d83a7c409c.tar.gz
re2-6c0bc52e1c8854d1a96810f0fe9849d83a7c409c.tar.bz2
re2-6c0bc52e1c8854d1a96810f0fe9849d83a7c409c.zip
Imported Upstream version 20160401upstream/20160401
Change-Id: I05a22e9add0c0dc9c7cc5d28b8ceedd442fca31c Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r--BUILD50
-rw-r--r--CMakeLists.txt3
-rw-r--r--Makefile12
-rw-r--r--re2/bitstate.cc83
-rw-r--r--re2/compile.cc4
-rw-r--r--re2/dfa.cc83
-rw-r--r--re2/nfa.cc65
-rw-r--r--re2/onepass.cc50
-rw-r--r--re2/prefilter.cc9
-rw-r--r--re2/prefilter.h3
-rw-r--r--re2/prog.cc191
-rw-r--r--re2/prog.h50
-rw-r--r--re2/re2.cc79
-rw-r--r--re2/re2.h43
-rw-r--r--re2/regexp.cc29
-rw-r--r--re2/regexp.h12
-rw-r--r--re2/set.cc5
-rw-r--r--re2/set.h2
-rw-r--r--re2/testing/backtrack.cc32
-rw-r--r--re2/testing/compile_test.cc208
-rw-r--r--re2/testing/re2_test.cc5
-rw-r--r--re2/testing/regexp_benchmark.cc45
-rw-r--r--re2/testing/search_test.cc9
-rw-r--r--re2/testing/set_test.cc56
-rw-r--r--util/mutex.h21
-rw-r--r--util/util.h1
26 files changed, 758 insertions, 392 deletions
diff --git a/BUILD b/BUILD
index 2636631..1e2d19c 100644
--- a/BUILD
+++ b/BUILD
@@ -6,6 +6,8 @@
licenses(["notice"])
+exports_files(["LICENSE"])
+
cc_library(
name = "re2",
srcs = [
@@ -97,25 +99,61 @@ cc_library(
load("re2_test", "re2_test")
re2_test("charclass_test")
+
re2_test("compile_test")
+
re2_test("filtered_re2_test")
+
re2_test("mimics_pcre_test")
+
re2_test("parse_test")
+
re2_test("possible_match_test")
+
re2_test("re2_arg_test")
+
re2_test("re2_test")
+
re2_test("regexp_test")
+
re2_test("required_prefix_test")
+
re2_test("search_test")
+
re2_test("set_test")
+
re2_test("simplify_test")
+
re2_test("string_generator_test")
-re2_test("dfa_test", size="large")
-re2_test("exhaustive1_test", size="large")
-re2_test("exhaustive2_test", size="large")
-re2_test("exhaustive3_test", size="large")
-re2_test("exhaustive_test", size="large")
-re2_test("random_test", size="large")
+re2_test(
+ "dfa_test",
+ size = "large",
+)
+
+re2_test(
+ "exhaustive1_test",
+ size = "large",
+)
+
+re2_test(
+ "exhaustive2_test",
+ size = "large",
+)
+
+re2_test(
+ "exhaustive3_test",
+ size = "large",
+)
+
+re2_test(
+ "exhaustive_test",
+ size = "large",
+)
+
+re2_test(
+ "random_test",
+ size = "large",
+)
# TODO: Add support for regexp_benchmark.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ed718f..66cfdc6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
if(MSVC_VERSION LESS 1800)
message(FATAL_ERROR "you need Visual Studio 2013 or later")
endif()
+ if(BUILD_SHARED_LIBS)
+ message(FATAL_ERROR "building a DLL is not currently supported")
+ endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_compile_options(-std=c++11)
endif()
diff --git a/Makefile b/Makefile
index 92fbfc3..4479033 100644
--- a/Makefile
+++ b/Makefile
@@ -44,7 +44,7 @@ ifeq ($(shell uname),Darwin)
SOEXT=dylib
SOEXTVER=$(SONAME).$(SOEXT)
SOEXTVER00=$(SONAME).0.0.$(SOEXT)
-MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin
+MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,@rpath/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(LDFLAGS)
else ifeq ($(shell uname),SunOS)
SOEXT=so
SOEXTVER=$(SOEXT).$(SONAME)
@@ -275,11 +275,15 @@ install: obj/libre2.a obj/so/libre2.$(SOEXT)
testinstall:
@mkdir -p obj
cp testinstall.cc obj
-ifneq ($(shell uname),Darwin)
- (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -static -o testinstall)
+ifeq ($(shell uname),Darwin)
+ @echo Skipping test for libre2.a on Darwin.
+else ifeq ($(shell uname),SunOS)
+ @echo Skipping test for libre2.a on SunOS.
+else
+ (cd obj && $(CXX) $(CXXFLAGS) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -l:libre2.a -o testinstall)
obj/testinstall
endif
- (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
+ (cd obj && $(CXX) $(CXXFLAGS) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -o testinstall)
LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
benchlog: obj/test/regexp_benchmark
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index 0cfa480..661a8ca 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -141,6 +141,7 @@ void BitState::Push(int id, const char* p, int arg) {
// Return whether it succeeded.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
+ bool inaltmatch = false;
const char* end = text_.end();
njob_ = 0;
Push(id0, p0, 0);
@@ -159,6 +160,14 @@ bool BitState::TrySearch(int id0, const char* p0) {
// would have, but we avoid the stack
// manipulation.
if (0) {
+ Next:
+ // If the Match of a non-greedy AltMatch failed,
+ // we stop ourselves from trying the ByteRange,
+ // which would steer us off the short circuit.
+ if (prog_->inst(id)->last() || inaltmatch)
+ continue;
+ id++;
+
CheckAndLoop:
if (!ShouldVisit(id, p))
continue;
@@ -176,66 +185,63 @@ bool BitState::TrySearch(int id0, const char* p0) {
case kInstFail:
continue;
- case kInstAlt:
- // Cannot just
- // Push(ip->out1(), p, 0);
- // Push(ip->out(), p, 0);
- // If, during the processing of ip->out(), we encounter
- // ip->out1() via another path, we want to process it then.
- // Pushing it here will inhibit that. Instead, re-push
- // ip with arg==1 as a reminder to push ip->out1() later.
+ case kInstAltMatch:
switch (arg) {
case 0:
+ inaltmatch = true;
Push(id, p, 1); // come back when we're done
+
+ // One opcode is ByteRange; the other leads to Match
+ // (possibly via Nop or Capture).
+ if (ip->greedy(prog_)) {
+ // out1 is the match
+ Push(ip->out1(), p, 0);
+ id = ip->out1();
+ p = end;
+ goto CheckAndLoop;
+ }
+ // out is the match - non-greedy
+ Push(ip->out(), end, 0);
id = ip->out();
goto CheckAndLoop;
case 1:
- // Finished ip->out(); try ip->out1().
- arg = 0;
- id = ip->out1();
- goto CheckAndLoop;
+ inaltmatch = false;
+ continue;
}
- LOG(DFATAL) << "Bad arg in kInstAlt: " << arg;
+ LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
continue;
- case kInstAltMatch:
- // One opcode is byte range; the other leads to match.
- if (ip->greedy(prog_)) {
- // out1 is the match
- Push(ip->out1(), p, 0);
- id = ip->out1();
- p = end;
- goto CheckAndLoop;
- }
- // out is the match - non-greedy
- Push(ip->out(), end, 0);
- id = ip->out();
- goto CheckAndLoop;
-
case kInstByteRange: {
int c = -1;
if (p < end)
c = *p & 0xFF;
- if (ip->Matches(c)) {
- id = ip->out();
- p++;
- goto CheckAndLoop;
- }
- continue;
+ if (!ip->Matches(c))
+ goto Next;
+
+ if (!ip->last())
+ Push(id+1, p, 0); // try the next when we're done
+ id = ip->out();
+ p++;
+ goto CheckAndLoop;
}
case kInstCapture:
switch (arg) {
case 0:
+ if (!ip->last())
+ Push(id+1, p, 0); // try the next when we're done
+
if (0 <= ip->cap() && ip->cap() < ncap_) {
// Capture p to register, but save old value.
Push(id, cap_[ip->cap()], 1); // come back when we're done
cap_[ip->cap()] = p;
}
+
// Continue on.
id = ip->out();
goto CheckAndLoop;
+
case 1:
// Finished ip->out(); restore the old value.
cap_[ip->cap()] = p;
@@ -246,17 +252,22 @@ bool BitState::TrySearch(int id0, const char* p0) {
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
- continue;
+ goto Next;
+
+ if (!ip->last())
+ Push(id+1, p, 0); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstNop:
+ if (!ip->last())
+ Push(id+1, p, 0); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstMatch: {
if (endmatch_ && p != text_.end())
- continue;
+ goto Next;
// VLOG(0) << "Found match.";
// We found a match. If the caller doesn't care
@@ -285,7 +296,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
return true;
// Otherwise, continue on in hope of a longer match.
- continue;
+ goto Next;
}
}
}
diff --git a/re2/compile.cc b/re2/compile.cc
index 9882fef..df859ab 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -541,6 +541,9 @@ bool Compiler::IsCachedRuneByteSuffix(int id) {
}
void Compiler::AddSuffix(int id) {
+ if (failed_)
+ return;
+
if (rune_range_.begin == 0) {
rune_range_.begin = id;
return;
@@ -1234,6 +1237,7 @@ Prog* Compiler::Finish() {
prog_->ComputeByteMap();
prog_->Optimize();
+ prog_->Flatten();
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 7c526c8..ccb6065 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -216,7 +216,7 @@ class DFA {
// Converts a State into a Workq: the opposite of WorkqToCachedState.
// L >= mutex_
- static void StateToWorkq(State* s, Workq* q);
+ void StateToWorkq(State* s, Workq* q);
// Runs a State on a given byte, returning the next state.
State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_
@@ -640,31 +640,17 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) {
return FullMatchState;
}
// Fall through.
- case kInstByteRange: // These are useful.
- case kInstEmptyWidth:
- case kInstMatch:
- case kInstAlt: // Not useful, but necessary [*]
- inst[n++] = *it;
+ default:
+ // Record iff id is the head of its list, which must
+ // be the case if id-1 is the last of *its* list. :)
+ if (prog_->inst(id-1)->last())
+ inst[n++] = *it;
if (ip->opcode() == kInstEmptyWidth)
needflags |= ip->empty();
if (ip->opcode() == kInstMatch && !prog_->anchor_end())
sawmatch = true;
break;
-
- default: // The rest are not.
- break;
}
-
- // [*] kInstAlt would seem useless to record in a state, since
- // we've already followed both its arrows and saved all the
- // interesting states we can reach from there. The problem
- // is that one of the empty-width instructions might lead
- // back to the same kInstAlt (if an empty-width operator is starred),
- // producing a different evaluation order depending on whether
- // we keep the kInstAlt to begin with. Sigh.
- // A specific case that this affects is /(^|a)+/ matching "a".
- // If we don't save the kInstAlt, we will match the whole "a" (0,1)
- // but in fact the correct leftmost-first match is the leading "" (0,0).
}
DCHECK_LE(n, q->size());
if (n > 0 && inst[n-1] == Mark)
@@ -797,12 +783,12 @@ void DFA::StateToWorkq(State* s, Workq* q) {
if (s->inst_[i] == Mark)
q->mark();
else
- q->insert_new(s->inst_[i]);
+ // Explore from the head of the list.
+ AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
}
}
-// Adds ip to the work queue, following empty arrows according to flag
-// and expanding kInstAlt instructions (two-target gotos).
+// Adds ip to the work queue, following empty arrows according to flag.
void DFA::AddToQueue(Workq* q, int id, uint flag) {
// Use astack_ to hold our stack of states yet to process.
@@ -828,9 +814,8 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
continue;
// If ip is already on the queue, nothing to do.
- // Otherwise add it. We don't actually keep all the ones
- // that get added -- for example, kInstAlt is ignored
- // when on a work queue -- but adding all ip's here
+ // Otherwise add it. We don't actually keep all the
+ // ones that get added, but adding all of them here
// increases the likelihood of q->contains(id),
// reducing the amount of duplicated work.
if (q->contains(id))
@@ -840,34 +825,40 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
// Process instruction.
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
- case kInstFail: // can't happen: discarded above
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstByteRange: // just save these on the queue
case kInstMatch:
+ if (!ip->last())
+ stk[nstk++] = id+1;
break;
case kInstCapture: // DFA treats captures as no-ops.
case kInstNop:
+ if (!ip->last())
+ stk[nstk++] = id+1;
+
+ // If this instruction is the [00-FF]* loop at the beginning of
+ // a leftmost-longest unanchored search, separate with a Mark so
+ // that future threads (which will start farther to the right in
+ // the input string) are lower priority than current threads.
+ if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
+ id == prog_->start_unanchored() && id != prog_->start())
+ stk[nstk++] = Mark;
stk[nstk++] = ip->out();
break;
- case kInstAlt: // two choices: expand both, in order
case kInstAltMatch:
- // Want to visit out then out1, so push on stack in reverse order.
- // This instruction is the [00-FF]* loop at the beginning of
- // a leftmost-longest unanchored search, separate out from out1
- // with a Mark, so that out1's threads (which will start farther
- // to the right in the string being searched) are lower priority
- // than the current ones.
- stk[nstk++] = ip->out1();
- if (q->maxmark() > 0 &&
- id == prog_->start_unanchored() && id != prog_->start())
- stk[nstk++] = Mark;
- stk[nstk++] = ip->out();
+ DCHECK(!ip->last());
+ stk[nstk++] = id+1;
break;
case kInstEmptyWidth:
+ if (!ip->last())
+ stk[nstk++] = id+1;
+
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
@@ -924,10 +915,13 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
int id = *i;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
case kInstFail: // never succeeds
case kInstCapture: // already followed
case kInstNop: // already followed
- case kInstAlt: // already followed
case kInstAltMatch: // already followed
case kInstEmptyWidth: // already followed
break;
@@ -1499,8 +1493,13 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
v->clear();
for (int i = 0; i < s->ninst_; i++) {
Prog::Inst* ip = prog_->inst(s->inst_[i]);
- if (ip->opcode() == kInstMatch)
- v->push_back(ip->match_id());
+ for (;;) {
+ if (ip->opcode() == kInstMatch)
+ v->push_back(ip->match_id());
+ if (ip->last())
+ break;
+ ip++;
+ }
}
}
if (DebugDFA)
diff --git a/re2/nfa.cc b/re2/nfa.cc
index bc8996c..5232e5f 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -230,25 +230,28 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
break;
case kInstAltMatch:
+ DCHECK(!ip->last());
+ stk[nstk++] = AddState(id+1);
+
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
- // fall through
-
- case kInstAlt:
- // Explore alternatives.
- stk[nstk++] = AddState(ip->out1());
- stk[nstk++] = AddState(ip->out());
break;
case kInstNop:
+ if (!ip->last())
+ stk[nstk++] = AddState(id+1);
+
// Continue on.
stk[nstk++] = AddState(ip->out());
break;
case kInstCapture:
+ if (!ip->last())
+ stk[nstk++] = AddState(id+1);
+
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore capture[j]
// once we finish exploring this possibility.
@@ -262,6 +265,9 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
case kInstMatch:
case kInstByteRange:
+ if (!ip->last())
+ stk[nstk++] = AddState(id+1);
+
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
@@ -272,6 +278,9 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
break;
case kInstEmptyWidth:
+ if (!ip->last())
+ stk[nstk++] = AddState(id+1);
+
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
@@ -542,14 +551,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
match_[1] = p;
matched_ = true;
break;
-
- case kInstEmptyWidth:
- if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
- LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
- break;
- }
- id = ip->out();
- continue;
}
break;
}
@@ -640,10 +641,16 @@ int NFA::ComputeFirstByte() {
break;
case kInstMatch:
+ if (!ip->last())
+ q.insert(id+1);
+
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
+ if (!ip->last())
+ q.insert(id+1);
+
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
@@ -660,6 +667,9 @@ int NFA::ComputeFirstByte() {
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
+ if (!ip->last())
+ q.insert(id+1);
+
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
@@ -668,13 +678,9 @@ int NFA::ComputeFirstByte() {
q.insert(ip->out());
break;
- case kInstAlt:
case kInstAltMatch:
- // Explore alternatives.
- if (ip->out())
- q.insert(ip->out());
- if (ip->out1())
- q.insert(ip->out1());
+ DCHECK(!ip->last());
+ q.insert(id+1);
break;
case kInstFail:
@@ -723,31 +729,42 @@ void Prog::Fanout(SparseArray<int>* fanout) {
reachable.clear();
reachable.insert(i->index());
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
- Prog::Inst* ip = inst(*j);
+ int id = *j;
+ Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
break;
case kInstByteRange:
+ if (!ip->last())
+ reachable.insert(id+1);
+
(*count)++;
if (!fanout->has_index(ip->out())) {
fanout->set_new(ip->out(), 0);
}
break;
- case kInstAlt:
case kInstAltMatch:
- reachable.insert(ip->out1());
- // fall through
+ DCHECK(!ip->last());
+ reachable.insert(id+1);
+ break;
case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
+ if (!ip->last())
+ reachable.insert(id+1);
+
reachable.insert(ip->out());
break;
case kInstMatch:
+ if (!ip->last())
+ reachable.insert(id+1);
+ break;
+
case kInstFail:
break;
}
diff --git a/re2/onepass.cc b/re2/onepass.cc
index 73acdc8..977ed96 100644
--- a/re2/onepass.cc
+++ b/re2/onepass.cc
@@ -427,21 +427,30 @@ bool Prog::IsOnePass() {
Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
- // Fall through.
- case kInstAlt:
+ DCHECK(!ip->last());
// If already on work queue, (1) is violated: bail out.
- if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
+ if (!AddQ(&workq, id+1))
goto fail;
- stack[nstack].id = ip->out1();
- stack[nstack++].cond = cond;
- stack[nstack].id = ip->out();
+ stack[nstack].id = id+1;
stack[nstack++].cond = cond;
break;
case kInstByteRange: {
+ if (!ip->last()) {
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ stack[nstack].id = id+1;
+ stack[nstack++].cond = cond;
+ }
+
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
@@ -501,16 +510,21 @@ bool Prog::IsOnePass() {
}
case kInstCapture:
- if (ip->cap() < kMaxCap)
- cond |= (1 << kCapShift) << ip->cap();
- goto QueueEmpty;
-
case kInstEmptyWidth:
- cond |= ip->empty();
- goto QueueEmpty;
-
case kInstNop:
- QueueEmpty:
+ if (!ip->last()) {
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ stack[nstack].id = id+1;
+ stack[nstack++].cond = cond;
+ }
+
+ if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
+ cond |= (1 << kCapShift) << ip->cap();
+ if (ip->opcode() == kInstEmptyWidth)
+ cond |= ip->empty();
+
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
@@ -531,6 +545,14 @@ bool Prog::IsOnePass() {
break;
case kInstMatch:
+ if (!ip->last()) {
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ stack[nstack].id = id+1;
+ stack[nstack++].cond = cond;
+ }
+
if (matched) {
// (3) is violated
if (Debug) {
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index 45e43c9..f171eec 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -15,8 +15,6 @@ static const int Trace = false;
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
-GLOBAL_MUTEX(alloc_id_mutex);
-static int alloc_id = 100000; // Used for debugging.
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
op_ = op;
@@ -24,15 +22,12 @@ Prefilter::Prefilter(Op op) {
if (op_ == AND || op_ == OR)
subs_ = new vector<Prefilter*>;
- GLOBAL_MUTEX_LOCK(alloc_id_mutex);
- alloc_id_ = alloc_id++;
- GLOBAL_MUTEX_UNLOCK(alloc_id_mutex);
- VLOG(10) << "alloc_id: " << alloc_id_;
+ VLOG(10) << "constructed: " << reinterpret_cast<intptr_t>(this);
}
// Destroys a Prefilter.
Prefilter::~Prefilter() {
- VLOG(10) << "Deleted: " << alloc_id_;
+ VLOG(10) << "destructing: " << reinterpret_cast<intptr_t>(this);
if (subs_) {
for (size_t i = 0; i < subs_->size(); i++)
delete (*subs_)[i];
diff --git a/re2/prefilter.h b/re2/prefilter.h
index 2bc1676..531c189 100644
--- a/re2/prefilter.h
+++ b/re2/prefilter.h
@@ -94,9 +94,6 @@ class Prefilter {
// and -1 for duplicate nodes.
int unique_id_;
- // Used for debugging, helps in tracking memory leaks.
- int alloc_id_;
-
DISALLOW_COPY_AND_ASSIGN(Prefilter);
};
diff --git a/re2/prog.cc b/re2/prog.cc
index cb7e6e0..b5f48ef 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -6,7 +6,6 @@
// Tested by compile_test.cc
#include "util/util.h"
-#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
@@ -94,6 +93,7 @@ Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
+ did_flatten_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
@@ -131,7 +131,6 @@ static inline void AddToQueue(Workq* q, int id) {
static string ProgToString(Prog* prog, Workq* q) {
string s;
-
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
@@ -143,6 +142,18 @@ static string ProgToString(Prog* prog, Workq* q) {
return s;
}
+static string FlattenedProgToString(Prog* prog, int start) {
+ string s;
+ for (int id = start; id < prog->size(); id++) {
+ Prog::Inst* ip = prog->inst(id);
+ if (ip->last())
+ StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
+ else
+ StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str());
+ }
+ return s;
+}
+
string Prog::Dump() {
string map;
if (false) { // Debugging
@@ -155,12 +166,18 @@ string Prog::Dump() {
StringAppendF(&map, "\n");
}
+ if (did_flatten_)
+ return map + FlattenedProgToString(this, start_);
+
Workq q(size_);
AddToQueue(&q, start_);
return map + ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
+ if (did_flatten_)
+ return FlattenedProgToString(this, start_unanchored_);
+
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
@@ -337,5 +354,173 @@ void Prog::ComputeByteMap() {
}
}
-} // namespace re2
+void Prog::Flatten() {
+ if (did_flatten_)
+ return;
+ did_flatten_ = true;
+
+ // Scratch structures. It's important that these are reused by EmitList()
+ // because we call it in a loop and it would thrash the heap otherwise.
+ SparseSet q(size());
+ vector<int> stk;
+ stk.reserve(size());
+
+ // First pass: Marks "roots".
+ // Builds the mapping from inst-ids to root-ids.
+ SparseArray<int> rootmap(size());
+ MarkRoots(&rootmap, &q, &stk);
+
+ // Second pass: Emits "lists". Remaps outs to root-ids.
+ // Builds the mapping from root-ids to flat-ids.
+ vector<int> flatmap(rootmap.size());
+ vector<Inst> flat;
+ flat.reserve(size());
+ for (SparseArray<int>::const_iterator i = rootmap.begin();
+ i != rootmap.end();
+ ++i) {
+ flatmap[i->value()] = flat.size();
+ EmitList(i->index(), &rootmap, &flat, &q, &stk);
+ flat.back().set_last();
+ }
+
+ // Third pass: Remaps outs to flat-ids.
+ for (int id = 0; id < static_cast<int>(flat.size()); id++) {
+ Inst* ip = &flat[id];
+ if (ip->opcode() != kInstAltMatch) // handled in EmitList()
+ ip->set_out(flatmap[ip->out()]);
+ }
+
+ // Remap start_unanchored and start.
+ if (start_unanchored() == 0) {
+ DCHECK_EQ(start(), 0);
+ } else if (start_unanchored() == start()) {
+ set_start_unanchored(flatmap[1]);
+ set_start(flatmap[1]);
+ } else {
+ set_start_unanchored(flatmap[1]);
+ set_start(flatmap[2]);
+ }
+
+ // Finally, replace the old instructions with the new instructions.
+ size_ = flat.size();
+ delete[] inst_;
+ inst_ = new Inst[size_];
+ memmove(inst_, flat.data(), size_ * sizeof *inst_);
+}
+
+void Prog::MarkRoots(SparseArray<int>* rootmap,
+ SparseSet* q, vector<int>* stk) {
+ // Mark the kInstFail instruction.
+ rootmap->set_new(0, rootmap->size());
+
+ // Mark the start_unanchored and start instructions.
+ if (!rootmap->has_index(start_unanchored()))
+ rootmap->set_new(start_unanchored(), rootmap->size());
+ if (!rootmap->has_index(start()))
+ rootmap->set_new(start(), rootmap->size());
+
+ q->clear();
+ stk->clear();
+ stk->push_back(start_unanchored());
+ while (!stk->empty()) {
+ int id = stk->back();
+ stk->pop_back();
+ Loop:
+ if (q->contains(id))
+ continue;
+ q->insert_new(id);
+
+ Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstAltMatch:
+ case kInstAlt:
+ stk->push_back(ip->out1());
+ id = ip->out();
+ goto Loop;
+
+ case kInstByteRange:
+ case kInstCapture:
+ case kInstEmptyWidth:
+ // Mark the out of this instruction.
+ if (!rootmap->has_index(ip->out()))
+ rootmap->set_new(ip->out(), rootmap->size());
+ id = ip->out();
+ goto Loop;
+
+ case kInstNop:
+ id = ip->out();
+ goto Loop;
+ case kInstMatch:
+ case kInstFail:
+ break;
+ }
+ }
+}
+
+void Prog::EmitList(int root, SparseArray<int>* rootmap, vector<Inst>* flat,
+ SparseSet* q, vector<int>* stk) {
+ q->clear();
+ stk->clear();
+ stk->push_back(root);
+ while (!stk->empty()) {
+ int id = stk->back();
+ stk->pop_back();
+ Loop:
+ if (q->contains(id))
+ continue;
+ q->insert_new(id);
+
+ if (id != root && rootmap->has_index(id)) {
+ // We reached another "tree" via epsilon transition. Emit a kInstNop
+ // instruction so that the Prog does not become quadratically larger.
+ flat->emplace_back();
+ flat->back().set_opcode(kInstNop);
+ flat->back().set_out(rootmap->get_existing(id));
+ continue;
+ }
+
+ Inst* ip = inst(id);
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
+ break;
+
+ case kInstAltMatch:
+ flat->emplace_back();
+ flat->back().set_opcode(kInstAltMatch);
+ flat->back().set_out(flat->size());
+ flat->back().out1_ = flat->size()+1;
+ // Fall through.
+
+ case kInstAlt:
+ stk->push_back(ip->out1());
+ id = ip->out();
+ goto Loop;
+
+ case kInstByteRange:
+ case kInstCapture:
+ case kInstEmptyWidth:
+ flat->emplace_back();
+ memmove(&flat->back(), ip, sizeof *ip);
+ flat->back().set_out(rootmap->get_existing(ip->out()));
+ break;
+
+ case kInstNop:
+ id = ip->out();
+ goto Loop;
+
+ case kInstMatch:
+ case kInstFail:
+ flat->emplace_back();
+ memmove(&flat->back(), ip, sizeof *ip);
+ break;
+ }
+ }
+}
+
+} // namespace re2
diff --git a/re2/prog.h b/re2/prog.h
index 45e9f0b..4a4017c 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -11,6 +11,7 @@
#include "util/util.h"
#include "util/sparse_array.h"
+#include "util/sparse_set.h"
#include "re2/re2.h"
namespace re2 {
@@ -84,7 +85,11 @@ class Prog {
// Single instruction in regexp program.
class Inst {
public:
- Inst() : out_opcode_(0), out1_(0) { }
+ Inst() : out_opcode_(0), out1_(0) {}
+
+ // Copyable.
+ Inst(const Inst&) = default;
+ Inst& operator=(const Inst&) = default;
// Constructors per opcode
void InitAlt(uint32 out, uint32 out1);
@@ -98,17 +103,21 @@ class Prog {
// Getters
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
- int out() { return out_opcode_>>3; }
- int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
+ int last() { return (out_opcode_>>3)&1; }
+ int out() { return out_opcode_>>4; }
+ int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
- bool greedy(Prog *p) {
+
+ bool greedy(Prog* p) {
DCHECK_EQ(opcode(), kInstAltMatch);
- return p->inst(out())->opcode() == kInstByteRange;
+ return p->inst(out())->opcode() == kInstByteRange ||
+ (p->inst(out())->opcode() == kInstNop &&
+ p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
}
// Does this inst (an kInstByteRange) match c?
@@ -123,23 +132,27 @@ class Prog {
string Dump();
// Maximum instruction id.
- // (Must fit in out_opcode_, and PatchList steals another bit.)
+ // (Must fit in out_opcode_. PatchList/last steal another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
void set_opcode(InstOp opcode) {
- out_opcode_ = (out()<<3) | opcode;
+ out_opcode_ = (out()<<4) | (last()<<3) | opcode;
+ }
+
+ void set_last() {
+ out_opcode_ = (out()<<4) | (1<<3) | opcode();
}
void set_out(int out) {
- out_opcode_ = (out<<3) | opcode();
+ out_opcode_ = (out<<4) | (last()<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
- out_opcode_ = (out<<3) | opcode;
+ out_opcode_ = (out<<4) | (last()<<3) | opcode;
}
- uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
+ uint32 out_opcode_; // 28 bits of out, 1 bit for last, 3 (low) bits opcode
union { // additional instruction arguments:
uint32 out1_; // opcode == kInstAlt
// alternate next instruction
@@ -167,8 +180,6 @@ class Prog {
friend class Compiler;
friend struct PatchList;
friend class Prog;
-
- DISALLOW_COPY_AND_ASSIGN(Inst);
};
// Whether to anchor the search.
@@ -339,6 +350,20 @@ class Prog {
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
+ // Flattens the Prog from "tree" form to "list" form. This is an in-place
+ // operation in the sense that the old instructions are lost.
+ void Flatten();
+
+ // Marks the "roots" in the Prog: the outs of kInstByteRange, kInstCapture
+ // and kInstEmptyWidth instructions.
+ void MarkRoots(SparseArray<int>* rootmap,
+ SparseSet* q, vector<int>* stk);
+
+ // Emits one "list" via "tree" traversal from the given "root" instruction.
+ // The new instructions are appended to the given vector.
+ void EmitList(int root, SparseArray<int>* rootmap, vector<Inst>* flat,
+ SparseSet* q, vector<int>* stk);
+
private:
friend class Compiler;
@@ -347,6 +372,7 @@ class Prog {
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
+ bool did_flatten_; // has Flatten been called?
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
diff --git a/re2/re2.cc b/re2/re2.cc
index b3e582f..492ffaf 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -52,22 +52,11 @@ RE2::Options::Options(RE2::CannedOptions opt)
one_line_(false) {
}
-// static empty things for use as const references.
-// To avoid global constructors, initialized on demand.
-GLOBAL_MUTEX(empty_mutex);
-static const string *empty_string;
-static const map<string, int> *empty_named_groups;
-static const map<int, string> *empty_group_names;
-
-static void InitEmpty() {
- GLOBAL_MUTEX_LOCK(empty_mutex);
- if (empty_string == NULL) {
- empty_string = new string;
- empty_named_groups = new map<string, int>;
- empty_group_names = new map<int, string>;
- }
- GLOBAL_MUTEX_UNLOCK(empty_mutex);
-}
+// static empty objects for use as const references.
+// To avoid global constructors, allocated in RE2::Init().
+static const string* empty_string;
+static const map<string, int>* empty_named_groups;
+static const map<int, string>* empty_group_names;
// Converts from Regexp error code to RE2 error code.
// Maybe some day they will diverge. In any event, this
@@ -174,19 +163,24 @@ int RE2::Options::ParseFlags() const {
}
void RE2::Init(const StringPiece& pattern, const Options& options) {
- mutex_ = new Mutex;
+ static std::once_flag empty_once;
+ std::call_once(empty_once, []() {
+ empty_string = new string;
+ empty_named_groups = new map<string, int>;
+ empty_group_names = new map<int, string>;
+ });
+
pattern_ = pattern.as_string();
options_.Copy(options);
- InitEmpty();
- error_ = empty_string;
- error_code_ = NoError;
- suffix_regexp_ = NULL;
entire_regexp_ = NULL;
+ suffix_regexp_ = NULL;
prog_ = NULL;
rprog_ = NULL;
+ error_ = empty_string;
+ error_code_ = NoError;
+ num_captures_ = -1;
named_groups_ = NULL;
group_names_ = NULL;
- num_captures_ = -1;
RegexpStatus status;
entire_regexp_ = Regexp::Parse(
@@ -194,14 +188,13 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
&status);
if (entire_regexp_ == NULL) {
- if (error_ == empty_string)
- error_ = new string(status.Text());
if (options_.log_errors()) {
LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
<< status.Text();
}
- error_arg_ = status.error_arg().as_string();
+ error_ = new string(status.Text());
error_code_ = RegexpErrorToRE2(status.code());
+ error_arg_ = status.error_arg().as_string();
return;
}
@@ -235,17 +228,15 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
// Returns rprog_, computing it if needed.
re2::Prog* RE2::ReverseProg() const {
- MutexLock l(mutex_);
- if (rprog_ == NULL && error_ == empty_string) {
+ std::call_once(rprog_once_, [this]() {
rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
if (rprog_ == NULL) {
if (options_.log_errors())
LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
error_ = new string("pattern too large - reverse compile failed");
error_code_ = RE2::ErrorPatternTooLarge;
- return NULL;
}
- }
+ });
return rprog_;
}
@@ -254,7 +245,6 @@ RE2::~RE2() {
suffix_regexp_->Decref();
if (entire_regexp_)
entire_regexp_->Decref();
- delete mutex_;
delete prog_;
delete rprog_;
if (error_ != empty_string)
@@ -291,37 +281,32 @@ int RE2::ProgramFanout(map<int, int>* histogram) const {
// Returns num_captures_, computing it if needed, or -1 if the
// regexp wasn't valid on construction.
int RE2::NumberOfCapturingGroups() const {
- MutexLock l(mutex_);
- if (suffix_regexp_ == NULL)
- return -1;
- if (num_captures_ == -1)
- num_captures_ = suffix_regexp_->NumCaptures();
+ std::call_once(num_captures_once_, [this]() {
+ if (suffix_regexp_ != NULL)
+ num_captures_ = suffix_regexp_->NumCaptures();
+ });
return num_captures_;
}
// Returns named_groups_, computing it if needed.
const map<string, int>& RE2::NamedCapturingGroups() const {
- MutexLock l(mutex_);
- if (!ok())
- return *empty_named_groups;
- if (named_groups_ == NULL) {
- named_groups_ = suffix_regexp_->NamedCaptures();
+ std::call_once(named_groups_once_, [this]() {
+ if (suffix_regexp_ != NULL)
+ named_groups_ = suffix_regexp_->NamedCaptures();
if (named_groups_ == NULL)
named_groups_ = empty_named_groups;
- }
+ });
return *named_groups_;
}
// Returns group_names_, computing it if needed.
const map<int, string>& RE2::CapturingGroupNames() const {
- MutexLock l(mutex_);
- if (!ok())
- return *empty_group_names;
- if (group_names_ == NULL) {
- group_names_ = suffix_regexp_->CaptureNames();
+ std::call_once(group_names_once_, [this]() {
+ if (suffix_regexp_ != NULL)
+ group_names_ = suffix_regexp_->CaptureNames();
if (group_names_ == NULL)
group_names_ = empty_group_names;
- }
+ });
return *group_names_;
}
diff --git a/re2/re2.h b/re2/re2.h
index 5b955d0..4a8c5c8 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -181,6 +181,7 @@
#include <stdint.h>
#include <map>
+#include <mutex>
#include <string>
#include "re2/stringpiece.h"
#include "re2/variadic_function.h"
@@ -193,25 +194,9 @@ namespace re2 {
using std::string;
using std::map;
-class Mutex;
class Prog;
class Regexp;
-// The following enum should be used only as a constructor argument to indicate
-// that the variable has static storage class, and that the constructor should
-// do nothing to its state. It indicates to the reader that it is legal to
-// declare a static instance of the class, provided the constructor is given
-// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
-// static variable that has a constructor or a destructor because invocation
-// order is undefined. However, IF the type can be initialized by filling with
-// zeroes (which the loader does for static variables), AND the type's
-// destructor does nothing to the storage, then a constructor for static
-// initialization can be declared as
-// explicit MyClass(LinkerInitialized x) {}
-// and invoked as
-// static MyClass my_variable_name(LINKER_INITIALIZED);
-enum LinkerInitialized { LINKER_INITIALIZED };
-
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
@@ -723,21 +708,21 @@ class RE2 {
re2::Prog* ReverseProg() const;
- mutable Mutex* mutex_;
- string pattern_; // string regular expression
- Options options_; // option flags
+ string pattern_; // string regular expression
+ Options options_; // option flags
string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
- mutable re2::Prog* rprog_; // reverse program for regexp
- bool is_one_pass_; // can use prog_->SearchOnePass?
- mutable const string* error_; // Error indicator
- // (or points to empty string)
- mutable ErrorCode error_code_; // Error code
- mutable string error_arg_; // Fragment of regexp showing error
- mutable int num_captures_; // Number of capturing groups
+ bool is_one_pass_; // can use prog_->SearchOnePass?
+
+ mutable re2::Prog* rprog_; // reverse program for regexp
+ mutable const string* error_; // Error indicator
+ // (or points to empty string)
+ mutable ErrorCode error_code_; // Error code
+ mutable string error_arg_; // Fragment of regexp showing error
+ mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
@@ -745,6 +730,12 @@ class RE2 {
// Map from capture indices to names
mutable const map<int, string>* group_names_;
+ // Onces for lazy computations.
+ mutable std::once_flag rprog_once_;
+ mutable std::once_flag num_captures_once_;
+ mutable std::once_flag named_groups_once_;
+ mutable std::once_flag group_names_once_;
+
//DISALLOW_COPY_AND_ASSIGN(RE2);
RE2(const RE2&);
void operator=(const RE2&);
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 99e72e5..7b1b174 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -60,30 +60,29 @@ bool Regexp::QuickDestroy() {
return false;
}
-static map<Regexp*, int> *ref_map;
-GLOBAL_MUTEX(ref_mutex);
+// Lazily allocated.
+static Mutex* ref_mutex;
+static map<Regexp*, int>* ref_map;
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
- GLOBAL_MUTEX_LOCK(ref_mutex);
- int r = 0;
- if (ref_map != NULL) {
- r = (*ref_map)[this];
- }
- GLOBAL_MUTEX_UNLOCK(ref_mutex);
- return r;
+ MutexLock l(ref_mutex);
+ return (*ref_map)[this];
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
- // Store ref count in overflow map.
- GLOBAL_MUTEX_LOCK(ref_mutex);
- if (ref_map == NULL) {
+ static std::once_flag ref_once;
+ std::call_once(ref_once, []() {
+ ref_mutex = new Mutex;
ref_map = new map<Regexp*, int>;
- }
+ });
+
+ // Store ref count in overflow map.
+ MutexLock l(ref_mutex);
if (ref_ == kMaxRef) {
// already overflowed
(*ref_map)[this]++;
@@ -92,7 +91,6 @@ Regexp* Regexp::Incref() {
(*ref_map)[this] = kMaxRef;
ref_ = kMaxRef;
}
- GLOBAL_MUTEX_UNLOCK(ref_mutex);
return this;
}
@@ -104,7 +102,7 @@ Regexp* Regexp::Incref() {
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
- GLOBAL_MUTEX_LOCK(ref_mutex);
+ MutexLock l(ref_mutex);
int r = (*ref_map)[this] - 1;
if (r < kMaxRef) {
ref_ = static_cast<uint16>(r);
@@ -112,7 +110,6 @@ void Regexp::Decref() {
} else {
(*ref_map)[this] = r;
}
- GLOBAL_MUTEX_UNLOCK(ref_mutex);
return;
}
ref_--;
diff --git a/re2/regexp.h b/re2/regexp.h
index 5f222b7..54720eb 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -185,10 +185,10 @@ class RegexpStatus {
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
- void set_code(enum RegexpStatusCode code) { code_ = code; }
+ void set_code(RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
- enum RegexpStatusCode code() const { return code_; }
+ RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
@@ -197,14 +197,14 @@ class RegexpStatus {
// Returns text equivalent of code, e.g.:
// "Bad character class"
- static string CodeText(enum RegexpStatusCode code);
+ static string CodeText(RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
- enum RegexpStatusCode code_; // Kind of error
+ RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
@@ -314,7 +314,7 @@ class Regexp {
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_ != 0; }
- enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
+ ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
Regexp** sub() {
@@ -628,8 +628,6 @@ inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
}
-
-
} // namespace re2
#endif // RE2_REGEXP_H__
diff --git a/re2/set.cc b/re2/set.cc
index a1a84ba..0e626a2 100644
--- a/re2/set.cc
+++ b/re2/set.cc
@@ -96,7 +96,8 @@ bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
LOG(DFATAL) << "RE2::Set::Match without Compile";
return false;
}
- v->clear();
+ if (v != NULL)
+ v->clear();
bool failed;
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
Prog::kManyMatch, NULL, &failed, v);
@@ -105,7 +106,7 @@ bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
if (ret == false)
return false;
- if (v->size() == 0) {
+ if (v != NULL && v->empty()) {
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
return false;
}
diff --git a/re2/set.h b/re2/set.h
index e48d8cb..5bcdb70 100644
--- a/re2/set.h
+++ b/re2/set.h
@@ -36,7 +36,7 @@ class RE2::Set {
bool Compile();
// Match returns true if text matches any of the regexps in the set.
- // If so, it fills v with the indices of the matching regexps.
+ // If so, it fills v (if not NULL) with the indices of the matching regexps.
bool Match(const StringPiece& text, vector<int>* v) const;
private:
diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc
index 2141c53..95c14c2 100644
--- a/re2/testing/backtrack.cc
+++ b/re2/testing/backtrack.cc
@@ -55,10 +55,14 @@ class Backtracker {
StringPiece* submatch, int nsubmatch);
private:
- // Explores from instruction ip at string position p looking for a match.
+ // Explores from instruction id at string position p looking for a match.
// Returns true if found (so that caller can stop trying other possibilities).
bool Visit(int id, const char* p);
+ // Tries instruction id at string position p.
+ // Returns true if a match is found.
+ bool Try(int id, const char* p);
+
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
@@ -143,7 +147,7 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
return false;
}
-// Explores from instruction ip at string position p looking for a match.
+// Explores from instruction id at string position p looking for a match.
// Return true if found (so that caller can stop trying other possibilities).
bool Backtracker::Visit(int id, const char* p) {
// Check bitmap. If we've already explored from here,
@@ -156,6 +160,20 @@ bool Backtracker::Visit(int id, const char* p) {
return false;
visited_[n/32] |= 1 << (n&31);
+ Prog::Inst* ip = prog_->inst(id);
+ if (Try(id, p)) {
+ if (longest_ && !ip->last())
+ Visit(id+1, p);
+ return true;
+ }
+ if (!ip->last())
+ return Visit(id+1, p);
+ return false;
+}
+
+// Tries instruction id at string position p.
+// Returns true if a match is found.
+bool Backtracker::Try(int id, const char* p) {
// Pick out byte at current position. If at end of string,
// have to explore in hope of finishing a match. Use impossible byte -1.
int c = -1;
@@ -168,15 +186,9 @@ bool Backtracker::Visit(int id, const char* p) {
LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
return false; // not reached
- case kInstAlt:
case kInstAltMatch:
- // Try both possible next states: out is preferred to out1.
- if (Visit(ip->out(), p)) {
- if (longest_)
- Visit(ip->out1(), p);
- return true;
- }
- return Visit(ip->out1(), p);
+ // Ignored.
+ return false;
case kInstByteRange:
if (ip->Matches(c))
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index dee90a3..05e7ea8 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -27,82 +27,89 @@ struct Test {
static Test tests[] = {
{ "a",
- "1. byte [61-61] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4. match! 0\n" },
{ "ab",
- "1. byte [61-61] -> 2\n"
- "2. byte [62-62] -> 3\n"
- "3. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4. byte [62-62] -> 5\n"
+ "5. match! 0\n" },
{ "a|c",
- "3. alt -> 1 | 2\n"
- "1. byte [61-61] -> 4\n"
- "2. byte [63-63] -> 4\n"
- "4. match! 0\n" },
+ "3+ byte [61-61] -> 5\n"
+ "4. byte [63-63] -> 5\n"
+ "5. match! 0\n" },
{ "a|b",
- "1. byte [61-62] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-62] -> 4\n"
+ "4. match! 0\n" },
{ "[ab]",
- "1. byte [61-62] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-62] -> 4\n"
+ "4. match! 0\n" },
{ "a+",
- "1. byte [61-61] -> 2\n"
- "2. alt -> 1 | 3\n"
- "3. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4+ nop -> 3\n"
+ "5. match! 0\n" },
{ "a+?",
- "1. byte [61-61] -> 2\n"
- "2. alt -> 3 | 1\n"
- "3. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4+ match! 0\n"
+ "5. nop -> 3\n" },
{ "a*",
- "2. alt -> 1 | 3\n"
- "1. byte [61-61] -> 2\n"
- "3. match! 0\n" },
+ "3+ byte [61-61] -> 3\n"
+ "4. match! 0\n" },
{ "a*?",
- "2. alt -> 3 | 1\n"
- "3. match! 0\n"
- "1. byte [61-61] -> 2\n" },
+ "3+ match! 0\n"
+ "4. byte [61-61] -> 3\n" },
{ "a?",
- "2. alt -> 1 | 3\n"
- "1. byte [61-61] -> 3\n"
- "3. match! 0\n" },
+ "3+ byte [61-61] -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
{ "a??",
- "2. alt -> 3 | 1\n"
- "3. match! 0\n"
- "1. byte [61-61] -> 3\n" },
+ "3+ nop -> 5\n"
+ "4. byte [61-61] -> 5\n"
+ "5. match! 0\n" },
{ "a{4}",
- "1. byte [61-61] -> 2\n"
- "2. byte [61-61] -> 3\n"
"3. byte [61-61] -> 4\n"
"4. byte [61-61] -> 5\n"
- "5. match! 0\n" },
+ "5. byte [61-61] -> 6\n"
+ "6. byte [61-61] -> 7\n"
+ "7. match! 0\n" },
{ "(a)",
- "2. capture 2 -> 1\n"
- "1. byte [61-61] -> 3\n"
- "3. capture 3 -> 4\n"
- "4. match! 0\n" },
+ "3. capture 2 -> 4\n"
+ "4. byte [61-61] -> 5\n"
+ "5. capture 3 -> 6\n"
+ "6. match! 0\n" },
{ "(?:a)",
- "1. byte [61-61] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [61-61] -> 4\n"
+ "4. match! 0\n" },
{ "",
- "2. match! 0\n" },
+ "3. match! 0\n" },
{ ".",
- "3. alt -> 1 | 2\n"
- "1. byte [00-09] -> 4\n"
- "2. byte [0b-ff] -> 4\n"
- "4. match! 0\n" },
+ "3+ byte [00-09] -> 5\n"
+ "4. byte [0b-ff] -> 5\n"
+ "5. match! 0\n" },
{ "[^ab]",
- "5. alt -> 3 | 4\n"
- "3. alt -> 1 | 2\n"
- "4. byte [63-ff] -> 6\n"
- "1. byte [00-09] -> 6\n"
- "2. byte [0b-60] -> 6\n"
+ "3+ byte [00-09] -> 6\n"
+ "4+ byte [0b-60] -> 6\n"
+ "5. byte [63-ff] -> 6\n"
"6. match! 0\n" },
{ "[Aa]",
- "1. byte/i [61-61] -> 2\n"
- "2. match! 0\n" },
+ "3. byte/i [61-61] -> 4\n"
+ "4. match! 0\n" },
+ { "\\C+",
+ "3. byte [00-ff] -> 4\n"
+ "4+ altmatch -> 5 | 6\n"
+ "5+ nop -> 3\n"
+ "6. match! 0\n" },
+ { "\\C*",
+ "3+ altmatch -> 4 | 5\n"
+ "4+ byte [00-ff] -> 3\n"
+ "5. match! 0\n" },
+ { "\\C?",
+ "3+ byte [00-ff] -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
// Issue 20992936
{ "[[-`]",
- "1. byte [5b-60] -> 2\n"
- "2. match! 0\n" },
+ "3. byte [5b-60] -> 4\n"
+ "4. match! 0\n" },
};
TEST(TestRegexpCompileToProg, Simple) {
@@ -172,6 +179,18 @@ TEST(TestCompile, ByteRanges) {
re->Decref();
}
+TEST(TestCompile, InsufficientMemory) {
+ Regexp* re = Regexp::Parse(
+ "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
+ Regexp::LikePerl, NULL);
+ EXPECT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(920);
+ // If the memory budget has been exhausted, compilation should fail
+ // and return NULL instead of trying to do anything with NoMatch().
+ EXPECT_TRUE(prog == NULL);
+ re->Decref();
+}
+
static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
string* forward, string* reverse) {
Regexp* re = Regexp::Parse(pattern, flags, NULL);
@@ -201,60 +220,51 @@ TEST(TestCompile, Bug26705922) {
string forward, reverse;
Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
- EXPECT_EQ("4. byte [f0-f0] -> 3\n"
- "3. byte [90-90] -> 2\n"
- "2. byte [80-80] -> 6\n"
- "6. alt -> 1 | 5\n"
- "1. byte [80-80] -> 7\n"
- "5. byte [90-90] -> 7\n"
- "7. match! 0\n",
+ EXPECT_EQ("3. byte [f0-f0] -> 4\n"
+ "4. byte [90-90] -> 5\n"
+ "5. byte [80-80] -> 6\n"
+ "6+ byte [80-80] -> 8\n"
+ "7. byte [90-90] -> 8\n"
+ "8. match! 0\n",
forward);
- EXPECT_EQ("6. alt -> 4 | 5\n"
- "4. byte [80-80] -> 3\n"
- "5. byte [90-90] -> 3\n"
- "3. byte [80-80] -> 2\n"
- "2. byte [90-90] -> 1\n"
- "1. byte [f0-f0] -> 7\n"
- "7. match! 0\n",
+ EXPECT_EQ("3+ byte [80-80] -> 5\n"
+ "4. byte [90-90] -> 5\n"
+ "5. byte [80-80] -> 6\n"
+ "6. byte [90-90] -> 7\n"
+ "7. byte [f0-f0] -> 8\n"
+ "8. match! 0\n",
reverse);
Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
- EXPECT_EQ("6. alt -> 3 | 5\n"
- "3. byte [e8-ef] -> 2\n"
- "5. byte [f0-f0] -> 4\n"
- "2. byte [80-bf] -> 1\n"
- "4. byte [90-90] -> 2\n"
- "1. byte [80-bf] -> 7\n"
- "7. match! 0\n",
+ EXPECT_EQ("3+ byte [e8-ef] -> 5\n"
+ "4. byte [f0-f0] -> 8\n"
+ "5. byte [80-bf] -> 6\n"
+ "6. byte [80-bf] -> 7\n"
+ "7. match! 0\n"
+ "8. byte [90-90] -> 5\n",
forward);
- EXPECT_EQ("3. byte [80-bf] -> 2\n"
- "2. byte [80-bf] -> 6\n"
- "6. alt -> 1 | 5\n"
- "1. byte [e8-ef] -> 7\n"
- "5. byte [90-90] -> 4\n"
+ EXPECT_EQ("3. byte [80-bf] -> 4\n"
+ "4. byte [80-bf] -> 5\n"
+ "5+ byte [e8-ef] -> 7\n"
+ "6. byte [90-90] -> 8\n"
"7. match! 0\n"
- "4. byte [f0-f0] -> 7\n",
+ "8. byte [f0-f0] -> 7\n",
reverse);
Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
- EXPECT_EQ("2. byte [80-bf] -> 8\n"
- "8. alt -> 5 | 7\n"
- "5. alt -> 1 | 4\n"
- "7. byte [80-bf] -> 17\n"
- "1. byte [c2-df] -> 18\n"
- "4. byte [a0-bf] -> 3\n"
- "17. alt -> 14 | 16\n"
- "18. match! 0\n"
- "3. byte [e0-e0] -> 18\n"
- "14. alt -> 11 | 13\n"
- "16. byte [80-8f] -> 15\n"
- "11. alt -> 6 | 10\n"
- "13. byte [80-bf] -> 12\n"
- "15. byte [f4-f4] -> 18\n"
- "6. byte [e1-ef] -> 18\n"
- "10. byte [90-bf] -> 9\n"
- "12. byte [f1-f3] -> 18\n"
- "9. byte [f0-f0] -> 18\n",
+ EXPECT_EQ("3. byte [80-bf] -> 4\n"
+ "4+ byte [c2-df] -> 7\n"
+ "5+ byte [a0-bf] -> 8\n"
+ "6. byte [80-bf] -> 9\n"
+ "7. match! 0\n"
+ "8. byte [e0-e0] -> 7\n"
+ "9+ byte [e1-ef] -> 7\n"
+ "10+ byte [90-bf] -> 13\n"
+ "11+ byte [80-bf] -> 14\n"
+ "12. byte [80-8f] -> 15\n"
+ "13. byte [f0-f0] -> 7\n"
+ "14. byte [f1-f3] -> 7\n"
+ "15. byte [f4-f4] -> 7\n",
reverse);
}
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 5a32d92..81d5fef 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -879,11 +879,16 @@ TEST(RE2, FloatingPointFullMatchTypes) {
// number, since C does not guarantee to get the correctly
// rounded answer for strtod and strtof unless the input is
// short.
+ //
+ // This is known to fail on Cygwin and MinGW due to a broken
+ // implementation of strtof(3). Sigh.
+#if !defined(__CYGWIN__) && !defined(__MINGW32__)
CHECK(RE2::FullMatch("0.1", "(.*)", &v));
CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
CHECK_EQ(v, 6700000000081920.1f)
<< StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
+#endif
}
{
double v;
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index 8d80b4c..8318e28 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -349,6 +349,51 @@ BENCHMARK_RANGE(Search_Success1_Cached_PCRE, 8, 16<<20)->ThreadRange(1, NumC
BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
BENCHMARK_RANGE(Search_Success1_Cached_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
+// Benchmark: AltMatch optimisation (just to verify that it works)
+// Note that OnePass doesn't implement it!
+
+void SearchAltMatch(int iters, int nbytes, SearchImpl* search) {
+ StopBenchmarkTiming();
+ string s;
+ MakeText(&s, nbytes);
+ BenchmarkMemoryUsage();
+ StartBenchmarkTiming();
+ search(iters, "\\C*", s, Prog::kAnchored, true);
+ SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+void Search_AltMatch_DFA(int i, int n) { SearchAltMatch(i, n, SearchDFA); }
+void Search_AltMatch_NFA(int i, int n) { SearchAltMatch(i, n, SearchNFA); }
+void Search_AltMatch_OnePass(int i, int n) { SearchAltMatch(i, n, SearchOnePass); }
+void Search_AltMatch_BitState(int i, int n) { SearchAltMatch(i, n, SearchBitState); }
+void Search_AltMatch_PCRE(int i, int n) { SearchAltMatch(i, n, SearchPCRE); }
+void Search_AltMatch_RE2(int i, int n) { SearchAltMatch(i, n, SearchRE2); }
+
+BENCHMARK_RANGE(Search_AltMatch_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_OnePass, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_BitState, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_AltMatch_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_AltMatch_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+void Search_AltMatch_CachedDFA(int i, int n) { SearchAltMatch(i, n, SearchCachedDFA); }
+void Search_AltMatch_CachedNFA(int i, int n) { SearchAltMatch(i, n, SearchCachedNFA); }
+void Search_AltMatch_CachedOnePass(int i, int n) { SearchAltMatch(i, n, SearchCachedOnePass); }
+void Search_AltMatch_CachedBitState(int i, int n) { SearchAltMatch(i, n, SearchCachedBitState); }
+void Search_AltMatch_CachedPCRE(int i, int n) { SearchAltMatch(i, n, SearchCachedPCRE); }
+void Search_AltMatch_CachedRE2(int i, int n) { SearchAltMatch(i, n, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_AltMatch_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_CachedOnePass, 8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_AltMatch_CachedBitState, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_AltMatch_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
// Benchmark: use regexp to find phone number.
void SearchDigits(int iters, SearchImpl* search) {
diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc
index 3ab2ae3..08e5b4c 100644
--- a/re2/testing/search_test.cc
+++ b/re2/testing/search_test.cc
@@ -264,7 +264,7 @@ RegexpTest simple_tests[] = {
{ "[A-Z]+", "aAzZ" },
{ "[^\\\\]+", "Aa\\" },
{ "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
-
+
// Anchoring. (^abc in aabcdef was a former bug)
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
@@ -297,7 +297,12 @@ RegexpTest simple_tests[] = {
{ "a", "a" },
{ "ab*", "a" },
{ "a\\C*", "a" },
-
+ { "a\\C+", "a" },
+ { "a\\C?", "a" },
+ { "a\\C*?", "a" },
+ { "a\\C+?", "a" },
+ { "a\\C??", "a" },
+
// Former bugs.
{ "a\\C*|ba\\C", "baba" },
};
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 4e267ae..c613d6a 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -18,21 +18,22 @@ TEST(Set, Unanchored) {
CHECK_EQ(s.Add("foo", NULL), 0);
CHECK_EQ(s.Add("(", NULL), -1);
CHECK_EQ(s.Add("bar", NULL), 1);
-
CHECK_EQ(s.Compile(), true);
+ CHECK_EQ(s.Match("foobar", NULL), true);
+ CHECK_EQ(s.Match("fooba", NULL), true);
+ CHECK_EQ(s.Match("oobar", NULL), true);
+
vector<int> v;
CHECK_EQ(s.Match("foobar", &v), true);
CHECK_EQ(v.size(), 2);
CHECK_EQ(v[0], 0);
CHECK_EQ(v[1], 1);
- v.clear();
CHECK_EQ(s.Match("fooba", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 0);
- v.clear();
CHECK_EQ(s.Match("oobar", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 1);
@@ -44,27 +45,28 @@ TEST(Set, UnanchoredFactored) {
CHECK_EQ(s.Add("foo", NULL), 0);
CHECK_EQ(s.Add("(", NULL), -1);
CHECK_EQ(s.Add("foobar", NULL), 1);
-
CHECK_EQ(s.Compile(), true);
+ CHECK_EQ(s.Match("foobar", NULL), true);
+ CHECK_EQ(s.Match("obarfoobaroo", NULL), true);
+ CHECK_EQ(s.Match("fooba", NULL), true);
+ CHECK_EQ(s.Match("oobar", NULL), false);
+
vector<int> v;
CHECK_EQ(s.Match("foobar", &v), true);
CHECK_EQ(v.size(), 2);
CHECK_EQ(v[0], 0);
CHECK_EQ(v[1], 1);
- v.clear();
CHECK_EQ(s.Match("obarfoobaroo", &v), true);
CHECK_EQ(v.size(), 2);
CHECK_EQ(v[0], 0);
CHECK_EQ(v[1], 1);
- v.clear();
CHECK_EQ(s.Match("fooba", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 0);
- v.clear();
CHECK_EQ(s.Match("oobar", &v), false);
CHECK_EQ(v.size(), 0);
}
@@ -75,6 +77,8 @@ TEST(Set, UnanchoredDollar) {
CHECK_EQ(s.Add("foo$", NULL), 0);
CHECK_EQ(s.Compile(), true);
+ CHECK_EQ(s.Match("foo", NULL), true);
+
vector<int> v;
CHECK_EQ(s.Match("foo", &v), true);
CHECK_EQ(v.size(), 1);
@@ -87,9 +91,14 @@ TEST(Set, Anchored) {
CHECK_EQ(s.Add("foo", NULL), 0);
CHECK_EQ(s.Add("(", NULL), -1);
CHECK_EQ(s.Add("bar", NULL), 1);
-
CHECK_EQ(s.Compile(), true);
+ CHECK_EQ(s.Match("foobar", NULL), false);
+ CHECK_EQ(s.Match("fooba", NULL), false);
+ CHECK_EQ(s.Match("oobar", NULL), false);
+ CHECK_EQ(s.Match("foo", NULL), true);
+ CHECK_EQ(s.Match("bar", NULL), true);
+
vector<int> v;
CHECK_EQ(s.Match("foobar", &v), false);
CHECK_EQ(v.size(), 0);
@@ -114,11 +123,13 @@ TEST(Set, EmptyUnanchored) {
CHECK_EQ(s.Compile(), true);
+ CHECK_EQ(s.Match("", NULL), false);
+ CHECK_EQ(s.Match("foobar", NULL), false);
+
vector<int> v;
CHECK_EQ(s.Match("", &v), false);
CHECK_EQ(v.size(), 0);
- v.clear();
CHECK_EQ(s.Match("foobar", &v), false);
CHECK_EQ(v.size(), 0);
}
@@ -128,13 +139,38 @@ TEST(Set, EmptyAnchored) {
CHECK_EQ(s.Compile(), true);
+ CHECK_EQ(s.Match("", NULL), false);
+ CHECK_EQ(s.Match("foobar", NULL), false);
+
vector<int> v;
CHECK_EQ(s.Match("", &v), false);
CHECK_EQ(v.size(), 0);
- v.clear();
CHECK_EQ(s.Match("foobar", &v), false);
CHECK_EQ(v.size(), 0);
}
+TEST(Set, Prefix) {
+ RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
+
+ CHECK_EQ(s.Add("/prefix/\\d*", NULL), 0);
+ CHECK_EQ(s.Compile(), true);
+
+ CHECK_EQ(s.Match("/prefix", NULL), false);
+ CHECK_EQ(s.Match("/prefix/", NULL), true);
+ CHECK_EQ(s.Match("/prefix/42", NULL), true);
+
+ vector<int> v;
+ CHECK_EQ(s.Match("/prefix", &v), false);
+ CHECK_EQ(v.size(), 0);
+
+ CHECK_EQ(s.Match("/prefix/", &v), true);
+ CHECK_EQ(v.size(), 1);
+ CHECK_EQ(v[0], 0);
+
+ CHECK_EQ(s.Match("/prefix/42", &v), true);
+ CHECK_EQ(v.size(), 1);
+ CHECK_EQ(v[0], 0);
+}
+
} // namespace re2
diff --git a/util/mutex.h b/util/mutex.h
index b479e48..8a13226 100644
--- a/util/mutex.h
+++ b/util/mutex.h
@@ -208,27 +208,6 @@ class WriterMutexLock {
#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
-// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
-#if HAVE_PTHREAD
-
-#define GLOBAL_MUTEX(name) \
- static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
-#define GLOBAL_MUTEX_LOCK(name) \
- pthread_mutex_lock(&(name))
-#define GLOBAL_MUTEX_UNLOCK(name) \
- pthread_mutex_unlock(&(name))
-
-#else
-
-#define GLOBAL_MUTEX(name) \
- static Mutex name
-#define GLOBAL_MUTEX_LOCK(name) \
- name.Lock()
-#define GLOBAL_MUTEX_UNLOCK(name) \
- name.Unlock()
-
-#endif
-
} // namespace re2
#endif /* #define RE2_UTIL_MUTEX_H_ */
diff --git a/util/util.h b/util/util.h
index 8c5d922..7dfab93 100644
--- a/util/util.h
+++ b/util/util.h
@@ -31,6 +31,7 @@
#include <utility>
#include <set>
#include <atomic>
+#include <mutex> // For std::call_once
// Use std names.
using std::set;