summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:58:22 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:58:22 +0900
commit11b439be08e713284f09eba5547142b6ea2924f3 (patch)
tree6e8aab0313cb718030050a0e2bfe1acdb6a21dd0
parent2e30f7d1bb7894979dafc4fcd68dd8664c5584ab (diff)
downloadre2-11b439be08e713284f09eba5547142b6ea2924f3.tar.gz
re2-11b439be08e713284f09eba5547142b6ea2924f3.tar.bz2
re2-11b439be08e713284f09eba5547142b6ea2924f3.zip
Imported Upstream version 20160601upstream/20160601
Change-Id: I89d6c9dac409aeeb940c58029f34a1292d4cb0a1 Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r--BUILD1
-rw-r--r--Makefile32
-rw-r--r--re2.pc6
-rw-r--r--re2/bitstate.cc10
-rw-r--r--re2/compile.cc42
-rw-r--r--re2/dfa.cc23
-rw-r--r--re2/fuzzing/re2_fuzzer.cc106
-rw-r--r--re2/nfa.cc226
-rw-r--r--re2/onepass.cc114
-rw-r--r--re2/prog.cc131
-rw-r--r--re2/prog.h23
-rw-r--r--re2/re2.cc5
-rw-r--r--re2/re2.h95
-rw-r--r--re2/testing/compile_test.cc27
-rw-r--r--re2/variadic_function.h344
-rw-r--r--util/fuzz.cc21
-rw-r--r--util/test.cc2
-rw-r--r--util/util.h16
18 files changed, 513 insertions, 711 deletions
diff --git a/BUILD b/BUILD
index 4fd529f..3376943 100644
--- a/BUILD
+++ b/BUILD
@@ -58,7 +58,6 @@ cc_library(
"re2/re2.h",
"re2/set.h",
"re2/stringpiece.h",
- "re2/variadic_function.h",
],
copts = ["-pthread"],
linkopts = ["-pthread"],
diff --git a/Makefile b/Makefile
index d0e2090..d498296 100644
--- a/Makefile
+++ b/Makefile
@@ -28,13 +28,20 @@ NMFLAGS?=-p
# http://www.gnu.org/prep/standards/standards.html
prefix=/usr/local
exec_prefix=$(prefix)
-bindir=$(exec_prefix)/bin
includedir=$(prefix)/include
libdir=$(exec_prefix)/lib
INSTALL=install
-INSTALL_PROGRAM=$(INSTALL)
INSTALL_DATA=$(INSTALL) -m 644
+# Work around the weirdness of sed(1) on Darwin. :/
+ifeq ($(shell uname),Darwin)
+SED_INPLACE=sed -i ''
+else ifeq ($(shell uname),SunOS)
+SED_INPLACE=sed -i
+else
+SED_INPLACE=sed -i
+endif
+
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
SONAME=0
@@ -43,6 +50,7 @@ SONAME=0
# access for Unicode data), uncomment the following line:
# REBUILD_TABLES=1
+# The SunOS linker does not support wildcards. :(
ifeq ($(shell uname),Darwin)
SOEXT=dylib
SOEXTVER=$(SONAME).$(SOEXT)
@@ -52,7 +60,7 @@ else ifeq ($(shell uname),SunOS)
SOEXT=so
SOEXTVER=$(SOEXT).$(SONAME)
SOEXTVER00=$(SOEXT).$(SONAME).0.0
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER) $(RE2_LDFLAGS) $(LDFLAGS)
else
SOEXT=so
SOEXTVER=$(SOEXT).$(SONAME)
@@ -67,7 +75,6 @@ INSTALL_HFILES=\
re2/re2.h\
re2/set.h\
re2/stringpiece.h\
- re2/variadic_function.h\
HFILES=\
util/benchmark.h\
@@ -97,7 +104,6 @@ HFILES=\
re2/testing/tester.h\
re2/unicode_casefold.h\
re2/unicode_groups.h\
- re2/variadic_function.h\
re2/walker-inl.h\
OFILES=\
@@ -216,6 +222,14 @@ obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TES
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+# re2_fuzzer is a target for fuzzers like libFuzzer and AFL. This fake fuzzing
+# is simply a way to check that the target builds and then to run it against a
+# fixed set of inputs. To perform real fuzzing, refer to the documentation for
+# libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/).
+obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o
+ @mkdir -p obj/test
+ $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
ifdef REBUILD_TABLES
re2/perl_groups.cc: re2/make_perl_groups.pl
perl $< > $@
@@ -266,6 +280,8 @@ shared-bigtest: $(STESTS) $(SBIGTESTS)
benchmark: obj/test/regexp_benchmark
+fuzz: obj/test/re2_fuzzer
+
install: obj/libre2.a obj/so/libre2.$(SOEXT)
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
@@ -273,7 +289,11 @@ install: obj/libre2.a obj/so/libre2.$(SOEXT)
$(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
- sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@prefix@#${prefix}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@exec_prefix@#${exec_prefix}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@includedir@#${includedir}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@libdir@#${libdir}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
testinstall: static-testinstall shared-testinstall
@echo
diff --git a/re2.pc b/re2.pc
index 91ba181..d66cf51 100644
--- a/re2.pc
+++ b/re2.pc
@@ -1,7 +1,7 @@
prefix=@prefix@
-exec_prefix=${prefix}
-includedir=${prefix}/include
-libdir=${exec_prefix}/lib
+exec_prefix=@exec_prefix@
+includedir=@includedir@
+libdir=@libdir@
Name: re2
Description: RE2 is a fast, safe, thread-friendly regular expression engine.
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index 53fa219..775fbec 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -345,15 +345,6 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
return TrySearch(prog_->start(), text.begin());
}
- // Shallow first byte analysis. We could go deeper.
- int fb = -1;
- Prog::Inst* ip = prog_->inst(prog_->start());
- if (ip->opcode() == kInstByteRange &&
- ip->lo() == ip->hi() &&
- !(ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') &&
- ip->last())
- fb = ip->lo();
-
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
@@ -362,6 +353,7 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// so no work is duplicated and it ends up still being linear.
for (const char* p = text.begin(); p <= text.end(); p++) {
// Try to use memchr to find the first byte quickly.
+ int fb = prog_->first_byte();
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL)
diff --git a/re2/compile.cc b/re2/compile.cc
index 334ee4c..117679f 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -178,9 +178,6 @@ class Compiler : public Regexp::Walker<Frag> {
// Returns -1 if no more instructions are available.
int AllocInst(int n);
- // Deletes unused instructions.
- void Trim();
-
// Rune range compiler.
// Begins a new alternation.
@@ -291,16 +288,6 @@ int Compiler::AllocInst(int n) {
return id;
}
-void Compiler::Trim() {
- if (inst_len_ < inst_cap_) {
- Prog::Inst* ip = new Prog::Inst[inst_len_];
- memmove(ip, inst_, inst_len_ * sizeof ip[0]);
- delete[] inst_;
- inst_ = ip;
- inst_cap_ = inst_len_;
- }
-}
-
// These routines are somewhat hard to visualize in text --
// see http://swtch.com/~rsc/regexp/regexp1.html for
// pictures explaining what is going on here.
@@ -409,15 +396,6 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
if (id < 0)
return NoMatch();
inst_[id].InitByteRange(lo, hi, foldcase, 0);
- prog_->MarkByteRange(lo, hi);
- if (foldcase && lo <= 'z' && hi >= 'a') {
- if (lo < 'a')
- lo = 'a';
- if (hi > 'z')
- hi = 'z';
- if (lo <= hi)
- prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
- }
return Frag(id, PatchList::Mk(id << 1));
}
@@ -445,19 +423,6 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
if (id < 0)
return NoMatch();
inst_[id].InitEmptyWidth(empty, 0);
- if (empty & (kEmptyBeginLine|kEmptyEndLine))
- prog_->MarkByteRange('\n', '\n');
- if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
- int j;
- for (int i = 0; i < 256; i = j) {
- for (j = i + 1; j < 256 &&
- Prog::IsWordChar(static_cast<uint8>(i)) ==
- Prog::IsWordChar(static_cast<uint8>(j));
- j++)
- ;
- prog_->MarkByteRange(i, j-1);
- }
- }
return Frag(id, PatchList::Mk(id << 1));
}
@@ -1223,17 +1188,14 @@ Prog* Compiler::Finish() {
inst_len_ = 1;
}
- // Trim instruction to minimum array and transfer to Prog.
- Trim();
+ // Hand off the array to Prog.
prog_->inst_ = inst_;
prog_->size_ = inst_len_;
inst_ = NULL;
- // Compute byte map.
- prog_->ComputeByteMap();
-
prog_->Optimize();
prog_->Flatten();
+ prog_->ComputeByteMap();
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
diff --git a/re2/dfa.cc b/re2/dfa.cc
index b4aac30..e30aeaf 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -1789,23 +1789,6 @@ bool DFA::Search(const StringPiece& text,
return ret;
}
-// Deletes dfa.
-//
-// This is a separate function so that
-// prog.h can be used without moving the definition of
-// class DFA out of this file. If you (atomically) set
-// prog_->dfa_first_ = dfa;
-// or
-// prog_->dfa_longest_ = dfa;
-// then you also have to set
-// prog_->delete_dfa_ = DeleteDFA;
-// so that ~Prog can delete the DFA.
-static void DeleteDFA(std::atomic<DFA*>* pdfa) {
- DFA* dfa = pdfa->load(std::memory_order_relaxed);
- if (dfa != NULL)
- delete dfa;
-}
-
DFA* Prog::GetDFA(MatchKind kind) {
std::atomic<DFA*>* pdfa;
if (kind == kFirstMatch || kind == kManyMatch) {
@@ -1837,13 +1820,17 @@ DFA* Prog::GetDFA(MatchKind kind) {
m = 0;
}
dfa = new DFA(this, kind, m);
- delete_dfa_ = DeleteDFA;
// Synchronize with "quick check" above.
pdfa->store(dfa, std::memory_order_release);
return dfa;
}
+void Prog::DeleteDFA(std::atomic<DFA*>* pdfa) {
+ DFA* dfa = pdfa->load(std::memory_order_relaxed);
+ if (dfa != NULL)
+ delete dfa;
+}
// Executes the regexp program to search in text,
// which itself is inside the larger context. (As a convenience,
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
new file mode 100644
index 0000000..b4e6bb6
--- /dev/null
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -0,0 +1,106 @@
+// Copyright 2016 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+#include "re2/re2.h"
+#include "util/logging.h"
+
+using re2::FLAGS_minloglevel;
+using re2::StringPiece;
+using std::map;
+using std::string;
+
+// NOT static, NOT signed.
+uint8_t dummy = 0;
+
+void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
+ RE2 re(pattern, options);
+ if (!re.ok())
+ return;
+
+ // Don't waste time fuzzing high-fanout programs.
+ map<int, int> histogram;
+ int fanout = re.ProgramFanout(&histogram);
+ if (fanout > 10)
+ return;
+
+ StringPiece sp1, sp2, sp3, sp4;
+ string s1, s2, s3, s4;
+ int i1, i2, i3, i4;
+ double d1, d2, d3, d4;
+
+ RE2::FullMatch(text, re, &sp1, &sp2, &sp3, &sp4);
+ RE2::PartialMatch(text, re, &s1, &s2, &s3, &s4);
+
+ sp1 = sp2 = text;
+ RE2::Consume(&sp1, re, &i1, &i2, &i3, &i4);
+ RE2::FindAndConsume(&sp2, re, &d1, &d2, &d3, &d4);
+
+ s3 = s4 = text.ToString();
+ RE2::Replace(&s3, re, "");
+ RE2::GlobalReplace(&s4, re, "");
+
+ // Exercise some other API functionality.
+ dummy += re.NumberOfCapturingGroups();
+ dummy += RE2::QuoteMeta(pattern).size();
+}
+
+// Entry point for libFuzzer.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ if (size == 0)
+ return 0;
+
+ // Suppress logging below FATAL severity.
+ FLAGS_minloglevel = 3;
+
+ // The one-at-a-time hash by Bob Jenkins.
+ uint32_t hash = 0;
+ for (size_t i = 0; i < size; i++) {
+ hash += data[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ RE2::Options options;
+ options.set_log_errors(false);
+ options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
+ : RE2::Options::EncodingUTF8);
+ options.set_posix_syntax(hash & 2);
+ options.set_longest_match(hash & 4);
+ options.set_literal(hash & 8);
+ options.set_never_nl(hash & 16);
+ options.set_dot_nl(hash & 32);
+ options.set_never_capture(hash & 64);
+ options.set_case_sensitive(hash & 128);
+ options.set_perl_classes(hash & 256);
+ options.set_word_boundary(hash & 512);
+ options.set_one_line(hash & 1024);
+
+ const char* ptr = reinterpret_cast<const char*>(data);
+ int len = static_cast<int>(size);
+
+ StringPiece pattern(ptr, len);
+ StringPiece text(ptr, len);
+ Test(pattern, options, text);
+
+ for (int i = 2; i <= 4; i++) {
+ if (len < i)
+ break;
+
+ int frac = len / i;
+ pattern.set(ptr, frac);
+ text.set(ptr + frac, len - frac);
+ Test(pattern, options, text);
+ }
+
+ return 0;
+}
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 16bbe31..f084e60 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -55,22 +55,24 @@ class NFA {
private:
struct Thread {
- Thread* next; // when on free list
+ union {
+ int ref;
+ Thread* next; // when on free list
+ };
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
- int id; // Inst to process
- int j;
- const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
+ int id; // Inst to process
+ Thread* t; // if not null, set t0 = t before processing id
AddState()
- : id(0), j(-1), cap_j(NULL) {}
+ : id(0), t(NULL) {}
explicit AddState(int id)
- : id(id), j(-1), cap_j(NULL) {}
- AddState(int id, const char* cap_j, int j)
- : id(id), j(j), cap_j(cap_j) {}
+ : id(id), t(NULL) {}
+ AddState(int id, Thread* t)
+ : id(id), t(t) {}
};
// Threadq is a list of threads. The list is sorted by the order
@@ -79,19 +81,24 @@ class NFA {
typedef SparseArray<Thread*> Threadq;
inline Thread* AllocThread();
- inline void FreeThread(Thread*);
+ inline Thread* Incref(Thread* t);
+ inline void Decref(Thread* t);
- // Add id (or its children, following unlabeled arrows)
- // to the workqueue q with associated capture info.
- void AddToThreadq(Threadq* q, int id, int flag,
- const char* p, const char** capture);
+ // Follows all empty arrows from id0 and enqueues all the states reached.
+ // Enqueues only the ByteRange instructions that match byte c.
+ // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+ // p is the current input position, and t0 is the current thread.
+ void AddToThreadq(Threadq* q, int id0, int c, int flag,
+ const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
- // p is position of the next byte (the one after c)
- // in the input string, used when processing capturing parens.
- // flag is the bitwise or of Bol, Eol, etc., specifying whether
- // ^, $ and \b match the current input point (after c).
+ // p is the position of the previous byte (the one before c)
+ // in the input string, used when processing Match instructions.
+ // flag is the bitwise OR of Bol, Eol, etc., specifying whether
+ // ^, $ and \b match the current input position (after c).
+ // Frees all the threads on runq.
+ // If there is a shortcut to the end, returns that shortcut.
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Returns text version of capture information, for debugging.
@@ -99,10 +106,6 @@ class NFA {
inline void CopyCapture(const char** dst, const char** src);
- // Computes whether all matches must begin with the same first
- // byte, and if so, returns that byte. If not, returns -1.
- int ComputeFirstByte();
-
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
@@ -115,7 +118,6 @@ class NFA {
bool matched_; // any match so far?
AddState* astack_; // pre-allocated for AddToThreadq
int nastack_;
- int first_byte_; // required first byte for match, or -1 if none
Thread* free_threads_; // free list
@@ -140,7 +142,6 @@ NFA::NFA(Prog* prog) {
match_ = NULL;
matched_ = false;
free_threads_ = NULL;
- first_byte_ = ComputeFirstByte();
}
NFA::~NFA() {
@@ -154,24 +155,36 @@ NFA::~NFA() {
}
}
-void NFA::FreeThread(Thread *t) {
- if (t == NULL)
- return;
- t->next = free_threads_;
- free_threads_ = t;
-}
-
NFA::Thread* NFA::AllocThread() {
Thread* t = free_threads_;
if (t == NULL) {
t = new Thread;
+ t->ref = 1;
t->capture = new const char*[ncapture_];
return t;
}
free_threads_ = t->next;
+ t->ref = 1;
+ return t;
+}
+
+NFA::Thread* NFA::Incref(Thread* t) {
+ DCHECK(t != NULL);
+ t->ref++;
return t;
}
+void NFA::Decref(Thread* t) {
+ if (t == NULL)
+ return;
+ t->ref--;
+ if (t->ref > 0)
+ return;
+ DCHECK_EQ(t->ref, 0);
+ t->next = free_threads_;
+ free_threads_ = t;
+}
+
void NFA::CopyCapture(const char** dst, const char** src) {
for (int i = 0; i < ncapture_; i+=2) {
dst[i] = src[i];
@@ -180,11 +193,11 @@ void NFA::CopyCapture(const char** dst, const char** src) {
}
// Follows all empty arrows from id0 and enqueues all the states reached.
+// Enqueues only the ByteRange instructions that match byte c.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
-// The pointer p is the current input position, and m is the
-// current set of match boundaries.
-void NFA::AddToThreadq(Threadq* q, int id0, int flag,
- const char* p, const char** capture) {
+// p is the current input position, and t0 is the current thread.
+void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
+ const char* p, Thread* t0) {
if (id0 == 0)
return;
@@ -204,15 +217,19 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
AddState a = stk[--nstk];
Loop:
- if (a.j >= 0)
- capture[a.j] = a.cap_j;
+ if (a.t != NULL) {
+ // t0 was a thread that we allocated and copied in order to
+ // record the capture, so we must now decref it.
+ Decref(t0);
+ t0 = a.t;
+ }
int id = a.id;
if (id == 0)
continue;
if (q->has_index(id)) {
if (Debug)
- fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
+ fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
continue;
}
@@ -235,8 +252,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
case kInstAltMatch:
// Save state; will pick up at next byte.
- t = AllocThread();
- CopyCapture(t->capture, capture);
+ t = Incref(t0);
*tp = t;
DCHECK(!ip->last());
@@ -256,25 +272,32 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
stk[nstk++] = AddState(id+1);
if ((j=ip->cap()) < ncapture_) {
- // Push a dummy whose only job is to restore capture[j]
+ // Push a dummy whose only job is to restore t0
// once we finish exploring this possibility.
- stk[nstk++] = AddState(0, capture[j], j);
+ stk[nstk++] = AddState(0, t0);
// Record capture.
- capture[j] = p;
+ t = AllocThread();
+ CopyCapture(t->capture, t0->capture);
+ t->capture[j] = p;
+ t0 = t;
}
a = AddState(ip->out());
goto Loop;
- case kInstMatch:
case kInstByteRange:
+ if (!ip->Matches(c))
+ goto Next;
+ // Fallthrough intended.
+
+ case kInstMatch:
// Save state; will pick up at next byte.
- t = AllocThread();
- CopyCapture(t->capture, capture);
+ t = Incref(t0);
*tp = t;
if (Debug)
- fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
+ fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
+ Next:
if (ip->last())
break;
a = AddState(id+1);
@@ -294,11 +317,11 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
}
// Run runq on byte c, appending new states to nextq.
-// Updates match as new, better matches are found.
-// p is position of the byte c in the input string,
-// used when processing capturing parens.
-// flag is the bitwise or of Bol, Eol, etc., specifying whether
-// ^, $ and \b match the current input point (after c).
+// Updates matched_ and match_ as new, better matches are found.
+// p is the position of the previous byte (the one before c)
+// in the input string, used when processing Match instructions.
+// flag is the bitwise OR of Bol, Eol, etc., specifying whether
+// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
@@ -312,7 +335,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
- FreeThread(t);
+ Decref(t);
continue;
}
}
@@ -327,8 +350,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
break;
case kInstByteRange:
- if (ip->Matches(c))
- AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
+ AddToThreadq(nextq, ip->out(), c, flag, p+1, t);
break;
case kInstAltMatch:
@@ -337,11 +359,12 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture(match_, t->capture);
- FreeThread(t);
+ matched_ = true;
+
+ Decref(t);
for (++i; i != runq->end(); ++i)
- FreeThread(i->second);
+ Decref(i->second);
runq->clear();
- matched_ = true;
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
@@ -352,33 +375,35 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
if (endmatch_ && p != etext_)
break;
- t->capture[1] = p;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
- (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
+ (t->capture[0] == match_[0] && p > match_[1])) {
CopyCapture(match_, t->capture);
+ match_[1] = p;
+ matched_ = true;
+ }
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture(match_, t->capture);
+ match_[1] = p;
+ matched_ = true;
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
- FreeThread(t);
+ Decref(t);
for (++i; i != runq->end(); ++i)
- FreeThread(i->second);
+ Decref(i->second);
runq->clear();
- matched_ = true;
return 0;
}
- matched_ = true;
break;
}
- FreeThread(t);
+ Decref(t);
}
runq->clear();
return 0;
@@ -454,7 +479,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
match_ = new const char*[ncapture_];
matched_ = false;
- memset(match_, 0, ncapture_*sizeof match_[0]);
// For debugging prints.
btext_ = context.begin();
@@ -471,14 +495,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
- const char* bp = context.begin();
- int c = -1;
int wasword = 0;
- if (text.begin() > context.begin()) {
- c = text.begin()[-1] & 0xFF;
- wasword = Prog::IsWordChar(static_cast<uint8>(c));
- }
+ if (text.begin() > context.begin())
+ wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
@@ -508,7 +528,15 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
flag |= kEmptyNonWordBoundary;
if (Debug) {
- fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
+ int c = 0;
+ if (p == context.begin())
+ c = '^';
+ else if (p > text.end())
+ c = '$';
+ else if (p < text.end())
+ c = p[0] & 0xFF;
+
+ fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
@@ -518,11 +546,12 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
fprintf(stderr, "\n");
}
- // Process previous character (waited until now to avoid
- // repeating the flag computation above).
- // This is a no-op the first time around the loop, because
- // runq is empty.
- int id = Step(runq, nextq, c, flag, p-1);
+ // Note that we pass p-1 to Step() because it needs the previous pointer
+ // value in order to handle Match instructions appropriately. It plumbs
+ // c and flag through to AddToThreadq() along with p-1+1, which is p.
+ //
+ // This is a no-op the first time around the loop because runq is empty.
+ int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p-1);
DCHECK_EQ(runq->size(), 0);
swap(nextq, runq);
nextq->clear();
@@ -567,10 +596,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
- if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
- p < text.end() && (p[0] & 0xFF) != first_byte_) {
- p = reinterpret_cast<const char*>(memchr(p, first_byte_,
- text.end() - p));
+ int fb = prog_->first_byte();
+ if (!anchored && runq->size() == 0 &&
+ fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
+ p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
@@ -580,8 +609,11 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
flag = Prog::EmptyFlags(context, p);
}
- match_[0] = p;
- AddToThreadq(runq, start_, flag, p, match_);
+ Thread* t = AllocThread();
+ CopyCapture(t->capture, match_);
+ t->capture[0] = p;
+ AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
+ Decref(t);
}
// If all the threads have died, stop early.
@@ -591,17 +623,11 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
break;
}
- if (p == text.end())
- c = 0;
- else
- c = *p & 0xFF;
wasword = isword;
-
- // Will run step(runq, nextq, c, ...) on next iteration. See above.
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
- FreeThread(i->second);
+ Decref(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
@@ -619,27 +645,19 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
-int NFA::ComputeFirstByte() {
- if (start_ == 0)
- return -1;
-
- int b = -1; // first byte, not yet computed
-
- typedef SparseSet Workq;
- Workq q(prog_->size());
- q.insert(start_);
- for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
+int Prog::ComputeFirstByte() {
+ int b = -1;
+ SparseSet q(size());
+ q.insert(start());
+ for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
- Prog::Inst* ip = prog_->inst(id);
+ Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
- if (!ip->last())
- q.insert(id+1);
-
// The empty string matches: no first byte.
return -1;
diff --git a/re2/onepass.cc b/re2/onepass.cc
index 977b0e7..1d745d4 100644
--- a/re2/onepass.cc
+++ b/re2/onepass.cc
@@ -392,9 +392,12 @@ bool Prog::IsOnePass() {
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
- int size = this->size();
- InstCond *stack = new InstCond[size];
+ int stacksize = inst_count(kInstCapture) +
+ inst_count(kInstEmptyWidth) +
+ inst_count(kInstNop) + 1; // + 1 for start inst
+ InstCond* stack = new InstCond[stacksize];
+ int size = this->size();
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
@@ -424,8 +427,10 @@ bool Prog::IsOnePass() {
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
- Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
+
+ Loop:
+ Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
@@ -438,26 +443,16 @@ bool Prog::IsOnePass() {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
- stack[nstack].id = id+1;
- stack[nstack++].cond = cond;
- break;
+ id = id+1;
+ goto Loop;
case kInstByteRange: {
- if (!ip->last()) {
- // If already on work queue, (1) is violated: bail out.
- if (!AddQ(&workq, id+1))
- goto fail;
- stack[nstack].id = id+1;
- stack[nstack++].cond = cond;
- }
-
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (Debug)
- LOG(ERROR)
- << StringPrintf("Not OnePass: hit node limit %d > %d",
- nalloc, maxnodes);
+ LOG(ERROR) << StringPrintf(
+ "Not OnePass: hit node limit %d > %d", nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
@@ -466,21 +461,21 @@ bool Prog::IsOnePass() {
nalloc++;
AddQ(&tovisit, ip->out());
}
- if (matched)
- cond |= kMatchWins;
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
- c = unbytemap_[b]; // last c in byte class
+ // Skip any bytes immediately after c that are also in b.
+ while (c < 256-1 && bytemap_[c+1] == b)
+ c++;
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
+ if (matched)
+ newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
- LOG(ERROR)
- << StringPrintf("Not OnePass: conflict on byte "
- "%#x at state %d",
- c, *it);
+ LOG(ERROR) << StringPrintf(
+ "Not OnePass: conflict on byte %#x at state %d", c, *it);
}
goto fail;
}
@@ -490,23 +485,32 @@ bool Prog::IsOnePass() {
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
- c = unbytemap_[b]; // last c in class
+ // Skip any bytes immediately after c that are also in b.
+ while (c < 256-1 && bytemap_[c+1] == b)
+ c++;
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
+ if (matched)
+ newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
- LOG(ERROR)
- << StringPrintf("Not OnePass: conflict on byte "
- "%#x at state %d",
- c, *it);
+ LOG(ERROR) << StringPrintf(
+ "Not OnePass: conflict on byte %#x at state %d", c, *it);
}
goto fail;
}
}
}
- break;
+
+ if (ip->last())
+ break;
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ id = id+1;
+ goto Loop;
}
case kInstCapture:
@@ -534,36 +538,33 @@ bool Prog::IsOnePass() {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (Debug) {
- LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
- " %d -> %d\n",
- *it, ip->out());
+ LOG(ERROR) << StringPrintf(
+ "Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
}
goto fail;
}
- stack[nstack].id = ip->out();
- stack[nstack++].cond = cond;
- break;
+ id = ip->out();
+ goto Loop;
case kInstMatch:
- if (!ip->last()) {
- // If already on work queue, (1) is violated: bail out.
- if (!AddQ(&workq, id+1))
- goto fail;
- stack[nstack].id = id+1;
- stack[nstack++].cond = cond;
- }
-
if (matched) {
// (3) is violated
if (Debug) {
- LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
- " from %d\n", *it);
+ LOG(ERROR) << StringPrintf(
+ "Not OnePass: multiple matches from %d\n", *it);
}
goto fail;
}
matched = true;
node->matchcond = cond;
- break;
+
+ if (ip->last())
+ break;
+ // If already on work queue, (1) is violated: bail out.
+ if (!AddQ(&workq, id+1))
+ goto fail;
+ id = id+1;
+ goto Loop;
case kInstFail:
break;
@@ -572,28 +573,21 @@ bool Prog::IsOnePass() {
}
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
- string dump = "prog dump:\n" + Dump() + "node dump\n";
+ LOG(ERROR) << "bytemap:\n" << DumpByteMap();
+ LOG(ERROR) << "prog:\n" << Dump();
+
map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
- StringAppendF(&dump, "byte ranges:\n");
- int i = 0;
- for (int b = 0; b < bytemap_range_; b++) {
- int lo = i;
- while (bytemap_[i] == b)
- i++;
- StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
- }
-
+ string dump;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
- continue;
+ continue;
OneState* node = IndexToNode(nodes, statesize, nodeindex);
- string s;
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
@@ -605,7 +599,7 @@ bool Prog::IsOnePass() {
idmap[node->action[i] >> kIndexShift]);
}
}
- LOG(ERROR) << dump;
+ LOG(ERROR) << "nodes:\n" << dump;
}
// Overallocated earlier; cut down to actual size.
diff --git a/re2/prog.cc b/re2/prog.cc
index 15f10e1..9098dd4 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -99,26 +99,22 @@ Prog::Prog()
start_unanchored_(0),
size_(0),
bytemap_range_(0),
+ first_byte_(-1),
flags_(0),
onepass_statesize_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
- delete_dfa_(NULL),
- unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
}
Prog::~Prog() {
- if (delete_dfa_ != NULL) {
- delete_dfa_(&dfa_first_);
- delete_dfa_(&dfa_longest_);
- }
+ DeleteDFA(&dfa_first_);
+ DeleteDFA(&dfa_longest_);
delete[] onepass_nodes_;
delete[] inst_;
- delete[] unbytemap_;
}
typedef SparseSet Workq;
@@ -154,23 +150,12 @@ static string FlattenedProgToString(Prog* prog, int start) {
}
string Prog::Dump() {
- string map;
- if (false) { // Debugging
- int lo = 0;
- StringAppendF(&map, "byte map:\n");
- for (int i = 0; i < bytemap_range_; i++) {
- StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
- lo = unbytemap_[i] + 1;
- }
- StringAppendF(&map, "\n");
- }
-
if (did_flatten_)
- return map + FlattenedProgToString(this, start_);
+ return FlattenedProgToString(this, start_);
Workq q(size_);
AddToQueue(&q, start_);
- return map + ProgToString(this, &q);
+ return ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
@@ -182,6 +167,26 @@ string Prog::DumpUnanchored() {
return ProgToString(this, &q);
}
+string Prog::DumpByteMap() {
+ string map;
+ for (int c = 0; c < 256; c++) {
+ int b = bytemap_[c];
+ int lo = c;
+ while (c < 256-1 && bytemap_[c+1] == b)
+ c++;
+ int hi = c;
+ StringAppendF(&map, "[%02x-%02x] -> %d\n", lo, hi, b);
+ }
+ return map;
+}
+
+int Prog::first_byte() {
+ std::call_once(first_byte_once_, [this]() {
+ first_byte_ = ComputeFirstByte();
+ });
+ return first_byte_;
+}
+
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
@@ -308,46 +313,72 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
return flags;
}
-void Prog::MarkByteRange(int lo, int hi) {
- DCHECK_GE(lo, 0);
- DCHECK_GE(hi, 0);
- DCHECK_LE(lo, 255);
- DCHECK_LE(hi, 255);
- DCHECK_LE(lo, hi);
- if (0 < lo && lo <= 255)
- byterange_.Set(lo - 1);
- if (0 <= hi && hi <= 255)
- byterange_.Set(hi);
-}
-
void Prog::ComputeByteMap() {
- // Fill in bytemap with byte classes for prog_.
- // Ranges of bytes that are treated as indistinguishable
- // by the regexp program are mapped to a single byte class.
- // The vector prog_->byterange() marks the end of each
- // such range.
- const Bitmap<256>& v = byterange();
+ // Fill in byte map with byte classes for the program.
+ // Ranges of bytes that are treated indistinguishably
+ // are mapped to a single byte class.
+ Bitmap<256> v;
+
+ // Don't repeat the work for \b and \B.
+ bool done_word_boundaries = false;
+
+ for (int id = 0; id < static_cast<int>(size()); id++) {
+ Inst* ip = inst(id);
+ if (ip->opcode() == kInstByteRange) {
+ int lo = ip->lo();
+ int hi = ip->hi();
+ if (0 < lo)
+ v.Set(lo - 1);
+ v.Set(hi);
+ if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
+ if (lo < 'a')
+ lo = 'a';
+ if (hi > 'z')
+ hi = 'z';
+ if (lo <= hi) {
+ v.Set(lo + 'A' - 'a' - 1);
+ v.Set(hi + 'A' - 'a');
+ }
+ }
+ } else if (ip->opcode() == kInstEmptyWidth) {
+ if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine)) {
+ v.Set('\n' - 1);
+ v.Set('\n');
+ }
+ if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
+ if (done_word_boundaries)
+ continue;
+ int j;
+ for (int i = 0; i < 256; i = j) {
+ for (j = i + 1; j < 256 &&
+ Prog::IsWordChar(static_cast<uint8>(i)) ==
+ Prog::IsWordChar(static_cast<uint8>(j));
+ j++)
+ ;
+ if (0 < i)
+ v.Set(i - 1);
+ v.Set(j - 1);
+ }
+ done_word_boundaries = true;
+ }
+ }
+ }
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
- if ((i&31) == 0)
+ if ((i & 31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
}
bytemap_range_ = bytemap_[255] + 1;
- unbytemap_ = new uint8[bytemap_range_];
- for (int i = 0; i < 256; i++)
- unbytemap_[bytemap_[i]] = static_cast<uint8>(i);
if (0) { // For debugging: use trivial byte map.
- for (int i = 0; i < 256; i++) {
+ for (int i = 0; i < 256; i++)
bytemap_[i] = static_cast<uint8>(i);
- unbytemap_[i] = static_cast<uint8>(i);
- }
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
@@ -377,12 +408,12 @@ void Prog::Flatten() {
for (SparseArray<int>::const_iterator i = rootmap.begin();
i != rootmap.end();
++i) {
- flatmap[i->value()] = flat.size();
+ flatmap[i->value()] = static_cast<int>(flat.size());
EmitList(i->index(), &rootmap, &flat, &q, &stk);
flat.back().set_last();
}
- list_count_ = flatmap.size();
+ list_count_ = static_cast<int>(flatmap.size());
for (int i = 0; i < kNumInst; i++)
inst_count_[i] = 0;
@@ -412,7 +443,7 @@ void Prog::Flatten() {
}
// Finally, replace the old instructions with the new instructions.
- size_ = flat.size();
+ size_ = static_cast<int>(flat.size());
delete[] inst_;
inst_ = new Inst[size_];
memmove(inst_, flat.data(), size_ * sizeof *inst_);
@@ -503,8 +534,8 @@ void Prog::EmitList(int root, SparseArray<int>* rootmap, vector<Inst>* flat,
case kInstAltMatch:
flat->emplace_back();
flat->back().set_opcode(kInstAltMatch);
- flat->back().set_out(flat->size());
- flat->back().out1_ = flat->size()+1;
+ flat->back().set_out(static_cast<int>(flat->size()));
+ flat->back().out1_ = static_cast<uint32>(flat->size())+1;
// Fall through.
case kInstAlt:
diff --git a/re2/prog.h b/re2/prog.h
index 3e2d809..d5c8616 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -218,7 +218,6 @@ class Prog {
void set_reversed(bool reversed) { reversed_ = reversed; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
- const Bitmap<256>& byterange() { return byterange_; }
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
@@ -230,15 +229,13 @@ class Prog {
int bytemap_range() { return bytemap_range_; }
const uint8* bytemap() { return bytemap_; }
+ // Lazily computed.
+ int first_byte();
+
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
-
- // Record that at some point in the prog, the bytes in the range
- // lo-hi (inclusive) are treated as different from bytes outside the range.
- // Tracking this lets the DFA collapse commonly-treated byte ranges
- // when recording state pointers, greatly reducing its memory footprint.
- void MarkByteRange(int lo, int hi);
+ string DumpByteMap();
// Returns the set of kEmpty flags that are in effect at
// position p within context.
@@ -297,6 +294,10 @@ class Prog {
// Compute byte map.
void ComputeByteMap();
+ // Computes whether all matches must begin with the same first
+ // byte, and if so, returns that byte. If not, returns -1.
+ int ComputeFirstByte();
+
// Run peep-hole optimizer on program.
void Optimize();
@@ -370,6 +371,7 @@ class Prog {
friend class Compiler;
DFA* GetDFA(MatchKind kind);
+ void DeleteDFA(std::atomic<DFA*>* pdfa);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
@@ -381,6 +383,7 @@ class Prog {
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
+ int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags
int onepass_statesize_; // byte size of each OneState* node
@@ -393,16 +396,14 @@ class Prog {
std::atomic<DFA*> dfa_first_; // DFA cached for kFirstMatch
std::atomic<DFA*> dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
- void (*delete_dfa_)(std::atomic<DFA*>* pdfa);
- Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
- // commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
- uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
uint8* onepass_nodes_; // data for OnePass nodes
OneState* onepass_start_; // start node for OnePass program
+ std::once_flag first_byte_once_;
+
DISALLOW_COPY_AND_ASSIGN(Prog);
};
diff --git a/re2/re2.cc b/re2/re2.cc
index 492ffaf..6bc5edd 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -26,11 +26,6 @@ namespace re2 {
static const int kMaxArgs = 16;
static const int kVecSize = 1+kMaxArgs;
-const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch = {};
-const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch = {};
-const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume = {};
-const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume = {};
-
// This will trigger LNK2005 error in MSVC.
#ifndef _MSC_VER
const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
diff --git a/re2/re2.h b/re2/re2.h
index 4a8c5c8..56fde7f 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -184,7 +184,6 @@
#include <mutex>
#include <string>
#include "re2/stringpiece.h"
-#include "re2/variadic_function.h"
#ifndef RE2_HAVE_LONGLONG
#define RE2_HAVE_LONGLONG 1
@@ -251,7 +250,7 @@ class RE2 {
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
- RE2(const StringPiece& pattern, const Options& option);
+ RE2(const StringPiece& pattern, const Options& options);
~RE2();
// Returns whether RE2 was created properly.
@@ -290,11 +289,11 @@ class RE2 {
/***** The useful part: the matching interface *****/
- // Matches "text" against "pattern". If pointer arguments are
+ // Matches "text" against "re". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
- // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
+ // You can pass in a "const char*" or a "string" or a "RE2" for "re".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
@@ -304,7 +303,7 @@ class RE2 {
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
- // a. "text" matches "pattern" exactly
+ // a. "text" matches "re" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
@@ -320,32 +319,60 @@ class RE2 {
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
- static const VariadicFunction2<
- bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
- // Exactly like FullMatch(), except that "pattern" is allowed to match
+ // Exactly like FullMatch(), except that "re" is allowed to match
// a substring of "text".
- static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
+ static bool PartialMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
- static const VariadicFunction2<
- bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
- // Like FullMatch() and PartialMatch(), except that pattern has to
- // match a prefix of "text", and "input" is advanced past the matched
+ // Like FullMatch() and PartialMatch(), except that "re" has to match
+ // a prefix of the text, and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
- static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
+ static bool ConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int argc);
- static const VariadicFunction2<
- bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
-
- // Like Consume(..), but does not anchor the match at the beginning of the
- // string. That is, "pattern" need not start its match at the beginning of
- // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
- // word in "s" and stores it in "word".
- static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
- const Arg* const args[], int argc);
- static const VariadicFunction2<
- bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
+
+ // Like Consume(), but does not anchor the match at the beginning of
+ // the text. That is, "re" need not start its match at the beginning
+ // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
+ // the next word in "s" and stores it in "word".
+ static bool FindAndConsumeN(StringPiece* input, const RE2& re,
+ const Arg* const args[], int argc);
+
+#ifndef SWIG
+ private:
+ template <typename F, typename SP, typename... A>
+ static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
+ const Arg* const args[] = {&a...};
+ const int argc = sizeof...(a);
+ return f(sp, re, args, argc);
+ }
+
+ public:
+ // In order to allow FullMatch() et al. to be called with a varying number
+ // of arguments of varying types, we use two layers of variadic templates.
+ // The first layer constructs the temporary Arg objects. The second layer
+ // (above) constructs the array of pointers to the temporary Arg objects.
+
+ template <typename... A>
+ static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
+ return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
+ }
+
+ template <typename... A>
+ static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
+ return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
+ }
+
+ template <typename... A>
+ static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
+ return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
+ }
+
+ template <typename... A>
+ static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
+ return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
+ }
+#endif
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
@@ -623,19 +650,7 @@ class RE2 {
void set_one_line(bool b) { one_line_ = b; }
void Copy(const Options& src) {
- encoding_ = src.encoding_;
- posix_syntax_ = src.posix_syntax_;
- longest_match_ = src.longest_match_;
- log_errors_ = src.log_errors_;
- max_mem_ = src.max_mem_;
- literal_ = src.literal_;
- never_nl_ = src.never_nl_;
- dot_nl_ = src.dot_nl_;
- never_capture_ = src.never_capture_;
- case_sensitive_ = src.case_sensitive_;
- perl_classes_ = src.perl_classes_;
- word_boundary_ = src.word_boundary_;
- one_line_ = src.one_line_;
+ *this = src;
}
int ParseFlags() const;
@@ -654,10 +669,6 @@ class RE2 {
bool perl_classes_;
bool word_boundary_;
bool one_line_;
-
- //DISALLOW_COPY_AND_ASSIGN(Options);
- Options(const Options&);
- void operator=(const Options&);
};
// Returns the options set in the constructor.
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 05e7ea8..0c598e0 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -143,6 +143,29 @@ TEST(TestRegexpCompileToProg, Simple) {
EXPECT_EQ(failed, 0);
}
+// The distinct byte ranges involved in the Latin-1 dot ([^\n]).
+static struct Latin1ByteRange {
+ int lo;
+ int hi;
+} latin1ranges[] = {
+ { 0x00, 0x09 },
+ { 0x0A, 0x0A },
+ { 0x0B, 0xFF },
+};
+
+TEST(TestCompile, Latin1Ranges) {
+ Regexp* re = Regexp::Parse(".", Regexp::PerlX|Regexp::Latin1, NULL);
+ EXPECT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(0);
+ EXPECT_TRUE(prog != NULL);
+ EXPECT_EQ(prog->bytemap_range(), arraysize(latin1ranges));
+ for (int i = 0; i < arraysize(latin1ranges); i++)
+ for (int j = latin1ranges[i].lo; j <= latin1ranges[i].hi; j++)
+ EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
+ delete prog;
+ re->Decref();
+}
+
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
@@ -152,7 +175,7 @@ static struct UTF8ByteRange {
} utf8ranges[] = {
{ 0x00, 0x09 },
{ 0x0A, 0x0A },
- { 0x10, 0x7F },
+ { 0x0B, 0x7F },
{ 0x80, 0x8F },
{ 0x90, 0x9F },
{ 0xA0, 0xBF },
@@ -166,7 +189,7 @@ static struct UTF8ByteRange {
{ 0xF5, 0xFF },
};
-TEST(TestCompile, ByteRanges) {
+TEST(TestCompile, UTF8Ranges) {
Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
EXPECT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(0);
diff --git a/re2/variadic_function.h b/re2/variadic_function.h
deleted file mode 100644
index 7c7d6d5..0000000
--- a/re2/variadic_function.h
+++ /dev/null
@@ -1,344 +0,0 @@
-// Copyright 2010 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef RE2_VARIADIC_FUNCTION_H_
-#define RE2_VARIADIC_FUNCTION_H_
-
-namespace re2 {
-
-template <typename Result, typename Param0, typename Param1, typename Arg,
- Result (*Func)(Param0, Param1, const Arg* const [], int count)>
-class VariadicFunction2 {
- public:
- Result operator()(Param0 p0, Param1 p1) const {
- return Func(p0, p1, 0, 0);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
- const Arg* const args[] = { &a0 };
- return Func(p0, p1, args, 1);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
- const Arg* const args[] = { &a0, &a1 };
- return Func(p0, p1, args, 2);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2) const {
- const Arg* const args[] = { &a0, &a1, &a2 };
- return Func(p0, p1, args, 3);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3 };
- return Func(p0, p1, args, 4);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
- return Func(p0, p1, args, 5);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
- return Func(p0, p1, args, 6);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
- return Func(p0, p1, args, 7);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
- return Func(p0, p1, args, 8);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
- return Func(p0, p1, args, 9);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9 };
- return Func(p0, p1, args, 10);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10 };
- return Func(p0, p1, args, 11);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11 };
- return Func(p0, p1, args, 12);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12 };
- return Func(p0, p1, args, 13);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13 };
- return Func(p0, p1, args, 14);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14 };
- return Func(p0, p1, args, 15);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15 };
- return Func(p0, p1, args, 16);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
- return Func(p0, p1, args, 17);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
- return Func(p0, p1, args, 18);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
- return Func(p0, p1, args, 19);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
- return Func(p0, p1, args, 20);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
- &a20 };
- return Func(p0, p1, args, 21);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21 };
- return Func(p0, p1, args, 22);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22 };
- return Func(p0, p1, args, 23);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23 };
- return Func(p0, p1, args, 24);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24 };
- return Func(p0, p1, args, 25);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25 };
- return Func(p0, p1, args, 26);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
- const Arg& a26) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25, &a26 };
- return Func(p0, p1, args, 27);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
- const Arg& a26, const Arg& a27) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25, &a26, &a27 };
- return Func(p0, p1, args, 28);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
- const Arg& a26, const Arg& a27, const Arg& a28) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
- return Func(p0, p1, args, 29);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
- const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
- return Func(p0, p1, args, 30);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
- const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
- const Arg& a30) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
- return Func(p0, p1, args, 31);
- }
-
- Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
- const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
- const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
- const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
- const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
- const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
- const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
- const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
- const Arg& a30, const Arg& a31) const {
- const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
- &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
- &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
- return Func(p0, p1, args, 32);
- }
-};
-
-} // namespace re2
-
-#endif // RE2_VARIADIC_FUNCTION_H_
diff --git a/util/fuzz.cc b/util/fuzz.cc
new file mode 100644
index 0000000..9cac118
--- /dev/null
+++ b/util/fuzz.cc
@@ -0,0 +1,21 @@
+// Copyright 2016 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+// Entry point for libFuzzer.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
+
+int main(int argc, char** argv) {
+ uint8_t data[32];
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 32; j++) {
+ data[j] = random() & 0xFF;
+ }
+ LLVMFuzzerTestOneInput(data, 32);
+ }
+ return 0;
+}
diff --git a/util/test.cc b/util/test.cc
index b0167e7..0a751fe 100644
--- a/util/test.cc
+++ b/util/test.cc
@@ -23,7 +23,7 @@ void RegisterTest(void (*fn)(void), const char *name) {
tests[ntests++].name = name;
}
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
for (int i = 0; i < ntests; i++) {
printf("%s\n", tests[i].name);
tests[i].fn();
diff --git a/util/util.h b/util/util.h
index 7dfab93..1e78aa3 100644
--- a/util/util.h
+++ b/util/util.h
@@ -32,6 +32,7 @@
#include <set>
#include <atomic>
#include <mutex> // For std::call_once
+#include <unordered_set>
// Use std names.
using std::set;
@@ -46,22 +47,7 @@ using std::stack;
using std::sort;
using std::swap;
using std::make_pair;
-
-#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION)
-
-#include <tr1/unordered_set>
-using std::tr1::unordered_set;
-
-#else
-
-#include <unordered_set>
-#if defined(_WIN32)
-using std::tr1::unordered_set;
-#else
using std::unordered_set;
-#endif
-
-#endif
#ifdef _WIN32