13 files changed, 199 insertions, 111 deletions
diff --git a/BUILD b/BUILD
index 1e2d19c..4fd529f 100644
--- a/BUILD
+++ b/BUILD
@@ -61,13 +61,12 @@ cc_library(
         "re2/variadic_function.h",
     ],
     copts = ["-pthread"],
-    includes = ["."],
     linkopts = ["-pthread"],
     visibility = ["//visibility:public"],
 )
 
 cc_library(
-    name = "test",
+    name = "testing",
     testonly = 1,
     srcs = [
         "re2/testing/backtrack.cc",
@@ -79,7 +78,6 @@ cc_library(
         "re2/testing/tester.cc",
         "util/pcre.cc",
         "util/random.cc",
-        "util/test.cc",
         "util/thread.cc",
     ],
     hdrs = [
@@ -87,15 +85,21 @@ cc_library(
         "re2/testing/regexp_generator.h",
         "re2/testing/string_generator.h",
         "re2/testing/tester.h",
+        "util/benchmark.h",
         "util/pcre.h",
         "util/random.h",
         "util/test.h",
         "util/thread.h",
     ],
-    includes = ["."],
     deps = [":re2"],
 )
 
+cc_library(
+    name = "test",
+    srcs = ["util/test.cc"],
+    deps = [":testing"],
+)
+
 load("re2_test", "re2_test")
 
 re2_test("charclass_test")
@@ -156,4 +160,18 @@ re2_test(
     size = "large",
 )
 
-# TODO: Add support for regexp_benchmark.
+cc_library(
+    name = "benchmark",
+    srcs = ["util/benchmark.cc"],
+    deps = [":testing"],
+)
+
+cc_binary(
+    name = "regexp_benchmark",
+    srcs = ["re2/testing/regexp_benchmark.cc"],
+    linkopts = [
+        "-lm",
+        "-lrt",
+    ],
+    deps = [":benchmark"],
+)
diff --git a/Makefile b/Makefile
index 4479033..d0e2090 100644
--- a/Makefile
+++ b/Makefile
@@ -13,9 +13,12 @@
 # LDPCRE=-L/usr/local/lib -lpcre
 
 CXX?=g++
-CXXFLAGS?=-std=c++11 -O3 -g -pthread  # can override
-RE2_CXXFLAGS?=-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE)  # required
-LDFLAGS?=-pthread $(LDICU)
+# can override
+CXXFLAGS?=-O3 -g
+LDFLAGS?=
+# required
+RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE)
+RE2_LDFLAGS?=-pthread $(LDICU) $(LDPCRE)
 AR?=ar
 ARFLAGS?=rsc
 NM?=nm
@@ -44,17 +47,17 @@ ifeq ($(shell uname),Darwin)
 SOEXT=dylib
 SOEXTVER=$(SONAME).$(SOEXT)
 SOEXTVER00=$(SONAME).0.0.$(SOEXT)
-MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,@rpath/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,@rpath/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS)
 else ifeq ($(shell uname),SunOS)
 SOEXT=so
 SOEXTVER=$(SOEXT).$(SONAME)
 SOEXTVER00=$(SOEXT).$(SONAME).0.0
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS)
 else
 SOEXT=so
 SOEXTVER=$(SOEXT).$(SONAME)
 SOEXTVER00=$(SOEXT).$(SONAME).0.0
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS)
 endif
 
 all: obj/libre2.a obj/so/libre2.$(SOEXT)
@@ -173,15 +176,15 @@ DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
 
 obj/%.o: %.cc $(HFILES)
 	@mkdir -p $$(dirname $@)
-	$(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
+	$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
 
 obj/dbg/%.o: %.cc $(HFILES)
 	@mkdir -p $$(dirname $@)
-	$(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
+	$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc
 
 obj/so/%.o: %.cc $(HFILES)
 	@mkdir -p $$(dirname $@)
-	$(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
+	$(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
 
 obj/libre2.a: $(OFILES)
 	@mkdir -p obj
@@ -198,20 +201,20 @@ obj/so/libre2.$(SOEXT): $(SOFILES)
 
 obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
 	@mkdir -p obj/dbg/test
-	$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
+	$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
 
 obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
 	@mkdir -p obj/test
-	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
+	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
 
 # Test the shared lib, falling back to the static lib for private symbols
 obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
 	@mkdir -p obj/so/test
-	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
+	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
 
 obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
 	@mkdir -p obj/test
-	$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
+	$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
 
 ifdef REBUILD_TABLES
 re2/perl_groups.cc: re2/make_perl_groups.pl
@@ -272,18 +275,31 @@ install: obj/libre2.a obj/so/libre2.$(SOEXT)
 	ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
 	sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc
 
-testinstall:
+testinstall: static-testinstall shared-testinstall
+	@echo
+	@echo Install tests passed.
+	@echo
+
+static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
+static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS)
+static-testinstall:
 	@mkdir -p obj
-	cp testinstall.cc obj
+	@cp testinstall.cc obj
 ifeq ($(shell uname),Darwin)
 	@echo Skipping test for libre2.a on Darwin.
 else ifeq ($(shell uname),SunOS)
 	@echo Skipping test for libre2.a on SunOS.
 else
-	(cd obj && $(CXX) $(CXXFLAGS) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -l:libre2.a -o testinstall)
+	(cd obj && $(CXX) testinstall.cc -o testinstall $(CXXFLAGS) $(LDFLAGS))
 	obj/testinstall
 endif
-	(cd obj && $(CXX) $(CXXFLAGS) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -o testinstall)
+
+shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
+shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS)
+shared-testinstall:
+	@mkdir -p obj
+	@cp testinstall.cc obj
+	(cd obj && $(CXX) testinstall.cc -o testinstall $(CXXFLAGS) $(LDFLAGS))
 	LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
 
 benchlog: obj/test/regexp_benchmark
diff --git a/README b/README
index 119f33c..3686a86 100644
--- a/README
+++ b/README
@@ -34,4 +34,4 @@ A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM.
 An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM.
 A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN.
 A Python wrapper is at https://github.com/facebook/pyre2/.
-A Ruby wrapper is at https://github.com/axic/rre2/.
+A Ruby wrapper is at https://github.com/mudge/re2/.
diff --git a/re2.pc b/re2.pc
index 82832c3..91ba181 100644
--- a/re2.pc
+++ b/re2.pc
@@ -6,5 +6,5 @@ libdir=${exec_prefix}/lib
 Name: re2
 Description: RE2 is a fast, safe, thread-friendly regular expression engine.
 Version: 0.0.0
-Cflags: -I${includedir} -pthread
-Libs: -L${libdir} -lre2 -pthread
+Cflags: -std=c++11 -pthread -I${includedir}
+Libs: -pthread -L${libdir} -lre2
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index 661a8ca..53fa219 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -345,6 +345,15 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
     return TrySearch(prog_->start(), text.begin());
   }
 
+  // Shallow first byte analysis. We could go deeper.
+  int fb = -1;
+  Prog::Inst* ip = prog_->inst(prog_->start());
+  if (ip->opcode() == kInstByteRange &&
+      ip->lo() == ip->hi() &&
+      !(ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') &&
+      ip->last())
+    fb = ip->lo();
+
   // Unanchored search, starting from each possible text position.
   // Notice that we have to try the empty string at the end of
   // the text, so the loop condition is p <= text.end(), not p < text.end().
@@ -352,6 +361,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
   // but we are not clearing visited_ between calls to TrySearch,
   // so no work is duplicated and it ends up still being linear.
   for (const char* p = text.begin(); p <= text.end(); p++) {
+    // Try to use memchr to find the first byte quickly.
+    if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
+      p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
+      if (p == NULL)
+        p = text.end();
+    }
+
     cap_[0] = p;
     if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
       return true;
diff --git a/re2/compile.cc b/re2/compile.cc
index df859ab..334ee4c 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -409,7 +409,6 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
   if (id < 0)
     return NoMatch();
   inst_[id].InitByteRange(lo, hi, foldcase, 0);
-  prog_->byte_inst_count_++;
   prog_->MarkByteRange(lo, hi);
   if (foldcase && lo <= 'z' && hi >= 'a') {
     if (lo < 'a')
@@ -604,9 +603,6 @@ int Compiler::AddSuffixRecursive(int root, int id) {
       inst_[f.begin].set_out(br);
   }
 
-  // We just saved one ByteRange instruction. :)
-  prog_->byte_inst_count_--;
-
   int out = inst_[id].out();
   if (!IsCachedRuneByteSuffix(id)) {
     // The head should be the instruction most recently allocated, so free it
diff --git a/re2/dfa.cc b/re2/dfa.cc
index ccb6065..b4aac30 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -441,8 +441,12 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
     fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str());
   int nmark = 0;
   if (kind_ == Prog::kLongestMatch)
-    nmark = prog->size();
-  nastack_ = 2 * prog->size() + nmark;
+    nmark = prog_->size();
+  // See DFA::AddToQueue() for why this is so.
+  nastack_ = prog_->inst_count(kInstCapture) +
+             prog_->inst_count(kInstEmptyWidth) +
+             prog_->inst_count(kInstNop) +
+             nmark + 1;  // + 1 for start inst
 
   // Account for space needed for DFA, q0, q1, astack.
   mem_budget_ -= sizeof(DFA);
@@ -462,9 +466,11 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
   // At minimum, the search requires room for two states in order
   // to limp along, restarting frequently.  We'll get better performance
   // if there is room for a larger number of states, say 20.
+  // Note that a state stores list heads only, so we use the program
+  // list count for the upper bound, not the program size.
   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
   int64 one_state = sizeof(State) + (nnext-1)*sizeof(std::atomic<State*>) +
-                    (prog_->size()+nmark)*sizeof(int);
+                    (prog_->list_count()+nmark)*sizeof(int);
   if (state_budget_ < 20*one_state) {
     LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",
                               prog_->size(), max_mem);
@@ -472,8 +478,8 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
     return;
   }
 
-  q0_ = new Workq(prog->size(), nmark);
-  q1_ = new Workq(prog->size(), nmark);
+  q0_ = new Workq(prog_->size(), nmark);
+  q1_ = new Workq(prog_->size(), nmark);
   astack_ = new int[nastack_];
 }
 
@@ -791,12 +797,15 @@ void DFA::StateToWorkq(State* s, Workq* q) {
 // Adds ip to the work queue, following empty arrows according to flag.
 void DFA::AddToQueue(Workq* q, int id, uint flag) {
 
-  // Use astack_ to hold our stack of states yet to process.
-  // It is sized to have room for nastack_ == 2*prog->size() + nmark
-  // instructions, which is enough: each instruction can be
-  // processed by the switch below only once, and the processing
-  // pushes at most two instructions plus maybe a mark.
-  // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.)
+  // Use astack_ to hold our stack of instructions yet to process.
+  // It was preallocated as follows:
+  //   one entry per Capture;
+  //   one entry per EmptyWidth; and
+  //   one entry per Nop.
+  // This reflects the maximum number of stack pushes that each can
+  // perform. (Each instruction can be processed at most once.)
+  // When using marks, we also added nmark == prog_->size().
+  // (Otherwise, nmark == 0.)
   int* stk = astack_;
   int nstk = 0;
 
@@ -805,6 +814,7 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
     DCHECK_LE(nstk, nastack_);
     id = stk[--nstk];
 
+  Loop:
     if (id == Mark) {
       q->mark();
       continue;
@@ -831,9 +841,10 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
 
       case kInstByteRange:  // just save these on the queue
       case kInstMatch:
-        if (!ip->last())
-          stk[nstk++] = id+1;
-        break;
+        if (ip->last())
+          break;
+        id = id+1;
+        goto Loop;
 
       case kInstCapture:    // DFA treats captures as no-ops.
       case kInstNop:
@@ -847,13 +858,13 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
         if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
             id == prog_->start_unanchored() && id != prog_->start())
           stk[nstk++] = Mark;
-        stk[nstk++] = ip->out();
-        break;
+        id = ip->out();
+        goto Loop;
 
       case kInstAltMatch:
         DCHECK(!ip->last());
-        stk[nstk++] = id+1;
-        break;
+        id = id+1;
+        goto Loop;
 
       case kInstEmptyWidth:
         if (!ip->last())
@@ -862,8 +873,8 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
         // Continue on if we have all the right flag bits.
         if (ip->empty() & ~flag)
           break;
-        stk[nstk++] = ip->out();
-        break;
+        id = ip->out();
+        goto Loop;
     }
   }
 }
@@ -1783,11 +1794,11 @@ bool DFA::Search(const StringPiece& text,
 // This is a separate function so that
 // prog.h can be used without moving the definition of
 // class DFA out of this file.  If you (atomically) set
-//   prog->dfa_first_ = dfa;
+//   prog_->dfa_first_ = dfa;
 // or
-//   prog->dfa_longest_ = dfa;
+//   prog_->dfa_longest_ = dfa;
 // then you also have to set
-//   prog->delete_dfa_ = DeleteDFA;
+//   prog_->delete_dfa_ = DeleteDFA;
 // so that ~Prog can delete the DFA.
 static void DeleteDFA(std::atomic<DFA*>* pdfa) {
   DFA* dfa = pdfa->load(std::memory_order_relaxed);
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 5232e5f..16bbe31 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -55,10 +55,7 @@ class NFA {
 
  private:
   struct Thread {
-    union {
-      int id;
-      Thread* next;  // when on free list
-    };
+    Thread* next;  // when on free list
     const char** capture;
   };
 
@@ -127,7 +124,7 @@ class NFA {
 
 NFA::NFA(Prog* prog) {
   prog_ = prog;
-  start_ = prog->start();
+  start_ = prog_->start();
   ncapture_ = 0;
   longest_ = false;
   endmatch_ = false;
@@ -135,7 +132,10 @@ NFA::NFA(Prog* prog) {
   etext_ = NULL;
   q0_.resize(prog_->size());
   q1_.resize(prog_->size());
-  nastack_ = 2*prog_->size();
+  // See NFA::AddToThreadq() for why this is so.
+  nastack_ = 2*prog_->inst_count(kInstCapture) +
+             prog_->inst_count(kInstEmptyWidth) +
+             prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
   astack_ = new AddState[nastack_];
   match_ = NULL;
   matched_ = false;
@@ -188,18 +188,22 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
   if (id0 == 0)
     return;
 
-  // Astack_ is pre-allocated to avoid resize operations.
-  // It has room for 2*prog_->size() entries, which is enough:
-  // Each inst in prog can be processed at most once,
-  // pushing at most two entries on stk.
-
-  int nstk = 0;
+  // Use astack_ to hold our stack of instructions yet to process.
+  // It was preallocated as follows:
+  //   two entries per Capture;
+  //   one entry per EmptyWidth; and
+  //   one entry per Nop.
+  // This reflects the maximum number of stack pushes that each can
+  // perform. (Each instruction can be processed at most once.)
   AddState* stk = astack_;
-  stk[nstk++] = AddState(id0);
+  int nstk = 0;
 
+  stk[nstk++] = AddState(id0);
   while (nstk > 0) {
     DCHECK_LE(nstk, nastack_);
-    const AddState& a = stk[--nstk];
+    AddState a = stk[--nstk];
+
+  Loop:
     if (a.j >= 0)
       capture[a.j] = a.cap_j;
 
@@ -230,23 +234,22 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
       break;
 
     case kInstAltMatch:
-      DCHECK(!ip->last());
-      stk[nstk++] = AddState(id+1);
-
       // Save state; will pick up at next byte.
       t = AllocThread();
-      t->id = id;
       CopyCapture(t->capture, capture);
       *tp = t;
-      break;
+
+      DCHECK(!ip->last());
+      a = AddState(id+1);
+      goto Loop;
 
     case kInstNop:
       if (!ip->last())
         stk[nstk++] = AddState(id+1);
 
       // Continue on.
-      stk[nstk++] = AddState(ip->out());
-      break;
+      a = AddState(ip->out());
+      goto Loop;
 
     case kInstCapture:
       if (!ip->last())
@@ -260,22 +263,22 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
         // Record capture.
         capture[j] = p;
       }
-      stk[nstk++] = AddState(ip->out());
-      break;
+      a = AddState(ip->out());
+      goto Loop;
 
     case kInstMatch:
     case kInstByteRange:
-      if (!ip->last())
-        stk[nstk++] = AddState(id+1);
-
       // Save state; will pick up at next byte.
       t = AllocThread();
-      t->id = id;
       CopyCapture(t->capture, capture);
       *tp = t;
       if (Debug)
         fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
-      break;
+
+      if (ip->last())
+        break;
+      a = AddState(id+1);
+      goto Loop;
 
     case kInstEmptyWidth:
       if (!ip->last())
@@ -284,8 +287,8 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
       // Continue on if we have all the right flag bits.
       if (ip->empty() & ~flag)
         break;
-      stk[nstk++] = AddState(ip->out());
-      break;
+      a = AddState(ip->out());
+      goto Loop;
     }
   }
 }
@@ -314,7 +317,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
       }
     }
 
-    int id = t->id;
+    int id = i->index();
     Prog::Inst* ip = prog_->inst(id);
 
     switch (ip->opcode()) {
@@ -333,7 +336,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
           break;
         // The match is ours if we want it.
         if (ip->greedy(prog_) || longest_) {
-          CopyCapture((const char**)match_, t->capture);
+          CopyCapture(match_, t->capture);
           FreeThread(t);
           for (++i; i != runq->end(); ++i)
             FreeThread(i->second);
@@ -349,7 +352,6 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
         if (endmatch_ && p != etext_)
           break;
 
-        const char* old = t->capture[1];  // previous end pointer
         t->capture[1] = p;
         if (longest_) {
           // Leftmost-longest mode: save this match only if
@@ -357,16 +359,15 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
           // point but longer than an existing match.
           if (!matched_ || t->capture[0] < match_[0] ||
               (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
-            CopyCapture((const char**)match_, t->capture);
+            CopyCapture(match_, t->capture);
         } else {
           // Leftmost-biased mode: this match is by definition
           // better than what we've already found (see next line).
-          CopyCapture((const char**)match_, t->capture);
+          CopyCapture(match_, t->capture);
 
           // Cut off the threads that can only find matches
           // worse than the one we just found: don't run the
           // rest of the current Threadq.
-          t->capture[0] = old;
           FreeThread(t);
           for (++i; i != runq->end(); ++i)
             FreeThread(i->second);
@@ -374,7 +375,6 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
           matched_ = true;
           return 0;
         }
-        t->capture[0] = old;
         matched_ = true;
         break;
     }
@@ -513,8 +513,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
         Thread* t = i->second;
         if (t == NULL)
           continue;
-        fprintf(stderr, " %d%s", t->id,
-                FormatCapture((const char**)t->capture).c_str());
+        fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
       }
       fprintf(stderr, "\n");
     }
@@ -581,11 +580,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
         flag = Prog::EmptyFlags(context, p);
       }
 
-      // Steal match storage (cleared but unused as of yet)
-      // temporarily to hold match boundaries for new thread.
       match_[0] = p;
       AddToThreadq(runq, start_, flag, p, match_);
-      match_[0] = NULL;
     }
 
     // If all the threads have died, stop early.
diff --git a/re2/onepass.cc b/re2/onepass.cc
index 977ed96..977b0e7 100644
--- a/re2/onepass.cc
+++ b/re2/onepass.cc
@@ -384,7 +384,7 @@ bool Prog::IsOnePass() {
   // Willing to use at most 1/4 of the DFA budget (heuristic).
   // Limit max node count to 65000 as a conservative estimate to
   // avoid overflowing 16-bit node index in encoding.
-  int maxnodes = 2 + byte_inst_count_;
+  int maxnodes = 2 + inst_count(kInstByteRange);
   int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
   if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
     return false;
diff --git a/re2/prog.cc b/re2/prog.cc
index b5f48ef..15f10e1 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -98,7 +98,6 @@ Prog::Prog()
     start_(0),
     start_unanchored_(0),
     size_(0),
-    byte_inst_count_(0),
     bytemap_range_(0),
     flags_(0),
     onepass_statesize_(0),
@@ -383,13 +382,24 @@ void Prog::Flatten() {
     flat.back().set_last();
   }
 
+  list_count_ = flatmap.size();
+  for (int i = 0; i < kNumInst; i++)
+    inst_count_[i] = 0;
+
   // Third pass: Remaps outs to flat-ids.
+  // Counts instructions by opcode.
   for (int id = 0; id < static_cast<int>(flat.size()); id++) {
     Inst* ip = &flat[id];
     if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
       ip->set_out(flatmap[ip->out()]);
+    inst_count_[ip->opcode()]++;
   }
 
+  int total = 0;
+  for (int i = 0; i < kNumInst; i++)
+    total += inst_count_[i];
+  DCHECK_EQ(total, static_cast<int>(flat.size()));
+
   // Remap start_unanchored and start.
   if (start_unanchored() == 0) {
     DCHECK_EQ(start(), 0);
diff --git a/re2/prog.h b/re2/prog.h
index 4a4017c..3e2d809 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -58,6 +58,7 @@ enum InstOp {
   kInstMatch,        // found a match!
   kInstNop,          // no-op; occasionally unavoidable
   kInstFail,         // never match; occasionally unavoidable
+  kNumInst,
 };
 
 // Bit flags for empty-width specials
@@ -215,7 +216,8 @@ class Prog {
   int size() { return size_; }
   bool reversed() { return reversed_; }
   void set_reversed(bool reversed) { reversed_ = reversed; }
-  int byte_inst_count() { return byte_inst_count_; }
+  int list_count() { return list_count_; }
+  int inst_count(InstOp op) { return inst_count_[op]; }
   const Bitmap<256>& byterange() { return byterange_; }
   void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
   int64 dfa_mem() { return dfa_mem_; }
@@ -378,11 +380,13 @@ class Prog {
   int start_;               // entry point for program
   int start_unanchored_;    // unanchored entry point for program
   int size_;                // number of instructions
-  int byte_inst_count_;     // number of kInstByteRange instructions
   int bytemap_range_;       // bytemap_[x] < bytemap_range_
   int flags_;               // regexp parse flags
   int onepass_statesize_;   // byte size of each OneState* node
 
+  int list_count_;            // count of lists (see above)
+  int inst_count_[kNumInst];  // count of instructions by opcode
+
   Inst* inst_;              // pointer to instruction array
 
   Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index 8318e28..311a60e 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -174,6 +174,10 @@ void Search(int iters, int nbytes, const char* regexp, SearchImpl* search) {
 // figure out there's no match.
 #define HARD       "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
 
+// This has quite a high degree of fanout.
+// NFA execution will be particularly slow.
+#define FANOUT     "(?:[\\x{80}-\\x{10FFFF}]?){100}[\\x{80}-\\x{10FFFF}]"
+
 // This stresses engines that are trying to track parentheses.
 #define PARENS     "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \
                    "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
@@ -226,6 +230,18 @@ BENCHMARK_RANGE(Search_Hard_CachedPCRE,    8, 4<<10)->ThreadRange(1, NumCPUs());
 #endif
 BENCHMARK_RANGE(Search_Hard_CachedRE2,     8, 16<<20)->ThreadRange(1, NumCPUs());
 
+void Search_Fanout_CachedDFA(int i, int n)     { Search(i, n, FANOUT, SearchCachedDFA); }
+void Search_Fanout_CachedNFA(int i, int n)     { Search(i, n, FANOUT, SearchCachedNFA); }
+void Search_Fanout_CachedPCRE(int i, int n)    { Search(i, n, FANOUT, SearchCachedPCRE); }
+void Search_Fanout_CachedRE2(int i, int n)     { Search(i, n, FANOUT, SearchCachedRE2); }
+
+BENCHMARK_RANGE(Search_Fanout_CachedDFA,     8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Fanout_CachedNFA,     8, 256<<10)->ThreadRange(1, NumCPUs());
+#ifdef USEPCRE
+BENCHMARK_RANGE(Search_Fanout_CachedPCRE,    8, 4<<10)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(Search_Fanout_CachedRE2,     8, 16<<20)->ThreadRange(1, NumCPUs());
+
 void Search_Parens_CachedDFA(int i, int n)     { Search(i, n, PARENS, SearchCachedDFA); }
 void Search_Parens_CachedNFA(int i, int n)     { Search(i, n, PARENS, SearchCachedNFA); }
 void Search_Parens_CachedPCRE(int i, int n)    { Search(i, n, PARENS, SearchCachedPCRE); }
@@ -299,11 +315,13 @@ void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search
 // Unambiguous search (RE2 can use OnePass).
 
 void Search_Success_DFA(int i, int n)     { SearchSuccess(i, n, ".*$", SearchDFA); }
+void Search_Success_NFA(int i, int n)     { SearchSuccess(i, n, ".*$", SearchNFA); }
 void Search_Success_PCRE(int i, int n)    { SearchSuccess(i, n, ".*$", SearchPCRE); }
 void Search_Success_RE2(int i, int n)     { SearchSuccess(i, n, ".*$", SearchRE2); }
 void Search_Success_OnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchOnePass); }
 
 BENCHMARK_RANGE(Search_Success_DFA,     8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_NFA,     8, 16<<20)->ThreadRange(1, NumCPUs());
 #ifdef USEPCRE
 BENCHMARK_RANGE(Search_Success_PCRE,    8, 16<<20)->ThreadRange(1, NumCPUs());
 #endif
@@ -311,11 +329,13 @@ BENCHMARK_RANGE(Search_Success_RE2,     8, 16<<20)->ThreadRange(1, NumCPUs());
 BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs());
 
 void Search_Success_CachedDFA(int i, int n)     { SearchSuccess(i, n, ".*$", SearchCachedDFA); }
+void Search_Success_CachedNFA(int i, int n)     { SearchSuccess(i, n, ".*$", SearchCachedNFA); }
 void Search_Success_CachedPCRE(int i, int n)    { SearchSuccess(i, n, ".*$", SearchCachedPCRE); }
 void Search_Success_CachedRE2(int i, int n)     { SearchSuccess(i, n, ".*$", SearchCachedRE2); }
 void Search_Success_CachedOnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedOnePass); }
 
 BENCHMARK_RANGE(Search_Success_CachedDFA,     8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success_CachedNFA,     8, 16<<20)->ThreadRange(1, NumCPUs());
 #ifdef USEPCRE
 BENCHMARK_RANGE(Search_Success_CachedPCRE,    8, 16<<20)->ThreadRange(1, NumCPUs());
 #endif
@@ -326,28 +346,32 @@ BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs(
 // Used to be ".*.$", but that is coalesced to ".+$" these days.
 
 void Search_Success1_DFA(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchDFA); }
+void Search_Success1_NFA(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchNFA); }
 void Search_Success1_PCRE(int i, int n)     { SearchSuccess(i, n, ".*\\C$", SearchPCRE); }
 void Search_Success1_RE2(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchRE2); }
 void Search_Success1_BitState(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchBitState); }
 
 BENCHMARK_RANGE(Search_Success1_DFA,      8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_NFA,      8, 16<<20)->ThreadRange(1, NumCPUs());
 #ifdef USEPCRE
 BENCHMARK_RANGE(Search_Success1_PCRE,     8, 16<<20)->ThreadRange(1, NumCPUs());
 #endif
 BENCHMARK_RANGE(Search_Success1_RE2,      8, 16<<20)->ThreadRange(1, NumCPUs());
 BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
 
-void Search_Success1_Cached_DFA(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchCachedDFA); }
-void Search_Success1_Cached_PCRE(int i, int n)     { SearchSuccess(i, n, ".*\\C$", SearchCachedPCRE); }
-void Search_Success1_Cached_RE2(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchCachedRE2); }
-void Search_Success1_Cached_BitState(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedBitState); }
+void Search_Success1_CachedDFA(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchCachedDFA); }
+void Search_Success1_CachedNFA(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchCachedNFA); }
+void Search_Success1_CachedPCRE(int i, int n)     { SearchSuccess(i, n, ".*\\C$", SearchCachedPCRE); }
+void Search_Success1_CachedRE2(int i, int n)      { SearchSuccess(i, n, ".*\\C$", SearchCachedRE2); }
+void Search_Success1_CachedBitState(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedBitState); }
 
-BENCHMARK_RANGE(Search_Success1_Cached_DFA,      8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedDFA,      8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedNFA,      8, 16<<20)->ThreadRange(1, NumCPUs());
 #ifdef USEPCRE
-BENCHMARK_RANGE(Search_Success1_Cached_PCRE,     8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedPCRE,     8, 16<<20)->ThreadRange(1, NumCPUs());
 #endif
-BENCHMARK_RANGE(Search_Success1_Cached_RE2,      8, 16<<20)->ThreadRange(1, NumCPUs());
-BENCHMARK_RANGE(Search_Success1_Cached_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedRE2,      8, 16<<20)->ThreadRange(1, NumCPUs());
+BENCHMARK_RANGE(Search_Success1_CachedBitState, 8, 2<<20)->ThreadRange(1, NumCPUs());
 
 // Benchmark: AltMatch optimisation (just to verify that it works)
 // Note that OnePass doesn't implement it!
diff --git a/re2_test.bzl b/re2_test.bzl
index 8dafbd5..94443f6 100644
--- a/re2_test.bzl
+++ b/re2_test.bzl
@@ -2,14 +2,11 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-# Define a bazel macro that creates cc_test for re2.
+# Defines a Bazel macro that instantiates a native cc_test rule for an RE2 test.
 def re2_test(name, deps=[], size="medium"):
   native.cc_test(
       name=name,
       srcs=["re2/testing/%s.cc" % (name)],
-      deps=[
-          ":re2",
-          ":test",
-      ] + deps,
+      deps=[":test"] + deps,
       size = size,
   )