Imported Upstream version 20150801upstream/20150801

Change-Id: If6ba79b7356ae34e4008c43850b4390163058f2f Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
author: DongHun Kwak <dh0128.kwak@samsung.com> 2016-11-21 16:55:18 +0900
committer: DongHun Kwak <dh0128.kwak@samsung.com> 2016-11-21 16:55:19 +0900
commit: 873ba3ec5c36fe51e9aeadaac126a04d965fc053 (patch)
tree: 9624dd07f5e55939549e86add68cda90119f8af4
parent: abbaebf6f92b878517d8061fae50a8b9ca84ca52 (diff)
download: re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.gz
re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.bz2
re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.zip
29 files changed, 738 insertions, 190 deletions
diff --git a/.gitignore b/.gitignore
index 3db23b7..a671fe2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.orig
 core
 obj/
+benchlog.*
diff --git a/Makefile b/Makefile
index 801304d..1245f35 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,6 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-all: obj/libre2.a obj/so/libre2.so
-
 # to build against PCRE for testing or benchmarking,
 # uncomment the next two lines
 # CCPCRE=-I/usr/local/include -DUSEPCRE
@@ -38,13 +36,24 @@ SONAME=0
 # REBUILD_TABLES=1
 
 ifeq ($(shell uname),Darwin)
-MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
+SOEXT=dylib
+SOEXTVER=$(SONAME).$(SOEXT)
+SOEXTVER00=$(SONAME).0.0.$(SOEXT)
+MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin
 else ifeq ($(shell uname),SunOS)
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),-M,libre2.symbols $(LDFLAGS)
+SOEXT=so
+SOEXTVER=$(SOEXT).$(SONAME)
+SOEXTVER00=$(SOEXT).$(SONAME).0.0
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(LDFLAGS)
 else
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script,libre2.symbols $(LDFLAGS)
+SOEXT=so
+SOEXTVER=$(SOEXT).$(SONAME)
+SOEXTVER00=$(SOEXT).$(SONAME).0.0
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(LDFLAGS)
 endif
 
+all: obj/libre2.a obj/so/libre2.$(SOEXT)
+
 INSTALL_HFILES=\
 	re2/filtered_re2.h\
 	re2/re2.h\
@@ -177,10 +186,10 @@ obj/dbg/libre2.a: $(DOFILES)
 	@mkdir -p obj/dbg
 	$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
 
-obj/so/libre2.so: $(SOFILES)
+obj/so/libre2.$(SOEXT): $(SOFILES)
 	@mkdir -p obj/so
-	$(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES)
-	ln -sf libre2.so.$(SONAME) $@
+	$(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES)
+	ln -sf libre2.$(SOEXTVER) $@
 
 obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
 	@mkdir -p obj/test
@@ -190,7 +199,7 @@ obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/
 	@mkdir -p obj/dbg/test
 	$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
 
-obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
+obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
 	@mkdir -p obj/so/test
 	$(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
 
@@ -204,6 +213,8 @@ re2/perl_groups.cc: re2/make_perl_groups.pl
 
 re2/unicode_%.cc: re2/make_unicode_%.py
 	python $< > $@
+
+.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
 endif
 
 distclean: clean
@@ -246,13 +257,13 @@ shared-bigtest: $(STESTS) $(SBIGTESTS)
 
 benchmark: obj/test/regexp_benchmark
 
-install: obj/libre2.a obj/so/libre2.so
+install: obj/libre2.a obj/so/libre2.$(SOEXT)
 	mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
 	$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
 	$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
-	$(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
-	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
-	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so
+	$(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
+	ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
+	ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
 	sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc
 
 testinstall:
@@ -267,7 +278,7 @@ endif
 
 benchlog: obj/test/regexp_benchmark
 	(echo '==BENCHMARK==' `hostname` `date`; \
-	  (uname -a; $(CXX) --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
+	  (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
 	  echo; \
 	  ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
 
@@ -279,8 +290,9 @@ benchlog: obj/test/regexp_benchmark
 	obj/test/% obj/so/test/% obj/dbg/test/%
 
 log:
-	make clean
-	make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/exhaustive{,1,2,3}_test
+	$(MAKE) clean
+	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
+		$(filter obj/test/exhaustive%_test,$(BIGTESTS))
 	echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt
 	echo '#' $$(date) >>re2-exhaustive.txt
 	obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
@@ -288,7 +300,7 @@ log:
 	obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
 	obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt
 
-	make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
+	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
 	echo '#' RE2 basic search tests built by make $@ >re2-search.txt
 	echo '#' $$(date) >>re2-search.txt
 	obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
diff --git a/README b/README
index 3cba956..4c5875b 100644
--- a/README
+++ b/README
@@ -25,6 +25,7 @@ under the BSD-style license found in the LICENSE file.
 RE2's native language is C++.
 An Erlang wrapper is at https://github.com/tuncer/re2/.
 An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
-A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN. 
+An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM.
+A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN.
 A Python wrapper is at https://github.com/facebook/pyre2/.
 A Ruby wrapper is at https://github.com/axic/rre2/.
diff --git a/benchlog/benchplot.py b/benchlog/benchplot.py
new file mode 100755
index 0000000..104abe8
--- /dev/null
+++ b/benchlog/benchplot.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+import argparse     # for ArgumentParser
+import subprocess   # for Popen
+import tempfile     # for NamedTemporaryFile
+import os           # for remove
+
+class gnuplot(object):
+
+    output = "result.png"
+
+    script = """
+             set terminal png size 1024, 768
+             set output "{}.png"
+             set title "re2 benchlog"
+             set datafile separator ";"
+             set grid x y
+             set ylabel "MB/s"
+             set autoscale
+             plot """
+
+    template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n"""
+
+    benchdata = dict()
+    tempfiles = []
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        """
+        remove all temporary files
+        """
+
+        for filename in self.tempfiles:
+            os.remove(filename)
+
+    def parse_re2_benchlog(self, filename):
+        """
+        parse the input benchlog and return a dictionary contain bench data
+        """
+
+        benchdata = self.benchdata
+
+        with open(filename) as f:
+
+            for raw in f.readlines():
+
+                data = raw.split('\t')
+
+                if len(data) == 4:
+
+                    data = data[0].split('/') + data[1:]
+                    data = list(map(str.strip, data))
+
+                    if not benchdata.get(data[0]):
+                        benchdata[data[0]] = [ data[1:] ]
+                    else:
+                        benchdata[data[0]].append(data[1:])
+
+    def gen_csv(self):
+        """
+        generate temporary csv files
+        """
+
+        for name, data in self.benchdata.items():
+
+            with tempfile.NamedTemporaryFile(delete=False) as f:
+
+                for index, line in enumerate(data):
+                    f.write('{};{}\n'.format(index, ';'.join(line)).encode())
+
+                self.tempfiles.append(f.name)
+                self.script = self.script + self.template.format(f.name, name)
+
+    def run(self):
+        self.gen_csv()
+        script = self.script[:-3].format(self.output)
+        command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
+        command.communicate(script.encode())
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='generate plots for benchlog')
+    parser.add_argument('benchlog', type=str, help='benchlog generated by re2')
+    args = parser.parse_args()
+
+    try:
+        subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
+    except FileNotFoundError:
+        print('you can install "gnuplot" to generate plots automatically')
+        exit(1)
+
+    with gnuplot() as plot:
+        plot.output = args.benchlog
+        plot.parse_re2_benchlog(args.benchlog)
+        plot.run()
diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin
index bc6f1af..4207f87 100644
--- a/libre2.symbols.darwin
+++ b/libre2.symbols.darwin
@@ -6,7 +6,12 @@ __ZNK3re23RE2*
 __ZN3re211StringPiece*
 __ZNK3re211StringPiece*
 # operator<<(std::ostream&, re2::StringPiece const&)
-__ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
+# Seen with libstdc++ on 10.8 and below:
+# __ZlsRSoRKN3re211StringPieceE
+# Seen with libc++ on 10.9 and above:
+# __ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
+# Note that "ls" means operator<<, so this is not overly broad.
+__Zls*RKN3re211StringPieceE
 # re2::FilteredRE2*
 __ZN3re211FilteredRE2*
 __ZNK3re211FilteredRE2*
diff --git a/re2/compile.cc b/re2/compile.cc
index 6478def..e5d6088 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -435,7 +435,10 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
   if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
     int j;
     for (int i = 0; i < 256; i = j) {
-      for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
+      for (j = i + 1; j < 256 &&
+                      Prog::IsWordChar(static_cast<uint8>(i)) ==
+                          Prog::IsWordChar(static_cast<uint8>(j));
+           j++)
         ;
       prog_->MarkByteRange(i, j-1);
     }
@@ -503,7 +506,10 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
     return UncachedRuneByteSuffix(lo, hi, foldcase, next);
   }
 
-  uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
+  uint64 key = (uint64)next << 17 |
+               (uint64)lo   <<  9 |
+               (uint64)hi   <<  1 |
+               (uint64)foldcase;
   map<uint64, int>::iterator it = rune_cache_.find(key);
   if (it != rune_cache_.end())
     return it->second;
@@ -555,7 +561,8 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
     return;
   if (hi > 0xFF)
     hi = 0xFF;
-  AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
+  AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
+                           foldcase, 0));
 }
 
 // Table describing how to make a UTF-8 matching machine
@@ -596,7 +603,8 @@ void Compiler::Add_80_10ffff() {
     int next = 0;
     if (p.next >= 0)
       next = inst[p.next];
-    inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
+    inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo),
+                                     static_cast<uint8>(p.hi), false, next);
     if ((p.lo & 0xC0) != 0x80)
       AddSuffix(inst[i]);
   }
@@ -625,7 +633,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
 
   // ASCII range is always a special case.
   if (hi < Runeself) {
-    AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
+    AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
+                             foldcase, 0));
     return;
   }
 
@@ -979,7 +988,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
     if (m > Prog::Inst::kMaxInst)
       m = Prog::Inst::kMaxInst;
 
-    max_inst_ = m;
+    max_inst_ = static_cast<int>(m);
   }
 
   anchor_ = anchor;
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 25e0220..11eb77f 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -143,7 +143,7 @@ class DFA {
       if (sizeof(size_t) == sizeof(uint32))
         return Hash32StringWithSeed(s, len, a->flag_);
       else
-        return Hash64StringWithSeed(s, len, a->flag_);
+        return static_cast<size_t>(Hash64StringWithSeed(s, len, a->flag_));
     }
 #ifdef STL_MSVC
     // Less than operator.
@@ -451,7 +451,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
                  (sizeof(int)+sizeof(int)) * 2;  // q0, q1
   mem_budget_ -= nastack_ * sizeof(int);  // astack
   if (mem_budget_ < 0) {
-    LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
+    LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",
                               prog_->size(), max_mem);
     init_failed_ = true;
     return;
@@ -466,7 +466,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
   int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) +
                     (prog_->bytemap_range()+1)*sizeof(State*);
   if (state_budget_ < 20*one_state) {
-    LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
+    LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",
                               prog_->size(), max_mem);
     init_failed_ = true;
     return;
@@ -864,8 +864,10 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
         break;
 
       case kInstEmptyWidth:
-        if ((ip->empty() & flag) == ip->empty())
-          stk[nstk++] = ip->out();
+        // Continue on if we have all the right flag bits.
+        if (ip->empty() & ~flag)
+          break;
+        stk[nstk++] = ip->out();
         break;
     }
   }
@@ -1014,7 +1016,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
   // byte processed was a word character.  Use that info to
   // insert empty-width (non-)word boundaries.
   bool islastword = state->flag_ & kFlagLastWord;
-  bool isword = (c != kByteEndText && Prog::IsWordChar(c));
+  bool isword = (c != kByteEndText && Prog::IsWordChar(static_cast<uint8>(c)));
   if (isword == islastword)
     beforeflag |= kEmptyNonWordBoundary;
   else
@@ -2034,7 +2036,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
       if (ns == FullMatchState ||
           (ns > SpecialStateMax && ns->ninst_ > 0)) {
         extended = true;
-        min->append(1, j);
+        min->append(1, static_cast<char>(j));
         s = ns;
         break;
       }
@@ -2064,7 +2066,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
       if (ns == FullMatchState ||
           (ns > SpecialStateMax && ns->ninst_ > 0)) {
         extended = true;
-        max->append(1, j);
+        max->append(1, static_cast<char>(j));
         s = ns;
         break;
       }
diff --git a/re2/make_perl_groups.pl b/re2/make_perl_groups.pl
index 8c1f4f6..d9fcdaf 100755
--- a/re2/make_perl_groups.pl
+++ b/re2/make_perl_groups.pl
@@ -32,14 +32,20 @@
 	"\\w",
 );
 
+%overrides = (
+	# Prior to Perl 5.18, \s did not match vertical tab.
+	# RE2 preserves that original behaviour.
+	"\\s:11" => 0,
+);
+
 sub ComputeClass($) {
+  my ($cname) = @_;
   my @ranges;
-  my ($class) = @_;
-  my $regexp = "[$class]";
+  my $regexp = qr/[$cname]/;
   my $start = -1;
   for (my $i=0; $i<=129; $i++) {
     if ($i == 129) { $i = 256; }
-    if ($i <= 128 && chr($i) =~ $regexp) {
+    if ($i <= 128 && ($overrides{"$cname:$i"} // chr($i) =~ $regexp)) {
       if ($start < 0) {
         $start = $i;
       }
@@ -54,15 +60,15 @@ sub ComputeClass($) {
 }
 
 sub PrintClass($$@) {
-  my ($cname, $name, @ranges) = @_;
-  print "static const URange16 code${cname}[] = {  /* $name */\n";
+  my ($cnum, $cname, @ranges) = @_;
+  print "static const URange16 code${cnum}[] = {  /* $cname */\n";
   for (my $i=0; $i<@ranges; $i++) {
     my @a = @{$ranges[$i]};
     printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1];
   }
   print "};\n";
   my $n = @ranges;
-  my $escname = $name;
+  my $escname = $cname;
   $escname =~ s/\\/\\\\/g;
   $negname = $escname;
   if ($negname =~ /:/) {
@@ -70,25 +76,25 @@ sub PrintClass($$@) {
   } else {
     $negname =~ y/a-z/A-Z/;
   }
-  return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
+  return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
 }
 
-my $gen = 0;
+my $cnum = 0;
 
 sub PrintClasses($@) {
-  my ($cname, @classes) = @_;
+  my ($pname, @classes) = @_;
   my @entries;
-  foreach my $cl (@classes) {
-    my @ranges = ComputeClass($cl);
-    push @entries, PrintClass(++$gen, $cl, @ranges);
+  foreach my $cname (@classes) {
+    my @ranges = ComputeClass($cname);
+    push @entries, PrintClass(++$cnum, $cname, @ranges);
   }
-  print "const UGroup ${cname}_groups[] = {\n";
+  print "const UGroup ${pname}_groups[] = {\n";
   foreach my $e (@entries) {
     print "\t$e,\n";
   }
   print "};\n";
   my $count = @entries;
-  print "const int num_${cname}_groups = $count;\n";
+  print "const int num_${pname}_groups = $count;\n";
 }
 
 print <<EOF;
diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py
index 8f90cfe..d215eb1 100755
--- a/re2/make_unicode_casefold.py
+++ b/re2/make_unicode_casefold.py
@@ -9,7 +9,8 @@
 
 """Generate C++ table for Unicode case folding."""
 
-import unicode, sys
+import sys
+import unicode
 
 _header = """
 // GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 57a18fe..3ca275e 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -468,7 +468,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
 
   if (text.begin() > context.begin()) {
     c = text.begin()[-1] & 0xFF;
-    wasword = Prog::IsWordChar(c);
+    wasword = Prog::IsWordChar(static_cast<uint8>(c));
   }
 
   // Loop over the text, stepping the machine.
diff --git a/re2/parse.cc b/re2/parse.cc
index e6b27d2..3c15bbd 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -215,7 +215,7 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) {
   // single characters (e.g., [.] instead of \.), and some
   // analysis does better with fewer character classes.
   // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
-  if (re->op_ == kRegexpCharClass) {
+  if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
     re->ccb_->RemoveAbove(rune_max_);
     if (re->ccb_->size() == 1) {
       Rune r = re->ccb_->begin()->lo;
@@ -576,13 +576,6 @@ bool Regexp::ParseState::DoLeftParenNoCapture() {
   return PushRegexp(re);
 }
 
-// Adds r to cc, along with r's upper case if foldascii is set.
-static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
-  cc->AddRange(r, r);
-  if (foldascii && 'a' <= r && r <= 'z')
-    cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
-}
-
 // Processes a vertical bar in the input.
 bool Regexp::ParseState::DoVerticalBar() {
   MaybeConcatString(-1, NoParseFlags);
@@ -596,46 +589,34 @@ bool Regexp::ParseState::DoVerticalBar() {
   Regexp* r1;
   Regexp* r2;
   if ((r1 = stacktop_) != NULL &&
-      (r2 = stacktop_->down_) != NULL &&
+      (r2 = r1->down_) != NULL &&
       r2->op() == kVerticalBar) {
-    // If above and below vertical bar are literal or char class,
-    // can merge into a single char class.
     Regexp* r3;
-    if ((r1->op() == kRegexpLiteral ||
-         r1->op() == kRegexpCharClass ||
-         r1->op() == kRegexpAnyChar) &&
-        (r3 = r2->down_) != NULL) {
-      Rune rune;
-      switch (r3->op()) {
-        case kRegexpLiteral:  // convert to char class
-          rune = r3->rune_;
-          r3->op_ = kRegexpCharClass;
-          r3->cc_ = NULL;
-          r3->ccb_ = new CharClassBuilder;
-          AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
-          // fall through
-        case kRegexpCharClass:
-          if (r1->op() == kRegexpLiteral)
-            AddLiteral(r3->ccb_, r1->rune_,
-                       r1->parse_flags_ & Regexp::FoldCase);
-          else if (r1->op() == kRegexpCharClass)
-            r3->ccb_->AddCharClass(r1->ccb_);
-          if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
-            delete r3->ccb_;
-            r3->ccb_ = NULL;
-            r3->op_ = kRegexpAnyChar;
-          }
-          // fall through
-        case kRegexpAnyChar:
-          // pop r1
-          stacktop_ = r2;
-          r1->Decref();
-          return true;
-        default:
-          break;
+    if ((r3 = r2->down_) != NULL &&
+        (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
+      // AnyChar is above or below the vertical bar. Let it subsume
+      // the other when the other is Literal, CharClass or AnyChar.
+      if (r3->op() == kRegexpAnyChar &&
+          (r1->op() == kRegexpLiteral ||
+           r1->op() == kRegexpCharClass ||
+           r1->op() == kRegexpAnyChar)) {
+        // Discard r1.
+        stacktop_ = r2;
+        r1->Decref();
+        return true;
+      }
+      if (r1->op() == kRegexpAnyChar &&
+          (r3->op() == kRegexpLiteral ||
+           r3->op() == kRegexpCharClass ||
+           r3->op() == kRegexpAnyChar)) {
+        // Rearrange the stack and discard r3.
+        r1->down_ = r3->down_;
+        r2->down_ = r1;
+        stacktop_ = r2;
+        r3->Decref();
+        return true;
       }
     }
-
     // Swap r1 below vertical bar (r2).
     r1->down_ = r2->down_;
     r2->down_ = r1;
@@ -1166,7 +1147,7 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
   if (r >= 0) {
     re1->op_ = kRegexpLiteral;
     re1->rune_ = r;
-    re1->parse_flags_ = flags;
+    re1->parse_flags_ = static_cast<uint16>(flags);
     return true;
   }
 
diff --git a/re2/prog.cc b/re2/prog.cc
index f326ffd..499f560 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -25,7 +25,7 @@ void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
   set_out_opcode(out, kInstByteRange);
   lo_ = lo & 0xFF;
   hi_ = hi & 0xFF;
-  foldcase_ = foldcase;
+  foldcase_ = foldcase & 0xFF;
 }
 
 void Prog::Inst::InitCapture(int cap, uint32 out) {
@@ -327,12 +327,12 @@ void Prog::ComputeByteMap() {
   bytemap_range_ = bytemap_[255] + 1;
   unbytemap_ = new uint8[bytemap_range_];
   for (int i = 0; i < 256; i++)
-    unbytemap_[bytemap_[i]] = i;
+    unbytemap_[bytemap_[i]] = static_cast<uint8>(i);
 
   if (0) {  // For debugging: use trivial byte map.
     for (int i = 0; i < 256; i++) {
-      bytemap_[i] = i;
-      unbytemap_[i] = i;
+      bytemap_[i] = static_cast<uint8>(i);
+      unbytemap_[i] = static_cast<uint8>(i);
     }
     bytemap_range_ = 256;
     LOG(INFO) << "Using trivial bytemap.";
diff --git a/re2/prog.h b/re2/prog.h
index 26c8676..3e6be8f 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -201,10 +201,10 @@ class Prog {
   int start_unanchored() { return start_unanchored_; }
   void set_start(int start) { start_ = start; }
   void set_start_unanchored(int start) { start_unanchored_ = start; }
-  int64 size() { return size_; }
+  int size() { return size_; }
   bool reversed() { return reversed_; }
   void set_reversed(bool reversed) { reversed_ = reversed; }
-  int64 byte_inst_count() { return byte_inst_count_; }
+  int byte_inst_count() { return byte_inst_count_; }
   const Bitmap<256>& byterange() { return byterange_; }
   void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
   int64 dfa_mem() { return dfa_mem_; }
diff --git a/re2/re2.cc b/re2/re2.cc
index 4e8af90..099491c 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -11,7 +11,6 @@
 
 #include <stdio.h>
 #include <string>
-#include <pthread.h>
 #include <errno.h>
 #include "util/atomicops.h"
 #include "util/util.h"
@@ -879,31 +878,30 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite,
                  const StringPiece *vec, int veclen) const {
   for (const char *s = rewrite.data(), *end = s + rewrite.size();
        s < end; s++) {
-    int c = *s;
-    if (c == '\\') {
-      s++;
-      c = (s < end) ? *s : -1;
-      if (isdigit(c)) {
-        int n = (c - '0');
-        if (n >= veclen) {
-          if (options_.log_errors()) {
-            LOG(ERROR) << "requested group " << n
-                       << " in regexp " << rewrite.data();
-          }
-          return false;
+    if (*s != '\\') {
+      out->push_back(*s);
+      continue;
+    }
+    s++;
+    int c = (s < end) ? *s : -1;
+    if (isdigit(c)) {
+      int n = (c - '0');
+      if (n >= veclen) {
+        if (options_.log_errors()) {
+          LOG(ERROR) << "requested group " << n
+                     << " in regexp " << rewrite.data();
         }
-        StringPiece snip = vec[n];
-        if (snip.size() > 0)
-          out->append(snip.data(), snip.size());
-      } else if (c == '\\') {
-        out->push_back('\\');
-      } else {
-        if (options_.log_errors())
-          LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
         return false;
       }
+      StringPiece snip = vec[n];
+      if (snip.size() > 0)
+        out->append(snip.data(), snip.size());
+    } else if (c == '\\') {
+      out->push_back('\\');
     } else {
-      out->push_back(c);
+      if (options_.log_errors())
+        LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
+      return false;
     }
   }
   return true;
@@ -1102,7 +1100,7 @@ bool RE2::Arg::parse_short_radix(const char* str,
   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
   if ((short)r != r) return false;       // Out of range
   if (dest == NULL) return true;
-  *(reinterpret_cast<short*>(dest)) = r;
+  *(reinterpret_cast<short*>(dest)) = (short)r;
   return true;
 }
 
@@ -1114,7 +1112,7 @@ bool RE2::Arg::parse_ushort_radix(const char* str,
   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
   if ((ushort)r != r) return false;                      // Out of range
   if (dest == NULL) return true;
-  *(reinterpret_cast<unsigned short*>(dest)) = r;
+  *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r;
   return true;
 }
 
@@ -1200,7 +1198,7 @@ static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
   if (errno) return false;
   if (dest == NULL) return true;
   if (isfloat) {
-    *(reinterpret_cast<float*>(dest)) = r;
+    *(reinterpret_cast<float*>(dest)) = (float)r;
   } else {
     *(reinterpret_cast<double*>(dest)) = r;
   }
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 3667fda..b09b5ca 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -14,7 +14,7 @@ namespace re2 {
 
 // Constructor.  Allocates vectors as appropriate for operator.
 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
-  : op_(op),
+  : op_(static_cast<uint8>(op)),
     simple_(false),
     parse_flags_(static_cast<uint16>(parse_flags)),
     ref_(1),
@@ -107,7 +107,7 @@ void Regexp::Decref() {
     GLOBAL_MUTEX_LOCK(ref_mutex);
     int r = (*ref_map)[this] - 1;
     if (r < kMaxRef) {
-      ref_ = r;
+      ref_ = static_cast<uint16>(r);
       ref_map->erase(this);
     } else {
       (*ref_map)[this] = r;
@@ -651,7 +651,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
       if (re->parse_flags() & Latin1) {
         prefix->resize(re->nrunes_);
         for (int j = 0; j < re->nrunes_; j++)
-          (*prefix)[j] = re->runes_[j];
+          (*prefix)[j] = static_cast<char>(re->runes_[j]);
       } else {
         // Convert to UTF-8 in place.
         // Assume worst-case space and then trim.
@@ -660,7 +660,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
         for (int j = 0; j < re->nrunes_; j++) {
           Rune r = re->runes_[j];
           if (r < Runeself)
-            *p++ = r;
+            *p++ = static_cast<char>(r);
           else
             p += runetochar(p, &r);
         }
@@ -670,7 +670,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
 
     case kRegexpLiteral:
       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
-        prefix->append(1, re->rune_);
+        prefix->append(1, static_cast<char>(re->rune_));
       } else {
         char buf[UTFmax];
         prefix->append(buf, runetochar(buf, &re->rune_));
diff --git a/re2/regexp.h b/re2/regexp.h
index 60eb993..b49ce0d 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -211,7 +211,8 @@ class RegexpStatus {
   DISALLOW_COPY_AND_ASSIGN(RegexpStatus);
 };
 
-// Walker to implement Simplify.
+// Walkers to implement Simplify.
+class CoalesceWalker;
 class SimplifyWalker;
 
 // Compiled form; see prog.h
@@ -353,6 +354,7 @@ class Regexp {
   // removed.  The result will capture exactly the same
   // subexpressions the original did, unless formatted with ToString.
   Regexp* Simplify();
+  friend class CoalesceWalker;
   friend class SimplifyWalker;
 
   // Parses the regexp src and then simplifies it and sets *dst to the
diff --git a/re2/simplify.cc b/re2/simplify.cc
index 9c0021e..d14483f 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc
@@ -97,6 +97,36 @@ bool Regexp::ComputeSimple() {
 }
 
 // Walker subclass used by Simplify.
+// Coalesces runs of star/plus/quest/repeat of the same literal along with any
+// occurrences of that literal into repeats of that literal. It also works for
+// char classes, any char and any byte.
+// PostVisit creates the coalesced result, which should then be simplified.
+class CoalesceWalker : public Regexp::Walker<Regexp*> {
+ public:
+  CoalesceWalker() {}
+  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
+                            Regexp** child_args, int nchild_args);
+  virtual Regexp* Copy(Regexp* re);
+  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+  // These functions are declared inside CoalesceWalker so that
+  // they can edit the private fields of the Regexps they construct.
+
+  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
+  // the parse flags are consistent. (They will not be checked again later.)
+  static bool CanCoalesce(Regexp* r1, Regexp* r2);
+
+  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
+  // will be empty match and the coalesced op. In other cases, where part of a
+  // literal string was removed to be coalesced, the array elements afterwards
+  // will be the coalesced op and the remainder of the literal string.
+  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
+
+  DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
+};
+
+// Walker subclass used by Simplify.
 // The simplify walk is purely post-recursive: given the simplified children,
 // PostVisit creates the simplified result.
 // The child_args are simplified Regexp*s.
@@ -104,9 +134,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
  public:
   SimplifyWalker() {}
   virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
-  virtual Regexp* PostVisit(Regexp* re,
-                            Regexp* parent_arg,
-                            Regexp* pre_arg,
+  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
                             Regexp** child_args, int nchild_args);
   virtual Regexp* Copy(Regexp* re);
   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
@@ -143,14 +171,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
 // Caller must Decref() return value when done with it.
 
 Regexp* Regexp::Simplify() {
-  if (simple_)
-    return Incref();
-  SimplifyWalker w;
-  return w.Walk(this, NULL);
+  CoalesceWalker cw;
+  Regexp* cre = cw.Walk(this, NULL);
+  if (cre == NULL)
+    return cre;
+  SimplifyWalker sw;
+  Regexp* sre = sw.Walk(cre, NULL);
+  cre->Decref();
+  return sre;
 }
 
 #define Simplify DontCallSimplify  // Avoid accidental recursion
 
+// Utility function for PostVisit implementations that compares re->sub() with
+// child_args to determine whether any child_args changed. In the common case,
+// where nothing changed, calls Decref() for all child_args and returns false,
+// so PostVisit must return re->Incref(). Otherwise, returns true.
+static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
+  for (int i = 0; i < re->nsub(); i++) {
+    Regexp* sub = re->sub()[i];
+    Regexp* newsub = child_args[i];
+    if (newsub != sub)
+      return true;
+  }
+  for (int i = 0; i < re->nsub(); i++) {
+    Regexp* newsub = child_args[i];
+    newsub->Decref();
+  }
+  return false;
+}
+
+Regexp* CoalesceWalker::Copy(Regexp* re) {
+  return re->Incref();
+}
+
+Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+  // This should never be called, since we use Walk and not
+  // WalkExponential.
+  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
+  return re->Incref();
+}
+
+Regexp* CoalesceWalker::PostVisit(Regexp* re,
+                                  Regexp* parent_arg,
+                                  Regexp* pre_arg,
+                                  Regexp** child_args,
+                                  int nchild_args) {
+  if (re->nsub() == 0)
+    return re->Incref();
+
+  if (re->op() != kRegexpConcat) {
+    if (!ChildArgsChanged(re, child_args))
+      return re->Incref();
+
+    // Something changed. Build a new op.
+    Regexp* nre = new Regexp(re->op(), re->parse_flags());
+    nre->AllocSub(re->nsub());
+    Regexp** nre_subs = nre->sub();
+    for (int i = 0; i < re->nsub(); i++)
+      nre_subs[i] = child_args[i];
+    // Repeats and Captures have additional data that must be copied.
+    if (re->op() == kRegexpRepeat) {
+      nre->min_ = re->min();
+      nre->max_ = re->max();
+    } else if (re->op() == kRegexpCapture) {
+      nre->cap_ = re->cap();
+    }
+    return nre;
+  }
+
+  bool can_coalesce = false;
+  for (int i = 0; i < re->nsub(); i++) {
+    if (i+1 < re->nsub() &&
+        CanCoalesce(child_args[i], child_args[i+1])) {
+      can_coalesce = true;
+      break;
+    }
+  }
+  if (!can_coalesce) {
+    if (!ChildArgsChanged(re, child_args))
+      return re->Incref();
+
+    // Something changed. Build a new op.
+    Regexp* nre = new Regexp(re->op(), re->parse_flags());
+    nre->AllocSub(re->nsub());
+    Regexp** nre_subs = nre->sub();
+    for (int i = 0; i < re->nsub(); i++)
+      nre_subs[i] = child_args[i];
+    return nre;
+  }
+
+  for (int i = 0; i < re->nsub(); i++) {
+    if (i+1 < re->nsub() &&
+        CanCoalesce(child_args[i], child_args[i+1]))
+      DoCoalesce(&child_args[i], &child_args[i+1]);
+  }
+  // Determine how many empty matches were left by DoCoalesce.
+  int n = 0;
+  for (int i = n; i < re->nsub(); i++) {
+    if (child_args[i]->op() == kRegexpEmptyMatch)
+      n++;
+  }
+  // Build a new op.
+  Regexp* nre = new Regexp(re->op(), re->parse_flags());
+  nre->AllocSub(re->nsub() - n);
+  Regexp** nre_subs = nre->sub();
+  for (int i = 0, j = 0; i < re->nsub(); i++) {
+    if (child_args[i]->op() == kRegexpEmptyMatch) {
+      child_args[i]->Decref();
+      continue;
+    }
+    nre_subs[j] = child_args[i];
+    j++;
+  }
+  return nre;
+}
+
+bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
+  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
+  // any byte.
+  if ((r1->op() == kRegexpStar ||
+       r1->op() == kRegexpPlus ||
+       r1->op() == kRegexpQuest ||
+       r1->op() == kRegexpRepeat) &&
+      (r1->sub()[0]->op() == kRegexpLiteral ||
+       r1->sub()[0]->op() == kRegexpCharClass ||
+       r1->sub()[0]->op() == kRegexpAnyChar ||
+       r1->sub()[0]->op() == kRegexpAnyByte)) {
+    // r2 must be a star/plus/quest/repeat of the same literal, char class,
+    // any char or any byte.
+    if ((r2->op() == kRegexpStar ||
+         r2->op() == kRegexpPlus ||
+         r2->op() == kRegexpQuest ||
+         r2->op() == kRegexpRepeat) &&
+        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
+        // The parse flags must be consistent.
+        ((r1->parse_flags() & Regexp::NonGreedy) ==
+         (r2->parse_flags() & Regexp::NonGreedy))) {
+      return true;
+    }
+    // ... OR an occurrence of that literal, char class, any char or any byte
+    if (Regexp::Equal(r1->sub()[0], r2)) {
+      return true;
+    }
+    // ... OR a literal string that begins with that literal.
+    if (r1->sub()[0]->op() == kRegexpLiteral &&
+        r2->op() == kRegexpLiteralString &&
+        r2->runes()[0] == r1->sub()[0]->rune() &&
+        // The parse flags must be consistent.
+        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
+         (r2->parse_flags() & Regexp::FoldCase))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
+  Regexp* r1 = *r1ptr;
+  Regexp* r2 = *r2ptr;
+
+  Regexp* nre = Regexp::Repeat(
+      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
+
+  switch (r1->op()) {
+    case kRegexpStar:
+      nre->min_ = 0;
+      nre->max_ = -1;
+      break;
+
+    case kRegexpPlus:
+      nre->min_ = 1;
+      nre->max_ = -1;
+      break;
+
+    case kRegexpQuest:
+      nre->min_ = 0;
+      nre->max_ = 1;
+      break;
+
+    case kRegexpRepeat:
+      nre->min_ = r1->min();
+      nre->max_ = r1->max();
+      break;
+
+    default:
+      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
+      nre->Decref();
+      return;
+  }
+
+  switch (r2->op()) {
+    case kRegexpStar:
+      nre->max_ = -1;
+      goto LeaveEmpty;
+
+    case kRegexpPlus:
+      nre->min_++;
+      nre->max_ = -1;
+      goto LeaveEmpty;
+
+    case kRegexpQuest:
+      if (nre->max() != -1)
+        nre->max_++;
+      goto LeaveEmpty;
+
+    case kRegexpRepeat:
+      nre->min_ += r2->min();
+      if (r2->max() == -1)
+        nre->max_ = -1;
+      else if (nre->max() != -1)
+        nre->max_ += r2->max();
+      goto LeaveEmpty;
+
+    case kRegexpLiteral:
+    case kRegexpCharClass:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+      nre->min_++;
+      if (nre->max() != -1)
+        nre->max_++;
+      goto LeaveEmpty;
+
+    LeaveEmpty:
+      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
+      *r2ptr = nre;
+      break;
+
+    case kRegexpLiteralString: {
+      Rune r = r1->sub()[0]->rune();
+      // Determine how much of the literal string is removed.
+      // We know that we have at least one rune. :)
+      int n = 1;
+      while (n < r2->nrunes() && r2->runes()[n] == r)
+        n++;
+      nre->min_ += n;
+      if (nre->max() != -1)
+        nre->max_ += n;
+      if (n == r2->nrunes())
+        goto LeaveEmpty;
+      *r1ptr = nre;
+      *r2ptr = Regexp::LiteralString(
+          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
+      break;
+    }
+
+    default:
+      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
+      nre->Decref();
+      return;
+  }
+
+  r1->Decref();
+  r2->Decref();
+}
+
 Regexp* SimplifyWalker::Copy(Regexp* re) {
   return re->Incref();
 }
@@ -196,29 +471,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
     case kRegexpConcat:
     case kRegexpAlternate: {
       // These are simple as long as the subpieces are simple.
-      // Two passes to avoid allocation in the common case.
-      bool changed = false;
-      Regexp** subs = re->sub();
-      for (int i = 0; i < re->nsub_; i++) {
-        Regexp* sub = subs[i];
-        Regexp* newsub = child_args[i];
-        if (newsub != sub) {
-          changed = true;
-          break;
-        }
-      }
-      if (!changed) {
-        for (int i = 0; i < re->nsub_; i++) {
-          Regexp* newsub = child_args[i];
-          newsub->Decref();
-        }
+      if (!ChildArgsChanged(re, child_args)) {
         re->simple_ = true;
         return re->Incref();
       }
       Regexp* nre = new Regexp(re->op(), re->parse_flags());
-      nre->AllocSub(re->nsub_);
+      nre->AllocSub(re->nsub());
       Regexp** nre_subs = nre->sub();
-      for (int i = 0; i <re->nsub_; i++)
+      for (int i = 0; i < re->nsub(); i++)
         nre_subs[i] = child_args[i];
       nre->simple_ = true;
       return nre;
@@ -234,7 +494,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
       Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
       nre->AllocSub(1);
       nre->sub()[0] = newsub;
-      nre->cap_ = re->cap_;
+      nre->cap_ = re->cap();
       nre->simple_ = true;
       return nre;
     }
diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index 4bdf714..9703039 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc
@@ -120,6 +120,8 @@ static void DumpRegexpAppending(Regexp* re, string* s) {
       DumpRegexpAppending(re->sub()[0], s);
       break;
     case kRegexpCapture:
+      if (re->cap() == 0)
+        LOG(DFATAL) << "kRegexpCapture cap() == 0";
       if (re->name()) {
         s->append(*re->name());
         s->append(":");
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index f6d760a..29a79b1 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -118,8 +118,15 @@ static Test tests[] = {
   { "(?:a)", "lit{a}" },
   { "(?:ab)(?:cd)", "str{abcd}" },
   { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
+  { "a|c", "cc{0x61 0x63}" },
+  { "a|[cd]", "cc{0x61 0x63-0x64}" },
   { "a|.", "dot{}" },
-  { ".|a", "dot{}" },
+  { "[ab]|c", "cc{0x61-0x63}" },
+  { "[ab]|[cd]", "cc{0x61-0x64}" },
+  { "[ab]|.", "dot{}" },
+  { ".|c", "dot{}" },
+  { ".|[cd]", "dot{}" },
+  { ".|.", "dot{}" },
 
   // Test Perl quoted literals
   { "\\Q+|*?{[\\E", "str{+|*?{[}" },
@@ -299,6 +306,14 @@ Test prefix_tests[] = {
     "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
   { "x{2}y|x{2}[0-9]y",
     "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
+  { "n|r|rs",
+    "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
+  { "n|rs|r",
+    "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
+  { "r|rs|n",
+    "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
+  { "rs|r|n",
+    "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
 };
 
 // Test that prefix factoring works.
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 74058a4..4e267ae 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -71,10 +71,10 @@ TEST(Set, UnanchoredFactored) {
 
 TEST(Set, UnanchoredDollar) {
   RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
-  
+
   CHECK_EQ(s.Add("foo$", NULL), 0);
   CHECK_EQ(s.Compile(), true);
-  
+
   vector<int> v;
   CHECK_EQ(s.Match("foo", &v), true);
   CHECK_EQ(v.size(), 1);
@@ -107,8 +107,34 @@ TEST(Set, Anchored) {
   CHECK_EQ(s.Match("bar", &v), true);
   CHECK_EQ(v.size(), 1);
   CHECK_EQ(v[0], 1);
+}
+
+TEST(Set, EmptyUnanchored) {
+  RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+  CHECK_EQ(s.Compile(), true);
+
+  vector<int> v;
+  CHECK_EQ(s.Match("", &v), false);
+  CHECK_EQ(v.size(), 0);
+
+  v.clear();
+  CHECK_EQ(s.Match("foobar", &v), false);
+  CHECK_EQ(v.size(), 0);
+}
+
+TEST(Set, EmptyAnchored) {
+  RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
 
+  CHECK_EQ(s.Compile(), true);
+
+  vector<int> v;
+  CHECK_EQ(s.Match("", &v), false);
+  CHECK_EQ(v.size(), 0);
+
+  v.clear();
+  CHECK_EQ(s.Match("foobar", &v), false);
+  CHECK_EQ(v.size(), 0);
 }
 
 }  // namespace re2
-
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index d54837c..9db41ee 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -136,6 +136,99 @@ static Test tests[] = {
   { "(){1}", "()" },
   { "(){1,}", "()+" },
   { "(){0,2}", "(?:()()?)?" },
+
+  // Test that coalescing occurs and that the resulting repeats are simplified.
+  // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
+  { "a*a*", "a*" },
+  { "a*a+", "a+" },
+  { "a*a?", "a*" },
+  { "a*a{2}", "aa+" },
+  { "a*a{2,}", "aa+" },
+  { "a*a{2,3}", "aa+" },
+  { "a+a*", "a+" },
+  { "a+a+", "aa+" },
+  { "a+a?", "a+" },
+  { "a+a{2}", "aaa+" },
+  { "a+a{2,}", "aaa+" },
+  { "a+a{2,3}", "aaa+" },
+  { "a?a*", "a*" },
+  { "a?a+", "a+" },
+  { "a?a?", "(?:aa?)?" },
+  { "a?a{2}", "aaa?" },
+  { "a?a{2,}", "aa+" },
+  { "a?a{2,3}", "aa(?:aa?)?" },
+  { "a{2}a*", "aa+" },
+  { "a{2}a+", "aaa+" },
+  { "a{2}a?", "aaa?" },
+  { "a{2}a{2}", "aaaa" },
+  { "a{2}a{2,}", "aaaa+" },
+  { "a{2}a{2,3}", "aaaaa?" },
+  { "a{2,}a*", "aa+" },
+  { "a{2,}a+", "aaa+" },
+  { "a{2,}a?", "aa+" },
+  { "a{2,}a{2}", "aaaa+" },
+  { "a{2,}a{2,}", "aaaa+" },
+  { "a{2,}a{2,3}", "aaaa+" },
+  { "a{2,3}a*", "aa+" },
+  { "a{2,3}a+", "aaa+" },
+  { "a{2,3}a?", "aa(?:aa?)?" },
+  { "a{2,3}a{2}", "aaaaa?" },
+  { "a{2,3}a{2,}", "aaaa+" },
+  { "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
+  // With a char class, any char and any byte:
+  { "\\d*\\d*", "[0-9]*" },
+  { ".*.*", ".*" },
+  { "\\C*\\C*", "\\C*" },
+  // FoldCase works, but must be consistent:
+  { "(?i)A*a*", "[Aa]*" },
+  { "(?i)a+A+", "[Aa][Aa]+" },
+  { "(?i)A*(?-i)a*", "[Aa]*a*" },
+  { "(?i)a+(?-i)A+", "[Aa]+A+" },
+  // NonGreedy works, but must be consistent:
+  { "a*?a*?", "a*?" },
+  { "a+?a+?", "aa+?" },
+  { "a*?a*", "a*?a*" },
+  { "a+a+?", "a+a+?" },
+  // The second element is the literal, char class, any char or any byte:
+  { "a*a", "a+" },
+  { "\\d*\\d", "[0-9]+" },
+  { ".*.", ".+" },
+  { "\\C*\\C", "\\C+" },
+  // FoldCase works, but must be consistent:
+  { "(?i)A*a", "[Aa]+" },
+  { "(?i)a+A", "[Aa][Aa]+" },
+  { "(?i)A*(?-i)a", "[Aa]*a" },
+  { "(?i)a+(?-i)A", "[Aa]+A" },
+  // The second element is a literal string that begins with the literal:
+  { "a*aa", "aa+" },
+  { "a*aab", "aa+b" },
+  // FoldCase works, but must be consistent:
+  { "(?i)a*aa", "[Aa][Aa]+" },
+  { "(?i)a*aab", "[Aa][Aa]+[Bb]" },
+  { "(?i)a*(?-i)aa", "[Aa]*aa" },
+  { "(?i)a*(?-i)aab", "[Aa]*aab" },
+  // Negative tests with mismatching ops:
+  { "a*b*", "a*b*" },
+  { "\\d*\\D*", "[0-9]*[^0-9]*" },
+  { "a+b", "a+b" },
+  { "\\d+\\D", "[0-9]+[^0-9]" },
+  { "a?bb", "a?bb" },
+  // Negative tests with capturing groups:
+  { "(a*)a*", "(a*)a*" },
+  { "a+(a)", "a+(a)" },
+  { "(a?)(aa)", "(a?)(aa)" },
+  // Just for fun:
+  { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
+
+  // During coalescing, the child of the repeat changes, so we build a new
+  // repeat. The new repeat must have the min and max of the old repeat.
+  // Failure to copy them results in min=0 and max=0 -> empty match.
+  { "(?:a*aab){2}", "aa+baa+b" },
+
+  // During coalescing, the child of the capture changes, so we build a new
+  // capture. The new capture must have the cap of the old capture.
+  // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
+  { "(a*aab)", "(aa+b)" },
 };
 
 TEST(TestSimplify, SimpleRegexps) {
diff --git a/re2/tostring.cc b/re2/tostring.cc
index 56ca7df..c59d4d9 100644
--- a/re2/tostring.cc
+++ b/re2/tostring.cc
@@ -94,6 +94,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
 
     case kRegexpCapture:
       t_->append("(");
+      if (re->cap() == 0)
+        LOG(DFATAL) << "kRegexpCapture cap() == 0";
       if (re->name()) {
         t_->append("?P<");
         t_->append(*re->name());
@@ -120,13 +122,13 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
 static void AppendLiteral(string *t, Rune r, bool foldcase) {
   if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
     t->append(1, '\\');
-    t->append(1, r);
+    t->append(1, static_cast<char>(r));
   } else if (foldcase && 'a' <= r && r <= 'z') {
     if ('a' <= r && r <= 'z')
       r += 'A' - 'a';
     t->append(1, '[');
-    t->append(1, r);
-    t->append(1, r + 'a' - 'A');
+    t->append(1, static_cast<char>(r));
+    t->append(1, static_cast<char>(r) + 'a' - 'A');
     t->append(1, ']');
   } else {
     AppendCCRange(t, r, r);
@@ -297,7 +299,7 @@ static void AppendCCChar(string* t, Rune r) {
   if (0x20 <= r && r <= 0x7E) {
     if (strchr("[]^-\\", r))
       t->append("\\");
-    t->append(1, r);
+    t->append(1, static_cast<char>(r));
     return;
   }
   switch (r) {
diff --git a/util/benchmark.cc b/util/benchmark.cc
index c3aad7e..8f72f3c 100644
--- a/util/benchmark.cc
+++ b/util/benchmark.cc
@@ -25,10 +25,17 @@ void Benchmark::Register() {
 }
 
 static int64 nsec() {
+#if defined(__APPLE__) || defined(_WIN32)
 	struct timeval tv;
 	if(gettimeofday(&tv, 0) < 0)
 		return -1;
 	return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000;
+#else
+	struct timespec tp;
+	if(clock_gettime(CLOCK_REALTIME, &tp) < 0)
+		return -1;
+	return (int64)tp.tv_sec*1000*1000*1000 + tp.tv_nsec;
+#endif
 }
 
 static int64 bytes;
diff --git a/util/mutex.h b/util/mutex.h
index 4a8de4c..9cb6de3 100644
--- a/util/mutex.h
+++ b/util/mutex.h
@@ -12,17 +12,38 @@
 
 #include <stdlib.h>
 
+#if !defined(_WIN32)
+#include <unistd.h>  // For POSIX options
+#endif
+
 namespace re2 {
 
-#define HAVE_PTHREAD 1
-#define HAVE_RWLOCK 1
+#if !defined(_WIN32)
+  // Possible values of POSIX options:
+  //   -1 means not supported,
+  //    0 means maybe supported (query at runtime),
+  //   >0 means supported.
+# if defined(_POSIX_THREADS) && _POSIX_THREADS > 0
+#   define HAVE_PTHREAD 1
+# else
+#   define HAVE_PTHREAD 0
+# endif
+# if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
+#   define HAVE_RWLOCK 1
+# else
+#   define HAVE_RWLOCK 0
+# endif
+#else
+# define HAVE_PTHREAD 0
+# define HAVE_RWLOCK 0
+#endif
 
 #if defined(NO_THREADS)
   typedef int MutexType;      // to keep a lock-count
-#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+#elif HAVE_PTHREAD && HAVE_RWLOCK
   // Needed for pthread_rwlock_*.  If it causes problems, you could take it
-  // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
-  // *does* cause problems for FreeBSD, or MacOSX, but isn't needed
+  // out, but then you'd have to set HAVE_RWLOCK to 0 (at least on linux --
+  // it *does* cause problems for FreeBSD, or MacOSX, but isn't needed
   // for locking there.)
 # ifdef __linux__
 #   undef _XOPEN_SOURCE
@@ -30,10 +51,10 @@ namespace re2 {
 # endif
 # include <pthread.h>
   typedef pthread_rwlock_t MutexType;
-#elif defined(HAVE_PTHREAD)
+#elif HAVE_PTHREAD
 # include <pthread.h>
   typedef pthread_mutex_t MutexType;
-#elif defined(WIN32)
+#elif defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN  // We only need minimal includes
 # ifdef GMUTEX_TRYLOCK
   // We need Windows NT or later for TryEnterCriticalSection().  If you
@@ -102,7 +123,7 @@ bool Mutex::TryLock()      { if (mutex_) return false; Lock(); return true; }
 void Mutex::ReaderLock()   { assert(++mutex_ > 0); }
 void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
 
-#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+#elif HAVE_PTHREAD && HAVE_RWLOCK
 
 #define SAFE_PTHREAD(fncall)  do { if ((fncall) != 0) abort(); } while (0)
 
@@ -116,7 +137,7 @@ void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
 
 #undef SAFE_PTHREAD
 
-#elif defined(HAVE_PTHREAD)
+#elif HAVE_PTHREAD
 
 #define SAFE_PTHREAD(fncall)  do { if ((fncall) != 0) abort(); } while (0)
 
@@ -129,7 +150,7 @@ void Mutex::ReaderLock()   { Lock(); }      // we don't have read-write locks
 void Mutex::ReaderUnlock() { Unlock(); }
 #undef SAFE_PTHREAD
 
-#elif defined(WIN32)
+#elif defined(_WIN32)
 
 Mutex::Mutex()             { InitializeCriticalSection(&mutex_); }
 Mutex::~Mutex()            { DeleteCriticalSection(&mutex_); }
@@ -186,7 +207,7 @@ class WriterMutexLock {
 #define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
 
 // Provide safe way to declare and use global, linker-initialized mutex. Sigh.
-#ifdef HAVE_PTHREAD
+#if HAVE_PTHREAD
 
 #define GLOBAL_MUTEX(name) \
 	static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
diff --git a/util/rune.cc b/util/rune.cc
index 26442b0..e6231ce 100644
--- a/util/rune.cc
+++ b/util/rune.cc
@@ -133,7 +133,7 @@ runetochar(char *str, const Rune *rune)
 	 */
 	c = *rune;
 	if(c <= Rune1) {
-		str[0] = c;
+		str[0] = static_cast<char>(c);
 		return 1;
 	}
 
@@ -142,7 +142,7 @@ runetochar(char *str, const Rune *rune)
 	 *	0080-07FF => T2 Tx
 	 */
 	if(c <= Rune2) {
-		str[0] = T2 | (c >> 1*Bitx);
+		str[0] = T2 | static_cast<char>(c >> 1*Bitx);
 		str[1] = Tx | (c & Maskx);
 		return 2;
 	}
@@ -161,9 +161,9 @@ runetochar(char *str, const Rune *rune)
 	 *	0800-FFFF => T3 Tx Tx
 	 */
 	if (c <= Rune3) {
-		str[0] = T3 |  (c >> 2*Bitx);
+		str[0] = T3 | static_cast<char>(c >> 2*Bitx);
 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-		str[2] = Tx |  (c & Maskx);
+		str[2] = Tx | (c & Maskx);
 		return 3;
 	}
 
@@ -171,7 +171,7 @@ runetochar(char *str, const Rune *rune)
 	 * four character sequence (21-bit value)
 	 *     10000-1FFFFF => T4 Tx Tx Tx
 	 */
-	str[0] = T4 | (c >> 3*Bitx);
+	str[0] = T4 | static_cast<char>(c >> 3*Bitx);
 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
 	str[3] = Tx | (c & Maskx);
diff --git a/util/stringprintf.cc b/util/stringprintf.cc
index 79ba7fc..e71d993 100644
--- a/util/stringprintf.cc
+++ b/util/stringprintf.cc
@@ -4,7 +4,7 @@
 
 #include "util/util.h"
 
-namespace re2 { 
+namespace re2 {
 
 static void StringAppendV(string* dst, const char* format, va_list ap) {
   // First try with a small fixed size buffer
@@ -38,7 +38,14 @@ static void StringAppendV(string* dst, const char* format, va_list ap) {
 
     // Restore the va_list before we use it again
     va_copy(backup_ap, ap);
+#if !defined(_WIN32)
     result = vsnprintf(buf, length, format, backup_ap);
+#else
+    // On Windows, the function takes five arguments, not four. With an array,
+    // the buffer size will be inferred, but not with a pointer. C'est la vie.
+    // (See https://github.com/google/re2/issues/40 for more details.)
+    result = vsnprintf(buf, length, _TRUNCATE, format, backup_ap);
+#endif
     va_end(backup_ap);
 
     if ((result >= 0) && (result < length)) {
diff --git a/util/test.cc b/util/test.cc
index 220bfb0..85055b2 100644
--- a/util/test.cc
+++ b/util/test.cc
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include <stdio.h>
-#ifndef WIN32
+#ifndef _WIN32
 #include <sys/resource.h>
 #endif
 #include "util/test.h"
@@ -25,7 +25,7 @@ void RegisterTest(void (*fn)(void), const char *name) {
 
 namespace re2 {
 int64 VirtualProcessSize() {
-#ifdef WIN32
+#ifdef _WIN32
   return 0;
 #else
   struct rusage ru;
diff --git a/util/util.h b/util/util.h
index a7ccafe..f49ebd1 100644
--- a/util/util.h
+++ b/util/util.h
@@ -12,10 +12,10 @@
 #include <stddef.h>     // For size_t
 #include <assert.h>
 #include <stdarg.h>
-#include <time.h>
+#include <time.h>       // For clock_gettime, CLOCK_REALTIME
 #include <ctype.h>      // For isdigit, isalpha
 
-#if !defined(WIN32)
+#if !defined(_WIN32)
 #include <sys/time.h>   // For gettimeofday
 #endif
 
@@ -53,7 +53,7 @@ using std::tr1::unordered_set;
 #else
 
 #include <unordered_set>
-#if defined(WIN32)
+#if defined(_WIN32)
 using std::tr1::unordered_set;
 #else
 using std::unordered_set;
@@ -61,7 +61,7 @@ using std::unordered_set;
 
 #endif
 
-#ifdef WIN32
+#ifdef _WIN32
 
 #define snprintf _snprintf_s
 #define sprintf sprintf_s
@@ -72,7 +72,6 @@ using std::unordered_set;
 #define vsnprintf vsnprintf_s
 
 #pragma warning(disable: 4018) // signed/unsigned mismatch
-#pragma warning(disable: 4244) // possible data loss in int conversion
 #pragma warning(disable: 4800) // conversion from int to bool
 
 #endif
diff --git a/util/valgrind.cc b/util/valgrind.cc
index ee257d0..82f9a4c 100644
--- a/util/valgrind.cc
+++ b/util/valgrind.cc
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "util/util.h"
-#ifndef WIN32
+#ifndef _WIN32
 #include "util/valgrind.h"
 #endif
author	DongHun Kwak <dh0128.kwak@samsung.com>	2016-11-21 16:55:18 +0900
committer	DongHun Kwak <dh0128.kwak@samsung.com>	2016-11-21 16:55:19 +0900
commit	873ba3ec5c36fe51e9aeadaac126a04d965fc053 (patch)
tree	9624dd07f5e55939549e86add68cda90119f8af4
parent	abbaebf6f92b878517d8061fae50a8b9ca84ca52 (diff)
download	re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.gz re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.bz2 re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.zip