summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:55:18 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:55:19 +0900
commit873ba3ec5c36fe51e9aeadaac126a04d965fc053 (patch)
tree9624dd07f5e55939549e86add68cda90119f8af4
parentabbaebf6f92b878517d8061fae50a8b9ca84ca52 (diff)
downloadre2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.gz
re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.bz2
re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.zip
Imported Upstream version 20150801upstream/20150801
Change-Id: If6ba79b7356ae34e4008c43850b4390163058f2f Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r--.gitignore1
-rw-r--r--Makefile46
-rw-r--r--README3
-rwxr-xr-xbenchlog/benchplot.py98
-rw-r--r--libre2.symbols.darwin7
-rw-r--r--re2/compile.cc21
-rw-r--r--re2/dfa.cc18
-rwxr-xr-xre2/make_perl_groups.pl34
-rwxr-xr-xre2/make_unicode_casefold.py3
-rw-r--r--re2/nfa.cc2
-rw-r--r--re2/parse.cc71
-rw-r--r--re2/prog.cc8
-rw-r--r--re2/prog.h4
-rw-r--r--re2/re2.cc48
-rw-r--r--re2/regexp.cc10
-rw-r--r--re2/regexp.h4
-rw-r--r--re2/simplify.cc312
-rw-r--r--re2/testing/dump.cc2
-rw-r--r--re2/testing/parse_test.cc17
-rw-r--r--re2/testing/set_test.cc32
-rw-r--r--re2/testing/simplify_test.cc93
-rw-r--r--re2/tostring.cc10
-rw-r--r--util/benchmark.cc7
-rw-r--r--util/mutex.h43
-rw-r--r--util/rune.cc10
-rw-r--r--util/stringprintf.cc9
-rw-r--r--util/test.cc4
-rw-r--r--util/util.h9
-rw-r--r--util/valgrind.cc2
29 files changed, 738 insertions, 190 deletions
diff --git a/.gitignore b/.gitignore
index 3db23b7..a671fe2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
*.orig
core
obj/
+benchlog.*
diff --git a/Makefile b/Makefile
index 801304d..1245f35 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,6 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
-all: obj/libre2.a obj/so/libre2.so
-
# to build against PCRE for testing or benchmarking,
# uncomment the next two lines
# CCPCRE=-I/usr/local/include -DUSEPCRE
@@ -38,13 +36,24 @@ SONAME=0
# REBUILD_TABLES=1
ifeq ($(shell uname),Darwin)
-MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
+SOEXT=dylib
+SOEXTVER=$(SONAME).$(SOEXT)
+SOEXTVER00=$(SONAME).0.0.$(SOEXT)
+MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin
else ifeq ($(shell uname),SunOS)
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),-M,libre2.symbols $(LDFLAGS)
+SOEXT=so
+SOEXTVER=$(SOEXT).$(SONAME)
+SOEXTVER00=$(SOEXT).$(SONAME).0.0
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(LDFLAGS)
else
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script,libre2.symbols $(LDFLAGS)
+SOEXT=so
+SOEXTVER=$(SOEXT).$(SONAME)
+SOEXTVER00=$(SOEXT).$(SONAME).0.0
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(LDFLAGS)
endif
+all: obj/libre2.a obj/so/libre2.$(SOEXT)
+
INSTALL_HFILES=\
re2/filtered_re2.h\
re2/re2.h\
@@ -177,10 +186,10 @@ obj/dbg/libre2.a: $(DOFILES)
@mkdir -p obj/dbg
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
-obj/so/libre2.so: $(SOFILES)
+obj/so/libre2.$(SOEXT): $(SOFILES)
@mkdir -p obj/so
- $(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES)
- ln -sf libre2.so.$(SONAME) $@
+ $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES)
+ ln -sf libre2.$(SOEXTVER) $@
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
@mkdir -p obj/test
@@ -190,7 +199,7 @@ obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/
@mkdir -p obj/dbg/test
$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
-obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
+obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
@mkdir -p obj/so/test
$(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
@@ -204,6 +213,8 @@ re2/perl_groups.cc: re2/make_perl_groups.pl
re2/unicode_%.cc: re2/make_unicode_%.py
python $< > $@
+
+.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
endif
distclean: clean
@@ -246,13 +257,13 @@ shared-bigtest: $(STESTS) $(SBIGTESTS)
benchmark: obj/test/regexp_benchmark
-install: obj/libre2.a obj/so/libre2.so
+install: obj/libre2.a obj/so/libre2.$(SOEXT)
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
- $(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
- ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
- ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so
+ $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
+ ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
+ ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc
testinstall:
@@ -267,7 +278,7 @@ endif
benchlog: obj/test/regexp_benchmark
(echo '==BENCHMARK==' `hostname` `date`; \
- (uname -a; $(CXX) --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
+ (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
echo; \
./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
@@ -279,8 +290,9 @@ benchlog: obj/test/regexp_benchmark
obj/test/% obj/so/test/% obj/dbg/test/%
log:
- make clean
- make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/exhaustive{,1,2,3}_test
+ $(MAKE) clean
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
+ $(filter obj/test/exhaustive%_test,$(BIGTESTS))
echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt
echo '#' $$(date) >>re2-exhaustive.txt
obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
@@ -288,7 +300,7 @@ log:
obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt
- make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
+ $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
echo '#' RE2 basic search tests built by make $@ >re2-search.txt
echo '#' $$(date) >>re2-search.txt
obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
diff --git a/README b/README
index 3cba956..4c5875b 100644
--- a/README
+++ b/README
@@ -25,6 +25,7 @@ under the BSD-style license found in the LICENSE file.
RE2's native language is C++.
An Erlang wrapper is at https://github.com/tuncer/re2/.
An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
-A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN.
+An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM.
+A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN.
A Python wrapper is at https://github.com/facebook/pyre2/.
A Ruby wrapper is at https://github.com/axic/rre2/.
diff --git a/benchlog/benchplot.py b/benchlog/benchplot.py
new file mode 100755
index 0000000..104abe8
--- /dev/null
+++ b/benchlog/benchplot.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+import argparse # for ArgumentParser
+import subprocess # for Popen
+import tempfile # for NamedTemporaryFile
+import os # for remove
+
+class gnuplot(object):
+
+ output = "result.png"
+
+ script = """
+ set terminal png size 1024, 768
+ set output "{}.png"
+ set title "re2 benchlog"
+ set datafile separator ";"
+ set grid x y
+ set ylabel "MB/s"
+ set autoscale
+ plot """
+
+ template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n"""
+
+ benchdata = dict()
+ tempfiles = []
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ """
+ remove all temporary files
+ """
+
+ for filename in self.tempfiles:
+ os.remove(filename)
+
+ def parse_re2_benchlog(self, filename):
+ """
+ parse the input benchlog and return a dictionary contain bench data
+ """
+
+ benchdata = self.benchdata
+
+ with open(filename) as f:
+
+ for raw in f.readlines():
+
+ data = raw.split('\t')
+
+ if len(data) == 4:
+
+ data = data[0].split('/') + data[1:]
+ data = list(map(str.strip, data))
+
+ if not benchdata.get(data[0]):
+ benchdata[data[0]] = [ data[1:] ]
+ else:
+ benchdata[data[0]].append(data[1:])
+
+ def gen_csv(self):
+ """
+ generate temporary csv files
+ """
+
+ for name, data in self.benchdata.items():
+
+ with tempfile.NamedTemporaryFile(delete=False) as f:
+
+ for index, line in enumerate(data):
+ f.write('{};{}\n'.format(index, ';'.join(line)).encode())
+
+ self.tempfiles.append(f.name)
+ self.script = self.script + self.template.format(f.name, name)
+
+ def run(self):
+ self.gen_csv()
+ script = self.script[:-3].format(self.output)
+ command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
+ command.communicate(script.encode())
+
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser(description='generate plots for benchlog')
+ parser.add_argument('benchlog', type=str, help='benchlog generated by re2')
+ args = parser.parse_args()
+
+ try:
+ subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
+ except FileNotFoundError:
+ print('you can install "gnuplot" to generate plots automatically')
+ exit(1)
+
+ with gnuplot() as plot:
+ plot.output = args.benchlog
+ plot.parse_re2_benchlog(args.benchlog)
+ plot.run()
diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin
index bc6f1af..4207f87 100644
--- a/libre2.symbols.darwin
+++ b/libre2.symbols.darwin
@@ -6,7 +6,12 @@ __ZNK3re23RE2*
__ZN3re211StringPiece*
__ZNK3re211StringPiece*
# operator<<(std::ostream&, re2::StringPiece const&)
-__ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
+# Seen with libstdc++ on 10.8 and below:
+# __ZlsRSoRKN3re211StringPieceE
+# Seen with libc++ on 10.9 and above:
+# __ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
+# Note that "ls" means operator<<, so this is not overly broad.
+__Zls*RKN3re211StringPieceE
# re2::FilteredRE2*
__ZN3re211FilteredRE2*
__ZNK3re211FilteredRE2*
diff --git a/re2/compile.cc b/re2/compile.cc
index 6478def..e5d6088 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -435,7 +435,10 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
int j;
for (int i = 0; i < 256; i = j) {
- for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
+ for (j = i + 1; j < 256 &&
+ Prog::IsWordChar(static_cast<uint8>(i)) ==
+ Prog::IsWordChar(static_cast<uint8>(j));
+ j++)
;
prog_->MarkByteRange(i, j-1);
}
@@ -503,7 +506,10 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
return UncachedRuneByteSuffix(lo, hi, foldcase, next);
}
- uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
+ uint64 key = (uint64)next << 17 |
+ (uint64)lo << 9 |
+ (uint64)hi << 1 |
+ (uint64)foldcase;
map<uint64, int>::iterator it = rune_cache_.find(key);
if (it != rune_cache_.end())
return it->second;
@@ -555,7 +561,8 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
return;
if (hi > 0xFF)
hi = 0xFF;
- AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
+ AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
+ foldcase, 0));
}
// Table describing how to make a UTF-8 matching machine
@@ -596,7 +603,8 @@ void Compiler::Add_80_10ffff() {
int next = 0;
if (p.next >= 0)
next = inst[p.next];
- inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
+ inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo),
+ static_cast<uint8>(p.hi), false, next);
if ((p.lo & 0xC0) != 0x80)
AddSuffix(inst[i]);
}
@@ -625,7 +633,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
// ASCII range is always a special case.
if (hi < Runeself) {
- AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
+ AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
+ foldcase, 0));
return;
}
@@ -979,7 +988,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
if (m > Prog::Inst::kMaxInst)
m = Prog::Inst::kMaxInst;
- max_inst_ = m;
+ max_inst_ = static_cast<int>(m);
}
anchor_ = anchor;
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 25e0220..11eb77f 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -143,7 +143,7 @@ class DFA {
if (sizeof(size_t) == sizeof(uint32))
return Hash32StringWithSeed(s, len, a->flag_);
else
- return Hash64StringWithSeed(s, len, a->flag_);
+ return static_cast<size_t>(Hash64StringWithSeed(s, len, a->flag_));
}
#ifdef STL_MSVC
// Less than operator.
@@ -451,7 +451,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
(sizeof(int)+sizeof(int)) * 2; // q0, q1
mem_budget_ -= nastack_ * sizeof(int); // astack
if (mem_budget_ < 0) {
- LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
+ LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",
prog_->size(), max_mem);
init_failed_ = true;
return;
@@ -466,7 +466,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) +
(prog_->bytemap_range()+1)*sizeof(State*);
if (state_budget_ < 20*one_state) {
- LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
+ LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld",
prog_->size(), max_mem);
init_failed_ = true;
return;
@@ -864,8 +864,10 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) {
break;
case kInstEmptyWidth:
- if ((ip->empty() & flag) == ip->empty())
- stk[nstk++] = ip->out();
+ // Continue on if we have all the right flag bits.
+ if (ip->empty() & ~flag)
+ break;
+ stk[nstk++] = ip->out();
break;
}
}
@@ -1014,7 +1016,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
// byte processed was a word character. Use that info to
// insert empty-width (non-)word boundaries.
bool islastword = state->flag_ & kFlagLastWord;
- bool isword = (c != kByteEndText && Prog::IsWordChar(c));
+ bool isword = (c != kByteEndText && Prog::IsWordChar(static_cast<uint8>(c)));
if (isword == islastword)
beforeflag |= kEmptyNonWordBoundary;
else
@@ -2034,7 +2036,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
if (ns == FullMatchState ||
(ns > SpecialStateMax && ns->ninst_ > 0)) {
extended = true;
- min->append(1, j);
+ min->append(1, static_cast<char>(j));
s = ns;
break;
}
@@ -2064,7 +2066,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
if (ns == FullMatchState ||
(ns > SpecialStateMax && ns->ninst_ > 0)) {
extended = true;
- max->append(1, j);
+ max->append(1, static_cast<char>(j));
s = ns;
break;
}
diff --git a/re2/make_perl_groups.pl b/re2/make_perl_groups.pl
index 8c1f4f6..d9fcdaf 100755
--- a/re2/make_perl_groups.pl
+++ b/re2/make_perl_groups.pl
@@ -32,14 +32,20 @@
"\\w",
);
+%overrides = (
+ # Prior to Perl 5.18, \s did not match vertical tab.
+ # RE2 preserves that original behaviour.
+ "\\s:11" => 0,
+);
+
sub ComputeClass($) {
+ my ($cname) = @_;
my @ranges;
- my ($class) = @_;
- my $regexp = "[$class]";
+ my $regexp = qr/[$cname]/;
my $start = -1;
for (my $i=0; $i<=129; $i++) {
if ($i == 129) { $i = 256; }
- if ($i <= 128 && chr($i) =~ $regexp) {
+ if ($i <= 128 && ($overrides{"$cname:$i"} // chr($i) =~ $regexp)) {
if ($start < 0) {
$start = $i;
}
@@ -54,15 +60,15 @@ sub ComputeClass($) {
}
sub PrintClass($$@) {
- my ($cname, $name, @ranges) = @_;
- print "static const URange16 code${cname}[] = { /* $name */\n";
+ my ($cnum, $cname, @ranges) = @_;
+ print "static const URange16 code${cnum}[] = { /* $cname */\n";
for (my $i=0; $i<@ranges; $i++) {
my @a = @{$ranges[$i]};
printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1];
}
print "};\n";
my $n = @ranges;
- my $escname = $name;
+ my $escname = $cname;
$escname =~ s/\\/\\\\/g;
$negname = $escname;
if ($negname =~ /:/) {
@@ -70,25 +76,25 @@ sub PrintClass($$@) {
} else {
$negname =~ y/a-z/A-Z/;
}
- return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
+ return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
}
-my $gen = 0;
+my $cnum = 0;
sub PrintClasses($@) {
- my ($cname, @classes) = @_;
+ my ($pname, @classes) = @_;
my @entries;
- foreach my $cl (@classes) {
- my @ranges = ComputeClass($cl);
- push @entries, PrintClass(++$gen, $cl, @ranges);
+ foreach my $cname (@classes) {
+ my @ranges = ComputeClass($cname);
+ push @entries, PrintClass(++$cnum, $cname, @ranges);
}
- print "const UGroup ${cname}_groups[] = {\n";
+ print "const UGroup ${pname}_groups[] = {\n";
foreach my $e (@entries) {
print "\t$e,\n";
}
print "};\n";
my $count = @entries;
- print "const int num_${cname}_groups = $count;\n";
+ print "const int num_${pname}_groups = $count;\n";
}
print <<EOF;
diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py
index 8f90cfe..d215eb1 100755
--- a/re2/make_unicode_casefold.py
+++ b/re2/make_unicode_casefold.py
@@ -9,7 +9,8 @@
"""Generate C++ table for Unicode case folding."""
-import unicode, sys
+import sys
+import unicode
_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 57a18fe..3ca275e 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -468,7 +468,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
- wasword = Prog::IsWordChar(c);
+ wasword = Prog::IsWordChar(static_cast<uint8>(c));
}
// Loop over the text, stepping the machine.
diff --git a/re2/parse.cc b/re2/parse.cc
index e6b27d2..3c15bbd 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -215,7 +215,7 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) {
// single characters (e.g., [.] instead of \.), and some
// analysis does better with fewer character classes.
// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
- if (re->op_ == kRegexpCharClass) {
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
re->ccb_->RemoveAbove(rune_max_);
if (re->ccb_->size() == 1) {
Rune r = re->ccb_->begin()->lo;
@@ -576,13 +576,6 @@ bool Regexp::ParseState::DoLeftParenNoCapture() {
return PushRegexp(re);
}
-// Adds r to cc, along with r's upper case if foldascii is set.
-static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
- cc->AddRange(r, r);
- if (foldascii && 'a' <= r && r <= 'z')
- cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
-}
-
// Processes a vertical bar in the input.
bool Regexp::ParseState::DoVerticalBar() {
MaybeConcatString(-1, NoParseFlags);
@@ -596,46 +589,34 @@ bool Regexp::ParseState::DoVerticalBar() {
Regexp* r1;
Regexp* r2;
if ((r1 = stacktop_) != NULL &&
- (r2 = stacktop_->down_) != NULL &&
+ (r2 = r1->down_) != NULL &&
r2->op() == kVerticalBar) {
- // If above and below vertical bar are literal or char class,
- // can merge into a single char class.
Regexp* r3;
- if ((r1->op() == kRegexpLiteral ||
- r1->op() == kRegexpCharClass ||
- r1->op() == kRegexpAnyChar) &&
- (r3 = r2->down_) != NULL) {
- Rune rune;
- switch (r3->op()) {
- case kRegexpLiteral: // convert to char class
- rune = r3->rune_;
- r3->op_ = kRegexpCharClass;
- r3->cc_ = NULL;
- r3->ccb_ = new CharClassBuilder;
- AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
- // fall through
- case kRegexpCharClass:
- if (r1->op() == kRegexpLiteral)
- AddLiteral(r3->ccb_, r1->rune_,
- r1->parse_flags_ & Regexp::FoldCase);
- else if (r1->op() == kRegexpCharClass)
- r3->ccb_->AddCharClass(r1->ccb_);
- if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
- delete r3->ccb_;
- r3->ccb_ = NULL;
- r3->op_ = kRegexpAnyChar;
- }
- // fall through
- case kRegexpAnyChar:
- // pop r1
- stacktop_ = r2;
- r1->Decref();
- return true;
- default:
- break;
+ if ((r3 = r2->down_) != NULL &&
+ (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
+ // AnyChar is above or below the vertical bar. Let it subsume
+ // the other when the other is Literal, CharClass or AnyChar.
+ if (r3->op() == kRegexpAnyChar &&
+ (r1->op() == kRegexpLiteral ||
+ r1->op() == kRegexpCharClass ||
+ r1->op() == kRegexpAnyChar)) {
+ // Discard r1.
+ stacktop_ = r2;
+ r1->Decref();
+ return true;
+ }
+ if (r1->op() == kRegexpAnyChar &&
+ (r3->op() == kRegexpLiteral ||
+ r3->op() == kRegexpCharClass ||
+ r3->op() == kRegexpAnyChar)) {
+ // Rearrange the stack and discard r3.
+ r1->down_ = r3->down_;
+ r2->down_ = r1;
+ stacktop_ = r2;
+ r3->Decref();
+ return true;
}
}
-
// Swap r1 below vertical bar (r2).
r1->down_ = r2->down_;
r2->down_ = r1;
@@ -1166,7 +1147,7 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
if (r >= 0) {
re1->op_ = kRegexpLiteral;
re1->rune_ = r;
- re1->parse_flags_ = flags;
+ re1->parse_flags_ = static_cast<uint16>(flags);
return true;
}
diff --git a/re2/prog.cc b/re2/prog.cc
index f326ffd..499f560 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -25,7 +25,7 @@ void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
set_out_opcode(out, kInstByteRange);
lo_ = lo & 0xFF;
hi_ = hi & 0xFF;
- foldcase_ = foldcase;
+ foldcase_ = foldcase & 0xFF;
}
void Prog::Inst::InitCapture(int cap, uint32 out) {
@@ -327,12 +327,12 @@ void Prog::ComputeByteMap() {
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
for (int i = 0; i < 256; i++)
- unbytemap_[bytemap_[i]] = i;
+ unbytemap_[bytemap_[i]] = static_cast<uint8>(i);
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
- bytemap_[i] = i;
- unbytemap_[i] = i;
+ bytemap_[i] = static_cast<uint8>(i);
+ unbytemap_[i] = static_cast<uint8>(i);
}
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
diff --git a/re2/prog.h b/re2/prog.h
index 26c8676..3e6be8f 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -201,10 +201,10 @@ class Prog {
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
- int64 size() { return size_; }
+ int size() { return size_; }
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
- int64 byte_inst_count() { return byte_inst_count_; }
+ int byte_inst_count() { return byte_inst_count_; }
const Bitmap<256>& byterange() { return byterange_; }
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
diff --git a/re2/re2.cc b/re2/re2.cc
index 4e8af90..099491c 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -11,7 +11,6 @@
#include <stdio.h>
#include <string>
-#include <pthread.h>
#include <errno.h>
#include "util/atomicops.h"
#include "util/util.h"
@@ -879,31 +878,30 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite,
const StringPiece *vec, int veclen) const {
for (const char *s = rewrite.data(), *end = s + rewrite.size();
s < end; s++) {
- int c = *s;
- if (c == '\\') {
- s++;
- c = (s < end) ? *s : -1;
- if (isdigit(c)) {
- int n = (c - '0');
- if (n >= veclen) {
- if (options_.log_errors()) {
- LOG(ERROR) << "requested group " << n
- << " in regexp " << rewrite.data();
- }
- return false;
+ if (*s != '\\') {
+ out->push_back(*s);
+ continue;
+ }
+ s++;
+ int c = (s < end) ? *s : -1;
+ if (isdigit(c)) {
+ int n = (c - '0');
+ if (n >= veclen) {
+ if (options_.log_errors()) {
+ LOG(ERROR) << "requested group " << n
+ << " in regexp " << rewrite.data();
}
- StringPiece snip = vec[n];
- if (snip.size() > 0)
- out->append(snip.data(), snip.size());
- } else if (c == '\\') {
- out->push_back('\\');
- } else {
- if (options_.log_errors())
- LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
return false;
}
+ StringPiece snip = vec[n];
+ if (snip.size() > 0)
+ out->append(snip.data(), snip.size());
+ } else if (c == '\\') {
+ out->push_back('\\');
} else {
- out->push_back(c);
+ if (options_.log_errors())
+ LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
+ return false;
}
}
return true;
@@ -1102,7 +1100,7 @@ bool RE2::Arg::parse_short_radix(const char* str,
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if ((short)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<short*>(dest)) = r;
+ *(reinterpret_cast<short*>(dest)) = (short)r;
return true;
}
@@ -1114,7 +1112,7 @@ bool RE2::Arg::parse_ushort_radix(const char* str,
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if ((ushort)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<unsigned short*>(dest)) = r;
+ *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r;
return true;
}
@@ -1200,7 +1198,7 @@ static bool parse_double_float(const char* str, int n, bool isfloat, void *dest)
if (errno) return false;
if (dest == NULL) return true;
if (isfloat) {
- *(reinterpret_cast<float*>(dest)) = r;
+ *(reinterpret_cast<float*>(dest)) = (float)r;
} else {
*(reinterpret_cast<double*>(dest)) = r;
}
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 3667fda..b09b5ca 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -14,7 +14,7 @@ namespace re2 {
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
- : op_(op),
+ : op_(static_cast<uint8>(op)),
simple_(false),
parse_flags_(static_cast<uint16>(parse_flags)),
ref_(1),
@@ -107,7 +107,7 @@ void Regexp::Decref() {
GLOBAL_MUTEX_LOCK(ref_mutex);
int r = (*ref_map)[this] - 1;
if (r < kMaxRef) {
- ref_ = r;
+ ref_ = static_cast<uint16>(r);
ref_map->erase(this);
} else {
(*ref_map)[this] = r;
@@ -651,7 +651,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
- (*prefix)[j] = re->runes_[j];
+ (*prefix)[j] = static_cast<char>(re->runes_[j]);
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
@@ -660,7 +660,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
- *p++ = r;
+ *p++ = static_cast<char>(r);
else
p += runetochar(p, &r);
}
@@ -670,7 +670,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
- prefix->append(1, re->rune_);
+ prefix->append(1, static_cast<char>(re->rune_));
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
diff --git a/re2/regexp.h b/re2/regexp.h
index 60eb993..b49ce0d 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -211,7 +211,8 @@ class RegexpStatus {
DISALLOW_COPY_AND_ASSIGN(RegexpStatus);
};
-// Walker to implement Simplify.
+// Walkers to implement Simplify.
+class CoalesceWalker;
class SimplifyWalker;
// Compiled form; see prog.h
@@ -353,6 +354,7 @@ class Regexp {
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
+ friend class CoalesceWalker;
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
diff --git a/re2/simplify.cc b/re2/simplify.cc
index 9c0021e..d14483f 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc
@@ -97,6 +97,36 @@ bool Regexp::ComputeSimple() {
}
// Walker subclass used by Simplify.
+// Coalesces runs of star/plus/quest/repeat of the same literal along with any
+// occurrences of that literal into repeats of that literal. It also works for
+// char classes, any char and any byte.
+// PostVisit creates the coalesced result, which should then be simplified.
+class CoalesceWalker : public Regexp::Walker<Regexp*> {
+ public:
+ CoalesceWalker() {}
+ virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
+ Regexp** child_args, int nchild_args);
+ virtual Regexp* Copy(Regexp* re);
+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+ // These functions are declared inside CoalesceWalker so that
+ // they can edit the private fields of the Regexps they construct.
+
+ // Returns true if r1 and r2 can be coalesced. In particular, ensures that
+ // the parse flags are consistent. (They will not be checked again later.)
+ static bool CanCoalesce(Regexp* r1, Regexp* r2);
+
+ // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
+ // will be empty match and the coalesced op. In other cases, where part of a
+ // literal string was removed to be coalesced, the array elements afterwards
+ // will be the coalesced op and the remainder of the literal string.
+ static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
+
+ DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
+};
+
+// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
@@ -104,9 +134,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
- virtual Regexp* PostVisit(Regexp* re,
- Regexp* parent_arg,
- Regexp* pre_arg,
+ virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
@@ -143,14 +171,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
// Caller must Decref() return value when done with it.
Regexp* Regexp::Simplify() {
- if (simple_)
- return Incref();
- SimplifyWalker w;
- return w.Walk(this, NULL);
+ CoalesceWalker cw;
+ Regexp* cre = cw.Walk(this, NULL);
+ if (cre == NULL)
+ return cre;
+ SimplifyWalker sw;
+ Regexp* sre = sw.Walk(cre, NULL);
+ cre->Decref();
+ return sre;
}
#define Simplify DontCallSimplify // Avoid accidental recursion
+// Utility function for PostVisit implementations that compares re->sub() with
+// child_args to determine whether any child_args changed. In the common case,
+// where nothing changed, calls Decref() for all child_args and returns false,
+// so PostVisit must return re->Incref(). Otherwise, returns true.
+static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
+ for (int i = 0; i < re->nsub(); i++) {
+ Regexp* sub = re->sub()[i];
+ Regexp* newsub = child_args[i];
+ if (newsub != sub)
+ return true;
+ }
+ for (int i = 0; i < re->nsub(); i++) {
+ Regexp* newsub = child_args[i];
+ newsub->Decref();
+ }
+ return false;
+}
+
+Regexp* CoalesceWalker::Copy(Regexp* re) {
+ return re->Incref();
+}
+
+Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+ // This should never be called, since we use Walk and not
+ // WalkExponential.
+ LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
+ return re->Incref();
+}
+
+Regexp* CoalesceWalker::PostVisit(Regexp* re,
+ Regexp* parent_arg,
+ Regexp* pre_arg,
+ Regexp** child_args,
+ int nchild_args) {
+ if (re->nsub() == 0)
+ return re->Incref();
+
+ if (re->op() != kRegexpConcat) {
+ if (!ChildArgsChanged(re, child_args))
+ return re->Incref();
+
+ // Something changed. Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ // Repeats and Captures have additional data that must be copied.
+ if (re->op() == kRegexpRepeat) {
+ nre->min_ = re->min();
+ nre->max_ = re->max();
+ } else if (re->op() == kRegexpCapture) {
+ nre->cap_ = re->cap();
+ }
+ return nre;
+ }
+
+ bool can_coalesce = false;
+ for (int i = 0; i < re->nsub(); i++) {
+ if (i+1 < re->nsub() &&
+ CanCoalesce(child_args[i], child_args[i+1])) {
+ can_coalesce = true;
+ break;
+ }
+ }
+ if (!can_coalesce) {
+ if (!ChildArgsChanged(re, child_args))
+ return re->Incref();
+
+ // Something changed. Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ return nre;
+ }
+
+ for (int i = 0; i < re->nsub(); i++) {
+ if (i+1 < re->nsub() &&
+ CanCoalesce(child_args[i], child_args[i+1]))
+ DoCoalesce(&child_args[i], &child_args[i+1]);
+ }
+ // Determine how many empty matches were left by DoCoalesce.
+ int n = 0;
+ for (int i = n; i < re->nsub(); i++) {
+ if (child_args[i]->op() == kRegexpEmptyMatch)
+ n++;
+ }
+ // Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub() - n);
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0, j = 0; i < re->nsub(); i++) {
+ if (child_args[i]->op() == kRegexpEmptyMatch) {
+ child_args[i]->Decref();
+ continue;
+ }
+ nre_subs[j] = child_args[i];
+ j++;
+ }
+ return nre;
+}
+
+bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
+ // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
+ // any byte.
+ if ((r1->op() == kRegexpStar ||
+ r1->op() == kRegexpPlus ||
+ r1->op() == kRegexpQuest ||
+ r1->op() == kRegexpRepeat) &&
+ (r1->sub()[0]->op() == kRegexpLiteral ||
+ r1->sub()[0]->op() == kRegexpCharClass ||
+ r1->sub()[0]->op() == kRegexpAnyChar ||
+ r1->sub()[0]->op() == kRegexpAnyByte)) {
+ // r2 must be a star/plus/quest/repeat of the same literal, char class,
+ // any char or any byte.
+ if ((r2->op() == kRegexpStar ||
+ r2->op() == kRegexpPlus ||
+ r2->op() == kRegexpQuest ||
+ r2->op() == kRegexpRepeat) &&
+ Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
+ // The parse flags must be consistent.
+ ((r1->parse_flags() & Regexp::NonGreedy) ==
+ (r2->parse_flags() & Regexp::NonGreedy))) {
+ return true;
+ }
+ // ... OR an occurrence of that literal, char class, any char or any byte
+ if (Regexp::Equal(r1->sub()[0], r2)) {
+ return true;
+ }
+ // ... OR a literal string that begins with that literal.
+ if (r1->sub()[0]->op() == kRegexpLiteral &&
+ r2->op() == kRegexpLiteralString &&
+ r2->runes()[0] == r1->sub()[0]->rune() &&
+ // The parse flags must be consistent.
+ ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
+ (r2->parse_flags() & Regexp::FoldCase))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
+ Regexp* r1 = *r1ptr;
+ Regexp* r2 = *r2ptr;
+
+ Regexp* nre = Regexp::Repeat(
+ r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
+
+ switch (r1->op()) {
+ case kRegexpStar:
+ nre->min_ = 0;
+ nre->max_ = -1;
+ break;
+
+ case kRegexpPlus:
+ nre->min_ = 1;
+ nre->max_ = -1;
+ break;
+
+ case kRegexpQuest:
+ nre->min_ = 0;
+ nre->max_ = 1;
+ break;
+
+ case kRegexpRepeat:
+ nre->min_ = r1->min();
+ nre->max_ = r1->max();
+ break;
+
+ default:
+ LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
+ nre->Decref();
+ return;
+ }
+
+ switch (r2->op()) {
+ case kRegexpStar:
+ nre->max_ = -1;
+ goto LeaveEmpty;
+
+ case kRegexpPlus:
+ nre->min_++;
+ nre->max_ = -1;
+ goto LeaveEmpty;
+
+ case kRegexpQuest:
+ if (nre->max() != -1)
+ nre->max_++;
+ goto LeaveEmpty;
+
+ case kRegexpRepeat:
+ nre->min_ += r2->min();
+ if (r2->max() == -1)
+ nre->max_ = -1;
+ else if (nre->max() != -1)
+ nre->max_ += r2->max();
+ goto LeaveEmpty;
+
+ case kRegexpLiteral:
+ case kRegexpCharClass:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ nre->min_++;
+ if (nre->max() != -1)
+ nre->max_++;
+ goto LeaveEmpty;
+
+ LeaveEmpty:
+ *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
+ *r2ptr = nre;
+ break;
+
+ case kRegexpLiteralString: {
+ Rune r = r1->sub()[0]->rune();
+ // Determine how much of the literal string is removed.
+ // We know that we have at least one rune. :)
+ int n = 1;
+ while (n < r2->nrunes() && r2->runes()[n] == r)
+ n++;
+ nre->min_ += n;
+ if (nre->max() != -1)
+ nre->max_ += n;
+ if (n == r2->nrunes())
+ goto LeaveEmpty;
+ *r1ptr = nre;
+ *r2ptr = Regexp::LiteralString(
+ &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
+ break;
+ }
+
+ default:
+ LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
+ nre->Decref();
+ return;
+ }
+
+ r1->Decref();
+ r2->Decref();
+}
+
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
@@ -196,29 +471,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
- // Two passes to avoid allocation in the common case.
- bool changed = false;
- Regexp** subs = re->sub();
- for (int i = 0; i < re->nsub_; i++) {
- Regexp* sub = subs[i];
- Regexp* newsub = child_args[i];
- if (newsub != sub) {
- changed = true;
- break;
- }
- }
- if (!changed) {
- for (int i = 0; i < re->nsub_; i++) {
- Regexp* newsub = child_args[i];
- newsub->Decref();
- }
+ if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(re->nsub_);
+ nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
- for (int i = 0; i <re->nsub_; i++)
+ for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
@@ -234,7 +494,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
- nre->cap_ = re->cap_;
+ nre->cap_ = re->cap();
nre->simple_ = true;
return nre;
}
diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index 4bdf714..9703039 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc
@@ -120,6 +120,8 @@ static void DumpRegexpAppending(Regexp* re, string* s) {
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCapture:
+ if (re->cap() == 0)
+ LOG(DFATAL) << "kRegexpCapture cap() == 0";
if (re->name()) {
s->append(*re->name());
s->append(":");
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index f6d760a..29a79b1 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -118,8 +118,15 @@ static Test tests[] = {
{ "(?:a)", "lit{a}" },
{ "(?:ab)(?:cd)", "str{abcd}" },
{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
+ { "a|c", "cc{0x61 0x63}" },
+ { "a|[cd]", "cc{0x61 0x63-0x64}" },
{ "a|.", "dot{}" },
- { ".|a", "dot{}" },
+ { "[ab]|c", "cc{0x61-0x63}" },
+ { "[ab]|[cd]", "cc{0x61-0x64}" },
+ { "[ab]|.", "dot{}" },
+ { ".|c", "dot{}" },
+ { ".|[cd]", "dot{}" },
+ { ".|.", "dot{}" },
// Test Perl quoted literals
{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
@@ -299,6 +306,14 @@ Test prefix_tests[] = {
"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
{ "x{2}y|x{2}[0-9]y",
"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
+ { "n|r|rs",
+ "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
+ { "n|rs|r",
+ "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
+ { "r|rs|n",
+ "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
+ { "rs|r|n",
+ "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
};
// Test that prefix factoring works.
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 74058a4..4e267ae 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -71,10 +71,10 @@ TEST(Set, UnanchoredFactored) {
TEST(Set, UnanchoredDollar) {
RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
-
+
CHECK_EQ(s.Add("foo$", NULL), 0);
CHECK_EQ(s.Compile(), true);
-
+
vector<int> v;
CHECK_EQ(s.Match("foo", &v), true);
CHECK_EQ(v.size(), 1);
@@ -107,8 +107,34 @@ TEST(Set, Anchored) {
CHECK_EQ(s.Match("bar", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 1);
+}
+
+TEST(Set, EmptyUnanchored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ CHECK_EQ(s.Compile(), true);
+
+ vector<int> v;
+ CHECK_EQ(s.Match("", &v), false);
+ CHECK_EQ(v.size(), 0);
+
+ v.clear();
+ CHECK_EQ(s.Match("foobar", &v), false);
+ CHECK_EQ(v.size(), 0);
+}
+
+TEST(Set, EmptyAnchored) {
+ RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
+ CHECK_EQ(s.Compile(), true);
+
+ vector<int> v;
+ CHECK_EQ(s.Match("", &v), false);
+ CHECK_EQ(v.size(), 0);
+
+ v.clear();
+ CHECK_EQ(s.Match("foobar", &v), false);
+ CHECK_EQ(v.size(), 0);
}
} // namespace re2
-
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index d54837c..9db41ee 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -136,6 +136,99 @@ static Test tests[] = {
{ "(){1}", "()" },
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },
+
+ // Test that coalescing occurs and that the resulting repeats are simplified.
+ // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
+ { "a*a*", "a*" },
+ { "a*a+", "a+" },
+ { "a*a?", "a*" },
+ { "a*a{2}", "aa+" },
+ { "a*a{2,}", "aa+" },
+ { "a*a{2,3}", "aa+" },
+ { "a+a*", "a+" },
+ { "a+a+", "aa+" },
+ { "a+a?", "a+" },
+ { "a+a{2}", "aaa+" },
+ { "a+a{2,}", "aaa+" },
+ { "a+a{2,3}", "aaa+" },
+ { "a?a*", "a*" },
+ { "a?a+", "a+" },
+ { "a?a?", "(?:aa?)?" },
+ { "a?a{2}", "aaa?" },
+ { "a?a{2,}", "aa+" },
+ { "a?a{2,3}", "aa(?:aa?)?" },
+ { "a{2}a*", "aa+" },
+ { "a{2}a+", "aaa+" },
+ { "a{2}a?", "aaa?" },
+ { "a{2}a{2}", "aaaa" },
+ { "a{2}a{2,}", "aaaa+" },
+ { "a{2}a{2,3}", "aaaaa?" },
+ { "a{2,}a*", "aa+" },
+ { "a{2,}a+", "aaa+" },
+ { "a{2,}a?", "aa+" },
+ { "a{2,}a{2}", "aaaa+" },
+ { "a{2,}a{2,}", "aaaa+" },
+ { "a{2,}a{2,3}", "aaaa+" },
+ { "a{2,3}a*", "aa+" },
+ { "a{2,3}a+", "aaa+" },
+ { "a{2,3}a?", "aa(?:aa?)?" },
+ { "a{2,3}a{2}", "aaaaa?" },
+ { "a{2,3}a{2,}", "aaaa+" },
+ { "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
+ // With a char class, any char and any byte:
+ { "\\d*\\d*", "[0-9]*" },
+ { ".*.*", ".*" },
+ { "\\C*\\C*", "\\C*" },
+ // FoldCase works, but must be consistent:
+ { "(?i)A*a*", "[Aa]*" },
+ { "(?i)a+A+", "[Aa][Aa]+" },
+ { "(?i)A*(?-i)a*", "[Aa]*a*" },
+ { "(?i)a+(?-i)A+", "[Aa]+A+" },
+ // NonGreedy works, but must be consistent:
+ { "a*?a*?", "a*?" },
+ { "a+?a+?", "aa+?" },
+ { "a*?a*", "a*?a*" },
+ { "a+a+?", "a+a+?" },
+ // The second element is the literal, char class, any char or any byte:
+ { "a*a", "a+" },
+ { "\\d*\\d", "[0-9]+" },
+ { ".*.", ".+" },
+ { "\\C*\\C", "\\C+" },
+ // FoldCase works, but must be consistent:
+ { "(?i)A*a", "[Aa]+" },
+ { "(?i)a+A", "[Aa][Aa]+" },
+ { "(?i)A*(?-i)a", "[Aa]*a" },
+ { "(?i)a+(?-i)A", "[Aa]+A" },
+ // The second element is a literal string that begins with the literal:
+ { "a*aa", "aa+" },
+ { "a*aab", "aa+b" },
+ // FoldCase works, but must be consistent:
+ { "(?i)a*aa", "[Aa][Aa]+" },
+ { "(?i)a*aab", "[Aa][Aa]+[Bb]" },
+ { "(?i)a*(?-i)aa", "[Aa]*aa" },
+ { "(?i)a*(?-i)aab", "[Aa]*aab" },
+ // Negative tests with mismatching ops:
+ { "a*b*", "a*b*" },
+ { "\\d*\\D*", "[0-9]*[^0-9]*" },
+ { "a+b", "a+b" },
+ { "\\d+\\D", "[0-9]+[^0-9]" },
+ { "a?bb", "a?bb" },
+ // Negative tests with capturing groups:
+ { "(a*)a*", "(a*)a*" },
+ { "a+(a)", "a+(a)" },
+ { "(a?)(aa)", "(a?)(aa)" },
+ // Just for fun:
+ { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
+
+ // During coalescing, the child of the repeat changes, so we build a new
+ // repeat. The new repeat must have the min and max of the old repeat.
+ // Failure to copy them results in min=0 and max=0 -> empty match.
+ { "(?:a*aab){2}", "aa+baa+b" },
+
+ // During coalescing, the child of the capture changes, so we build a new
+ // capture. The new capture must have the cap of the old capture.
+ // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
+ { "(a*aab)", "(aa+b)" },
};
TEST(TestSimplify, SimpleRegexps) {
diff --git a/re2/tostring.cc b/re2/tostring.cc
index 56ca7df..c59d4d9 100644
--- a/re2/tostring.cc
+++ b/re2/tostring.cc
@@ -94,6 +94,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
case kRegexpCapture:
t_->append("(");
+ if (re->cap() == 0)
+ LOG(DFATAL) << "kRegexpCapture cap() == 0";
if (re->name()) {
t_->append("?P<");
t_->append(*re->name());
@@ -120,13 +122,13 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
static void AppendLiteral(string *t, Rune r, bool foldcase) {
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
t->append(1, '\\');
- t->append(1, r);
+ t->append(1, static_cast<char>(r));
} else if (foldcase && 'a' <= r && r <= 'z') {
if ('a' <= r && r <= 'z')
r += 'A' - 'a';
t->append(1, '[');
- t->append(1, r);
- t->append(1, r + 'a' - 'A');
+ t->append(1, static_cast<char>(r));
+ t->append(1, static_cast<char>(r) + 'a' - 'A');
t->append(1, ']');
} else {
AppendCCRange(t, r, r);
@@ -297,7 +299,7 @@ static void AppendCCChar(string* t, Rune r) {
if (0x20 <= r && r <= 0x7E) {
if (strchr("[]^-\\", r))
t->append("\\");
- t->append(1, r);
+ t->append(1, static_cast<char>(r));
return;
}
switch (r) {
diff --git a/util/benchmark.cc b/util/benchmark.cc
index c3aad7e..8f72f3c 100644
--- a/util/benchmark.cc
+++ b/util/benchmark.cc
@@ -25,10 +25,17 @@ void Benchmark::Register() {
}
static int64 nsec() {
+#if defined(__APPLE__) || defined(_WIN32)
struct timeval tv;
if(gettimeofday(&tv, 0) < 0)
return -1;
return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000;
+#else
+ struct timespec tp;
+ if(clock_gettime(CLOCK_REALTIME, &tp) < 0)
+ return -1;
+ return (int64)tp.tv_sec*1000*1000*1000 + tp.tv_nsec;
+#endif
}
static int64 bytes;
diff --git a/util/mutex.h b/util/mutex.h
index 4a8de4c..9cb6de3 100644
--- a/util/mutex.h
+++ b/util/mutex.h
@@ -12,17 +12,38 @@
#include <stdlib.h>
+#if !defined(_WIN32)
+#include <unistd.h> // For POSIX options
+#endif
+
namespace re2 {
-#define HAVE_PTHREAD 1
-#define HAVE_RWLOCK 1
+#if !defined(_WIN32)
+ // Possible values of POSIX options:
+ // -1 means not supported,
+ // 0 means maybe supported (query at runtime),
+ // >0 means supported.
+# if defined(_POSIX_THREADS) && _POSIX_THREADS > 0
+# define HAVE_PTHREAD 1
+# else
+# define HAVE_PTHREAD 0
+# endif
+# if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
+# define HAVE_RWLOCK 1
+# else
+# define HAVE_RWLOCK 0
+# endif
+#else
+# define HAVE_PTHREAD 0
+# define HAVE_RWLOCK 0
+#endif
#if defined(NO_THREADS)
typedef int MutexType; // to keep a lock-count
-#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+#elif HAVE_PTHREAD && HAVE_RWLOCK
// Needed for pthread_rwlock_*. If it causes problems, you could take it
- // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
- // *does* cause problems for FreeBSD, or MacOSX, but isn't needed
+ // out, but then you'd have to set HAVE_RWLOCK to 0 (at least on linux --
+ // it *does* cause problems for FreeBSD, or MacOSX, but isn't needed
// for locking there.)
# ifdef __linux__
# undef _XOPEN_SOURCE
@@ -30,10 +51,10 @@ namespace re2 {
# endif
# include <pthread.h>
typedef pthread_rwlock_t MutexType;
-#elif defined(HAVE_PTHREAD)
+#elif HAVE_PTHREAD
# include <pthread.h>
typedef pthread_mutex_t MutexType;
-#elif defined(WIN32)
+#elif defined(_WIN32)
# define WIN32_LEAN_AND_MEAN // We only need minimal includes
# ifdef GMUTEX_TRYLOCK
// We need Windows NT or later for TryEnterCriticalSection(). If you
@@ -102,7 +123,7 @@ bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; }
void Mutex::ReaderLock() { assert(++mutex_ > 0); }
void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
-#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+#elif HAVE_PTHREAD && HAVE_RWLOCK
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
@@ -116,7 +137,7 @@ void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
#undef SAFE_PTHREAD
-#elif defined(HAVE_PTHREAD)
+#elif HAVE_PTHREAD
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
@@ -129,7 +150,7 @@ void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
void Mutex::ReaderUnlock() { Unlock(); }
#undef SAFE_PTHREAD
-#elif defined(WIN32)
+#elif defined(_WIN32)
Mutex::Mutex() { InitializeCriticalSection(&mutex_); }
Mutex::~Mutex() { DeleteCriticalSection(&mutex_); }
@@ -186,7 +207,7 @@ class WriterMutexLock {
#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
-#ifdef HAVE_PTHREAD
+#if HAVE_PTHREAD
#define GLOBAL_MUTEX(name) \
static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
diff --git a/util/rune.cc b/util/rune.cc
index 26442b0..e6231ce 100644
--- a/util/rune.cc
+++ b/util/rune.cc
@@ -133,7 +133,7 @@ runetochar(char *str, const Rune *rune)
*/
c = *rune;
if(c <= Rune1) {
- str[0] = c;
+ str[0] = static_cast<char>(c);
return 1;
}
@@ -142,7 +142,7 @@ runetochar(char *str, const Rune *rune)
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
- str[0] = T2 | (c >> 1*Bitx);
+ str[0] = T2 | static_cast<char>(c >> 1*Bitx);
str[1] = Tx | (c & Maskx);
return 2;
}
@@ -161,9 +161,9 @@ runetochar(char *str, const Rune *rune)
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
- str[0] = T3 | (c >> 2*Bitx);
+ str[0] = T3 | static_cast<char>(c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
+ str[2] = Tx | (c & Maskx);
return 3;
}
@@ -171,7 +171,7 @@ runetochar(char *str, const Rune *rune)
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
- str[0] = T4 | (c >> 3*Bitx);
+ str[0] = T4 | static_cast<char>(c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
diff --git a/util/stringprintf.cc b/util/stringprintf.cc
index 79ba7fc..e71d993 100644
--- a/util/stringprintf.cc
+++ b/util/stringprintf.cc
@@ -4,7 +4,7 @@
#include "util/util.h"
-namespace re2 {
+namespace re2 {
static void StringAppendV(string* dst, const char* format, va_list ap) {
// First try with a small fixed size buffer
@@ -38,7 +38,14 @@ static void StringAppendV(string* dst, const char* format, va_list ap) {
// Restore the va_list before we use it again
va_copy(backup_ap, ap);
+#if !defined(_WIN32)
result = vsnprintf(buf, length, format, backup_ap);
+#else
+ // On Windows, the function takes five arguments, not four. With an array,
+ // the buffer size will be inferred, but not with a pointer. C'est la vie.
+ // (See https://github.com/google/re2/issues/40 for more details.)
+ result = vsnprintf(buf, length, _TRUNCATE, format, backup_ap);
+#endif
va_end(backup_ap);
if ((result >= 0) && (result < length)) {
diff --git a/util/test.cc b/util/test.cc
index 220bfb0..85055b2 100644
--- a/util/test.cc
+++ b/util/test.cc
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
#include <stdio.h>
-#ifndef WIN32
+#ifndef _WIN32
#include <sys/resource.h>
#endif
#include "util/test.h"
@@ -25,7 +25,7 @@ void RegisterTest(void (*fn)(void), const char *name) {
namespace re2 {
int64 VirtualProcessSize() {
-#ifdef WIN32
+#ifdef _WIN32
return 0;
#else
struct rusage ru;
diff --git a/util/util.h b/util/util.h
index a7ccafe..f49ebd1 100644
--- a/util/util.h
+++ b/util/util.h
@@ -12,10 +12,10 @@
#include <stddef.h> // For size_t
#include <assert.h>
#include <stdarg.h>
-#include <time.h>
+#include <time.h> // For clock_gettime, CLOCK_REALTIME
#include <ctype.h> // For isdigit, isalpha
-#if !defined(WIN32)
+#if !defined(_WIN32)
#include <sys/time.h> // For gettimeofday
#endif
@@ -53,7 +53,7 @@ using std::tr1::unordered_set;
#else
#include <unordered_set>
-#if defined(WIN32)
+#if defined(_WIN32)
using std::tr1::unordered_set;
#else
using std::unordered_set;
@@ -61,7 +61,7 @@ using std::unordered_set;
#endif
-#ifdef WIN32
+#ifdef _WIN32
#define snprintf _snprintf_s
#define sprintf sprintf_s
@@ -72,7 +72,6 @@ using std::unordered_set;
#define vsnprintf vsnprintf_s
#pragma warning(disable: 4018) // signed/unsigned mismatch
-#pragma warning(disable: 4244) // possible data loss in int conversion
#pragma warning(disable: 4800) // conversion from int to bool
#endif
diff --git a/util/valgrind.cc b/util/valgrind.cc
index ee257d0..82f9a4c 100644
--- a/util/valgrind.cc
+++ b/util/valgrind.cc
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
#include "util/util.h"
-#ifndef WIN32
+#ifndef _WIN32
#include "util/valgrind.h"
#endif