diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2016-11-21 16:55:18 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2016-11-21 16:55:19 +0900 |
commit | 873ba3ec5c36fe51e9aeadaac126a04d965fc053 (patch) | |
tree | 9624dd07f5e55939549e86add68cda90119f8af4 | |
parent | abbaebf6f92b878517d8061fae50a8b9ca84ca52 (diff) | |
download | re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.gz re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.tar.bz2 re2-873ba3ec5c36fe51e9aeadaac126a04d965fc053.zip |
Imported Upstream version 20150801upstream/20150801
Change-Id: If6ba79b7356ae34e4008c43850b4390163058f2f
Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Makefile | 46 | ||||
-rw-r--r-- | README | 3 | ||||
-rwxr-xr-x | benchlog/benchplot.py | 98 | ||||
-rw-r--r-- | libre2.symbols.darwin | 7 | ||||
-rw-r--r-- | re2/compile.cc | 21 | ||||
-rw-r--r-- | re2/dfa.cc | 18 | ||||
-rwxr-xr-x | re2/make_perl_groups.pl | 34 | ||||
-rwxr-xr-x | re2/make_unicode_casefold.py | 3 | ||||
-rw-r--r-- | re2/nfa.cc | 2 | ||||
-rw-r--r-- | re2/parse.cc | 71 | ||||
-rw-r--r-- | re2/prog.cc | 8 | ||||
-rw-r--r-- | re2/prog.h | 4 | ||||
-rw-r--r-- | re2/re2.cc | 48 | ||||
-rw-r--r-- | re2/regexp.cc | 10 | ||||
-rw-r--r-- | re2/regexp.h | 4 | ||||
-rw-r--r-- | re2/simplify.cc | 312 | ||||
-rw-r--r-- | re2/testing/dump.cc | 2 | ||||
-rw-r--r-- | re2/testing/parse_test.cc | 17 | ||||
-rw-r--r-- | re2/testing/set_test.cc | 32 | ||||
-rw-r--r-- | re2/testing/simplify_test.cc | 93 | ||||
-rw-r--r-- | re2/tostring.cc | 10 | ||||
-rw-r--r-- | util/benchmark.cc | 7 | ||||
-rw-r--r-- | util/mutex.h | 43 | ||||
-rw-r--r-- | util/rune.cc | 10 | ||||
-rw-r--r-- | util/stringprintf.cc | 9 | ||||
-rw-r--r-- | util/test.cc | 4 | ||||
-rw-r--r-- | util/util.h | 9 | ||||
-rw-r--r-- | util/valgrind.cc | 2 |
29 files changed, 738 insertions, 190 deletions
@@ -2,3 +2,4 @@ *.orig core obj/ +benchlog.* @@ -2,8 +2,6 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -all: obj/libre2.a obj/so/libre2.so - # to build against PCRE for testing or benchmarking, # uncomment the next two lines # CCPCRE=-I/usr/local/include -DUSEPCRE @@ -38,13 +36,24 @@ SONAME=0 # REBUILD_TABLES=1 ifeq ($(shell uname),Darwin) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin +SOEXT=dylib +SOEXTVER=$(SONAME).$(SOEXT) +SOEXTVER00=$(SONAME).0.0.$(SOEXT) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin else ifeq ($(shell uname),SunOS) -MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),-M,libre2.symbols $(LDFLAGS) +SOEXT=so +SOEXTVER=$(SOEXT).$(SONAME) +SOEXTVER00=$(SOEXT).$(SONAME).0.0 +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(LDFLAGS) else -MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script,libre2.symbols $(LDFLAGS) +SOEXT=so +SOEXTVER=$(SOEXT).$(SONAME) +SOEXTVER00=$(SOEXT).$(SONAME).0.0 +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(LDFLAGS) endif +all: obj/libre2.a obj/so/libre2.$(SOEXT) + INSTALL_HFILES=\ re2/filtered_re2.h\ re2/re2.h\ @@ -177,10 +186,10 @@ obj/dbg/libre2.a: $(DOFILES) @mkdir -p obj/dbg $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) -obj/so/libre2.so: $(SOFILES) +obj/so/libre2.$(SOEXT): $(SOFILES) @mkdir -p obj/so - $(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES) - ln -sf libre2.so.$(SONAME) $@ + $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) + ln -sf libre2.$(SOEXTVER) $@ obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/test @@ -190,7 +199,7 @@ obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/ @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE) -obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o +obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o @mkdir -p obj/so/test $(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE) @@ -204,6 +213,8 @@ re2/perl_groups.cc: re2/make_perl_groups.pl re2/unicode_%.cc: re2/make_unicode_%.py python $< > $@ + +.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc endif distclean: clean @@ -246,13 +257,13 @@ shared-bigtest: $(STESTS) $(SBIGTESTS) benchmark: obj/test/regexp_benchmark -install: obj/libre2.a obj/so/libre2.so +install: obj/libre2.a obj/so/libre2.$(SOEXT) mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a - $(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0 - ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME) - ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so + $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) + ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) + ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc testinstall: @@ -267,7 +278,7 @@ endif benchlog: obj/test/regexp_benchmark (echo '==BENCHMARK==' `hostname` `date`; \ - (uname -a; $(CXX) --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ + (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ echo; \ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') @@ -279,8 +290,9 @@ benchlog: obj/test/regexp_benchmark obj/test/% obj/so/test/% obj/dbg/test/% log: - make clean - make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/exhaustive{,1,2,3}_test + $(MAKE) clean + $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ + $(filter obj/test/exhaustive%_test,$(BIGTESTS)) echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt echo '#' $$(date) >>re2-exhaustive.txt obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt @@ -288,7 +300,7 @@ log: obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt - make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test + $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test echo '#' RE2 basic search tests built by make $@ >re2-search.txt echo '#' $$(date) >>re2-search.txt obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt @@ -25,6 +25,7 @@ under the BSD-style license found in the LICENSE file. RE2's native language is C++. An Erlang wrapper is at https://github.com/tuncer/re2/. An Inferno wrapper is at https://github.com/powerman/inferno-re2/. -A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN. +An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM. +A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN. A Python wrapper is at https://github.com/facebook/pyre2/. A Ruby wrapper is at https://github.com/axic/rre2/. diff --git a/benchlog/benchplot.py b/benchlog/benchplot.py new file mode 100755 index 0000000..104abe8 --- /dev/null +++ b/benchlog/benchplot.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +import argparse # for ArgumentParser +import subprocess # for Popen +import tempfile # for NamedTemporaryFile +import os # for remove + +class gnuplot(object): + + output = "result.png" + + script = """ + set terminal png size 1024, 768 + set output "{}.png" + set title "re2 benchlog" + set datafile separator ";" + set grid x y + set ylabel "MB/s" + set autoscale + plot """ + + template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n""" + + benchdata = dict() + tempfiles = [] + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + """ + remove all temporary files + """ + + for filename in self.tempfiles: + os.remove(filename) + + def parse_re2_benchlog(self, filename): + """ + parse the input benchlog and return a dictionary contain bench data + """ + + benchdata = self.benchdata + + with open(filename) as f: + + for raw in f.readlines(): + + data = raw.split('\t') + + if len(data) == 4: + + data = data[0].split('/') + data[1:] + data = list(map(str.strip, data)) + + if not benchdata.get(data[0]): + benchdata[data[0]] = [ data[1:] ] + else: + benchdata[data[0]].append(data[1:]) + + def gen_csv(self): + """ + generate temporary csv files + """ + + for name, data in self.benchdata.items(): + + with tempfile.NamedTemporaryFile(delete=False) as f: + + for index, line in enumerate(data): + f.write('{};{}\n'.format(index, ';'.join(line)).encode()) + + self.tempfiles.append(f.name) + self.script = self.script + self.template.format(f.name, name) + + def run(self): + self.gen_csv() + script = self.script[:-3].format(self.output) + command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE) + command.communicate(script.encode()) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='generate plots for benchlog') + parser.add_argument('benchlog', type=str, help='benchlog generated by re2') + args = parser.parse_args() + + try: + subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE) + except FileNotFoundError: + print('you can install "gnuplot" to generate plots automatically') + exit(1) + + with gnuplot() as plot: + plot.output = args.benchlog + plot.parse_re2_benchlog(args.benchlog) + plot.run() diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin index bc6f1af..4207f87 100644 --- a/libre2.symbols.darwin +++ b/libre2.symbols.darwin @@ -6,7 +6,12 @@ __ZNK3re23RE2* __ZN3re211StringPiece* __ZNK3re211StringPiece* # operator<<(std::ostream&, re2::StringPiece const&) -__ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE +# Seen with libstdc++ on 10.8 and below: +# __ZlsRSoRKN3re211StringPieceE +# Seen with libc++ on 10.9 and above: +# __ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE +# Note that "ls" means operator<<, so this is not overly broad. +__Zls*RKN3re211StringPieceE # re2::FilteredRE2* __ZN3re211FilteredRE2* __ZNK3re211FilteredRE2* diff --git a/re2/compile.cc b/re2/compile.cc index 6478def..e5d6088 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -435,7 +435,10 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { int j; for (int i = 0; i < 256; i = j) { - for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) + for (j = i + 1; j < 256 && + Prog::IsWordChar(static_cast<uint8>(i)) == + Prog::IsWordChar(static_cast<uint8>(j)); + j++) ; prog_->MarkByteRange(i, j-1); } @@ -503,7 +506,10 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { return UncachedRuneByteSuffix(lo, hi, foldcase, next); } - uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; + uint64 key = (uint64)next << 17 | + (uint64)lo << 9 | + (uint64)hi << 1 | + (uint64)foldcase; map<uint64, int>::iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; @@ -555,7 +561,8 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { return; if (hi > 0xFF) hi = 0xFF; - AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), + foldcase, 0)); } // Table describing how to make a UTF-8 matching machine @@ -596,7 +603,8 @@ void Compiler::Add_80_10ffff() { int next = 0; if (p.next >= 0) next = inst[p.next]; - inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); + inst[i] = UncachedRuneByteSuffix(static_cast<uint8>(p.lo), + static_cast<uint8>(p.hi), false, next); if ((p.lo & 0xC0) != 0x80) AddSuffix(inst[i]); } @@ -625,7 +633,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // ASCII range is always a special case. if (hi < Runeself) { - AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), + foldcase, 0)); return; } @@ -979,7 +988,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_inst_ = m; + max_inst_ = static_cast<int>(m); } anchor_ = anchor; @@ -143,7 +143,7 @@ class DFA { if (sizeof(size_t) == sizeof(uint32)) return Hash32StringWithSeed(s, len, a->flag_); else - return Hash64StringWithSeed(s, len, a->flag_); + return static_cast<size_t>(Hash64StringWithSeed(s, len, a->flag_)); } #ifdef STL_MSVC // Less than operator. @@ -451,7 +451,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) (sizeof(int)+sizeof(int)) * 2; // q0, q1 mem_budget_ -= nastack_ * sizeof(int); // astack if (mem_budget_ < 0) { - LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", + LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld", prog_->size(), max_mem); init_failed_ = true; return; @@ -466,7 +466,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + (prog_->bytemap_range()+1)*sizeof(State*); if (state_budget_ < 20*one_state) { - LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", + LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld", prog_->size(), max_mem); init_failed_ = true; return; @@ -864,8 +864,10 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { break; case kInstEmptyWidth: - if ((ip->empty() & flag) == ip->empty()) - stk[nstk++] = ip->out(); + // Continue on if we have all the right flag bits. + if (ip->empty() & ~flag) + break; + stk[nstk++] = ip->out(); break; } } @@ -1014,7 +1016,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // byte processed was a word character. Use that info to // insert empty-width (non-)word boundaries. bool islastword = state->flag_ & kFlagLastWord; - bool isword = (c != kByteEndText && Prog::IsWordChar(c)); + bool isword = (c != kByteEndText && Prog::IsWordChar(static_cast<uint8>(c))); if (isword == islastword) beforeflag |= kEmptyNonWordBoundary; else @@ -2034,7 +2036,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { if (ns == FullMatchState || (ns > SpecialStateMax && ns->ninst_ > 0)) { extended = true; - min->append(1, j); + min->append(1, static_cast<char>(j)); s = ns; break; } @@ -2064,7 +2066,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { if (ns == FullMatchState || (ns > SpecialStateMax && ns->ninst_ > 0)) { extended = true; - max->append(1, j); + max->append(1, static_cast<char>(j)); s = ns; break; } diff --git a/re2/make_perl_groups.pl b/re2/make_perl_groups.pl index 8c1f4f6..d9fcdaf 100755 --- a/re2/make_perl_groups.pl +++ b/re2/make_perl_groups.pl @@ -32,14 +32,20 @@ "\\w", ); +%overrides = ( + # Prior to Perl 5.18, \s did not match vertical tab. + # RE2 preserves that original behaviour. + "\\s:11" => 0, +); + sub ComputeClass($) { + my ($cname) = @_; my @ranges; - my ($class) = @_; - my $regexp = "[$class]"; + my $regexp = qr/[$cname]/; my $start = -1; for (my $i=0; $i<=129; $i++) { if ($i == 129) { $i = 256; } - if ($i <= 128 && chr($i) =~ $regexp) { + if ($i <= 128 && ($overrides{"$cname:$i"} // chr($i) =~ $regexp)) { if ($start < 0) { $start = $i; } @@ -54,15 +60,15 @@ sub ComputeClass($) { } sub PrintClass($$@) { - my ($cname, $name, @ranges) = @_; - print "static const URange16 code${cname}[] = { /* $name */\n"; + my ($cnum, $cname, @ranges) = @_; + print "static const URange16 code${cnum}[] = { /* $cname */\n"; for (my $i=0; $i<@ranges; $i++) { my @a = @{$ranges[$i]}; printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1]; } print "};\n"; my $n = @ranges; - my $escname = $name; + my $escname = $cname; $escname =~ s/\\/\\\\/g; $negname = $escname; if ($negname =~ /:/) { @@ -70,25 +76,25 @@ sub PrintClass($$@) { } else { $negname =~ y/a-z/A-Z/; } - return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }"; + return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }"; } -my $gen = 0; +my $cnum = 0; sub PrintClasses($@) { - my ($cname, @classes) = @_; + my ($pname, @classes) = @_; my @entries; - foreach my $cl (@classes) { - my @ranges = ComputeClass($cl); - push @entries, PrintClass(++$gen, $cl, @ranges); + foreach my $cname (@classes) { + my @ranges = ComputeClass($cname); + push @entries, PrintClass(++$cnum, $cname, @ranges); } - print "const UGroup ${cname}_groups[] = {\n"; + print "const UGroup ${pname}_groups[] = {\n"; foreach my $e (@entries) { print "\t$e,\n"; } print "};\n"; my $count = @entries; - print "const int num_${cname}_groups = $count;\n"; + print "const int num_${pname}_groups = $count;\n"; } print <<EOF; diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py index 8f90cfe..d215eb1 100755 --- a/re2/make_unicode_casefold.py +++ b/re2/make_unicode_casefold.py @@ -9,7 +9,8 @@ """Generate C++ table for Unicode case folding.""" -import unicode, sys +import sys +import unicode _header = """ // GENERATED BY make_unicode_casefold.py; DO NOT EDIT. @@ -468,7 +468,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (text.begin() > context.begin()) { c = text.begin()[-1] & 0xFF; - wasword = Prog::IsWordChar(c); + wasword = Prog::IsWordChar(static_cast<uint8>(c)); } // Loop over the text, stepping the machine. diff --git a/re2/parse.cc b/re2/parse.cc index e6b27d2..3c15bbd 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -215,7 +215,7 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { // single characters (e.g., [.] instead of \.), and some // analysis does better with fewer character classes. // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. - if (re->op_ == kRegexpCharClass) { + if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { re->ccb_->RemoveAbove(rune_max_); if (re->ccb_->size() == 1) { Rune r = re->ccb_->begin()->lo; @@ -576,13 +576,6 @@ bool Regexp::ParseState::DoLeftParenNoCapture() { return PushRegexp(re); } -// Adds r to cc, along with r's upper case if foldascii is set. -static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { - cc->AddRange(r, r); - if (foldascii && 'a' <= r && r <= 'z') - cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); -} - // Processes a vertical bar in the input. bool Regexp::ParseState::DoVerticalBar() { MaybeConcatString(-1, NoParseFlags); @@ -596,46 +589,34 @@ bool Regexp::ParseState::DoVerticalBar() { Regexp* r1; Regexp* r2; if ((r1 = stacktop_) != NULL && - (r2 = stacktop_->down_) != NULL && + (r2 = r1->down_) != NULL && r2->op() == kVerticalBar) { - // If above and below vertical bar are literal or char class, - // can merge into a single char class. Regexp* r3; - if ((r1->op() == kRegexpLiteral || - r1->op() == kRegexpCharClass || - r1->op() == kRegexpAnyChar) && - (r3 = r2->down_) != NULL) { - Rune rune; - switch (r3->op()) { - case kRegexpLiteral: // convert to char class - rune = r3->rune_; - r3->op_ = kRegexpCharClass; - r3->cc_ = NULL; - r3->ccb_ = new CharClassBuilder; - AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); - // fall through - case kRegexpCharClass: - if (r1->op() == kRegexpLiteral) - AddLiteral(r3->ccb_, r1->rune_, - r1->parse_flags_ & Regexp::FoldCase); - else if (r1->op() == kRegexpCharClass) - r3->ccb_->AddCharClass(r1->ccb_); - if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { - delete r3->ccb_; - r3->ccb_ = NULL; - r3->op_ = kRegexpAnyChar; - } - // fall through - case kRegexpAnyChar: - // pop r1 - stacktop_ = r2; - r1->Decref(); - return true; - default: - break; + if ((r3 = r2->down_) != NULL && + (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) { + // AnyChar is above or below the vertical bar. Let it subsume + // the other when the other is Literal, CharClass or AnyChar. + if (r3->op() == kRegexpAnyChar && + (r1->op() == kRegexpLiteral || + r1->op() == kRegexpCharClass || + r1->op() == kRegexpAnyChar)) { + // Discard r1. + stacktop_ = r2; + r1->Decref(); + return true; + } + if (r1->op() == kRegexpAnyChar && + (r3->op() == kRegexpLiteral || + r3->op() == kRegexpCharClass || + r3->op() == kRegexpAnyChar)) { + // Rearrange the stack and discard r3. + r1->down_ = r3->down_; + r2->down_ = r1; + stacktop_ = r2; + r3->Decref(); + return true; } } - // Swap r1 below vertical bar (r2). r1->down_ = r2->down_; r2->down_ = r1; @@ -1166,7 +1147,7 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { if (r >= 0) { re1->op_ = kRegexpLiteral; re1->rune_ = r; - re1->parse_flags_ = flags; + re1->parse_flags_ = static_cast<uint16>(flags); return true; } diff --git a/re2/prog.cc b/re2/prog.cc index f326ffd..499f560 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -25,7 +25,7 @@ void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { set_out_opcode(out, kInstByteRange); lo_ = lo & 0xFF; hi_ = hi & 0xFF; - foldcase_ = foldcase; + foldcase_ = foldcase & 0xFF; } void Prog::Inst::InitCapture(int cap, uint32 out) { @@ -327,12 +327,12 @@ void Prog::ComputeByteMap() { bytemap_range_ = bytemap_[255] + 1; unbytemap_ = new uint8[bytemap_range_]; for (int i = 0; i < 256; i++) - unbytemap_[bytemap_[i]] = i; + unbytemap_[bytemap_[i]] = static_cast<uint8>(i); if (0) { // For debugging: use trivial byte map. for (int i = 0; i < 256; i++) { - bytemap_[i] = i; - unbytemap_[i] = i; + bytemap_[i] = static_cast<uint8>(i); + unbytemap_[i] = static_cast<uint8>(i); } bytemap_range_ = 256; LOG(INFO) << "Using trivial bytemap."; @@ -201,10 +201,10 @@ class Prog { int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } void set_start_unanchored(int start) { start_unanchored_ = start; } - int64 size() { return size_; } + int size() { return size_; } bool reversed() { return reversed_; } void set_reversed(bool reversed) { reversed_ = reversed; } - int64 byte_inst_count() { return byte_inst_count_; } + int byte_inst_count() { return byte_inst_count_; } const Bitmap<256>& byterange() { return byterange_; } void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } int64 dfa_mem() { return dfa_mem_; } @@ -11,7 +11,6 @@ #include <stdio.h> #include <string> -#include <pthread.h> #include <errno.h> #include "util/atomicops.h" #include "util/util.h" @@ -879,31 +878,30 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite, const StringPiece *vec, int veclen) const { for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { - int c = *s; - if (c == '\\') { - s++; - c = (s < end) ? *s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); - } - return false; + if (*s != '\\') { + out->push_back(*s); + continue; + } + s++; + int c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (options_.log_errors()) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); } - StringPiece snip = vec[n]; - if (snip.size() > 0) - out->append(snip.data(), snip.size()); - } else if (c == '\\') { - out->push_back('\\'); - } else { - if (options_.log_errors()) - LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); return false; } + StringPiece snip = vec[n]; + if (snip.size() > 0) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); } else { - out->push_back(c); + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; } } return true; @@ -1102,7 +1100,7 @@ bool RE2::Arg::parse_short_radix(const char* str, if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<short*>(dest)) = r; + *(reinterpret_cast<short*>(dest)) = (short)r; return true; } @@ -1114,7 +1112,7 @@ bool RE2::Arg::parse_ushort_radix(const char* str, if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse if ((ushort)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<unsigned short*>(dest)) = r; + *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; return true; } @@ -1200,7 +1198,7 @@ static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) if (errno) return false; if (dest == NULL) return true; if (isfloat) { - *(reinterpret_cast<float*>(dest)) = r; + *(reinterpret_cast<float*>(dest)) = (float)r; } else { *(reinterpret_cast<double*>(dest)) = r; } diff --git a/re2/regexp.cc b/re2/regexp.cc index 3667fda..b09b5ca 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -14,7 +14,7 @@ namespace re2 { // Constructor. Allocates vectors as appropriate for operator. Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) - : op_(op), + : op_(static_cast<uint8>(op)), simple_(false), parse_flags_(static_cast<uint16>(parse_flags)), ref_(1), @@ -107,7 +107,7 @@ void Regexp::Decref() { GLOBAL_MUTEX_LOCK(ref_mutex); int r = (*ref_map)[this] - 1; if (r < kMaxRef) { - ref_ = r; + ref_ = static_cast<uint16>(r); ref_map->erase(this); } else { (*ref_map)[this] = r; @@ -651,7 +651,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { if (re->parse_flags() & Latin1) { prefix->resize(re->nrunes_); for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = re->runes_[j]; + (*prefix)[j] = static_cast<char>(re->runes_[j]); } else { // Convert to UTF-8 in place. // Assume worst-case space and then trim. @@ -660,7 +660,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { for (int j = 0; j < re->nrunes_; j++) { Rune r = re->runes_[j]; if (r < Runeself) - *p++ = r; + *p++ = static_cast<char>(r); else p += runetochar(p, &r); } @@ -670,7 +670,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { case kRegexpLiteral: if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, re->rune_); + prefix->append(1, static_cast<char>(re->rune_)); } else { char buf[UTFmax]; prefix->append(buf, runetochar(buf, &re->rune_)); diff --git a/re2/regexp.h b/re2/regexp.h index 60eb993..b49ce0d 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -211,7 +211,8 @@ class RegexpStatus { DISALLOW_COPY_AND_ASSIGN(RegexpStatus); }; -// Walker to implement Simplify. +// Walkers to implement Simplify. +class CoalesceWalker; class SimplifyWalker; // Compiled form; see prog.h @@ -353,6 +354,7 @@ class Regexp { // removed. The result will capture exactly the same // subexpressions the original did, unless formatted with ToString. Regexp* Simplify(); + friend class CoalesceWalker; friend class SimplifyWalker; // Parses the regexp src and then simplifies it and sets *dst to the diff --git a/re2/simplify.cc b/re2/simplify.cc index 9c0021e..d14483f 100644 --- a/re2/simplify.cc +++ b/re2/simplify.cc @@ -97,6 +97,36 @@ bool Regexp::ComputeSimple() { } // Walker subclass used by Simplify. +// Coalesces runs of star/plus/quest/repeat of the same literal along with any +// occurrences of that literal into repeats of that literal. It also works for +// char classes, any char and any byte. +// PostVisit creates the coalesced result, which should then be simplified. +class CoalesceWalker : public Regexp::Walker<Regexp*> { + public: + CoalesceWalker() {} + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, + Regexp** child_args, int nchild_args); + virtual Regexp* Copy(Regexp* re); + virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); + + private: + // These functions are declared inside CoalesceWalker so that + // they can edit the private fields of the Regexps they construct. + + // Returns true if r1 and r2 can be coalesced. In particular, ensures that + // the parse flags are consistent. (They will not be checked again later.) + static bool CanCoalesce(Regexp* r1, Regexp* r2); + + // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards + // will be empty match and the coalesced op. In other cases, where part of a + // literal string was removed to be coalesced, the array elements afterwards + // will be the coalesced op and the remainder of the literal string. + static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); + + DISALLOW_COPY_AND_ASSIGN(CoalesceWalker); +}; + +// Walker subclass used by Simplify. // The simplify walk is purely post-recursive: given the simplified children, // PostVisit creates the simplified result. // The child_args are simplified Regexp*s. @@ -104,9 +134,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> { public: SimplifyWalker() {} virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); - virtual Regexp* PostVisit(Regexp* re, - Regexp* parent_arg, - Regexp* pre_arg, + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args); virtual Regexp* Copy(Regexp* re); virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); @@ -143,14 +171,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> { // Caller must Decref() return value when done with it. Regexp* Regexp::Simplify() { - if (simple_) - return Incref(); - SimplifyWalker w; - return w.Walk(this, NULL); + CoalesceWalker cw; + Regexp* cre = cw.Walk(this, NULL); + if (cre == NULL) + return cre; + SimplifyWalker sw; + Regexp* sre = sw.Walk(cre, NULL); + cre->Decref(); + return sre; } #define Simplify DontCallSimplify // Avoid accidental recursion +// Utility function for PostVisit implementations that compares re->sub() with +// child_args to determine whether any child_args changed. In the common case, +// where nothing changed, calls Decref() for all child_args and returns false, +// so PostVisit must return re->Incref(). Otherwise, returns true. +static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { + for (int i = 0; i < re->nsub(); i++) { + Regexp* sub = re->sub()[i]; + Regexp* newsub = child_args[i]; + if (newsub != sub) + return true; + } + for (int i = 0; i < re->nsub(); i++) { + Regexp* newsub = child_args[i]; + newsub->Decref(); + } + return false; +} + +Regexp* CoalesceWalker::Copy(Regexp* re) { + return re->Incref(); +} + +Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { + // This should never be called, since we use Walk and not + // WalkExponential. + LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; + return re->Incref(); +} + +Regexp* CoalesceWalker::PostVisit(Regexp* re, + Regexp* parent_arg, + Regexp* pre_arg, + Regexp** child_args, + int nchild_args) { + if (re->nsub() == 0) + return re->Incref(); + + if (re->op() != kRegexpConcat) { + if (!ChildArgsChanged(re, child_args)) + return re->Incref(); + + // Something changed. Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub()); + Regexp** nre_subs = nre->sub(); + for (int i = 0; i < re->nsub(); i++) + nre_subs[i] = child_args[i]; + // Repeats and Captures have additional data that must be copied. + if (re->op() == kRegexpRepeat) { + nre->min_ = re->min(); + nre->max_ = re->max(); + } else if (re->op() == kRegexpCapture) { + nre->cap_ = re->cap(); + } + return nre; + } + + bool can_coalesce = false; + for (int i = 0; i < re->nsub(); i++) { + if (i+1 < re->nsub() && + CanCoalesce(child_args[i], child_args[i+1])) { + can_coalesce = true; + break; + } + } + if (!can_coalesce) { + if (!ChildArgsChanged(re, child_args)) + return re->Incref(); + + // Something changed. Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub()); + Regexp** nre_subs = nre->sub(); + for (int i = 0; i < re->nsub(); i++) + nre_subs[i] = child_args[i]; + return nre; + } + + for (int i = 0; i < re->nsub(); i++) { + if (i+1 < re->nsub() && + CanCoalesce(child_args[i], child_args[i+1])) + DoCoalesce(&child_args[i], &child_args[i+1]); + } + // Determine how many empty matches were left by DoCoalesce. + int n = 0; + for (int i = n; i < re->nsub(); i++) { + if (child_args[i]->op() == kRegexpEmptyMatch) + n++; + } + // Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub() - n); + Regexp** nre_subs = nre->sub(); + for (int i = 0, j = 0; i < re->nsub(); i++) { + if (child_args[i]->op() == kRegexpEmptyMatch) { + child_args[i]->Decref(); + continue; + } + nre_subs[j] = child_args[i]; + j++; + } + return nre; +} + +bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { + // r1 must be a star/plus/quest/repeat of a literal, char class, any char or + // any byte. + if ((r1->op() == kRegexpStar || + r1->op() == kRegexpPlus || + r1->op() == kRegexpQuest || + r1->op() == kRegexpRepeat) && + (r1->sub()[0]->op() == kRegexpLiteral || + r1->sub()[0]->op() == kRegexpCharClass || + r1->sub()[0]->op() == kRegexpAnyChar || + r1->sub()[0]->op() == kRegexpAnyByte)) { + // r2 must be a star/plus/quest/repeat of the same literal, char class, + // any char or any byte. + if ((r2->op() == kRegexpStar || + r2->op() == kRegexpPlus || + r2->op() == kRegexpQuest || + r2->op() == kRegexpRepeat) && + Regexp::Equal(r1->sub()[0], r2->sub()[0]) && + // The parse flags must be consistent. + ((r1->parse_flags() & Regexp::NonGreedy) == + (r2->parse_flags() & Regexp::NonGreedy))) { + return true; + } + // ... OR an occurrence of that literal, char class, any char or any byte + if (Regexp::Equal(r1->sub()[0], r2)) { + return true; + } + // ... OR a literal string that begins with that literal. + if (r1->sub()[0]->op() == kRegexpLiteral && + r2->op() == kRegexpLiteralString && + r2->runes()[0] == r1->sub()[0]->rune() && + // The parse flags must be consistent. + ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == + (r2->parse_flags() & Regexp::FoldCase))) { + return true; + } + } + return false; +} + +void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { + Regexp* r1 = *r1ptr; + Regexp* r2 = *r2ptr; + + Regexp* nre = Regexp::Repeat( + r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); + + switch (r1->op()) { + case kRegexpStar: + nre->min_ = 0; + nre->max_ = -1; + break; + + case kRegexpPlus: + nre->min_ = 1; + nre->max_ = -1; + break; + + case kRegexpQuest: + nre->min_ = 0; + nre->max_ = 1; + break; + + case kRegexpRepeat: + nre->min_ = r1->min(); + nre->max_ = r1->max(); + break; + + default: + LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); + nre->Decref(); + return; + } + + switch (r2->op()) { + case kRegexpStar: + nre->max_ = -1; + goto LeaveEmpty; + + case kRegexpPlus: + nre->min_++; + nre->max_ = -1; + goto LeaveEmpty; + + case kRegexpQuest: + if (nre->max() != -1) + nre->max_++; + goto LeaveEmpty; + + case kRegexpRepeat: + nre->min_ += r2->min(); + if (r2->max() == -1) + nre->max_ = -1; + else if (nre->max() != -1) + nre->max_ += r2->max(); + goto LeaveEmpty; + + case kRegexpLiteral: + case kRegexpCharClass: + case kRegexpAnyChar: + case kRegexpAnyByte: + nre->min_++; + if (nre->max() != -1) + nre->max_++; + goto LeaveEmpty; + + LeaveEmpty: + *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); + *r2ptr = nre; + break; + + case kRegexpLiteralString: { + Rune r = r1->sub()[0]->rune(); + // Determine how much of the literal string is removed. + // We know that we have at least one rune. :) + int n = 1; + while (n < r2->nrunes() && r2->runes()[n] == r) + n++; + nre->min_ += n; + if (nre->max() != -1) + nre->max_ += n; + if (n == r2->nrunes()) + goto LeaveEmpty; + *r1ptr = nre; + *r2ptr = Regexp::LiteralString( + &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); + break; + } + + default: + LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); + nre->Decref(); + return; + } + + r1->Decref(); + r2->Decref(); +} + Regexp* SimplifyWalker::Copy(Regexp* re) { return re->Incref(); } @@ -196,29 +471,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re, case kRegexpConcat: case kRegexpAlternate: { // These are simple as long as the subpieces are simple. - // Two passes to avoid allocation in the common case. - bool changed = false; - Regexp** subs = re->sub(); - for (int i = 0; i < re->nsub_; i++) { - Regexp* sub = subs[i]; - Regexp* newsub = child_args[i]; - if (newsub != sub) { - changed = true; - break; - } - } - if (!changed) { - for (int i = 0; i < re->nsub_; i++) { - Regexp* newsub = child_args[i]; - newsub->Decref(); - } + if (!ChildArgsChanged(re, child_args)) { re->simple_ = true; return re->Incref(); } Regexp* nre = new Regexp(re->op(), re->parse_flags()); - nre->AllocSub(re->nsub_); + nre->AllocSub(re->nsub()); Regexp** nre_subs = nre->sub(); - for (int i = 0; i <re->nsub_; i++) + for (int i = 0; i < re->nsub(); i++) nre_subs[i] = child_args[i]; nre->simple_ = true; return nre; @@ -234,7 +494,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re, Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); nre->AllocSub(1); nre->sub()[0] = newsub; - nre->cap_ = re->cap_; + nre->cap_ = re->cap(); nre->simple_ = true; return nre; } diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc index 4bdf714..9703039 100644 --- a/re2/testing/dump.cc +++ b/re2/testing/dump.cc @@ -120,6 +120,8 @@ static void DumpRegexpAppending(Regexp* re, string* s) { DumpRegexpAppending(re->sub()[0], s); break; case kRegexpCapture: + if (re->cap() == 0) + LOG(DFATAL) << "kRegexpCapture cap() == 0"; if (re->name()) { s->append(*re->name()); s->append(":"); diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index f6d760a..29a79b1 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -118,8 +118,15 @@ static Test tests[] = { { "(?:a)", "lit{a}" }, { "(?:ab)(?:cd)", "str{abcd}" }, { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, + { "a|c", "cc{0x61 0x63}" }, + { "a|[cd]", "cc{0x61 0x63-0x64}" }, { "a|.", "dot{}" }, - { ".|a", "dot{}" }, + { "[ab]|c", "cc{0x61-0x63}" }, + { "[ab]|[cd]", "cc{0x61-0x64}" }, + { "[ab]|.", "dot{}" }, + { ".|c", "dot{}" }, + { ".|[cd]", "dot{}" }, + { ".|.", "dot{}" }, // Test Perl quoted literals { "\\Q+|*?{[\\E", "str{+|*?{[}" }, @@ -299,6 +306,14 @@ Test prefix_tests[] = { "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, { "x{2}y|x{2}[0-9]y", "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, + { "n|r|rs", + "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" }, + { "n|rs|r", + "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" }, + { "r|rs|n", + "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" }, + { "rs|r|n", + "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" }, }; // Test that prefix factoring works. diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc index 74058a4..4e267ae 100644 --- a/re2/testing/set_test.cc +++ b/re2/testing/set_test.cc @@ -71,10 +71,10 @@ TEST(Set, UnanchoredFactored) { TEST(Set, UnanchoredDollar) { RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); - + CHECK_EQ(s.Add("foo$", NULL), 0); CHECK_EQ(s.Compile(), true); - + vector<int> v; CHECK_EQ(s.Match("foo", &v), true); CHECK_EQ(v.size(), 1); @@ -107,8 +107,34 @@ TEST(Set, Anchored) { CHECK_EQ(s.Match("bar", &v), true); CHECK_EQ(v.size(), 1); CHECK_EQ(v[0], 1); +} + +TEST(Set, EmptyUnanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Compile(), true); + + vector<int> v; + CHECK_EQ(s.Match("", &v), false); + CHECK_EQ(v.size(), 0); + + v.clear(); + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); +} + +TEST(Set, EmptyAnchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + CHECK_EQ(s.Compile(), true); + + vector<int> v; + CHECK_EQ(s.Match("", &v), false); + CHECK_EQ(v.size(), 0); + + v.clear(); + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); } } // namespace re2 - diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc index d54837c..9db41ee 100644 --- a/re2/testing/simplify_test.cc +++ b/re2/testing/simplify_test.cc @@ -136,6 +136,99 @@ static Test tests[] = { { "(){1}", "()" }, { "(){1,}", "()+" }, { "(){0,2}", "(?:()()?)?" }, + + // Test that coalescing occurs and that the resulting repeats are simplified. + // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: + { "a*a*", "a*" }, + { "a*a+", "a+" }, + { "a*a?", "a*" }, + { "a*a{2}", "aa+" }, + { "a*a{2,}", "aa+" }, + { "a*a{2,3}", "aa+" }, + { "a+a*", "a+" }, + { "a+a+", "aa+" }, + { "a+a?", "a+" }, + { "a+a{2}", "aaa+" }, + { "a+a{2,}", "aaa+" }, + { "a+a{2,3}", "aaa+" }, + { "a?a*", "a*" }, + { "a?a+", "a+" }, + { "a?a?", "(?:aa?)?" }, + { "a?a{2}", "aaa?" }, + { "a?a{2,}", "aa+" }, + { "a?a{2,3}", "aa(?:aa?)?" }, + { "a{2}a*", "aa+" }, + { "a{2}a+", "aaa+" }, + { "a{2}a?", "aaa?" }, + { "a{2}a{2}", "aaaa" }, + { "a{2}a{2,}", "aaaa+" }, + { "a{2}a{2,3}", "aaaaa?" }, + { "a{2,}a*", "aa+" }, + { "a{2,}a+", "aaa+" }, + { "a{2,}a?", "aa+" }, + { "a{2,}a{2}", "aaaa+" }, + { "a{2,}a{2,}", "aaaa+" }, + { "a{2,}a{2,3}", "aaaa+" }, + { "a{2,3}a*", "aa+" }, + { "a{2,3}a+", "aaa+" }, + { "a{2,3}a?", "aa(?:aa?)?" }, + { "a{2,3}a{2}", "aaaaa?" }, + { "a{2,3}a{2,}", "aaaa+" }, + { "a{2,3}a{2,3}", "aaaa(?:aa?)?" }, + // With a char class, any char and any byte: + { "\\d*\\d*", "[0-9]*" }, + { ".*.*", ".*" }, + { "\\C*\\C*", "\\C*" }, + // FoldCase works, but must be consistent: + { "(?i)A*a*", "[Aa]*" }, + { "(?i)a+A+", "[Aa][Aa]+" }, + { "(?i)A*(?-i)a*", "[Aa]*a*" }, + { "(?i)a+(?-i)A+", "[Aa]+A+" }, + // NonGreedy works, but must be consistent: + { "a*?a*?", "a*?" }, + { "a+?a+?", "aa+?" }, + { "a*?a*", "a*?a*" }, + { "a+a+?", "a+a+?" }, + // The second element is the literal, char class, any char or any byte: + { "a*a", "a+" }, + { "\\d*\\d", "[0-9]+" }, + { ".*.", ".+" }, + { "\\C*\\C", "\\C+" }, + // FoldCase works, but must be consistent: + { "(?i)A*a", "[Aa]+" }, + { "(?i)a+A", "[Aa][Aa]+" }, + { "(?i)A*(?-i)a", "[Aa]*a" }, + { "(?i)a+(?-i)A", "[Aa]+A" }, + // The second element is a literal string that begins with the literal: + { "a*aa", "aa+" }, + { "a*aab", "aa+b" }, + // FoldCase works, but must be consistent: + { "(?i)a*aa", "[Aa][Aa]+" }, + { "(?i)a*aab", "[Aa][Aa]+[Bb]" }, + { "(?i)a*(?-i)aa", "[Aa]*aa" }, + { "(?i)a*(?-i)aab", "[Aa]*aab" }, + // Negative tests with mismatching ops: + { "a*b*", "a*b*" }, + { "\\d*\\D*", "[0-9]*[^0-9]*" }, + { "a+b", "a+b" }, + { "\\d+\\D", "[0-9]+[^0-9]" }, + { "a?bb", "a?bb" }, + // Negative tests with capturing groups: + { "(a*)a*", "(a*)a*" }, + { "a+(a)", "a+(a)" }, + { "(a?)(aa)", "(a?)(aa)" }, + // Just for fun: + { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" }, + + // During coalescing, the child of the repeat changes, so we build a new + // repeat. The new repeat must have the min and max of the old repeat. + // Failure to copy them results in min=0 and max=0 -> empty match. + { "(?:a*aab){2}", "aa+baa+b" }, + + // During coalescing, the child of the capture changes, so we build a new + // capture. The new capture must have the cap of the old capture. + // Failure to copy it results in cap=0 -> ToString() logs a fatal error. + { "(a*aab)", "(aa+b)" }, }; TEST(TestSimplify, SimpleRegexps) { diff --git a/re2/tostring.cc b/re2/tostring.cc index 56ca7df..c59d4d9 100644 --- a/re2/tostring.cc +++ b/re2/tostring.cc @@ -94,6 +94,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { case kRegexpCapture: t_->append("("); + if (re->cap() == 0) + LOG(DFATAL) << "kRegexpCapture cap() == 0"; if (re->name()) { t_->append("?P<"); t_->append(*re->name()); @@ -120,13 +122,13 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { static void AppendLiteral(string *t, Rune r, bool foldcase) { if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { t->append(1, '\\'); - t->append(1, r); + t->append(1, static_cast<char>(r)); } else if (foldcase && 'a' <= r && r <= 'z') { if ('a' <= r && r <= 'z') r += 'A' - 'a'; t->append(1, '['); - t->append(1, r); - t->append(1, r + 'a' - 'A'); + t->append(1, static_cast<char>(r)); + t->append(1, static_cast<char>(r) + 'a' - 'A'); t->append(1, ']'); } else { AppendCCRange(t, r, r); @@ -297,7 +299,7 @@ static void AppendCCChar(string* t, Rune r) { if (0x20 <= r && r <= 0x7E) { if (strchr("[]^-\\", r)) t->append("\\"); - t->append(1, r); + t->append(1, static_cast<char>(r)); return; } switch (r) { diff --git a/util/benchmark.cc b/util/benchmark.cc index c3aad7e..8f72f3c 100644 --- a/util/benchmark.cc +++ b/util/benchmark.cc @@ -25,10 +25,17 @@ void Benchmark::Register() { } static int64 nsec() { +#if defined(__APPLE__) || defined(_WIN32) struct timeval tv; if(gettimeofday(&tv, 0) < 0) return -1; return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000; +#else + struct timespec tp; + if(clock_gettime(CLOCK_REALTIME, &tp) < 0) + return -1; + return (int64)tp.tv_sec*1000*1000*1000 + tp.tv_nsec; +#endif } static int64 bytes; diff --git a/util/mutex.h b/util/mutex.h index 4a8de4c..9cb6de3 100644 --- a/util/mutex.h +++ b/util/mutex.h @@ -12,17 +12,38 @@ #include <stdlib.h> +#if !defined(_WIN32) +#include <unistd.h> // For POSIX options +#endif + namespace re2 { -#define HAVE_PTHREAD 1 -#define HAVE_RWLOCK 1 +#if !defined(_WIN32) + // Possible values of POSIX options: + // -1 means not supported, + // 0 means maybe supported (query at runtime), + // >0 means supported. +# if defined(_POSIX_THREADS) && _POSIX_THREADS > 0 +# define HAVE_PTHREAD 1 +# else +# define HAVE_PTHREAD 0 +# endif +# if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0 +# define HAVE_RWLOCK 1 +# else +# define HAVE_RWLOCK 0 +# endif +#else +# define HAVE_PTHREAD 0 +# define HAVE_RWLOCK 0 +#endif #if defined(NO_THREADS) typedef int MutexType; // to keep a lock-count -#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) +#elif HAVE_PTHREAD && HAVE_RWLOCK // Needed for pthread_rwlock_*. If it causes problems, you could take it - // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it - // *does* cause problems for FreeBSD, or MacOSX, but isn't needed + // out, but then you'd have to set HAVE_RWLOCK to 0 (at least on linux -- + // it *does* cause problems for FreeBSD, or MacOSX, but isn't needed // for locking there.) # ifdef __linux__ # undef _XOPEN_SOURCE @@ -30,10 +51,10 @@ namespace re2 { # endif # include <pthread.h> typedef pthread_rwlock_t MutexType; -#elif defined(HAVE_PTHREAD) +#elif HAVE_PTHREAD # include <pthread.h> typedef pthread_mutex_t MutexType; -#elif defined(WIN32) +#elif defined(_WIN32) # define WIN32_LEAN_AND_MEAN // We only need minimal includes # ifdef GMUTEX_TRYLOCK // We need Windows NT or later for TryEnterCriticalSection(). If you @@ -102,7 +123,7 @@ bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; } void Mutex::ReaderLock() { assert(++mutex_ > 0); } void Mutex::ReaderUnlock() { assert(mutex_-- > 0); } -#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) +#elif HAVE_PTHREAD && HAVE_RWLOCK #define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) @@ -116,7 +137,7 @@ void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } #undef SAFE_PTHREAD -#elif defined(HAVE_PTHREAD) +#elif HAVE_PTHREAD #define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) @@ -129,7 +150,7 @@ void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks void Mutex::ReaderUnlock() { Unlock(); } #undef SAFE_PTHREAD -#elif defined(WIN32) +#elif defined(_WIN32) Mutex::Mutex() { InitializeCriticalSection(&mutex_); } Mutex::~Mutex() { DeleteCriticalSection(&mutex_); } @@ -186,7 +207,7 @@ class WriterMutexLock { #define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name) // Provide safe way to declare and use global, linker-initialized mutex. Sigh. -#ifdef HAVE_PTHREAD +#if HAVE_PTHREAD #define GLOBAL_MUTEX(name) \ static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER diff --git a/util/rune.cc b/util/rune.cc index 26442b0..e6231ce 100644 --- a/util/rune.cc +++ b/util/rune.cc @@ -133,7 +133,7 @@ runetochar(char *str, const Rune *rune) */ c = *rune; if(c <= Rune1) { - str[0] = c; + str[0] = static_cast<char>(c); return 1; } @@ -142,7 +142,7 @@ runetochar(char *str, const Rune *rune) * 0080-07FF => T2 Tx */ if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); + str[0] = T2 | static_cast<char>(c >> 1*Bitx); str[1] = Tx | (c & Maskx); return 2; } @@ -161,9 +161,9 @@ runetochar(char *str, const Rune *rune) * 0800-FFFF => T3 Tx Tx */ if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); + str[0] = T3 | static_cast<char>(c >> 2*Bitx); str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); + str[2] = Tx | (c & Maskx); return 3; } @@ -171,7 +171,7 @@ runetochar(char *str, const Rune *rune) * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T4 | (c >> 3*Bitx); + str[0] = T4 | static_cast<char>(c >> 3*Bitx); str[1] = Tx | ((c >> 2*Bitx) & Maskx); str[2] = Tx | ((c >> 1*Bitx) & Maskx); str[3] = Tx | (c & Maskx); diff --git a/util/stringprintf.cc b/util/stringprintf.cc index 79ba7fc..e71d993 100644 --- a/util/stringprintf.cc +++ b/util/stringprintf.cc @@ -4,7 +4,7 @@ #include "util/util.h" -namespace re2 { +namespace re2 { static void StringAppendV(string* dst, const char* format, va_list ap) { // First try with a small fixed size buffer @@ -38,7 +38,14 @@ static void StringAppendV(string* dst, const char* format, va_list ap) { // Restore the va_list before we use it again va_copy(backup_ap, ap); +#if !defined(_WIN32) result = vsnprintf(buf, length, format, backup_ap); +#else + // On Windows, the function takes five arguments, not four. With an array, + // the buffer size will be inferred, but not with a pointer. C'est la vie. + // (See https://github.com/google/re2/issues/40 for more details.) + result = vsnprintf(buf, length, _TRUNCATE, format, backup_ap); +#endif va_end(backup_ap); if ((result >= 0) && (result < length)) { diff --git a/util/test.cc b/util/test.cc index 220bfb0..85055b2 100644 --- a/util/test.cc +++ b/util/test.cc @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. #include <stdio.h> -#ifndef WIN32 +#ifndef _WIN32 #include <sys/resource.h> #endif #include "util/test.h" @@ -25,7 +25,7 @@ void RegisterTest(void (*fn)(void), const char *name) { namespace re2 { int64 VirtualProcessSize() { -#ifdef WIN32 +#ifdef _WIN32 return 0; #else struct rusage ru; diff --git a/util/util.h b/util/util.h index a7ccafe..f49ebd1 100644 --- a/util/util.h +++ b/util/util.h @@ -12,10 +12,10 @@ #include <stddef.h> // For size_t #include <assert.h> #include <stdarg.h> -#include <time.h> +#include <time.h> // For clock_gettime, CLOCK_REALTIME #include <ctype.h> // For isdigit, isalpha -#if !defined(WIN32) +#if !defined(_WIN32) #include <sys/time.h> // For gettimeofday #endif @@ -53,7 +53,7 @@ using std::tr1::unordered_set; #else #include <unordered_set> -#if defined(WIN32) +#if defined(_WIN32) using std::tr1::unordered_set; #else using std::unordered_set; @@ -61,7 +61,7 @@ using std::unordered_set; #endif -#ifdef WIN32 +#ifdef _WIN32 #define snprintf _snprintf_s #define sprintf sprintf_s @@ -72,7 +72,6 @@ using std::unordered_set; #define vsnprintf vsnprintf_s #pragma warning(disable: 4018) // signed/unsigned mismatch -#pragma warning(disable: 4244) // possible data loss in int conversion #pragma warning(disable: 4800) // conversion from int to bool #endif diff --git a/util/valgrind.cc b/util/valgrind.cc index ee257d0..82f9a4c 100644 --- a/util/valgrind.cc +++ b/util/valgrind.cc @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. #include "util/util.h" -#ifndef WIN32 +#ifndef _WIN32 #include "util/valgrind.h" #endif |