diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2020-09-23 15:44:43 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2020-09-23 15:44:43 +0900 |
commit | e8d51c0b86328c541f6716785c897d2be4ead2da (patch) | |
tree | 38e29693556bccbdfd046350e7486dddff5a593b | |
parent | e245f8f640cb5beae4553f063f7cb94a088607ba (diff) | |
download | re2-e8d51c0b86328c541f6716785c897d2be4ead2da.tar.gz re2-e8d51c0b86328c541f6716785c897d2be4ead2da.tar.bz2 re2-e8d51c0b86328c541f6716785c897d2be4ead2da.zip |
Imported Upstream version 20200601upstream/20200601
-rw-r--r-- | CMakeLists.txt | 10 | ||||
-rw-r--r-- | Makefile | 63 | ||||
-rw-r--r-- | re2/bitstate.cc | 13 | ||||
-rw-r--r-- | re2/compile.cc | 22 | ||||
-rw-r--r-- | re2/dfa.cc | 129 | ||||
-rw-r--r-- | re2/nfa.cc | 144 | ||||
-rw-r--r-- | re2/parse.cc | 11 | ||||
-rw-r--r-- | re2/prog.cc | 79 | ||||
-rw-r--r-- | re2/prog.h | 25 | ||||
-rw-r--r-- | re2/re2.cc | 13 | ||||
-rw-r--r-- | re2/regexp.cc | 120 | ||||
-rw-r--r-- | re2/regexp.h | 7 | ||||
-rw-r--r-- | re2/testing/backtrack.cc | 28 | ||||
-rw-r--r-- | re2/testing/compile_test.cc | 2 | ||||
-rw-r--r-- | re2/testing/re2_test.cc | 9 | ||||
-rw-r--r-- | re2/testing/required_prefix_test.cc | 85 | ||||
-rw-r--r-- | re2/walker-inl.h | 26 |
17 files changed, 457 insertions, 329 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c980f0..44a4772 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,11 @@ project(RE2 CXX) include(CTest) include(GNUInstallDirs) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(USEPCRE "use PCRE in tests and benchmarks" OFF) @@ -36,11 +41,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # Without a byte order mark (BOM), Visual Studio assumes that the source # file is encoded using the current user code page, so we specify UTF-8. add_compile_options(/utf-8) -elseif(CYGWIN OR MINGW) - # See https://stackoverflow.com/questions/38139631 for details. - add_compile_options(-std=gnu++11) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - add_compile_options(-std=c++11) endif() if(WIN32) @@ -68,6 +68,7 @@ SOEXTVER00=$(SOEXT).$(SONAME).0.0 MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS) endif +.PHONY: all all: obj/libre2.a obj/so/libre2.$(SOEXT) INSTALL_HFILES=\ @@ -176,40 +177,49 @@ DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES)) DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS)) DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) +.PRECIOUS: obj/%.o obj/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/dbg/%.o obj/dbg/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc +.PRECIOUS: obj/so/%.o obj/so/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/libre2.a obj/libre2.a: $(OFILES) @mkdir -p obj $(AR) $(ARFLAGS) obj/libre2.a $(OFILES) +.PRECIOUS: obj/dbg/libre2.a obj/dbg/libre2.a: $(DOFILES) @mkdir -p obj/dbg $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) +.PRECIOUS: obj/so/libre2.$(SOEXT) obj/so/libre2.$(SOEXT): $(SOFILES) @mkdir -p obj/so $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) ln -sf libre2.$(SOEXTVER) $@ +.PRECIOUS: obj/dbg/test/% obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) +.PRECIOUS: obj/test/% obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) # Test the shared lib, falling back to the static lib for private symbols +.PRECIOUS: obj/so/test/% obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/so/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) @@ -229,64 +239,94 @@ obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) ifdef REBUILD_TABLES +.PRECIOUS: re2/perl_groups.cc re2/perl_groups.cc: re2/make_perl_groups.pl perl $< > $@ +.PRECIOUS: re2/unicode_%.cc re2/unicode_%.cc: re2/make_unicode_%.py python $< > $@ - -.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc endif +.PHONY: distclean distclean: clean rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc +.PHONY: clean clean: rm -rf obj rm -f re2/*.pyc +.PHONY: testofiles testofiles: $(TESTOFILES) +.PHONY: test test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test +.PHONY: debug-test debug-test: $(DTESTS) @./runtests $(DTESTS) +.PHONY: static-test static-test: $(TESTS) @./runtests $(TESTS) +.PHONY: shared-test shared-test: $(STESTS) @./runtests -shared-library-path obj/so $(STESTS) +.PHONY: debug-bigtest debug-bigtest: $(DTESTS) $(DBIGTESTS) @./runtests $(DTESTS) $(DBIGTESTS) +.PHONY: static-bigtest static-bigtest: $(TESTS) $(BIGTESTS) @./runtests $(TESTS) $(BIGTESTS) +.PHONY: shared-bigtest shared-bigtest: $(STESTS) $(SBIGTESTS) @./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS) +.PHONY: benchmark benchmark: obj/test/regexp_benchmark +.PHONY: fuzz fuzz: obj/test/re2_fuzzer -install: obj/libre2.a obj/so/libre2.$(SOEXT) - mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig - $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 +.PHONY: install +install: static-install shared-install + +.PHONY: static +static: obj/libre2.a + +.PHONY: static-install +static-install: obj/libre2.a common-install $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a + +.PHONY: shared +shared: obj/so/libre2.$(SOEXT) + +.PHONY: shared-install +shared-install: obj/so/libre2.$(SOEXT) common-install $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) + +.PHONY: common-install +common-install: + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig + $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc +.PHONY: testinstall testinstall: static-testinstall shared-testinstall @echo @echo Install tests passed. @echo +.PHONY: static-testinstall static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) static-testinstall: @@ -301,6 +341,7 @@ else obj/testinstall endif +.PHONY: shared-testinstall shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) shared-testinstall: @@ -313,19 +354,14 @@ else LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall endif +.PHONY: benchlog benchlog: obj/test/regexp_benchmark (echo '==BENCHMARK==' `hostname` `date`; \ (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ echo; \ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') -# Keep gmake from deleting intermediate files it creates. -# This makes repeated builds faster and preserves debug info on OS X. - -.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \ - obj/dbg/libre2.a obj/so/libre2.a \ - obj/test/% obj/so/test/% obj/dbg/test/% - +.PHONY: log log: $(MAKE) clean $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ @@ -341,6 +377,3 @@ log: echo '#' RE2 basic search tests built by make $@ >re2-search.txt echo '#' $$(date) >>re2-search.txt obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt - -x: x.cc obj/libre2.a - g++ -I. -o x x.cc obj/libre2.a diff --git a/re2/bitstate.cc b/re2/bitstate.cc index 865b58e..320d1ee 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -332,14 +332,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // This looks like it's quadratic in the size of the text, // but we are not clearing visited_ between calls to TrySearch, // so no work is duplicated and it ends up still being linear. - for (const char* p = text.data(); p <= text.data() + text.size(); p++) { - // Try to use memchr to find the first byte quickly. - int fb = prog_->first_byte(); - if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) { - p = reinterpret_cast<const char*>( - memchr(p, fb, text.data() + text.size() - p)); + const char* etext = text.data() + text.size(); + for (const char* p = text.data(); p <= etext; p++) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + if (p < etext && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p)); if (p == NULL) - p = text.data() + text.size(); + p = etext; } cap_[0] = p; diff --git a/re2/compile.cc b/re2/compile.cc index b3e5bdc..30e55f5 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -225,8 +225,8 @@ class Compiler : public Regexp::Walker<Frag> { // Single rune. Frag Literal(Rune r, bool foldcase); - void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor); - Prog* Finish(); + void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); + Prog* Finish(Regexp* re); // Returns .* where dot = any byte Frag DotStar(); @@ -1167,10 +1167,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. - return c.Finish(); + return c.Finish(re); } -Prog* Compiler::Finish() { +Prog* Compiler::Finish(Regexp* re) { if (failed_) return NULL; @@ -1186,7 +1186,17 @@ Prog* Compiler::Finish() { prog_->Optimize(); prog_->Flatten(); prog_->ComputeByteMap(); - prog_->ComputeFirstByte(); + + if (!prog_->reversed()) { + std::string prefix; + bool prefix_foldcase; + if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) && + !prefix_foldcase) { + prog_->prefix_size_ = prefix.size(); + prog_->prefix_front_ = prefix.front(); + prog_->prefix_back_ = prefix.back(); + } + } // Record remaining memory for DFA. if (max_mem_ <= 0) { @@ -1244,7 +1254,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); - Prog* prog = c.Finish(); + Prog* prog = c.Finish(re); if (prog == NULL) return NULL; @@ -167,12 +167,6 @@ class DFA { typedef std::unordered_set<State*, StateHash, StateEqual> StateSet; private: - // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.) - enum { - kFbUnknown = -1, // No analysis has been performed. - kFbNone = -2, // The first-byte trick cannot be used. - }; - enum { // Indices into start_ for unanchored searches. // Add kStartAnchored for anchored searches. @@ -239,25 +233,26 @@ class DFA { struct SearchParams { SearchParams(const StringPiece& text, const StringPiece& context, RWLocker* cache_lock) - : text(text), context(context), + : text(text), + context(context), anchored(false), + can_prefix_accel(false), want_earliest_match(false), run_forward(false), start(NULL), - first_byte(kFbUnknown), cache_lock(cache_lock), failed(false), ep(NULL), - matches(NULL) { } + matches(NULL) {} StringPiece text; StringPiece context; bool anchored; + bool can_prefix_accel; bool want_earliest_match; bool run_forward; State* start; - int first_byte; - RWLocker *cache_lock; + RWLocker* cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match SparseSet* matches; @@ -268,15 +263,13 @@ class DFA { }; // Before each search, the parameters to Search are analyzed by - // AnalyzeSearch to determine the state in which to start and the - // "first_byte" for that state, if any. + // AnalyzeSearch to determine the state in which to start. struct StartInfo { - StartInfo() : start(NULL), first_byte(kFbUnknown) {} - State* start; - std::atomic<int> first_byte; + StartInfo() : start(NULL) {} + std::atomic<State*> start; }; - // Fills in params->start and params->first_byte using + // Fills in params->start and params->can_prefix_accel using // the other search parameters. Returns true on success, // false on failure. // cache_mutex_.r <= L < mutex_ @@ -288,7 +281,7 @@ class DFA { // cache_mutex_.r <= L < mutex_ // Might unlock and relock cache_mutex_ via params->cache_lock. inline bool InlinedSearchLoop(SearchParams* params, - bool have_first_byte, + bool can_prefix_accel, bool want_earliest_match, bool run_forward); @@ -606,7 +599,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: // those are the only operators with any effect in // RunWorkqOnEmptyString or RunWorkqOnByte. - int* inst = new int[q->size()]; + PODArray<int> inst(q->size()); int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch @@ -636,7 +629,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { (it == q->begin() && ip->greedy(prog_))) && (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> FullMatchState\n"); return FullMatchState; @@ -683,7 +675,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // the execution loop can stop early. This is only okay // if the state is *not* a matching state. if (n == 0 && flag == 0) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> DeadState\n"); return DeadState; @@ -693,7 +684,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // unordered state sets separated by Marks. Sort each set // to canonicalize, to reduce the number of distinct sets stored. if (kind_ == Prog::kLongestMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; while (ip < ep) { int* markp = ip; @@ -710,7 +701,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // we have an unordered set of states (i.e. we don't have Marks) // and sorting will reduce the number of distinct sets stored. if (kind_ == Prog::kManyMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; std::sort(ip, ep); } @@ -729,8 +720,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Save the needed empty-width flags in the top bits for use later. flag |= needflags << kFlagNeedShift; - State* state = CachedState(inst, n, flag); - delete[] inst; + State* state = CachedState(inst.data(), n, flag); return state; } @@ -1183,10 +1173,8 @@ void DFA::ResetCache(RWLocker* cache_lock) { }); // Clear the cache, reset the memory budget. - for (int i = 0; i < kMaxStart; i++) { - start_[i].start = NULL; - start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed); - } + for (int i = 0; i < kMaxStart; i++) + start_[i].start.store(NULL, std::memory_order_relaxed); ClearCache(); mem_budget_ = state_budget_; } @@ -1301,8 +1289,7 @@ DFA::State* DFA::StateSaver::Restore() { // situation, the DFA can do better than executing the simple loop. // Instead, it can call memchr to search very quickly for the byte c. // Whether the start state has this property is determined during a -// pre-compilation pass, and if so, the byte b is passed to the search -// loop as the "first_byte" argument, along with a boolean "have_first_byte". +// pre-compilation pass and the "can_prefix_accel" argument is set. // // Fourth, the desired behavior is to search for the leftmost-best match // (approximately, the same one that Perl would find), which is not @@ -1335,7 +1322,7 @@ DFA::State* DFA::StateSaver::Restore() { // making them function arguments lets the inliner specialize // this function to each combination (see two paragraphs above). inline bool DFA::InlinedSearchLoop(SearchParams* params, - bool have_first_byte, + bool can_prefix_accel, bool want_earliest_match, bool run_forward) { State* start = params->start; @@ -1380,12 +1367,12 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (ExtraDebug) fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); - if (run_forward && have_first_byte && s == start) { - // In start state, only way out is to find first_byte, - // so use optimized assembly in memchr to skip ahead. - // If first_byte isn't found, we can skip to the end - // of the string. - if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { + if (can_prefix_accel && s == start) { + // In start state, only way out is to find the prefix, + // so we use prefix accel (e.g. memchr) to skip ahead. + // If not found, we can skip to the end of the string. + p = BytePtr(prog_->PrefixAccel(p, ep - p)); + if (p == NULL) { p = ep; break; } @@ -1589,7 +1576,7 @@ bool DFA::SearchTTT(SearchParams* params) { // For debugging, calls the general code directly. bool DFA::SlowSearchLoop(SearchParams* params) { return InlinedSearchLoop(params, - params->first_byte >= 0, + params->can_prefix_accel, params->want_earliest_match, params->run_forward); } @@ -1610,8 +1597,7 @@ bool DFA::FastSearchLoop(SearchParams* params) { &DFA::SearchTTT, }; - bool have_first_byte = params->first_byte >= 0; - int index = 4 * have_first_byte + + int index = 4 * params->can_prefix_accel + 2 * params->want_earliest_match + 1 * params->run_forward; return (this->*Searches[index])(params); @@ -1703,13 +1689,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } + params->start = info->start.load(std::memory_order_acquire); + + // Even if we could prefix accel, we cannot do so when anchored and, + // less obviously, we cannot do so when we are going to need flags. + // This trick works only when there is a single byte that leads to a + // different state! + if (prog_->can_prefix_accel() && + !params->anchored && + params->start > SpecialStateMax && + params->start->flag_ >> kFlagNeedShift == 0) + params->can_prefix_accel = true; + if (ExtraDebug) - fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n", + fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", params->anchored, params->run_forward, flags, - DumpState(info->start).c_str(), info->first_byte.load()); - - params->start = info->start; - params->first_byte = info->first_byte.load(std::memory_order_acquire); + DumpState(params->start).c_str(), params->can_prefix_accel); return true; } @@ -1718,47 +1713,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) { bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags) { // Quick check. - int fb = info->first_byte.load(std::memory_order_acquire); - if (fb != kFbUnknown) + State* start = info->start.load(std::memory_order_acquire); + if (start != NULL) return true; MutexLock l(&mutex_); - fb = info->first_byte.load(std::memory_order_relaxed); - if (fb != kFbUnknown) + start = info->start.load(std::memory_order_relaxed); + if (start != NULL) return true; q0_->clear(); AddToQueue(q0_, params->anchored ? prog_->start() : prog_->start_unanchored(), flags); - info->start = WorkqToCachedState(q0_, NULL, flags); - if (info->start == NULL) + start = WorkqToCachedState(q0_, NULL, flags); + if (start == NULL) return false; - if (info->start == DeadState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); - return true; - } - - if (info->start == FullMatchState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored - return true; - } - - // Even if we have a first_byte, we cannot use it when anchored and, - // less obviously, we cannot use it when we are going to need flags. - // This trick works only when there is a single byte that leads to a - // different state! - int first_byte = prog_->first_byte(); - if (first_byte == -1 || - params->anchored || - info->start->flag_ >> kFlagNeedShift != 0) - first_byte = kFbNone; - // Synchronize with "quick check" above. - info->first_byte.store(first_byte, std::memory_order_release); + info->start.store(start, std::memory_order_release); return true; } @@ -1866,13 +1839,13 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, StringPiece context = const_context; if (context.data() == NULL) context = text; - bool carat = anchor_start(); + bool caret = anchor_start(); bool dollar = anchor_end(); if (reversed_) { using std::swap; - swap(carat, dollar); + swap(caret, dollar); } - if (carat && context.begin() != text.begin()) + if (caret && context.begin() != text.begin()) return false; if (dollar && context.end() != text.end()) return false; @@ -27,6 +27,7 @@ #include <stdio.h> #include <string.h> #include <algorithm> +#include <deque> #include <string> #include <utility> #include <vector> @@ -107,18 +108,21 @@ class NFA { // Returns text version of capture information, for debugging. std::string FormatCapture(const char** capture); - inline void CopyCapture(const char** dst, const char** src); + void CopyCapture(const char** dst, const char** src) { + memmove(dst, src, ncapture_*sizeof src[0]); + } Prog* prog_; // underlying program int start_; // start instruction in program int ncapture_; // number of submatches to track bool longest_; // whether searching for longest match bool endmatch_; // whether match must end at text.end() - const char* btext_; // beginning of text being matched (for FormatSubmatch) - const char* etext_; // end of text being matched (for endmatch_) + const char* btext_; // beginning of text (for FormatSubmatch) + const char* etext_; // end of text (for endmatch_) Threadq q0_, q1_; // pre-allocated for Search. PODArray<AddState> stack_; // pre-allocated for AddToThreadq - Thread* free_threads_; // free list + std::deque<Thread> arena_; // thread arena + Thread* freelist_; // thread freelist const char** match_; // best match so far bool matched_; // any match so far? @@ -141,31 +145,30 @@ NFA::NFA(Prog* prog) { prog_->inst_count(kInstEmptyWidth) + prog_->inst_count(kInstNop) + 1; // + 1 for start inst stack_ = PODArray<AddState>(nstack); - free_threads_ = NULL; + freelist_ = NULL; match_ = NULL; matched_ = false; } NFA::~NFA() { delete[] match_; - Thread* next; - for (Thread* t = free_threads_; t; t = next) { - next = t->next; - delete[] t->capture; - delete t; - } + for (const Thread& t : arena_) + delete[] t.capture; } NFA::Thread* NFA::AllocThread() { - Thread* t = free_threads_; - if (t == NULL) { - t = new Thread; + Thread* t = freelist_; + if (t != NULL) { + freelist_ = t->next; t->ref = 1; - t->capture = new const char*[ncapture_]; + // We don't need to touch t->capture because + // the caller will immediately overwrite it. return t; } - free_threads_ = t->next; + arena_.emplace_back(); + t = &arena_.back(); t->ref = 1; + t->capture = new const char*[ncapture_]; return t; } @@ -176,21 +179,13 @@ NFA::Thread* NFA::Incref(Thread* t) { } void NFA::Decref(Thread* t) { - if (t == NULL) - return; + DCHECK(t != NULL); t->ref--; if (t->ref > 0) return; DCHECK_EQ(t->ref, 0); - t->next = free_threads_; - free_threads_ = t; -} - -void NFA::CopyCapture(const char** dst, const char** src) { - for (int i = 0; i < ncapture_; i+=2) { - dst[i] = src[i]; - dst[i+1] = src[i+1]; - } + t->next = freelist_; + freelist_ = t; } // Follows all empty arrows from id0 and enqueues all the states reached. @@ -372,8 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, matched_ = true; Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); if (ip->greedy(prog_)) return ip->out1(); @@ -416,8 +413,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, // worse than the one we just found: don't run the // rest of the current Threadq. Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); return 0; } @@ -489,6 +488,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } match_ = new const char*[ncapture_]; + memset(match_, 0, ncapture_*sizeof match_[0]); matched_ = false; // For debugging prints. @@ -505,7 +505,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, Threadq* nextq = &q1_; runq->clear(); nextq->clear(); - memset(&match_[0], 0, ncapture_*sizeof match_[0]); // Loop over the text, stepping the machine. for (const char* p = text.data();; p++) { @@ -572,16 +571,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, // matches, since it would be to the right of the match // we already found.) if (!matched_ && (!anchored || p == text.data())) { - // If there's a required first byte for an unanchored search - // and we're not in the middle of any possible matches, - // use memchr to search for the byte quickly. - int fb = prog_->first_byte(); + // Try to use prefix accel (e.g. memchr) to skip ahead. + // The search must be unanchored and there must be zero + // possible matches already. if (!anchored && runq->size() == 0 && - fb >= 0 && p < etext_ && (p[0] & 0xFF) != fb) { - p = reinterpret_cast<const char*>(memchr(p, fb, etext_ - p)); - if (p == NULL) { + p < etext_ && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); + if (p == NULL) p = etext_; - } } Thread* t = AllocThread(); @@ -612,8 +609,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } } - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) - Decref(i->value()); + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } if (matched_) { for (int i = 0; i < nsubmatch; i++) @@ -629,67 +628,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, return false; } -void Prog::ComputeFirstByte() { - SparseSet q(size()); - q.insert(start()); - for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { - int id = *it; - Prog::Inst* ip = inst(id); - switch (ip->opcode()) { - default: - LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; - break; - - case kInstMatch: - // The empty string matches: no first byte. - first_byte_ = -1; - return; - - case kInstByteRange: - if (!ip->last()) - q.insert(id+1); - - // Must match only a single byte. - if (ip->lo() != ip->hi() || - (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')) { - first_byte_ = -1; - return; - } - // If we haven't seen any bytes yet, record it; - // otherwise must match the one we saw before. - if (first_byte_ == -1) { - first_byte_ = ip->lo(); - } else if (first_byte_ != ip->lo()) { - first_byte_ = -1; - return; - } - break; - - case kInstNop: - case kInstCapture: - case kInstEmptyWidth: - if (!ip->last()) - q.insert(id+1); - - // Continue on. - // Ignore ip->empty() flags for kInstEmptyWidth - // in order to be as conservative as possible - // (assume all possible empty-width flags are true). - if (ip->out()) - q.insert(ip->out()); - break; - - case kInstAltMatch: - DCHECK(!ip->last()); - q.insert(id+1); - break; - - case kInstFail: - break; - } - } -} - bool Prog::SearchNFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, diff --git a/re2/parse.cc b/re2/parse.cc index 0a5ff74..e741603 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -93,7 +93,7 @@ class Regexp::ParseState { bool PushSimpleOp(RegexpOp op); // Pushes a ^ onto the stack. - bool PushCarat(); + bool PushCaret(); // Pushes a \b (word == true) or \B (word == false) onto the stack. bool PushWordBoundary(bool word); @@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } // Pushes a ^ onto the stack. -bool Regexp::ParseState::PushCarat() { +bool Regexp::ParseState::PushCaret() { if (flags_ & OneLine) { return PushSimpleOp(kRegexpBeginText); } @@ -1802,14 +1802,13 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Convert the UnicodeSet to a URange32 and UGroup that we can add. int nr = uset.getRangeCount(); - URange32* r = new URange32[nr]; + PODArray<URange32> r(nr); for (int i = 0; i < nr; i++) { r[i].lo = uset.getRangeStart(i); r[i].hi = uset.getRangeEnd(i); } - UGroup g = {"", +1, 0, 0, r, nr}; + UGroup g = {"", +1, 0, 0, r.data(), nr}; AddUGroup(cc, &g, sign, parse_flags); - delete[] r; #endif return kParseOk; @@ -2272,7 +2271,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, break; case '^': // Beginning of line. - if (!ps.PushCarat()) + if (!ps.PushCaret()) return NULL; t.remove_prefix(1); // '^' break; diff --git a/re2/prog.cc b/re2/prog.cc index 9244319..ac9c085 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -7,6 +7,12 @@ #include "re2/prog.h" +#if defined(__AVX2__) +#include <immintrin.h> +#ifdef _MSC_VER +#include <intrin.h> +#endif +#endif #include <stdint.h> #include <string.h> #include <algorithm> @@ -109,7 +115,9 @@ Prog::Prog() start_unanchored_(0), size_(0), bytemap_range_(0), - first_byte_(-1), + prefix_size_(0), + prefix_front_(-1), + prefix_back_(-1), list_count_(0), dfa_mem_(0), dfa_first_(NULL), @@ -908,4 +916,73 @@ void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { } } +#if defined(__AVX2__) +// Finds the least significant non-zero bit in n. +static int FindLSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return __builtin_ctz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanForward(&c, n); + return static_cast<int>(c); +#else + int c = 31; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n << shift; + if (word != 0) { + n = word; + c -= shift; + } + } + return c; +#endif +} +#endif + +const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 2); + if (size < prefix_size_) + return NULL; + // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. + // This also means that probing for prefix_back_ doesn't go out of bounds. + size -= prefix_size_-1; + +#if defined(__AVX2__) + // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time. + if (size >= sizeof(__m256i)) { + const __m256i* fp = reinterpret_cast<const __m256i*>( + reinterpret_cast<const char*>(data)); + const __m256i* bp = reinterpret_cast<const __m256i*>( + reinterpret_cast<const char*>(data) + prefix_size_-1); + const __m256i* endfp = fp + size/sizeof(__m256i); + const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); + const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); + while (fp != endfp) { + const __m256i f_loadu = _mm256_loadu_si256(fp++); + const __m256i b_loadu = _mm256_loadu_si256(bp++); + const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); + const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu); + const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq); + if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero. + const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq); + const int fb_movemask = _mm256_movemask_epi8(fb_and); + const int fb_ctz = FindLSBSet(fb_movemask); + return reinterpret_cast<const char*>(fp-1) + fb_ctz; + } + } + data = fp; + size = size%sizeof(__m256i); + } +#endif + + const char* p0 = reinterpret_cast<const char*>(data); + for (const char* p = p0;; p++) { + DCHECK_GE(size, static_cast<size_t>(p-p0)); + p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0))); + if (p == NULL || p[prefix_size_-1] == prefix_back_) + return p; + } +} + } // namespace re2 @@ -198,8 +198,8 @@ class Prog { Inst *inst(int id) { return &inst_[id]; } int start() { return start_; } - int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } + int start_unanchored() { return start_unanchored_; } void set_start_unanchored(int start) { start_unanchored_ = start; } int size() { return size_; } bool reversed() { return reversed_; } @@ -207,15 +207,27 @@ class Prog { int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } uint16_t* list_heads() { return list_heads_.data(); } - void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } int64_t dfa_mem() { return dfa_mem_; } + void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } bool anchor_start() { return anchor_start_; } void set_anchor_start(bool b) { anchor_start_ = b; } bool anchor_end() { return anchor_end_; } void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } - int first_byte() { return first_byte_; } + bool can_prefix_accel() { return prefix_size_ != 0; } + + // Accelerates to the first likely occurrence of the prefix. + // Returns a pointer to the first byte or NULL if not found. + const void* PrefixAccel(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 1); + return prefix_size_ == 1 ? memchr(data, prefix_front_, size) + : PrefixAccel_FrontAndBack(data, size); + } + + // An implementation of prefix accel that looks for prefix_front_ and + // prefix_back_ to return fewer false positives than memchr(3) alone. + const void* PrefixAccel_FrontAndBack(const void* data, size_t size); // Returns string representation of program for debugging. std::string Dump(); @@ -293,9 +305,6 @@ class Prog { // Compute bytemap. void ComputeByteMap(); - // Computes whether all matches must begin with the same first byte. - void ComputeFirstByte(); - // Run peep-hole optimizer on program. void Optimize(); @@ -397,7 +406,9 @@ class Prog { int start_unanchored_; // unanchored entry point for program int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ - int first_byte_; // required first byte for match, or -1 if none + size_t prefix_size_; // size of prefix (0 if no prefix) + int prefix_front_; // first byte of prefix (-1 if no prefix) + int prefix_back_; // last byte of prefix (-1 if no prefix) int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode @@ -293,7 +293,7 @@ static int FindMSBSet(uint32_t n) { DCHECK_NE(n, 0); #if defined(__GNUC__) return 31 ^ __builtin_clz(n); -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) unsigned long c; _BitScanReverse(&c, n); return static_cast<int>(c); @@ -405,6 +405,8 @@ bool RE2::Replace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) @@ -425,6 +427,8 @@ int RE2::GlobalReplace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; @@ -497,9 +501,10 @@ bool RE2::Extract(const StringPiece& text, std::string* out) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; - if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) return false; @@ -1012,8 +1017,8 @@ bool RE2::Rewrite(std::string* out, int n = (c - '0'); if (n >= veclen) { if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); + LOG(ERROR) << "invalid substitution \\" << n + << " from " << veclen << " groups"; } return false; } diff --git a/re2/regexp.cc b/re2/regexp.cc index 5732166..4364d0a 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -20,6 +20,7 @@ #include "util/logging.h" #include "util/mutex.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -243,16 +244,15 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, return new Regexp(kRegexpEmptyMatch, flags); } - Regexp** subcopy = NULL; + PODArray<Regexp*> subcopy; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. - subcopy = new Regexp*[nsub]; - memmove(subcopy, sub, nsub * sizeof sub[0]); - sub = subcopy; + subcopy = PODArray<Regexp*>(nsub); + memmove(subcopy.data(), sub, nsub * sizeof sub[0]); + sub = subcopy.data(); nsub = FactorAlternation(sub, nsub, flags); if (nsub == 1) { Regexp* re = sub[0]; - delete[] subcopy; return re; } } @@ -269,7 +269,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags, false); - delete[] subcopy; return re; } @@ -278,8 +277,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, Regexp** subs = re->sub(); for (int i = 0; i < nsub; i++) subs[i] = sub[i]; - - delete[] subcopy; return re; } @@ -658,78 +655,83 @@ std::map<int, std::string>* Regexp::CaptureNames() { return w.TakeMap(); } +void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, + std::string* bytes) { + if (latin1) { + bytes->resize(nrunes); + for (int i = 0; i < nrunes; i++) + (*bytes)[i] = static_cast<char>(runes[i]); + } else { + bytes->resize(nrunes * UTFmax); // worst case + char* p = &(*bytes)[0]; + for (int i = 0; i < nrunes; i++) + p += runetochar(p, &runes[i]); + bytes->resize(p - &(*bytes)[0]); + bytes->shrink_to_fit(); + } +} + // Determines whether regexp matches must be anchored // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix) { + prefix->clear(); + *foldcase = false; + *suffix = NULL; + // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. a literal char or string // 3. the rest - prefix->clear(); - *foldcase = false; - *suffix = NULL; if (op_ != kRegexpConcat) return false; - - // Some number of anchors, then a literal or concatenation. int i = 0; - Regexp** sub = this->sub(); - while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) + while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) i++; if (i == 0 || i >= nsub_) return false; - - Regexp* re = sub[i]; - switch (re->op_) { - default: - return false; - - case kRegexpLiteralString: - // Convert to string in proper encoding. - if (re->parse_flags() & Latin1) { - prefix->resize(re->nrunes_); - for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = static_cast<char>(re->runes_[j]); - } else { - // Convert to UTF-8 in place. - // Assume worst-case space and then trim. - prefix->resize(re->nrunes_ * UTFmax); - char *p = &(*prefix)[0]; - for (int j = 0; j < re->nrunes_; j++) { - Rune r = re->runes_[j]; - if (r < Runeself) - *p++ = static_cast<char>(r); - else - p += runetochar(p, &r); - } - prefix->resize(p - &(*prefix)[0]); - } - break; - - case kRegexpLiteral: - if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, static_cast<char>(re->rune_)); - } else { - char buf[UTFmax]; - prefix->append(buf, runetochar(buf, &re->rune_)); - } - break; - } - *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; + Regexp* re = sub()[i]; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; i++; - - // The rest. if (i < nsub_) { for (int j = i; j < nsub_; j++) - sub[j]->Incref(); - re = Concat(sub + i, nsub_ - i, parse_flags()); + sub()[j]->Incref(); + *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); } else { - re = new Regexp(kRegexpEmptyMatch, parse_flags()); + *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); } - *suffix = re; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; + return true; +} + +// Determines whether regexp matches must be unanchored +// with a fixed string prefix. If so, returns the prefix. +// The prefix might be ASCII case-insensitive. +bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { + prefix->clear(); + *foldcase = false; + + // No need for a walker: the regexp must either begin with or be + // a literal char or string. + Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; return true; } diff --git a/re2/regexp.h b/re2/regexp.h index a5d85c8..5284ab5 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -440,6 +440,13 @@ class Regexp { bool RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix); + // Whether every match of this regexp must be unanchored and + // begin with a non-empty fixed string (perhaps after ASCII + // case-folding). If so, returns the prefix. + // Callers should expect *prefix and *foldcase to be "zeroed" + // regardless of the return value. + bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); + private: // Constructor allocates vectors as appropriate for operator. explicit Regexp(RegexpOp op, ParseFlags parse_flags); diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index 3bab1e7..216d259 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -29,6 +29,7 @@ #include "util/util.h" #include "util/logging.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -53,7 +54,6 @@ namespace re2 { class Backtracker { public: explicit Backtracker(Prog* prog); - ~Backtracker(); bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool longest, @@ -79,9 +79,8 @@ class Backtracker { int nsubmatch_; // # of submatches to fill in // Search state - const char* cap_[64]; // capture registers - uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked - size_t nvisited_; // # of words in bitmap + const char* cap_[64]; // capture registers + PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited Backtracker(const Backtracker&) = delete; Backtracker& operator=(const Backtracker&) = delete; @@ -93,13 +92,7 @@ Backtracker::Backtracker(Prog* prog) longest_(false), endmatch_(false), submatch_(NULL), - nsubmatch_(0), - visited_(NULL), - nvisited_(0) { -} - -Backtracker::~Backtracker() { - delete[] visited_; + nsubmatch_(0) { } // Runs a backtracking search. @@ -133,10 +126,10 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, // Allocate new visited_ bitmap -- size is proportional // to text, so have to reallocate on each call to Search. - delete[] visited_; - nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; - visited_ = new uint32_t[nvisited_]; - memset(visited_, 0, nvisited_*sizeof visited_[0]); + int nvisited = prog_->size() * static_cast<int>(text.size()+1); + nvisited = (nvisited + 31) / 32; + visited_ = PODArray<uint32_t>(nvisited); + memset(visited_.data(), 0, nvisited*sizeof visited_[0]); // Anchored search must start at text.begin(). if (anchored_) { @@ -166,8 +159,9 @@ bool Backtracker::Visit(int id, const char* p) { // either it didn't match or it did but we're hoping for a better match. // Either way, don't go down that road again. CHECK(p <= text_.data() + text_.size()); - size_t n = id*(text_.size()+1) + (p - text_.data()); - CHECK_LT(n/32, nvisited_); + int n = id * static_cast<int>(text_.size()+1) + + static_cast<int>(p-text_.data()); + CHECK_LT(n/32, visited_.size()); if (visited_[n/32] & (1 << (n&31))) return false; visited_[n/32] |= 1 << (n&31); diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index 4598aa6..2096e2f 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -236,7 +236,7 @@ TEST(TestCompile, InsufficientMemory) { "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$", Regexp::LikePerl, NULL); EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(920); + Prog* prog = re->CompileToProg(850); // If the memory budget has been exhausted, compilation should fail // and return NULL instead of trying to do anything with NoMatch(). EXPECT_TRUE(prog == NULL); diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 52e1f6f..ec457fb 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -224,6 +224,15 @@ TEST(RE2, Extract) { ASSERT_EQ(s, "'foo'"); } +TEST(RE2, MaxSubmatchTooLarge) { + std::string s; + ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); + s = "foo"; + ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); + s = "foo"; + ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); +} + TEST(RE2, Consume) { RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace std::string word; diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc index 5460045..c00e812 100644 --- a/re2/testing/required_prefix_test.cc +++ b/re2/testing/required_prefix_test.cc @@ -6,6 +6,7 @@ #include "util/test.h" #include "util/logging.h" +#include "re2/prog.h" #include "re2/regexp.h" namespace re2 { @@ -19,10 +20,13 @@ struct PrefixTest { }; static PrefixTest tests[] = { - // If the regexp is missing a ^, there's no required prefix. - { "abc", false }, + // Empty cases. { "", false }, { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has no ^, there's no required prefix. + { "abc", false }, // If the regexp immediately goes into // something not a literal match, there's no required prefix. @@ -53,15 +57,15 @@ TEST(RequiredPrefix, SimpleTests) { bool f; Regexp* s; ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf") + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") << " " << re->Dump(); if (t.return_value) { ASSERT_EQ(p, std::string(t.prefix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(f, t.foldcase) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(s->ToString(), std::string(t.suffix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); s->Decref(); } re->Decref(); @@ -69,4 +73,73 @@ TEST(RequiredPrefix, SimpleTests) { } } +static PrefixTest for_accel_tests[] = { + // Empty cases. + { "", false }, + { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has a ^, there's no required prefix. + { "^abc", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "(abc)", false }, + { "a*", false }, + + // Otherwise, it should work. + { "abc$", true, "abc", false, }, + { "abc", true, "abc", false, }, + { "(?i)abc", true, "abc", true, }, + { "abcd*", true, "abc", false, }, + { "[Aa][Bb]cd*", true, "ab", true, }, + { "ab[Cc]d*", true, "ab", false, }, + { "☺abc", true, "☺abc", false, }, +}; + +TEST(RequiredPrefixForAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(for_accel_tests); i++) { + const PrefixTest& t = for_accel_tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + + std::string p; + bool f; + ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") + << " " << re->Dump(); + if (t.return_value) { + ASSERT_EQ(p, std::string(t.prefix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + } + re->Decref(); + } + } +} + +TEST(PrefixAccel, BasicTest) { + Regexp* re = Regexp::Parse("abc\\d+", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + for (int i = 0; i < 100; i++) { + std::string text(i, 'a'); + const char* p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_TRUE(p == NULL); + text.append("abc"); + p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_EQ(i, p-text.data()); + } + delete prog; + re->Decref(); +} + } // namespace re2 diff --git a/re2/walker-inl.h b/re2/walker-inl.h index 310be54..8cfe799 100644 --- a/re2/walker-inl.h +++ b/re2/walker-inl.h @@ -89,7 +89,7 @@ template<typename T> class Regexp::Walker { private: // Walk state for the entire traversal. - std::stack<WalkState<T> >* stack_; + std::stack<WalkState<T>> stack_; bool stopped_early_; int max_visits_; @@ -134,24 +134,22 @@ template<typename T> struct WalkState { }; template<typename T> Regexp::Walker<T>::Walker() { - stack_ = new std::stack<WalkState<T> >; stopped_early_ = false; } template<typename T> Regexp::Walker<T>::~Walker() { Reset(); - delete stack_; } // Clears the stack. Should never be necessary, since // Walk always enters and exits with an empty stack. // Logs DFATAL if stack is not already clear. template<typename T> void Regexp::Walker<T>::Reset() { - if (stack_ && stack_->size() > 0) { + if (!stack_.empty()) { LOG(DFATAL) << "Stack not empty."; - while (stack_->size() > 0) { - delete[] stack_->top().child_args; - stack_->pop(); + while (!stack_.empty()) { + delete[] stack_.top().child_args; + stack_.pop(); } } } @@ -165,12 +163,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, return top_arg; } - stack_->push(WalkState<T>(re, top_arg)); + stack_.push(WalkState<T>(re, top_arg)); WalkState<T>* s; for (;;) { T t; - s = &stack_->top(); + s = &stack_.top(); Regexp* re = s->re; switch (s->n) { case -1: { @@ -201,7 +199,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, s->child_args[s->n] = Copy(s->child_args[s->n - 1]); s->n++; } else { - stack_->push(WalkState<T>(sub[s->n], s->pre_arg)); + stack_.push(WalkState<T>(sub[s->n], s->pre_arg)); } continue; } @@ -214,12 +212,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, } } - // We've finished stack_->top(). + // We've finished stack_.top(). // Update next guy down. - stack_->pop(); - if (stack_->size() == 0) + stack_.pop(); + if (stack_.empty()) return t; - s = &stack_->top(); + s = &stack_.top(); if (s->child_args != NULL) s->child_args[s->n] = t; else |