summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2020-09-23 15:44:43 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2020-09-23 15:44:43 +0900
commite8d51c0b86328c541f6716785c897d2be4ead2da (patch)
tree38e29693556bccbdfd046350e7486dddff5a593b
parente245f8f640cb5beae4553f063f7cb94a088607ba (diff)
downloadre2-e8d51c0b86328c541f6716785c897d2be4ead2da.tar.gz
re2-e8d51c0b86328c541f6716785c897d2be4ead2da.tar.bz2
re2-e8d51c0b86328c541f6716785c897d2be4ead2da.zip
Imported Upstream version 20200601upstream/20200601
-rw-r--r--CMakeLists.txt10
-rw-r--r--Makefile63
-rw-r--r--re2/bitstate.cc13
-rw-r--r--re2/compile.cc22
-rw-r--r--re2/dfa.cc129
-rw-r--r--re2/nfa.cc144
-rw-r--r--re2/parse.cc11
-rw-r--r--re2/prog.cc79
-rw-r--r--re2/prog.h25
-rw-r--r--re2/re2.cc13
-rw-r--r--re2/regexp.cc120
-rw-r--r--re2/regexp.h7
-rw-r--r--re2/testing/backtrack.cc28
-rw-r--r--re2/testing/compile_test.cc2
-rw-r--r--re2/testing/re2_test.cc9
-rw-r--r--re2/testing/required_prefix_test.cc85
-rw-r--r--re2/walker-inl.h26
17 files changed, 457 insertions, 329 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c980f0..44a4772 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,11 @@ project(RE2 CXX)
include(CTest)
include(GNUInstallDirs)
+if(NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 11)
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(USEPCRE "use PCRE in tests and benchmarks" OFF)
@@ -36,11 +41,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
# Without a byte order mark (BOM), Visual Studio assumes that the source
# file is encoded using the current user code page, so we specify UTF-8.
add_compile_options(/utf-8)
-elseif(CYGWIN OR MINGW)
- # See https://stackoverflow.com/questions/38139631 for details.
- add_compile_options(-std=gnu++11)
-elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
- add_compile_options(-std=c++11)
endif()
if(WIN32)
diff --git a/Makefile b/Makefile
index a550717..20b8d0f 100644
--- a/Makefile
+++ b/Makefile
@@ -68,6 +68,7 @@ SOEXTVER00=$(SOEXT).$(SONAME).0.0
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS)
endif
+.PHONY: all
all: obj/libre2.a obj/so/libre2.$(SOEXT)
INSTALL_HFILES=\
@@ -176,40 +177,49 @@ DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
+.PRECIOUS: obj/%.o
obj/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
+.PRECIOUS: obj/dbg/%.o
obj/dbg/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc
+.PRECIOUS: obj/so/%.o
obj/so/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
+.PRECIOUS: obj/libre2.a
obj/libre2.a: $(OFILES)
@mkdir -p obj
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
+.PRECIOUS: obj/dbg/libre2.a
obj/dbg/libre2.a: $(DOFILES)
@mkdir -p obj/dbg
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
+.PRECIOUS: obj/so/libre2.$(SOEXT)
obj/so/libre2.$(SOEXT): $(SOFILES)
@mkdir -p obj/so
$(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES)
ln -sf libre2.$(SOEXTVER) $@
+.PRECIOUS: obj/dbg/test/%
obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
@mkdir -p obj/dbg/test
$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+.PRECIOUS: obj/test/%
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
# Test the shared lib, falling back to the static lib for private symbols
+.PRECIOUS: obj/so/test/%
obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
@mkdir -p obj/so/test
$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
@@ -229,64 +239,94 @@ obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o
$(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
ifdef REBUILD_TABLES
+.PRECIOUS: re2/perl_groups.cc
re2/perl_groups.cc: re2/make_perl_groups.pl
perl $< > $@
+.PRECIOUS: re2/unicode_%.cc
re2/unicode_%.cc: re2/make_unicode_%.py
python $< > $@
-
-.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
endif
+.PHONY: distclean
distclean: clean
rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
+.PHONY: clean
clean:
rm -rf obj
rm -f re2/*.pyc
+.PHONY: testofiles
testofiles: $(TESTOFILES)
+.PHONY: test
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
+.PHONY: debug-test
debug-test: $(DTESTS)
@./runtests $(DTESTS)
+.PHONY: static-test
static-test: $(TESTS)
@./runtests $(TESTS)
+.PHONY: shared-test
shared-test: $(STESTS)
@./runtests -shared-library-path obj/so $(STESTS)
+.PHONY: debug-bigtest
debug-bigtest: $(DTESTS) $(DBIGTESTS)
@./runtests $(DTESTS) $(DBIGTESTS)
+.PHONY: static-bigtest
static-bigtest: $(TESTS) $(BIGTESTS)
@./runtests $(TESTS) $(BIGTESTS)
+.PHONY: shared-bigtest
shared-bigtest: $(STESTS) $(SBIGTESTS)
@./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS)
+.PHONY: benchmark
benchmark: obj/test/regexp_benchmark
+.PHONY: fuzz
fuzz: obj/test/re2_fuzzer
-install: obj/libre2.a obj/so/libre2.$(SOEXT)
- mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
- $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
+.PHONY: install
+install: static-install shared-install
+
+.PHONY: static
+static: obj/libre2.a
+
+.PHONY: static-install
+static-install: obj/libre2.a common-install
$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
+
+.PHONY: shared
+shared: obj/so/libre2.$(SOEXT)
+
+.PHONY: shared-install
+shared-install: obj/so/libre2.$(SOEXT) common-install
$(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
+
+.PHONY: common-install
+common-install:
+ mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
+ $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
$(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+.PHONY: testinstall
testinstall: static-testinstall shared-testinstall
@echo
@echo Install tests passed.
@echo
+.PHONY: static-testinstall
static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS)
static-testinstall:
@@ -301,6 +341,7 @@ else
obj/testinstall
endif
+.PHONY: shared-testinstall
shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS)
shared-testinstall:
@@ -313,19 +354,14 @@ else
LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall
endif
+.PHONY: benchlog
benchlog: obj/test/regexp_benchmark
(echo '==BENCHMARK==' `hostname` `date`; \
(uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
echo; \
./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
-# Keep gmake from deleting intermediate files it creates.
-# This makes repeated builds faster and preserves debug info on OS X.
-
-.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
- obj/dbg/libre2.a obj/so/libre2.a \
- obj/test/% obj/so/test/% obj/dbg/test/%
-
+.PHONY: log
log:
$(MAKE) clean
$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
@@ -341,6 +377,3 @@ log:
echo '#' RE2 basic search tests built by make $@ >re2-search.txt
echo '#' $$(date) >>re2-search.txt
obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
-
-x: x.cc obj/libre2.a
- g++ -I. -o x x.cc obj/libre2.a
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index 865b58e..320d1ee 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -332,14 +332,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
- for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
- // Try to use memchr to find the first byte quickly.
- int fb = prog_->first_byte();
- if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) {
- p = reinterpret_cast<const char*>(
- memchr(p, fb, text.data() + text.size() - p));
+ const char* etext = text.data() + text.size();
+ for (const char* p = text.data(); p <= etext; p++) {
+ // Try to use prefix accel (e.g. memchr) to skip ahead.
+ if (p < etext && prog_->can_prefix_accel()) {
+ p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
if (p == NULL)
- p = text.data() + text.size();
+ p = etext;
}
cap_[0] = p;
diff --git a/re2/compile.cc b/re2/compile.cc
index b3e5bdc..30e55f5 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -225,8 +225,8 @@ class Compiler : public Regexp::Walker<Frag> {
// Single rune.
Frag Literal(Rune r, bool foldcase);
- void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
- Prog* Finish();
+ void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
+ Prog* Finish(Regexp* re);
// Returns .* where dot = any byte
Frag DotStar();
@@ -1167,10 +1167,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
c.prog_->set_start_unanchored(all.begin);
// Hand ownership of prog_ to caller.
- return c.Finish();
+ return c.Finish(re);
}
-Prog* Compiler::Finish() {
+Prog* Compiler::Finish(Regexp* re) {
if (failed_)
return NULL;
@@ -1186,7 +1186,17 @@ Prog* Compiler::Finish() {
prog_->Optimize();
prog_->Flatten();
prog_->ComputeByteMap();
- prog_->ComputeFirstByte();
+
+ if (!prog_->reversed()) {
+ std::string prefix;
+ bool prefix_foldcase;
+ if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) &&
+ !prefix_foldcase) {
+ prog_->prefix_size_ = prefix.size();
+ prog_->prefix_front_ = prefix.front();
+ prog_->prefix_back_ = prefix.back();
+ }
+ }
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
@@ -1244,7 +1254,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
c.prog_->set_start(all.begin);
c.prog_->set_start_unanchored(all.begin);
- Prog* prog = c.Finish();
+ Prog* prog = c.Finish(re);
if (prog == NULL)
return NULL;
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 0934f5a..b25cc97 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -167,12 +167,6 @@ class DFA {
typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
private:
- // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.)
- enum {
- kFbUnknown = -1, // No analysis has been performed.
- kFbNone = -2, // The first-byte trick cannot be used.
- };
-
enum {
// Indices into start_ for unanchored searches.
// Add kStartAnchored for anchored searches.
@@ -239,25 +233,26 @@ class DFA {
struct SearchParams {
SearchParams(const StringPiece& text, const StringPiece& context,
RWLocker* cache_lock)
- : text(text), context(context),
+ : text(text),
+ context(context),
anchored(false),
+ can_prefix_accel(false),
want_earliest_match(false),
run_forward(false),
start(NULL),
- first_byte(kFbUnknown),
cache_lock(cache_lock),
failed(false),
ep(NULL),
- matches(NULL) { }
+ matches(NULL) {}
StringPiece text;
StringPiece context;
bool anchored;
+ bool can_prefix_accel;
bool want_earliest_match;
bool run_forward;
State* start;
- int first_byte;
- RWLocker *cache_lock;
+ RWLocker* cache_lock;
bool failed; // "out" parameter: whether search gave up
const char* ep; // "out" parameter: end pointer for match
SparseSet* matches;
@@ -268,15 +263,13 @@ class DFA {
};
// Before each search, the parameters to Search are analyzed by
- // AnalyzeSearch to determine the state in which to start and the
- // "first_byte" for that state, if any.
+ // AnalyzeSearch to determine the state in which to start.
struct StartInfo {
- StartInfo() : start(NULL), first_byte(kFbUnknown) {}
- State* start;
- std::atomic<int> first_byte;
+ StartInfo() : start(NULL) {}
+ std::atomic<State*> start;
};
- // Fills in params->start and params->first_byte using
+ // Fills in params->start and params->can_prefix_accel using
// the other search parameters. Returns true on success,
// false on failure.
// cache_mutex_.r <= L < mutex_
@@ -288,7 +281,7 @@ class DFA {
// cache_mutex_.r <= L < mutex_
// Might unlock and relock cache_mutex_ via params->cache_lock.
inline bool InlinedSearchLoop(SearchParams* params,
- bool have_first_byte,
+ bool can_prefix_accel,
bool want_earliest_match,
bool run_forward);
@@ -606,7 +599,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
// those are the only operators with any effect in
// RunWorkqOnEmptyString or RunWorkqOnByte.
- int* inst = new int[q->size()];
+ PODArray<int> inst(q->size());
int n = 0;
uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions
bool sawmatch = false; // whether queue contains guaranteed kInstMatch
@@ -636,7 +629,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
(it == q->begin() && ip->greedy(prog_))) &&
(kind_ != Prog::kLongestMatch || !sawmark) &&
(flag & kFlagMatch)) {
- delete[] inst;
if (ExtraDebug)
fprintf(stderr, " -> FullMatchState\n");
return FullMatchState;
@@ -683,7 +675,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// the execution loop can stop early. This is only okay
// if the state is *not* a matching state.
if (n == 0 && flag == 0) {
- delete[] inst;
if (ExtraDebug)
fprintf(stderr, " -> DeadState\n");
return DeadState;
@@ -693,7 +684,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// unordered state sets separated by Marks. Sort each set
// to canonicalize, to reduce the number of distinct sets stored.
if (kind_ == Prog::kLongestMatch) {
- int* ip = inst;
+ int* ip = inst.data();
int* ep = ip + n;
while (ip < ep) {
int* markp = ip;
@@ -710,7 +701,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// we have an unordered set of states (i.e. we don't have Marks)
// and sorting will reduce the number of distinct sets stored.
if (kind_ == Prog::kManyMatch) {
- int* ip = inst;
+ int* ip = inst.data();
int* ep = ip + n;
std::sort(ip, ep);
}
@@ -729,8 +720,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// Save the needed empty-width flags in the top bits for use later.
flag |= needflags << kFlagNeedShift;
- State* state = CachedState(inst, n, flag);
- delete[] inst;
+ State* state = CachedState(inst.data(), n, flag);
return state;
}
@@ -1183,10 +1173,8 @@ void DFA::ResetCache(RWLocker* cache_lock) {
});
// Clear the cache, reset the memory budget.
- for (int i = 0; i < kMaxStart; i++) {
- start_[i].start = NULL;
- start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed);
- }
+ for (int i = 0; i < kMaxStart; i++)
+ start_[i].start.store(NULL, std::memory_order_relaxed);
ClearCache();
mem_budget_ = state_budget_;
}
@@ -1301,8 +1289,7 @@ DFA::State* DFA::StateSaver::Restore() {
// situation, the DFA can do better than executing the simple loop.
// Instead, it can call memchr to search very quickly for the byte c.
// Whether the start state has this property is determined during a
-// pre-compilation pass, and if so, the byte b is passed to the search
-// loop as the "first_byte" argument, along with a boolean "have_first_byte".
+// pre-compilation pass and the "can_prefix_accel" argument is set.
//
// Fourth, the desired behavior is to search for the leftmost-best match
// (approximately, the same one that Perl would find), which is not
@@ -1335,7 +1322,7 @@ DFA::State* DFA::StateSaver::Restore() {
// making them function arguments lets the inliner specialize
// this function to each combination (see two paragraphs above).
inline bool DFA::InlinedSearchLoop(SearchParams* params,
- bool have_first_byte,
+ bool can_prefix_accel,
bool want_earliest_match,
bool run_forward) {
State* start = params->start;
@@ -1380,12 +1367,12 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
if (ExtraDebug)
fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
- if (run_forward && have_first_byte && s == start) {
- // In start state, only way out is to find first_byte,
- // so use optimized assembly in memchr to skip ahead.
- // If first_byte isn't found, we can skip to the end
- // of the string.
- if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) {
+ if (can_prefix_accel && s == start) {
+ // In start state, only way out is to find the prefix,
+ // so we use prefix accel (e.g. memchr) to skip ahead.
+ // If not found, we can skip to the end of the string.
+ p = BytePtr(prog_->PrefixAccel(p, ep - p));
+ if (p == NULL) {
p = ep;
break;
}
@@ -1589,7 +1576,7 @@ bool DFA::SearchTTT(SearchParams* params) {
// For debugging, calls the general code directly.
bool DFA::SlowSearchLoop(SearchParams* params) {
return InlinedSearchLoop(params,
- params->first_byte >= 0,
+ params->can_prefix_accel,
params->want_earliest_match,
params->run_forward);
}
@@ -1610,8 +1597,7 @@ bool DFA::FastSearchLoop(SearchParams* params) {
&DFA::SearchTTT,
};
- bool have_first_byte = params->first_byte >= 0;
- int index = 4 * have_first_byte +
+ int index = 4 * params->can_prefix_accel +
2 * params->want_earliest_match +
1 * params->run_forward;
return (this->*Searches[index])(params);
@@ -1703,13 +1689,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
}
}
+ params->start = info->start.load(std::memory_order_acquire);
+
+ // Even if we could prefix accel, we cannot do so when anchored and,
+ // less obviously, we cannot do so when we are going to need flags.
+ // This trick works only when there is a single byte that leads to a
+ // different state!
+ if (prog_->can_prefix_accel() &&
+ !params->anchored &&
+ params->start > SpecialStateMax &&
+ params->start->flag_ >> kFlagNeedShift == 0)
+ params->can_prefix_accel = true;
+
if (ExtraDebug)
- fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n",
+ fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
params->anchored, params->run_forward, flags,
- DumpState(info->start).c_str(), info->first_byte.load());
-
- params->start = info->start;
- params->first_byte = info->first_byte.load(std::memory_order_acquire);
+ DumpState(params->start).c_str(), params->can_prefix_accel);
return true;
}
@@ -1718,47 +1713,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
uint32_t flags) {
// Quick check.
- int fb = info->first_byte.load(std::memory_order_acquire);
- if (fb != kFbUnknown)
+ State* start = info->start.load(std::memory_order_acquire);
+ if (start != NULL)
return true;
MutexLock l(&mutex_);
- fb = info->first_byte.load(std::memory_order_relaxed);
- if (fb != kFbUnknown)
+ start = info->start.load(std::memory_order_relaxed);
+ if (start != NULL)
return true;
q0_->clear();
AddToQueue(q0_,
params->anchored ? prog_->start() : prog_->start_unanchored(),
flags);
- info->start = WorkqToCachedState(q0_, NULL, flags);
- if (info->start == NULL)
+ start = WorkqToCachedState(q0_, NULL, flags);
+ if (start == NULL)
return false;
- if (info->start == DeadState) {
- // Synchronize with "quick check" above.
- info->first_byte.store(kFbNone, std::memory_order_release);
- return true;
- }
-
- if (info->start == FullMatchState) {
- // Synchronize with "quick check" above.
- info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored
- return true;
- }
-
- // Even if we have a first_byte, we cannot use it when anchored and,
- // less obviously, we cannot use it when we are going to need flags.
- // This trick works only when there is a single byte that leads to a
- // different state!
- int first_byte = prog_->first_byte();
- if (first_byte == -1 ||
- params->anchored ||
- info->start->flag_ >> kFlagNeedShift != 0)
- first_byte = kFbNone;
-
// Synchronize with "quick check" above.
- info->first_byte.store(first_byte, std::memory_order_release);
+ info->start.store(start, std::memory_order_release);
return true;
}
@@ -1866,13 +1839,13 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
StringPiece context = const_context;
if (context.data() == NULL)
context = text;
- bool carat = anchor_start();
+ bool caret = anchor_start();
bool dollar = anchor_end();
if (reversed_) {
using std::swap;
- swap(carat, dollar);
+ swap(caret, dollar);
}
- if (carat && context.begin() != text.begin())
+ if (caret && context.begin() != text.begin())
return false;
if (dollar && context.end() != text.end())
return false;
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 75ca306..19ee08d 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -27,6 +27,7 @@
#include <stdio.h>
#include <string.h>
#include <algorithm>
+#include <deque>
#include <string>
#include <utility>
#include <vector>
@@ -107,18 +108,21 @@ class NFA {
// Returns text version of capture information, for debugging.
std::string FormatCapture(const char** capture);
- inline void CopyCapture(const char** dst, const char** src);
+ void CopyCapture(const char** dst, const char** src) {
+ memmove(dst, src, ncapture_*sizeof src[0]);
+ }
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
- const char* btext_; // beginning of text being matched (for FormatSubmatch)
- const char* etext_; // end of text being matched (for endmatch_)
+ const char* btext_; // beginning of text (for FormatSubmatch)
+ const char* etext_; // end of text (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
PODArray<AddState> stack_; // pre-allocated for AddToThreadq
- Thread* free_threads_; // free list
+ std::deque<Thread> arena_; // thread arena
+ Thread* freelist_; // thread freelist
const char** match_; // best match so far
bool matched_; // any match so far?
@@ -141,31 +145,30 @@ NFA::NFA(Prog* prog) {
prog_->inst_count(kInstEmptyWidth) +
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
stack_ = PODArray<AddState>(nstack);
- free_threads_ = NULL;
+ freelist_ = NULL;
match_ = NULL;
matched_ = false;
}
NFA::~NFA() {
delete[] match_;
- Thread* next;
- for (Thread* t = free_threads_; t; t = next) {
- next = t->next;
- delete[] t->capture;
- delete t;
- }
+ for (const Thread& t : arena_)
+ delete[] t.capture;
}
NFA::Thread* NFA::AllocThread() {
- Thread* t = free_threads_;
- if (t == NULL) {
- t = new Thread;
+ Thread* t = freelist_;
+ if (t != NULL) {
+ freelist_ = t->next;
t->ref = 1;
- t->capture = new const char*[ncapture_];
+ // We don't need to touch t->capture because
+ // the caller will immediately overwrite it.
return t;
}
- free_threads_ = t->next;
+ arena_.emplace_back();
+ t = &arena_.back();
t->ref = 1;
+ t->capture = new const char*[ncapture_];
return t;
}
@@ -176,21 +179,13 @@ NFA::Thread* NFA::Incref(Thread* t) {
}
void NFA::Decref(Thread* t) {
- if (t == NULL)
- return;
+ DCHECK(t != NULL);
t->ref--;
if (t->ref > 0)
return;
DCHECK_EQ(t->ref, 0);
- t->next = free_threads_;
- free_threads_ = t;
-}
-
-void NFA::CopyCapture(const char** dst, const char** src) {
- for (int i = 0; i < ncapture_; i+=2) {
- dst[i] = src[i];
- dst[i+1] = src[i+1];
- }
+ t->next = freelist_;
+ freelist_ = t;
}
// Follows all empty arrows from id0 and enqueues all the states reached.
@@ -372,8 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
matched_ = true;
Decref(t);
- for (++i; i != runq->end(); ++i)
- Decref(i->value());
+ for (++i; i != runq->end(); ++i) {
+ if (i->value() != NULL)
+ Decref(i->value());
+ }
runq->clear();
if (ip->greedy(prog_))
return ip->out1();
@@ -416,8 +413,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
// worse than the one we just found: don't run the
// rest of the current Threadq.
Decref(t);
- for (++i; i != runq->end(); ++i)
- Decref(i->value());
+ for (++i; i != runq->end(); ++i) {
+ if (i->value() != NULL)
+ Decref(i->value());
+ }
runq->clear();
return 0;
}
@@ -489,6 +488,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
}
match_ = new const char*[ncapture_];
+ memset(match_, 0, ncapture_*sizeof match_[0]);
matched_ = false;
// For debugging prints.
@@ -505,7 +505,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
- memset(&match_[0], 0, ncapture_*sizeof match_[0]);
// Loop over the text, stepping the machine.
for (const char* p = text.data();; p++) {
@@ -572,16 +571,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.data())) {
- // If there's a required first byte for an unanchored search
- // and we're not in the middle of any possible matches,
- // use memchr to search for the byte quickly.
- int fb = prog_->first_byte();
+ // Try to use prefix accel (e.g. memchr) to skip ahead.
+ // The search must be unanchored and there must be zero
+ // possible matches already.
if (!anchored && runq->size() == 0 &&
- fb >= 0 && p < etext_ && (p[0] & 0xFF) != fb) {
- p = reinterpret_cast<const char*>(memchr(p, fb, etext_ - p));
- if (p == NULL) {
+ p < etext_ && prog_->can_prefix_accel()) {
+ p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
+ if (p == NULL)
p = etext_;
- }
}
Thread* t = AllocThread();
@@ -612,8 +609,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
}
}
- for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
- Decref(i->value());
+ for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
+ if (i->value() != NULL)
+ Decref(i->value());
+ }
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
@@ -629,67 +628,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
return false;
}
-void Prog::ComputeFirstByte() {
- SparseSet q(size());
- q.insert(start());
- for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
- int id = *it;
- Prog::Inst* ip = inst(id);
- switch (ip->opcode()) {
- default:
- LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
- break;
-
- case kInstMatch:
- // The empty string matches: no first byte.
- first_byte_ = -1;
- return;
-
- case kInstByteRange:
- if (!ip->last())
- q.insert(id+1);
-
- // Must match only a single byte.
- if (ip->lo() != ip->hi() ||
- (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')) {
- first_byte_ = -1;
- return;
- }
- // If we haven't seen any bytes yet, record it;
- // otherwise must match the one we saw before.
- if (first_byte_ == -1) {
- first_byte_ = ip->lo();
- } else if (first_byte_ != ip->lo()) {
- first_byte_ = -1;
- return;
- }
- break;
-
- case kInstNop:
- case kInstCapture:
- case kInstEmptyWidth:
- if (!ip->last())
- q.insert(id+1);
-
- // Continue on.
- // Ignore ip->empty() flags for kInstEmptyWidth
- // in order to be as conservative as possible
- // (assume all possible empty-width flags are true).
- if (ip->out())
- q.insert(ip->out());
- break;
-
- case kInstAltMatch:
- DCHECK(!ip->last());
- q.insert(id+1);
- break;
-
- case kInstFail:
- break;
- }
- }
-}
-
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
diff --git a/re2/parse.cc b/re2/parse.cc
index 0a5ff74..e741603 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -93,7 +93,7 @@ class Regexp::ParseState {
bool PushSimpleOp(RegexpOp op);
// Pushes a ^ onto the stack.
- bool PushCarat();
+ bool PushCaret();
// Pushes a \b (word == true) or \B (word == false) onto the stack.
bool PushWordBoundary(bool word);
@@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) {
}
// Pushes a ^ onto the stack.
-bool Regexp::ParseState::PushCarat() {
+bool Regexp::ParseState::PushCaret() {
if (flags_ & OneLine) {
return PushSimpleOp(kRegexpBeginText);
}
@@ -1802,14 +1802,13 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
// Convert the UnicodeSet to a URange32 and UGroup that we can add.
int nr = uset.getRangeCount();
- URange32* r = new URange32[nr];
+ PODArray<URange32> r(nr);
for (int i = 0; i < nr; i++) {
r[i].lo = uset.getRangeStart(i);
r[i].hi = uset.getRangeEnd(i);
}
- UGroup g = {"", +1, 0, 0, r, nr};
+ UGroup g = {"", +1, 0, 0, r.data(), nr};
AddUGroup(cc, &g, sign, parse_flags);
- delete[] r;
#endif
return kParseOk;
@@ -2272,7 +2271,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
break;
case '^': // Beginning of line.
- if (!ps.PushCarat())
+ if (!ps.PushCaret())
return NULL;
t.remove_prefix(1); // '^'
break;
diff --git a/re2/prog.cc b/re2/prog.cc
index 9244319..ac9c085 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -7,6 +7,12 @@
#include "re2/prog.h"
+#if defined(__AVX2__)
+#include <immintrin.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#endif
#include <stdint.h>
#include <string.h>
#include <algorithm>
@@ -109,7 +115,9 @@ Prog::Prog()
start_unanchored_(0),
size_(0),
bytemap_range_(0),
- first_byte_(-1),
+ prefix_size_(0),
+ prefix_front_(-1),
+ prefix_back_(-1),
list_count_(0),
dfa_mem_(0),
dfa_first_(NULL),
@@ -908,4 +916,73 @@ void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
}
}
+#if defined(__AVX2__)
+// Finds the least significant non-zero bit in n.
+static int FindLSBSet(uint32_t n) {
+ DCHECK_NE(n, 0);
+#if defined(__GNUC__)
+ return __builtin_ctz(n);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+ unsigned long c;
+ _BitScanForward(&c, n);
+ return static_cast<int>(c);
+#else
+ int c = 31;
+ for (int shift = 1 << 4; shift != 0; shift >>= 1) {
+ uint32_t word = n << shift;
+ if (word != 0) {
+ n = word;
+ c -= shift;
+ }
+ }
+ return c;
+#endif
+}
+#endif
+
+const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
+ DCHECK_GE(prefix_size_, 2);
+ if (size < prefix_size_)
+ return NULL;
+ // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
+ // This also means that probing for prefix_back_ doesn't go out of bounds.
+ size -= prefix_size_-1;
+
+#if defined(__AVX2__)
+ // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
+ if (size >= sizeof(__m256i)) {
+ const __m256i* fp = reinterpret_cast<const __m256i*>(
+ reinterpret_cast<const char*>(data));
+ const __m256i* bp = reinterpret_cast<const __m256i*>(
+ reinterpret_cast<const char*>(data) + prefix_size_-1);
+ const __m256i* endfp = fp + size/sizeof(__m256i);
+ const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
+ const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
+ while (fp != endfp) {
+ const __m256i f_loadu = _mm256_loadu_si256(fp++);
+ const __m256i b_loadu = _mm256_loadu_si256(bp++);
+ const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
+ const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
+ const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
+ if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero.
+ const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
+ const int fb_movemask = _mm256_movemask_epi8(fb_and);
+ const int fb_ctz = FindLSBSet(fb_movemask);
+ return reinterpret_cast<const char*>(fp-1) + fb_ctz;
+ }
+ }
+ data = fp;
+ size = size%sizeof(__m256i);
+ }
+#endif
+
+ const char* p0 = reinterpret_cast<const char*>(data);
+ for (const char* p = p0;; p++) {
+ DCHECK_GE(size, static_cast<size_t>(p-p0));
+ p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
+ if (p == NULL || p[prefix_size_-1] == prefix_back_)
+ return p;
+ }
+}
+
} // namespace re2
diff --git a/re2/prog.h b/re2/prog.h
index 4306672..e9ce682 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -198,8 +198,8 @@ class Prog {
Inst *inst(int id) { return &inst_[id]; }
int start() { return start_; }
- int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
+ int start_unanchored() { return start_unanchored_; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int size() { return size_; }
bool reversed() { return reversed_; }
@@ -207,15 +207,27 @@ class Prog {
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
uint16_t* list_heads() { return list_heads_.data(); }
- void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
+ void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
- int first_byte() { return first_byte_; }
+ bool can_prefix_accel() { return prefix_size_ != 0; }
+
+ // Accelerates to the first likely occurrence of the prefix.
+ // Returns a pointer to the first byte or NULL if not found.
+ const void* PrefixAccel(const void* data, size_t size) {
+ DCHECK_GE(prefix_size_, 1);
+ return prefix_size_ == 1 ? memchr(data, prefix_front_, size)
+ : PrefixAccel_FrontAndBack(data, size);
+ }
+
+ // An implementation of prefix accel that looks for prefix_front_ and
+ // prefix_back_ to return fewer false positives than memchr(3) alone.
+ const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
// Returns string representation of program for debugging.
std::string Dump();
@@ -293,9 +305,6 @@ class Prog {
// Compute bytemap.
void ComputeByteMap();
- // Computes whether all matches must begin with the same first byte.
- void ComputeFirstByte();
-
// Run peep-hole optimizer on program.
void Optimize();
@@ -397,7 +406,9 @@ class Prog {
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
- int first_byte_; // required first byte for match, or -1 if none
+ size_t prefix_size_; // size of prefix (0 if no prefix)
+ int prefix_front_; // first byte of prefix (-1 if no prefix)
+ int prefix_back_; // last byte of prefix (-1 if no prefix)
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
diff --git a/re2/re2.cc b/re2/re2.cc
index d231a21..35fd1cc 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -293,7 +293,7 @@ static int FindMSBSet(uint32_t n) {
DCHECK_NE(n, 0);
#if defined(__GNUC__)
return 31 ^ __builtin_clz(n);
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
unsigned long c;
_BitScanReverse(&c, n);
return static_cast<int>(c);
@@ -405,6 +405,8 @@ bool RE2::Replace(std::string* str,
const StringPiece& rewrite) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
+ if (nvec > 1 + re.NumberOfCapturingGroups())
+ return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
@@ -425,6 +427,8 @@ int RE2::GlobalReplace(std::string* str,
const StringPiece& rewrite) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
+ if (nvec > 1 + re.NumberOfCapturingGroups())
+ return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
@@ -497,9 +501,10 @@ bool RE2::Extract(const StringPiece& text,
std::string* out) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
+ if (nvec > 1 + re.NumberOfCapturingGroups())
+ return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
-
if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
return false;
@@ -1012,8 +1017,8 @@ bool RE2::Rewrite(std::string* out,
int n = (c - '0');
if (n >= veclen) {
if (options_.log_errors()) {
- LOG(ERROR) << "requested group " << n
- << " in regexp " << rewrite.data();
+ LOG(ERROR) << "invalid substitution \\" << n
+ << " from " << veclen << " groups";
}
return false;
}
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 5732166..4364d0a 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -20,6 +20,7 @@
#include "util/logging.h"
#include "util/mutex.h"
#include "util/utf.h"
+#include "re2/pod_array.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
@@ -243,16 +244,15 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
return new Regexp(kRegexpEmptyMatch, flags);
}
- Regexp** subcopy = NULL;
+ PODArray<Regexp*> subcopy;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
- subcopy = new Regexp*[nsub];
- memmove(subcopy, sub, nsub * sizeof sub[0]);
- sub = subcopy;
+ subcopy = PODArray<Regexp*>(nsub);
+ memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
+ sub = subcopy.data();
nsub = FactorAlternation(sub, nsub, flags);
if (nsub == 1) {
Regexp* re = sub[0];
- delete[] subcopy;
return re;
}
}
@@ -269,7 +269,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
nsub - (nbigsub-1)*kMaxNsub, flags,
false);
- delete[] subcopy;
return re;
}
@@ -278,8 +277,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
-
- delete[] subcopy;
return re;
}
@@ -658,78 +655,83 @@ std::map<int, std::string>* Regexp::CaptureNames() {
return w.TakeMap();
}
+void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
+ std::string* bytes) {
+ if (latin1) {
+ bytes->resize(nrunes);
+ for (int i = 0; i < nrunes; i++)
+ (*bytes)[i] = static_cast<char>(runes[i]);
+ } else {
+ bytes->resize(nrunes * UTFmax); // worst case
+ char* p = &(*bytes)[0];
+ for (int i = 0; i < nrunes; i++)
+ p += runetochar(p, &runes[i]);
+ bytes->resize(p - &(*bytes)[0]);
+ bytes->shrink_to_fit();
+ }
+}
+
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix) {
+ prefix->clear();
+ *foldcase = false;
+ *suffix = NULL;
+
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
- prefix->clear();
- *foldcase = false;
- *suffix = NULL;
if (op_ != kRegexpConcat)
return false;
-
- // Some number of anchors, then a literal or concatenation.
int i = 0;
- Regexp** sub = this->sub();
- while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
+ while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
i++;
if (i == 0 || i >= nsub_)
return false;
-
- Regexp* re = sub[i];
- switch (re->op_) {
- default:
- return false;
-
- case kRegexpLiteralString:
- // Convert to string in proper encoding.
- if (re->parse_flags() & Latin1) {
- prefix->resize(re->nrunes_);
- for (int j = 0; j < re->nrunes_; j++)
- (*prefix)[j] = static_cast<char>(re->runes_[j]);
- } else {
- // Convert to UTF-8 in place.
- // Assume worst-case space and then trim.
- prefix->resize(re->nrunes_ * UTFmax);
- char *p = &(*prefix)[0];
- for (int j = 0; j < re->nrunes_; j++) {
- Rune r = re->runes_[j];
- if (r < Runeself)
- *p++ = static_cast<char>(r);
- else
- p += runetochar(p, &r);
- }
- prefix->resize(p - &(*prefix)[0]);
- }
- break;
-
- case kRegexpLiteral:
- if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
- prefix->append(1, static_cast<char>(re->rune_));
- } else {
- char buf[UTFmax];
- prefix->append(buf, runetochar(buf, &re->rune_));
- }
- break;
- }
- *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
+ Regexp* re = sub()[i];
+ if (re->op_ != kRegexpLiteral &&
+ re->op_ != kRegexpLiteralString)
+ return false;
i++;
-
- // The rest.
if (i < nsub_) {
for (int j = i; j < nsub_; j++)
- sub[j]->Incref();
- re = Concat(sub + i, nsub_ - i, parse_flags());
+ sub()[j]->Incref();
+ *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
} else {
- re = new Regexp(kRegexpEmptyMatch, parse_flags());
+ *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
}
- *suffix = re;
+
+ bool latin1 = (re->parse_flags() & Latin1) != 0;
+ Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
+ int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
+ ConvertRunesToBytes(latin1, runes, nrunes, prefix);
+ *foldcase = (re->parse_flags() & FoldCase) != 0;
+ return true;
+}
+
+// Determines whether regexp matches must be unanchored
+// with a fixed string prefix. If so, returns the prefix.
+// The prefix might be ASCII case-insensitive.
+bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
+ prefix->clear();
+ *foldcase = false;
+
+ // No need for a walker: the regexp must either begin with or be
+ // a literal char or string.
+ Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
+ if (re->op_ != kRegexpLiteral &&
+ re->op_ != kRegexpLiteralString)
+ return false;
+
+ bool latin1 = (re->parse_flags() & Latin1) != 0;
+ Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
+ int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
+ ConvertRunesToBytes(latin1, runes, nrunes, prefix);
+ *foldcase = (re->parse_flags() & FoldCase) != 0;
return true;
}
diff --git a/re2/regexp.h b/re2/regexp.h
index a5d85c8..5284ab5 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -440,6 +440,13 @@ class Regexp {
bool RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix);
+ // Whether every match of this regexp must be unanchored and
+ // begin with a non-empty fixed string (perhaps after ASCII
+ // case-folding). If so, returns the prefix.
+ // Callers should expect *prefix and *foldcase to be "zeroed"
+ // regardless of the return value.
+ bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
+
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc
index 3bab1e7..216d259 100644
--- a/re2/testing/backtrack.cc
+++ b/re2/testing/backtrack.cc
@@ -29,6 +29,7 @@
#include "util/util.h"
#include "util/logging.h"
+#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@@ -53,7 +54,6 @@ namespace re2 {
class Backtracker {
public:
explicit Backtracker(Prog* prog);
- ~Backtracker();
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
@@ -79,9 +79,8 @@ class Backtracker {
int nsubmatch_; // # of submatches to fill in
// Search state
- const char* cap_[64]; // capture registers
- uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
- size_t nvisited_; // # of words in bitmap
+ const char* cap_[64]; // capture registers
+ PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited
Backtracker(const Backtracker&) = delete;
Backtracker& operator=(const Backtracker&) = delete;
@@ -93,13 +92,7 @@ Backtracker::Backtracker(Prog* prog)
longest_(false),
endmatch_(false),
submatch_(NULL),
- nsubmatch_(0),
- visited_(NULL),
- nvisited_(0) {
-}
-
-Backtracker::~Backtracker() {
- delete[] visited_;
+ nsubmatch_(0) {
}
// Runs a backtracking search.
@@ -133,10 +126,10 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
// Allocate new visited_ bitmap -- size is proportional
// to text, so have to reallocate on each call to Search.
- delete[] visited_;
- nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
- visited_ = new uint32_t[nvisited_];
- memset(visited_, 0, nvisited_*sizeof visited_[0]);
+ int nvisited = prog_->size() * static_cast<int>(text.size()+1);
+ nvisited = (nvisited + 31) / 32;
+ visited_ = PODArray<uint32_t>(nvisited);
+ memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
// Anchored search must start at text.begin().
if (anchored_) {
@@ -166,8 +159,9 @@ bool Backtracker::Visit(int id, const char* p) {
// either it didn't match or it did but we're hoping for a better match.
// Either way, don't go down that road again.
CHECK(p <= text_.data() + text_.size());
- size_t n = id*(text_.size()+1) + (p - text_.data());
- CHECK_LT(n/32, nvisited_);
+ int n = id * static_cast<int>(text_.size()+1) +
+ static_cast<int>(p-text_.data());
+ CHECK_LT(n/32, visited_.size());
if (visited_[n/32] & (1 << (n&31)))
return false;
visited_[n/32] |= 1 << (n&31);
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 4598aa6..2096e2f 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -236,7 +236,7 @@ TEST(TestCompile, InsufficientMemory) {
"^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
Regexp::LikePerl, NULL);
EXPECT_TRUE(re != NULL);
- Prog* prog = re->CompileToProg(920);
+ Prog* prog = re->CompileToProg(850);
// If the memory budget has been exhausted, compilation should fail
// and return NULL instead of trying to do anything with NoMatch().
EXPECT_TRUE(prog == NULL);
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 52e1f6f..ec457fb 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -224,6 +224,15 @@ TEST(RE2, Extract) {
ASSERT_EQ(s, "'foo'");
}
+TEST(RE2, MaxSubmatchTooLarge) {
+ std::string s;
+ ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s));
+ s = "foo";
+ ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2"));
+ s = "foo";
+ ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2"));
+}
+
TEST(RE2, Consume) {
RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace
std::string word;
diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc
index 5460045..c00e812 100644
--- a/re2/testing/required_prefix_test.cc
+++ b/re2/testing/required_prefix_test.cc
@@ -6,6 +6,7 @@
#include "util/test.h"
#include "util/logging.h"
+#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
@@ -19,10 +20,13 @@ struct PrefixTest {
};
static PrefixTest tests[] = {
- // If the regexp is missing a ^, there's no required prefix.
- { "abc", false },
+ // Empty cases.
{ "", false },
{ "(?m)^", false },
+ { "(?-m)^", false },
+
+ // If the regexp has no ^, there's no required prefix.
+ { "abc", false },
// If the regexp immediately goes into
// something not a literal match, there's no required prefix.
@@ -53,15 +57,15 @@ TEST(RequiredPrefix, SimpleTests) {
bool f;
Regexp* s;
ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf")
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
<< " " << re->Dump();
if (t.return_value) {
ASSERT_EQ(p, std::string(t.prefix))
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(f, t.foldcase)
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(s->ToString(), std::string(t.suffix))
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
s->Decref();
}
re->Decref();
@@ -69,4 +73,73 @@ TEST(RequiredPrefix, SimpleTests) {
}
}
+static PrefixTest for_accel_tests[] = {
+ // Empty cases.
+ { "", false },
+ { "(?m)^", false },
+ { "(?-m)^", false },
+
+ // If the regexp has a ^, there's no required prefix.
+ { "^abc", false },
+
+ // If the regexp immediately goes into
+ // something not a literal match, there's no required prefix.
+ { "(abc)", false },
+ { "a*", false },
+
+ // Otherwise, it should work.
+ { "abc$", true, "abc", false, },
+ { "abc", true, "abc", false, },
+ { "(?i)abc", true, "abc", true, },
+ { "abcd*", true, "abc", false, },
+ { "[Aa][Bb]cd*", true, "ab", true, },
+ { "ab[Cc]d*", true, "ab", false, },
+ { "☺abc", true, "☺abc", false, },
+};
+
+TEST(RequiredPrefixForAccel, SimpleTests) {
+ for (size_t i = 0; i < arraysize(for_accel_tests); i++) {
+ const PrefixTest& t = for_accel_tests[i];
+ for (size_t j = 0; j < 2; j++) {
+ Regexp::ParseFlags flags = Regexp::LikePerl;
+ if (j == 0)
+ flags = flags | Regexp::Latin1;
+ Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+ ASSERT_TRUE(re != NULL) << " " << t.regexp;
+
+ std::string p;
+ bool f;
+ ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
+ << " " << re->Dump();
+ if (t.return_value) {
+ ASSERT_EQ(p, std::string(t.prefix))
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
+ ASSERT_EQ(f, t.foldcase)
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
+ }
+ re->Decref();
+ }
+ }
+}
+
+TEST(PrefixAccel, BasicTest) {
+ Regexp* re = Regexp::Parse("abc\\d+", Regexp::LikePerl, NULL);
+ ASSERT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(0);
+ ASSERT_TRUE(prog != NULL);
+ for (int i = 0; i < 100; i++) {
+ std::string text(i, 'a');
+ const char* p = reinterpret_cast<const char*>(
+ prog->PrefixAccel(text.data(), text.size()));
+ EXPECT_TRUE(p == NULL);
+ text.append("abc");
+ p = reinterpret_cast<const char*>(
+ prog->PrefixAccel(text.data(), text.size()));
+ EXPECT_EQ(i, p-text.data());
+ }
+ delete prog;
+ re->Decref();
+}
+
} // namespace re2
diff --git a/re2/walker-inl.h b/re2/walker-inl.h
index 310be54..8cfe799 100644
--- a/re2/walker-inl.h
+++ b/re2/walker-inl.h
@@ -89,7 +89,7 @@ template<typename T> class Regexp::Walker {
private:
// Walk state for the entire traversal.
- std::stack<WalkState<T> >* stack_;
+ std::stack<WalkState<T>> stack_;
bool stopped_early_;
int max_visits_;
@@ -134,24 +134,22 @@ template<typename T> struct WalkState {
};
template<typename T> Regexp::Walker<T>::Walker() {
- stack_ = new std::stack<WalkState<T> >;
stopped_early_ = false;
}
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
- delete stack_;
}
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
template<typename T> void Regexp::Walker<T>::Reset() {
- if (stack_ && stack_->size() > 0) {
+ if (!stack_.empty()) {
LOG(DFATAL) << "Stack not empty.";
- while (stack_->size() > 0) {
- delete[] stack_->top().child_args;
- stack_->pop();
+ while (!stack_.empty()) {
+ delete[] stack_.top().child_args;
+ stack_.pop();
}
}
}
@@ -165,12 +163,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
return top_arg;
}
- stack_->push(WalkState<T>(re, top_arg));
+ stack_.push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
T t;
- s = &stack_->top();
+ s = &stack_.top();
Regexp* re = s->re;
switch (s->n) {
case -1: {
@@ -201,7 +199,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
- stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
+ stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
@@ -214,12 +212,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
}
}
- // We've finished stack_->top().
+ // We've finished stack_.top().
// Update next guy down.
- stack_->pop();
- if (stack_->size() == 0)
+ stack_.pop();
+ if (stack_.empty())
return t;
- s = &stack_->top();
+ s = &stack_.top();
if (s->child_args != NULL)
s->child_args[s->n] = t;
else