From d040ac06f2b63571259ebe0d03b2592b686e4036 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Tue, 8 Feb 2022 14:56:09 +0900 Subject: Imported Upstream version 20210401 --- CMakeLists.txt | 7 +- Makefile | 2 +- README | 1 + re2/dfa.cc | 11 ++- re2/make_perl_groups.pl | 2 +- re2/perl_groups.cc | 68 ++++++++--------- re2/testing/regexp_benchmark.cc | 158 +++++++++++++++++----------------------- 7 files changed, 116 insertions(+), 133 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed5ef13..a1ef60d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,10 @@ option(USEPCRE "use PCRE in tests and benchmarks" OFF) # so we provide an option similar to BUILD_TESTING, but just for RE2. option(RE2_BUILD_TESTING "enable testing for RE2" ON) +# ABI version +# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html +set(SONAME 9) + set(EXTRA_TARGET_LINK_LIBRARIES) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") @@ -82,7 +86,8 @@ set(RE2_SOURCES ) add_library(re2 ${RE2_SOURCES}) -target_include_directories (re2 PUBLIC $) +target_include_directories(re2 PUBLIC $) +set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0) add_library(re2::re2 ALIAS re2) if(RE2_BUILD_TESTING) diff --git a/Makefile b/Makefile index 44cedee..ba5e4f6 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ ifeq ($(shell uname),Darwin) SOEXT=dylib SOEXTVER=$(SONAME).$(SOEXT) SOEXTVER00=$(SONAME).0.0.$(SOEXT) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) else ifeq ($(shell uname),SunOS) SOEXT=so SOEXTVER=$(SOEXT).$(SONAME) diff --git a/README b/README index 1187cbe..0e1142c 100644 --- a/README +++ b/README @@ -38,3 +38,4 @@ An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocam A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org). An R wrapper is at https://github.com/qinwf/re2r/ and on CRAN (cran.r-project.org). A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org). +A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com). diff --git a/re2/dfa.cc b/re2/dfa.cc index 3f6571d..f292ff1 100644 --- a/re2/dfa.cc +++ b/re2/dfa.cc @@ -167,6 +167,9 @@ class DFA { typedef std::unordered_set StateSet; private: + // Make it easier to swap in a scalable reader-writer mutex. + using CacheMutex = Mutex; + enum { // Indices into start_ for unanchored searches. // Add kStartAnchored for anchored searches. @@ -331,7 +334,7 @@ class DFA { // while holding cache_mutex_ for writing, to avoid interrupting other // readers. Any State* pointers are only valid while cache_mutex_ // is held. - Mutex cache_mutex_; + CacheMutex cache_mutex_; int64_t mem_budget_; // Total memory budget for all States. int64_t state_budget_; // Amount of memory remaining for new States. StateSet state_cache_; // All States computed so far. @@ -1106,7 +1109,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { class DFA::RWLocker { public: - explicit RWLocker(Mutex* mu); + explicit RWLocker(CacheMutex* mu); ~RWLocker(); // If the lock is only held for reading right now, @@ -1116,14 +1119,14 @@ class DFA::RWLocker { void LockForWriting(); private: - Mutex* mu_; + CacheMutex* mu_; bool writing_; RWLocker(const RWLocker&) = delete; RWLocker& operator=(const RWLocker&) = delete; }; -DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) { +DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { mu_->ReaderLock(); } diff --git a/re2/make_perl_groups.pl b/re2/make_perl_groups.pl index d9fcdaf..ed0d509 100755 --- a/re2/make_perl_groups.pl +++ b/re2/make_perl_groups.pl @@ -76,7 +76,7 @@ sub PrintClass($$@) { } else { $negname =~ y/a-z/A-Z/; } - return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }"; + return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }"; } my $cnum = 0; diff --git a/re2/perl_groups.cc b/re2/perl_groups.cc index 422b388..4687444 100644 --- a/re2/perl_groups.cc +++ b/re2/perl_groups.cc @@ -20,12 +20,12 @@ static const URange16 code3[] = { /* \w */ { 0x61, 0x7a }, }; const UGroup perl_groups[] = { - { "\\d", +1, code1, 1 }, - { "\\D", -1, code1, 1 }, - { "\\s", +1, code2, 3 }, - { "\\S", -1, code2, 3 }, - { "\\w", +1, code3, 4 }, - { "\\W", -1, code3, 4 }, + { "\\d", +1, code1, 1, 0, 0 }, + { "\\D", -1, code1, 1, 0, 0 }, + { "\\s", +1, code2, 3, 0, 0 }, + { "\\S", -1, code2, 3, 0, 0 }, + { "\\w", +1, code3, 4, 0, 0 }, + { "\\W", -1, code3, 4, 0, 0 }, }; const int num_perl_groups = 6; static const URange16 code4[] = { /* [:alnum:] */ @@ -85,34 +85,34 @@ static const URange16 code17[] = { /* [:xdigit:] */ { 0x61, 0x66 }, }; const UGroup posix_groups[] = { - { "[:alnum:]", +1, code4, 3 }, - { "[:^alnum:]", -1, code4, 3 }, - { "[:alpha:]", +1, code5, 2 }, - { "[:^alpha:]", -1, code5, 2 }, - { "[:ascii:]", +1, code6, 1 }, - { "[:^ascii:]", -1, code6, 1 }, - { "[:blank:]", +1, code7, 2 }, - { "[:^blank:]", -1, code7, 2 }, - { "[:cntrl:]", +1, code8, 2 }, - { "[:^cntrl:]", -1, code8, 2 }, - { "[:digit:]", +1, code9, 1 }, - { "[:^digit:]", -1, code9, 1 }, - { "[:graph:]", +1, code10, 1 }, - { "[:^graph:]", -1, code10, 1 }, - { "[:lower:]", +1, code11, 1 }, - { "[:^lower:]", -1, code11, 1 }, - { "[:print:]", +1, code12, 1 }, - { "[:^print:]", -1, code12, 1 }, - { "[:punct:]", +1, code13, 4 }, - { "[:^punct:]", -1, code13, 4 }, - { "[:space:]", +1, code14, 2 }, - { "[:^space:]", -1, code14, 2 }, - { "[:upper:]", +1, code15, 1 }, - { "[:^upper:]", -1, code15, 1 }, - { "[:word:]", +1, code16, 4 }, - { "[:^word:]", -1, code16, 4 }, - { "[:xdigit:]", +1, code17, 3 }, - { "[:^xdigit:]", -1, code17, 3 }, + { "[:alnum:]", +1, code4, 3, 0, 0 }, + { "[:^alnum:]", -1, code4, 3, 0, 0 }, + { "[:alpha:]", +1, code5, 2, 0, 0 }, + { "[:^alpha:]", -1, code5, 2, 0, 0 }, + { "[:ascii:]", +1, code6, 1, 0, 0 }, + { "[:^ascii:]", -1, code6, 1, 0, 0 }, + { "[:blank:]", +1, code7, 2, 0, 0 }, + { "[:^blank:]", -1, code7, 2, 0, 0 }, + { "[:cntrl:]", +1, code8, 2, 0, 0 }, + { "[:^cntrl:]", -1, code8, 2, 0, 0 }, + { "[:digit:]", +1, code9, 1, 0, 0 }, + { "[:^digit:]", -1, code9, 1, 0, 0 }, + { "[:graph:]", +1, code10, 1, 0, 0 }, + { "[:^graph:]", -1, code10, 1, 0, 0 }, + { "[:lower:]", +1, code11, 1, 0, 0 }, + { "[:^lower:]", -1, code11, 1, 0, 0 }, + { "[:print:]", +1, code12, 1, 0, 0 }, + { "[:^print:]", -1, code12, 1, 0, 0 }, + { "[:punct:]", +1, code13, 4, 0, 0 }, + { "[:^punct:]", -1, code13, 4, 0, 0 }, + { "[:space:]", +1, code14, 2, 0, 0 }, + { "[:^space:]", -1, code14, 2, 0, 0 }, + { "[:upper:]", +1, code15, 1, 0, 0 }, + { "[:^upper:]", -1, code15, 1, 0, 0 }, + { "[:word:]", +1, code16, 4, 0, 0 }, + { "[:^word:]", -1, code16, 4, 0, 0 }, + { "[:xdigit:]", +1, code17, 3, 0, 0 }, + { "[:^xdigit:]", -1, code17, 3, 0, 0 }, }; const int num_posix_groups = 28; diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index 787505b..acf6e88 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "util/benchmark.h" @@ -20,6 +21,7 @@ #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "util/mutex.h" #include "util/pcre.h" namespace re2 { @@ -939,13 +941,52 @@ void SearchRE2(benchmark::State& state, const char* regexp, // regexp parsing and compiling once. This lets us measure // search time without the per-regexp overhead. +Prog* GetCachedProg(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + Prog* prog = cache[regexp]; + if (prog == NULL) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + prog = re->CompileToProg(int64_t{1}<<31); // mostly for the DFA + CHECK(prog); + cache[regexp] = prog; + re->Decref(); + } + return prog; +} + +PCRE* GetCachedPCRE(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + PCRE* re = cache[regexp]; + if (re == NULL) { + re = new PCRE(regexp, PCRE::UTF8); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + +RE2* GetCachedRE2(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + RE2* re = cache[regexp]; + if (re == NULL) { + re = new RE2(regexp); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + void SearchCachedDFA(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(int64_t{1}<<31); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); for (auto _ : state) { bool failed = false; CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, @@ -953,63 +994,45 @@ void SearchCachedDFA(benchmark::State& state, const char* regexp, expect_match); CHECK(!failed); } - delete prog; - re->Decref(); } void SearchCachedNFA(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); for (auto _ : state) { CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } void SearchCachedOnePass(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); for (auto _ : state) { CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } void SearchCachedBitState(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); for (auto _ : state) { CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } void SearchCachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { if (anchor == Prog::kAnchored) CHECK_EQ(PCRE::FullMatch(text, re), expect_match); @@ -1021,8 +1044,7 @@ void SearchCachedPCRE(benchmark::State& state, const char* regexp, void SearchCachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text, Prog::Anchor anchor, bool expect_match) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { if (anchor == Prog::kAnchored) CHECK_EQ(RE2::FullMatch(text, re), expect_match); @@ -1115,67 +1137,46 @@ void Parse3RE2(benchmark::State& state, const char* regexp, void Parse3CachedNFA(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedOnePass(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); StringPiece sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedBitState(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); StringPiece sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedBacktrack(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } void Parse3CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); StringPiece sp1, sp2, sp3; for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); @@ -1184,8 +1185,7 @@ void Parse3CachedPCRE(benchmark::State& state, const char* regexp, void Parse3CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); StringPiece sp1, sp2, sp3; for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); @@ -1262,67 +1262,46 @@ void Parse1RE2(benchmark::State& state, const char* regexp, void Parse1CachedNFA(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedOnePass(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedBitState(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedBacktrack(benchmark::State& state, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); + Prog* prog = GetCachedProg(regexp); StringPiece sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } void Parse1CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); StringPiece sp1; for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1)); @@ -1331,8 +1310,7 @@ void Parse1CachedPCRE(benchmark::State& state, const char* regexp, void Parse1CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); StringPiece sp1; for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1)); @@ -1341,8 +1319,7 @@ void Parse1CachedRE2(benchmark::State& state, const char* regexp, void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { StringPiece sp1, sp2; CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); @@ -1351,8 +1328,7 @@ void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { StringPiece sp1, sp2; CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); @@ -1361,8 +1337,7 @@ void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); + PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { StringPiece sp1; CHECK(PCRE::PartialMatch(text, re, &sp1)); @@ -1371,8 +1346,7 @@ void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, void SearchParse1CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); + RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { StringPiece sp1; CHECK(RE2::PartialMatch(text, re, &sp1)); -- cgit v1.2.3