diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2018-02-20 15:36:30 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2018-02-20 15:36:30 +0900 |
commit | fb1f61b46bac5b1a435d7edd9f97d9e804d30251 (patch) | |
tree | c4b16b30e5c009275706ad90355578008c2bcd31 | |
parent | 47030f9abe0ed076af6465569939740f6a36b1d4 (diff) | |
download | re2-fb1f61b46bac5b1a435d7edd9f97d9e804d30251.tar.gz re2-fb1f61b46bac5b1a435d7edd9f97d9e804d30251.tar.bz2 re2-fb1f61b46bac5b1a435d7edd9f97d9e804d30251.zip |
Imported Upstream version 20171101upstream/20171101
Change-Id: Ia1df4d395f2335bd59ba868feb94d3ba9fdc5999
Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r-- | .travis.yml | 9 | ||||
-rw-r--r-- | BUILD | 72 | ||||
-rw-r--r-- | CMakeLists.txt | 4 | ||||
-rw-r--r-- | doc/syntax.txt | 10 | ||||
-rw-r--r-- | re2/compile.cc | 67 | ||||
-rw-r--r-- | re2/dfa.cc | 171 | ||||
-rw-r--r-- | re2/fuzzing/re2_fuzzer.cc | 15 | ||||
-rw-r--r-- | re2/parse.cc | 3 | ||||
-rw-r--r-- | re2/prog.h | 7 | ||||
-rw-r--r-- | re2/re2.cc | 132 | ||||
-rw-r--r-- | re2/re2.h | 4 | ||||
-rw-r--r-- | re2/set.cc | 69 | ||||
-rw-r--r-- | re2/set.h | 6 | ||||
-rw-r--r-- | re2/testing/parse_test.cc | 5 | ||||
-rw-r--r-- | re2/testing/regexp_benchmark.cc | 30 | ||||
-rw-r--r-- | re2/testing/set_test.cc | 27 | ||||
-rw-r--r-- | re2/unicode.py | 2 | ||||
-rw-r--r-- | re2/unicode_casefold.cc | 54 | ||||
-rw-r--r-- | re2/unicode_groups.cc | 484 | ||||
-rw-r--r-- | util/sparse_array.h | 51 | ||||
-rw-r--r-- | util/sparse_set.h | 27 | ||||
-rw-r--r-- | util/strutil.cc | 22 | ||||
-rw-r--r-- | util/strutil.h | 2 |
23 files changed, 883 insertions, 390 deletions
diff --git a/.travis.yml b/.travis.yml index 8c9acca..b5a2a94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -112,6 +112,15 @@ matrix: - clang-4.0 env: - MATRIX_EVAL="CC=clang-4.0 CXX=clang++-4.0" + - os: linux + addons: + apt: + sources: + - llvm-toolchain-trusty-5.0 + packages: + - clang-5.0 + env: + - MATRIX_EVAL="CC=clang-5.0 CXX=clang++-5.0" before_install: - eval "${MATRIX_EVAL}" @@ -94,35 +94,77 @@ cc_library( deps = [":testing"], ) -load("re2_test", "re2_test") +load(":re2_test.bzl", "re2_test") -re2_test("charclass_test") +re2_test( + "charclass_test", + size = "small", +) -re2_test("compile_test") +re2_test( + "compile_test", + size = "small", +) -re2_test("filtered_re2_test") +re2_test( + "filtered_re2_test", + size = "small", +) -re2_test("mimics_pcre_test") +re2_test( + "mimics_pcre_test", + size = "small", +) -re2_test("parse_test") +re2_test( + "parse_test", + size = "small", +) -re2_test("possible_match_test") +re2_test( + "possible_match_test", + size = "small", +) -re2_test("re2_arg_test") +re2_test( + "re2_arg_test", + size = "small", +) -re2_test("re2_test") +re2_test( + "re2_test", + size = "small", +) -re2_test("regexp_test") +re2_test( + "regexp_test", + size = "small", +) -re2_test("required_prefix_test") +re2_test( + "required_prefix_test", + size = "small", +) -re2_test("search_test") +re2_test( + "search_test", + size = "small", +) -re2_test("set_test") +re2_test( + "set_test", + size = "small", +) -re2_test("simplify_test") +re2_test( + "simplify_test", + size = "small", +) -re2_test("string_generator_test") +re2_test( + "string_generator_test", + size = "small", +) re2_test( "dfa_test", diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e22472..884873b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,10 @@ # Old enough to support Ubuntu Trusty. cmake_minimum_required(VERSION 2.8.12) +if(POLICY CMP0048) + cmake_policy(SET CMP0048 NEW) +endif() + project(RE2 CXX) include(CTest) diff --git a/doc/syntax.txt b/doc/syntax.txt index 09b7e88..92e2f2a 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -230,6 +230,7 @@ Zp paragraph separator Zs space separator Unicode character class names--scripts: +Adlam Ahom Anatolian_Hieroglyphs Arabic @@ -240,6 +241,7 @@ Bamum Bassa_Vah Batak Bengali +Bhaiksuki Bopomofo Brahmi Braille @@ -301,6 +303,8 @@ Mahajani Malayalam Mandaic Manichaean +Marchen +Masaram_Gondi Meetei_Mayek Mende_Kikakui Meroitic_Cursive @@ -313,7 +317,9 @@ Multani Myanmar Nabataean New_Tai_Lue +Newa Nko +Nushu Ogham Ol_Chiki Old_Hungarian @@ -324,6 +330,7 @@ Old_Persian Old_South_Arabian Old_Turkic Oriya +Osage Osmanya Pahawh_Hmong Palmyrene @@ -341,6 +348,7 @@ Siddham SignWriting Sinhala Sora_Sompeng +Soyombo Sundanese Syloti_Nagri Syriac @@ -351,6 +359,7 @@ Tai_Tham Tai_Viet Takri Tamil +Tangut Telugu Thaana Thai @@ -361,6 +370,7 @@ Ugaritic Vai Warang_Citi Yi +Zanabazar_Square Vim character classes: \i identifier character NOT SUPPORTED vim diff --git a/re2/compile.cc b/re2/compile.cc index 9a19bde..454c872 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -136,8 +136,7 @@ class Compiler : public Regexp::Walker<Frag> { // Compiles alternation of all the re to a new Prog. // Each re has a match with an id equal to its index in the vector. - static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re); + static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); // Interface for Regexp::Walker, which helps traverse the Regexp. // The walk is purely post-recursive: given the machines for the @@ -286,7 +285,8 @@ int Compiler::AllocInst(int n) { while (inst_len_ + n > inst_cap_) inst_cap_ *= 2; Prog::Inst* ip = new Prog::Inst[inst_cap_]; - memmove(ip, inst_, inst_len_ * sizeof ip[0]); + if (inst_ != NULL) + memmove(ip, inst_, inst_len_ * sizeof ip[0]); memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); delete[] inst_; inst_ = ip; @@ -877,9 +877,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, case kRegexpHaveMatch: { Frag f = Match(re->match_id()); - // Remember unanchored match to end of string. - if (anchor_ != RE2::ANCHOR_BOTH) - f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); + if (anchor_ == RE2::ANCHOR_BOTH) { + // Append \z or else the subexpression will effectively be unanchored. + // Complemented by the UNANCHORED case in CompileSet(). + f = Cat(EmptyWidth(kEmptyEndText), f); + } return f; } @@ -1138,8 +1140,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, // The reversed flag is also recorded in the returned program. Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { Compiler c; - - c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); + c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); c.reversed_ = reversed; // Simplify to remove things like counted repetitions @@ -1154,7 +1155,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { bool is_anchor_end = IsAnchorEnd(&sre, 0); // Generate fragment for entire regexp. - Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_); + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_); sre->Decref(); if (c.failed_) return NULL; @@ -1163,10 +1164,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { // Turn off c.reversed_ (if it is set) to force the remaining concatenations // to behave normally. c.reversed_ = false; - Frag all = c.Cat(f, c.Match(0)); - c.prog_->set_start(all.begin); + all = c.Cat(all, c.Match(0)); - if (reversed) { + c.prog_->set_reversed(reversed); + if (c.prog_->reversed()) { c.prog_->set_anchor_start(is_anchor_end); c.prog_->set_anchor_end(is_anchor_start); } else { @@ -1174,15 +1175,12 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { c.prog_->set_anchor_end(is_anchor_end); } - // Also create unanchored version, which starts with a .*? loop. - if (c.prog_->anchor_start()) { - c.prog_->set_start_unanchored(c.prog_->start()); - } else { - Frag unanchored = c.Cat(c.DotStar(), all); - c.prog_->set_start_unanchored(unanchored.begin); + c.prog_->set_start(all.begin); + if (!c.prog_->anchor_start()) { + // Also create unanchored version, which starts with a .*? loop. + all = c.Cat(c.DotStar(), all); } - - c.prog_->set_reversed(reversed); + c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. return c.Finish(); @@ -1235,29 +1233,29 @@ Frag Compiler::DotStar() { } // Compiles RE set to Prog. -Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re) { +Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { Compiler c; + c.Setup(re->parse_flags(), max_mem, anchor); - Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); - c.Setup(pf, options.max_mem(), anchor); + Regexp* sre = re->Simplify(); + if (sre == NULL) + return NULL; - // Compile alternation of fragments. - Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_); - re->Decref(); + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_); + sre->Decref(); if (c.failed_) return NULL; + c.prog_->set_anchor_start(true); + c.prog_->set_anchor_end(true); + if (anchor == RE2::UNANCHORED) { - // The trailing .* was added while handling kRegexpHaveMatch. - // We just have to add the leading one. + // Prepend .* or else the expression will effectively be anchored. + // Complemented by the ANCHOR_BOTH case in PostVisit(). all = c.Cat(c.DotStar(), all); } - c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); - c.prog_->set_anchor_start(true); - c.prog_->set_anchor_end(true); Prog* prog = c.Finish(); if (prog == NULL) @@ -1277,9 +1275,8 @@ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, return prog; } -Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re) { - return Compiler::CompileSet(options, anchor, re); +Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { + return Compiler::CompileSet(re, anchor, max_mem); } } // namespace re2 @@ -28,7 +28,6 @@ #include <algorithm> #include <atomic> #include <deque> -#include <map> #include <mutex> #include <new> #include <string> @@ -96,7 +95,7 @@ class DFA { // memory), it sets *failed and returns false. bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool want_earliest_match, bool run_forward, - bool* failed, const char** ep, std::vector<int>* matches); + bool* failed, const char** ep, SparseSet* matches); // Builds out all states for the entire DFA. // If cb is not empty, it receives one callback per state built. @@ -141,9 +140,9 @@ class DFA { enum { kByteEndText = 256, // imaginary byte at end of text - kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags - kFlagMatch = 0x1000, // State.flag_: this is a matching state - kFlagLastWord = 0x2000, // State.flag_: last byte was a word char + kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags + kFlagMatch = 0x0100, // State.flag_: this is a matching state + kFlagLastWord = 0x0200, // State.flag_: last byte was a word char kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left }; @@ -207,7 +206,7 @@ class DFA { // Looks up and returns the State corresponding to a Workq. // L >= mutex_ - State* WorkqToCachedState(Workq* q, uint32_t flag); + State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag); // Looks up and returns a State matching the inst, ninst, and flag. // L >= mutex_ @@ -230,8 +229,7 @@ class DFA { // sets *ismatch to true. // L >= mutex_ void RunWorkqOnByte(Workq* q, Workq* nq, - int c, uint32_t flag, bool* ismatch, - Prog::MatchKind kind); + int c, uint32_t flag, bool* ismatch); // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. // L >= mutex_ @@ -273,7 +271,7 @@ class DFA { RWLocker *cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match - std::vector<int>* matches; + SparseSet* matches; private: SearchParams(const SearchParams&) = delete; @@ -377,6 +375,10 @@ static inline const uint8_t* BytePtr(const void* v) { // in the work queue when in leftmost-longest matching mode. #define Mark (-1) +// Separates the match IDs from the instructions in inst_. +// Used only for "many match" DFA states. +#define MatchSep (-2) + // Internally, the DFA uses a sparse array of // program instruction pointers as a work queue. // In leftmost longest mode, marks separate sections @@ -509,7 +511,7 @@ DFA::~DFA() { string DFA::DumpWorkq(Workq* q) { string s; const char* sep = ""; - for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) { + for (Workq::iterator it = q->begin(); it != q->end(); ++it) { if (q->is_mark(*it)) { StringAppendF(&s, "|"); sep = ""; @@ -536,6 +538,9 @@ string DFA::DumpState(State* state) { if (state->inst_[i] == Mark) { StringAppendF(&s, "|"); sep = ""; + } else if (state->inst_[i] == MatchSep) { + StringAppendF(&s, "||"); + sep = ""; } else { StringAppendF(&s, "%s%d", sep, state->inst_[i]); sep = ","; @@ -603,7 +608,9 @@ string DFA::DumpState(State* state) { // Looks in the State cache for a State matching q, flag. // If one is found, returns it. If one is not found, allocates one, // inserts it in the cache, and returns it. -DFA::State* DFA::WorkqToCachedState(Workq* q, uint32_t flag) { +// If mq is not null, MatchSep and the match IDs in mq will be appended +// to the State. +DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { //mutex_.AssertHeld(); // Construct array of instruction ids for the new state. @@ -710,6 +717,17 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint32_t flag) { } } + // Append MatchSep and the match IDs in mq if necessary. + if (mq != NULL) { + inst[n++] = MatchSep; + for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) { + int id = *i; + Prog::Inst* ip = prog_->inst(id); + if (ip->opcode() == kInstMatch) + inst[n++] = ip->match_id(); + } + } + // Save the needed empty-width flags in the top bits for use later. flag |= needflags << kFlagNeedShift; @@ -789,11 +807,15 @@ void DFA::ClearCache() { void DFA::StateToWorkq(State* s, Workq* q) { q->clear(); for (int i = 0; i < s->ninst_; i++) { - if (s->inst_[i] == Mark) + if (s->inst_[i] == Mark) { q->mark(); - else + } else if (s->inst_[i] == MatchSep) { + // Nothing after this is an instruction! + break; + } else { // Explore from the head of the list. AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask); + } } } @@ -913,8 +935,7 @@ void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) { // means to match c$. Sets the bool *ismatch to true if the end of the // regular expression program has been reached (the regexp has matched). void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, - int c, uint32_t flag, bool* ismatch, - Prog::MatchKind kind) { + int c, uint32_t flag, bool* ismatch) { //mutex_.AssertHeld(); newq->clear(); @@ -945,10 +966,11 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, break; case kInstMatch: - if (prog_->anchor_end() && c != kByteEndText) + if (prog_->anchor_end() && c != kByteEndText && + kind_ != Prog::kManyMatch) break; *ismatch = true; - if (kind == Prog::kFirstMatch) { + if (kind_ == Prog::kFirstMatch) { // Can stop processing work queue since we found a match. return; } @@ -1040,20 +1062,9 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { swap(q0_, q1_); } bool ismatch = false; - RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_); - - // Most of the time, we build the state from the output of - // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that - // RE2::Set can tell exactly which match instructions - // contributed to the match, don't swap if c is kByteEndText. - // The resulting state wouldn't be correct for further processing - // of the string, but we're at the end of the text so that's okay. - // Leaving q0_ alone preseves the match instructions that led to - // the current setting of ismatch. - if (c != kByteEndText || kind_ != Prog::kManyMatch) { - using std::swap; - swap(q0_, q1_); - } + RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch); + using std::swap; + swap(q0_, q1_); // Save afterflag along with ismatch and isword in new state. uint32_t flag = afterflag; @@ -1062,7 +1073,10 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { if (isword) flag |= kFlagLastWord; - ns = WorkqToCachedState(q0_, flag); + if (ismatch && kind_ == Prog::kManyMatch) + ns = WorkqToCachedState(q0_, q1_, flag); + else + ns = WorkqToCachedState(q0_, NULL, flag); // Flush ns before linking to it. // Write barrier before updating state->next_ so that the @@ -1316,11 +1330,24 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, const uint8_t* bytemap = prog_->bytemap(); const uint8_t* lastmatch = NULL; // most recent matching position in text bool matched = false; + State* s = start; + if (ExtraDebug) + fprintf(stderr, "@stx: %s\n", DumpState(s).c_str()); if (s->IsMatch()) { matched = true; lastmatch = p; + if (ExtraDebug) + fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); + } + } if (want_earliest_match) { params->ep = reinterpret_cast<const char*>(lastmatch); return true; @@ -1331,6 +1358,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (ExtraDebug) fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); + if (have_firstbyte && s == start) { // In start state, only way out is to find firstbyte, // so use optimized assembly in memchr to skip ahead. @@ -1424,8 +1452,8 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, params->ep = reinterpret_cast<const char*>(ep); return true; } - s = ns; + s = ns; if (s->IsMatch()) { matched = true; // The DFA notices the match one byte late, @@ -1437,7 +1465,14 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (ExtraDebug) fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str()); - + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); + } + } if (want_earliest_match) { params->ep = reinterpret_cast<const char*>(lastmatch); return true; @@ -1447,6 +1482,9 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // Process one more byte to see if it triggers a match. // (Remember, matches are delayed one byte.) + if (ExtraDebug) + fprintf(stderr, "@etx: %s\n", DumpState(s).c_str()); + int lastbyte; if (run_forward) { if (params->text.end() == params->context.end()) @@ -1478,34 +1516,32 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, } } } - s = ns; - if (ExtraDebug) - fprintf(stderr, "@_: %s\n", DumpState(s).c_str()); - if (s == FullMatchState) { + if (ns <= SpecialStateMax) { + if (ns == DeadState) { + params->ep = reinterpret_cast<const char*>(lastmatch); + return matched; + } + // FullMatchState params->ep = reinterpret_cast<const char*>(ep); return true; } - if (s > SpecialStateMax && s->IsMatch()) { + + s = ns; + if (s->IsMatch()) { matched = true; lastmatch = p; - if (params->matches && kind_ == Prog::kManyMatch) { - std::vector<int>* v = params->matches; - v->clear(); - for (int i = 0; i < s->ninst_; i++) { - Prog::Inst* ip = prog_->inst(s->inst_[i]); - for (;;) { - if (ip->opcode() == kInstMatch) - v->push_back(ip->match_id()); - if (ip->last()) - break; - ip++; - } + if (ExtraDebug) + fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); } } - if (ExtraDebug) - fprintf(stderr, "match @%td! [%s]\n", - lastmatch - bp, DumpState(s).c_str()); } + params->ep = reinterpret_cast<const char*>(lastmatch); return matched; } @@ -1681,7 +1717,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, AddToQueue(q0_, params->anchored ? prog_->start() : prog_->start_unanchored(), flags); - info->start = WorkqToCachedState(q0_, flags); + info->start = WorkqToCachedState(q0_, NULL, flags); if (info->start == NULL) return false; @@ -1732,7 +1768,7 @@ bool DFA::Search(const StringPiece& text, bool run_forward, bool* failed, const char** epp, - std::vector<int>* matches) { + SparseSet* matches) { *epp = NULL; if (!ok()) { *failed = true; @@ -1823,7 +1859,7 @@ void Prog::DeleteDFA(DFA* dfa) { // bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, Anchor anchor, MatchKind kind, StringPiece* match0, - bool* failed, std::vector<int>* matches) { + bool* failed, SparseSet* matches) { *failed = false; StringPiece context = const_context; @@ -1846,7 +1882,7 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; bool endmatch = false; if (kind == kManyMatch) { - endmatch = true; + // This is split out in order to avoid clobbering kind. } else if (kind == kFullMatch || anchor_end()) { endmatch = true; kind = kLongestMatch; @@ -1854,17 +1890,22 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // If the caller doesn't care where the match is (just whether one exists), // then we can stop at the very first match we find, the so-called - // "shortest match". - bool want_shortest_match = false; - if (match0 == NULL && !endmatch) { - want_shortest_match = true; + // "earliest match". + bool want_earliest_match = false; + if (kind == kManyMatch) { + // This is split out in order to avoid clobbering kind. + if (matches == NULL) { + want_earliest_match = true; + } + } else if (match0 == NULL && !endmatch) { + want_earliest_match = true; kind = kLongestMatch; } DFA* dfa = GetDFA(kind); const char* ep; bool matched = dfa->Search(text, context, anchored, - want_shortest_match, !reversed_, + want_earliest_match, !reversed_, failed, &ep, matches); if (*failed) return false; @@ -1982,7 +2023,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Also note that previously_visited_states[UnseenStatePtr] will, in the STL // tradition, implicitly insert a '0' value at first use. We take advantage // of that property below. - std::map<State*, int> previously_visited_states; + std::unordered_map<State*, int> previously_visited_states; // Pick out start state for anchored search at beginning of text. RWLocker l(&cache_mutex_); @@ -2087,7 +2128,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { } // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b - *max = PrefixSuccessor(*max); + PrefixSuccessor(max); // If there are no bytes left, we have no way to say "there is no maximum // string". We could make the interface more complicated and be able to diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index daef0be..e0aa96a 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -59,6 +59,20 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { if (size == 0 || size > 1024) return 0; + // Crudely limit the use of \p and \P. + // Otherwise, we will waste time on inputs that have long runs of Unicode + // character classes. The fuzzer has shown itself to be easily capable of + // generating such patterns that fall within the other limits, but result + // in timeouts nonetheless. The marginal cost is high - even more so when + // counted repetition is involved - whereas the marginal benefit is zero. + int backslash_p = 0; + for (size_t i = 0; i < size; i++) { + if (data[i] == '\\' && i+1 < size && (data[i+1] == 'p' || data[i+1] == 'P')) + backslash_p++; + } + if (backslash_p > 10) + return 0; + // The one-at-a-time hash by Bob Jenkins. uint32_t hash = 0; for (size_t i = 0; i < size; i++) { @@ -72,6 +86,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { RE2::Options options; options.set_log_errors(false); + options.set_max_mem(64 << 20); options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1 : RE2::Options::EncodingUTF8); options.set_posix_syntax(hash & 2); diff --git a/re2/parse.cc b/re2/parse.cc index 0729b75..2b46e3e 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -1224,9 +1224,8 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { // Lexing routines. -// Parses a decimal integer, storing it in *n. +// Parses a decimal integer, storing it in *np. // Sets *s to span the remainder of the string. -// Sets *out_re to the regexp for the class. static bool ParseInteger(StringPiece* s, int* np) { if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) return false; @@ -255,7 +255,7 @@ class Prog { // SearchDFA fills matches with the match IDs of the final matching state. bool SearchDFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, StringPiece* match0, - bool* failed, std::vector<int>* matches); + bool* failed, SparseSet* matches); // The callback issued after building each DFA state with BuildEntireDFA(). // If next is null, then the memory budget has been exhausted and building @@ -336,9 +336,8 @@ class Prog { void Fanout(SparseArray<int>* fanout); // Compiles a collection of regexps to Prog. Each regexp will have - // its own Match instruction recording the index in the vector. - static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re); + // its own Match instruction recording the index in the output vector. + static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); // Flattens the Prog from "tree" form to "list" form. This is an in-place // operation in the sense that the old instructions are lost. @@ -345,25 +345,6 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, } } -// Returns the maximum submatch needed for the rewrite to be done by Replace(). -// E.g. if rewrite == "foo \\2,\\1", returns 2. -int RE2::MaxSubmatch(const StringPiece& rewrite) { - int max = 0; - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - if (*s == '\\') { - s++; - int c = (s < end) ? *s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n > max) - max = n; - } - } - } - return max; -} - bool RE2::Replace(string *str, const RE2& re, const StringPiece& rewrite) { @@ -519,14 +500,14 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { n = maxlen; // Determine initial min max from prefix_ literal. - string pmin, pmax; - pmin = prefix_.substr(0, n); - pmax = prefix_.substr(0, n); + *min = prefix_.substr(0, n); + *max = prefix_.substr(0, n); if (prefix_foldcase_) { - // prefix is ASCII lowercase; change pmin to uppercase. + // prefix is ASCII lowercase; change *min to uppercase. for (int i = 0; i < n; i++) { - if ('a' <= pmin[i] && pmin[i] <= 'z') - pmin[i] += 'A' - 'a'; + char& c = (*min)[i]; + if ('a' <= c && c <= 'z') + c += 'A' - 'a'; } } @@ -534,13 +515,13 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { string dmin, dmax; maxlen -= n; if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { - pmin += dmin; - pmax += dmax; - } else if (!pmax.empty()) { + min->append(dmin); + max->append(dmax); + } else if (!max->empty()) { // prog_->PossibleMatchRange has failed us, // but we still have useful information from prefix_. - // Round up pmax to allow any possible suffix. - pmax = PrefixSuccessor(pmax); + // Round up *max to allow any possible suffix. + PrefixSuccessor(max); } else { // Nothing useful. *min = ""; @@ -548,8 +529,6 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { return false; } - *min = pmin; - *max = pmax; return true; } @@ -858,41 +837,6 @@ bool RE2::DoMatch(const StringPiece& text, return true; } -// Append the "rewrite" string, with backslash subsitutions from "vec", -// to string "out". -bool RE2::Rewrite(string *out, const StringPiece &rewrite, - const StringPiece *vec, int veclen) const { - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - if (*s != '\\') { - out->push_back(*s); - continue; - } - s++; - int c = (s < end) ? *s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); - } - return false; - } - StringPiece snip = vec[n]; - if (snip.size() > 0) - out->append(snip.data(), snip.size()); - } else if (c == '\\') { - out->push_back('\\'); - } else { - if (options_.log_errors()) - LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); - return false; - } - } - return true; -} - // Checks that the rewrite string is well-formed with respect to this // regular expression. bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { @@ -931,6 +875,60 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { return true; } +// Returns the maximum submatch needed for the rewrite to be done by Replace(). +// E.g. if rewrite == "foo \\2,\\1", returns 2. +int RE2::MaxSubmatch(const StringPiece& rewrite) { + int max = 0; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s == '\\') { + s++; + int c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n > max) + max = n; + } + } + } + return max; +} + +// Append the "rewrite" string, with backslash subsitutions from "vec", +// to string "out". +bool RE2::Rewrite(string* out, const StringPiece& rewrite, + const StringPiece* vec, int veclen) const { + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s != '\\') { + out->push_back(*s); + continue; + } + s++; + int c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (options_.log_errors()) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + } + return false; + } + StringPiece snip = vec[n]; + if (snip.size() > 0) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); + } else { + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } + return true; +} + /***** Parsers for various types *****/ bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) { @@ -523,8 +523,8 @@ class RE2 { // Returns true on success. This method can fail because of a malformed // rewrite string. CheckRewriteString guarantees that the rewrite will // be sucessful. - bool Rewrite(string *out, - const StringPiece &rewrite, + bool Rewrite(string* out, + const StringPiece& rewrite, const StringPiece* vec, int veclen) const; @@ -5,6 +5,8 @@ #include "re2/set.h" #include <stddef.h> +#include <algorithm> +#include <memory> #include "util/util.h" #include "util/logging.h" @@ -17,26 +19,27 @@ namespace re2 { RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) { options_.Copy(options); + options_.set_never_capture(true); // might unblock some optimisations anchor_ = anchor; prog_ = NULL; compiled_ = false; + size_ = 0; } RE2::Set::~Set() { - for (size_t i = 0; i < re_.size(); i++) - re_[i]->Decref(); + for (size_t i = 0; i < elem_.size(); i++) + elem_[i].second->Decref(); delete prog_; } int RE2::Set::Add(const StringPiece& pattern, string* error) { if (compiled_) { - LOG(DFATAL) << "RE2::Set::Add after Compile"; + LOG(DFATAL) << "RE2::Set::Add() called after compiling"; return -1; } Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( options_.ParseFlags()); - RegexpStatus status; re2::Regexp* re = Regexp::Parse(pattern, pf, &status); if (re == NULL) { @@ -48,7 +51,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) { } // Concatenate with match index and push on vector. - int n = static_cast<int>(re_.size()); + int n = static_cast<int>(elem_.size()); re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); if (re->op() == kRegexpConcat) { int nsub = re->nsub(); @@ -65,45 +68,54 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) { sub[1] = m; re = re2::Regexp::Concat(sub, 2, pf); } - re_.push_back(re); + elem_.emplace_back(pattern.ToString(), re); return n; } bool RE2::Set::Compile() { if (compiled_) { - LOG(DFATAL) << "RE2::Set::Compile multiple times"; + LOG(DFATAL) << "RE2::Set::Compile() called more than once"; return false; } compiled_ = true; + size_ = static_cast<int>(elem_.size()); + + // Sort the elements by their patterns. This is good enough for now + // until we have a Regexp comparison function. (Maybe someday...) + std::sort(elem_.begin(), elem_.end(), + [](const Elem& a, const Elem& b) -> bool { + return a.first < b.first; + }); + + re2::Regexp** sub = new re2::Regexp*[size_]; + for (size_t i = 0; i < elem_.size(); i++) + sub[i] = elem_[i].second; + elem_.clear(); + elem_.shrink_to_fit(); Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( options_.ParseFlags()); - re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(re_.data()), - static_cast<int>(re_.size()), pf); - re_.clear(); - re2::Regexp* sre = re->Simplify(); - re->Decref(); - re = sre; - if (re == NULL) { - if (options_.log_errors()) - LOG(ERROR) << "Error simplifying during Compile."; - return false; - } + re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf); + delete[] sub; - prog_ = Prog::CompileSet(options_, anchor_, re); + prog_ = Prog::CompileSet(re, anchor_, options_.max_mem()); + re->Decref(); return prog_ != NULL; } bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { if (!compiled_) { - LOG(DFATAL) << "RE2::Set::Match without Compile"; + LOG(DFATAL) << "RE2::Set::Match() called before compiling"; return false; } - if (v != NULL) - v->clear(); bool dfa_failed = false; - bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, - Prog::kManyMatch, NULL, &dfa_failed, v); + std::unique_ptr<SparseSet> matches; + if (v != NULL) { + matches.reset(new SparseSet(size_)); + v->clear(); + } + bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch, + NULL, &dfa_failed, matches.get()); if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " @@ -113,9 +125,12 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { } if (ret == false) return false; - if (v != NULL && v->empty()) { - LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set"; - return false; + if (v != NULL) { + if (matches->empty()) { + LOG(DFATAL) << "RE2::Set::Match() matched, but matches unknown"; + return false; + } + v->assign(matches->begin(), matches->end()); } return true; } @@ -6,6 +6,7 @@ #define RE2_SET_H_ #include <string> +#include <utility> #include <vector> #include "re2/re2.h" @@ -45,11 +46,14 @@ class RE2::Set { bool Match(const StringPiece& text, std::vector<int>* v) const; private: + typedef std::pair<string, re2::Regexp*> Elem; + RE2::Options options_; RE2::Anchor anchor_; - std::vector<re2::Regexp*> re_; + std::vector<Elem> elem_; re2::Prog* prog_; bool compiled_; + int size_; Set(const Set&) = delete; Set& operator=(const Set&) = delete; diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index ec80fc4..1e6929d 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -12,7 +12,10 @@ namespace re2 { -static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30); +// In the past, we used 1<<30 here and zeroed the bit later, but that +// has undefined behaviour, so now we use an internal-only flag because +// otherwise we would have to introduce a new flag value just for this. +static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar; struct Test { const char* regexp; diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index efd8450..8b82e0b 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -1544,4 +1544,34 @@ BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20); #endif BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20); +void PossibleMatchRangeCommon(int iter, const char* regexp) { + StopBenchmarkTiming(); + RE2 re(regexp); + StartBenchmarkTiming(); + string min; + string max; + const int kMaxLen = 16; + for (int i = 0; i < iter; i++) { + CHECK(re.PossibleMatchRange(&min, &max, kMaxLen)); + } +} + +void PossibleMatchRange_Trivial(int i) { + PossibleMatchRangeCommon(i, ".*"); +} +void PossibleMatchRange_Complex(int i) { + PossibleMatchRangeCommon(i, "^abc[def]?[gh]{1,2}.*"); +} +void PossibleMatchRange_Prefix(int i) { + PossibleMatchRangeCommon(i, "^some_random_prefix.*"); +} +void PossibleMatchRange_NoProg(int i) { + PossibleMatchRangeCommon(i, "^some_random_string$"); +} + +BENCHMARK(PossibleMatchRange_Trivial); +BENCHMARK(PossibleMatchRange_Complex); +BENCHMARK(PossibleMatchRange_Prefix); +BENCHMARK(PossibleMatchRange_NoProg); + } // namespace re2 diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc index b6a24de..54aa106 100644 --- a/re2/testing/set_test.cc +++ b/re2/testing/set_test.cc @@ -78,11 +78,38 @@ TEST(Set, UnanchoredDollar) { CHECK_EQ(s.Compile(), true); CHECK_EQ(s.Match("foo", NULL), true); + CHECK_EQ(s.Match("foobar", NULL), false); std::vector<int> v; CHECK_EQ(s.Match("foo", &v), true); CHECK_EQ(v.size(), 1); CHECK_EQ(v[0], 0); + + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredWordBoundary) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Add("foo\\b", NULL), 0); + CHECK_EQ(s.Compile(), true); + + CHECK_EQ(s.Match("foo", NULL), true); + CHECK_EQ(s.Match("foobar", NULL), false); + CHECK_EQ(s.Match("foo bar", NULL), true); + + std::vector<int> v; + CHECK_EQ(s.Match("foo", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("foo bar", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); } TEST(Set, Anchored) { diff --git a/re2/unicode.py b/re2/unicode.py index 4b2240c..2899c87 100644 --- a/re2/unicode.py +++ b/re2/unicode.py @@ -9,7 +9,7 @@ import re import urllib2 # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "http://www.unicode.org/Public/8.0.0/ucd" +_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc index 91a96b4..1686943 100644 --- a/re2/unicode_casefold.cc +++ b/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1224 groups, 2469 pairs, 314 ranges +// 1295 groups, 2620 pairs, 343 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -113,6 +113,7 @@ const CaseFold unicode_casefold[] = { { 614, 614, 42308 }, { 616, 616, -209 }, { 617, 617, -211 }, + { 618, 618, 42308 }, { 619, 619, 10743 }, { 620, 620, 42305 }, { 623, 623, -211 }, @@ -184,9 +185,21 @@ const CaseFold unicode_casefold[] = { { 1021, 1023, -130 }, { 1024, 1039, 80 }, { 1040, 1071, 32 }, - { 1072, 1103, -32 }, + { 1072, 1073, -32 }, + { 1074, 1074, 6222 }, + { 1075, 1075, -32 }, + { 1076, 1076, 6221 }, + { 1077, 1085, -32 }, + { 1086, 1086, 6212 }, + { 1087, 1088, -32 }, + { 1089, 1090, 6210 }, + { 1091, 1097, -32 }, + { 1098, 1098, 6204 }, + { 1099, 1103, -32 }, { 1104, 1119, -80 }, - { 1120, 1153, EvenOdd }, + { 1120, 1122, EvenOdd }, + { 1123, 1123, 6180 }, + { 1124, 1153, EvenOdd }, { 1162, 1215, EvenOdd }, { 1216, 1216, 15 }, { 1217, 1230, OddEven }, @@ -200,6 +213,15 @@ const CaseFold unicode_casefold[] = { { 5024, 5103, 38864 }, { 5104, 5109, 8 }, { 5112, 5117, -8 }, + { 7296, 7296, -6254 }, + { 7297, 7297, -6253 }, + { 7298, 7298, -6244 }, + { 7299, 7299, -6242 }, + { 7300, 7300, EvenOdd }, + { 7301, 7301, -6243 }, + { 7302, 7302, -6236 }, + { 7303, 7303, -6181 }, + { 7304, 7304, 35266 }, { 7545, 7545, 35332 }, { 7549, 7549, 3814 }, { 7680, 7776, EvenOdd }, @@ -293,7 +315,9 @@ const CaseFold unicode_casefold[] = { { 11520, 11557, -7264 }, { 11559, 11559, -7264 }, { 11565, 11565, -7264 }, - { 42560, 42605, EvenOdd }, + { 42560, 42570, EvenOdd }, + { 42571, 42571, -35267 }, + { 42572, 42605, EvenOdd }, { 42624, 42651, EvenOdd }, { 42786, 42799, EvenOdd }, { 42802, 42863, EvenOdd }, @@ -308,6 +332,7 @@ const CaseFold unicode_casefold[] = { { 42923, 42923, -42319 }, { 42924, 42924, -42315 }, { 42925, 42925, -42305 }, + { 42926, 42926, -42308 }, { 42928, 42928, -42258 }, { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, @@ -319,14 +344,18 @@ const CaseFold unicode_casefold[] = { { 65345, 65370, -32 }, { 66560, 66599, 40 }, { 66600, 66639, -40 }, + { 66736, 66771, 40 }, + { 66776, 66811, -40 }, { 68736, 68786, 64 }, { 68800, 68850, -64 }, { 71840, 71871, 32 }, { 71872, 71903, -32 }, + { 125184, 125217, 34 }, + { 125218, 125251, -34 }, }; -const int num_unicode_casefold = 314; +const int num_unicode_casefold = 343; -// 1224 groups, 1245 pairs, 180 ranges +// 1295 groups, 1325 pairs, 191 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -429,6 +458,14 @@ const CaseFold unicode_tolower[] = { { 4295, 4295, 7264 }, { 4301, 4301, 7264 }, { 5112, 5117, -8 }, + { 7296, 7296, -6222 }, + { 7297, 7297, -6221 }, + { 7298, 7298, -6212 }, + { 7299, 7300, -6210 }, + { 7301, 7301, -6211 }, + { 7302, 7302, -6204 }, + { 7303, 7303, -6180 }, + { 7304, 7304, 35267 }, { 7680, 7828, EvenOddSkip }, { 7835, 7835, -58 }, { 7838, 7838, -7615 }, @@ -498,6 +535,7 @@ const CaseFold unicode_tolower[] = { { 42923, 42923, -42319 }, { 42924, 42924, -42315 }, { 42925, 42925, -42305 }, + { 42926, 42926, -42308 }, { 42928, 42928, -42258 }, { 42929, 42929, -42282 }, { 42930, 42930, -42261 }, @@ -506,10 +544,12 @@ const CaseFold unicode_tolower[] = { { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, + { 66736, 66771, 40 }, { 68736, 68786, 64 }, { 71840, 71871, 32 }, + { 125184, 125217, 34 }, }; -const int num_unicode_tolower = 180; +const int num_unicode_tolower = 191; diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc index 59087bc..ba104d2 100644 --- a/re2/unicode_groups.cc +++ b/re2/unicode_groups.cc @@ -108,7 +108,8 @@ static const URange16 No_range16[] = { { 2930, 2935 }, { 3056, 3058 }, { 3192, 3198 }, - { 3440, 3445 }, + { 3416, 3422 }, + { 3440, 3448 }, { 3882, 3891 }, { 4969, 4988 }, { 6128, 6137 }, @@ -157,6 +158,7 @@ static const URange32 No_range32[] = { { 70113, 70132 }, { 71482, 71483 }, { 71914, 71922 }, + { 72794, 72812 }, { 93019, 93025 }, { 119648, 119665 }, { 125127, 125135 }, @@ -185,7 +187,9 @@ static const URange16 Lo_range16[] = { { 1994, 2026 }, { 2048, 2069 }, { 2112, 2136 }, + { 2144, 2154 }, { 2208, 2228 }, + { 2230, 2237 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -202,6 +206,7 @@ static const URange16 Lo_range16[] = { { 2524, 2525 }, { 2527, 2529 }, { 2544, 2545 }, + { 2556, 2556 }, { 2565, 2570 }, { 2575, 2576 }, { 2579, 2600 }, @@ -250,6 +255,7 @@ static const URange16 Lo_range16[] = { { 3133, 3133 }, { 3160, 3162 }, { 3168, 3169 }, + { 3200, 3200 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -264,6 +270,7 @@ static const URange16 Lo_range16[] = { { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, + { 3412, 3414 }, { 3423, 3425 }, { 3450, 3455 }, { 3461, 3478 }, @@ -336,7 +343,8 @@ static const URange16 Lo_range16[] = { { 6108, 6108 }, { 6176, 6210 }, { 6212, 6263 }, - { 6272, 6312 }, + { 6272, 6276 }, + { 6279, 6312 }, { 6314, 6314 }, { 6320, 6389 }, { 6400, 6430 }, @@ -374,12 +382,12 @@ static const URange16 Lo_range16[] = { { 12447, 12447 }, { 12449, 12538 }, { 12543, 12543 }, - { 12549, 12589 }, + { 12549, 12590 }, { 12593, 12686 }, { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40917 }, + { 19968, 40938 }, { 40960, 40980 }, { 40982, 42124 }, { 42192, 42231 }, @@ -465,7 +473,7 @@ static const URange32 Lo_range32[] = { { 66176, 66204 }, { 66208, 66256 }, { 66304, 66335 }, - { 66352, 66368 }, + { 66349, 66368 }, { 66370, 66377 }, { 66384, 66421 }, { 66432, 66461 }, @@ -531,6 +539,8 @@ static const URange32 Lo_range32[] = { { 70461, 70461 }, { 70480, 70480 }, { 70493, 70497 }, + { 70656, 70708 }, + { 70727, 70730 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -541,7 +551,21 @@ static const URange32 Lo_range32[] = { { 71296, 71338 }, { 71424, 71449 }, { 71935, 71935 }, + { 72192, 72192 }, + { 72203, 72242 }, + { 72250, 72250 }, + { 72272, 72272 }, + { 72284, 72323 }, + { 72326, 72329 }, { 72384, 72440 }, + { 72704, 72712 }, + { 72714, 72750 }, + { 72768, 72768 }, + { 72818, 72847 }, + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73008 }, + { 73030, 73030 }, { 73728, 74649 }, { 74880, 75075 }, { 77824, 78894 }, @@ -554,7 +578,10 @@ static const URange32 Lo_range32[] = { { 93053, 93071 }, { 93952, 94020 }, { 94032, 94032 }, - { 110592, 110593 }, + { 94208, 100332 }, + { 100352, 101106 }, + { 110592, 110878 }, + { 110960, 111355 }, { 113664, 113770 }, { 113776, 113788 }, { 113792, 113800 }, @@ -597,6 +624,7 @@ static const URange32 Lo_range32[] = { { 173824, 177972 }, { 177984, 178205 }, { 178208, 183969 }, + { 183984, 191456 }, { 194560, 195101 }, }; static const URange16 Ll_range16[] = { @@ -871,6 +899,7 @@ static const URange16 Ll_range16[] = { { 1327, 1327 }, { 1377, 1415 }, { 5112, 5117 }, + { 7296, 7304 }, { 7424, 7467 }, { 7531, 7543 }, { 7545, 7578 }, @@ -1202,6 +1231,7 @@ static const URange16 Ll_range16[] = { }; static const URange32 Ll_range32[] = { { 66600, 66639 }, + { 66776, 66811 }, { 68800, 68850 }, { 71872, 71903 }, { 119834, 119859 }, @@ -1232,6 +1262,7 @@ static const URange32 Ll_range32[] = { { 120746, 120770 }, { 120772, 120777 }, { 120779, 120779 }, + { 125218, 125251 }, }; static const URange16 Lm_range16[] = { { 688, 705 }, @@ -1292,6 +1323,7 @@ static const URange16 Lm_range16[] = { static const URange32 Lm_range32[] = { { 92992, 92995 }, { 94099, 94111 }, + { 94176, 94177 }, }; static const URange16 Nd_range16[] = { { 48, 57 }, @@ -1339,14 +1371,18 @@ static const URange32 Nd_range32[] = { { 69942, 69951 }, { 70096, 70105 }, { 70384, 70393 }, + { 70736, 70745 }, { 70864, 70873 }, { 71248, 71257 }, { 71360, 71369 }, { 71472, 71481 }, { 71904, 71913 }, + { 72784, 72793 }, + { 73040, 73049 }, { 92768, 92777 }, { 93008, 93017 }, { 120782, 120831 }, + { 125264, 125273 }, }; static const URange16 Pc_range16[] = { { 95, 95 }, @@ -1956,13 +1992,14 @@ static const URange16 Lu_range16[] = { { 42916, 42916 }, { 42918, 42918 }, { 42920, 42920 }, - { 42922, 42925 }, + { 42922, 42926 }, { 42928, 42932 }, { 42934, 42934 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { { 66560, 66599 }, + { 66736, 66771 }, { 68736, 68786 }, { 71840, 71871 }, { 119808, 119833 }, @@ -1996,6 +2033,7 @@ static const URange32 Lu_range32[] = { { 120662, 120686 }, { 120720, 120744 }, { 120778, 120778 }, + { 125184, 125217 }, }; static const URange16 Pf_range16[] = { { 187, 187 }, @@ -2148,6 +2186,7 @@ static const URange16 Po_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, + { 2557, 2557 }, { 2800, 2800 }, { 3572, 3572 }, { 3663, 3663 }, @@ -2199,6 +2238,7 @@ static const URange16 Po_range16[] = { { 11824, 11833 }, { 11836, 11839 }, { 11841, 11841 }, + { 11843, 11849 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -2264,10 +2304,19 @@ static const URange32 Po_range32[] = { { 70109, 70111 }, { 70200, 70205 }, { 70313, 70313 }, + { 70731, 70735 }, + { 70747, 70747 }, + { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, { 71233, 71235 }, + { 71264, 71276 }, { 71484, 71486 }, + { 72255, 72262 }, + { 72346, 72348 }, + { 72350, 72354 }, + { 72769, 72773 }, + { 72816, 72817 }, { 74864, 74868 }, { 92782, 92783 }, { 92917, 92917 }, @@ -2275,6 +2324,7 @@ static const URange32 Po_range32[] = { { 92996, 92996 }, { 113823, 113823 }, { 121479, 121483 }, + { 125278, 125279 }, }; static const URange16 Me_range16[] = { { 1160, 1161 }, @@ -2291,6 +2341,7 @@ static const URange16 C_range16[] = { { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, { 8234, 8238 }, @@ -2397,6 +2448,7 @@ static const URange16 Mc_range16[] = { { 7220, 7221 }, { 7393, 7393 }, { 7410, 7411 }, + { 7415, 7415 }, { 12334, 12335 }, { 43043, 43044 }, { 43047, 43047 }, @@ -2441,6 +2493,9 @@ static const URange32 Mc_range32[] = { { 70475, 70477 }, { 70487, 70487 }, { 70498, 70499 }, + { 70709, 70711 }, + { 70720, 70721 }, + { 70725, 70725 }, { 70832, 70834 }, { 70841, 70841 }, { 70843, 70846 }, @@ -2456,6 +2511,15 @@ static const URange32 Mc_range32[] = { { 71350, 71350 }, { 71456, 71457 }, { 71462, 71462 }, + { 72199, 72200 }, + { 72249, 72249 }, + { 72279, 72280 }, + { 72343, 72343 }, + { 72751, 72751 }, + { 72766, 72766 }, + { 72873, 72873 }, + { 72881, 72881 }, + { 72884, 72884 }, { 94033, 94078 }, { 119141, 119142 }, { 119149, 119154 }, @@ -2484,6 +2548,7 @@ static const URange16 Mn_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, + { 2260, 2273 }, { 2275, 2306 }, { 2362, 2362 }, { 2364, 2364 }, @@ -2510,6 +2575,7 @@ static const URange16 Mn_range16[] = { { 2759, 2760 }, { 2765, 2765 }, { 2786, 2787 }, + { 2810, 2815 }, { 2817, 2817 }, { 2876, 2876 }, { 2879, 2879 }, @@ -2532,7 +2598,8 @@ static const URange16 Mn_range16[] = { { 3270, 3270 }, { 3276, 3277 }, { 3298, 3299 }, - { 3329, 3329 }, + { 3328, 3329 }, + { 3387, 3388 }, { 3393, 3396 }, { 3405, 3405 }, { 3426, 3427 }, @@ -2578,6 +2645,7 @@ static const URange16 Mn_range16[] = { { 6089, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6277, 6278 }, { 6313, 6313 }, { 6432, 6434 }, { 6439, 6440 }, @@ -2615,8 +2683,8 @@ static const URange16 Mn_range16[] = { { 7405, 7405 }, { 7412, 7412 }, { 7416, 7417 }, - { 7616, 7669 }, - { 7676, 7679 }, + { 7616, 7673 }, + { 7675, 7679 }, { 8400, 8412 }, { 8417, 8417 }, { 8421, 8432 }, @@ -2633,7 +2701,7 @@ static const URange16 Mn_range16[] = { { 43014, 43014 }, { 43019, 43019 }, { 43045, 43046 }, - { 43204, 43204 }, + { 43204, 43205 }, { 43232, 43249 }, { 43302, 43309 }, { 43335, 43345 }, @@ -2687,6 +2755,7 @@ static const URange32 Mn_range32[] = { { 70191, 70193 }, { 70196, 70196 }, { 70198, 70199 }, + { 70206, 70206 }, { 70367, 70367 }, { 70371, 70378 }, { 70400, 70401 }, @@ -2694,6 +2763,9 @@ static const URange32 Mn_range32[] = { { 70464, 70464 }, { 70502, 70508 }, { 70512, 70516 }, + { 70712, 70719 }, + { 70722, 70724 }, + { 70726, 70726 }, { 70835, 70840 }, { 70842, 70842 }, { 70847, 70848 }, @@ -2712,6 +2784,27 @@ static const URange32 Mn_range32[] = { { 71453, 71455 }, { 71458, 71461 }, { 71463, 71467 }, + { 72193, 72198 }, + { 72201, 72202 }, + { 72243, 72248 }, + { 72251, 72254 }, + { 72263, 72263 }, + { 72273, 72278 }, + { 72281, 72283 }, + { 72330, 72342 }, + { 72344, 72345 }, + { 72752, 72758 }, + { 72760, 72765 }, + { 72767, 72767 }, + { 72850, 72871 }, + { 72874, 72880 }, + { 72882, 72883 }, + { 72885, 72886 }, + { 73009, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73029 }, + { 73031, 73031 }, { 92912, 92916 }, { 92976, 92982 }, { 94095, 94098 }, @@ -2727,7 +2820,13 @@ static const URange32 Mn_range32[] = { { 121476, 121476 }, { 121499, 121503 }, { 121505, 121519 }, + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, { 125136, 125142 }, + { 125252, 125258 }, { 917760, 917999 }, }; static const URange16 M_range16[] = { @@ -2754,6 +2853,7 @@ static const URange16 M_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, + { 2260, 2273 }, { 2275, 2307 }, { 2362, 2364 }, { 2366, 2383 }, @@ -2780,6 +2880,7 @@ static const URange16 M_range16[] = { { 2759, 2761 }, { 2763, 2765 }, { 2786, 2787 }, + { 2810, 2815 }, { 2817, 2819 }, { 2876, 2876 }, { 2878, 2884 }, @@ -2805,7 +2906,8 @@ static const URange16 M_range16[] = { { 3274, 3277 }, { 3285, 3286 }, { 3298, 3299 }, - { 3329, 3331 }, + { 3328, 3331 }, + { 3387, 3388 }, { 3390, 3396 }, { 3398, 3400 }, { 3402, 3405 }, @@ -2851,6 +2953,7 @@ static const URange16 M_range16[] = { { 6068, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6277, 6278 }, { 6313, 6313 }, { 6432, 6443 }, { 6448, 6459 }, @@ -2870,9 +2973,9 @@ static const URange16 M_range16[] = { { 7380, 7400 }, { 7405, 7405 }, { 7410, 7412 }, - { 7416, 7417 }, - { 7616, 7669 }, - { 7676, 7679 }, + { 7415, 7417 }, + { 7616, 7673 }, + { 7675, 7679 }, { 8400, 8432 }, { 11503, 11505 }, { 11647, 11647 }, @@ -2888,7 +2991,7 @@ static const URange16 M_range16[] = { { 43019, 43019 }, { 43043, 43047 }, { 43136, 43137 }, - { 43188, 43204 }, + { 43188, 43205 }, { 43232, 43249 }, { 43302, 43309 }, { 43335, 43347 }, @@ -2933,6 +3036,7 @@ static const URange32 M_range32[] = { { 70067, 70080 }, { 70090, 70092 }, { 70188, 70199 }, + { 70206, 70206 }, { 70367, 70378 }, { 70400, 70403 }, { 70460, 70460 }, @@ -2943,6 +3047,7 @@ static const URange32 M_range32[] = { { 70498, 70499 }, { 70502, 70508 }, { 70512, 70516 }, + { 70709, 70726 }, { 70832, 70851 }, { 71087, 71093 }, { 71096, 71104 }, @@ -2950,6 +3055,21 @@ static const URange32 M_range32[] = { { 71216, 71232 }, { 71339, 71351 }, { 71453, 71467 }, + { 72193, 72202 }, + { 72243, 72249 }, + { 72251, 72254 }, + { 72263, 72263 }, + { 72273, 72283 }, + { 72330, 72345 }, + { 72751, 72758 }, + { 72760, 72767 }, + { 72850, 72871 }, + { 72873, 72886 }, + { 73009, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73029 }, + { 73031, 73031 }, { 92912, 92916 }, { 92976, 92982 }, { 94033, 94078 }, @@ -2967,7 +3087,13 @@ static const URange32 M_range32[] = { { 121476, 121476 }, { 121499, 121503 }, { 121505, 121519 }, + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, { 125136, 125142 }, + { 125252, 125258 }, { 917760, 917999 }, }; static const URange16 L_range16[] = { @@ -3019,7 +3145,9 @@ static const URange16 L_range16[] = { { 2084, 2084 }, { 2088, 2088 }, { 2112, 2136 }, + { 2144, 2154 }, { 2208, 2228 }, + { 2230, 2237 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -3036,6 +3164,7 @@ static const URange16 L_range16[] = { { 2524, 2525 }, { 2527, 2529 }, { 2544, 2545 }, + { 2556, 2556 }, { 2565, 2570 }, { 2575, 2576 }, { 2579, 2600 }, @@ -3084,6 +3213,7 @@ static const URange16 L_range16[] = { { 3133, 3133 }, { 3160, 3162 }, { 3168, 3169 }, + { 3200, 3200 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -3098,6 +3228,7 @@ static const URange16 L_range16[] = { { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, + { 3412, 3414 }, { 3423, 3425 }, { 3450, 3455 }, { 3461, 3478 }, @@ -3176,7 +3307,8 @@ static const URange16 L_range16[] = { { 6103, 6103 }, { 6108, 6108 }, { 6176, 6263 }, - { 6272, 6312 }, + { 6272, 6276 }, + { 6279, 6312 }, { 6314, 6314 }, { 6320, 6389 }, { 6400, 6430 }, @@ -3195,6 +3327,7 @@ static const URange16 L_range16[] = { { 7168, 7203 }, { 7245, 7247 }, { 7258, 7293 }, + { 7296, 7304 }, { 7401, 7404 }, { 7406, 7409 }, { 7413, 7414 }, @@ -3262,12 +3395,12 @@ static const URange16 L_range16[] = { { 12445, 12447 }, { 12449, 12538 }, { 12540, 12543 }, - { 12549, 12589 }, + { 12549, 12590 }, { 12593, 12686 }, { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40917 }, + { 19968, 40938 }, { 40960, 42124 }, { 42192, 42237 }, { 42240, 42508 }, @@ -3278,7 +3411,7 @@ static const URange16 L_range16[] = { { 42656, 42725 }, { 42775, 42783 }, { 42786, 42888 }, - { 42891, 42925 }, + { 42891, 42926 }, { 42928, 42935 }, { 42999, 43009 }, { 43011, 43013 }, @@ -3359,13 +3492,15 @@ static const URange32 L_range32[] = { { 66176, 66204 }, { 66208, 66256 }, { 66304, 66335 }, - { 66352, 66368 }, + { 66349, 66368 }, { 66370, 66377 }, { 66384, 66421 }, { 66432, 66461 }, { 66464, 66499 }, { 66504, 66511 }, { 66560, 66717 }, + { 66736, 66771 }, + { 66776, 66811 }, { 66816, 66855 }, { 66864, 66915 }, { 67072, 67382 }, @@ -3427,6 +3562,8 @@ static const URange32 L_range32[] = { { 70461, 70461 }, { 70480, 70480 }, { 70493, 70497 }, + { 70656, 70708 }, + { 70727, 70730 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -3438,7 +3575,21 @@ static const URange32 L_range32[] = { { 71424, 71449 }, { 71840, 71903 }, { 71935, 71935 }, + { 72192, 72192 }, + { 72203, 72242 }, + { 72250, 72250 }, + { 72272, 72272 }, + { 72284, 72323 }, + { 72326, 72329 }, { 72384, 72440 }, + { 72704, 72712 }, + { 72714, 72750 }, + { 72768, 72768 }, + { 72818, 72847 }, + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73008 }, + { 73030, 73030 }, { 73728, 74649 }, { 74880, 75075 }, { 77824, 78894 }, @@ -3453,7 +3604,11 @@ static const URange32 L_range32[] = { { 93952, 94020 }, { 94032, 94032 }, { 94099, 94111 }, - { 110592, 110593 }, + { 94176, 94177 }, + { 94208, 100332 }, + { 100352, 101106 }, + { 110592, 110878 }, + { 110960, 111355 }, { 113664, 113770 }, { 113776, 113788 }, { 113792, 113800 }, @@ -3489,6 +3644,7 @@ static const URange32 L_range32[] = { { 120746, 120770 }, { 120772, 120779 }, { 124928, 125124 }, + { 125184, 125251 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -3526,6 +3682,7 @@ static const URange32 L_range32[] = { { 173824, 177972 }, { 177984, 178205 }, { 178208, 183969 }, + { 183984, 191456 }, { 194560, 195101 }, }; static const URange16 N_range16[] = { @@ -3547,7 +3704,8 @@ static const URange16 N_range16[] = { { 3174, 3183 }, { 3192, 3198 }, { 3302, 3311 }, - { 3430, 3445 }, + { 3416, 3422 }, + { 3430, 3448 }, { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, @@ -3629,11 +3787,14 @@ static const URange32 N_range32[] = { { 70096, 70105 }, { 70113, 70132 }, { 70384, 70393 }, + { 70736, 70745 }, { 70864, 70873 }, { 71248, 71257 }, { 71360, 71369 }, { 71472, 71483 }, { 71904, 71922 }, + { 72784, 72812 }, + { 73040, 73049 }, { 74752, 74862 }, { 92768, 92777 }, { 93008, 93017 }, @@ -3641,6 +3802,7 @@ static const URange32 N_range32[] = { { 119648, 119665 }, { 120782, 120831 }, { 125127, 125135 }, + { 125264, 125273 }, { 127232, 127244 }, }; static const URange16 Sk_range16[] = { @@ -3713,6 +3875,7 @@ static const URange16 P_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, + { 2557, 2557 }, { 2800, 2800 }, { 3572, 3572 }, { 3663, 3663 }, @@ -3762,7 +3925,7 @@ static const URange16 P_range16[] = { { 11518, 11519 }, { 11632, 11632 }, { 11776, 11822 }, - { 11824, 11842 }, + { 11824, 11849 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3829,10 +3992,19 @@ static const URange32 P_range32[] = { { 70109, 70111 }, { 70200, 70205 }, { 70313, 70313 }, + { 70731, 70735 }, + { 70747, 70747 }, + { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, { 71233, 71235 }, + { 71264, 71276 }, { 71484, 71486 }, + { 72255, 72262 }, + { 72346, 72348 }, + { 72350, 72354 }, + { 72769, 72773 }, + { 72816, 72817 }, { 74864, 74868 }, { 92782, 92783 }, { 92917, 92917 }, @@ -3840,6 +4012,7 @@ static const URange32 P_range32[] = { { 92996, 92996 }, { 113823, 113823 }, { 121479, 121483 }, + { 125278, 125279 }, }; static const URange16 S_range16[] = { { 36, 36 }, @@ -3880,6 +4053,7 @@ static const URange16 S_range16[] = { { 2928, 2928 }, { 3059, 3066 }, { 3199, 3199 }, + { 3407, 3407 }, { 3449, 3449 }, { 3647, 3647 }, { 3841, 3843 }, @@ -3910,7 +4084,7 @@ static const URange16 S_range16[] = { { 8274, 8274 }, { 8314, 8316 }, { 8330, 8332 }, - { 8352, 8382 }, + { 8352, 8383 }, { 8448, 8449 }, { 8451, 8454 }, { 8456, 8457 }, @@ -3928,8 +4102,7 @@ static const URange16 S_range16[] = { { 8586, 8587 }, { 8592, 8967 }, { 8972, 9000 }, - { 9003, 9210 }, - { 9216, 9254 }, + { 9003, 9254 }, { 9280, 9290 }, { 9372, 9449 }, { 9472, 10087 }, @@ -3942,7 +4115,7 @@ static const URange16 S_range16[] = { { 11126, 11157 }, { 11160, 11193 }, { 11197, 11208 }, - { 11210, 11217 }, + { 11210, 11218 }, { 11244, 11247 }, { 11493, 11498 }, { 11904, 11929 }, @@ -3994,7 +4167,7 @@ static const URange16 S_range16[] = { static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, - { 65932, 65932 }, + { 65932, 65934 }, { 65936, 65947 }, { 65952, 65952 }, { 66000, 66044 }, @@ -4038,16 +4211,15 @@ static const URange32 S_range32[] = { { 127185, 127221 }, { 127248, 127278 }, { 127280, 127339 }, - { 127344, 127386 }, + { 127344, 127404 }, { 127462, 127490 }, - { 127504, 127546 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 128377 }, - { 128379, 128419 }, - { 128421, 128720 }, + { 127584, 127589 }, + { 127744, 128724 }, { 128736, 128748 }, - { 128752, 128755 }, + { 128752, 128760 }, { 128768, 128883 }, { 128896, 128980 }, { 129024, 129035 }, @@ -4055,9 +4227,13 @@ static const URange32 S_range32[] = { { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129296, 129304 }, - { 129408, 129412 }, + { 129280, 129291 }, + { 129296, 129342 }, + { 129344, 129356 }, + { 129360, 129387 }, + { 129408, 129431 }, { 129472, 129472 }, + { 129488, 129510 }, }; static const URange16 So_range16[] = { { 166, 166 }, @@ -4076,6 +4252,7 @@ static const URange16 So_range16[] = { { 3059, 3064 }, { 3066, 3066 }, { 3199, 3199 }, + { 3407, 3407 }, { 3449, 3449 }, { 3841, 3843 }, { 3859, 3859 }, @@ -4124,8 +4301,7 @@ static const URange16 So_range16[] = { { 9003, 9083 }, { 9085, 9114 }, { 9140, 9179 }, - { 9186, 9210 }, - { 9216, 9254 }, + { 9186, 9254 }, { 9280, 9290 }, { 9372, 9449 }, { 9472, 9654 }, @@ -4141,7 +4317,7 @@ static const URange16 So_range16[] = { { 11126, 11157 }, { 11160, 11193 }, { 11197, 11208 }, - { 11210, 11217 }, + { 11210, 11218 }, { 11244, 11247 }, { 11493, 11498 }, { 11904, 11929 }, @@ -4178,7 +4354,7 @@ static const URange16 So_range16[] = { static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, - { 65932, 65932 }, + { 65932, 65934 }, { 65936, 65947 }, { 65952, 65952 }, { 66000, 66044 }, @@ -4211,17 +4387,16 @@ static const URange32 So_range32[] = { { 127185, 127221 }, { 127248, 127278 }, { 127280, 127339 }, - { 127344, 127386 }, + { 127344, 127404 }, { 127462, 127490 }, - { 127504, 127546 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, + { 127584, 127589 }, { 127744, 127994 }, - { 128000, 128377 }, - { 128379, 128419 }, - { 128421, 128720 }, + { 128000, 128724 }, { 128736, 128748 }, - { 128752, 128755 }, + { 128752, 128760 }, { 128768, 128883 }, { 128896, 128980 }, { 129024, 129035 }, @@ -4229,9 +4404,13 @@ static const URange32 So_range32[] = { { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129296, 129304 }, - { 129408, 129412 }, + { 129280, 129291 }, + { 129296, 129342 }, + { 129344, 129356 }, + { 129360, 129387 }, + { 129408, 129431 }, { 129472, 129472 }, + { 129488, 129510 }, }; static const URange16 Sm_range16[] = { { 43, 43 }, @@ -4312,7 +4491,7 @@ static const URange16 Sc_range16[] = { { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, - { 8352, 8382 }, + { 8352, 8383 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, @@ -4350,6 +4529,7 @@ static const URange16 Cf_range16[] = { { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, { 8234, 8238 }, @@ -4380,9 +4560,19 @@ static const URange16 Zs_range16[] = { { 8287, 8287 }, { 12288, 12288 }, }; +static const URange32 Tangut_range32[] = { + { 94176, 94176 }, + { 94208, 100332 }, + { 100352, 101106 }, +}; static const URange16 Thaana_range16[] = { { 1920, 1969 }, }; +static const URange32 Adlam_range32[] = { + { 125184, 125258 }, + { 125264, 125273 }, + { 125278, 125279 }, +}; static const URange16 Telugu_range16[] = { { 3072, 3075 }, { 3077, 3084 }, @@ -4401,12 +4591,16 @@ static const URange16 Telugu_range16[] = { static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, { 1159, 1327 }, + { 7296, 7304 }, { 7467, 7467 }, { 7544, 7544 }, { 11744, 11775 }, { 42560, 42655 }, { 65070, 65071 }, }; +static const URange32 Zanabazar_Square_range32[] = { + { 72192, 72263 }, +}; static const URange16 Hangul_range16[] = { { 4352, 4607 }, { 12334, 12335 }, @@ -4473,8 +4667,8 @@ static const URange16 Inherited_range16[] = { { 7405, 7405 }, { 7412, 7412 }, { 7416, 7417 }, - { 7616, 7669 }, - { 7676, 7679 }, + { 7616, 7673 }, + { 7675, 7679 }, { 8204, 8205 }, { 8400, 8432 }, { 12330, 12333 }, @@ -4496,6 +4690,12 @@ static const URange32 Meroitic_Cursive_range32[] = { { 68028, 68047 }, { 68050, 68095 }, }; +static const URange32 Bhaiksuki_range32[] = { + { 72704, 72712 }, + { 72714, 72758 }, + { 72760, 72773 }, + { 72784, 72812 }, +}; static const URange32 Ahom_range32[] = { { 71424, 71449 }, { 71453, 71467 }, @@ -4510,7 +4710,7 @@ static const URange16 Han_range16[] = { { 12321, 12329 }, { 12344, 12347 }, { 13312, 19893 }, - { 19968, 40917 }, + { 19968, 40938 }, { 63744, 64109 }, { 64112, 64217 }, }; @@ -4519,6 +4719,7 @@ static const URange32 Han_range32[] = { { 173824, 177972 }, { 177984, 178205 }, { 178208, 183969 }, + { 183984, 191456 }, { 194560, 195101 }, }; static const URange32 Old_North_Arabian_range32[] = { @@ -4552,7 +4753,7 @@ static const URange16 Tamil_range16[] = { }; static const URange16 Bopomofo_range16[] = { { 746, 747 }, - { 12549, 12589 }, + { 12549, 12590 }, { 12704, 12730 }, }; static const URange32 Bassa_Vah_range32[] = { @@ -4563,22 +4764,28 @@ static const URange16 Sundanese_range16[] = { { 7040, 7103 }, { 7360, 7367 }, }; +static const URange32 Osage_range32[] = { + { 66736, 66771 }, + { 66776, 66811 }, +}; static const URange16 Tagalog_range16[] = { { 5888, 5900 }, { 5902, 5908 }, }; static const URange16 Malayalam_range16[] = { - { 3329, 3331 }, + { 3328, 3331 }, { 3333, 3340 }, { 3342, 3344 }, - { 3346, 3386 }, - { 3389, 3396 }, + { 3346, 3396 }, { 3398, 3400 }, - { 3402, 3406 }, - { 3415, 3415 }, - { 3423, 3427 }, - { 3430, 3445 }, - { 3449, 3455 }, + { 3402, 3407 }, + { 3412, 3427 }, + { 3430, 3455 }, +}; +static const URange32 Marchen_range32[] = { + { 72816, 72847 }, + { 72850, 72871 }, + { 72873, 72886 }, }; static const URange32 Carian_range32[] = { { 66208, 66256 }, @@ -4588,7 +4795,7 @@ static const URange16 Hiragana_range16[] = { { 12445, 12447 }, }; static const URange32 Hiragana_range32[] = { - { 110593, 110593 }, + { 110593, 110878 }, { 127488, 127488 }, }; static const URange16 Tagbanwa_range16[] = { @@ -4639,6 +4846,7 @@ static const URange16 Tai_Tham_range16[] = { }; static const URange32 Old_Italic_range32[] = { { 66304, 66339 }, + { 66349, 66351 }, }; static const URange32 Old_Persian_range32[] = { { 66464, 66499 }, @@ -4672,7 +4880,7 @@ static const URange16 Latin_range16[] = { { 8544, 8584 }, { 11360, 11391 }, { 42786, 42887 }, - { 42891, 42925 }, + { 42891, 42926 }, { 42928, 42935 }, { 42999, 43007 }, { 43824, 43866 }, @@ -4682,7 +4890,7 @@ static const URange16 Latin_range16[] = { { 65345, 65370 }, }; static const URange16 Saurashtra_range16[] = { - { 43136, 43204 }, + { 43136, 43205 }, { 43214, 43225 }, }; static const URange32 Shavian_range32[] = { @@ -4796,7 +5004,7 @@ static const URange16 Greek_range16[] = { { 43877, 43877 }, }; static const URange32 Greek_range32[] = { - { 65856, 65932 }, + { 65856, 65934 }, { 65952, 65952 }, { 119296, 119365 }, }; @@ -4831,6 +5039,7 @@ static const URange16 Syriac_range16[] = { { 1792, 1805 }, { 1807, 1866 }, { 1869, 1871 }, + { 2144, 2154 }, }; static const URange16 Runic_range16[] = { { 5792, 5866 }, @@ -4867,6 +5076,11 @@ static const URange16 New_Tai_Lue_range16[] = { static const URange16 Ol_Chiki_range16[] = { { 7248, 7295 }, }; +static const URange32 Newa_range32[] = { + { 70656, 70745 }, + { 70747, 70747 }, + { 70749, 70749 }, +}; static const URange16 Limbu_range16[] = { { 6400, 6430 }, { 6432, 6443 }, @@ -4921,7 +5135,11 @@ static const URange16 Gujarati_range16[] = { { 2768, 2768 }, { 2784, 2787 }, { 2790, 2801 }, - { 2809, 2809 }, + { 2809, 2815 }, +}; +static const URange32 Nushu_range32[] = { + { 94177, 94177 }, + { 110960, 111355 }, }; static const URange32 Modi_range32[] = { { 71168, 71236 }, @@ -4995,7 +5213,7 @@ static const URange16 Bengali_range16[] = { { 2519, 2519 }, { 2524, 2525 }, { 2527, 2531 }, - { 2534, 2555 }, + { 2534, 2557 }, }; static const URange32 Kaithi_range32[] = { { 69760, 69825 }, @@ -5004,6 +5222,13 @@ static const URange16 Glagolitic_range16[] = { { 11264, 11310 }, { 11312, 11358 }, }; +static const URange32 Glagolitic_range32[] = { + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, +}; static const URange32 Imperial_Aramaic_range32[] = { { 67648, 67669 }, { 67671, 67679 }, @@ -5050,7 +5275,7 @@ static const URange32 Cypriot_range32[] = { { 67647, 67647 }, }; static const URange16 Kannada_range16[] = { - { 3201, 3203 }, + { 3200, 3203 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -5067,7 +5292,7 @@ static const URange16 Kannada_range16[] = { }; static const URange32 Khojki_range32[] = { { 70144, 70161 }, - { 70163, 70205 }, + { 70163, 70206 }, }; static const URange16 Mongolian_range16[] = { { 6144, 6145 }, @@ -5077,6 +5302,9 @@ static const URange16 Mongolian_range16[] = { { 6176, 6263 }, { 6272, 6314 }, }; +static const URange32 Mongolian_range32[] = { + { 71264, 71276 }, +}; static const URange16 Sinhala_range16[] = { { 3458, 3459 }, { 3461, 3478 }, @@ -5152,6 +5380,15 @@ static const URange16 Lao_range16[] = { static const URange16 Hanunoo_range16[] = { { 5920, 5940 }, }; +static const URange32 Masaram_Gondi_range32[] = { + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73031 }, + { 73040, 73049 }, +}; static const URange32 Linear_B_range32[] = { { 65536, 65547 }, { 65549, 65574 }, @@ -5230,10 +5467,11 @@ static const URange16 Common_range16[] = { { 1417, 1417 }, { 1541, 1541 }, { 1548, 1548 }, - { 1563, 1564 }, + { 1563, 1563 }, { 1567, 1567 }, { 1600, 1600 }, { 1757, 1757 }, + { 2274, 2274 }, { 2404, 2405 }, { 3647, 3647 }, { 4053, 4056 }, @@ -5246,30 +5484,29 @@ static const URange16 Common_range16[] = { { 7393, 7393 }, { 7401, 7404 }, { 7406, 7411 }, - { 7413, 7414 }, + { 7413, 7415 }, { 8192, 8203 }, { 8206, 8292 }, { 8294, 8304 }, { 8308, 8318 }, { 8320, 8334 }, - { 8352, 8382 }, + { 8352, 8383 }, { 8448, 8485 }, { 8487, 8489 }, { 8492, 8497 }, { 8499, 8525 }, { 8527, 8543 }, { 8585, 8587 }, - { 8592, 9210 }, - { 9216, 9254 }, + { 8592, 9254 }, { 9280, 9290 }, { 9312, 10239 }, { 10496, 11123 }, { 11126, 11157 }, { 11160, 11193 }, { 11197, 11208 }, - { 11210, 11217 }, + { 11210, 11218 }, { 11244, 11247 }, - { 11776, 11842 }, + { 11776, 11849 }, { 12272, 12283 }, { 12288, 12292 }, { 12294, 12294 }, @@ -5353,17 +5590,16 @@ static const URange32 Common_range32[] = { { 127232, 127244 }, { 127248, 127278 }, { 127280, 127339 }, - { 127344, 127386 }, + { 127344, 127404 }, { 127462, 127487 }, { 127489, 127490 }, - { 127504, 127546 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 128377 }, - { 128379, 128419 }, - { 128421, 128720 }, + { 127584, 127589 }, + { 127744, 128724 }, { 128736, 128748 }, - { 128752, 128755 }, + { 128752, 128760 }, { 128768, 128883 }, { 128896, 128980 }, { 129024, 129035 }, @@ -5371,9 +5607,13 @@ static const URange32 Common_range32[] = { { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129296, 129304 }, - { 129408, 129412 }, + { 129280, 129291 }, + { 129296, 129342 }, + { 129344, 129356 }, + { 129360, 129387 }, + { 129408, 129431 }, { 129472, 129472 }, + { 129488, 129510 }, { 917505, 917505 }, { 917536, 917631 }, }; @@ -5390,6 +5630,7 @@ static const URange16 Arabic_range16[] = { { 1536, 1540 }, { 1542, 1547 }, { 1549, 1562 }, + { 1564, 1564 }, { 1566, 1566 }, { 1568, 1599 }, { 1601, 1610 }, @@ -5398,6 +5639,8 @@ static const URange16 Arabic_range16[] = { { 1758, 1791 }, { 1872, 1919 }, { 2208, 2228 }, + { 2230, 2237 }, + { 2260, 2273 }, { 2275, 2303 }, { 64336, 64449 }, { 64467, 64829 }, @@ -5459,6 +5702,11 @@ static const URange32 Siddham_range32[] = { { 71040, 71093 }, { 71096, 71133 }, }; +static const URange32 Soyombo_range32[] = { + { 72272, 72323 }, + { 72326, 72348 }, + { 72350, 72354 }, +}; static const URange32 Avestan_range32[] = { { 68352, 68405 }, { 68409, 68415 }, @@ -5483,11 +5731,12 @@ static const URange32 Takri_range32[] = { { 71296, 71351 }, { 71360, 71369 }, }; -// 3949 16-bit ranges, 1133 32-bit ranges +// 3981 16-bit ranges, 1325 32-bit ranges const UGroup unicode_groups[] = { + { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, - { "Arabic", +1, Arabic_range16, 19, Arabic_range32, 35 }, + { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, { "Armenian", +1, Armenian_range16, 6, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, @@ -5495,27 +5744,28 @@ const UGroup unicode_groups[] = { { "Bassa_Vah", +1, 0, 0, Bassa_Vah_range32, 2 }, { "Batak", +1, Batak_range16, 2, 0, 0 }, { "Bengali", +1, Bengali_range16, 14, 0, 0 }, + { "Bhaiksuki", +1, 0, 0, Bhaiksuki_range32, 4 }, { "Bopomofo", +1, Bopomofo_range16, 3, 0, 0 }, { "Brahmi", +1, 0, 0, Brahmi_range32, 3 }, { "Braille", +1, Braille_range16, 1, 0, 0 }, { "Buginese", +1, Buginese_range16, 2, 0, 0 }, { "Buhid", +1, Buhid_range16, 1, 0, 0 }, - { "C", +1, C_range16, 15, C_range32, 7 }, + { "C", +1, C_range16, 16, C_range32, 7 }, { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 }, { "Carian", +1, 0, 0, Carian_range32, 1 }, { "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 }, { "Cc", +1, Cc_range16, 2, 0, 0 }, - { "Cf", +1, Cf_range16, 12, Cf_range32, 5 }, + { "Cf", +1, Cf_range16, 13, Cf_range32, 5 }, { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 92, Common_range32, 69 }, + { "Common", +1, Common_range16, 92, Common_range32, 72 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, - { "Cyrillic", +1, Cyrillic_range16, 7, 0, 0 }, + { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, { "Duployan", +1, 0, 0, Duployan_range32, 5 }, @@ -5523,13 +5773,13 @@ const UGroup unicode_groups[] = { { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, { "Georgian", +1, Georgian_range16, 8, 0, 0 }, - { "Glagolitic", +1, Glagolitic_range16, 2, 0, 0 }, + { "Glagolitic", +1, Glagolitic_range16, 2, Glagolitic_range32, 5 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, { "Grantha", +1, 0, 0, Grantha_range32, 15 }, { "Greek", +1, Greek_range16, 33, Greek_range32, 3 }, { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 5 }, + { "Han", +1, Han_range16, 11, Han_range32, 6 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, { "Hatran", +1, 0, 0, Hatran_range32, 3 }, @@ -5548,7 +5798,7 @@ const UGroup unicode_groups[] = { { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, - { "L", +1, L_range16, 376, L_range32, 178 }, + { "L", +1, L_range16, 383, L_range32, 202 }, { "Lao", +1, Lao_range16, 18, 0, 0 }, { "Latin", +1, Latin_range16, 31, 0, 0 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, @@ -5556,50 +5806,55 @@ const UGroup unicode_groups[] = { { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, 0, 0 }, - { "Ll", +1, Ll_range16, 599, Ll_range32, 31 }, - { "Lm", +1, Lm_range16, 54, Lm_range32, 2 }, - { "Lo", +1, Lo_range16, 290, Lo_range32, 143 }, + { "Ll", +1, Ll_range16, 600, Ll_range32, 33 }, + { "Lm", +1, Lm_range16, 54, Lm_range32, 3 }, + { "Lo", +1, Lo_range16, 296, Lo_range32, 163 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 591, Lu_range32, 34 }, + { "Lu", +1, Lu_range16, 591, Lu_range32, 36 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 180, M_range32, 56 }, + { "M", +1, M_range16, 184, M_range32, 79 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, - { "Malayalam", +1, Malayalam_range16, 11, 0, 0 }, + { "Malayalam", +1, Malayalam_range16, 8, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, - { "Mc", +1, Mc_range16, 109, Mc_range32, 38 }, + { "Marchen", +1, 0, 0, Marchen_range32, 3 }, + { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, + { "Mc", +1, Mc_range16, 110, Mc_range32, 50 }, { "Me", +1, Me_range16, 5, 0, 0 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, { "Mende_Kikakui", +1, 0, 0, Mende_Kikakui_range32, 2 }, { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 200, Mn_range32, 66 }, + { "Mn", +1, Mn_range16, 204, Mn_range32, 97 }, { "Modi", +1, 0, 0, Modi_range32, 2 }, - { "Mongolian", +1, Mongolian_range16, 6, 0, 0 }, + { "Mongolian", +1, Mongolian_range16, 6, Mongolian_range32, 1 }, { "Mro", +1, 0, 0, Mro_range32, 3 }, { "Multani", +1, 0, 0, Multani_range32, 5 }, { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, - { "N", +1, N_range16, 66, N_range32, 45 }, + { "N", +1, N_range16, 67, N_range32, 49 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, - { "Nd", +1, Nd_range16, 37, Nd_range32, 14 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 18 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, + { "Newa", +1, 0, 0, Newa_range32, 3 }, { "Nko", +1, Nko_range16, 1, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 28, No_range32, 30 }, + { "No", +1, No_range16, 29, No_range32, 31 }, + { "Nushu", +1, 0, 0, Nushu_range32, 2 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 }, { "Old_Hungarian", +1, 0, 0, Old_Hungarian_range32, 3 }, - { "Old_Italic", +1, 0, 0, Old_Italic_range32, 1 }, + { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, { "Old_North_Arabian", +1, 0, 0, Old_North_Arabian_range32, 1 }, { "Old_Permic", +1, 0, 0, Old_Permic_range32, 1 }, { "Old_Persian", +1, 0, 0, Old_Persian_range32, 2 }, { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 }, { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, + { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 127, P_range32, 34 }, + { "P", +1, P_range16, 128, P_range32, 44 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, @@ -5610,12 +5865,12 @@ const UGroup unicode_groups[] = { { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 123, Po_range32, 34 }, + { "Po", +1, Po_range16, 125, Po_range32, 44 }, { "Ps", +1, Ps_range16, 75, 0, 0 }, { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 148, S_range32, 66 }, + { "S", +1, S_range16, 148, S_range32, 69 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 17, 0, 0 }, @@ -5626,11 +5881,12 @@ const UGroup unicode_groups[] = { { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, { "Sk", +1, Sk_range16, 28, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 114, So_range32, 56 }, + { "So", +1, So_range16, 114, So_range32, 59 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, + { "Soyombo", +1, 0, 0, Soyombo_range32, 3 }, { "Sundanese", +1, Sundanese_range16, 2, 0, 0 }, { "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 }, - { "Syriac", +1, Syriac_range16, 3, 0, 0 }, + { "Syriac", +1, Syriac_range16, 4, 0, 0 }, { "Tagalog", +1, Tagalog_range16, 2, 0, 0 }, { "Tagbanwa", +1, Tagbanwa_range16, 3, 0, 0 }, { "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 }, @@ -5638,6 +5894,7 @@ const UGroup unicode_groups[] = { { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, 0, 0 }, + { "Tangut", +1, 0, 0, Tangut_range32, 3 }, { "Telugu", +1, Telugu_range16, 13, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, @@ -5649,11 +5906,12 @@ const UGroup unicode_groups[] = { { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, { "Z", +1, Z_range16, 8, 0, 0 }, + { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 }, { "Zl", +1, Zl_range16, 1, 0, 0 }, { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 167; +const int num_unicode_groups = 177; } // namespace re2 diff --git a/util/sparse_array.h b/util/sparse_array.h index 7738431..09fb4ac 100644 --- a/util/sparse_array.h +++ b/util/sparse_array.h @@ -99,8 +99,8 @@ #include <string.h> #include <algorithm> #include <memory> +#include <type_traits> #include <utility> -#include <vector> namespace re2 { @@ -113,10 +113,12 @@ class SparseArray { // IndexValue pairs: exposed in SparseArray::iterator. class IndexValue; + static_assert(std::is_trivially_destructible<IndexValue>::value, + "IndexValue must be trivially destructible"); typedef IndexValue value_type; - typedef typename std::vector<IndexValue>::iterator iterator; - typedef typename std::vector<IndexValue>::const_iterator const_iterator; + typedef IndexValue* iterator; + typedef const IndexValue* const_iterator; SparseArray(const SparseArray& src); SparseArray(SparseArray&& src) /*noexcept*/; @@ -138,17 +140,17 @@ class SparseArray { // Iterate over the array. iterator begin() { - return dense_.begin(); + return dense_.get(); } iterator end() { - return dense_.begin() + size_; + return dense_.get() + size_; } const_iterator begin() const { - return dense_.begin(); + return dense_.get(); } const_iterator end() const { - return dense_.begin() + size_; + return dense_.get() + size_; } // Change the maximum size of the array. @@ -198,13 +200,13 @@ class SparseArray { iterator find(int i) { if (has_index(i)) - return dense_.begin() + sparse_to_dense_[i]; + return dense_.get() + sparse_to_dense_[i]; return end(); } const_iterator find(int i) const { if (has_index(i)) - return dense_.begin() + sparse_to_dense_[i]; + return dense_.get() + sparse_to_dense_[i]; return end(); } @@ -261,7 +263,7 @@ class SparseArray { DebugCheckInvariants(); std::pair<iterator, bool> p; if (has_index(v.index_)) { - p = {dense_.begin() + sparse_to_dense_[v.index_], false}; + p = {dense_.get() + sparse_to_dense_[v.index_], false}; } else { p = {set_new(std::forward<U>(v).index_, std::forward<U>(v).second), true}; } @@ -295,7 +297,7 @@ class SparseArray { assert(has_index(i)); dense_[sparse_to_dense_[i]].value() = std::forward<U>(v); DebugCheckInvariants(); - return dense_.begin() + sparse_to_dense_[i]; + return dense_.get() + sparse_to_dense_[i]; } // Add the index i to the array. @@ -327,7 +329,7 @@ class SparseArray { int size_ = 0; int max_size_ = 0; std::unique_ptr<int[]> sparse_to_dense_; - std::vector<IndexValue> dense_; + std::unique_ptr<IndexValue[]> dense_; }; template<typename Value> @@ -338,8 +340,9 @@ SparseArray<Value>::SparseArray(const SparseArray& src) : size_(src.size_), max_size_(src.max_size_), sparse_to_dense_(new int[max_size_]), - dense_(src.dense_) { + dense_(new IndexValue[max_size_]) { std::copy_n(src.sparse_to_dense_.get(), max_size_, sparse_to_dense_.get()); + std::copy_n(src.dense_.get(), max_size_, dense_.get()); } template<typename Value> @@ -350,17 +353,18 @@ SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT dense_(std::move(src.dense_)) { src.size_ = 0; src.max_size_ = 0; - src.dense_.clear(); } template<typename Value> SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) { - std::unique_ptr<int[]> a(new int[src.max_size_]); + size_ = src.size_; + max_size_ = src.max_size_; + std::unique_ptr<int[]> a(new int[max_size_]); std::copy_n(src.sparse_to_dense_.get(), src.max_size_, a.get()); sparse_to_dense_ = std::move(a); - dense_ = src.dense_; - max_size_ = src.max_size_; - size_ = src.size_; + std::unique_ptr<IndexValue[]> b(new IndexValue[max_size_]); + std::copy_n(src.dense_.get(), src.max_size_, b.get()); + dense_ = std::move(b); return *this; } @@ -374,7 +378,6 @@ SparseArray<Value>& SparseArray<Value>::operator=( // clear out the source src.size_ = 0; src.max_size_ = 0; - src.dense_.clear(); return *this; } @@ -427,7 +430,11 @@ void SparseArray<Value>::resize(int max_size) { } sparse_to_dense_ = std::move(a); - dense_.resize(max_size); + std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]); + if (dense_) { + std::copy_n(dense_.get(), max_size_, b.get()); + } + dense_ = std::move(b); if (ShouldInitializeMemory()) { for (int i = max_size_; i < max_size; i++) { @@ -493,8 +500,8 @@ void SparseArray<Value>::create_index(int i) { template<typename Value> SparseArray<Value>::SparseArray(int max_size) { max_size_ = max_size; - sparse_to_dense_ = std::unique_ptr<int[]>(new int[max_size]); - dense_.resize(max_size); + sparse_to_dense_.reset(new int[max_size]); + dense_.reset(new IndexValue[max_size]); size_ = 0; if (ShouldInitializeMemory()) { diff --git a/util/sparse_set.h b/util/sparse_set.h index fd35236..9dde502 100644 --- a/util/sparse_set.h +++ b/util/sparse_set.h @@ -53,7 +53,6 @@ #include <algorithm> #include <memory> #include <utility> -#include <vector> namespace re2 { @@ -64,8 +63,8 @@ class SparseSetT { explicit SparseSetT(int max_size); ~SparseSetT(); - typedef typename std::vector<int>::iterator iterator; - typedef typename std::vector<int>::const_iterator const_iterator; + typedef int* iterator; + typedef const int* const_iterator; // Return the number of entries in the set. int size() const { @@ -79,17 +78,17 @@ class SparseSetT { // Iterate over the set. iterator begin() { - return dense_.begin(); + return dense_.get(); } iterator end() { - return dense_.begin() + size_; + return dense_.get() + size_; } const_iterator begin() const { - return dense_.begin(); + return dense_.get(); } const_iterator end() const { - return dense_.begin() + size_; + return dense_.get() + size_; } // Change the maximum size of the set. @@ -146,7 +145,7 @@ class SparseSetT { create_index(i); } DebugCheckInvariants(); - return dense_.begin() + sparse_to_dense_[i]; + return dense_.get() + sparse_to_dense_[i]; } // Add the index i to the set. @@ -177,7 +176,7 @@ class SparseSetT { int size_ = 0; int max_size_ = 0; std::unique_ptr<int[]> sparse_to_dense_; - std::vector<int> dense_; + std::unique_ptr<int[]> dense_; }; template<typename Value> @@ -195,7 +194,11 @@ void SparseSetT<Value>::resize(int max_size) { } sparse_to_dense_ = std::move(a); - dense_.resize(max_size); + std::unique_ptr<int[]> b(new int[max_size]); + if (dense_) { + std::copy_n(dense_.get(), max_size_, b.get()); + } + dense_ = std::move(b); if (ShouldInitializeMemory()) { for (int i = max_size_; i < max_size; i++) { @@ -234,8 +237,8 @@ void SparseSetT<Value>::create_index(int i) { template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) { max_size_ = max_size; - sparse_to_dense_ = std::unique_ptr<int[]>(new int[max_size]); - dense_.resize(max_size); + sparse_to_dense_.reset(new int[max_size]); + dense_.reset(new int[max_size]); size_ = 0; if (ShouldInitializeMemory()) { diff --git a/util/strutil.cc b/util/strutil.cc index 7181073..8eabfa4 100644 --- a/util/strutil.cc +++ b/util/strutil.cc @@ -75,29 +75,21 @@ string CEscape(const StringPiece& src) { return s; } -string PrefixSuccessor(const StringPiece& prefix) { +void PrefixSuccessor(string* prefix) { // We can increment the last character in the string and be done // unless that character is 255, in which case we have to erase the // last character and increment the previous character, unless that // is 255, etc. If the string is empty or consists entirely of // 255's, we just return the empty string. - bool done = false; - string limit(prefix.data(), prefix.size()); - int index = static_cast<int>(limit.size()) - 1; - while (!done && index >= 0) { - if ((limit[index]&255) == 255) { - limit.erase(index); - index--; + while (!prefix->empty()) { + char& c = prefix->back(); + if (c == '\xff') { // char literal avoids signed/unsigned. + prefix->pop_back(); } else { - limit[index]++; - done = true; + ++c; + break; } } - if (!done) { - return ""; - } else { - return limit; - } } static void StringAppendV(string* dst, const char* format, va_list ap) { diff --git a/util/strutil.h b/util/strutil.h index 71dd293..2c3c104 100644 --- a/util/strutil.h +++ b/util/strutil.h @@ -13,7 +13,7 @@ namespace re2 { string CEscape(const StringPiece& src); -string PrefixSuccessor(const StringPiece& prefix); +void PrefixSuccessor(string* prefix); string StringPrintf(const char* format, ...); void SStringPrintf(string* dst, const char* format, ...); void StringAppendF(string* dst, const char* format, ...); |