summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2018-02-20 15:36:30 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2018-02-20 15:36:30 +0900
commitfb1f61b46bac5b1a435d7edd9f97d9e804d30251 (patch)
treec4b16b30e5c009275706ad90355578008c2bcd31
parent47030f9abe0ed076af6465569939740f6a36b1d4 (diff)
downloadre2-fb1f61b46bac5b1a435d7edd9f97d9e804d30251.tar.gz
re2-fb1f61b46bac5b1a435d7edd9f97d9e804d30251.tar.bz2
re2-fb1f61b46bac5b1a435d7edd9f97d9e804d30251.zip
Imported Upstream version 20171101upstream/20171101
Change-Id: Ia1df4d395f2335bd59ba868feb94d3ba9fdc5999 Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r--.travis.yml9
-rw-r--r--BUILD72
-rw-r--r--CMakeLists.txt4
-rw-r--r--doc/syntax.txt10
-rw-r--r--re2/compile.cc67
-rw-r--r--re2/dfa.cc171
-rw-r--r--re2/fuzzing/re2_fuzzer.cc15
-rw-r--r--re2/parse.cc3
-rw-r--r--re2/prog.h7
-rw-r--r--re2/re2.cc132
-rw-r--r--re2/re2.h4
-rw-r--r--re2/set.cc69
-rw-r--r--re2/set.h6
-rw-r--r--re2/testing/parse_test.cc5
-rw-r--r--re2/testing/regexp_benchmark.cc30
-rw-r--r--re2/testing/set_test.cc27
-rw-r--r--re2/unicode.py2
-rw-r--r--re2/unicode_casefold.cc54
-rw-r--r--re2/unicode_groups.cc484
-rw-r--r--util/sparse_array.h51
-rw-r--r--util/sparse_set.h27
-rw-r--r--util/strutil.cc22
-rw-r--r--util/strutil.h2
23 files changed, 883 insertions, 390 deletions
diff --git a/.travis.yml b/.travis.yml
index 8c9acca..b5a2a94 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -112,6 +112,15 @@ matrix:
- clang-4.0
env:
- MATRIX_EVAL="CC=clang-4.0 CXX=clang++-4.0"
+ - os: linux
+ addons:
+ apt:
+ sources:
+ - llvm-toolchain-trusty-5.0
+ packages:
+ - clang-5.0
+ env:
+ - MATRIX_EVAL="CC=clang-5.0 CXX=clang++-5.0"
before_install:
- eval "${MATRIX_EVAL}"
diff --git a/BUILD b/BUILD
index 5223f6e..0c42995 100644
--- a/BUILD
+++ b/BUILD
@@ -94,35 +94,77 @@ cc_library(
deps = [":testing"],
)
-load("re2_test", "re2_test")
+load(":re2_test.bzl", "re2_test")
-re2_test("charclass_test")
+re2_test(
+ "charclass_test",
+ size = "small",
+)
-re2_test("compile_test")
+re2_test(
+ "compile_test",
+ size = "small",
+)
-re2_test("filtered_re2_test")
+re2_test(
+ "filtered_re2_test",
+ size = "small",
+)
-re2_test("mimics_pcre_test")
+re2_test(
+ "mimics_pcre_test",
+ size = "small",
+)
-re2_test("parse_test")
+re2_test(
+ "parse_test",
+ size = "small",
+)
-re2_test("possible_match_test")
+re2_test(
+ "possible_match_test",
+ size = "small",
+)
-re2_test("re2_arg_test")
+re2_test(
+ "re2_arg_test",
+ size = "small",
+)
-re2_test("re2_test")
+re2_test(
+ "re2_test",
+ size = "small",
+)
-re2_test("regexp_test")
+re2_test(
+ "regexp_test",
+ size = "small",
+)
-re2_test("required_prefix_test")
+re2_test(
+ "required_prefix_test",
+ size = "small",
+)
-re2_test("search_test")
+re2_test(
+ "search_test",
+ size = "small",
+)
-re2_test("set_test")
+re2_test(
+ "set_test",
+ size = "small",
+)
-re2_test("simplify_test")
+re2_test(
+ "simplify_test",
+ size = "small",
+)
-re2_test("string_generator_test")
+re2_test(
+ "string_generator_test",
+ size = "small",
+)
re2_test(
"dfa_test",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e22472..884873b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,10 @@
# Old enough to support Ubuntu Trusty.
cmake_minimum_required(VERSION 2.8.12)
+if(POLICY CMP0048)
+ cmake_policy(SET CMP0048 NEW)
+endif()
+
project(RE2 CXX)
include(CTest)
diff --git a/doc/syntax.txt b/doc/syntax.txt
index 09b7e88..92e2f2a 100644
--- a/doc/syntax.txt
+++ b/doc/syntax.txt
@@ -230,6 +230,7 @@ Zp paragraph separator
Zs space separator
Unicode character class names--scripts:
+Adlam
Ahom
Anatolian_Hieroglyphs
Arabic
@@ -240,6 +241,7 @@ Bamum
Bassa_Vah
Batak
Bengali
+Bhaiksuki
Bopomofo
Brahmi
Braille
@@ -301,6 +303,8 @@ Mahajani
Malayalam
Mandaic
Manichaean
+Marchen
+Masaram_Gondi
Meetei_Mayek
Mende_Kikakui
Meroitic_Cursive
@@ -313,7 +317,9 @@ Multani
Myanmar
Nabataean
New_Tai_Lue
+Newa
Nko
+Nushu
Ogham
Ol_Chiki
Old_Hungarian
@@ -324,6 +330,7 @@ Old_Persian
Old_South_Arabian
Old_Turkic
Oriya
+Osage
Osmanya
Pahawh_Hmong
Palmyrene
@@ -341,6 +348,7 @@ Siddham
SignWriting
Sinhala
Sora_Sompeng
+Soyombo
Sundanese
Syloti_Nagri
Syriac
@@ -351,6 +359,7 @@ Tai_Tham
Tai_Viet
Takri
Tamil
+Tangut
Telugu
Thaana
Thai
@@ -361,6 +370,7 @@ Ugaritic
Vai
Warang_Citi
Yi
+Zanabazar_Square
Vim character classes:
\i identifier character NOT SUPPORTED vim
diff --git a/re2/compile.cc b/re2/compile.cc
index 9a19bde..454c872 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -136,8 +136,7 @@ class Compiler : public Regexp::Walker<Frag> {
// Compiles alternation of all the re to a new Prog.
// Each re has a match with an id equal to its index in the vector.
- static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- Regexp* re);
+ static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Interface for Regexp::Walker, which helps traverse the Regexp.
// The walk is purely post-recursive: given the machines for the
@@ -286,7 +285,8 @@ int Compiler::AllocInst(int n) {
while (inst_len_ + n > inst_cap_)
inst_cap_ *= 2;
Prog::Inst* ip = new Prog::Inst[inst_cap_];
- memmove(ip, inst_, inst_len_ * sizeof ip[0]);
+ if (inst_ != NULL)
+ memmove(ip, inst_, inst_len_ * sizeof ip[0]);
memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
delete[] inst_;
inst_ = ip;
@@ -877,9 +877,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
case kRegexpHaveMatch: {
Frag f = Match(re->match_id());
- // Remember unanchored match to end of string.
- if (anchor_ != RE2::ANCHOR_BOTH)
- f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
+ if (anchor_ == RE2::ANCHOR_BOTH) {
+ // Append \z or else the subexpression will effectively be unanchored.
+ // Complemented by the UNANCHORED case in CompileSet().
+ f = Cat(EmptyWidth(kEmptyEndText), f);
+ }
return f;
}
@@ -1138,8 +1140,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
// The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
Compiler c;
-
- c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
+ c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
c.reversed_ = reversed;
// Simplify to remove things like counted repetitions
@@ -1154,7 +1155,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
bool is_anchor_end = IsAnchorEnd(&sre, 0);
// Generate fragment for entire regexp.
- Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
+ Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
sre->Decref();
if (c.failed_)
return NULL;
@@ -1163,10 +1164,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
// Turn off c.reversed_ (if it is set) to force the remaining concatenations
// to behave normally.
c.reversed_ = false;
- Frag all = c.Cat(f, c.Match(0));
- c.prog_->set_start(all.begin);
+ all = c.Cat(all, c.Match(0));
- if (reversed) {
+ c.prog_->set_reversed(reversed);
+ if (c.prog_->reversed()) {
c.prog_->set_anchor_start(is_anchor_end);
c.prog_->set_anchor_end(is_anchor_start);
} else {
@@ -1174,15 +1175,12 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
c.prog_->set_anchor_end(is_anchor_end);
}
- // Also create unanchored version, which starts with a .*? loop.
- if (c.prog_->anchor_start()) {
- c.prog_->set_start_unanchored(c.prog_->start());
- } else {
- Frag unanchored = c.Cat(c.DotStar(), all);
- c.prog_->set_start_unanchored(unanchored.begin);
+ c.prog_->set_start(all.begin);
+ if (!c.prog_->anchor_start()) {
+ // Also create unanchored version, which starts with a .*? loop.
+ all = c.Cat(c.DotStar(), all);
}
-
- c.prog_->set_reversed(reversed);
+ c.prog_->set_start_unanchored(all.begin);
// Hand ownership of prog_ to caller.
return c.Finish();
@@ -1235,29 +1233,29 @@ Frag Compiler::DotStar() {
}
// Compiles RE set to Prog.
-Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- Regexp* re) {
+Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
Compiler c;
+ c.Setup(re->parse_flags(), max_mem, anchor);
- Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
- c.Setup(pf, options.max_mem(), anchor);
+ Regexp* sre = re->Simplify();
+ if (sre == NULL)
+ return NULL;
- // Compile alternation of fragments.
- Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_);
- re->Decref();
+ Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
+ sre->Decref();
if (c.failed_)
return NULL;
+ c.prog_->set_anchor_start(true);
+ c.prog_->set_anchor_end(true);
+
if (anchor == RE2::UNANCHORED) {
- // The trailing .* was added while handling kRegexpHaveMatch.
- // We just have to add the leading one.
+ // Prepend .* or else the expression will effectively be anchored.
+ // Complemented by the ANCHOR_BOTH case in PostVisit().
all = c.Cat(c.DotStar(), all);
}
-
c.prog_->set_start(all.begin);
c.prog_->set_start_unanchored(all.begin);
- c.prog_->set_anchor_start(true);
- c.prog_->set_anchor_end(true);
Prog* prog = c.Finish();
if (prog == NULL)
@@ -1277,9 +1275,8 @@ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
return prog;
}
-Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- Regexp* re) {
- return Compiler::CompileSet(options, anchor, re);
+Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
+ return Compiler::CompileSet(re, anchor, max_mem);
}
} // namespace re2
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 27fd049..2720a5e 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -28,7 +28,6 @@
#include <algorithm>
#include <atomic>
#include <deque>
-#include <map>
#include <mutex>
#include <new>
#include <string>
@@ -96,7 +95,7 @@ class DFA {
// memory), it sets *failed and returns false.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool want_earliest_match, bool run_forward,
- bool* failed, const char** ep, std::vector<int>* matches);
+ bool* failed, const char** ep, SparseSet* matches);
// Builds out all states for the entire DFA.
// If cb is not empty, it receives one callback per state built.
@@ -141,9 +140,9 @@ class DFA {
enum {
kByteEndText = 256, // imaginary byte at end of text
- kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags
- kFlagMatch = 0x1000, // State.flag_: this is a matching state
- kFlagLastWord = 0x2000, // State.flag_: last byte was a word char
+ kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags
+ kFlagMatch = 0x0100, // State.flag_: this is a matching state
+ kFlagLastWord = 0x0200, // State.flag_: last byte was a word char
kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
};
@@ -207,7 +206,7 @@ class DFA {
// Looks up and returns the State corresponding to a Workq.
// L >= mutex_
- State* WorkqToCachedState(Workq* q, uint32_t flag);
+ State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
// Looks up and returns a State matching the inst, ninst, and flag.
// L >= mutex_
@@ -230,8 +229,7 @@ class DFA {
// sets *ismatch to true.
// L >= mutex_
void RunWorkqOnByte(Workq* q, Workq* nq,
- int c, uint32_t flag, bool* ismatch,
- Prog::MatchKind kind);
+ int c, uint32_t flag, bool* ismatch);
// Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
// L >= mutex_
@@ -273,7 +271,7 @@ class DFA {
RWLocker *cache_lock;
bool failed; // "out" parameter: whether search gave up
const char* ep; // "out" parameter: end pointer for match
- std::vector<int>* matches;
+ SparseSet* matches;
private:
SearchParams(const SearchParams&) = delete;
@@ -377,6 +375,10 @@ static inline const uint8_t* BytePtr(const void* v) {
// in the work queue when in leftmost-longest matching mode.
#define Mark (-1)
+// Separates the match IDs from the instructions in inst_.
+// Used only for "many match" DFA states.
+#define MatchSep (-2)
+
// Internally, the DFA uses a sparse array of
// program instruction pointers as a work queue.
// In leftmost longest mode, marks separate sections
@@ -509,7 +511,7 @@ DFA::~DFA() {
string DFA::DumpWorkq(Workq* q) {
string s;
const char* sep = "";
- for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) {
+ for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
if (q->is_mark(*it)) {
StringAppendF(&s, "|");
sep = "";
@@ -536,6 +538,9 @@ string DFA::DumpState(State* state) {
if (state->inst_[i] == Mark) {
StringAppendF(&s, "|");
sep = "";
+ } else if (state->inst_[i] == MatchSep) {
+ StringAppendF(&s, "||");
+ sep = "";
} else {
StringAppendF(&s, "%s%d", sep, state->inst_[i]);
sep = ",";
@@ -603,7 +608,9 @@ string DFA::DumpState(State* state) {
// Looks in the State cache for a State matching q, flag.
// If one is found, returns it. If one is not found, allocates one,
// inserts it in the cache, and returns it.
-DFA::State* DFA::WorkqToCachedState(Workq* q, uint32_t flag) {
+// If mq is not null, MatchSep and the match IDs in mq will be appended
+// to the State.
+DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
//mutex_.AssertHeld();
// Construct array of instruction ids for the new state.
@@ -710,6 +717,17 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint32_t flag) {
}
}
+ // Append MatchSep and the match IDs in mq if necessary.
+ if (mq != NULL) {
+ inst[n++] = MatchSep;
+ for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) {
+ int id = *i;
+ Prog::Inst* ip = prog_->inst(id);
+ if (ip->opcode() == kInstMatch)
+ inst[n++] = ip->match_id();
+ }
+ }
+
// Save the needed empty-width flags in the top bits for use later.
flag |= needflags << kFlagNeedShift;
@@ -789,11 +807,15 @@ void DFA::ClearCache() {
void DFA::StateToWorkq(State* s, Workq* q) {
q->clear();
for (int i = 0; i < s->ninst_; i++) {
- if (s->inst_[i] == Mark)
+ if (s->inst_[i] == Mark) {
q->mark();
- else
+ } else if (s->inst_[i] == MatchSep) {
+ // Nothing after this is an instruction!
+ break;
+ } else {
// Explore from the head of the list.
AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
+ }
}
}
@@ -913,8 +935,7 @@ void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
// means to match c$. Sets the bool *ismatch to true if the end of the
// regular expression program has been reached (the regexp has matched).
void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
- int c, uint32_t flag, bool* ismatch,
- Prog::MatchKind kind) {
+ int c, uint32_t flag, bool* ismatch) {
//mutex_.AssertHeld();
newq->clear();
@@ -945,10 +966,11 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
break;
case kInstMatch:
- if (prog_->anchor_end() && c != kByteEndText)
+ if (prog_->anchor_end() && c != kByteEndText &&
+ kind_ != Prog::kManyMatch)
break;
*ismatch = true;
- if (kind == Prog::kFirstMatch) {
+ if (kind_ == Prog::kFirstMatch) {
// Can stop processing work queue since we found a match.
return;
}
@@ -1040,20 +1062,9 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
swap(q0_, q1_);
}
bool ismatch = false;
- RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_);
-
- // Most of the time, we build the state from the output of
- // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that
- // RE2::Set can tell exactly which match instructions
- // contributed to the match, don't swap if c is kByteEndText.
- // The resulting state wouldn't be correct for further processing
- // of the string, but we're at the end of the text so that's okay.
- // Leaving q0_ alone preseves the match instructions that led to
- // the current setting of ismatch.
- if (c != kByteEndText || kind_ != Prog::kManyMatch) {
- using std::swap;
- swap(q0_, q1_);
- }
+ RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
+ using std::swap;
+ swap(q0_, q1_);
// Save afterflag along with ismatch and isword in new state.
uint32_t flag = afterflag;
@@ -1062,7 +1073,10 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
if (isword)
flag |= kFlagLastWord;
- ns = WorkqToCachedState(q0_, flag);
+ if (ismatch && kind_ == Prog::kManyMatch)
+ ns = WorkqToCachedState(q0_, q1_, flag);
+ else
+ ns = WorkqToCachedState(q0_, NULL, flag);
// Flush ns before linking to it.
// Write barrier before updating state->next_ so that the
@@ -1316,11 +1330,24 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
const uint8_t* bytemap = prog_->bytemap();
const uint8_t* lastmatch = NULL; // most recent matching position in text
bool matched = false;
+
State* s = start;
+ if (ExtraDebug)
+ fprintf(stderr, "@stx: %s\n", DumpState(s).c_str());
if (s->IsMatch()) {
matched = true;
lastmatch = p;
+ if (ExtraDebug)
+ fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str());
+ if (params->matches != NULL && kind_ == Prog::kManyMatch) {
+ for (int i = s->ninst_ - 1; i >= 0; i--) {
+ int id = s->inst_[i];
+ if (id == MatchSep)
+ break;
+ params->matches->insert(id);
+ }
+ }
if (want_earliest_match) {
params->ep = reinterpret_cast<const char*>(lastmatch);
return true;
@@ -1331,6 +1358,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
if (ExtraDebug)
fprintf(stderr, "@%td: %s\n",
p - bp, DumpState(s).c_str());
+
if (have_firstbyte && s == start) {
// In start state, only way out is to find firstbyte,
// so use optimized assembly in memchr to skip ahead.
@@ -1424,8 +1452,8 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
params->ep = reinterpret_cast<const char*>(ep);
return true;
}
- s = ns;
+ s = ns;
if (s->IsMatch()) {
matched = true;
// The DFA notices the match one byte late,
@@ -1437,7 +1465,14 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
if (ExtraDebug)
fprintf(stderr, "match @%td! [%s]\n",
lastmatch - bp, DumpState(s).c_str());
-
+ if (params->matches != NULL && kind_ == Prog::kManyMatch) {
+ for (int i = s->ninst_ - 1; i >= 0; i--) {
+ int id = s->inst_[i];
+ if (id == MatchSep)
+ break;
+ params->matches->insert(id);
+ }
+ }
if (want_earliest_match) {
params->ep = reinterpret_cast<const char*>(lastmatch);
return true;
@@ -1447,6 +1482,9 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
// Process one more byte to see if it triggers a match.
// (Remember, matches are delayed one byte.)
+ if (ExtraDebug)
+ fprintf(stderr, "@etx: %s\n", DumpState(s).c_str());
+
int lastbyte;
if (run_forward) {
if (params->text.end() == params->context.end())
@@ -1478,34 +1516,32 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
}
}
}
- s = ns;
- if (ExtraDebug)
- fprintf(stderr, "@_: %s\n", DumpState(s).c_str());
- if (s == FullMatchState) {
+ if (ns <= SpecialStateMax) {
+ if (ns == DeadState) {
+ params->ep = reinterpret_cast<const char*>(lastmatch);
+ return matched;
+ }
+ // FullMatchState
params->ep = reinterpret_cast<const char*>(ep);
return true;
}
- if (s > SpecialStateMax && s->IsMatch()) {
+
+ s = ns;
+ if (s->IsMatch()) {
matched = true;
lastmatch = p;
- if (params->matches && kind_ == Prog::kManyMatch) {
- std::vector<int>* v = params->matches;
- v->clear();
- for (int i = 0; i < s->ninst_; i++) {
- Prog::Inst* ip = prog_->inst(s->inst_[i]);
- for (;;) {
- if (ip->opcode() == kInstMatch)
- v->push_back(ip->match_id());
- if (ip->last())
- break;
- ip++;
- }
+ if (ExtraDebug)
+ fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str());
+ if (params->matches != NULL && kind_ == Prog::kManyMatch) {
+ for (int i = s->ninst_ - 1; i >= 0; i--) {
+ int id = s->inst_[i];
+ if (id == MatchSep)
+ break;
+ params->matches->insert(id);
}
}
- if (ExtraDebug)
- fprintf(stderr, "match @%td! [%s]\n",
- lastmatch - bp, DumpState(s).c_str());
}
+
params->ep = reinterpret_cast<const char*>(lastmatch);
return matched;
}
@@ -1681,7 +1717,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
AddToQueue(q0_,
params->anchored ? prog_->start() : prog_->start_unanchored(),
flags);
- info->start = WorkqToCachedState(q0_, flags);
+ info->start = WorkqToCachedState(q0_, NULL, flags);
if (info->start == NULL)
return false;
@@ -1732,7 +1768,7 @@ bool DFA::Search(const StringPiece& text,
bool run_forward,
bool* failed,
const char** epp,
- std::vector<int>* matches) {
+ SparseSet* matches) {
*epp = NULL;
if (!ok()) {
*failed = true;
@@ -1823,7 +1859,7 @@ void Prog::DeleteDFA(DFA* dfa) {
//
bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
Anchor anchor, MatchKind kind, StringPiece* match0,
- bool* failed, std::vector<int>* matches) {
+ bool* failed, SparseSet* matches) {
*failed = false;
StringPiece context = const_context;
@@ -1846,7 +1882,7 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
bool endmatch = false;
if (kind == kManyMatch) {
- endmatch = true;
+ // This is split out in order to avoid clobbering kind.
} else if (kind == kFullMatch || anchor_end()) {
endmatch = true;
kind = kLongestMatch;
@@ -1854,17 +1890,22 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
// If the caller doesn't care where the match is (just whether one exists),
// then we can stop at the very first match we find, the so-called
- // "shortest match".
- bool want_shortest_match = false;
- if (match0 == NULL && !endmatch) {
- want_shortest_match = true;
+ // "earliest match".
+ bool want_earliest_match = false;
+ if (kind == kManyMatch) {
+ // This is split out in order to avoid clobbering kind.
+ if (matches == NULL) {
+ want_earliest_match = true;
+ }
+ } else if (match0 == NULL && !endmatch) {
+ want_earliest_match = true;
kind = kLongestMatch;
}
DFA* dfa = GetDFA(kind);
const char* ep;
bool matched = dfa->Search(text, context, anchored,
- want_shortest_match, !reversed_,
+ want_earliest_match, !reversed_,
failed, &ep, matches);
if (*failed)
return false;
@@ -1982,7 +2023,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
// Also note that previously_visited_states[UnseenStatePtr] will, in the STL
// tradition, implicitly insert a '0' value at first use. We take advantage
// of that property below.
- std::map<State*, int> previously_visited_states;
+ std::unordered_map<State*, int> previously_visited_states;
// Pick out start state for anchored search at beginning of text.
RWLocker l(&cache_mutex_);
@@ -2087,7 +2128,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
}
// Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
- *max = PrefixSuccessor(*max);
+ PrefixSuccessor(max);
// If there are no bytes left, we have no way to say "there is no maximum
// string". We could make the interface more complicated and be able to
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index daef0be..e0aa96a 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -59,6 +59,20 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
if (size == 0 || size > 1024)
return 0;
+ // Crudely limit the use of \p and \P.
+ // Otherwise, we will waste time on inputs that have long runs of Unicode
+ // character classes. The fuzzer has shown itself to be easily capable of
+ // generating such patterns that fall within the other limits, but result
+ // in timeouts nonetheless. The marginal cost is high - even more so when
+ // counted repetition is involved - whereas the marginal benefit is zero.
+ int backslash_p = 0;
+ for (size_t i = 0; i < size; i++) {
+ if (data[i] == '\\' && i+1 < size && (data[i+1] == 'p' || data[i+1] == 'P'))
+ backslash_p++;
+ }
+ if (backslash_p > 10)
+ return 0;
+
// The one-at-a-time hash by Bob Jenkins.
uint32_t hash = 0;
for (size_t i = 0; i < size; i++) {
@@ -72,6 +86,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
RE2::Options options;
options.set_log_errors(false);
+ options.set_max_mem(64 << 20);
options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
: RE2::Options::EncodingUTF8);
options.set_posix_syntax(hash & 2);
diff --git a/re2/parse.cc b/re2/parse.cc
index 0729b75..2b46e3e 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -1224,9 +1224,8 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
// Lexing routines.
-// Parses a decimal integer, storing it in *n.
+// Parses a decimal integer, storing it in *np.
// Sets *s to span the remainder of the string.
-// Sets *out_re to the regexp for the class.
static bool ParseInteger(StringPiece* s, int* np) {
if (s->size() == 0 || !isdigit((*s)[0] & 0xFF))
return false;
diff --git a/re2/prog.h b/re2/prog.h
index 3176898..3fb1c1f 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -255,7 +255,7 @@ class Prog {
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind, StringPiece* match0,
- bool* failed, std::vector<int>* matches);
+ bool* failed, SparseSet* matches);
// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
@@ -336,9 +336,8 @@ class Prog {
void Fanout(SparseArray<int>* fanout);
// Compiles a collection of regexps to Prog. Each regexp will have
- // its own Match instruction recording the index in the vector.
- static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
- Regexp* re);
+ // its own Match instruction recording the index in the output vector.
+ static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
diff --git a/re2/re2.cc b/re2/re2.cc
index fa56026..ff4f7a6 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -345,25 +345,6 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
}
}
-// Returns the maximum submatch needed for the rewrite to be done by Replace().
-// E.g. if rewrite == "foo \\2,\\1", returns 2.
-int RE2::MaxSubmatch(const StringPiece& rewrite) {
- int max = 0;
- for (const char *s = rewrite.data(), *end = s + rewrite.size();
- s < end; s++) {
- if (*s == '\\') {
- s++;
- int c = (s < end) ? *s : -1;
- if (isdigit(c)) {
- int n = (c - '0');
- if (n > max)
- max = n;
- }
- }
- }
- return max;
-}
-
bool RE2::Replace(string *str,
const RE2& re,
const StringPiece& rewrite) {
@@ -519,14 +500,14 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
n = maxlen;
// Determine initial min max from prefix_ literal.
- string pmin, pmax;
- pmin = prefix_.substr(0, n);
- pmax = prefix_.substr(0, n);
+ *min = prefix_.substr(0, n);
+ *max = prefix_.substr(0, n);
if (prefix_foldcase_) {
- // prefix is ASCII lowercase; change pmin to uppercase.
+ // prefix is ASCII lowercase; change *min to uppercase.
for (int i = 0; i < n; i++) {
- if ('a' <= pmin[i] && pmin[i] <= 'z')
- pmin[i] += 'A' - 'a';
+ char& c = (*min)[i];
+ if ('a' <= c && c <= 'z')
+ c += 'A' - 'a';
}
}
@@ -534,13 +515,13 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
string dmin, dmax;
maxlen -= n;
if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
- pmin += dmin;
- pmax += dmax;
- } else if (!pmax.empty()) {
+ min->append(dmin);
+ max->append(dmax);
+ } else if (!max->empty()) {
// prog_->PossibleMatchRange has failed us,
// but we still have useful information from prefix_.
- // Round up pmax to allow any possible suffix.
- pmax = PrefixSuccessor(pmax);
+ // Round up *max to allow any possible suffix.
+ PrefixSuccessor(max);
} else {
// Nothing useful.
*min = "";
@@ -548,8 +529,6 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
return false;
}
- *min = pmin;
- *max = pmax;
return true;
}
@@ -858,41 +837,6 @@ bool RE2::DoMatch(const StringPiece& text,
return true;
}
-// Append the "rewrite" string, with backslash subsitutions from "vec",
-// to string "out".
-bool RE2::Rewrite(string *out, const StringPiece &rewrite,
- const StringPiece *vec, int veclen) const {
- for (const char *s = rewrite.data(), *end = s + rewrite.size();
- s < end; s++) {
- if (*s != '\\') {
- out->push_back(*s);
- continue;
- }
- s++;
- int c = (s < end) ? *s : -1;
- if (isdigit(c)) {
- int n = (c - '0');
- if (n >= veclen) {
- if (options_.log_errors()) {
- LOG(ERROR) << "requested group " << n
- << " in regexp " << rewrite.data();
- }
- return false;
- }
- StringPiece snip = vec[n];
- if (snip.size() > 0)
- out->append(snip.data(), snip.size());
- } else if (c == '\\') {
- out->push_back('\\');
- } else {
- if (options_.log_errors())
- LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
- return false;
- }
- }
- return true;
-}
-
// Checks that the rewrite string is well-formed with respect to this
// regular expression.
bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
@@ -931,6 +875,60 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
return true;
}
+// Returns the maximum submatch needed for the rewrite to be done by Replace().
+// E.g. if rewrite == "foo \\2,\\1", returns 2.
+int RE2::MaxSubmatch(const StringPiece& rewrite) {
+ int max = 0;
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
+ s < end; s++) {
+ if (*s == '\\') {
+ s++;
+ int c = (s < end) ? *s : -1;
+ if (isdigit(c)) {
+ int n = (c - '0');
+ if (n > max)
+ max = n;
+ }
+ }
+ }
+ return max;
+}
+
+// Append the "rewrite" string, with backslash subsitutions from "vec",
+// to string "out".
+bool RE2::Rewrite(string* out, const StringPiece& rewrite,
+ const StringPiece* vec, int veclen) const {
+ for (const char *s = rewrite.data(), *end = s + rewrite.size();
+ s < end; s++) {
+ if (*s != '\\') {
+ out->push_back(*s);
+ continue;
+ }
+ s++;
+ int c = (s < end) ? *s : -1;
+ if (isdigit(c)) {
+ int n = (c - '0');
+ if (n >= veclen) {
+ if (options_.log_errors()) {
+ LOG(ERROR) << "requested group " << n
+ << " in regexp " << rewrite.data();
+ }
+ return false;
+ }
+ StringPiece snip = vec[n];
+ if (snip.size() > 0)
+ out->append(snip.data(), snip.size());
+ } else if (c == '\\') {
+ out->push_back('\\');
+ } else {
+ if (options_.log_errors())
+ LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
+ return false;
+ }
+ }
+ return true;
+}
+
/***** Parsers for various types *****/
bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
diff --git a/re2/re2.h b/re2/re2.h
index 120a34c..9307704 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -523,8 +523,8 @@ class RE2 {
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be sucessful.
- bool Rewrite(string *out,
- const StringPiece &rewrite,
+ bool Rewrite(string* out,
+ const StringPiece& rewrite,
const StringPiece* vec,
int veclen) const;
diff --git a/re2/set.cc b/re2/set.cc
index 1ee431c..d04a487 100644
--- a/re2/set.cc
+++ b/re2/set.cc
@@ -5,6 +5,8 @@
#include "re2/set.h"
#include <stddef.h>
+#include <algorithm>
+#include <memory>
#include "util/util.h"
#include "util/logging.h"
@@ -17,26 +19,27 @@ namespace re2 {
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
options_.Copy(options);
+ options_.set_never_capture(true); // might unblock some optimisations
anchor_ = anchor;
prog_ = NULL;
compiled_ = false;
+ size_ = 0;
}
RE2::Set::~Set() {
- for (size_t i = 0; i < re_.size(); i++)
- re_[i]->Decref();
+ for (size_t i = 0; i < elem_.size(); i++)
+ elem_[i].second->Decref();
delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
- LOG(DFATAL) << "RE2::Set::Add after Compile";
+ LOG(DFATAL) << "RE2::Set::Add() called after compiling";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
-
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
@@ -48,7 +51,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
}
// Concatenate with match index and push on vector.
- int n = static_cast<int>(re_.size());
+ int n = static_cast<int>(elem_.size());
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
@@ -65,45 +68,54 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
- re_.push_back(re);
+ elem_.emplace_back(pattern.ToString(), re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
- LOG(DFATAL) << "RE2::Set::Compile multiple times";
+ LOG(DFATAL) << "RE2::Set::Compile() called more than once";
return false;
}
compiled_ = true;
+ size_ = static_cast<int>(elem_.size());
+
+ // Sort the elements by their patterns. This is good enough for now
+ // until we have a Regexp comparison function. (Maybe someday...)
+ std::sort(elem_.begin(), elem_.end(),
+ [](const Elem& a, const Elem& b) -> bool {
+ return a.first < b.first;
+ });
+
+ re2::Regexp** sub = new re2::Regexp*[size_];
+ for (size_t i = 0; i < elem_.size(); i++)
+ sub[i] = elem_[i].second;
+ elem_.clear();
+ elem_.shrink_to_fit();
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
- re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(re_.data()),
- static_cast<int>(re_.size()), pf);
- re_.clear();
- re2::Regexp* sre = re->Simplify();
- re->Decref();
- re = sre;
- if (re == NULL) {
- if (options_.log_errors())
- LOG(ERROR) << "Error simplifying during Compile.";
- return false;
- }
+ re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
+ delete[] sub;
- prog_ = Prog::CompileSet(options_, anchor_, re);
+ prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
+ re->Decref();
return prog_ != NULL;
}
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
if (!compiled_) {
- LOG(DFATAL) << "RE2::Set::Match without Compile";
+ LOG(DFATAL) << "RE2::Set::Match() called before compiling";
return false;
}
- if (v != NULL)
- v->clear();
bool dfa_failed = false;
- bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
- Prog::kManyMatch, NULL, &dfa_failed, v);
+ std::unique_ptr<SparseSet> matches;
+ if (v != NULL) {
+ matches.reset(new SparseSet(size_));
+ v->clear();
+ }
+ bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
+ NULL, &dfa_failed, matches.get());
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
@@ -113,9 +125,12 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
}
if (ret == false)
return false;
- if (v != NULL && v->empty()) {
- LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
- return false;
+ if (v != NULL) {
+ if (matches->empty()) {
+ LOG(DFATAL) << "RE2::Set::Match() matched, but matches unknown";
+ return false;
+ }
+ v->assign(matches->begin(), matches->end());
}
return true;
}
diff --git a/re2/set.h b/re2/set.h
index c31d500..f97d8a5 100644
--- a/re2/set.h
+++ b/re2/set.h
@@ -6,6 +6,7 @@
#define RE2_SET_H_
#include <string>
+#include <utility>
#include <vector>
#include "re2/re2.h"
@@ -45,11 +46,14 @@ class RE2::Set {
bool Match(const StringPiece& text, std::vector<int>* v) const;
private:
+ typedef std::pair<string, re2::Regexp*> Elem;
+
RE2::Options options_;
RE2::Anchor anchor_;
- std::vector<re2::Regexp*> re_;
+ std::vector<Elem> elem_;
re2::Prog* prog_;
bool compiled_;
+ int size_;
Set(const Set&) = delete;
Set& operator=(const Set&) = delete;
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index ec80fc4..1e6929d 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -12,7 +12,10 @@
namespace re2 {
-static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30);
+// In the past, we used 1<<30 here and zeroed the bit later, but that
+// has undefined behaviour, so now we use an internal-only flag because
+// otherwise we would have to introduce a new flag value just for this.
+static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
struct Test {
const char* regexp;
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index efd8450..8b82e0b 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -1544,4 +1544,34 @@ BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20);
#endif
BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20);
+void PossibleMatchRangeCommon(int iter, const char* regexp) {
+ StopBenchmarkTiming();
+ RE2 re(regexp);
+ StartBenchmarkTiming();
+ string min;
+ string max;
+ const int kMaxLen = 16;
+ for (int i = 0; i < iter; i++) {
+ CHECK(re.PossibleMatchRange(&min, &max, kMaxLen));
+ }
+}
+
+void PossibleMatchRange_Trivial(int i) {
+ PossibleMatchRangeCommon(i, ".*");
+}
+void PossibleMatchRange_Complex(int i) {
+ PossibleMatchRangeCommon(i, "^abc[def]?[gh]{1,2}.*");
+}
+void PossibleMatchRange_Prefix(int i) {
+ PossibleMatchRangeCommon(i, "^some_random_prefix.*");
+}
+void PossibleMatchRange_NoProg(int i) {
+ PossibleMatchRangeCommon(i, "^some_random_string$");
+}
+
+BENCHMARK(PossibleMatchRange_Trivial);
+BENCHMARK(PossibleMatchRange_Complex);
+BENCHMARK(PossibleMatchRange_Prefix);
+BENCHMARK(PossibleMatchRange_NoProg);
+
} // namespace re2
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index b6a24de..54aa106 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -78,11 +78,38 @@ TEST(Set, UnanchoredDollar) {
CHECK_EQ(s.Compile(), true);
CHECK_EQ(s.Match("foo", NULL), true);
+ CHECK_EQ(s.Match("foobar", NULL), false);
std::vector<int> v;
CHECK_EQ(s.Match("foo", &v), true);
CHECK_EQ(v.size(), 1);
CHECK_EQ(v[0], 0);
+
+ CHECK_EQ(s.Match("foobar", &v), false);
+ CHECK_EQ(v.size(), 0);
+}
+
+TEST(Set, UnanchoredWordBoundary) {
+ RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
+
+ CHECK_EQ(s.Add("foo\\b", NULL), 0);
+ CHECK_EQ(s.Compile(), true);
+
+ CHECK_EQ(s.Match("foo", NULL), true);
+ CHECK_EQ(s.Match("foobar", NULL), false);
+ CHECK_EQ(s.Match("foo bar", NULL), true);
+
+ std::vector<int> v;
+ CHECK_EQ(s.Match("foo", &v), true);
+ CHECK_EQ(v.size(), 1);
+ CHECK_EQ(v[0], 0);
+
+ CHECK_EQ(s.Match("foobar", &v), false);
+ CHECK_EQ(v.size(), 0);
+
+ CHECK_EQ(s.Match("foo bar", &v), true);
+ CHECK_EQ(v.size(), 1);
+ CHECK_EQ(v[0], 0);
}
TEST(Set, Anchored) {
diff --git a/re2/unicode.py b/re2/unicode.py
index 4b2240c..2899c87 100644
--- a/re2/unicode.py
+++ b/re2/unicode.py
@@ -9,7 +9,7 @@ import re
import urllib2
# Directory or URL where Unicode tables reside.
-_UNICODE_DIR = "http://www.unicode.org/Public/8.0.0/ucd"
+_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF
diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc
index 91a96b4..1686943 100644
--- a/re2/unicode_casefold.cc
+++ b/re2/unicode_casefold.cc
@@ -7,7 +7,7 @@
namespace re2 {
-// 1224 groups, 2469 pairs, 314 ranges
+// 1295 groups, 2620 pairs, 343 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
@@ -113,6 +113,7 @@ const CaseFold unicode_casefold[] = {
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
+ { 618, 618, 42308 },
{ 619, 619, 10743 },
{ 620, 620, 42305 },
{ 623, 623, -211 },
@@ -184,9 +185,21 @@ const CaseFold unicode_casefold[] = {
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
- { 1072, 1103, -32 },
+ { 1072, 1073, -32 },
+ { 1074, 1074, 6222 },
+ { 1075, 1075, -32 },
+ { 1076, 1076, 6221 },
+ { 1077, 1085, -32 },
+ { 1086, 1086, 6212 },
+ { 1087, 1088, -32 },
+ { 1089, 1090, 6210 },
+ { 1091, 1097, -32 },
+ { 1098, 1098, 6204 },
+ { 1099, 1103, -32 },
{ 1104, 1119, -80 },
- { 1120, 1153, EvenOdd },
+ { 1120, 1122, EvenOdd },
+ { 1123, 1123, 6180 },
+ { 1124, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
@@ -200,6 +213,15 @@ const CaseFold unicode_casefold[] = {
{ 5024, 5103, 38864 },
{ 5104, 5109, 8 },
{ 5112, 5117, -8 },
+ { 7296, 7296, -6254 },
+ { 7297, 7297, -6253 },
+ { 7298, 7298, -6244 },
+ { 7299, 7299, -6242 },
+ { 7300, 7300, EvenOdd },
+ { 7301, 7301, -6243 },
+ { 7302, 7302, -6236 },
+ { 7303, 7303, -6181 },
+ { 7304, 7304, 35266 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7680, 7776, EvenOdd },
@@ -293,7 +315,9 @@ const CaseFold unicode_casefold[] = {
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
- { 42560, 42605, EvenOdd },
+ { 42560, 42570, EvenOdd },
+ { 42571, 42571, -35267 },
+ { 42572, 42605, EvenOdd },
{ 42624, 42651, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
@@ -308,6 +332,7 @@ const CaseFold unicode_casefold[] = {
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
+ { 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
@@ -319,14 +344,18 @@ const CaseFold unicode_casefold[] = {
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
+ { 66736, 66771, 40 },
+ { 66776, 66811, -40 },
{ 68736, 68786, 64 },
{ 68800, 68850, -64 },
{ 71840, 71871, 32 },
{ 71872, 71903, -32 },
+ { 125184, 125217, 34 },
+ { 125218, 125251, -34 },
};
-const int num_unicode_casefold = 314;
+const int num_unicode_casefold = 343;
-// 1224 groups, 1245 pairs, 180 ranges
+// 1295 groups, 1325 pairs, 191 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
@@ -429,6 +458,14 @@ const CaseFold unicode_tolower[] = {
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5112, 5117, -8 },
+ { 7296, 7296, -6222 },
+ { 7297, 7297, -6221 },
+ { 7298, 7298, -6212 },
+ { 7299, 7300, -6210 },
+ { 7301, 7301, -6211 },
+ { 7302, 7302, -6204 },
+ { 7303, 7303, -6180 },
+ { 7304, 7304, 35267 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
@@ -498,6 +535,7 @@ const CaseFold unicode_tolower[] = {
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
+ { 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
@@ -506,10 +544,12 @@ const CaseFold unicode_tolower[] = {
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
+ { 66736, 66771, 40 },
{ 68736, 68786, 64 },
{ 71840, 71871, 32 },
+ { 125184, 125217, 34 },
};
-const int num_unicode_tolower = 180;
+const int num_unicode_tolower = 191;
diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc
index 59087bc..ba104d2 100644
--- a/re2/unicode_groups.cc
+++ b/re2/unicode_groups.cc
@@ -108,7 +108,8 @@ static const URange16 No_range16[] = {
{ 2930, 2935 },
{ 3056, 3058 },
{ 3192, 3198 },
- { 3440, 3445 },
+ { 3416, 3422 },
+ { 3440, 3448 },
{ 3882, 3891 },
{ 4969, 4988 },
{ 6128, 6137 },
@@ -157,6 +158,7 @@ static const URange32 No_range32[] = {
{ 70113, 70132 },
{ 71482, 71483 },
{ 71914, 71922 },
+ { 72794, 72812 },
{ 93019, 93025 },
{ 119648, 119665 },
{ 125127, 125135 },
@@ -185,7 +187,9 @@ static const URange16 Lo_range16[] = {
{ 1994, 2026 },
{ 2048, 2069 },
{ 2112, 2136 },
+ { 2144, 2154 },
{ 2208, 2228 },
+ { 2230, 2237 },
{ 2308, 2361 },
{ 2365, 2365 },
{ 2384, 2384 },
@@ -202,6 +206,7 @@ static const URange16 Lo_range16[] = {
{ 2524, 2525 },
{ 2527, 2529 },
{ 2544, 2545 },
+ { 2556, 2556 },
{ 2565, 2570 },
{ 2575, 2576 },
{ 2579, 2600 },
@@ -250,6 +255,7 @@ static const URange16 Lo_range16[] = {
{ 3133, 3133 },
{ 3160, 3162 },
{ 3168, 3169 },
+ { 3200, 3200 },
{ 3205, 3212 },
{ 3214, 3216 },
{ 3218, 3240 },
@@ -264,6 +270,7 @@ static const URange16 Lo_range16[] = {
{ 3346, 3386 },
{ 3389, 3389 },
{ 3406, 3406 },
+ { 3412, 3414 },
{ 3423, 3425 },
{ 3450, 3455 },
{ 3461, 3478 },
@@ -336,7 +343,8 @@ static const URange16 Lo_range16[] = {
{ 6108, 6108 },
{ 6176, 6210 },
{ 6212, 6263 },
- { 6272, 6312 },
+ { 6272, 6276 },
+ { 6279, 6312 },
{ 6314, 6314 },
{ 6320, 6389 },
{ 6400, 6430 },
@@ -374,12 +382,12 @@ static const URange16 Lo_range16[] = {
{ 12447, 12447 },
{ 12449, 12538 },
{ 12543, 12543 },
- { 12549, 12589 },
+ { 12549, 12590 },
{ 12593, 12686 },
{ 12704, 12730 },
{ 12784, 12799 },
{ 13312, 19893 },
- { 19968, 40917 },
+ { 19968, 40938 },
{ 40960, 40980 },
{ 40982, 42124 },
{ 42192, 42231 },
@@ -465,7 +473,7 @@ static const URange32 Lo_range32[] = {
{ 66176, 66204 },
{ 66208, 66256 },
{ 66304, 66335 },
- { 66352, 66368 },
+ { 66349, 66368 },
{ 66370, 66377 },
{ 66384, 66421 },
{ 66432, 66461 },
@@ -531,6 +539,8 @@ static const URange32 Lo_range32[] = {
{ 70461, 70461 },
{ 70480, 70480 },
{ 70493, 70497 },
+ { 70656, 70708 },
+ { 70727, 70730 },
{ 70784, 70831 },
{ 70852, 70853 },
{ 70855, 70855 },
@@ -541,7 +551,21 @@ static const URange32 Lo_range32[] = {
{ 71296, 71338 },
{ 71424, 71449 },
{ 71935, 71935 },
+ { 72192, 72192 },
+ { 72203, 72242 },
+ { 72250, 72250 },
+ { 72272, 72272 },
+ { 72284, 72323 },
+ { 72326, 72329 },
{ 72384, 72440 },
+ { 72704, 72712 },
+ { 72714, 72750 },
+ { 72768, 72768 },
+ { 72818, 72847 },
+ { 72960, 72966 },
+ { 72968, 72969 },
+ { 72971, 73008 },
+ { 73030, 73030 },
{ 73728, 74649 },
{ 74880, 75075 },
{ 77824, 78894 },
@@ -554,7 +578,10 @@ static const URange32 Lo_range32[] = {
{ 93053, 93071 },
{ 93952, 94020 },
{ 94032, 94032 },
- { 110592, 110593 },
+ { 94208, 100332 },
+ { 100352, 101106 },
+ { 110592, 110878 },
+ { 110960, 111355 },
{ 113664, 113770 },
{ 113776, 113788 },
{ 113792, 113800 },
@@ -597,6 +624,7 @@ static const URange32 Lo_range32[] = {
{ 173824, 177972 },
{ 177984, 178205 },
{ 178208, 183969 },
+ { 183984, 191456 },
{ 194560, 195101 },
};
static const URange16 Ll_range16[] = {
@@ -871,6 +899,7 @@ static const URange16 Ll_range16[] = {
{ 1327, 1327 },
{ 1377, 1415 },
{ 5112, 5117 },
+ { 7296, 7304 },
{ 7424, 7467 },
{ 7531, 7543 },
{ 7545, 7578 },
@@ -1202,6 +1231,7 @@ static const URange16 Ll_range16[] = {
};
static const URange32 Ll_range32[] = {
{ 66600, 66639 },
+ { 66776, 66811 },
{ 68800, 68850 },
{ 71872, 71903 },
{ 119834, 119859 },
@@ -1232,6 +1262,7 @@ static const URange32 Ll_range32[] = {
{ 120746, 120770 },
{ 120772, 120777 },
{ 120779, 120779 },
+ { 125218, 125251 },
};
static const URange16 Lm_range16[] = {
{ 688, 705 },
@@ -1292,6 +1323,7 @@ static const URange16 Lm_range16[] = {
static const URange32 Lm_range32[] = {
{ 92992, 92995 },
{ 94099, 94111 },
+ { 94176, 94177 },
};
static const URange16 Nd_range16[] = {
{ 48, 57 },
@@ -1339,14 +1371,18 @@ static const URange32 Nd_range32[] = {
{ 69942, 69951 },
{ 70096, 70105 },
{ 70384, 70393 },
+ { 70736, 70745 },
{ 70864, 70873 },
{ 71248, 71257 },
{ 71360, 71369 },
{ 71472, 71481 },
{ 71904, 71913 },
+ { 72784, 72793 },
+ { 73040, 73049 },
{ 92768, 92777 },
{ 93008, 93017 },
{ 120782, 120831 },
+ { 125264, 125273 },
};
static const URange16 Pc_range16[] = {
{ 95, 95 },
@@ -1956,13 +1992,14 @@ static const URange16 Lu_range16[] = {
{ 42916, 42916 },
{ 42918, 42918 },
{ 42920, 42920 },
- { 42922, 42925 },
+ { 42922, 42926 },
{ 42928, 42932 },
{ 42934, 42934 },
{ 65313, 65338 },
};
static const URange32 Lu_range32[] = {
{ 66560, 66599 },
+ { 66736, 66771 },
{ 68736, 68786 },
{ 71840, 71871 },
{ 119808, 119833 },
@@ -1996,6 +2033,7 @@ static const URange32 Lu_range32[] = {
{ 120662, 120686 },
{ 120720, 120744 },
{ 120778, 120778 },
+ { 125184, 125217 },
};
static const URange16 Pf_range16[] = {
{ 187, 187 },
@@ -2148,6 +2186,7 @@ static const URange16 Po_range16[] = {
{ 2142, 2142 },
{ 2404, 2405 },
{ 2416, 2416 },
+ { 2557, 2557 },
{ 2800, 2800 },
{ 3572, 3572 },
{ 3663, 3663 },
@@ -2199,6 +2238,7 @@ static const URange16 Po_range16[] = {
{ 11824, 11833 },
{ 11836, 11839 },
{ 11841, 11841 },
+ { 11843, 11849 },
{ 12289, 12291 },
{ 12349, 12349 },
{ 12539, 12539 },
@@ -2264,10 +2304,19 @@ static const URange32 Po_range32[] = {
{ 70109, 70111 },
{ 70200, 70205 },
{ 70313, 70313 },
+ { 70731, 70735 },
+ { 70747, 70747 },
+ { 70749, 70749 },
{ 70854, 70854 },
{ 71105, 71127 },
{ 71233, 71235 },
+ { 71264, 71276 },
{ 71484, 71486 },
+ { 72255, 72262 },
+ { 72346, 72348 },
+ { 72350, 72354 },
+ { 72769, 72773 },
+ { 72816, 72817 },
{ 74864, 74868 },
{ 92782, 92783 },
{ 92917, 92917 },
@@ -2275,6 +2324,7 @@ static const URange32 Po_range32[] = {
{ 92996, 92996 },
{ 113823, 113823 },
{ 121479, 121483 },
+ { 125278, 125279 },
};
static const URange16 Me_range16[] = {
{ 1160, 1161 },
@@ -2291,6 +2341,7 @@ static const URange16 C_range16[] = {
{ 1564, 1564 },
{ 1757, 1757 },
{ 1807, 1807 },
+ { 2274, 2274 },
{ 6158, 6158 },
{ 8203, 8207 },
{ 8234, 8238 },
@@ -2397,6 +2448,7 @@ static const URange16 Mc_range16[] = {
{ 7220, 7221 },
{ 7393, 7393 },
{ 7410, 7411 },
+ { 7415, 7415 },
{ 12334, 12335 },
{ 43043, 43044 },
{ 43047, 43047 },
@@ -2441,6 +2493,9 @@ static const URange32 Mc_range32[] = {
{ 70475, 70477 },
{ 70487, 70487 },
{ 70498, 70499 },
+ { 70709, 70711 },
+ { 70720, 70721 },
+ { 70725, 70725 },
{ 70832, 70834 },
{ 70841, 70841 },
{ 70843, 70846 },
@@ -2456,6 +2511,15 @@ static const URange32 Mc_range32[] = {
{ 71350, 71350 },
{ 71456, 71457 },
{ 71462, 71462 },
+ { 72199, 72200 },
+ { 72249, 72249 },
+ { 72279, 72280 },
+ { 72343, 72343 },
+ { 72751, 72751 },
+ { 72766, 72766 },
+ { 72873, 72873 },
+ { 72881, 72881 },
+ { 72884, 72884 },
{ 94033, 94078 },
{ 119141, 119142 },
{ 119149, 119154 },
@@ -2484,6 +2548,7 @@ static const URange16 Mn_range16[] = {
{ 2085, 2087 },
{ 2089, 2093 },
{ 2137, 2139 },
+ { 2260, 2273 },
{ 2275, 2306 },
{ 2362, 2362 },
{ 2364, 2364 },
@@ -2510,6 +2575,7 @@ static const URange16 Mn_range16[] = {
{ 2759, 2760 },
{ 2765, 2765 },
{ 2786, 2787 },
+ { 2810, 2815 },
{ 2817, 2817 },
{ 2876, 2876 },
{ 2879, 2879 },
@@ -2532,7 +2598,8 @@ static const URange16 Mn_range16[] = {
{ 3270, 3270 },
{ 3276, 3277 },
{ 3298, 3299 },
- { 3329, 3329 },
+ { 3328, 3329 },
+ { 3387, 3388 },
{ 3393, 3396 },
{ 3405, 3405 },
{ 3426, 3427 },
@@ -2578,6 +2645,7 @@ static const URange16 Mn_range16[] = {
{ 6089, 6099 },
{ 6109, 6109 },
{ 6155, 6157 },
+ { 6277, 6278 },
{ 6313, 6313 },
{ 6432, 6434 },
{ 6439, 6440 },
@@ -2615,8 +2683,8 @@ static const URange16 Mn_range16[] = {
{ 7405, 7405 },
{ 7412, 7412 },
{ 7416, 7417 },
- { 7616, 7669 },
- { 7676, 7679 },
+ { 7616, 7673 },
+ { 7675, 7679 },
{ 8400, 8412 },
{ 8417, 8417 },
{ 8421, 8432 },
@@ -2633,7 +2701,7 @@ static const URange16 Mn_range16[] = {
{ 43014, 43014 },
{ 43019, 43019 },
{ 43045, 43046 },
- { 43204, 43204 },
+ { 43204, 43205 },
{ 43232, 43249 },
{ 43302, 43309 },
{ 43335, 43345 },
@@ -2687,6 +2755,7 @@ static const URange32 Mn_range32[] = {
{ 70191, 70193 },
{ 70196, 70196 },
{ 70198, 70199 },
+ { 70206, 70206 },
{ 70367, 70367 },
{ 70371, 70378 },
{ 70400, 70401 },
@@ -2694,6 +2763,9 @@ static const URange32 Mn_range32[] = {
{ 70464, 70464 },
{ 70502, 70508 },
{ 70512, 70516 },
+ { 70712, 70719 },
+ { 70722, 70724 },
+ { 70726, 70726 },
{ 70835, 70840 },
{ 70842, 70842 },
{ 70847, 70848 },
@@ -2712,6 +2784,27 @@ static const URange32 Mn_range32[] = {
{ 71453, 71455 },
{ 71458, 71461 },
{ 71463, 71467 },
+ { 72193, 72198 },
+ { 72201, 72202 },
+ { 72243, 72248 },
+ { 72251, 72254 },
+ { 72263, 72263 },
+ { 72273, 72278 },
+ { 72281, 72283 },
+ { 72330, 72342 },
+ { 72344, 72345 },
+ { 72752, 72758 },
+ { 72760, 72765 },
+ { 72767, 72767 },
+ { 72850, 72871 },
+ { 72874, 72880 },
+ { 72882, 72883 },
+ { 72885, 72886 },
+ { 73009, 73014 },
+ { 73018, 73018 },
+ { 73020, 73021 },
+ { 73023, 73029 },
+ { 73031, 73031 },
{ 92912, 92916 },
{ 92976, 92982 },
{ 94095, 94098 },
@@ -2727,7 +2820,13 @@ static const URange32 Mn_range32[] = {
{ 121476, 121476 },
{ 121499, 121503 },
{ 121505, 121519 },
+ { 122880, 122886 },
+ { 122888, 122904 },
+ { 122907, 122913 },
+ { 122915, 122916 },
+ { 122918, 122922 },
{ 125136, 125142 },
+ { 125252, 125258 },
{ 917760, 917999 },
};
static const URange16 M_range16[] = {
@@ -2754,6 +2853,7 @@ static const URange16 M_range16[] = {
{ 2085, 2087 },
{ 2089, 2093 },
{ 2137, 2139 },
+ { 2260, 2273 },
{ 2275, 2307 },
{ 2362, 2364 },
{ 2366, 2383 },
@@ -2780,6 +2880,7 @@ static const URange16 M_range16[] = {
{ 2759, 2761 },
{ 2763, 2765 },
{ 2786, 2787 },
+ { 2810, 2815 },
{ 2817, 2819 },
{ 2876, 2876 },
{ 2878, 2884 },
@@ -2805,7 +2906,8 @@ static const URange16 M_range16[] = {
{ 3274, 3277 },
{ 3285, 3286 },
{ 3298, 3299 },
- { 3329, 3331 },
+ { 3328, 3331 },
+ { 3387, 3388 },
{ 3390, 3396 },
{ 3398, 3400 },
{ 3402, 3405 },
@@ -2851,6 +2953,7 @@ static const URange16 M_range16[] = {
{ 6068, 6099 },
{ 6109, 6109 },
{ 6155, 6157 },
+ { 6277, 6278 },
{ 6313, 6313 },
{ 6432, 6443 },
{ 6448, 6459 },
@@ -2870,9 +2973,9 @@ static const URange16 M_range16[] = {
{ 7380, 7400 },
{ 7405, 7405 },
{ 7410, 7412 },
- { 7416, 7417 },
- { 7616, 7669 },
- { 7676, 7679 },
+ { 7415, 7417 },
+ { 7616, 7673 },
+ { 7675, 7679 },
{ 8400, 8432 },
{ 11503, 11505 },
{ 11647, 11647 },
@@ -2888,7 +2991,7 @@ static const URange16 M_range16[] = {
{ 43019, 43019 },
{ 43043, 43047 },
{ 43136, 43137 },
- { 43188, 43204 },
+ { 43188, 43205 },
{ 43232, 43249 },
{ 43302, 43309 },
{ 43335, 43347 },
@@ -2933,6 +3036,7 @@ static const URange32 M_range32[] = {
{ 70067, 70080 },
{ 70090, 70092 },
{ 70188, 70199 },
+ { 70206, 70206 },
{ 70367, 70378 },
{ 70400, 70403 },
{ 70460, 70460 },
@@ -2943,6 +3047,7 @@ static const URange32 M_range32[] = {
{ 70498, 70499 },
{ 70502, 70508 },
{ 70512, 70516 },
+ { 70709, 70726 },
{ 70832, 70851 },
{ 71087, 71093 },
{ 71096, 71104 },
@@ -2950,6 +3055,21 @@ static const URange32 M_range32[] = {
{ 71216, 71232 },
{ 71339, 71351 },
{ 71453, 71467 },
+ { 72193, 72202 },
+ { 72243, 72249 },
+ { 72251, 72254 },
+ { 72263, 72263 },
+ { 72273, 72283 },
+ { 72330, 72345 },
+ { 72751, 72758 },
+ { 72760, 72767 },
+ { 72850, 72871 },
+ { 72873, 72886 },
+ { 73009, 73014 },
+ { 73018, 73018 },
+ { 73020, 73021 },
+ { 73023, 73029 },
+ { 73031, 73031 },
{ 92912, 92916 },
{ 92976, 92982 },
{ 94033, 94078 },
@@ -2967,7 +3087,13 @@ static const URange32 M_range32[] = {
{ 121476, 121476 },
{ 121499, 121503 },
{ 121505, 121519 },
+ { 122880, 122886 },
+ { 122888, 122904 },
+ { 122907, 122913 },
+ { 122915, 122916 },
+ { 122918, 122922 },
{ 125136, 125142 },
+ { 125252, 125258 },
{ 917760, 917999 },
};
static const URange16 L_range16[] = {
@@ -3019,7 +3145,9 @@ static const URange16 L_range16[] = {
{ 2084, 2084 },
{ 2088, 2088 },
{ 2112, 2136 },
+ { 2144, 2154 },
{ 2208, 2228 },
+ { 2230, 2237 },
{ 2308, 2361 },
{ 2365, 2365 },
{ 2384, 2384 },
@@ -3036,6 +3164,7 @@ static const URange16 L_range16[] = {
{ 2524, 2525 },
{ 2527, 2529 },
{ 2544, 2545 },
+ { 2556, 2556 },
{ 2565, 2570 },
{ 2575, 2576 },
{ 2579, 2600 },
@@ -3084,6 +3213,7 @@ static const URange16 L_range16[] = {
{ 3133, 3133 },
{ 3160, 3162 },
{ 3168, 3169 },
+ { 3200, 3200 },
{ 3205, 3212 },
{ 3214, 3216 },
{ 3218, 3240 },
@@ -3098,6 +3228,7 @@ static const URange16 L_range16[] = {
{ 3346, 3386 },
{ 3389, 3389 },
{ 3406, 3406 },
+ { 3412, 3414 },
{ 3423, 3425 },
{ 3450, 3455 },
{ 3461, 3478 },
@@ -3176,7 +3307,8 @@ static const URange16 L_range16[] = {
{ 6103, 6103 },
{ 6108, 6108 },
{ 6176, 6263 },
- { 6272, 6312 },
+ { 6272, 6276 },
+ { 6279, 6312 },
{ 6314, 6314 },
{ 6320, 6389 },
{ 6400, 6430 },
@@ -3195,6 +3327,7 @@ static const URange16 L_range16[] = {
{ 7168, 7203 },
{ 7245, 7247 },
{ 7258, 7293 },
+ { 7296, 7304 },
{ 7401, 7404 },
{ 7406, 7409 },
{ 7413, 7414 },
@@ -3262,12 +3395,12 @@ static const URange16 L_range16[] = {
{ 12445, 12447 },
{ 12449, 12538 },
{ 12540, 12543 },
- { 12549, 12589 },
+ { 12549, 12590 },
{ 12593, 12686 },
{ 12704, 12730 },
{ 12784, 12799 },
{ 13312, 19893 },
- { 19968, 40917 },
+ { 19968, 40938 },
{ 40960, 42124 },
{ 42192, 42237 },
{ 42240, 42508 },
@@ -3278,7 +3411,7 @@ static const URange16 L_range16[] = {
{ 42656, 42725 },
{ 42775, 42783 },
{ 42786, 42888 },
- { 42891, 42925 },
+ { 42891, 42926 },
{ 42928, 42935 },
{ 42999, 43009 },
{ 43011, 43013 },
@@ -3359,13 +3492,15 @@ static const URange32 L_range32[] = {
{ 66176, 66204 },
{ 66208, 66256 },
{ 66304, 66335 },
- { 66352, 66368 },
+ { 66349, 66368 },
{ 66370, 66377 },
{ 66384, 66421 },
{ 66432, 66461 },
{ 66464, 66499 },
{ 66504, 66511 },
{ 66560, 66717 },
+ { 66736, 66771 },
+ { 66776, 66811 },
{ 66816, 66855 },
{ 66864, 66915 },
{ 67072, 67382 },
@@ -3427,6 +3562,8 @@ static const URange32 L_range32[] = {
{ 70461, 70461 },
{ 70480, 70480 },
{ 70493, 70497 },
+ { 70656, 70708 },
+ { 70727, 70730 },
{ 70784, 70831 },
{ 70852, 70853 },
{ 70855, 70855 },
@@ -3438,7 +3575,21 @@ static const URange32 L_range32[] = {
{ 71424, 71449 },
{ 71840, 71903 },
{ 71935, 71935 },
+ { 72192, 72192 },
+ { 72203, 72242 },
+ { 72250, 72250 },
+ { 72272, 72272 },
+ { 72284, 72323 },
+ { 72326, 72329 },
{ 72384, 72440 },
+ { 72704, 72712 },
+ { 72714, 72750 },
+ { 72768, 72768 },
+ { 72818, 72847 },
+ { 72960, 72966 },
+ { 72968, 72969 },
+ { 72971, 73008 },
+ { 73030, 73030 },
{ 73728, 74649 },
{ 74880, 75075 },
{ 77824, 78894 },
@@ -3453,7 +3604,11 @@ static const URange32 L_range32[] = {
{ 93952, 94020 },
{ 94032, 94032 },
{ 94099, 94111 },
- { 110592, 110593 },
+ { 94176, 94177 },
+ { 94208, 100332 },
+ { 100352, 101106 },
+ { 110592, 110878 },
+ { 110960, 111355 },
{ 113664, 113770 },
{ 113776, 113788 },
{ 113792, 113800 },
@@ -3489,6 +3644,7 @@ static const URange32 L_range32[] = {
{ 120746, 120770 },
{ 120772, 120779 },
{ 124928, 125124 },
+ { 125184, 125251 },
{ 126464, 126467 },
{ 126469, 126495 },
{ 126497, 126498 },
@@ -3526,6 +3682,7 @@ static const URange32 L_range32[] = {
{ 173824, 177972 },
{ 177984, 178205 },
{ 178208, 183969 },
+ { 183984, 191456 },
{ 194560, 195101 },
};
static const URange16 N_range16[] = {
@@ -3547,7 +3704,8 @@ static const URange16 N_range16[] = {
{ 3174, 3183 },
{ 3192, 3198 },
{ 3302, 3311 },
- { 3430, 3445 },
+ { 3416, 3422 },
+ { 3430, 3448 },
{ 3558, 3567 },
{ 3664, 3673 },
{ 3792, 3801 },
@@ -3629,11 +3787,14 @@ static const URange32 N_range32[] = {
{ 70096, 70105 },
{ 70113, 70132 },
{ 70384, 70393 },
+ { 70736, 70745 },
{ 70864, 70873 },
{ 71248, 71257 },
{ 71360, 71369 },
{ 71472, 71483 },
{ 71904, 71922 },
+ { 72784, 72812 },
+ { 73040, 73049 },
{ 74752, 74862 },
{ 92768, 92777 },
{ 93008, 93017 },
@@ -3641,6 +3802,7 @@ static const URange32 N_range32[] = {
{ 119648, 119665 },
{ 120782, 120831 },
{ 125127, 125135 },
+ { 125264, 125273 },
{ 127232, 127244 },
};
static const URange16 Sk_range16[] = {
@@ -3713,6 +3875,7 @@ static const URange16 P_range16[] = {
{ 2142, 2142 },
{ 2404, 2405 },
{ 2416, 2416 },
+ { 2557, 2557 },
{ 2800, 2800 },
{ 3572, 3572 },
{ 3663, 3663 },
@@ -3762,7 +3925,7 @@ static const URange16 P_range16[] = {
{ 11518, 11519 },
{ 11632, 11632 },
{ 11776, 11822 },
- { 11824, 11842 },
+ { 11824, 11849 },
{ 12289, 12291 },
{ 12296, 12305 },
{ 12308, 12319 },
@@ -3829,10 +3992,19 @@ static const URange32 P_range32[] = {
{ 70109, 70111 },
{ 70200, 70205 },
{ 70313, 70313 },
+ { 70731, 70735 },
+ { 70747, 70747 },
+ { 70749, 70749 },
{ 70854, 70854 },
{ 71105, 71127 },
{ 71233, 71235 },
+ { 71264, 71276 },
{ 71484, 71486 },
+ { 72255, 72262 },
+ { 72346, 72348 },
+ { 72350, 72354 },
+ { 72769, 72773 },
+ { 72816, 72817 },
{ 74864, 74868 },
{ 92782, 92783 },
{ 92917, 92917 },
@@ -3840,6 +4012,7 @@ static const URange32 P_range32[] = {
{ 92996, 92996 },
{ 113823, 113823 },
{ 121479, 121483 },
+ { 125278, 125279 },
};
static const URange16 S_range16[] = {
{ 36, 36 },
@@ -3880,6 +4053,7 @@ static const URange16 S_range16[] = {
{ 2928, 2928 },
{ 3059, 3066 },
{ 3199, 3199 },
+ { 3407, 3407 },
{ 3449, 3449 },
{ 3647, 3647 },
{ 3841, 3843 },
@@ -3910,7 +4084,7 @@ static const URange16 S_range16[] = {
{ 8274, 8274 },
{ 8314, 8316 },
{ 8330, 8332 },
- { 8352, 8382 },
+ { 8352, 8383 },
{ 8448, 8449 },
{ 8451, 8454 },
{ 8456, 8457 },
@@ -3928,8 +4102,7 @@ static const URange16 S_range16[] = {
{ 8586, 8587 },
{ 8592, 8967 },
{ 8972, 9000 },
- { 9003, 9210 },
- { 9216, 9254 },
+ { 9003, 9254 },
{ 9280, 9290 },
{ 9372, 9449 },
{ 9472, 10087 },
@@ -3942,7 +4115,7 @@ static const URange16 S_range16[] = {
{ 11126, 11157 },
{ 11160, 11193 },
{ 11197, 11208 },
- { 11210, 11217 },
+ { 11210, 11218 },
{ 11244, 11247 },
{ 11493, 11498 },
{ 11904, 11929 },
@@ -3994,7 +4167,7 @@ static const URange16 S_range16[] = {
static const URange32 S_range32[] = {
{ 65847, 65855 },
{ 65913, 65929 },
- { 65932, 65932 },
+ { 65932, 65934 },
{ 65936, 65947 },
{ 65952, 65952 },
{ 66000, 66044 },
@@ -4038,16 +4211,15 @@ static const URange32 S_range32[] = {
{ 127185, 127221 },
{ 127248, 127278 },
{ 127280, 127339 },
- { 127344, 127386 },
+ { 127344, 127404 },
{ 127462, 127490 },
- { 127504, 127546 },
+ { 127504, 127547 },
{ 127552, 127560 },
{ 127568, 127569 },
- { 127744, 128377 },
- { 128379, 128419 },
- { 128421, 128720 },
+ { 127584, 127589 },
+ { 127744, 128724 },
{ 128736, 128748 },
- { 128752, 128755 },
+ { 128752, 128760 },
{ 128768, 128883 },
{ 128896, 128980 },
{ 129024, 129035 },
@@ -4055,9 +4227,13 @@ static const URange32 S_range32[] = {
{ 129104, 129113 },
{ 129120, 129159 },
{ 129168, 129197 },
- { 129296, 129304 },
- { 129408, 129412 },
+ { 129280, 129291 },
+ { 129296, 129342 },
+ { 129344, 129356 },
+ { 129360, 129387 },
+ { 129408, 129431 },
{ 129472, 129472 },
+ { 129488, 129510 },
};
static const URange16 So_range16[] = {
{ 166, 166 },
@@ -4076,6 +4252,7 @@ static const URange16 So_range16[] = {
{ 3059, 3064 },
{ 3066, 3066 },
{ 3199, 3199 },
+ { 3407, 3407 },
{ 3449, 3449 },
{ 3841, 3843 },
{ 3859, 3859 },
@@ -4124,8 +4301,7 @@ static const URange16 So_range16[] = {
{ 9003, 9083 },
{ 9085, 9114 },
{ 9140, 9179 },
- { 9186, 9210 },
- { 9216, 9254 },
+ { 9186, 9254 },
{ 9280, 9290 },
{ 9372, 9449 },
{ 9472, 9654 },
@@ -4141,7 +4317,7 @@ static const URange16 So_range16[] = {
{ 11126, 11157 },
{ 11160, 11193 },
{ 11197, 11208 },
- { 11210, 11217 },
+ { 11210, 11218 },
{ 11244, 11247 },
{ 11493, 11498 },
{ 11904, 11929 },
@@ -4178,7 +4354,7 @@ static const URange16 So_range16[] = {
static const URange32 So_range32[] = {
{ 65847, 65855 },
{ 65913, 65929 },
- { 65932, 65932 },
+ { 65932, 65934 },
{ 65936, 65947 },
{ 65952, 65952 },
{ 66000, 66044 },
@@ -4211,17 +4387,16 @@ static const URange32 So_range32[] = {
{ 127185, 127221 },
{ 127248, 127278 },
{ 127280, 127339 },
- { 127344, 127386 },
+ { 127344, 127404 },
{ 127462, 127490 },
- { 127504, 127546 },
+ { 127504, 127547 },
{ 127552, 127560 },
{ 127568, 127569 },
+ { 127584, 127589 },
{ 127744, 127994 },
- { 128000, 128377 },
- { 128379, 128419 },
- { 128421, 128720 },
+ { 128000, 128724 },
{ 128736, 128748 },
- { 128752, 128755 },
+ { 128752, 128760 },
{ 128768, 128883 },
{ 128896, 128980 },
{ 129024, 129035 },
@@ -4229,9 +4404,13 @@ static const URange32 So_range32[] = {
{ 129104, 129113 },
{ 129120, 129159 },
{ 129168, 129197 },
- { 129296, 129304 },
- { 129408, 129412 },
+ { 129280, 129291 },
+ { 129296, 129342 },
+ { 129344, 129356 },
+ { 129360, 129387 },
+ { 129408, 129431 },
{ 129472, 129472 },
+ { 129488, 129510 },
};
static const URange16 Sm_range16[] = {
{ 43, 43 },
@@ -4312,7 +4491,7 @@ static const URange16 Sc_range16[] = {
{ 3065, 3065 },
{ 3647, 3647 },
{ 6107, 6107 },
- { 8352, 8382 },
+ { 8352, 8383 },
{ 43064, 43064 },
{ 65020, 65020 },
{ 65129, 65129 },
@@ -4350,6 +4529,7 @@ static const URange16 Cf_range16[] = {
{ 1564, 1564 },
{ 1757, 1757 },
{ 1807, 1807 },
+ { 2274, 2274 },
{ 6158, 6158 },
{ 8203, 8207 },
{ 8234, 8238 },
@@ -4380,9 +4560,19 @@ static const URange16 Zs_range16[] = {
{ 8287, 8287 },
{ 12288, 12288 },
};
+static const URange32 Tangut_range32[] = {
+ { 94176, 94176 },
+ { 94208, 100332 },
+ { 100352, 101106 },
+};
static const URange16 Thaana_range16[] = {
{ 1920, 1969 },
};
+static const URange32 Adlam_range32[] = {
+ { 125184, 125258 },
+ { 125264, 125273 },
+ { 125278, 125279 },
+};
static const URange16 Telugu_range16[] = {
{ 3072, 3075 },
{ 3077, 3084 },
@@ -4401,12 +4591,16 @@ static const URange16 Telugu_range16[] = {
static const URange16 Cyrillic_range16[] = {
{ 1024, 1156 },
{ 1159, 1327 },
+ { 7296, 7304 },
{ 7467, 7467 },
{ 7544, 7544 },
{ 11744, 11775 },
{ 42560, 42655 },
{ 65070, 65071 },
};
+static const URange32 Zanabazar_Square_range32[] = {
+ { 72192, 72263 },
+};
static const URange16 Hangul_range16[] = {
{ 4352, 4607 },
{ 12334, 12335 },
@@ -4473,8 +4667,8 @@ static const URange16 Inherited_range16[] = {
{ 7405, 7405 },
{ 7412, 7412 },
{ 7416, 7417 },
- { 7616, 7669 },
- { 7676, 7679 },
+ { 7616, 7673 },
+ { 7675, 7679 },
{ 8204, 8205 },
{ 8400, 8432 },
{ 12330, 12333 },
@@ -4496,6 +4690,12 @@ static const URange32 Meroitic_Cursive_range32[] = {
{ 68028, 68047 },
{ 68050, 68095 },
};
+static const URange32 Bhaiksuki_range32[] = {
+ { 72704, 72712 },
+ { 72714, 72758 },
+ { 72760, 72773 },
+ { 72784, 72812 },
+};
static const URange32 Ahom_range32[] = {
{ 71424, 71449 },
{ 71453, 71467 },
@@ -4510,7 +4710,7 @@ static const URange16 Han_range16[] = {
{ 12321, 12329 },
{ 12344, 12347 },
{ 13312, 19893 },
- { 19968, 40917 },
+ { 19968, 40938 },
{ 63744, 64109 },
{ 64112, 64217 },
};
@@ -4519,6 +4719,7 @@ static const URange32 Han_range32[] = {
{ 173824, 177972 },
{ 177984, 178205 },
{ 178208, 183969 },
+ { 183984, 191456 },
{ 194560, 195101 },
};
static const URange32 Old_North_Arabian_range32[] = {
@@ -4552,7 +4753,7 @@ static const URange16 Tamil_range16[] = {
};
static const URange16 Bopomofo_range16[] = {
{ 746, 747 },
- { 12549, 12589 },
+ { 12549, 12590 },
{ 12704, 12730 },
};
static const URange32 Bassa_Vah_range32[] = {
@@ -4563,22 +4764,28 @@ static const URange16 Sundanese_range16[] = {
{ 7040, 7103 },
{ 7360, 7367 },
};
+static const URange32 Osage_range32[] = {
+ { 66736, 66771 },
+ { 66776, 66811 },
+};
static const URange16 Tagalog_range16[] = {
{ 5888, 5900 },
{ 5902, 5908 },
};
static const URange16 Malayalam_range16[] = {
- { 3329, 3331 },
+ { 3328, 3331 },
{ 3333, 3340 },
{ 3342, 3344 },
- { 3346, 3386 },
- { 3389, 3396 },
+ { 3346, 3396 },
{ 3398, 3400 },
- { 3402, 3406 },
- { 3415, 3415 },
- { 3423, 3427 },
- { 3430, 3445 },
- { 3449, 3455 },
+ { 3402, 3407 },
+ { 3412, 3427 },
+ { 3430, 3455 },
+};
+static const URange32 Marchen_range32[] = {
+ { 72816, 72847 },
+ { 72850, 72871 },
+ { 72873, 72886 },
};
static const URange32 Carian_range32[] = {
{ 66208, 66256 },
@@ -4588,7 +4795,7 @@ static const URange16 Hiragana_range16[] = {
{ 12445, 12447 },
};
static const URange32 Hiragana_range32[] = {
- { 110593, 110593 },
+ { 110593, 110878 },
{ 127488, 127488 },
};
static const URange16 Tagbanwa_range16[] = {
@@ -4639,6 +4846,7 @@ static const URange16 Tai_Tham_range16[] = {
};
static const URange32 Old_Italic_range32[] = {
{ 66304, 66339 },
+ { 66349, 66351 },
};
static const URange32 Old_Persian_range32[] = {
{ 66464, 66499 },
@@ -4672,7 +4880,7 @@ static const URange16 Latin_range16[] = {
{ 8544, 8584 },
{ 11360, 11391 },
{ 42786, 42887 },
- { 42891, 42925 },
+ { 42891, 42926 },
{ 42928, 42935 },
{ 42999, 43007 },
{ 43824, 43866 },
@@ -4682,7 +4890,7 @@ static const URange16 Latin_range16[] = {
{ 65345, 65370 },
};
static const URange16 Saurashtra_range16[] = {
- { 43136, 43204 },
+ { 43136, 43205 },
{ 43214, 43225 },
};
static const URange32 Shavian_range32[] = {
@@ -4796,7 +5004,7 @@ static const URange16 Greek_range16[] = {
{ 43877, 43877 },
};
static const URange32 Greek_range32[] = {
- { 65856, 65932 },
+ { 65856, 65934 },
{ 65952, 65952 },
{ 119296, 119365 },
};
@@ -4831,6 +5039,7 @@ static const URange16 Syriac_range16[] = {
{ 1792, 1805 },
{ 1807, 1866 },
{ 1869, 1871 },
+ { 2144, 2154 },
};
static const URange16 Runic_range16[] = {
{ 5792, 5866 },
@@ -4867,6 +5076,11 @@ static const URange16 New_Tai_Lue_range16[] = {
static const URange16 Ol_Chiki_range16[] = {
{ 7248, 7295 },
};
+static const URange32 Newa_range32[] = {
+ { 70656, 70745 },
+ { 70747, 70747 },
+ { 70749, 70749 },
+};
static const URange16 Limbu_range16[] = {
{ 6400, 6430 },
{ 6432, 6443 },
@@ -4921,7 +5135,11 @@ static const URange16 Gujarati_range16[] = {
{ 2768, 2768 },
{ 2784, 2787 },
{ 2790, 2801 },
- { 2809, 2809 },
+ { 2809, 2815 },
+};
+static const URange32 Nushu_range32[] = {
+ { 94177, 94177 },
+ { 110960, 111355 },
};
static const URange32 Modi_range32[] = {
{ 71168, 71236 },
@@ -4995,7 +5213,7 @@ static const URange16 Bengali_range16[] = {
{ 2519, 2519 },
{ 2524, 2525 },
{ 2527, 2531 },
- { 2534, 2555 },
+ { 2534, 2557 },
};
static const URange32 Kaithi_range32[] = {
{ 69760, 69825 },
@@ -5004,6 +5222,13 @@ static const URange16 Glagolitic_range16[] = {
{ 11264, 11310 },
{ 11312, 11358 },
};
+static const URange32 Glagolitic_range32[] = {
+ { 122880, 122886 },
+ { 122888, 122904 },
+ { 122907, 122913 },
+ { 122915, 122916 },
+ { 122918, 122922 },
+};
static const URange32 Imperial_Aramaic_range32[] = {
{ 67648, 67669 },
{ 67671, 67679 },
@@ -5050,7 +5275,7 @@ static const URange32 Cypriot_range32[] = {
{ 67647, 67647 },
};
static const URange16 Kannada_range16[] = {
- { 3201, 3203 },
+ { 3200, 3203 },
{ 3205, 3212 },
{ 3214, 3216 },
{ 3218, 3240 },
@@ -5067,7 +5292,7 @@ static const URange16 Kannada_range16[] = {
};
static const URange32 Khojki_range32[] = {
{ 70144, 70161 },
- { 70163, 70205 },
+ { 70163, 70206 },
};
static const URange16 Mongolian_range16[] = {
{ 6144, 6145 },
@@ -5077,6 +5302,9 @@ static const URange16 Mongolian_range16[] = {
{ 6176, 6263 },
{ 6272, 6314 },
};
+static const URange32 Mongolian_range32[] = {
+ { 71264, 71276 },
+};
static const URange16 Sinhala_range16[] = {
{ 3458, 3459 },
{ 3461, 3478 },
@@ -5152,6 +5380,15 @@ static const URange16 Lao_range16[] = {
static const URange16 Hanunoo_range16[] = {
{ 5920, 5940 },
};
+static const URange32 Masaram_Gondi_range32[] = {
+ { 72960, 72966 },
+ { 72968, 72969 },
+ { 72971, 73014 },
+ { 73018, 73018 },
+ { 73020, 73021 },
+ { 73023, 73031 },
+ { 73040, 73049 },
+};
static const URange32 Linear_B_range32[] = {
{ 65536, 65547 },
{ 65549, 65574 },
@@ -5230,10 +5467,11 @@ static const URange16 Common_range16[] = {
{ 1417, 1417 },
{ 1541, 1541 },
{ 1548, 1548 },
- { 1563, 1564 },
+ { 1563, 1563 },
{ 1567, 1567 },
{ 1600, 1600 },
{ 1757, 1757 },
+ { 2274, 2274 },
{ 2404, 2405 },
{ 3647, 3647 },
{ 4053, 4056 },
@@ -5246,30 +5484,29 @@ static const URange16 Common_range16[] = {
{ 7393, 7393 },
{ 7401, 7404 },
{ 7406, 7411 },
- { 7413, 7414 },
+ { 7413, 7415 },
{ 8192, 8203 },
{ 8206, 8292 },
{ 8294, 8304 },
{ 8308, 8318 },
{ 8320, 8334 },
- { 8352, 8382 },
+ { 8352, 8383 },
{ 8448, 8485 },
{ 8487, 8489 },
{ 8492, 8497 },
{ 8499, 8525 },
{ 8527, 8543 },
{ 8585, 8587 },
- { 8592, 9210 },
- { 9216, 9254 },
+ { 8592, 9254 },
{ 9280, 9290 },
{ 9312, 10239 },
{ 10496, 11123 },
{ 11126, 11157 },
{ 11160, 11193 },
{ 11197, 11208 },
- { 11210, 11217 },
+ { 11210, 11218 },
{ 11244, 11247 },
- { 11776, 11842 },
+ { 11776, 11849 },
{ 12272, 12283 },
{ 12288, 12292 },
{ 12294, 12294 },
@@ -5353,17 +5590,16 @@ static const URange32 Common_range32[] = {
{ 127232, 127244 },
{ 127248, 127278 },
{ 127280, 127339 },
- { 127344, 127386 },
+ { 127344, 127404 },
{ 127462, 127487 },
{ 127489, 127490 },
- { 127504, 127546 },
+ { 127504, 127547 },
{ 127552, 127560 },
{ 127568, 127569 },
- { 127744, 128377 },
- { 128379, 128419 },
- { 128421, 128720 },
+ { 127584, 127589 },
+ { 127744, 128724 },
{ 128736, 128748 },
- { 128752, 128755 },
+ { 128752, 128760 },
{ 128768, 128883 },
{ 128896, 128980 },
{ 129024, 129035 },
@@ -5371,9 +5607,13 @@ static const URange32 Common_range32[] = {
{ 129104, 129113 },
{ 129120, 129159 },
{ 129168, 129197 },
- { 129296, 129304 },
- { 129408, 129412 },
+ { 129280, 129291 },
+ { 129296, 129342 },
+ { 129344, 129356 },
+ { 129360, 129387 },
+ { 129408, 129431 },
{ 129472, 129472 },
+ { 129488, 129510 },
{ 917505, 917505 },
{ 917536, 917631 },
};
@@ -5390,6 +5630,7 @@ static const URange16 Arabic_range16[] = {
{ 1536, 1540 },
{ 1542, 1547 },
{ 1549, 1562 },
+ { 1564, 1564 },
{ 1566, 1566 },
{ 1568, 1599 },
{ 1601, 1610 },
@@ -5398,6 +5639,8 @@ static const URange16 Arabic_range16[] = {
{ 1758, 1791 },
{ 1872, 1919 },
{ 2208, 2228 },
+ { 2230, 2237 },
+ { 2260, 2273 },
{ 2275, 2303 },
{ 64336, 64449 },
{ 64467, 64829 },
@@ -5459,6 +5702,11 @@ static const URange32 Siddham_range32[] = {
{ 71040, 71093 },
{ 71096, 71133 },
};
+static const URange32 Soyombo_range32[] = {
+ { 72272, 72323 },
+ { 72326, 72348 },
+ { 72350, 72354 },
+};
static const URange32 Avestan_range32[] = {
{ 68352, 68405 },
{ 68409, 68415 },
@@ -5483,11 +5731,12 @@ static const URange32 Takri_range32[] = {
{ 71296, 71351 },
{ 71360, 71369 },
};
-// 3949 16-bit ranges, 1133 32-bit ranges
+// 3981 16-bit ranges, 1325 32-bit ranges
const UGroup unicode_groups[] = {
+ { "Adlam", +1, 0, 0, Adlam_range32, 3 },
{ "Ahom", +1, 0, 0, Ahom_range32, 3 },
{ "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 },
- { "Arabic", +1, Arabic_range16, 19, Arabic_range32, 35 },
+ { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 },
{ "Armenian", +1, Armenian_range16, 6, 0, 0 },
{ "Avestan", +1, 0, 0, Avestan_range32, 2 },
{ "Balinese", +1, Balinese_range16, 2, 0, 0 },
@@ -5495,27 +5744,28 @@ const UGroup unicode_groups[] = {
{ "Bassa_Vah", +1, 0, 0, Bassa_Vah_range32, 2 },
{ "Batak", +1, Batak_range16, 2, 0, 0 },
{ "Bengali", +1, Bengali_range16, 14, 0, 0 },
+ { "Bhaiksuki", +1, 0, 0, Bhaiksuki_range32, 4 },
{ "Bopomofo", +1, Bopomofo_range16, 3, 0, 0 },
{ "Brahmi", +1, 0, 0, Brahmi_range32, 3 },
{ "Braille", +1, Braille_range16, 1, 0, 0 },
{ "Buginese", +1, Buginese_range16, 2, 0, 0 },
{ "Buhid", +1, Buhid_range16, 1, 0, 0 },
- { "C", +1, C_range16, 15, C_range32, 7 },
+ { "C", +1, C_range16, 16, C_range32, 7 },
{ "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 },
{ "Carian", +1, 0, 0, Carian_range32, 1 },
{ "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 },
{ "Cc", +1, Cc_range16, 2, 0, 0 },
- { "Cf", +1, Cf_range16, 12, Cf_range32, 5 },
+ { "Cf", +1, Cf_range16, 13, Cf_range32, 5 },
{ "Chakma", +1, 0, 0, Chakma_range32, 2 },
{ "Cham", +1, Cham_range16, 4, 0, 0 },
{ "Cherokee", +1, Cherokee_range16, 3, 0, 0 },
{ "Co", +1, Co_range16, 1, Co_range32, 2 },
- { "Common", +1, Common_range16, 92, Common_range32, 69 },
+ { "Common", +1, Common_range16, 92, Common_range32, 72 },
{ "Coptic", +1, Coptic_range16, 3, 0, 0 },
{ "Cs", +1, Cs_range16, 1, 0, 0 },
{ "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 },
{ "Cypriot", +1, 0, 0, Cypriot_range32, 6 },
- { "Cyrillic", +1, Cyrillic_range16, 7, 0, 0 },
+ { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 },
{ "Deseret", +1, 0, 0, Deseret_range32, 1 },
{ "Devanagari", +1, Devanagari_range16, 4, 0, 0 },
{ "Duployan", +1, 0, 0, Duployan_range32, 5 },
@@ -5523,13 +5773,13 @@ const UGroup unicode_groups[] = {
{ "Elbasan", +1, 0, 0, Elbasan_range32, 1 },
{ "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 },
{ "Georgian", +1, Georgian_range16, 8, 0, 0 },
- { "Glagolitic", +1, Glagolitic_range16, 2, 0, 0 },
+ { "Glagolitic", +1, Glagolitic_range16, 2, Glagolitic_range32, 5 },
{ "Gothic", +1, 0, 0, Gothic_range32, 1 },
{ "Grantha", +1, 0, 0, Grantha_range32, 15 },
{ "Greek", +1, Greek_range16, 33, Greek_range32, 3 },
{ "Gujarati", +1, Gujarati_range16, 14, 0, 0 },
{ "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 },
- { "Han", +1, Han_range16, 11, Han_range32, 5 },
+ { "Han", +1, Han_range16, 11, Han_range32, 6 },
{ "Hangul", +1, Hangul_range16, 14, 0, 0 },
{ "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 },
{ "Hatran", +1, 0, 0, Hatran_range32, 3 },
@@ -5548,7 +5798,7 @@ const UGroup unicode_groups[] = {
{ "Khmer", +1, Khmer_range16, 4, 0, 0 },
{ "Khojki", +1, 0, 0, Khojki_range32, 2 },
{ "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 },
- { "L", +1, L_range16, 376, L_range32, 178 },
+ { "L", +1, L_range16, 383, L_range32, 202 },
{ "Lao", +1, Lao_range16, 18, 0, 0 },
{ "Latin", +1, Latin_range16, 31, 0, 0 },
{ "Lepcha", +1, Lepcha_range16, 3, 0, 0 },
@@ -5556,50 +5806,55 @@ const UGroup unicode_groups[] = {
{ "Linear_A", +1, 0, 0, Linear_A_range32, 3 },
{ "Linear_B", +1, 0, 0, Linear_B_range32, 7 },
{ "Lisu", +1, Lisu_range16, 1, 0, 0 },
- { "Ll", +1, Ll_range16, 599, Ll_range32, 31 },
- { "Lm", +1, Lm_range16, 54, Lm_range32, 2 },
- { "Lo", +1, Lo_range16, 290, Lo_range32, 143 },
+ { "Ll", +1, Ll_range16, 600, Ll_range32, 33 },
+ { "Lm", +1, Lm_range16, 54, Lm_range32, 3 },
+ { "Lo", +1, Lo_range16, 296, Lo_range32, 163 },
{ "Lt", +1, Lt_range16, 10, 0, 0 },
- { "Lu", +1, Lu_range16, 591, Lu_range32, 34 },
+ { "Lu", +1, Lu_range16, 591, Lu_range32, 36 },
{ "Lycian", +1, 0, 0, Lycian_range32, 1 },
{ "Lydian", +1, 0, 0, Lydian_range32, 2 },
- { "M", +1, M_range16, 180, M_range32, 56 },
+ { "M", +1, M_range16, 184, M_range32, 79 },
{ "Mahajani", +1, 0, 0, Mahajani_range32, 1 },
- { "Malayalam", +1, Malayalam_range16, 11, 0, 0 },
+ { "Malayalam", +1, Malayalam_range16, 8, 0, 0 },
{ "Mandaic", +1, Mandaic_range16, 2, 0, 0 },
{ "Manichaean", +1, 0, 0, Manichaean_range32, 2 },
- { "Mc", +1, Mc_range16, 109, Mc_range32, 38 },
+ { "Marchen", +1, 0, 0, Marchen_range32, 3 },
+ { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 },
+ { "Mc", +1, Mc_range16, 110, Mc_range32, 50 },
{ "Me", +1, Me_range16, 5, 0, 0 },
{ "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 },
{ "Mende_Kikakui", +1, 0, 0, Mende_Kikakui_range32, 2 },
{ "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 },
{ "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 },
{ "Miao", +1, 0, 0, Miao_range32, 3 },
- { "Mn", +1, Mn_range16, 200, Mn_range32, 66 },
+ { "Mn", +1, Mn_range16, 204, Mn_range32, 97 },
{ "Modi", +1, 0, 0, Modi_range32, 2 },
- { "Mongolian", +1, Mongolian_range16, 6, 0, 0 },
+ { "Mongolian", +1, Mongolian_range16, 6, Mongolian_range32, 1 },
{ "Mro", +1, 0, 0, Mro_range32, 3 },
{ "Multani", +1, 0, 0, Multani_range32, 5 },
{ "Myanmar", +1, Myanmar_range16, 3, 0, 0 },
- { "N", +1, N_range16, 66, N_range32, 45 },
+ { "N", +1, N_range16, 67, N_range32, 49 },
{ "Nabataean", +1, 0, 0, Nabataean_range32, 2 },
- { "Nd", +1, Nd_range16, 37, Nd_range32, 14 },
+ { "Nd", +1, Nd_range16, 37, Nd_range32, 18 },
{ "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 },
+ { "Newa", +1, 0, 0, Newa_range32, 3 },
{ "Nko", +1, Nko_range16, 1, 0, 0 },
{ "Nl", +1, Nl_range16, 7, Nl_range32, 5 },
- { "No", +1, No_range16, 28, No_range32, 30 },
+ { "No", +1, No_range16, 29, No_range32, 31 },
+ { "Nushu", +1, 0, 0, Nushu_range32, 2 },
{ "Ogham", +1, Ogham_range16, 1, 0, 0 },
{ "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 },
{ "Old_Hungarian", +1, 0, 0, Old_Hungarian_range32, 3 },
- { "Old_Italic", +1, 0, 0, Old_Italic_range32, 1 },
+ { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 },
{ "Old_North_Arabian", +1, 0, 0, Old_North_Arabian_range32, 1 },
{ "Old_Permic", +1, 0, 0, Old_Permic_range32, 1 },
{ "Old_Persian", +1, 0, 0, Old_Persian_range32, 2 },
{ "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 },
{ "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 },
{ "Oriya", +1, Oriya_range16, 14, 0, 0 },
+ { "Osage", +1, 0, 0, Osage_range32, 2 },
{ "Osmanya", +1, 0, 0, Osmanya_range32, 2 },
- { "P", +1, P_range16, 127, P_range32, 34 },
+ { "P", +1, P_range16, 128, P_range32, 44 },
{ "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 },
{ "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 },
{ "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 },
@@ -5610,12 +5865,12 @@ const UGroup unicode_groups[] = {
{ "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 },
{ "Phoenician", +1, 0, 0, Phoenician_range32, 2 },
{ "Pi", +1, Pi_range16, 11, 0, 0 },
- { "Po", +1, Po_range16, 123, Po_range32, 34 },
+ { "Po", +1, Po_range16, 125, Po_range32, 44 },
{ "Ps", +1, Ps_range16, 75, 0, 0 },
{ "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 },
{ "Rejang", +1, Rejang_range16, 2, 0, 0 },
{ "Runic", +1, Runic_range16, 2, 0, 0 },
- { "S", +1, S_range16, 148, S_range32, 66 },
+ { "S", +1, S_range16, 148, S_range32, 69 },
{ "Samaritan", +1, Samaritan_range16, 2, 0, 0 },
{ "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 },
{ "Sc", +1, Sc_range16, 17, 0, 0 },
@@ -5626,11 +5881,12 @@ const UGroup unicode_groups[] = {
{ "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 },
{ "Sk", +1, Sk_range16, 28, Sk_range32, 1 },
{ "Sm", +1, Sm_range16, 53, Sm_range32, 11 },
- { "So", +1, So_range16, 114, So_range32, 56 },
+ { "So", +1, So_range16, 114, So_range32, 59 },
{ "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 },
+ { "Soyombo", +1, 0, 0, Soyombo_range32, 3 },
{ "Sundanese", +1, Sundanese_range16, 2, 0, 0 },
{ "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 },
- { "Syriac", +1, Syriac_range16, 3, 0, 0 },
+ { "Syriac", +1, Syriac_range16, 4, 0, 0 },
{ "Tagalog", +1, Tagalog_range16, 2, 0, 0 },
{ "Tagbanwa", +1, Tagbanwa_range16, 3, 0, 0 },
{ "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 },
@@ -5638,6 +5894,7 @@ const UGroup unicode_groups[] = {
{ "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 },
{ "Takri", +1, 0, 0, Takri_range32, 2 },
{ "Tamil", +1, Tamil_range16, 16, 0, 0 },
+ { "Tangut", +1, 0, 0, Tangut_range32, 3 },
{ "Telugu", +1, Telugu_range16, 13, 0, 0 },
{ "Thaana", +1, Thaana_range16, 1, 0, 0 },
{ "Thai", +1, Thai_range16, 2, 0, 0 },
@@ -5649,11 +5906,12 @@ const UGroup unicode_groups[] = {
{ "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 },
{ "Yi", +1, Yi_range16, 2, 0, 0 },
{ "Z", +1, Z_range16, 8, 0, 0 },
+ { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 },
{ "Zl", +1, Zl_range16, 1, 0, 0 },
{ "Zp", +1, Zp_range16, 1, 0, 0 },
{ "Zs", +1, Zs_range16, 7, 0, 0 },
};
-const int num_unicode_groups = 167;
+const int num_unicode_groups = 177;
} // namespace re2
diff --git a/util/sparse_array.h b/util/sparse_array.h
index 7738431..09fb4ac 100644
--- a/util/sparse_array.h
+++ b/util/sparse_array.h
@@ -99,8 +99,8 @@
#include <string.h>
#include <algorithm>
#include <memory>
+#include <type_traits>
#include <utility>
-#include <vector>
namespace re2 {
@@ -113,10 +113,12 @@ class SparseArray {
// IndexValue pairs: exposed in SparseArray::iterator.
class IndexValue;
+ static_assert(std::is_trivially_destructible<IndexValue>::value,
+ "IndexValue must be trivially destructible");
typedef IndexValue value_type;
- typedef typename std::vector<IndexValue>::iterator iterator;
- typedef typename std::vector<IndexValue>::const_iterator const_iterator;
+ typedef IndexValue* iterator;
+ typedef const IndexValue* const_iterator;
SparseArray(const SparseArray& src);
SparseArray(SparseArray&& src) /*noexcept*/;
@@ -138,17 +140,17 @@ class SparseArray {
// Iterate over the array.
iterator begin() {
- return dense_.begin();
+ return dense_.get();
}
iterator end() {
- return dense_.begin() + size_;
+ return dense_.get() + size_;
}
const_iterator begin() const {
- return dense_.begin();
+ return dense_.get();
}
const_iterator end() const {
- return dense_.begin() + size_;
+ return dense_.get() + size_;
}
// Change the maximum size of the array.
@@ -198,13 +200,13 @@ class SparseArray {
iterator find(int i) {
if (has_index(i))
- return dense_.begin() + sparse_to_dense_[i];
+ return dense_.get() + sparse_to_dense_[i];
return end();
}
const_iterator find(int i) const {
if (has_index(i))
- return dense_.begin() + sparse_to_dense_[i];
+ return dense_.get() + sparse_to_dense_[i];
return end();
}
@@ -261,7 +263,7 @@ class SparseArray {
DebugCheckInvariants();
std::pair<iterator, bool> p;
if (has_index(v.index_)) {
- p = {dense_.begin() + sparse_to_dense_[v.index_], false};
+ p = {dense_.get() + sparse_to_dense_[v.index_], false};
} else {
p = {set_new(std::forward<U>(v).index_, std::forward<U>(v).second), true};
}
@@ -295,7 +297,7 @@ class SparseArray {
assert(has_index(i));
dense_[sparse_to_dense_[i]].value() = std::forward<U>(v);
DebugCheckInvariants();
- return dense_.begin() + sparse_to_dense_[i];
+ return dense_.get() + sparse_to_dense_[i];
}
// Add the index i to the array.
@@ -327,7 +329,7 @@ class SparseArray {
int size_ = 0;
int max_size_ = 0;
std::unique_ptr<int[]> sparse_to_dense_;
- std::vector<IndexValue> dense_;
+ std::unique_ptr<IndexValue[]> dense_;
};
template<typename Value>
@@ -338,8 +340,9 @@ SparseArray<Value>::SparseArray(const SparseArray& src)
: size_(src.size_),
max_size_(src.max_size_),
sparse_to_dense_(new int[max_size_]),
- dense_(src.dense_) {
+ dense_(new IndexValue[max_size_]) {
std::copy_n(src.sparse_to_dense_.get(), max_size_, sparse_to_dense_.get());
+ std::copy_n(src.dense_.get(), max_size_, dense_.get());
}
template<typename Value>
@@ -350,17 +353,18 @@ SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT
dense_(std::move(src.dense_)) {
src.size_ = 0;
src.max_size_ = 0;
- src.dense_.clear();
}
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
- std::unique_ptr<int[]> a(new int[src.max_size_]);
+ size_ = src.size_;
+ max_size_ = src.max_size_;
+ std::unique_ptr<int[]> a(new int[max_size_]);
std::copy_n(src.sparse_to_dense_.get(), src.max_size_, a.get());
sparse_to_dense_ = std::move(a);
- dense_ = src.dense_;
- max_size_ = src.max_size_;
- size_ = src.size_;
+ std::unique_ptr<IndexValue[]> b(new IndexValue[max_size_]);
+ std::copy_n(src.dense_.get(), src.max_size_, b.get());
+ dense_ = std::move(b);
return *this;
}
@@ -374,7 +378,6 @@ SparseArray<Value>& SparseArray<Value>::operator=(
// clear out the source
src.size_ = 0;
src.max_size_ = 0;
- src.dense_.clear();
return *this;
}
@@ -427,7 +430,11 @@ void SparseArray<Value>::resize(int max_size) {
}
sparse_to_dense_ = std::move(a);
- dense_.resize(max_size);
+ std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]);
+ if (dense_) {
+ std::copy_n(dense_.get(), max_size_, b.get());
+ }
+ dense_ = std::move(b);
if (ShouldInitializeMemory()) {
for (int i = max_size_; i < max_size; i++) {
@@ -493,8 +500,8 @@ void SparseArray<Value>::create_index(int i) {
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
max_size_ = max_size;
- sparse_to_dense_ = std::unique_ptr<int[]>(new int[max_size]);
- dense_.resize(max_size);
+ sparse_to_dense_.reset(new int[max_size]);
+ dense_.reset(new IndexValue[max_size]);
size_ = 0;
if (ShouldInitializeMemory()) {
diff --git a/util/sparse_set.h b/util/sparse_set.h
index fd35236..9dde502 100644
--- a/util/sparse_set.h
+++ b/util/sparse_set.h
@@ -53,7 +53,6 @@
#include <algorithm>
#include <memory>
#include <utility>
-#include <vector>
namespace re2 {
@@ -64,8 +63,8 @@ class SparseSetT {
explicit SparseSetT(int max_size);
~SparseSetT();
- typedef typename std::vector<int>::iterator iterator;
- typedef typename std::vector<int>::const_iterator const_iterator;
+ typedef int* iterator;
+ typedef const int* const_iterator;
// Return the number of entries in the set.
int size() const {
@@ -79,17 +78,17 @@ class SparseSetT {
// Iterate over the set.
iterator begin() {
- return dense_.begin();
+ return dense_.get();
}
iterator end() {
- return dense_.begin() + size_;
+ return dense_.get() + size_;
}
const_iterator begin() const {
- return dense_.begin();
+ return dense_.get();
}
const_iterator end() const {
- return dense_.begin() + size_;
+ return dense_.get() + size_;
}
// Change the maximum size of the set.
@@ -146,7 +145,7 @@ class SparseSetT {
create_index(i);
}
DebugCheckInvariants();
- return dense_.begin() + sparse_to_dense_[i];
+ return dense_.get() + sparse_to_dense_[i];
}
// Add the index i to the set.
@@ -177,7 +176,7 @@ class SparseSetT {
int size_ = 0;
int max_size_ = 0;
std::unique_ptr<int[]> sparse_to_dense_;
- std::vector<int> dense_;
+ std::unique_ptr<int[]> dense_;
};
template<typename Value>
@@ -195,7 +194,11 @@ void SparseSetT<Value>::resize(int max_size) {
}
sparse_to_dense_ = std::move(a);
- dense_.resize(max_size);
+ std::unique_ptr<int[]> b(new int[max_size]);
+ if (dense_) {
+ std::copy_n(dense_.get(), max_size_, b.get());
+ }
+ dense_ = std::move(b);
if (ShouldInitializeMemory()) {
for (int i = max_size_; i < max_size; i++) {
@@ -234,8 +237,8 @@ void SparseSetT<Value>::create_index(int i) {
template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) {
max_size_ = max_size;
- sparse_to_dense_ = std::unique_ptr<int[]>(new int[max_size]);
- dense_.resize(max_size);
+ sparse_to_dense_.reset(new int[max_size]);
+ dense_.reset(new int[max_size]);
size_ = 0;
if (ShouldInitializeMemory()) {
diff --git a/util/strutil.cc b/util/strutil.cc
index 7181073..8eabfa4 100644
--- a/util/strutil.cc
+++ b/util/strutil.cc
@@ -75,29 +75,21 @@ string CEscape(const StringPiece& src) {
return s;
}
-string PrefixSuccessor(const StringPiece& prefix) {
+void PrefixSuccessor(string* prefix) {
// We can increment the last character in the string and be done
// unless that character is 255, in which case we have to erase the
// last character and increment the previous character, unless that
// is 255, etc. If the string is empty or consists entirely of
// 255's, we just return the empty string.
- bool done = false;
- string limit(prefix.data(), prefix.size());
- int index = static_cast<int>(limit.size()) - 1;
- while (!done && index >= 0) {
- if ((limit[index]&255) == 255) {
- limit.erase(index);
- index--;
+ while (!prefix->empty()) {
+ char& c = prefix->back();
+ if (c == '\xff') { // char literal avoids signed/unsigned.
+ prefix->pop_back();
} else {
- limit[index]++;
- done = true;
+ ++c;
+ break;
}
}
- if (!done) {
- return "";
- } else {
- return limit;
- }
}
static void StringAppendV(string* dst, const char* format, va_list ap) {
diff --git a/util/strutil.h b/util/strutil.h
index 71dd293..2c3c104 100644
--- a/util/strutil.h
+++ b/util/strutil.h
@@ -13,7 +13,7 @@
namespace re2 {
string CEscape(const StringPiece& src);
-string PrefixSuccessor(const StringPiece& prefix);
+void PrefixSuccessor(string* prefix);
string StringPrintf(const char* format, ...);
void SStringPrintf(string* dst, const char* format, ...);
void StringAppendF(string* dst, const char* format, ...);