diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2016-11-21 16:56:53 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2016-11-21 16:56:53 +0900 |
commit | 485249b5a02cf59571cde61f83d10a6a9ec36b3d (patch) | |
tree | 1866e7ca3fe6c30538b6bbe29a81a5836f53678d | |
parent | b1560c7299051a0740ae99189cfbded5bde01a4d (diff) | |
download | re2-485249b5a02cf59571cde61f83d10a6a9ec36b3d.tar.gz re2-485249b5a02cf59571cde61f83d10a6a9ec36b3d.tar.bz2 re2-485249b5a02cf59571cde61f83d10a6a9ec36b3d.zip |
Imported Upstream version 20160201 (upstream/20160201)
Change-Id: I9008d23e0e80414f725fdd41819d9856e8a041cc
Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
56 files changed, 1665 insertions, 663 deletions
@@ -6,16 +6,6 @@ licenses(["notice"]) -# stringpiece is a standalone library so that it can be used without pulling in -# all of the other parts of RE2. -cc_library( - name = "stringpiece", - srcs = ["re2/stringpiece.cc"], - hdrs = ["re2/stringpiece.h"], - includes = ["."], - visibility = ["//visibility:public"], -) - cc_library( name = "re2", srcs = [ @@ -39,6 +29,7 @@ cc_library( "re2/regexp.h", "re2/set.cc", "re2/simplify.cc", + "re2/stringpiece.cc", "re2/tostring.cc", "re2/unicode_casefold.cc", "re2/unicode_casefold.h", @@ -65,14 +56,13 @@ cc_library( "re2/filtered_re2.h", "re2/re2.h", "re2/set.h", + "re2/stringpiece.h", "re2/variadic_function.h", ], + copts = ["-pthread"], includes = ["."], linkopts = ["-pthread"], visibility = ["//visibility:public"], - deps = [ - ":stringpiece", - ], ) cc_library( @@ -102,9 +92,7 @@ cc_library( "util/thread.h", ], includes = ["."], - deps = [ - ":re2", - ], + deps = [":re2"], ) load("re2_test", "re2_test") @@ -129,6 +117,6 @@ re2_test("exhaustive1_test") re2_test("exhaustive2_test") re2_test("exhaustive3_test") re2_test("exhaustive_test") -re2_test("random_test") +re2_test("random_test", size="large") # TODO: Add support for regexp_benchmark. diff --git a/CMakeLists.txt b/CMakeLists.txt index c6c6060..1c980df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ if(WIN32) add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX) set(THREADING threadwin) else() + add_definitions(-pthread) set(THREADING thread) list(APPEND EXTRA_TARGET_LINK_LIBRARIES -pthread) endif() @@ -8,8 +8,8 @@ # LDPCRE=-L/usr/local/lib -lpcre CXX?=g++ -CXXFLAGS?=-Wall -O3 -g -pthread # can override -RE2_CXXFLAGS?=-Wsign-compare -c -I. $(CCPCRE) # required +CXXFLAGS?=-O3 -g -pthread # can override +RE2_CXXFLAGS?=-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. 
$(CCPCRE) # required LDFLAGS?=-pthread AR?=ar ARFLAGS?=rsc @@ -158,7 +158,7 @@ BIGTESTS=\ obj/test/random_test\ SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES)) -STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES)) +# We use TESTOFILES for testing the shared lib, only it is built differently. STESTS=$(patsubst obj/%,obj/so/%,$(TESTS)) SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS)) @@ -169,15 +169,15 @@ DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) obj/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) - $(CXX) -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc + $(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc obj/dbg/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) - $(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc + $(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc obj/so/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) - $(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc + $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc obj/libre2.a: $(OFILES) @mkdir -p obj @@ -192,17 +192,18 @@ obj/so/libre2.$(SOEXT): $(SOFILES) $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) ln -sf libre2.$(SOEXTVER) $@ -obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o - @mkdir -p obj/test - $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE) - obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE) -obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o +obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o + @mkdir -p obj/test + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE) + +# Test the shared lib, falling back to 
the static lib for private symbols +obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/so/test - $(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE) + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE) obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o @mkdir -p obj/test diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..393f5e6 --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,5 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.io/) WORKSPACE file for RE2. diff --git a/doc/syntax.txt b/doc/syntax.txt index e9c6ff4..09b7e88 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -230,105 +230,137 @@ Zp paragraph separator Zs space separator Unicode character class names--scripts: -Arabic Arabic -Armenian Armenian -Balinese Balinese -Bamum Bamum -Batak Batak -Bengali Bengali -Bopomofo Bopomofo -Brahmi Brahmi -Braille Braille -Buginese Buginese -Buhid Buhid -Canadian_Aboriginal Canadian Aboriginal -Carian Carian -Chakma Chakma -Cham Cham -Cherokee Cherokee -Common characters not specific to one script -Coptic Coptic -Cuneiform Cuneiform -Cypriot Cypriot -Cyrillic Cyrillic -Deseret Deseret -Devanagari Devanagari -Egyptian_Hieroglyphs Egyptian Hieroglyphs -Ethiopic Ethiopic -Georgian Georgian -Glagolitic Glagolitic -Gothic Gothic -Greek Greek -Gujarati Gujarati -Gurmukhi Gurmukhi -Han Han -Hangul Hangul -Hanunoo Hanunoo -Hebrew Hebrew -Hiragana Hiragana -Imperial_Aramaic Imperial Aramaic -Inherited inherit script from previous character -Inscriptional_Pahlavi Inscriptional Pahlavi -Inscriptional_Parthian Inscriptional Parthian -Javanese Javanese -Kaithi Kaithi -Kannada Kannada 
-Katakana Katakana -Kayah_Li Kayah Li -Kharoshthi Kharoshthi -Khmer Khmer -Lao Lao -Latin Latin -Lepcha Lepcha -Limbu Limbu -Linear_B Linear B -Lycian Lycian -Lydian Lydian -Malayalam Malayalam -Mandaic Mandaic -Meetei_Mayek Meetei Mayek -Meroitic_Cursive Meroitic Cursive -Meroitic_Hieroglyphs Meroitic Hieroglyphs -Miao Miao -Mongolian Mongolian -Myanmar Myanmar -New_Tai_Lue New Tai Lue (aka Simplified Tai Lue) -Nko Nko -Ogham Ogham -Ol_Chiki Ol Chiki -Old_Italic Old Italic -Old_Persian Old Persian -Old_South_Arabian Old South Arabian -Old_Turkic Old Turkic -Oriya Oriya -Osmanya Osmanya -Phags_Pa 'Phags Pa -Phoenician Phoenician -Rejang Rejang -Runic Runic -Saurashtra Saurashtra -Sharada Sharada -Shavian Shavian -Sinhala Sinhala -Sora_Sompeng Sora Sompeng -Sundanese Sundanese -Syloti_Nagri Syloti Nagri -Syriac Syriac -Tagalog Tagalog -Tagbanwa Tagbanwa -Tai_Le Tai Le -Tai_Tham Tai Tham -Tai_Viet Tai Viet -Takri Takri -Tamil Tamil -Telugu Telugu -Thaana Thaana -Thai Thai -Tibetan Tibetan -Tifinagh Tifinagh -Ugaritic Ugaritic -Vai Vai -Yi Yi +Ahom +Anatolian_Hieroglyphs +Arabic +Armenian +Avestan +Balinese +Bamum +Bassa_Vah +Batak +Bengali +Bopomofo +Brahmi +Braille +Buginese +Buhid +Canadian_Aboriginal +Carian +Caucasian_Albanian +Chakma +Cham +Cherokee +Common +Coptic +Cuneiform +Cypriot +Cyrillic +Deseret +Devanagari +Duployan +Egyptian_Hieroglyphs +Elbasan +Ethiopic +Georgian +Glagolitic +Gothic +Grantha +Greek +Gujarati +Gurmukhi +Han +Hangul +Hanunoo +Hatran +Hebrew +Hiragana +Imperial_Aramaic +Inherited +Inscriptional_Pahlavi +Inscriptional_Parthian +Javanese +Kaithi +Kannada +Katakana +Kayah_Li +Kharoshthi +Khmer +Khojki +Khudawadi +Lao +Latin +Lepcha +Limbu +Linear_A +Linear_B +Lisu +Lycian +Lydian +Mahajani +Malayalam +Mandaic +Manichaean +Meetei_Mayek +Mende_Kikakui +Meroitic_Cursive +Meroitic_Hieroglyphs +Miao +Modi +Mongolian +Mro +Multani +Myanmar +Nabataean +New_Tai_Lue +Nko +Ogham +Ol_Chiki +Old_Hungarian +Old_Italic +Old_North_Arabian +Old_Permic 
+Old_Persian +Old_South_Arabian +Old_Turkic +Oriya +Osmanya +Pahawh_Hmong +Palmyrene +Pau_Cin_Hau +Phags_Pa +Phoenician +Psalter_Pahlavi +Rejang +Runic +Samaritan +Saurashtra +Sharada +Shavian +Siddham +SignWriting +Sinhala +Sora_Sompeng +Sundanese +Syloti_Nagri +Syriac +Tagalog +Tagbanwa +Tai_Le +Tai_Tham +Tai_Viet +Takri +Tamil +Telugu +Thaana +Thai +Tibetan +Tifinagh +Tirhuta +Ugaritic +Vai +Warang_Citi +Yi Vim character classes: \i identifier character NOT SUPPORTED vim @@ -6,5 +6,5 @@ libdir=${exec_prefix}/lib Name: re2 Description: RE2 is a fast, safe, thread-friendly regular expression engine. Version: 0.0.0 -Cflags: -I${includedir} +Cflags: -I${includedir} -pthread Libs: -L${libdir} -lre2 -pthread diff --git a/re2/bitstate.cc b/re2/bitstate.cc index 8ced6ea..5740daa 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -94,7 +94,7 @@ BitState::~BitState() { // If so, remember that it was visited so that the next time, // we don't repeat the visit. bool BitState::ShouldVisit(int id, const char* p) { - uint n = id * (text_.size() + 1) + (p - text_.begin()); + size_t n = id * (text_.size() + 1) + (p - text_.begin()); if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) return false; visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); @@ -272,7 +272,8 @@ bool BitState::TrySearch(int id0, const char* p0) { if (submatch_[0].data() == NULL || (longest_ && p > submatch_[0].end())) { for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + submatch_[i].set(cap_[2*i], + static_cast<int>(cap_[2*i+1] - cap_[2*i])); } // If going for first match, we're done. diff --git a/re2/compile.cc b/re2/compile.cc index e5d6088..9882fef 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -113,7 +113,7 @@ struct Frag { // Input encodings. 
enum Encoding { kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) - kEncodingLatin1, // Latin1 (0-FF) + kEncodingLatin1, // Latin-1 (0-FF) }; class Compiler : public Regexp::Walker<Frag> { @@ -193,12 +193,28 @@ class Compiler : public Regexp::Walker<Frag> { void Add_80_10ffff(); // New suffix that matches the byte range lo-hi, then goes to next. - int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + int CachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + + // Returns true iff the suffix is cached. + bool IsCachedRuneByteSuffix(int id); // Adds a suffix to alternation. void AddSuffix(int id); + // Adds a suffix to the trie starting from the given root node. + // Returns zero iff allocating an instruction fails. Otherwise, returns + // the current root node, which might be different from what was given. + int AddSuffixRecursive(int root, int id); + + // Finds the trie node for the given suffix. Returns a Frag in order to + // distinguish between pointing at the root node directly (end.p == 0) + // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively). + Frag FindByteRange(int root, int id); + + // Compares two ByteRanges and returns true iff they are equal. + bool ByteRangeEqual(int id1, int id2); + // Returns the alternation of all the added suffixes. Frag EndRange(); @@ -496,21 +512,17 @@ int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, return f.begin; } -int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { - // In Latin1 mode, there's no point in caching. - // In forward UTF-8 mode, only need to cache continuation bytes. 
- if (encoding_ == kEncodingLatin1 || - (encoding_ == kEncodingUTF8 && - !reversed_ && - !(0x80 <= lo && hi <= 0xbf))) { - return UncachedRuneByteSuffix(lo, hi, foldcase, next); - } +static uint64 MakeRuneCacheKey(uint8 lo, uint8 hi, bool foldcase, int next) { + return (uint64)next << 17 | + (uint64)lo << 9 | + (uint64)hi << 1 | + (uint64)foldcase; +} - uint64 key = (uint64)next << 17 | - (uint64)lo << 9 | - (uint64)hi << 1 | - (uint64)foldcase; - map<uint64, int>::iterator it = rune_cache_.find(key); +int Compiler::CachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, + int next) { + uint64 key = MakeRuneCacheKey(lo, hi, foldcase, next); + map<uint64, int>::const_iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); @@ -518,12 +530,28 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { return id; } +bool Compiler::IsCachedRuneByteSuffix(int id) { + uint8 lo = inst_[id].lo_; + uint8 hi = inst_[id].hi_; + bool foldcase = inst_[id].foldcase() != 0; + int next = inst_[id].out(); + + uint64 key = MakeRuneCacheKey(lo, hi, foldcase, next); + return rune_cache_.find(key) != rune_cache_.end(); +} + void Compiler::AddSuffix(int id) { if (rune_range_.begin == 0) { rune_range_.begin = id; return; } + if (encoding_ == kEncodingUTF8) { + // Build a trie in order to reduce fanout. 
+ rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id); + return; + } + int alt = AllocInst(1); if (alt < 0) { rune_range_.begin = 0; @@ -533,6 +561,105 @@ void Compiler::AddSuffix(int id) { rune_range_.begin = alt; } +int Compiler::AddSuffixRecursive(int root, int id) { + DCHECK(inst_[root].opcode() == kInstAlt || + inst_[root].opcode() == kInstByteRange); + + Frag f = FindByteRange(root, id); + if (IsNoMatch(f)) { + int alt = AllocInst(1); + if (alt < 0) + return 0; + inst_[alt].InitAlt(root, id); + return alt; + } + + int br; + if (f.end.p == 0) + br = root; + else if (f.end.p&1) + br = inst_[f.begin].out1(); + else + br = inst_[f.begin].out(); + + if (IsCachedRuneByteSuffix(br)) { + // We can't fiddle with cached suffixes, so make a clone of the head. + int byterange = AllocInst(1); + if (byterange < 0) + return 0; + inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(), + inst_[br].foldcase(), inst_[br].out()); + + // Ensure that the parent points to the clone, not to the original. + // Note that this could leave the head unreachable except via the cache. + br = byterange; + if (f.end.p == 0) + root = br; + else if (f.end.p&1) + inst_[f.begin].out1_ = br; + else + inst_[f.begin].set_out(br); + } + + // We just saved one ByteRange instruction. :) + prog_->byte_inst_count_--; + + int out = inst_[id].out(); + if (!IsCachedRuneByteSuffix(id)) { + // The head should be the instruction most recently allocated, so free it + // instead of leaving it unreachable. 
+ DCHECK_EQ(id, inst_len_-1); + inst_[id].out_opcode_ = 0; + inst_[id].out1_ = 0; + inst_len_--; + } + + out = AddSuffixRecursive(inst_[br].out(), out); + if (out == 0) + return 0; + + inst_[br].set_out(out); + return root; +} + +bool Compiler::ByteRangeEqual(int id1, int id2) { + return inst_[id1].lo() == inst_[id2].lo() && + inst_[id1].hi() == inst_[id2].hi() && + inst_[id1].foldcase() == inst_[id2].foldcase(); +} + +Frag Compiler::FindByteRange(int root, int id) { + if (inst_[root].opcode() == kInstByteRange) { + if (ByteRangeEqual(root, id)) + return Frag(root, nullPatchList); + else + return NoMatch(); + } + + while (inst_[root].opcode() == kInstAlt) { + int out1 = inst_[root].out1(); + if (ByteRangeEqual(out1, id)) + return Frag(root, PatchList::Mk((root << 1) | 1)); + + // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't + // what we're looking for, then we can stop immediately. Unfortunately, we + // can't short-circuit the search in reverse mode. + if (!reversed_) + return NoMatch(); + + int out = inst_[root].out(); + if (inst_[out].opcode() == kInstAlt) + root = out; + else if (ByteRangeEqual(out, id)) + return Frag(root, PatchList::Mk(root << 1)); + else + return NoMatch(); + } + + LOG(DFATAL) << "should never happen"; + return NoMatch(); +} + Frag Compiler::EndRange() { return rune_range_; } @@ -556,13 +683,13 @@ void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { } void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { - // Latin1 is easy: runes *are* bytes. + // Latin-1 is easy: runes *are* bytes. 
if (lo > hi || lo > 0xFF) return; if (hi > 0xFF) hi = 0xFF; - AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), - foldcase, 0)); + AddSuffix(UncachedRuneByteSuffix(static_cast<uint8>(lo), + static_cast<uint8>(hi), foldcase, 0)); } // Table describing how to make a UTF-8 matching machine @@ -633,8 +760,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // ASCII range is always a special case. if (hi < Runeself) { - AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi), - foldcase, 0)); + AddSuffix(UncachedRuneByteSuffix(static_cast<uint8>(lo), + static_cast<uint8>(hi), foldcase, 0)); return; } @@ -662,13 +789,49 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { (void)m; // USED(m) DCHECK_EQ(n, m); + // The logic below encodes this thinking: + // + // 1. When we have built the whole suffix, we know that it cannot + // possibly be a suffix of anything longer: in forward mode, nothing + // else can occur before the leading byte; in reverse mode, nothing + // else can occur after the last continuation byte or else the leading + // byte would have to change. Thus, there is no benefit to caching + // the first byte of the suffix whereas there is a cost involved in + // cloning it if it begins a common prefix, which is fairly likely. + // + // 2. Conversely, the last byte of the suffix cannot possibly be a + // prefix of anything because next == 0, so we will never want to + // clone it, but it is fairly likely to be a common suffix. Perhaps + // more so in reverse mode than in forward mode because the former is + // "converging" towards lower entropy, but caching is still worthwhile + // for the latter in cases such as 80-BF. + // + // 3. 
Handling the bytes between the first and the last is less + // straightforward and, again, the approach depends on whether we are + // "converging" towards lower entropy: in forward mode, a single byte + // is unlikely to be part of a common suffix whereas a byte range + // is more likely so; in reverse mode, a byte range is unlikely to + // be part of a common suffix whereas a single byte is more likely + // so. The same benefit versus cost argument applies here. int id = 0; if (reversed_) { - for (int i = 0; i < n; i++) - id = RuneByteSuffix(ulo[i], uhi[i], false, id); + for (int i = 0; i < n; i++) { + // In reverse UTF-8 mode: cache the leading byte; don't cache the last + // continuation byte; cache anything else iff it's a single byte (XX-XX). + if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) + id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); + else + id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); + } } else { - for (int i = n-1; i >= 0; i--) - id = RuneByteSuffix(ulo[i], uhi[i], false, id); + for (int i = n-1; i >= 0; i--) { + // In forward UTF-8 mode: don't cache the leading byte; cache the last + // continuation byte; cache anything else iff it's a byte range (XX-YY). 
+ if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) + id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); + else + id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); + } } AddSuffix(id); } @@ -762,16 +925,16 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, } case kRegexpStar: - return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpPlus: - return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpQuest: - return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpLiteral: - return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); + return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); case kRegexpLiteralString: { // Concatenation of literals. @@ -779,7 +942,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, return Nop(); Frag f; for (int i = 0; i < re->nrunes(); i++) { - Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); + Frag f1 = Literal(re->runes()[i], + (re->parse_flags()&Regexp::FoldCase) != 0); if (i == 0) f = f1; else @@ -94,7 +94,7 @@ class DFA { // States, linked by the next_ pointers. If in state s and reading // byte c, the next state should be s->next_[c]. struct State { - inline bool IsMatch() const { return flag_ & kFlagMatch; } + inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; } void SaveMatch(vector<int>* v); int* inst_; // Instruction pointers in the state. @@ -1015,7 +1015,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // The state flag kFlagLastWord says whether the last // byte processed was a word character. Use that info to // insert empty-width (non-)word boundaries. 
- bool islastword = state->flag_ & kFlagLastWord; + bool islastword = (state->flag_ & kFlagLastWord) != 0; bool isword = (c != kByteEndText && Prog::IsWordChar(static_cast<uint8>(c))); if (isword == islastword) beforeflag |= kEmptyNonWordBoundary; @@ -1901,9 +1901,9 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // as the beginning. if (match0) { if (reversed_) - *match0 = StringPiece(ep, text.end() - ep); + match0->set(ep, static_cast<int>(text.end() - ep)); else - *match0 = StringPiece(text.begin(), ep - text.begin()); + match0->set(text.begin(), static_cast<int>(ep - text.begin())); } return true; } @@ -1939,7 +1939,7 @@ int DFA::BuildAllStates() { } } - return q.size(); + return static_cast<int>(q.size()); } // Build out all states in DFA for kind. Returns number of states. diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index 1ca46d4..5dd65d5 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -33,7 +33,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, } delete re; } else { - *id = re2_vec_.size(); + *id = static_cast<int>(re2_vec_.size()); re2_vec_.push_back(re); } diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h index 0f161e2..f4b2be4 100644 --- a/re2/filtered_re2.h +++ b/re2/filtered_re2.h @@ -76,7 +76,7 @@ class FilteredRE2 { vector<int>* potential_regexps) const; // The number of regexps added. 
- int NumRegexps() const { return re2_vec_.size(); } + int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } private: @@ -608,7 +608,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (matched_) { for (int i = 0; i < nsubmatch; i++) - submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); + submatch[i].set(match_[2*i], + static_cast<int>(match_[2*i+1] - match_[2*i])); if (Debug) fprintf(stderr, "match (%d,%d)\n", static_cast<int>(match_[0] - btext_), diff --git a/re2/onepass.cc b/re2/onepass.cc index 2404617..73acdc8 100644 --- a/re2/onepass.cc +++ b/re2/onepass.cc @@ -331,7 +331,8 @@ done: if (!matched) return false; for (int i = 0; i < nmatch; i++) - match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); + match[i].set(matchcap[2*i], + static_cast<int>(matchcap[2*i+1] - matchcap[2*i])); return true; } diff --git a/re2/parse.cc b/re2/parse.cc index 3c15bbd..f51e589 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -919,9 +919,14 @@ int Regexp::FactorAlternationRecursive( } n = out; - // Round 2: Factor out common complex prefixes, - // just the first piece of each concatenation, - // whatever it is. This is good enough a lot of the time. + // Round 2: Factor out common simple prefixes, + // just the first piece of each concatenation. + // This will be good enough a lot of the time. + // + // Complex subexpressions (e.g. involving quantifiers) + // are not safe to factor because that collapses their + // distinct paths through the automaton, which affects + // correctness in some cases. 
start = 0; out = 0; Regexp* first = NULL; @@ -934,7 +939,25 @@ int Regexp::FactorAlternationRecursive( Regexp* first_i = NULL; if (i < n) { first_i = LeadingRegexp(sub[i]); - if (first != NULL && Regexp::Equal(first, first_i)) { + if (first != NULL && Regexp::Equal(first, first_i) && + // first must be an empty-width op + // OR a char class, any char or any byte + // OR a fixed repeat of a literal, char class, any char or any byte. + (first->op() == kRegexpBeginLine || + first->op() == kRegexpEndLine || + first->op() == kRegexpWordBoundary || + first->op() == kRegexpNoWordBoundary || + first->op() == kRegexpBeginText || + first->op() == kRegexpEndText || + first->op() == kRegexpCharClass || + first->op() == kRegexpAnyChar || + first->op() == kRegexpAnyByte || + (first->op() == kRegexpRepeat && + first->min() == first->max() && + (first->sub()[0]->op() == kRegexpLiteral || + first->sub()[0]->op() == kRegexpCharClass || + first->sub()[0]->op() == kRegexpAnyChar || + first->sub()[0]->op() == kRegexpAnyByte)))) { continue; } } @@ -1427,7 +1450,8 @@ static bool ParseEscape(StringPiece* s, Rune* rp, BadEscape: // Unrecognized escape sequence. status->set_code(kRegexpBadEscape); - status->set_error_arg(StringPiece(begin, s->data() - begin)); + status->set_error_arg( + StringPiece(begin, static_cast<int>(s->data() - begin))); return false; } @@ -1586,7 +1610,7 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, if (c != '{') { // Name is the bit of string we just skipped over for c. const char* p = seq.begin() + 2; - name = StringPiece(p, s->begin() - p); + name = StringPiece(p, static_cast<int>(s->begin() - p)); } else { // Name is in braces. 
Look for closing } size_t end = s->find('}', 0); @@ -1597,14 +1621,14 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, status->set_error_arg(seq); return kParseError; } - name = StringPiece(s->begin(), end); // without '}' - s->remove_prefix(end + 1); // with '}' + name = StringPiece(s->begin(), static_cast<int>(end)); // without '}' + s->remove_prefix(static_cast<int>(end) + 1); // with '}' if (!IsValidUTF8(name, status)) return kParseError; } // Chop seq where s now begins. - seq = StringPiece(seq.begin(), s->begin() - seq.begin()); + seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin())); // Look up group if (name.size() > 0 && name[0] == '^') { @@ -1645,7 +1669,7 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, // Got it. Check that it's valid. q += 2; - StringPiece name(p, q-p); + StringPiece name(p, static_cast<int>(q-p)); const UGroup *g = LookupPosixGroup(name); if (g == NULL) { @@ -1699,7 +1723,8 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, return false; if (rr->hi < rr->lo) { status->set_code(kRegexpBadCharRange); - status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); + status->set_error_arg( + StringPiece(os.data(), static_cast<int>(s->data() - os.data()))); return false; } } else { @@ -1881,8 +1906,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { } // t is "P<name>...", t[end] == '>' - StringPiece capture(t.begin()-2, end+3); // "(?P<name>" - StringPiece name(t.begin()+2, end-2); // "name" + StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>" + StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name" if (!IsValidUTF8(name, status_)) return false; if (!IsValidCaptureName(name)) { @@ -1896,7 +1921,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { return false; } - s->remove_prefix(capture.end() - s->begin()); + s->remove_prefix(static_cast<int>(capture.end() - 
s->begin())); return true; } @@ -1979,7 +2004,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { BadPerlOp: status_->set_code(kRegexpBadPerlOp); - status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); + status_->set_error_arg( + StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin()))); return false; } @@ -2126,12 +2152,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, // a** is a syntax error, not a double-star. // (and a++ means something else entirely, which we don't support!) status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece(lastunary.begin(), - t.begin() - lastunary.begin())); + status->set_error_arg( + StringPiece(lastunary.begin(), + static_cast<int>(t.begin() - lastunary.begin()))); return NULL; } } - opstr.set(opstr.data(), t.data() - opstr.data()); + opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data())); if (!ps.PushRepeatOp(op, opstr, nongreedy)) return NULL; isunary = opstr; @@ -2157,12 +2184,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (lastunary.size() > 0) { // Not allowed to stack repetition operators. 
status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece(lastunary.begin(), - t.begin() - lastunary.begin())); + status->set_error_arg( + StringPiece(lastunary.begin(), + static_cast<int>(t.begin() - lastunary.begin()))); return NULL; } } - opstr.set(opstr.data(), t.data() - opstr.data()); + opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data())); if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) return NULL; isunary = opstr; diff --git a/re2/prefilter.cc b/re2/prefilter.cc index 4a25a43..45e43c9 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -503,7 +503,7 @@ Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); } - bool latin1 = re->parse_flags() & Regexp::Latin1; + bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0; Prefilter::Info::Walker w(latin1); Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc index 89e114c..be9b584 100644 --- a/re2/prefilter_tree.cc +++ b/re2/prefilter_tree.cc @@ -168,7 +168,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { for (size_t i = 0; i < prefilter_vec_.size(); i++) { Prefilter* f = prefilter_vec_[i]; if (f == NULL) - unfiltered_.push_back(i); + unfiltered_.push_back(static_cast<int>(i)); // We push NULL also on to v, so that we maintain the // mapping of index==regexpid for level=0 prefilter nodes. @@ -189,7 +189,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { // Identify unique nodes. int unique_id = 0; - for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter *node = v[i]; if (node == NULL) continue; @@ -211,7 +211,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { entries_.resize(node_map_.size()); // Create parent StdIntMap for the entries. 
- for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; @@ -224,7 +224,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { } // Fill the entries. - for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; @@ -263,8 +263,9 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { (*child_entry->parents)[prefilter->unique_id()] = 1; } } - entry->propagate_up_at_count = - prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; + entry->propagate_up_at_count = prefilter->op() == Prefilter::AND + ? static_cast<int>(uniq_child.size()) + : 1; break; } @@ -290,10 +291,10 @@ void PrefilterTree::RegexpsGivenStrings( if (!compiled_) { LOG(WARNING) << "Compile() not called"; for (size_t i = 0; i < prefilter_vec_.size(); ++i) - regexps->push_back(i); + regexps->push_back(static_cast<int>(i)); } else { if (!prefilter_vec_.empty()) { - IntMap regexps_map(prefilter_vec_.size()); + IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); vector<int> matched_atom_ids; for (size_t j = 0; j < matched_atoms.size(); j++) { matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); @@ -313,8 +314,8 @@ void PrefilterTree::RegexpsGivenStrings( void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, IntMap* regexps) const { - IntMap count(entries_.size()); - IntMap work(entries_.size()); + IntMap count(static_cast<int>(entries_.size())); + IntMap work(static_cast<int>(entries_.size())); for (size_t i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { @@ -96,7 +96,7 @@ class Prog { void InitFail(); // Getters - int id(Prog* p) { return this - p->inst_; } + int id(Prog* p) { return static_cast<int>(this - p->inst_); } InstOp opcode() { return 
static_cast<InstOp>(out_opcode_&7); } int out() { return out_opcode_>>3; } int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } @@ -12,7 +12,6 @@ #include <stdio.h> #include <string> #include <errno.h> -#include "util/atomicops.h" #include "util/util.h" #include "util/flags.h" #include "util/sparse_array.h" @@ -289,8 +288,19 @@ int RE2::ProgramFanout(map<int, int>* histogram) const { return histogram->rbegin()->first; } +// Returns num_captures_, computing it if needed, or -1 if the +// regexp wasn't valid on construction. +int RE2::NumberOfCapturingGroups() const { + MutexLock l(mutex_); + if (suffix_regexp_ == NULL) + return -1; + if (num_captures_ == -1) + num_captures_ = suffix_regexp_->NumCaptures(); + return num_captures_; +} + // Returns named_groups_, computing it if needed. -const map<string, int>& RE2::NamedCapturingGroups() const { +const map<string, int>& RE2::NamedCapturingGroups() const { MutexLock l(mutex_); if (!ok()) return *empty_named_groups; @@ -303,7 +313,7 @@ const map<string, int>& RE2::NamedCapturingGroups() const { } // Returns group_names_, computing it if needed. 
-const map<int, string>& RE2::CapturingGroupNames() const { +const map<int, string>& RE2::CapturingGroupNames() const { MutexLock l(mutex_); if (!ok()) return *empty_group_names; @@ -375,7 +385,7 @@ bool RE2::Replace(string *str, int nvec = 1 + MaxSubmatch(rewrite); if (nvec > arraysize(vec)) return false; - if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) + if (!re.Match(*str, 0, static_cast<int>(str->size()), UNANCHORED, vec, nvec)) return false; string s; @@ -402,7 +412,8 @@ int RE2::GlobalReplace(string *str, string out; int count = 0; while (p <= ep) { - if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) + if (!re.Match(*str, static_cast<int>(p - str->data()), + static_cast<int>(str->size()), UNANCHORED, vec, nvec)) break; if (p < vec[0].begin()) out.append(p, vec[0].begin() - p); @@ -486,7 +497,7 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { if (prog_ == NULL) return false; - int n = prefix_.size(); + int n = static_cast<int>(prefix_.size()); if (n > maxlen) n = maxlen; @@ -598,7 +609,7 @@ bool RE2::Match(const StringPiece& text, if (!prefix_.empty()) { if (startpos != 0) return false; - prefixlen = prefix_.size(); + prefixlen = static_cast<int>(prefix_.size()); if (prefixlen > subtext.size()) return false; if (prefix_foldcase_) { @@ -839,8 +850,8 @@ bool RE2::DoMatch(const StringPiece& text, return false; } - if(consumed != NULL) - *consumed = vec[0].end() - text.begin(); + if (consumed != NULL) + *consumed = static_cast<int>(vec[0].end() - text.begin()); if (n == 0 || args == NULL) { // We are not interested in results @@ -907,20 +918,6 @@ bool RE2::Rewrite(string *out, const StringPiece &rewrite, return true; } -// Return the number of capturing subpatterns, or -1 if the -// regexp wasn't valid on construction. 
-int RE2::NumberOfCapturingGroups() const { - if (suffix_regexp_ == NULL) - return -1; - int n; - ATOMIC_LOAD_RELAXED(n, &num_captures_); - if (n == -1) { - n = suffix_regexp_->NumCaptures(); - ATOMIC_STORE_RELAXED(&num_captures_, n); - } - return n; -} - // Checks that the rewrite string is well-formed with respect to this // regular expression. bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { @@ -447,7 +447,6 @@ class RE2 { // does not count: if the regexp is "(a)(b)", returns 2. int NumberOfCapturingGroups() const; - // Return a map from names to capturing indices. // The map records the index of the leftmost group // with the given name. @@ -466,8 +465,8 @@ class RE2 { // On a successful match, fills in match[] (up to nmatch entries) // with information about submatches. // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, - // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar", - // match[3] = NULL, ..., up to match[nmatch-1] = NULL. + // setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar", + // match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL. // // Don't ask for more match information than you will use: // runs much faster with nmatch == 1 than nmatch > 1, and @@ -478,7 +477,7 @@ class RE2 { // Passing text == StringPiece(NULL, 0) will be handled like any other // empty string, but note that on return, it will not be possible to tell // whether submatch i matched the empty string or did not match: - // either way, match[i] == NULL. + // either way, match[i].data() == NULL. 
bool Match(const StringPiece& text, int startpos, int endpos, diff --git a/re2/regexp.cc b/re2/regexp.cc index d3aa1f0..99e72e5 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -453,10 +453,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { continue; } - int n = stk.size(); + size_t n = stk.size(); if (n == 0) break; + DCHECK_GE(n, 2); a = stk[n-2]; b = stk[n-1]; stk.resize(n-2); @@ -677,7 +678,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { } break; } - *foldcase = (sub[i]->parse_flags() & FoldCase); + *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; i++; // The rest. diff --git a/re2/regexp.h b/re2/regexp.h index b49ce0d..5f222b7 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -313,7 +313,7 @@ class Regexp { // Get. No set, Regexps are logically immutable once created. RegexpOp op() { return static_cast<RegexpOp>(op_); } int nsub() { return nsub_; } - bool simple() { return simple_; } + bool simple() { return simple_ != 0; } enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } int Ref(); // For testing. @@ -45,7 +45,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) { } // Concatenate with match index and push on vector. - int n = re_.size(); + int n = static_cast<int>(re_.size()); re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); if (re->op() == kRegexpConcat) { int nsub = re->nsub(); @@ -76,7 +76,7 @@ bool RE2::Set::Compile() { Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>( options_.ParseFlags()); re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(re_.data()), - re_.size(), pf); + static_cast<int>(re_.size()), pf); re_.clear(); re2::Regexp* sre = re->Simplify(); re->Decref(); diff --git a/re2/simplify.cc b/re2/simplify.cc index d14483f..ecc60e7 100644 --- a/re2/simplify.cc +++ b/re2/simplify.cc @@ -61,7 +61,7 @@ bool Regexp::ComputeSimple() { // These are simple as long as the subpieces are simple. 
subs = sub(); for (int i = 0; i < nsub_; i++) - if (!subs[i]->simple_) + if (!subs[i]->simple()) return false; return true; case kRegexpCharClass: @@ -71,12 +71,12 @@ bool Regexp::ComputeSimple() { return !cc_->empty() && !cc_->full(); case kRegexpCapture: subs = sub(); - return subs[0]->simple_; + return subs[0]->simple(); case kRegexpStar: case kRegexpPlus: case kRegexpQuest: subs = sub(); - if (!subs[0]->simple_) + if (!subs[0]->simple()) return false; switch (subs[0]->op_) { case kRegexpStar: @@ -438,7 +438,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { } Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { - if (re->simple_) { + if (re->simple()) { *stop = true; return re->Incref(); } diff --git a/re2/stringpiece.cc b/re2/stringpiece.cc index c243527..00f478a 100644 --- a/re2/stringpiece.cc +++ b/re2/stringpiece.cc @@ -37,17 +37,19 @@ void StringPiece::AppendToString(string* target) const { target->append(ptr_, length_); } -int StringPiece::copy(char* buf, size_type n, size_type pos) const { - int ret = min(length_ - pos, n); +StringPiece::size_type StringPiece::copy(char* buf, size_type n, + size_type pos) const { + size_type ret = min(length_ - pos, n); memcpy(buf, ptr_ + pos, ret); return ret; } bool StringPiece::contains(StringPiece s) const { - return (size_t)find(s, 0) != npos; + return find(s, 0) != npos; } -int StringPiece::find(const StringPiece& s, size_type pos) const { +StringPiece::size_type StringPiece::find(const StringPiece& s, + size_type pos) const { if (length_ < 0 || pos > static_cast<size_type>(length_)) return npos; @@ -57,7 +59,7 @@ int StringPiece::find(const StringPiece& s, size_type pos) const { return xpos + s.length_ <= static_cast<size_type>(length_) ? 
xpos : npos; } -int StringPiece::find(char c, size_type pos) const { +StringPiece::size_type StringPiece::find(char c, size_type pos) const { if (length_ <= 0 || pos >= static_cast<size_type>(length_)) { return npos; } @@ -65,9 +67,10 @@ int StringPiece::find(char c, size_type pos) const { return result != ptr_ + length_ ? result - ptr_ : npos; } -int StringPiece::rfind(const StringPiece& s, size_type pos) const { +StringPiece::size_type StringPiece::rfind(const StringPiece& s, + size_type pos) const { if (length_ < s.length_) return npos; - const size_t ulen = length_; + const size_type ulen = length_; if (s.length_ == 0) return min(ulen, pos); const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_; @@ -75,9 +78,9 @@ int StringPiece::rfind(const StringPiece& s, size_type pos) const { return result != last ? result - ptr_ : npos; } -int StringPiece::rfind(char c, size_type pos) const { +StringPiece::size_type StringPiece::rfind(char c, size_type pos) const { if (length_ <= 0) return npos; - for (int i = min(pos, static_cast<size_type>(length_ - 1)); + for (int i = static_cast<int>(min(pos, static_cast<size_type>(length_ - 1))); i >= 0; --i) { if (ptr_[i] == c) { return i; @@ -89,7 +92,7 @@ int StringPiece::rfind(char c, size_type pos) const { StringPiece StringPiece::substr(size_type pos, size_type n) const { if (pos > static_cast<size_type>(length_)) pos = static_cast<size_type>(length_); if (n > length_ - pos) n = length_ - pos; - return StringPiece(ptr_ + pos, n); + return StringPiece(ptr_ + pos, static_cast<int>(n)); } const StringPiece::size_type StringPiece::npos = size_type(-1); diff --git a/re2/stringpiece.h b/re2/stringpiece.h index bc8bf40..1479d1a 100644 --- a/re2/stringpiece.h +++ b/re2/stringpiece.h @@ -137,17 +137,17 @@ class StringPiece { int max_size() const { return length_; } int capacity() const { return length_; } - int copy(char* buf, size_type n, size_type pos = 0) const; + size_type copy(char* buf, size_type n, size_type pos = 0) 
const; bool contains(StringPiece s) const; - int find(const StringPiece& s, size_type pos = 0) const; - int find(char c, size_type pos = 0) const; - int rfind(const StringPiece& s, size_type pos = npos) const; - int rfind(char c, size_type pos = npos) const; + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; StringPiece substr(size_type pos, size_type n = npos) const; - + static bool _equal(const StringPiece&, const StringPiece&); }; diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index b2dd6db..a872840 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -72,7 +72,7 @@ class Backtracker { // Search state const char* cap_[64]; // capture registers uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked - int nvisited_; // # of words in bitmap + size_t nvisited_; // # of words in bitmap }; Backtracker::Backtracker(Prog* prog) @@ -150,7 +150,7 @@ bool Backtracker::Visit(int id, const char* p) { // either it didn't match or it did but we're hoping for a better match. // Either way, don't go down that road again. CHECK(p <= text_.end()); - int n = id*(text_.size()+1) + (p - text_.begin()); + size_t n = id*(text_.size()+1) + (p - text_.begin()); CHECK_LT(n/32, nvisited_); if (visited_[n/32] & (1 << (n&31))) return false; @@ -212,7 +212,8 @@ bool Backtracker::Visit(int id, const char* p) { if (submatch_[0].data() == NULL || // First match so far ... (longest_ && p > submatch_[0].end())) { // ... 
or better match for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + submatch_[i].set(cap_[2*i], + static_cast<int>(cap_[2*i+1] - cap_[2*i])); } return true; diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index d438b19..dee90a3 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -172,4 +172,90 @@ TEST(TestCompile, ByteRanges) { re->Decref(); } +static void Dump(StringPiece pattern, Regexp::ParseFlags flags, + string* forward, string* reverse) { + Regexp* re = Regexp::Parse(pattern, flags, NULL); + EXPECT_TRUE(re != NULL); + + if (forward != NULL) { + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + *forward = prog->Dump(); + delete prog; + } + + if (reverse != NULL) { + Prog* prog = re->CompileToReverseProg(0); + EXPECT_TRUE(prog != NULL); + *reverse = prog->Dump(); + delete prog; + } + + re->Decref(); +} + +TEST(TestCompile, Bug26705922) { + // Bug in the compiler caused inefficient bytecode to be generated for Unicode + // groups: common suffixes were cached, but common prefixes were not factored. + + string forward, reverse; + + Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("4. byte [f0-f0] -> 3\n" + "3. byte [90-90] -> 2\n" + "2. byte [80-80] -> 6\n" + "6. alt -> 1 | 5\n" + "1. byte [80-80] -> 7\n" + "5. byte [90-90] -> 7\n" + "7. match! 0\n", + forward); + EXPECT_EQ("6. alt -> 4 | 5\n" + "4. byte [80-80] -> 3\n" + "5. byte [90-90] -> 3\n" + "3. byte [80-80] -> 2\n" + "2. byte [90-90] -> 1\n" + "1. byte [f0-f0] -> 7\n" + "7. match! 0\n", + reverse); + + Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("6. alt -> 3 | 5\n" + "3. byte [e8-ef] -> 2\n" + "5. byte [f0-f0] -> 4\n" + "2. byte [80-bf] -> 1\n" + "4. byte [90-90] -> 2\n" + "1. byte [80-bf] -> 7\n" + "7. match! 0\n", + forward); + EXPECT_EQ("3. byte [80-bf] -> 2\n" + "2. byte [80-bf] -> 6\n" + "6. alt -> 1 | 5\n" + "1. 
byte [e8-ef] -> 7\n" + "5. byte [90-90] -> 4\n" + "7. match! 0\n" + "4. byte [f0-f0] -> 7\n", + reverse); + + Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse); + EXPECT_EQ("2. byte [80-bf] -> 8\n" + "8. alt -> 5 | 7\n" + "5. alt -> 1 | 4\n" + "7. byte [80-bf] -> 17\n" + "1. byte [c2-df] -> 18\n" + "4. byte [a0-bf] -> 3\n" + "17. alt -> 14 | 16\n" + "18. match! 0\n" + "3. byte [e0-e0] -> 18\n" + "14. alt -> 11 | 13\n" + "16. byte [80-8f] -> 15\n" + "11. alt -> 6 | 10\n" + "13. byte [80-bf] -> 12\n" + "15. byte [f4-f4] -> 18\n" + "6. byte [e1-ef] -> 18\n" + "10. byte [90-bf] -> 9\n" + "12. byte [f1-f3] -> 18\n" + "9. byte [f0-f0] -> 18\n", + reverse); +} + } // namespace re2 diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc index 6294d03..e9c7bef 100644 --- a/re2/testing/dfa_test.cc +++ b/re2/testing/dfa_test.cc @@ -10,6 +10,8 @@ #include "re2/testing/regexp_generator.h" #include "re2/testing/string_generator.h" +static const bool UsingMallocCounter = false; + DECLARE_bool(re2_dfa_bail_when_slow); DEFINE_int32(size, 8, "log2(number of DFA nodes)"); @@ -92,14 +94,13 @@ TEST(SingleThreaded, BuildEntireDFA) { s += "[ab]"; s += "b"; - //LOG(INFO) << s; Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); CHECK(re); int max = 24; for (int i = 17; i < max; i++) { - int limit = 1<<i; - int usage; - //int progusage, dfamem; + int64 limit = 1<<i; + int64 usage; + //int64 progusage, dfamem; { testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); Prog* prog = re->CompileToProg(limit); @@ -113,8 +114,10 @@ TEST(SingleThreaded, BuildEntireDFA) { } if (!UsingMallocCounter) continue; - //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n", - // limit, progusage, dfamem, usage); + //LOG(INFO) << "limit " << limit << ", " + // << "prog usage " << progusage << ", " + // << "DFA budget " << dfamem << ", " + // << "total " << usage; // Tolerate +/- 10%. 
CHECK_GT(usage, limit*9/10); CHECK_LT(usage, limit*11/10); @@ -133,7 +136,7 @@ TEST(SingleThreaded, BuildEntireDFA) { // position in the input, never reusing any states until it gets to the // end of the string. This is the worst possible case for DFA execution. static string DeBruijnString(int n) { - CHECK_LT(n, 8*sizeof(int)); + CHECK_LT(n, static_cast<int>(8*sizeof(int))); CHECK_GT(n, 0); vector<bool> did(1<<n); @@ -222,13 +225,13 @@ TEST(SingleThreaded, SearchDFA) { peak_usage = m.PeakHeapGrowth(); delete prog; } - re->Decref(); - if (!UsingMallocCounter) return; - //LOG(INFO) << "usage " << usage << " " << peak_usage; + //LOG(INFO) << "usage " << usage << ", " + // << "peak usage " << peak_usage; CHECK_LT(usage, 1<<n); CHECK_LT(peak_usage, 1<<n); + re->Decref(); } // Helper thread: searches for match, which should match, diff --git a/re2/testing/exhaustive2_test.cc b/re2/testing/exhaustive2_test.cc index c5fec5b..6dc5016 100644 --- a/re2/testing/exhaustive2_test.cc +++ b/re2/testing/exhaustive2_test.cc @@ -23,7 +23,7 @@ TEST(EmptyString, Exhaustive) { TEST(Punctuation, Literals) { vector<string> alphabet = Explode("()*+?{}[]\\^$."); vector<string> escaped = alphabet; - for (int i = 0; i < escaped.size(); i++) + for (size_t i = 0; i < escaped.size(); i++) escaped[i] = "\\" + escaped[i]; ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), 2, alphabet, "", ""); diff --git a/re2/testing/exhaustive3_test.cc b/re2/testing/exhaustive3_test.cc index 5613fcb..6e46bb4 100644 --- a/re2/testing/exhaustive3_test.cc +++ b/re2/testing/exhaustive3_test.cc @@ -84,7 +84,7 @@ TEST(InterestingUTF8, AB) { "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); vector<string> ops; // no ops vector<string> alpha = InterestingUTF8(); - for (int i = 0; i < alpha.size(); i++) + for (size_t i = 0; i < alpha.size(); i++) alpha[i] = "a" + alpha[i] + "b"; ExhaustiveTest(1, 0, atoms, ops, 1, alpha, "a%sb", ""); diff --git a/re2/testing/filtered_re2_test.cc 
b/re2/testing/filtered_re2_test.cc index f4376ee..76c1284 100644 --- a/re2/testing/filtered_re2_test.cc +++ b/re2/testing/filtered_re2_test.cc @@ -189,18 +189,16 @@ TEST(FilteredRE2Test, AtomTests) { EXPECT_EQ(0, nfail); } -void FindAtomIndices(const vector<string> atoms, - const vector<string> matched_atoms, +void FindAtomIndices(const vector<string>& atoms, + const vector<string>& matched_atoms, vector<int>* atom_indices) { atom_indices->clear(); for (size_t i = 0; i < matched_atoms.size(); i++) { - size_t j = 0; - for (; j < atoms.size(); j++) { + for (size_t j = 0; j < atoms.size(); j++) { if (matched_atoms[i] == atoms[j]) { - atom_indices->push_back(j); + atom_indices->push_back(static_cast<int>(j)); break; } - EXPECT_LT(j, atoms.size()); } } } diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index 29a79b1..e204ce1 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -133,6 +133,9 @@ static Test tests[] = { { "\\Q+\\E+", "plus{lit{+}}" }, { "\\Q\\\\E", "lit{\\}" }, { "\\Q\\\\\\E", "str{\\\\}" }, + { "\\Qa\\E*", "star{lit{a}}" }, + { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" }, + { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" }, // Test Perl \A and \z { "(?m)^", "bol{}" }, @@ -300,8 +303,8 @@ Test prefix_tests[] = { { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" }, { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, - { "(?:xx|yy)c|(?:xx|yy)d", - "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" }, + { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" }, + { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" }, { "x{2}|x{2}[0-9]", "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, { "x{2}y|x{2}[0-9]y", @@ -314,6 +317,10 @@ Test prefix_tests[] = { "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" }, { "rs|r|n", "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" }, + { "a\\C*?c|a\\C*?b", + "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" }, + { "^/a/bc|^/a/de", + 
"cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" }, }; // Test that prefix factoring works. @@ -362,6 +369,7 @@ const char* badtests[] = { "a{100000,}", "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", "(((x{7}){11}){13})", + "\\Q\\E*", }; // Valid in Perl, bad in POSIX diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc index ad6c936..d843ffa 100644 --- a/re2/testing/re2_arg_test.cc +++ b/re2/testing/re2_arg_test.cc @@ -95,7 +95,7 @@ const int kNumStrings = arraysize(kSuccessTable); for (int i = 0; i < kNumStrings; ++i) { \ RE2::Arg arg(&r); \ const char* const p = kSuccessTable[i].value_string; \ - bool retval = arg.Parse(p, strlen(p)); \ + bool retval = arg.Parse(p, static_cast<int>(strlen(p))); \ bool success = kSuccessTable[i].success[column]; \ EXPECT_EQ(retval, success) \ << "Parsing '" << p << "' for type " #type " should return " \ diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 78e0bc5..e201c1e 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -176,7 +176,7 @@ TEST(RE2, Replace) { { "", NULL, NULL, NULL, NULL, 0 } }; - for (const ReplaceTest *t = tests; t->original != NULL; ++t) { + for (const ReplaceTest* t = tests; t->original != NULL; t++) { VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite); string one(t->original); CHECK(RE2::Replace(&one, t->regexp, t->rewrite)); @@ -369,12 +369,12 @@ TEST(RE2, Match) { CHECK_EQ(port, 9000); } -static void TestRecursion(int size, const char *pattern) { +static void TestRecursion(int size, const char* pattern) { // Fill up a string repeating the pattern given string domain; domain.resize(size); - int patlen = strlen(pattern); - for (int i = 0; i < size; ++i) { + size_t patlen = strlen(pattern); + for (int i = 0; i < size; i++) { domain[i] = pattern[i % patlen]; } // Just make sure it doesn't crash due to too much recursion. @@ -1410,12 +1410,56 @@ TEST(RE2, UnicodeClasses) { // Bug reported by saito. 
2009/02/17 TEST(RE2, NullVsEmptyString) { - RE2 re2(".*"); - StringPiece v1(""); - EXPECT_TRUE(RE2::FullMatch(v1, re2)); + RE2 re(".*"); + EXPECT_TRUE(re.ok()); + + StringPiece null; + EXPECT_TRUE(RE2::FullMatch(null, re)); - StringPiece v2; - EXPECT_TRUE(RE2::FullMatch(v2, re2)); + StringPiece empty(""); + EXPECT_TRUE(RE2::FullMatch(empty, re)); +} + +// Similar to the previous test, check that the null string and the empty +// string both match, but also that the null string can only provide null +// submatches whereas the empty string can also provide empty submatches. +TEST(RE2, NullVsEmptyStringSubmatches) { + RE2 re("()|(foo)"); + EXPECT_TRUE(re.ok()); + + // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. + StringPiece matches[4]; + + for (int i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece null; + EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + for (int i = 0; i < arraysize(matches); i++) { + EXPECT_TRUE(matches[i] == NULL); + EXPECT_TRUE(matches[i].data() == NULL); // always null + EXPECT_TRUE(matches[i] == ""); + } + + for (int i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece empty(""); + EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + EXPECT_TRUE(matches[0] == NULL); + EXPECT_TRUE(matches[0].data() != NULL); // empty, not null + EXPECT_TRUE(matches[0] == ""); + EXPECT_TRUE(matches[1] == NULL); + EXPECT_TRUE(matches[1].data() != NULL); // empty, not null + EXPECT_TRUE(matches[1] == ""); + EXPECT_TRUE(matches[2] == NULL); + EXPECT_TRUE(matches[2].data() == NULL); + EXPECT_TRUE(matches[2] == ""); + EXPECT_TRUE(matches[3] == NULL); + EXPECT_TRUE(matches[3].data() == NULL); + EXPECT_TRUE(matches[3] == ""); } // Issue 1816809 @@ -1529,4 +1573,23 @@ TEST(RE2, Bug21371806) { CHECK(re.ok()); } +TEST(RE2, Bug26356109) { + // Bug in parser caused by factoring of common prefixes in alternations. 
+ + // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would + // consume "ab" and then stop (when unanchored) whereas it should consume all + // of "abc" as per first-match semantics. + RE2 re("a\\C*?c|a\\C*?b"); + CHECK(re.ok()); + + string s = "abc"; + StringPiece m; + + CHECK(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + CHECK_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'"; + + CHECK(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1)); + CHECK_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'"; +} + } // namespace re2 diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index bc8daaa..141bad1 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -265,6 +265,7 @@ BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPU BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs()); // Benchmark: FindAndConsume + void FindAndConsume(int iters, int nbytes) { StopBenchmarkTiming(); string s; @@ -286,9 +287,11 @@ BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs()); // Benchmark: successful anchored search. void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search) { + StopBenchmarkTiming(); string s; MakeText(&s, nbytes); BenchmarkMemoryUsage(); + StartBenchmarkTiming(); search(iters, regexp, s, Prog::kAnchored, true); SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes); } @@ -346,11 +349,9 @@ BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCP // Benchmark: use regexp to find phone number. 
void SearchDigits(int iters, SearchImpl* search) { - const char *text = "650-253-0001"; - int len = strlen(text); + StringPiece s("650-253-0001"); BenchmarkMemoryUsage(); - search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", - StringPiece(text, len), Prog::kAnchored, true); + search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true); SetBenchmarkItemsProcessed(iters); } @@ -688,7 +689,6 @@ BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs()); BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs()); BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); - // Makes text of size nbytes, then calls run to search // the text for regexp iters times. void SearchPhone(int iters, int nbytes, ParseImpl* search) { @@ -1344,14 +1344,14 @@ BENCHMARK(HTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); -static string http_smalltext = +static string smallhttp_text = "GET /abc HTTP/1.1"; void SmallHTTPPartialMatchPCRE(int n) { StringPiece a; PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); for (int i = 0; i < n; i++) { - PCRE::PartialMatch(http_text, re, &a); + PCRE::PartialMatch(smallhttp_text, re, &a); } } @@ -1359,7 +1359,7 @@ void SmallHTTPPartialMatchRE2(int n) { StringPiece a; RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); for (int i = 0; i < n; i++) { - RE2::PartialMatch(http_text, re, &a); + RE2::PartialMatch(smallhttp_text, re, &a); } } diff --git a/re2/testing/regexp_generator.cc b/re2/testing/regexp_generator.cc index d10b9a8..fd085db 100644 --- a/re2/testing/regexp_generator.cc +++ b/re2/testing/regexp_generator.cc @@ -134,7 +134,7 @@ void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk, // Generates a random postfix command sequence. // Stops and returns true once a single sequence has been generated. 
-bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, +bool RegexpGenerator::GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int atoms) { for (;;) { // Stop if we get to a single element, but only sometimes. @@ -151,7 +151,7 @@ bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, // Add operators if there are enough arguments. if (ops < maxops_ && acm_->Uniform(2) == 0) { - const string& fmt = ops_[acm_->Uniform(ops_.size())]; + const string& fmt = ops_[acm_->Uniform(static_cast<int32>(ops_.size()))]; int nargs = CountArgs(fmt); if (nargs <= nstk) { post->push_back(fmt); @@ -165,7 +165,7 @@ bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk, // Add atoms if there is room. if (atoms < maxatoms_ && acm_->Uniform(2) == 0) { - post->push_back(atoms_[acm_->Uniform(atoms_.size())]); + post->push_back(atoms_[acm_->Uniform(static_cast<int32>(atoms_.size()))]); bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); post->pop_back(); if (ret) diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc index a0e7f0b..31c76a3 100644 --- a/re2/testing/regexp_test.cc +++ b/re2/testing/regexp_test.cc @@ -32,7 +32,8 @@ TEST(Regexp, BigConcat) { for (size_t i = 0; i < v.size(); i++) x->Incref(); CHECK_EQ(x->Ref(), 1 + static_cast<int>(v.size())) << x->Ref(); - Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags); + Regexp* re = Regexp::Concat(v.data(), static_cast<int>(v.size()), + Regexp::NoParseFlags); CHECK_EQ(re->ToString(), string(v.size(), 'x')); re->Decref(); CHECK_EQ(x->Ref(), 1) << x->Ref(); diff --git a/re2/testing/string_generator.cc b/re2/testing/string_generator.cc index 728ce17..f96ff20 100644 --- a/re2/testing/string_generator.cc +++ b/re2/testing/string_generator.cc @@ -43,14 +43,14 @@ void StringGenerator::Reset() { // Returns false if all the numbers have been used. 
bool StringGenerator::IncrementDigits() { // First try to increment the current number. - for (int i = digits_.size() - 1; i >= 0; i--) { + for (int i = static_cast<int>(digits_.size()) - 1; i >= 0; i--) { if (++digits_[i] < static_cast<int>(alphabet_.size())) return true; digits_[i] = 0; } // If that failed, make a longer number. - if (digits_.size() < static_cast<size_t>(maxlen_)) { + if (static_cast<int>(digits_.size()) < maxlen_) { digits_.push_back(0); return true; } @@ -68,7 +68,7 @@ bool StringGenerator::RandomDigits() { int len = acm_->Uniform(maxlen_+1); digits_.resize(len); for (int i = 0; i < len; i++) - digits_[i] = acm_->Uniform(alphabet_.size()); + digits_[i] = acm_->Uniform(static_cast<int32>(alphabet_.size())); return true; } @@ -110,4 +110,3 @@ void StringGenerator::GenerateNULL() { } } // namespace re2 - diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc index 20b12e7..cb12bad 100644 --- a/re2/testing/tester.cc +++ b/re2/testing/tester.cc @@ -392,10 +392,13 @@ void TestInstance::RunSearch(Engine type, if (kind_ == Prog::kFullMatch) re_anchor = RE2::ANCHOR_BOTH; - result->matched = re2_->Match(context, - text.begin() - context.begin(), - text.end() - context.begin(), - re_anchor, result->submatch, nsubmatch); + result->matched = re2_->Match( + context, + static_cast<int>(text.begin() - context.begin()), + static_cast<int>(text.end() - context.begin()), + re_anchor, + result->submatch, + nsubmatch); result->have_submatch = nsubmatch > 0; break; } diff --git a/re2/tostring.cc b/re2/tostring.cc index c59d4d9..0230c8c 100644 --- a/re2/tostring.cc +++ b/re2/tostring.cc @@ -156,12 +156,14 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, break; case kRegexpLiteral: - AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase); + AppendLiteral(t_, re->rune(), + (re->parse_flags() & Regexp::FoldCase) != 0); break; case kRegexpLiteralString: for (int i = 0; i < re->nrunes(); i++) - AppendLiteral(t_, re->runes()[i], 
re->parse_flags() & Regexp::FoldCase); + AppendLiteral(t_, re->runes()[i], + (re->parse_flags() & Regexp::FoldCase) != 0); if (prec < PrecConcat) t_->append(")"); break; diff --git a/re2/unicode.py b/re2/unicode.py index 6dfe87b..4b2240c 100644 --- a/re2/unicode.py +++ b/re2/unicode.py @@ -9,7 +9,7 @@ import re import urllib2 # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd" +_UNICODE_DIR = "http://www.unicode.org/Public/8.0.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc index 2293cc7..91a96b4 100644 --- a/re2/unicode_casefold.cc +++ b/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1034 groups, 2089 pairs, 289 ranges +// 1224 groups, 2469 pairs, 314 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -105,13 +105,16 @@ const CaseFold unicode_casefold[] = { { 598, 599, -205 }, { 601, 601, -202 }, { 603, 603, -203 }, + { 604, 604, 42319 }, { 608, 608, -205 }, + { 609, 609, 42315 }, { 611, 611, -207 }, { 613, 613, 42280 }, { 614, 614, 42308 }, { 616, 616, -209 }, { 617, 617, -211 }, { 619, 619, 10743 }, + { 620, 620, 42305 }, { 623, 623, -211 }, { 625, 625, 10749 }, { 626, 626, -213 }, @@ -119,15 +122,19 @@ const CaseFold unicode_casefold[] = { { 637, 637, 10727 }, { 640, 640, -218 }, { 643, 643, -218 }, + { 647, 647, 42282 }, { 648, 648, -218 }, { 649, 649, -69 }, { 650, 651, -217 }, { 652, 652, -71 }, { 658, 658, -219 }, + { 669, 669, 42261 }, + { 670, 670, 42258 }, { 837, 837, 84 }, { 880, 883, EvenOdd }, { 886, 887, EvenOdd }, { 891, 893, 130 }, + { 895, 895, 116 }, { 902, 902, 38 }, { 904, 906, 37 }, { 908, 908, 64 }, @@ -168,6 +175,7 @@ const CaseFold unicode_casefold[] = { { 1008, 1008, -86 }, { 1009, 1009, -80 }, { 1010, 1010, 7 }, + { 1011, 1011, -116 }, { 1012, 1012, -92 }, { 1013, 1013, -96 }, { 1015, 1016, OddEven }, @@ -183,12 +191,15 @@ const CaseFold unicode_casefold[] = { { 
1216, 1216, 15 }, { 1217, 1230, OddEven }, { 1231, 1231, -15 }, - { 1232, 1319, EvenOdd }, + { 1232, 1327, EvenOdd }, { 1329, 1366, 48 }, { 1377, 1414, -48 }, { 4256, 4293, 7264 }, { 4295, 4295, 7264 }, { 4301, 4301, 7264 }, + { 5024, 5103, 38864 }, + { 5104, 5109, 8 }, + { 5112, 5117, -8 }, { 7545, 7545, 35332 }, { 7549, 7549, 3814 }, { 7680, 7776, EvenOdd }, @@ -283,7 +294,7 @@ const CaseFold unicode_casefold[] = { { 11559, 11559, -7264 }, { 11565, 11565, -7264 }, { 42560, 42605, EvenOdd }, - { 42624, 42647, EvenOdd }, + { 42624, 42651, EvenOdd }, { 42786, 42799, EvenOdd }, { 42802, 42863, EvenOdd }, { 42873, 42876, OddEven }, @@ -292,16 +303,30 @@ const CaseFold unicode_casefold[] = { { 42891, 42892, OddEven }, { 42893, 42893, -42280 }, { 42896, 42899, EvenOdd }, - { 42912, 42921, EvenOdd }, + { 42902, 42921, EvenOdd }, { 42922, 42922, -42308 }, + { 42923, 42923, -42319 }, + { 42924, 42924, -42315 }, + { 42925, 42925, -42305 }, + { 42928, 42928, -42258 }, + { 42929, 42929, -42282 }, + { 42930, 42930, -42261 }, + { 42931, 42931, 928 }, + { 42932, 42935, EvenOdd }, + { 43859, 43859, -928 }, + { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 65345, 65370, -32 }, { 66560, 66599, 40 }, { 66600, 66639, -40 }, + { 68736, 68786, 64 }, + { 68800, 68850, -64 }, + { 71840, 71871, 32 }, + { 71872, 71903, -32 }, }; -const int num_unicode_casefold = 289; +const int num_unicode_casefold = 314; -// 1034 groups, 1055 pairs, 167 ranges +// 1224 groups, 1245 pairs, 180 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -370,6 +395,7 @@ const CaseFold unicode_tolower[] = { { 837, 837, 116 }, { 880, 882, EvenOddSkip }, { 886, 886, EvenOdd }, + { 895, 895, 116 }, { 902, 902, 38 }, { 904, 906, 37 }, { 908, 908, 64 }, @@ -397,11 +423,12 @@ const CaseFold unicode_tolower[] = { { 1162, 1214, EvenOddSkip }, { 1216, 1216, 15 }, { 1217, 1229, OddEvenSkip }, - { 1232, 1318, EvenOddSkip }, + { 1232, 1326, EvenOddSkip }, { 1329, 1366, 48 }, { 4256, 4293, 7264 }, 
{ 4295, 4295, 7264 }, { 4301, 4301, 7264 }, + { 5112, 5117, -8 }, { 7680, 7828, EvenOddSkip }, { 7835, 7835, -58 }, { 7838, 7838, -7615 }, @@ -457,7 +484,7 @@ const CaseFold unicode_tolower[] = { { 11499, 11501, OddEvenSkip }, { 11506, 11506, EvenOdd }, { 42560, 42604, EvenOddSkip }, - { 42624, 42646, EvenOddSkip }, + { 42624, 42650, EvenOddSkip }, { 42786, 42798, EvenOddSkip }, { 42802, 42862, EvenOddSkip }, { 42873, 42875, OddEvenSkip }, @@ -466,12 +493,23 @@ const CaseFold unicode_tolower[] = { { 42891, 42891, OddEven }, { 42893, 42893, -42280 }, { 42896, 42898, EvenOddSkip }, - { 42912, 42920, EvenOddSkip }, + { 42902, 42920, EvenOddSkip }, { 42922, 42922, -42308 }, + { 42923, 42923, -42319 }, + { 42924, 42924, -42315 }, + { 42925, 42925, -42305 }, + { 42928, 42928, -42258 }, + { 42929, 42929, -42282 }, + { 42930, 42930, -42261 }, + { 42931, 42931, 928 }, + { 42932, 42934, EvenOddSkip }, + { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, + { 68736, 68786, 64 }, + { 71840, 71871, 32 }, }; -const int num_unicode_tolower = 167; +const int num_unicode_tolower = 180; diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc index 0df585e..59087bc 100644 --- a/re2/unicode_groups.cc +++ b/re2/unicode_groups.cc @@ -53,6 +53,7 @@ static const URange16 Ps_range16[] = { { 11812, 11812 }, { 11814, 11814 }, { 11816, 11816 }, + { 11842, 11842 }, { 12296, 12296 }, { 12298, 12298 }, { 12300, 12300 }, @@ -63,7 +64,7 @@ static const URange16 Ps_range16[] = { { 12312, 12312 }, { 12314, 12314 }, { 12317, 12317 }, - { 64830, 64830 }, + { 64831, 64831 }, { 65047, 65047 }, { 65077, 65077 }, { 65079, 65079 }, @@ -97,7 +98,7 @@ static const URange32 Nl_range32[] = { { 66369, 66369 }, { 66378, 66378 }, { 66513, 66517 }, - { 74752, 74850 }, + { 74752, 74862 }, }; static const URange16 No_range16[] = { { 178, 179 }, @@ -132,18 +133,34 @@ static const URange16 No_range16[] = { static const URange32 No_range32[] = { { 65799, 65843 }, { 65909, 65912 }, - { 65930, 65930 
}, + { 65930, 65931 }, + { 66273, 66299 }, { 66336, 66339 }, { 67672, 67679 }, + { 67705, 67711 }, + { 67751, 67759 }, + { 67835, 67839 }, { 67862, 67867 }, + { 68028, 68029 }, + { 68032, 68047 }, + { 68050, 68095 }, { 68160, 68167 }, { 68221, 68222 }, + { 68253, 68255 }, + { 68331, 68335 }, { 68440, 68447 }, { 68472, 68479 }, + { 68521, 68527 }, + { 68858, 68863 }, { 69216, 69246 }, { 69714, 69733 }, + { 70113, 70132 }, + { 71482, 71483 }, + { 71914, 71922 }, + { 93019, 93025 }, { 119648, 119665 }, - { 127232, 127242 }, + { 125127, 125135 }, + { 127232, 127244 }, }; static const URange16 Lo_range16[] = { { 170, 170 }, @@ -168,14 +185,12 @@ static const URange16 Lo_range16[] = { { 1994, 2026 }, { 2048, 2069 }, { 2112, 2136 }, - { 2208, 2208 }, - { 2210, 2220 }, + { 2208, 2228 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, { 2392, 2401 }, - { 2418, 2423 }, - { 2425, 2431 }, + { 2418, 2432 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -206,6 +221,7 @@ static const URange16 Lo_range16[] = { { 2749, 2749 }, { 2768, 2768 }, { 2784, 2785 }, + { 2809, 2809 }, { 2821, 2828 }, { 2831, 2832 }, { 2835, 2856 }, @@ -230,10 +246,9 @@ static const URange16 Lo_range16[] = { { 3077, 3084 }, { 3086, 3088 }, { 3090, 3112 }, - { 3114, 3123 }, - { 3125, 3129 }, + { 3114, 3129 }, { 3133, 3133 }, - { 3160, 3161 }, + { 3160, 3162 }, { 3168, 3169 }, { 3205, 3212 }, { 3214, 3216 }, @@ -249,7 +264,7 @@ static const URange16 Lo_range16[] = { { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, - { 3424, 3425 }, + { 3423, 3425 }, { 3450, 3455 }, { 3461, 3478 }, { 3482, 3505 }, @@ -306,11 +321,11 @@ static const URange16 Lo_range16[] = { { 4882, 4885 }, { 4888, 4954 }, { 4992, 5007 }, - { 5024, 5108 }, { 5121, 5740 }, { 5743, 5759 }, { 5761, 5786 }, { 5792, 5866 }, + { 5873, 5880 }, { 5888, 5900 }, { 5902, 5905 }, { 5920, 5937 }, @@ -324,11 +339,11 @@ static const URange16 Lo_range16[] = { { 6272, 6312 }, { 6314, 6314 }, { 6320, 6389 }, - { 6400, 6428 }, + { 6400, 6430 }, { 6480, 6509 }, 
{ 6512, 6516 }, { 6528, 6571 }, - { 6593, 6599 }, + { 6576, 6601 }, { 6656, 6678 }, { 6688, 6740 }, { 6917, 6963 }, @@ -364,7 +379,7 @@ static const URange16 Lo_range16[] = { { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40908 }, + { 19968, 40917 }, { 40960, 40980 }, { 40982, 42124 }, { 42192, 42231 }, @@ -373,6 +388,8 @@ static const URange16 Lo_range16[] = { { 42538, 42539 }, { 42606, 42606 }, { 42656, 42725 }, + { 42895, 42895 }, + { 42999, 42999 }, { 43003, 43009 }, { 43011, 43013 }, { 43015, 43018 }, @@ -381,17 +398,21 @@ static const URange16 Lo_range16[] = { { 43138, 43187 }, { 43250, 43255 }, { 43259, 43259 }, + { 43261, 43261 }, { 43274, 43301 }, { 43312, 43334 }, { 43360, 43388 }, { 43396, 43442 }, + { 43488, 43492 }, + { 43495, 43503 }, + { 43514, 43518 }, { 43520, 43560 }, { 43584, 43586 }, { 43588, 43595 }, { 43616, 43631 }, { 43633, 43638 }, { 43642, 43642 }, - { 43648, 43695 }, + { 43646, 43695 }, { 43697, 43697 }, { 43701, 43702 }, { 43705, 43709 }, @@ -443,19 +464,29 @@ static const URange32 Lo_range32[] = { { 65664, 65786 }, { 66176, 66204 }, { 66208, 66256 }, - { 66304, 66334 }, + { 66304, 66335 }, { 66352, 66368 }, { 66370, 66377 }, + { 66384, 66421 }, { 66432, 66461 }, { 66464, 66499 }, { 66504, 66511 }, { 66640, 66717 }, + { 66816, 66855 }, + { 66864, 66915 }, + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, { 67639, 67640 }, { 67644, 67644 }, { 67647, 67669 }, + { 67680, 67702 }, + { 67712, 67742 }, + { 67808, 67826 }, + { 67828, 67829 }, { 67840, 67861 }, { 67872, 67897 }, { 67968, 68023 }, @@ -465,23 +496,70 @@ static const URange32 Lo_range32[] = { { 68117, 68119 }, { 68121, 68147 }, { 68192, 68220 }, + { 68224, 68252 }, + { 68288, 68295 }, + { 68297, 68324 }, { 68352, 68405 }, { 68416, 68437 }, { 68448, 68466 }, + { 68480, 68497 }, { 68608, 68680 }, { 69635, 69687 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, + { 69968, 70002 }, + { 70006, 70006 
}, { 70019, 70066 }, { 70081, 70084 }, + { 70106, 70106 }, + { 70108, 70108 }, + { 70144, 70161 }, + { 70163, 70187 }, + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70312 }, + { 70320, 70366 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70461, 70461 }, + { 70480, 70480 }, + { 70493, 70497 }, + { 70784, 70831 }, + { 70852, 70853 }, + { 70855, 70855 }, + { 71040, 71086 }, + { 71128, 71131 }, + { 71168, 71215 }, + { 71236, 71236 }, { 71296, 71338 }, - { 73728, 74606 }, + { 71424, 71449 }, + { 71935, 71935 }, + { 72384, 72440 }, + { 73728, 74649 }, + { 74880, 75075 }, { 77824, 78894 }, + { 82944, 83526 }, { 92160, 92728 }, + { 92736, 92766 }, + { 92880, 92909 }, + { 92928, 92975 }, + { 93027, 93047 }, + { 93053, 93071 }, { 93952, 94020 }, { 94032, 94032 }, { 110592, 110593 }, + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, + { 124928, 125124 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -518,6 +596,7 @@ static const URange32 Lo_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, + { 178208, 183969 }, { 194560, 195101 }, }; static const URange16 Ll_range16[] = { @@ -786,7 +865,12 @@ static const URange16 Ll_range16[] = { { 1315, 1315 }, { 1317, 1317 }, { 1319, 1319 }, + { 1321, 1321 }, + { 1323, 1323 }, + { 1325, 1325 }, + { 1327, 1327 }, { 1377, 1415 }, + { 5112, 5117 }, { 7424, 7467 }, { 7531, 7543 }, { 7545, 7578 }, @@ -1044,6 +1128,8 @@ static const URange16 Ll_range16[] = { { 42643, 42643 }, { 42645, 42645 }, { 42647, 42647 }, + { 42649, 42649 }, + { 42651, 42651 }, { 42787, 42787 }, { 42789, 42789 }, { 42791, 42791 }, @@ -1093,19 +1179,31 @@ static const URange16 Ll_range16[] = { { 42892, 42892 }, { 42894, 42894 }, { 42897, 42897 }, - { 42899, 42899 }, + { 42899, 42901 }, + { 42903, 42903 }, + { 42905, 42905 }, + { 42907, 42907 }, + { 42909, 42909 }, + { 
42911, 42911 }, { 42913, 42913 }, { 42915, 42915 }, { 42917, 42917 }, { 42919, 42919 }, { 42921, 42921 }, + { 42933, 42933 }, + { 42935, 42935 }, { 43002, 43002 }, + { 43824, 43866 }, + { 43872, 43877 }, + { 43888, 43967 }, { 64256, 64262 }, { 64275, 64279 }, { 65345, 65370 }, }; static const URange32 Ll_range32[] = { { 66600, 66639 }, + { 68800, 68850 }, + { 71872, 71903 }, { 119834, 119859 }, { 119886, 119892 }, { 119894, 119911 }, @@ -1177,18 +1275,22 @@ static const URange16 Lm_range16[] = { { 42232, 42237 }, { 42508, 42508 }, { 42623, 42623 }, + { 42652, 42653 }, { 42775, 42783 }, { 42864, 42864 }, { 42888, 42888 }, { 43000, 43001 }, { 43471, 43471 }, + { 43494, 43494 }, { 43632, 43632 }, { 43741, 43741 }, { 43763, 43764 }, + { 43868, 43871 }, { 65392, 65392 }, { 65438, 65439 }, }; static const URange32 Lm_range32[] = { + { 92992, 92995 }, { 94099, 94111 }, }; static const URange16 Nd_range16[] = { @@ -1205,6 +1307,7 @@ static const URange16 Nd_range16[] = { { 3174, 3183 }, { 3302, 3311 }, { 3430, 3439 }, + { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, { 3872, 3881 }, @@ -1224,6 +1327,7 @@ static const URange16 Nd_range16[] = { { 43216, 43225 }, { 43264, 43273 }, { 43472, 43481 }, + { 43504, 43513 }, { 43600, 43609 }, { 44016, 44025 }, { 65296, 65305 }, @@ -1234,7 +1338,14 @@ static const URange32 Nd_range32[] = { { 69872, 69881 }, { 69942, 69951 }, { 70096, 70105 }, + { 70384, 70393 }, + { 70864, 70873 }, + { 71248, 71257 }, { 71360, 71369 }, + { 71472, 71481 }, + { 71904, 71913 }, + { 92768, 92777 }, + { 93008, 93017 }, { 120782, 120831 }, }; static const URange16 Pc_range16[] = { @@ -1405,6 +1516,7 @@ static const URange16 Lu_range16[] = { { 880, 880 }, { 882, 882 }, { 886, 886 }, + { 895, 895 }, { 902, 902 }, { 904, 906 }, { 908, 908 }, @@ -1524,10 +1636,15 @@ static const URange16 Lu_range16[] = { { 1314, 1314 }, { 1316, 1316 }, { 1318, 1318 }, + { 1320, 1320 }, + { 1322, 1322 }, + { 1324, 1324 }, + { 1326, 1326 }, { 1329, 1366 }, { 4256, 4293 }, { 
4295, 4295 }, { 4301, 4301 }, + { 5024, 5109 }, { 7680, 7680 }, { 7682, 7682 }, { 7684, 7684 }, @@ -1778,6 +1895,8 @@ static const URange16 Lu_range16[] = { { 42642, 42642 }, { 42644, 42644 }, { 42646, 42646 }, + { 42648, 42648 }, + { 42650, 42650 }, { 42786, 42786 }, { 42788, 42788 }, { 42790, 42790 }, @@ -1827,16 +1946,25 @@ static const URange16 Lu_range16[] = { { 42893, 42893 }, { 42896, 42896 }, { 42898, 42898 }, + { 42902, 42902 }, + { 42904, 42904 }, + { 42906, 42906 }, + { 42908, 42908 }, + { 42910, 42910 }, { 42912, 42912 }, { 42914, 42914 }, { 42916, 42916 }, { 42918, 42918 }, { 42920, 42920 }, - { 42922, 42922 }, + { 42922, 42925 }, + { 42928, 42932 }, + { 42934, 42934 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { { 66560, 66599 }, + { 68736, 68786 }, + { 71840, 71871 }, { 119808, 119833 }, { 119860, 119885 }, { 119912, 119937 }, @@ -1891,6 +2019,7 @@ static const URange16 Pd_range16[] = { { 11799, 11799 }, { 11802, 11802 }, { 11834, 11835 }, + { 11840, 11840 }, { 12316, 12316 }, { 12336, 12336 }, { 12448, 12448 }, @@ -1953,7 +2082,7 @@ static const URange16 Pe_range16[] = { { 12313, 12313 }, { 12315, 12315 }, { 12318, 12319 }, - { 64831, 64831 }, + { 64830, 64830 }, { 65048, 65048 }, { 65078, 65078 }, { 65080, 65080 }, @@ -2068,6 +2197,8 @@ static const URange16 Po_range16[] = { { 11806, 11807 }, { 11818, 11822 }, { 11824, 11833 }, + { 11836, 11839 }, + { 11841, 11841 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -2079,6 +2210,7 @@ static const URange16 Po_range16[] = { { 43124, 43127 }, { 43214, 43215 }, { 43256, 43258 }, + { 43260, 43260 }, { 43310, 43311 }, { 43359, 43359 }, { 43457, 43469 }, @@ -2112,21 +2244,41 @@ static const URange32 Po_range32[] = { { 65792, 65794 }, { 66463, 66463 }, { 66512, 66512 }, + { 66927, 66927 }, { 67671, 67671 }, { 67871, 67871 }, { 67903, 67903 }, { 68176, 68184 }, { 68223, 68223 }, + { 68336, 68342 }, { 68409, 68415 }, + { 68505, 68508 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 
69825 }, { 69952, 69955 }, - { 70085, 70088 }, - { 74864, 74867 }, + { 70004, 70005 }, + { 70085, 70089 }, + { 70093, 70093 }, + { 70107, 70107 }, + { 70109, 70111 }, + { 70200, 70205 }, + { 70313, 70313 }, + { 70854, 70854 }, + { 71105, 71127 }, + { 71233, 71235 }, + { 71484, 71486 }, + { 74864, 74868 }, + { 92782, 92783 }, + { 92917, 92917 }, + { 92983, 92987 }, + { 92996, 92996 }, + { 113823, 113823 }, + { 121479, 121483 }, }; static const URange16 Me_range16[] = { { 1160, 1161 }, + { 6846, 6846 }, { 8413, 8416 }, { 8418, 8420 }, { 42608, 42610 }, @@ -2135,7 +2287,7 @@ static const URange16 C_range16[] = { { 0, 31 }, { 127, 159 }, { 173, 173 }, - { 1536, 1540 }, + { 1536, 1541 }, { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, @@ -2150,6 +2302,7 @@ static const URange16 C_range16[] = { }; static const URange32 C_range32[] = { { 69821, 69821 }, + { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, { 917536, 917631 }, @@ -2221,8 +2374,6 @@ static const URange16 Mc_range16[] = { { 6441, 6443 }, { 6448, 6449 }, { 6451, 6456 }, - { 6576, 6592 }, - { 6600, 6601 }, { 6681, 6682 }, { 6741, 6741 }, { 6743, 6743 }, @@ -2238,7 +2389,6 @@ static const URange16 Mc_range16[] = { { 7073, 7073 }, { 7078, 7079 }, { 7082, 7082 }, - { 7084, 7085 }, { 7143, 7143 }, { 7146, 7148 }, { 7150, 7150 }, @@ -2261,6 +2411,7 @@ static const URange16 Mc_range16[] = { { 43571, 43572 }, { 43597, 43597 }, { 43643, 43643 }, + { 43645, 43645 }, { 43755, 43755 }, { 43758, 43759 }, { 43765, 43765 }, @@ -2279,9 +2430,32 @@ static const URange32 Mc_range32[] = { { 70018, 70018 }, { 70067, 70069 }, { 70079, 70080 }, + { 70188, 70190 }, + { 70194, 70195 }, + { 70197, 70197 }, + { 70368, 70370 }, + { 70402, 70403 }, + { 70462, 70463 }, + { 70465, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70487, 70487 }, + { 70498, 70499 }, + { 70832, 70834 }, + { 70841, 70841 }, + { 70843, 70846 }, + { 70849, 70849 }, + { 71087, 71089 }, + { 71096, 71099 }, + { 71102, 71102 }, + { 71216, 71218 }, + { 
71227, 71228 }, + { 71230, 71230 }, { 71340, 71340 }, { 71342, 71343 }, { 71350, 71350 }, + { 71456, 71457 }, + { 71462, 71462 }, { 94033, 94078 }, { 119141, 119142 }, { 119149, 119154 }, @@ -2310,8 +2484,7 @@ static const URange16 Mn_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2276, 2302 }, - { 2304, 2306 }, + { 2275, 2306 }, { 2362, 2362 }, { 2364, 2364 }, { 2369, 2376 }, @@ -2347,16 +2520,19 @@ static const URange16 Mn_range16[] = { { 2946, 2946 }, { 3008, 3008 }, { 3021, 3021 }, + { 3072, 3072 }, { 3134, 3136 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3170, 3171 }, + { 3201, 3201 }, { 3260, 3260 }, { 3263, 3263 }, { 3270, 3270 }, { 3276, 3277 }, { 3298, 3299 }, + { 3329, 3329 }, { 3393, 3396 }, { 3405, 3405 }, { 3426, 3427 }, @@ -2416,6 +2592,7 @@ static const URange16 Mn_range16[] = { { 6757, 6764 }, { 6771, 6780 }, { 6783, 6783 }, + { 6832, 6845 }, { 6912, 6915 }, { 6964, 6964 }, { 6966, 6970 }, @@ -2425,7 +2602,7 @@ static const URange16 Mn_range16[] = { { 7040, 7041 }, { 7074, 7077 }, { 7080, 7081 }, - { 7083, 7083 }, + { 7083, 7085 }, { 7142, 7142 }, { 7144, 7145 }, { 7149, 7149 }, @@ -2437,7 +2614,8 @@ static const URange16 Mn_range16[] = { { 7394, 7400 }, { 7405, 7405 }, { 7412, 7412 }, - { 7616, 7654 }, + { 7416, 7417 }, + { 7616, 7669 }, { 7676, 7679 }, { 8400, 8412 }, { 8417, 8417 }, @@ -2449,7 +2627,7 @@ static const URange16 Mn_range16[] = { { 12441, 12442 }, { 42607, 42607 }, { 42612, 42621 }, - { 42655, 42655 }, + { 42654, 42655 }, { 42736, 42737 }, { 43010, 43010 }, { 43014, 43014 }, @@ -2463,11 +2641,13 @@ static const URange16 Mn_range16[] = { { 43443, 43443 }, { 43446, 43449 }, { 43452, 43452 }, + { 43493, 43493 }, { 43561, 43566 }, { 43569, 43570 }, { 43573, 43574 }, { 43587, 43587 }, { 43596, 43596 }, + { 43644, 43644 }, { 43696, 43696 }, { 43698, 43700 }, { 43703, 43704 }, @@ -2480,35 +2660,74 @@ static const URange16 Mn_range16[] = { { 44013, 44013 }, { 64286, 64286 }, { 65024, 65039 }, - { 65056, 65062 }, + 
{ 65056, 65071 }, }; static const URange32 Mn_range32[] = { { 66045, 66045 }, + { 66272, 66272 }, + { 66422, 66426 }, { 68097, 68099 }, { 68101, 68102 }, { 68108, 68111 }, { 68152, 68154 }, { 68159, 68159 }, + { 68325, 68326 }, { 69633, 69633 }, { 69688, 69702 }, - { 69760, 69761 }, + { 69759, 69761 }, { 69811, 69814 }, { 69817, 69818 }, { 69888, 69890 }, { 69927, 69931 }, { 69933, 69940 }, + { 70003, 70003 }, { 70016, 70017 }, { 70070, 70078 }, + { 70090, 70092 }, + { 70191, 70193 }, + { 70196, 70196 }, + { 70198, 70199 }, + { 70367, 70367 }, + { 70371, 70378 }, + { 70400, 70401 }, + { 70460, 70460 }, + { 70464, 70464 }, + { 70502, 70508 }, + { 70512, 70516 }, + { 70835, 70840 }, + { 70842, 70842 }, + { 70847, 70848 }, + { 70850, 70851 }, + { 71090, 71093 }, + { 71100, 71101 }, + { 71103, 71104 }, + { 71132, 71133 }, + { 71219, 71226 }, + { 71229, 71229 }, + { 71231, 71232 }, { 71339, 71339 }, { 71341, 71341 }, { 71344, 71349 }, { 71351, 71351 }, + { 71453, 71455 }, + { 71458, 71461 }, + { 71463, 71467 }, + { 92912, 92916 }, + { 92976, 92982 }, { 94095, 94098 }, + { 113821, 113822 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, { 119210, 119213 }, { 119362, 119364 }, + { 121344, 121398 }, + { 121403, 121452 }, + { 121461, 121461 }, + { 121476, 121476 }, + { 121499, 121503 }, + { 121505, 121519 }, + { 125136, 125142 }, { 917760, 917999 }, }; static const URange16 M_range16[] = { @@ -2535,8 +2754,7 @@ static const URange16 M_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2276, 2302 }, - { 2304, 2307 }, + { 2275, 2307 }, { 2362, 2364 }, { 2366, 2383 }, { 2385, 2391 }, @@ -2574,20 +2792,20 @@ static const URange16 M_range16[] = { { 3014, 3016 }, { 3018, 3021 }, { 3031, 3031 }, - { 3073, 3075 }, + { 3072, 3075 }, { 3134, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3170, 3171 }, - { 3202, 3203 }, + { 3201, 3203 }, { 3260, 3260 }, { 3262, 3268 }, { 3270, 3272 }, { 3274, 3277 }, { 3285, 3286 }, { 3298, 3299 }, - { 3330, 3331 
}, + { 3329, 3331 }, { 3390, 3396 }, { 3398, 3400 }, { 3402, 3405 }, @@ -2636,12 +2854,11 @@ static const URange16 M_range16[] = { { 6313, 6313 }, { 6432, 6443 }, { 6448, 6459 }, - { 6576, 6592 }, - { 6600, 6601 }, { 6679, 6683 }, { 6741, 6750 }, { 6752, 6780 }, { 6783, 6783 }, + { 6832, 6846 }, { 6912, 6916 }, { 6964, 6980 }, { 7019, 7027 }, @@ -2653,7 +2870,8 @@ static const URange16 M_range16[] = { { 7380, 7400 }, { 7405, 7405 }, { 7410, 7412 }, - { 7616, 7654 }, + { 7416, 7417 }, + { 7616, 7669 }, { 7676, 7679 }, { 8400, 8432 }, { 11503, 11505 }, @@ -2663,7 +2881,7 @@ static const URange16 M_range16[] = { { 12441, 12442 }, { 42607, 42610 }, { 42612, 42621 }, - { 42655, 42655 }, + { 42654, 42655 }, { 42736, 42737 }, { 43010, 43010 }, { 43014, 43014 }, @@ -2676,10 +2894,11 @@ static const URange16 M_range16[] = { { 43335, 43347 }, { 43392, 43395 }, { 43443, 43456 }, + { 43493, 43493 }, { 43561, 43574 }, { 43587, 43587 }, { 43596, 43597 }, - { 43643, 43643 }, + { 43643, 43645 }, { 43696, 43696 }, { 43698, 43700 }, { 43703, 43704 }, @@ -2691,32 +2910,64 @@ static const URange16 M_range16[] = { { 44012, 44013 }, { 64286, 64286 }, { 65024, 65039 }, - { 65056, 65062 }, + { 65056, 65071 }, }; static const URange32 M_range32[] = { { 66045, 66045 }, + { 66272, 66272 }, + { 66422, 66426 }, { 68097, 68099 }, { 68101, 68102 }, { 68108, 68111 }, { 68152, 68154 }, { 68159, 68159 }, + { 68325, 68326 }, { 69632, 69634 }, { 69688, 69702 }, - { 69760, 69762 }, + { 69759, 69762 }, { 69808, 69818 }, { 69888, 69890 }, { 69927, 69940 }, + { 70003, 70003 }, { 70016, 70018 }, { 70067, 70080 }, + { 70090, 70092 }, + { 70188, 70199 }, + { 70367, 70378 }, + { 70400, 70403 }, + { 70460, 70460 }, + { 70462, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70487, 70487 }, + { 70498, 70499 }, + { 70502, 70508 }, + { 70512, 70516 }, + { 70832, 70851 }, + { 71087, 71093 }, + { 71096, 71104 }, + { 71132, 71133 }, + { 71216, 71232 }, { 71339, 71351 }, + { 71453, 71467 }, + { 92912, 92916 }, + 
{ 92976, 92982 }, { 94033, 94078 }, { 94095, 94098 }, + { 113821, 113822 }, { 119141, 119145 }, { 119149, 119154 }, { 119163, 119170 }, { 119173, 119179 }, { 119210, 119213 }, { 119362, 119364 }, + { 121344, 121398 }, + { 121403, 121452 }, + { 121461, 121461 }, + { 121476, 121476 }, + { 121499, 121503 }, + { 121505, 121519 }, + { 125136, 125142 }, { 917760, 917999 }, }; static const URange16 L_range16[] = { @@ -2735,13 +2986,14 @@ static const URange16 L_range16[] = { { 880, 884 }, { 886, 887 }, { 890, 893 }, + { 895, 895 }, { 902, 902 }, { 904, 906 }, { 908, 908 }, { 910, 929 }, { 931, 1013 }, { 1015, 1153 }, - { 1162, 1319 }, + { 1162, 1327 }, { 1329, 1366 }, { 1369, 1369 }, { 1377, 1415 }, @@ -2767,14 +3019,12 @@ static const URange16 L_range16[] = { { 2084, 2084 }, { 2088, 2088 }, { 2112, 2136 }, - { 2208, 2208 }, - { 2210, 2220 }, + { 2208, 2228 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, { 2392, 2401 }, - { 2417, 2423 }, - { 2425, 2431 }, + { 2417, 2432 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -2805,6 +3055,7 @@ static const URange16 L_range16[] = { { 2749, 2749 }, { 2768, 2768 }, { 2784, 2785 }, + { 2809, 2809 }, { 2821, 2828 }, { 2831, 2832 }, { 2835, 2856 }, @@ -2829,10 +3080,9 @@ static const URange16 L_range16[] = { { 3077, 3084 }, { 3086, 3088 }, { 3090, 3112 }, - { 3114, 3123 }, - { 3125, 3129 }, + { 3114, 3129 }, { 3133, 3133 }, - { 3160, 3161 }, + { 3160, 3162 }, { 3168, 3169 }, { 3205, 3212 }, { 3214, 3216 }, @@ -2848,7 +3098,7 @@ static const URange16 L_range16[] = { { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, - { 3424, 3425 }, + { 3423, 3425 }, { 3450, 3455 }, { 3461, 3478 }, { 3482, 3505 }, @@ -2909,11 +3159,13 @@ static const URange16 L_range16[] = { { 4882, 4885 }, { 4888, 4954 }, { 4992, 5007 }, - { 5024, 5108 }, + { 5024, 5109 }, + { 5112, 5117 }, { 5121, 5740 }, { 5743, 5759 }, { 5761, 5786 }, { 5792, 5866 }, + { 5873, 5880 }, { 5888, 5900 }, { 5902, 5905 }, { 5920, 5937 }, @@ -2927,11 +3179,11 @@ static const 
URange16 L_range16[] = { { 6272, 6312 }, { 6314, 6314 }, { 6320, 6389 }, - { 6400, 6428 }, + { 6400, 6430 }, { 6480, 6509 }, { 6512, 6516 }, { 6528, 6571 }, - { 6593, 6599 }, + { 6576, 6601 }, { 6656, 6678 }, { 6688, 6740 }, { 6823, 6823 }, @@ -3015,21 +3267,20 @@ static const URange16 L_range16[] = { { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40908 }, + { 19968, 40917 }, { 40960, 42124 }, { 42192, 42237 }, { 42240, 42508 }, { 42512, 42527 }, { 42538, 42539 }, { 42560, 42606 }, - { 42623, 42647 }, + { 42623, 42653 }, { 42656, 42725 }, { 42775, 42783 }, { 42786, 42888 }, - { 42891, 42894 }, - { 42896, 42899 }, - { 42912, 42922 }, - { 43000, 43009 }, + { 42891, 42925 }, + { 42928, 42935 }, + { 42999, 43009 }, { 43011, 43013 }, { 43015, 43018 }, { 43020, 43042 }, @@ -3037,17 +3288,21 @@ static const URange16 L_range16[] = { { 43138, 43187 }, { 43250, 43255 }, { 43259, 43259 }, + { 43261, 43261 }, { 43274, 43301 }, { 43312, 43334 }, { 43360, 43388 }, { 43396, 43442 }, { 43471, 43471 }, + { 43488, 43492 }, + { 43494, 43503 }, + { 43514, 43518 }, { 43520, 43560 }, { 43584, 43586 }, { 43588, 43595 }, { 43616, 43638 }, { 43642, 43642 }, - { 43648, 43695 }, + { 43646, 43695 }, { 43697, 43697 }, { 43701, 43702 }, { 43705, 43709 }, @@ -3061,7 +3316,9 @@ static const URange16 L_range16[] = { { 43793, 43798 }, { 43808, 43814 }, { 43816, 43822 }, - { 43968, 44002 }, + { 43824, 43866 }, + { 43868, 43877 }, + { 43888, 44002 }, { 44032, 55203 }, { 55216, 55238 }, { 55243, 55291 }, @@ -3101,19 +3358,29 @@ static const URange32 L_range32[] = { { 65664, 65786 }, { 66176, 66204 }, { 66208, 66256 }, - { 66304, 66334 }, + { 66304, 66335 }, { 66352, 66368 }, { 66370, 66377 }, + { 66384, 66421 }, { 66432, 66461 }, { 66464, 66499 }, { 66504, 66511 }, { 66560, 66717 }, + { 66816, 66855 }, + { 66864, 66915 }, + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, { 67639, 67640 }, { 67644, 67644 }, { 67647, 67669 
}, + { 67680, 67702 }, + { 67712, 67742 }, + { 67808, 67826 }, + { 67828, 67829 }, { 67840, 67861 }, { 67872, 67897 }, { 67968, 68023 }, @@ -3123,24 +3390,74 @@ static const URange32 L_range32[] = { { 68117, 68119 }, { 68121, 68147 }, { 68192, 68220 }, + { 68224, 68252 }, + { 68288, 68295 }, + { 68297, 68324 }, { 68352, 68405 }, { 68416, 68437 }, { 68448, 68466 }, + { 68480, 68497 }, { 68608, 68680 }, + { 68736, 68786 }, + { 68800, 68850 }, { 69635, 69687 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, + { 69968, 70002 }, + { 70006, 70006 }, { 70019, 70066 }, { 70081, 70084 }, + { 70106, 70106 }, + { 70108, 70108 }, + { 70144, 70161 }, + { 70163, 70187 }, + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70312 }, + { 70320, 70366 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70461, 70461 }, + { 70480, 70480 }, + { 70493, 70497 }, + { 70784, 70831 }, + { 70852, 70853 }, + { 70855, 70855 }, + { 71040, 71086 }, + { 71128, 71131 }, + { 71168, 71215 }, + { 71236, 71236 }, { 71296, 71338 }, - { 73728, 74606 }, + { 71424, 71449 }, + { 71840, 71903 }, + { 71935, 71935 }, + { 72384, 72440 }, + { 73728, 74649 }, + { 74880, 75075 }, { 77824, 78894 }, + { 82944, 83526 }, { 92160, 92728 }, + { 92736, 92766 }, + { 92880, 92909 }, + { 92928, 92975 }, + { 92992, 92995 }, + { 93027, 93047 }, + { 93053, 93071 }, { 93952, 94020 }, { 94032, 94032 }, { 94099, 94111 }, { 110592, 110593 }, + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, { 119808, 119892 }, { 119894, 119964 }, { 119966, 119967 }, @@ -3171,6 +3488,7 @@ static const URange32 L_range32[] = { { 120714, 120744 }, { 120746, 120770 }, { 120772, 120779 }, + { 124928, 125124 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -3207,6 +3525,7 @@ static const URange32 L_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, + { 
178208, 183969 }, { 194560, 195101 }, }; static const URange16 N_range16[] = { @@ -3229,6 +3548,7 @@ static const URange16 N_range16[] = { { 3192, 3198 }, { 3302, 3311 }, { 3430, 3445 }, + { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, { 3872, 3891 }, @@ -3271,6 +3591,7 @@ static const URange16 N_range16[] = { { 43216, 43225 }, { 43264, 43273 }, { 43472, 43481 }, + { 43504, 43513 }, { 43600, 43609 }, { 44016, 44025 }, { 65296, 65305 }, @@ -3278,28 +3599,49 @@ static const URange16 N_range16[] = { static const URange32 N_range32[] = { { 65799, 65843 }, { 65856, 65912 }, - { 65930, 65930 }, + { 65930, 65931 }, + { 66273, 66299 }, { 66336, 66339 }, { 66369, 66369 }, { 66378, 66378 }, { 66513, 66517 }, { 66720, 66729 }, { 67672, 67679 }, + { 67705, 67711 }, + { 67751, 67759 }, + { 67835, 67839 }, { 67862, 67867 }, + { 68028, 68029 }, + { 68032, 68047 }, + { 68050, 68095 }, { 68160, 68167 }, { 68221, 68222 }, + { 68253, 68255 }, + { 68331, 68335 }, { 68440, 68447 }, { 68472, 68479 }, + { 68521, 68527 }, + { 68858, 68863 }, { 69216, 69246 }, { 69714, 69743 }, { 69872, 69881 }, { 69942, 69951 }, { 70096, 70105 }, + { 70113, 70132 }, + { 70384, 70393 }, + { 70864, 70873 }, + { 71248, 71257 }, { 71360, 71369 }, - { 74752, 74850 }, + { 71472, 71483 }, + { 71904, 71922 }, + { 74752, 74862 }, + { 92768, 92777 }, + { 93008, 93017 }, + { 93019, 93025 }, { 119648, 119665 }, { 120782, 120831 }, - { 127232, 127242 }, + { 125127, 125135 }, + { 127232, 127244 }, }; static const URange16 Sk_range16[] = { { 94, 94 }, @@ -3325,11 +3667,15 @@ static const URange16 Sk_range16[] = { { 42752, 42774 }, { 42784, 42785 }, { 42889, 42890 }, + { 43867, 43867 }, { 64434, 64449 }, { 65342, 65342 }, { 65344, 65344 }, { 65507, 65507 }, }; +static const URange32 Sk_range32[] = { + { 127995, 127999 }, +}; static const URange16 P_range16[] = { { 33, 35 }, { 37, 42 }, @@ -3416,7 +3762,7 @@ static const URange16 P_range16[] = { { 11518, 11519 }, { 11632, 11632 }, { 11776, 11822 }, - { 11824, 11835 }, + 
{ 11824, 11842 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3432,6 +3778,7 @@ static const URange16 P_range16[] = { { 43124, 43127 }, { 43214, 43215 }, { 43256, 43258 }, + { 43260, 43260 }, { 43310, 43311 }, { 43359, 43359 }, { 43457, 43469 }, @@ -3462,18 +3809,37 @@ static const URange32 P_range32[] = { { 65792, 65794 }, { 66463, 66463 }, { 66512, 66512 }, + { 66927, 66927 }, { 67671, 67671 }, { 67871, 67871 }, { 67903, 67903 }, { 68176, 68184 }, { 68223, 68223 }, + { 68336, 68342 }, { 68409, 68415 }, + { 68505, 68508 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, { 69952, 69955 }, - { 70085, 70088 }, - { 74864, 74867 }, + { 70004, 70005 }, + { 70085, 70089 }, + { 70093, 70093 }, + { 70107, 70107 }, + { 70109, 70111 }, + { 70200, 70205 }, + { 70313, 70313 }, + { 70854, 70854 }, + { 71105, 71127 }, + { 71233, 71235 }, + { 71484, 71486 }, + { 74864, 74868 }, + { 92782, 92783 }, + { 92917, 92917 }, + { 92983, 92987 }, + { 92996, 92996 }, + { 113823, 113823 }, + { 121479, 121483 }, }; static const URange16 S_range16[] = { { 36, 36 }, @@ -3500,7 +3866,7 @@ static const URange16 S_range16[] = { { 900, 901 }, { 1014, 1014 }, { 1154, 1154 }, - { 1423, 1423 }, + { 1421, 1423 }, { 1542, 1544 }, { 1547, 1547 }, { 1550, 1551 }, @@ -3544,7 +3910,7 @@ static const URange16 S_range16[] = { { 8274, 8274 }, { 8314, 8316 }, { 8330, 8332 }, - { 8352, 8378 }, + { 8352, 8382 }, { 8448, 8449 }, { 8451, 8454 }, { 8456, 8457 }, @@ -3559,21 +3925,25 @@ static const URange16 S_range16[] = { { 8512, 8516 }, { 8522, 8525 }, { 8527, 8527 }, + { 8586, 8587 }, { 8592, 8967 }, { 8972, 9000 }, - { 9003, 9203 }, + { 9003, 9210 }, { 9216, 9254 }, { 9280, 9290 }, { 9372, 9449 }, - { 9472, 9983 }, - { 9985, 10087 }, + { 9472, 10087 }, { 10132, 10180 }, { 10183, 10213 }, { 10224, 10626 }, { 10649, 10711 }, { 10716, 10747 }, - { 10750, 11084 }, - { 11088, 11097 }, + { 10750, 11123 }, + { 11126, 11157 }, + { 11160, 11193 }, + { 11197, 11208 }, + { 11210, 11217 }, + { 11244, 
11247 }, { 11493, 11498 }, { 11904, 11929 }, { 11931, 12019 }, @@ -3603,6 +3973,7 @@ static const URange16 S_range16[] = { { 43048, 43051 }, { 43062, 43065 }, { 43639, 43641 }, + { 43867, 43867 }, { 64297, 64297 }, { 64434, 64449 }, { 65020, 65021 }, @@ -3623,15 +3994,23 @@ static const URange16 S_range16[] = { static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, + { 65932, 65932 }, { 65936, 65947 }, + { 65952, 65952 }, { 66000, 66044 }, + { 67703, 67704 }, + { 68296, 68296 }, + { 71487, 71487 }, + { 92988, 92991 }, + { 92997, 92997 }, + { 113820, 113820 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119261 }, + { 119214, 119272 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, @@ -3645,13 +4024,18 @@ static const URange32 S_range32[] = { { 120713, 120713 }, { 120745, 120745 }, { 120771, 120771 }, + { 120832, 121343 }, + { 121399, 121402 }, + { 121453, 121460 }, + { 121462, 121475 }, + { 121477, 121478 }, { 126704, 126705 }, { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, - { 127153, 127166 }, + { 127153, 127167 }, { 127169, 127183 }, - { 127185, 127199 }, + { 127185, 127221 }, { 127248, 127278 }, { 127280, 127339 }, { 127344, 127386 }, @@ -3659,24 +4043,21 @@ static const URange32 S_range32[] = { { 127504, 127546 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 127776 }, - { 127792, 127797 }, - { 127799, 127868 }, - { 127872, 127891 }, - { 127904, 127940 }, - { 127942, 127946 }, - { 127968, 127984 }, - { 128000, 128062 }, - { 128064, 128064 }, - { 128066, 128247 }, - { 128249, 128252 }, - { 128256, 128317 }, - { 128320, 128323 }, - { 128336, 128359 }, - { 128507, 128576 }, - { 128581, 128591 }, - { 128640, 128709 }, + { 127744, 128377 }, + { 128379, 128419 }, + { 128421, 128720 }, + { 128736, 128748 }, + { 128752, 128755 }, { 128768, 128883 }, + { 128896, 128980 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 
}, + { 129120, 129159 }, + { 129168, 129197 }, + { 129296, 129304 }, + { 129408, 129412 }, + { 129472, 129472 }, }; static const URange16 So_range16[] = { { 166, 166 }, @@ -3684,6 +4065,7 @@ static const URange16 So_range16[] = { { 174, 174 }, { 176, 176 }, { 1154, 1154 }, + { 1421, 1422 }, { 1550, 1551 }, { 1758, 1758 }, { 1769, 1769 }, @@ -3726,6 +4108,7 @@ static const URange16 So_range16[] = { { 8522, 8522 }, { 8524, 8525 }, { 8527, 8527 }, + { 8586, 8587 }, { 8597, 8601 }, { 8604, 8607 }, { 8609, 8610 }, @@ -3741,7 +4124,7 @@ static const URange16 So_range16[] = { { 9003, 9083 }, { 9085, 9114 }, { 9140, 9179 }, - { 9186, 9203 }, + { 9186, 9210 }, { 9216, 9254 }, { 9280, 9290 }, { 9372, 9449 }, @@ -3749,13 +4132,17 @@ static const URange16 So_range16[] = { { 9656, 9664 }, { 9666, 9719 }, { 9728, 9838 }, - { 9840, 9983 }, - { 9985, 10087 }, + { 9840, 10087 }, { 10132, 10175 }, { 10240, 10495 }, { 11008, 11055 }, { 11077, 11078 }, - { 11088, 11097 }, + { 11085, 11123 }, + { 11126, 11157 }, + { 11160, 11193 }, + { 11197, 11208 }, + { 11210, 11217 }, + { 11244, 11247 }, { 11493, 11498 }, { 11904, 11929 }, { 11931, 12019 }, @@ -3791,24 +4178,37 @@ static const URange16 So_range16[] = { static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, + { 65932, 65932 }, { 65936, 65947 }, + { 65952, 65952 }, { 66000, 66044 }, + { 67703, 67704 }, + { 68296, 68296 }, + { 71487, 71487 }, + { 92988, 92991 }, + { 92997, 92997 }, + { 113820, 113820 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119261 }, + { 119214, 119272 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, + { 120832, 121343 }, + { 121399, 121402 }, + { 121453, 121460 }, + { 121462, 121475 }, + { 121477, 121478 }, { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, - { 127153, 127166 }, + { 127153, 127167 }, { 127169, 127183 }, - { 127185, 127199 }, + { 127185, 127221 }, { 127248, 127278 }, 
{ 127280, 127339 }, { 127344, 127386 }, @@ -3816,24 +4216,22 @@ static const URange32 So_range32[] = { { 127504, 127546 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 127776 }, - { 127792, 127797 }, - { 127799, 127868 }, - { 127872, 127891 }, - { 127904, 127940 }, - { 127942, 127946 }, - { 127968, 127984 }, - { 128000, 128062 }, - { 128064, 128064 }, - { 128066, 128247 }, - { 128249, 128252 }, - { 128256, 128317 }, - { 128320, 128323 }, - { 128336, 128359 }, - { 128507, 128576 }, - { 128581, 128591 }, - { 128640, 128709 }, + { 127744, 127994 }, + { 128000, 128377 }, + { 128379, 128419 }, + { 128421, 128720 }, + { 128736, 128748 }, + { 128752, 128755 }, { 128768, 128883 }, + { 128896, 128980 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 }, + { 129120, 129159 }, + { 129168, 129197 }, + { 129296, 129304 }, + { 129408, 129412 }, + { 129472, 129472 }, }; static const URange16 Sm_range16[] = { { 43, 43 }, @@ -3914,7 +4312,7 @@ static const URange16 Sc_range16[] = { { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, - { 8352, 8378 }, + { 8352, 8382 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, @@ -3948,7 +4346,7 @@ static const URange16 Cc_range16[] = { }; static const URange16 Cf_range16[] = { { 173, 173 }, - { 1536, 1540 }, + { 1536, 1541 }, { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, @@ -3962,6 +4360,7 @@ static const URange16 Cf_range16[] = { }; static const URange32 Cf_range32[] = { { 69821, 69821 }, + { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, { 917536, 917631 }, @@ -3985,29 +4384,28 @@ static const URange16 Thaana_range16[] = { { 1920, 1969 }, }; static const URange16 Telugu_range16[] = { - { 3073, 3075 }, + { 3072, 3075 }, { 3077, 3084 }, { 3086, 3088 }, { 3090, 3112 }, - { 3114, 3123 }, - { 3125, 3129 }, + { 3114, 3129 }, { 3133, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, - { 3160, 3161 }, + { 3160, 3162 }, { 3168, 3171 }, { 3174, 3183 }, { 3192, 3199 }, }; static const URange16 
Cyrillic_range16[] = { { 1024, 1156 }, - { 1159, 1319 }, + { 1159, 1327 }, { 7467, 7467 }, { 7544, 7544 }, { 11744, 11775 }, - { 42560, 42647 }, - { 42655, 42655 }, + { 42560, 42655 }, + { 65070, 65071 }, }; static const URange16 Hangul_range16[] = { { 4352, 4607 }, @@ -4068,22 +4466,25 @@ static const URange16 Inherited_range16[] = { { 1611, 1621 }, { 1648, 1648 }, { 2385, 2386 }, + { 6832, 6846 }, { 7376, 7378 }, { 7380, 7392 }, { 7394, 7400 }, { 7405, 7405 }, { 7412, 7412 }, - { 7616, 7654 }, + { 7416, 7417 }, + { 7616, 7669 }, { 7676, 7679 }, { 8204, 8205 }, { 8400, 8432 }, { 12330, 12333 }, { 12441, 12442 }, { 65024, 65039 }, - { 65056, 65062 }, + { 65056, 65069 }, }; static const URange32 Inherited_range32[] = { { 66045, 66045 }, + { 66272, 66272 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -4092,7 +4493,13 @@ static const URange32 Inherited_range32[] = { }; static const URange32 Meroitic_Cursive_range32[] = { { 68000, 68023 }, - { 68030, 68031 }, + { 68028, 68047 }, + { 68050, 68095 }, +}; +static const URange32 Ahom_range32[] = { + { 71424, 71449 }, + { 71453, 71467 }, + { 71472, 71487 }, }; static const URange16 Han_range16[] = { { 11904, 11929 }, @@ -4103,7 +4510,7 @@ static const URange16 Han_range16[] = { { 12321, 12329 }, { 12344, 12347 }, { 13312, 19893 }, - { 19968, 40908 }, + { 19968, 40917 }, { 63744, 64109 }, { 64112, 64217 }, }; @@ -4111,14 +4518,18 @@ static const URange32 Han_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, + { 178208, 183969 }, { 194560, 195101 }, }; +static const URange32 Old_North_Arabian_range32[] = { + { 68224, 68255 }, +}; static const URange16 Armenian_range16[] = { { 1329, 1366 }, { 1369, 1375 }, { 1377, 1415 }, { 1418, 1418 }, - { 1423, 1423 }, + { 1421, 1423 }, { 64275, 64279 }, }; static const URange16 Tamil_range16[] = { @@ -4144,6 +4555,10 @@ static const URange16 Bopomofo_range16[] = { { 12549, 12589 }, { 12704, 12730 }, }; +static const URange32 Bassa_Vah_range32[] = 
{ + { 92880, 92909 }, + { 92912, 92917 }, +}; static const URange16 Sundanese_range16[] = { { 7040, 7103 }, { 7360, 7367 }, @@ -4153,7 +4568,7 @@ static const URange16 Tagalog_range16[] = { { 5902, 5908 }, }; static const URange16 Malayalam_range16[] = { - { 3330, 3331 }, + { 3329, 3331 }, { 3333, 3340 }, { 3342, 3344 }, { 3346, 3386 }, @@ -4161,7 +4576,7 @@ static const URange16 Malayalam_range16[] = { { 3398, 3400 }, { 3402, 3406 }, { 3415, 3415 }, - { 3424, 3427 }, + { 3423, 3427 }, { 3430, 3445 }, { 3449, 3455 }, }; @@ -4186,12 +4601,20 @@ static const URange16 Meetei_Mayek_range16[] = { { 43968, 44013 }, { 44016, 44025 }, }; +static const URange32 Pahawh_Hmong_range32[] = { + { 92928, 92997 }, + { 93008, 93017 }, + { 93019, 93025 }, + { 93027, 93047 }, + { 93053, 93071 }, +}; static const URange16 Tai_Le_range16[] = { { 6480, 6509 }, { 6512, 6516 }, }; static const URange16 Kayah_Li_range16[] = { - { 43264, 43311 }, + { 43264, 43309 }, + { 43311, 43311 }, }; static const URange16 Buginese_range16[] = { { 6656, 6683 }, @@ -4215,13 +4638,16 @@ static const URange16 Tai_Tham_range16[] = { { 6816, 6829 }, }; static const URange32 Old_Italic_range32[] = { - { 66304, 66334 }, - { 66336, 66339 }, + { 66304, 66339 }, }; static const URange32 Old_Persian_range32[] = { { 66464, 66499 }, { 66504, 66517 }, }; +static const URange32 Warang_Citi_range32[] = { + { 71840, 71922 }, + { 71935, 71935 }, +}; static const URange16 Latin_range16[] = { { 65, 90 }, { 97, 122 }, @@ -4246,10 +4672,11 @@ static const URange16 Latin_range16[] = { { 8544, 8584 }, { 11360, 11391 }, { 42786, 42887 }, - { 42891, 42894 }, - { 42896, 42899 }, - { 42912, 42922 }, - { 43000, 43007 }, + { 42891, 42925 }, + { 42928, 42935 }, + { 42999, 43007 }, + { 43824, 43866 }, + { 43868, 43876 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, @@ -4271,6 +4698,30 @@ static const URange16 Georgian_range16[] = { { 11559, 11559 }, { 11565, 11565 }, }; +static const URange32 Grantha_range32[] = { + { 70400, 
70403 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70460, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70480, 70480 }, + { 70487, 70487 }, + { 70493, 70499 }, + { 70502, 70508 }, + { 70512, 70516 }, +}; +static const URange32 Duployan_range32[] = { + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, + { 113820, 113823 }, +}; static const URange16 Batak_range16[] = { { 7104, 7155 }, { 7164, 7167 }, @@ -4278,9 +4729,8 @@ static const URange16 Batak_range16[] = { static const URange16 Devanagari_range16[] = { { 2304, 2384 }, { 2387, 2403 }, - { 2406, 2423 }, - { 2425, 2431 }, - { 43232, 43259 }, + { 2406, 2431 }, + { 43232, 43261 }, }; static const URange16 Thai_range16[] = { { 3585, 3642 }, @@ -4307,10 +4757,14 @@ static const URange32 Ugaritic_range32[] = { static const URange16 Braille_range16[] = { { 10240, 10495 }, }; +static const URange32 Anatolian_Hieroglyphs_range32[] = { + { 82944, 83526 }, +}; static const URange16 Greek_range16[] = { { 880, 883 }, { 885, 887 }, { 890, 893 }, + { 895, 895 }, { 900, 900 }, { 902, 902 }, { 904, 906 }, @@ -4339,14 +4793,20 @@ static const URange16 Greek_range16[] = { { 8178, 8180 }, { 8182, 8190 }, { 8486, 8486 }, + { 43877, 43877 }, }; static const URange32 Greek_range32[] = { - { 65856, 65930 }, + { 65856, 65932 }, + { 65952, 65952 }, { 119296, 119365 }, }; static const URange32 Lycian_range32[] = { { 66176, 66204 }, }; +static const URange32 Mende_Kikakui_range32[] = { + { 124928, 125124 }, + { 125127, 125142 }, +}; static const URange16 Tai_Viet_range16[] = { { 43648, 43714 }, { 43739, 43743 }, @@ -4374,11 +4834,14 @@ static const URange16 Syriac_range16[] = { }; static const URange16 Runic_range16[] = { { 5792, 5866 }, - { 5870, 5872 }, + { 5870, 5880 }, }; static const URange32 Gothic_range32[] = { { 66352, 66378 }, }; +static const URange32 Mahajani_range32[] = { + { 69968, 70006 }, +}; static 
const URange16 Katakana_range16[] = { { 12449, 12538 }, { 12541, 12543 }, @@ -4405,14 +4868,19 @@ static const URange16 Ol_Chiki_range16[] = { { 7248, 7295 }, }; static const URange16 Limbu_range16[] = { - { 6400, 6428 }, + { 6400, 6430 }, { 6432, 6443 }, { 6448, 6459 }, { 6464, 6464 }, { 6468, 6479 }, }; +static const URange32 Pau_Cin_Hau_range32[] = { + { 72384, 72440 }, +}; static const URange16 Cherokee_range16[] = { - { 5024, 5108 }, + { 5024, 5109 }, + { 5112, 5117 }, + { 43888, 43967 }, }; static const URange32 Miao_range32[] = { { 93952, 94020 }, @@ -4436,8 +4904,8 @@ static const URange16 Oriya_range16[] = { { 2918, 2935 }, }; static const URange32 Sharada_range32[] = { - { 70016, 70088 }, - { 70096, 70105 }, + { 70016, 70093 }, + { 70096, 70111 }, }; static const URange16 Gujarati_range16[] = { { 2689, 2691 }, @@ -4453,11 +4921,20 @@ static const URange16 Gujarati_range16[] = { { 2768, 2768 }, { 2784, 2787 }, { 2790, 2801 }, + { 2809, 2809 }, +}; +static const URange32 Modi_range32[] = { + { 71168, 71236 }, + { 71248, 71257 }, }; static const URange32 Inscriptional_Pahlavi_range32[] = { { 68448, 68466 }, { 68472, 68479 }, }; +static const URange32 Manichaean_range32[] = { + { 68288, 68326 }, + { 68331, 68342 }, +}; static const URange16 Khmer_range16[] = { { 6016, 6109 }, { 6112, 6121 }, @@ -4465,14 +4942,24 @@ static const URange16 Khmer_range16[] = { { 6624, 6655 }, }; static const URange32 Cuneiform_range32[] = { - { 73728, 74606 }, - { 74752, 74850 }, - { 74864, 74867 }, + { 73728, 74649 }, + { 74752, 74862 }, + { 74864, 74868 }, + { 74880, 75075 }, +}; +static const URange32 Khudawadi_range32[] = { + { 70320, 70378 }, + { 70384, 70393 }, }; static const URange16 Mandaic_range16[] = { { 2112, 2139 }, { 2142, 2142 }, }; +static const URange32 Hatran_range32[] = { + { 67808, 67826 }, + { 67828, 67829 }, + { 67835, 67839 }, +}; static const URange16 Syloti_Nagri_range16[] = { { 43008, 43051 }, }; @@ -4490,8 +4977,12 @@ static const URange32 
Phoenician_range32[] = { { 67840, 67867 }, { 67871, 67871 }, }; +static const URange32 Nabataean_range32[] = { + { 67712, 67742 }, + { 67751, 67759 }, +}; static const URange16 Bengali_range16[] = { - { 2433, 2435 }, + { 2432, 2435 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -4544,6 +5035,9 @@ static const URange16 Javanese_range16[] = { { 43472, 43481 }, { 43486, 43487 }, }; +static const URange32 Old_Permic_range32[] = { + { 66384, 66426 }, +}; static const URange16 Phags_Pa_range16[] = { { 43072, 43127 }, }; @@ -4556,7 +5050,7 @@ static const URange32 Cypriot_range32[] = { { 67647, 67647 }, }; static const URange16 Kannada_range16[] = { - { 3202, 3203 }, + { 3201, 3203 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -4571,6 +5065,10 @@ static const URange16 Kannada_range16[] = { { 3302, 3311 }, { 3313, 3314 }, }; +static const URange32 Khojki_range32[] = { + { 70144, 70161 }, + { 70163, 70205 }, +}; static const URange16 Mongolian_range16[] = { { 6144, 6145 }, { 6148, 6148 }, @@ -4590,11 +5088,19 @@ static const URange16 Sinhala_range16[] = { { 3535, 3540 }, { 3542, 3542 }, { 3544, 3551 }, + { 3558, 3567 }, { 3570, 3572 }, }; +static const URange32 Sinhala_range32[] = { + { 70113, 70132 }, +}; static const URange32 Brahmi_range32[] = { { 69632, 69709 }, { 69714, 69743 }, + { 69759, 69759 }, +}; +static const URange32 Elbasan_range32[] = { + { 66816, 66855 }, }; static const URange32 Deseret_range32[] = { { 66560, 66639 }, @@ -4603,6 +5109,18 @@ static const URange16 Rejang_range16[] = { { 43312, 43347 }, { 43359, 43359 }, }; +static const URange32 SignWriting_range32[] = { + { 120832, 121483 }, + { 121499, 121503 }, + { 121505, 121519 }, +}; +static const URange32 Multani_range32[] = { + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70313 }, +}; static const URange16 Yi_range16[] = { { 40960, 42124 }, { 42128, 42182 }, @@ -4643,6 +5161,11 @@ static const URange32 Linear_B_range32[] = { { 65616, 65629 }, 
{ 65664, 65786 }, }; +static const URange32 Linear_A_range32[] = { + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, +}; static const URange32 Old_Turkic_range32[] = { { 68608, 68680 }, }; @@ -4658,6 +5181,15 @@ static const URange32 Lydian_range32[] = { static const URange32 Egyptian_Hieroglyphs_range32[] = { { 77824, 78894 }, }; +static const URange32 Caucasian_Albanian_range32[] = { + { 66864, 66915 }, + { 66927, 66927 }, +}; +static const URange32 Old_Hungarian_range32[] = { + { 68736, 68786 }, + { 68800, 68850 }, + { 68858, 68863 }, +}; static const URange16 Samaritan_range16[] = { { 2048, 2093 }, { 2096, 2110 }, @@ -4668,6 +5200,18 @@ static const URange16 Lisu_range16[] = { static const URange16 Buhid_range16[] = { { 5952, 5971 }, }; +static const URange32 Palmyrene_range32[] = { + { 67680, 67711 }, +}; +static const URange32 Tirhuta_range32[] = { + { 70784, 70855 }, + { 70864, 70873 }, +}; +static const URange32 Mro_range32[] = { + { 92736, 92766 }, + { 92768, 92777 }, + { 92782, 92783 }, +}; static const URange16 Common_range16[] = { { 0, 64 }, { 91, 96 }, @@ -4684,11 +5228,11 @@ static const URange16 Common_range16[] = { { 901, 901 }, { 903, 903 }, { 1417, 1417 }, + { 1541, 1541 }, { 1548, 1548 }, - { 1563, 1563 }, + { 1563, 1564 }, { 1567, 1567 }, { 1600, 1600 }, - { 1632, 1641 }, { 1757, 1757 }, { 2404, 2405 }, { 3647, 3647 }, @@ -4708,21 +5252,24 @@ static const URange16 Common_range16[] = { { 8294, 8304 }, { 8308, 8318 }, { 8320, 8334 }, - { 8352, 8378 }, + { 8352, 8382 }, { 8448, 8485 }, { 8487, 8489 }, { 8492, 8497 }, { 8499, 8525 }, { 8527, 8543 }, - { 8585, 8585 }, - { 8592, 9203 }, + { 8585, 8587 }, + { 8592, 9210 }, { 9216, 9254 }, { 9280, 9290 }, - { 9312, 9983 }, - { 9985, 10239 }, - { 10496, 11084 }, - { 11088, 11097 }, - { 11776, 11835 }, + { 9312, 10239 }, + { 10496, 11123 }, + { 11126, 11157 }, + { 11160, 11193 }, + { 11197, 11208 }, + { 11210, 11217 }, + { 11244, 11247 }, + { 11776, 11842 }, { 12272, 12283 }, { 12288, 12292 }, { 
12294, 12294 }, @@ -4741,9 +5288,10 @@ static const URange16 Common_range16[] = { { 42752, 42785 }, { 42888, 42890 }, { 43056, 43065 }, + { 43310, 43310 }, { 43471, 43471 }, + { 43867, 43867 }, { 64830, 64831 }, - { 65021, 65021 }, { 65040, 65049 }, { 65072, 65106 }, { 65108, 65126 }, @@ -4764,13 +5312,15 @@ static const URange32 Common_range32[] = { { 65847, 65855 }, { 65936, 65947 }, { 66000, 66044 }, + { 66273, 66299 }, + { 113824, 113827 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119142 }, { 119146, 119162 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119261 }, + { 119214, 119272 }, { 119552, 119638 }, { 119648, 119665 }, { 119808, 119892 }, @@ -4797,10 +5347,10 @@ static const URange32 Common_range32[] = { { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, - { 127153, 127166 }, + { 127153, 127167 }, { 127169, 127183 }, - { 127185, 127199 }, - { 127232, 127242 }, + { 127185, 127221 }, + { 127232, 127244 }, { 127248, 127278 }, { 127280, 127339 }, { 127344, 127386 }, @@ -4809,24 +5359,21 @@ static const URange32 Common_range32[] = { { 127504, 127546 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 127776 }, - { 127792, 127797 }, - { 127799, 127868 }, - { 127872, 127891 }, - { 127904, 127940 }, - { 127942, 127946 }, - { 127968, 127984 }, - { 128000, 128062 }, - { 128064, 128064 }, - { 128066, 128247 }, - { 128249, 128252 }, - { 128256, 128317 }, - { 128320, 128323 }, - { 128336, 128359 }, - { 128507, 128576 }, - { 128581, 128591 }, - { 128640, 128709 }, + { 127744, 128377 }, + { 128379, 128419 }, + { 128421, 128720 }, + { 128736, 128748 }, + { 128752, 128755 }, { 128768, 128883 }, + { 128896, 128980 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 }, + { 129120, 129159 }, + { 129168, 129197 }, + { 129296, 129304 }, + { 129408, 129412 }, + { 129472, 129472 }, { 917505, 917505 }, { 917536, 917631 }, }; @@ -4843,23 +5390,20 @@ static const URange16 Arabic_range16[] = { { 1536, 1540 }, { 1542, 1547 }, { 1549, 1562 }, 
- { 1564, 1564 }, { 1566, 1566 }, { 1568, 1599 }, { 1601, 1610 }, - { 1622, 1631 }, - { 1642, 1647 }, + { 1622, 1647 }, { 1649, 1756 }, { 1758, 1791 }, { 1872, 1919 }, - { 2208, 2208 }, - { 2210, 2220 }, - { 2276, 2302 }, + { 2208, 2228 }, + { 2275, 2303 }, { 64336, 64449 }, { 64467, 64829 }, { 64848, 64911 }, { 64914, 64967 }, - { 65008, 65020 }, + { 65008, 65021 }, { 65136, 65140 }, { 65142, 65276 }, }; @@ -4908,7 +5452,12 @@ static const URange32 Bamum_range32[] = { }; static const URange16 Myanmar_range16[] = { { 4096, 4255 }, - { 43616, 43643 }, + { 43488, 43518 }, + { 43616, 43647 }, +}; +static const URange32 Siddham_range32[] = { + { 71040, 71093 }, + { 71096, 71133 }, }; static const URange32 Avestan_range32[] = { { 68352, 68405 }, @@ -4925,127 +5474,159 @@ static const URange16 Hebrew_range16[] = { { 64323, 64324 }, { 64326, 64335 }, }; +static const URange32 Psalter_Pahlavi_range32[] = { + { 68480, 68497 }, + { 68505, 68508 }, + { 68521, 68527 }, +}; static const URange32 Takri_range32[] = { { 71296, 71351 }, { 71360, 71369 }, }; -// 3867 16-bit ranges, 723 32-bit ranges +// 3949 16-bit ranges, 1133 32-bit ranges const UGroup unicode_groups[] = { - { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, + { "Ahom", +1, 0, 0, Ahom_range32, 3 }, + { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, + { "Arabic", +1, Arabic_range16, 19, Arabic_range32, 35 }, { "Armenian", +1, Armenian_range16, 6, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, + { "Bassa_Vah", +1, 0, 0, Bassa_Vah_range32, 2 }, { "Batak", +1, Batak_range16, 2, 0, 0 }, { "Bengali", +1, Bengali_range16, 14, 0, 0 }, { "Bopomofo", +1, Bopomofo_range16, 3, 0, 0 }, - { "Brahmi", +1, 0, 0, Brahmi_range32, 2 }, + { "Brahmi", +1, 0, 0, Brahmi_range32, 3 }, { "Braille", +1, Braille_range16, 1, 0, 0 }, { "Buginese", +1, Buginese_range16, 2, 0, 0 }, { "Buhid", +1, 
Buhid_range16, 1, 0, 0 }, - { "C", +1, C_range16, 15, C_range32, 6 }, + { "C", +1, C_range16, 15, C_range32, 7 }, { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 }, { "Carian", +1, 0, 0, Carian_range32, 1 }, + { "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 }, { "Cc", +1, Cc_range16, 2, 0, 0 }, - { "Cf", +1, Cf_range16, 12, Cf_range32, 4 }, + { "Cf", +1, Cf_range16, 12, Cf_range32, 5 }, { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, - { "Cherokee", +1, Cherokee_range16, 1, 0, 0 }, + { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 88, Common_range32, 70 }, + { "Common", +1, Common_range16, 92, Common_range32, 69 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, - { "Cuneiform", +1, 0, 0, Cuneiform_range32, 3 }, + { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, { "Cyrillic", +1, Cyrillic_range16, 7, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, - { "Devanagari", +1, Devanagari_range16, 5, 0, 0 }, + { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, + { "Duployan", +1, 0, 0, Duployan_range32, 5 }, { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 }, + { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, { "Georgian", +1, Georgian_range16, 8, 0, 0 }, { "Glagolitic", +1, Glagolitic_range16, 2, 0, 0 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, - { "Greek", +1, Greek_range16, 31, Greek_range32, 2 }, - { "Gujarati", +1, Gujarati_range16, 13, 0, 0 }, + { "Grantha", +1, 0, 0, Grantha_range32, 15 }, + { "Greek", +1, Greek_range16, 33, Greek_range32, 3 }, + { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 4 }, + { "Han", +1, Han_range16, 11, Han_range32, 5 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, 
{ "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, + { "Hatran", +1, 0, 0, Hatran_range32, 3 }, { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 2 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, - { "Inherited", +1, Inherited_range16, 18, Inherited_range32, 6 }, + { "Inherited", +1, Inherited_range16, 20, Inherited_range32, 7 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 }, { "Javanese", +1, Javanese_range16, 3, 0, 0 }, { "Kaithi", +1, 0, 0, Kaithi_range32, 1 }, { "Kannada", +1, Kannada_range16, 14, 0, 0 }, { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 1 }, - { "Kayah_Li", +1, Kayah_Li_range16, 1, 0, 0 }, + { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, - { "L", +1, L_range16, 370, L_range32, 116 }, + { "Khojki", +1, 0, 0, Khojki_range32, 2 }, + { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, + { "L", +1, L_range16, 376, L_range32, 178 }, { "Lao", +1, Lao_range16, 18, 0, 0 }, - { "Latin", +1, Latin_range16, 30, 0, 0 }, + { "Latin", +1, Latin_range16, 31, 0, 0 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, + { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, 0, 0 }, - { "Ll", +1, Ll_range16, 582, Ll_range32, 29 }, - { "Lm", +1, Lm_range16, 51, Lm_range32, 1 }, - { "Lo", +1, Lo_range16, 286, Lo_range32, 85 }, + { "Ll", +1, Ll_range16, 599, Ll_range32, 31 }, + { "Lm", +1, Lm_range16, 54, Lm_range32, 2 }, + { "Lo", +1, Lo_range16, 290, Lo_range32, 143 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 576, Lu_range32, 32 }, + { "Lu", +1, Lu_range16, 591, Lu_range32, 34 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 
180, M_range32, 24 }, + { "M", +1, M_range16, 180, M_range32, 56 }, + { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, { "Malayalam", +1, Malayalam_range16, 11, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, - { "Mc", +1, Mc_range16, 111, Mc_range32, 15 }, - { "Me", +1, Me_range16, 4, 0, 0 }, + { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, + { "Mc", +1, Mc_range16, 109, Mc_range32, 38 }, + { "Me", +1, Me_range16, 5, 0, 0 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, - { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 2 }, + { "Mende_Kikakui", +1, 0, 0, Mende_Kikakui_range32, 2 }, + { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 194, Mn_range32, 27 }, + { "Mn", +1, Mn_range16, 200, Mn_range32, 66 }, + { "Modi", +1, 0, 0, Modi_range32, 2 }, { "Mongolian", +1, Mongolian_range16, 6, 0, 0 }, - { "Myanmar", +1, Myanmar_range16, 2, 0, 0 }, - { "N", +1, N_range16, 64, N_range32, 24 }, - { "Nd", +1, Nd_range16, 35, Nd_range32, 7 }, + { "Mro", +1, 0, 0, Mro_range32, 3 }, + { "Multani", +1, 0, 0, Multani_range32, 5 }, + { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, + { "N", +1, N_range16, 66, N_range32, 45 }, + { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 14 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, { "Nko", +1, Nko_range16, 1, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 28, No_range32, 14 }, + { "No", +1, No_range16, 28, No_range32, 30 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 }, - { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, + { "Old_Hungarian", +1, 0, 0, Old_Hungarian_range32, 3 }, + { "Old_Italic", +1, 0, 0, Old_Italic_range32, 1 }, + { "Old_North_Arabian", +1, 0, 0, Old_North_Arabian_range32, 1 }, + { "Old_Permic", +1, 0, 0, Old_Permic_range32, 1 }, { 
"Old_Persian", +1, 0, 0, Old_Persian_range32, 2 }, { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 }, { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 126, P_range32, 15 }, + { "P", +1, P_range16, 127, P_range32, 34 }, + { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, + { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, + { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, { "Pc", +1, Pc_range16, 6, 0, 0 }, - { "Pd", +1, Pd_range16, 16, 0, 0 }, + { "Pd", +1, Pd_range16, 17, 0, 0 }, { "Pe", +1, Pe_range16, 72, 0, 0 }, { "Pf", +1, Pf_range16, 10, 0, 0 }, { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 120, Po_range32, 15 }, - { "Ps", +1, Ps_range16, 74, 0, 0 }, + { "Po", +1, Po_range16, 123, Po_range32, 34 }, + { "Ps", +1, Ps_range16, 75, 0, 0 }, + { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 143, S_range32, 56 }, + { "S", +1, S_range16, 148, S_range32, 66 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 17, 0, 0 }, { "Sharada", +1, 0, 0, Sharada_range32, 2 }, { "Shavian", +1, 0, 0, Shavian_range32, 1 }, - { "Sinhala", +1, Sinhala_range16, 11, 0, 0 }, - { "Sk", +1, Sk_range16, 27, 0, 0 }, + { "Siddham", +1, 0, 0, Siddham_range32, 2 }, + { "SignWriting", +1, 0, 0, SignWriting_range32, 3 }, + { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, + { "Sk", +1, Sk_range16, 28, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 108, So_range32, 45 }, + { "So", +1, So_range16, 114, So_range32, 56 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Sundanese", +1, Sundanese_range16, 
2, 0, 0 }, { "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 }, @@ -5057,20 +5638,22 @@ const UGroup unicode_groups[] = { { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, 0, 0 }, - { "Telugu", +1, Telugu_range16, 14, 0, 0 }, + { "Telugu", +1, Telugu_range16, 13, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, { "Tibetan", +1, Tibetan_range16, 7, 0, 0 }, { "Tifinagh", +1, Tifinagh_range16, 3, 0, 0 }, + { "Tirhuta", +1, 0, 0, Tirhuta_range32, 2 }, { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, { "Vai", +1, Vai_range16, 1, 0, 0 }, + { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, { "Z", +1, Z_range16, 8, 0, 0 }, { "Zl", +1, Zl_range16, 1, 0, 0 }, { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 138; +const int num_unicode_groups = 167; } // namespace re2 diff --git a/re2_test.bzl b/re2_test.bzl index a52cd9f..8dafbd5 100644 --- a/re2_test.bzl +++ b/re2_test.bzl @@ -3,12 +3,13 @@ # license that can be found in the LICENSE file. # Define a bazel macro that creates cc_test for re2. 
-def re2_test(name, deps=[]): +def re2_test(name, deps=[], size="medium"): native.cc_test( name=name, srcs=["re2/testing/%s.cc" % (name)], deps=[ ":re2", ":test", - ] + deps + ] + deps, + size = size, ) diff --git a/util/atomicops.h b/util/atomicops.h index 6007b56..dc944e7 100644 --- a/util/atomicops.h +++ b/util/atomicops.h @@ -53,19 +53,19 @@ static inline void WriteMemoryBarrier() { #elif defined(__ppc__) || defined(__powerpc64__) static inline void WriteMemoryBarrier() { - __asm__ __volatile__("eieio" : : : "memory"); + __asm__ __volatile__("lwsync" : : : "memory"); } -#elif defined(__alpha__) +#elif defined(__aarch64__) static inline void WriteMemoryBarrier() { - __asm__ __volatile__("wmb" : : : "memory"); + __asm__ __volatile__("dmb st" : : : "memory"); } -#elif defined(__aarch64__) +#elif defined(__alpha__) static inline void WriteMemoryBarrier() { - __asm__ __volatile__("dmb st" : : : "memory"); + __asm__ __volatile__("wmb" : : : "memory"); } #elif defined(__arm__) && defined(__linux__) @@ -80,36 +80,28 @@ static inline void WriteMemoryBarrier() { #include <intrin.h> #include <windows.h> +static inline void WriteMemoryBarrier() { #if defined(_M_IX86) || defined(_M_X64) - -// x86 and x64 CPUs have a strong memory model that prohibits most types of -// reordering, so a non-instruction intrinsic to suppress compiler reordering is -// sufficient. _WriteBarrier is deprecated but is still appropriate for the -// "old compiler" path (pre C++11). -inline void WriteMemoryBarrier() { + // x86 and x64 CPUs have a strong memory model that prohibits most types of + // reordering, so a non-instruction intrinsic to suppress compiler reordering + // is sufficient. _WriteBarrier is deprecated, but is still appropriate for + // the "old compiler" path (pre C++11). 
_WriteBarrier(); -} - #else - -// Windows -inline void WriteMemoryBarrier() { LONG x; ::InterlockedExchange(&x, 0); -} - #endif +} #elif defined(OS_NACL) -// Native Client -inline void WriteMemoryBarrier() { +static inline void WriteMemoryBarrier() { __sync_synchronize(); } #elif defined(__mips__) -inline void WriteMemoryBarrier() { +static inline void WriteMemoryBarrier() { __asm__ __volatile__("sync" : : : "memory"); } @@ -148,7 +140,13 @@ static inline void MaybeReadMemoryBarrier() {} // Read barrier for various targets. -#if defined(__aarch64__) +#if defined(__ppc__) || defined(__powerpc64__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("lwsync" : : : "memory"); +} + +#elif defined(__aarch64__) static inline void ReadMemoryBarrier() { __asm__ __volatile__("dmb ld" : : : "memory"); @@ -162,7 +160,7 @@ static inline void ReadMemoryBarrier() { #elif defined(__mips__) -inline void ReadMemoryBarrier() { +static inline void ReadMemoryBarrier() { __asm__ __volatile__("sync" : : : "memory"); } diff --git a/util/benchmark.cc b/util/benchmark.cc index 03dbda4..b77e22d 100644 --- a/util/benchmark.cc +++ b/util/benchmark.cc @@ -124,9 +124,9 @@ void RunBench(Benchmark* b, int nthread, int siz) { while(ns < (int)1e9 && n < (int)1e9) { last = n; if(ns/n == 0) - n = 1e9; + n = (int)1e9; else - n = 1e9 / (ns/n); + n = (int)1e9 / static_cast<int>(ns/n); n = max(last+1, min(n+n/2, 100*last)); n = round(n); diff --git a/util/logging.h b/util/logging.h index 7812ecd..feac199 100644 --- a/util/logging.h +++ b/util/logging.h @@ -85,6 +85,11 @@ class LogMessage { DISALLOW_COPY_AND_ASSIGN(LogMessage); }; +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable: 4722) // destructor never returns +#endif + class LogMessageFatal : public LogMessage { public: LogMessageFatal(const char* file, int line) @@ -97,4 +102,8 @@ class LogMessageFatal : public LogMessage { DISALLOW_COPY_AND_ASSIGN(LogMessageFatal); }; +#ifdef _WIN32 +#pragma warning(pop) +#endif + #endif 
// RE2_UTIL_LOGGING_H__ diff --git a/util/mutex.h b/util/mutex.h index 9cb6de3..b479e48 100644 --- a/util/mutex.h +++ b/util/mutex.h @@ -55,7 +55,9 @@ namespace re2 { # include <pthread.h> typedef pthread_mutex_t MutexType; #elif defined(_WIN32) -# define WIN32_LEAN_AND_MEAN // We only need minimal includes +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN // We only need minimal includes +# endif # ifdef GMUTEX_TRYLOCK // We need Windows NT or later for TryEnterCriticalSection(). If you // don't need that functionality, you can remove these _WIN32_WINNT diff --git a/util/pcre.cc b/util/pcre.cc index b52236f..9a3f32d 100644 --- a/util/pcre.cc +++ b/util/pcre.cc @@ -384,10 +384,10 @@ int PCRE::GlobalReplace(string *str, int count = 0; int vec[kVecSize] = {}; string out; - size_t start = 0; + int start = 0; bool last_match_was_empty_string = false; - for (; start <= str->length();) { + while (start <= static_cast<int>(str->size())) { // If the previous match was for the empty string, we shouldn't // just match again: we'll match in the same way and get an // infinite loop. 
Instead, we do the match in a special way: @@ -403,18 +403,19 @@ int PCRE::GlobalReplace(string *str, matches = pattern.TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize); if (matches <= 0) { - if (start < str->length()) + if (start < static_cast<int>(str->size())) out.push_back((*str)[start]); start++; last_match_was_empty_string = false; continue; } } else { - matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); if (matches <= 0) break; } - size_t matchstart = vec[0], matchend = vec[1]; + int matchstart = vec[0], matchend = vec[1]; assert(matchstart >= start); assert(matchend >= matchstart); @@ -428,8 +429,8 @@ int PCRE::GlobalReplace(string *str, if (count == 0) return 0; - if (start < str->length()) - out.append(*str, start, str->length() - start); + if (start < static_cast<int>(str->size())) + out.append(*str, start, static_cast<int>(str->size()) - start); swap(out, *str); return count; } @@ -484,7 +485,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) { /***** Actual matching and rewriting code *****/ bool PCRE::HitLimit() { - return hit_limit_; + return hit_limit_ != 0; } void PCRE::ClearHitLimit() { @@ -632,9 +633,9 @@ bool PCRE::DoMatch(const StringPiece& text, const Arg* const args[], int n) const { assert(n >= 0); - size_t const vecsize = (1 + n) * 3; // results + PCRE workspace - // (as for kVecSize) - int *vec = new int[vecsize]; + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); delete[] vec; return b; @@ -840,7 +841,7 @@ bool PCRE::Arg::parse_short_radix(const char* str, if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<short*>(dest)) = r; + *(reinterpret_cast<short*>(dest)) = 
(short)r; return true; } @@ -852,7 +853,7 @@ bool PCRE::Arg::parse_ushort_radix(const char* str, if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse if ((ushort)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast<unsigned short*>(dest)) = r; + *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r; return true; } diff --git a/util/sparse_array.h b/util/sparse_array.h index 8bc243b..8f71fa0 100644 --- a/util/sparse_array.h +++ b/util/sparse_array.h @@ -220,18 +220,25 @@ class SparseArray { // and at the beginning and end of all public non-const member functions. inline void DebugCheckInvariants() const; + static bool InitMemory() { +#ifdef MEMORY_SANITIZER + return true; +#else + return RunningOnValgrind(); +#endif + } + int size_; int max_size_; int* sparse_to_dense_; vector<IndexValue> dense_; - bool valgrind_; DISALLOW_COPY_AND_ASSIGN(SparseArray); }; template<typename Value> SparseArray<Value>::SparseArray() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {} + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {} // IndexValue pairs: exposed in SparseArray::iterator. template<typename Value> @@ -272,16 +279,22 @@ void SparseArray<Value>::resize(int new_max_size) { int* a = new int[new_max_size]; if (sparse_to_dense_) { memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); - // Don't need to zero the memory but appease Valgrind. - if (valgrind_) { - for (int i = max_size_; i < new_max_size; i++) - a[i] = 0xababababU; - } delete[] sparse_to_dense_; } sparse_to_dense_ = a; dense_.resize(new_max_size); + + // These don't need to be initialized for correctness, + // but Valgrind will warn about use of uninitialized memory, + // so initialize the new memory when compiling debug binaries. + // Initialize it to garbage to detect bugs in the future. 
+ if (InitMemory()) { + for (int i = max_size_; i < new_max_size; i++) { + sparse_to_dense_[i] = 0xababababU; + dense_[i].index_ = 0xababababU; + } + } } max_size_ = new_max_size; if (size_ > max_size_) @@ -418,10 +431,9 @@ void SparseArray<Value>::create_index(int i) { template<typename Value> SparseArray<Value>::SparseArray(int max_size) { max_size_ = max_size; sparse_to_dense_ = new int[max_size]; - valgrind_ = RunningOnValgrind(); dense_.resize(max_size); // Don't need to zero the new memory, but appease Valgrind. - if (valgrind_) { + if (InitMemory()) { for (int i = 0; i < max_size; i++) { sparse_to_dense_[i] = 0xababababU; dense_[i].index_ = 0xababababU; diff --git a/util/sparse_set.h b/util/sparse_set.h index ff592a8..9dd41ee 100644 --- a/util/sparse_set.h +++ b/util/sparse_set.h @@ -51,28 +51,18 @@ namespace re2 { -static bool InitMemory() { -#ifdef MEMORY_SANITIZER - return true; -#else - return RunningOnValgrind(); -#endif -} - class SparseSet { public: SparseSet() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), - init_memory_(InitMemory()) {} + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {} SparseSet(int max_size) { max_size_ = max_size; sparse_to_dense_ = new int[max_size]; dense_ = new int[max_size]; - init_memory_ = InitMemory(); // Don't need to zero the memory, but do so anyway // to appease Valgrind. 
- if (init_memory_) { + if (InitMemory()) { for (int i = 0; i < max_size; i++) { dense_[i] = 0xababababU; sparse_to_dense_[i] = 0xababababU; @@ -104,7 +94,7 @@ class SparseSet { int* a = new int[new_max_size]; if (sparse_to_dense_) { memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); - if (init_memory_) { + if (InitMemory()) { for (int i = max_size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -115,7 +105,7 @@ class SparseSet { a = new int[new_max_size]; if (dense_) { memmove(a, dense_, size_*sizeof a[0]); - if (init_memory_) { + if (InitMemory()) { for (int i = size_; i < new_max_size; i++) a[i] = 0xababababU; } @@ -174,11 +164,18 @@ class SparseSet { static bool less(int a, int b) { return a < b; } private: + static bool InitMemory() { +#ifdef MEMORY_SANITIZER + return true; +#else + return RunningOnValgrind(); +#endif + } + int size_; int max_size_; int* sparse_to_dense_; int* dense_; - bool init_memory_; DISALLOW_COPY_AND_ASSIGN(SparseSet); }; diff --git a/util/strutil.cc b/util/strutil.cc index 19a4640..d3a0249 100644 --- a/util/strutil.cc +++ b/util/strutil.cc @@ -83,7 +83,7 @@ string PrefixSuccessor(const StringPiece& prefix) { // 255's, we just return the empty string. 
bool done = false; string limit(prefix.data(), prefix.size()); - int index = limit.length() - 1; + int index = static_cast<int>(limit.size()) - 1; while (!done && index >= 0) { if ((limit[index]&255) == 255) { limit.erase(index); diff --git a/util/test.cc b/util/test.cc index 85055b2..b0167e7 100644 --- a/util/test.cc +++ b/util/test.cc @@ -23,18 +23,6 @@ void RegisterTest(void (*fn)(void), const char *name) { tests[ntests++].name = name; } -namespace re2 { -int64 VirtualProcessSize() { -#ifdef _WIN32 - return 0; -#else - struct rusage ru; - getrusage(RUSAGE_SELF, &ru); - return (int64)ru.ru_maxrss*1024; -#endif -} -} // namespace re2 - int main(int argc, char **argv) { for (int i = 0; i < ntests; i++) { printf("%s\n", tests[i].name); diff --git a/util/test.h b/util/test.h index 45ca6fa..3701eab 100644 --- a/util/test.h +++ b/util/test.h @@ -31,20 +31,15 @@ class TestRegisterer { #define EXPECT_GE CHECK_GE #define EXPECT_FALSE(x) CHECK(!(x)) -const bool UsingMallocCounter = false; namespace testing { class MallocCounter { public: - MallocCounter(int x) { } + MallocCounter(int x) {} static const int THIS_THREAD_ONLY = 0; long long HeapGrowth() { return 0; } long long PeakHeapGrowth() { return 0; } - void Reset() { } + void Reset() {} }; } // namespace testing -namespace re2 { -int64 VirtualProcessSize(); -} // namespace re2 - #endif // RE2_UTIL_TEST_H__ diff --git a/util/util.h b/util/util.h index d4b072d..c59d91f 100644 --- a/util/util.h +++ b/util/util.h @@ -70,9 +70,6 @@ using std::unordered_set; #define strtoull _strtoui64 #define vsnprintf vsnprintf_s -#pragma warning(disable: 4018) // signed/unsigned mismatch -#pragma warning(disable: 4800) // conversion from int to bool - #endif namespace re2 { @@ -141,7 +138,7 @@ static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) { return ((uint64)x << 32) | y; } -int RunningOnValgrind(); +bool RunningOnValgrind(); } // namespace re2 diff --git a/util/valgrind.cc b/util/valgrind.cc index 
82f9a4c..19ec22e 100644 --- a/util/valgrind.cc +++ b/util/valgrind.cc @@ -9,17 +9,11 @@ namespace re2 { -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - -int RunningOnValgrind() { -#if __has_feature(memory_sanitizer) - return true; -#elif defined(RUNNING_ON_VALGRIND) - return RUNNING_ON_VALGRIND; +bool RunningOnValgrind() { +#ifdef RUNNING_ON_VALGRIND + return RUNNING_ON_VALGRIND != 0; #else - return 0; + return false; #endif } |