summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:59:01 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2016-11-21 16:59:02 +0900
commitc04192c9e5fef3601690a75a4d3dd197c79aaf5b (patch)
tree9c097513a295fe4a4f51e093c9ac71bc7df7185a
parentab01fdafee9f990730a48f220e11c15516c79498 (diff)
downloadre2-c04192c9e5fef3601690a75a4d3dd197c79aaf5b.tar.gz
re2-c04192c9e5fef3601690a75a4d3dd197c79aaf5b.tar.bz2
re2-c04192c9e5fef3601690a75a4d3dd197c79aaf5b.zip
Imported Upstream version 20160801upstream/20160801
Change-Id: I3f2d32dbe405230b6c3d4c4d4c3156eac24d419d Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r--BUILD1
-rw-r--r--Makefile1
-rw-r--r--re2/dfa.cc92
-rw-r--r--re2/filtered_re2.h6
-rw-r--r--re2/nfa.cc2
-rw-r--r--re2/onepass.cc58
-rw-r--r--re2/parse.cc6
-rw-r--r--re2/prefilter.h6
-rw-r--r--re2/prefilter_tree.h6
-rw-r--r--re2/prog.cc225
-rw-r--r--re2/prog.h49
-rw-r--r--re2/re2.cc66
-rw-r--r--re2/re2.h127
-rw-r--r--re2/regexp.h8
-rw-r--r--re2/set.h6
-rw-r--r--re2/stringpiece.h8
-rw-r--r--re2/testing/compile_test.cc122
-rw-r--r--re2/testing/exhaustive_tester.h6
-rw-r--r--re2/testing/re2_arg_test.cc12
-rw-r--r--re2/testing/re2_test.cc12
-rw-r--r--re2/testing/regexp_generator.h8
-rw-r--r--re2/testing/string_generator.h8
-rw-r--r--re2/testing/tester.h8
-rw-r--r--re2/unicode_casefold.h8
-rw-r--r--re2/unicode_groups.h8
-rw-r--r--re2/walker-inl.h10
-rwxr-xr-xruntests2
-rw-r--r--util/benchmark.cc4
-rw-r--r--util/benchmark.h8
-rw-r--r--util/bitmap.h92
-rw-r--r--util/flags.h8
-rw-r--r--util/logging.h8
-rw-r--r--util/mutex.h8
-rw-r--r--util/pcre.cc64
-rw-r--r--util/pcre.h60
-rw-r--r--util/random.h8
-rw-r--r--util/sparse_array.h8
-rw-r--r--util/sparse_set.h8
-rw-r--r--util/test.h6
-rw-r--r--util/thread.h6
-rw-r--r--util/utf.h7
-rw-r--r--util/util.h15
-rw-r--r--util/valgrind.h7
43 files changed, 726 insertions, 462 deletions
diff --git a/BUILD b/BUILD
index 3376943..5798eae 100644
--- a/BUILD
+++ b/BUILD
@@ -38,6 +38,7 @@ cc_library(
"re2/unicode_groups.cc",
"re2/unicode_groups.h",
"re2/walker-inl.h",
+ "util/bitmap.h",
"util/flags.h",
"util/hash.cc",
"util/logging.cc",
diff --git a/Makefile b/Makefile
index d498296..7ddf8b2 100644
--- a/Makefile
+++ b/Makefile
@@ -78,6 +78,7 @@ INSTALL_HFILES=\
HFILES=\
util/benchmark.h\
+ util/bitmap.h\
util/flags.h\
util/logging.h\
util/mutex.h\
diff --git a/re2/dfa.cc b/re2/dfa.cc
index e33bb01..fba7b64 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -114,25 +114,6 @@ class DFA {
kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left
};
-#ifndef STL_MSVC
- // STL function structures for use with unordered_set.
- struct StateEqual {
- bool operator()(const State* a, const State* b) const {
- if (a == b)
- return true;
- if (a == NULL || b == NULL)
- return false;
- if (a->ninst_ != b->ninst_)
- return false;
- if (a->flag_ != b->flag_)
- return false;
- for (int i = 0; i < a->ninst_; i++)
- if (a->inst_[i] != b->inst_[i])
- return false;
- return true; // they're equal
- }
- };
-#endif // STL_MSVC
struct StateHash {
size_t operator()(const State* a) const {
if (a == NULL)
@@ -140,39 +121,30 @@ class DFA {
const char* s = reinterpret_cast<const char*>(a->inst_);
int len = a->ninst_ * sizeof a->inst_[0];
if (sizeof(size_t) == sizeof(uint32))
- return Hash32StringWithSeed(s, len, a->flag_);
+ return static_cast<size_t>(Hash32StringWithSeed(s, len, a->flag_));
else
return static_cast<size_t>(Hash64StringWithSeed(s, len, a->flag_));
}
-#ifdef STL_MSVC
- // Less than operator.
+ };
+
+ struct StateEqual {
bool operator()(const State* a, const State* b) const {
if (a == b)
- return false;
+ return true;
if (a == NULL || b == NULL)
- return a == NULL;
+ return false;
if (a->ninst_ != b->ninst_)
- return a->ninst_ < b->ninst_;
+ return false;
if (a->flag_ != b->flag_)
- return a->flag_ < b->flag_;
- for (int i = 0; i < a->ninst_; ++i)
+ return false;
+ for (int i = 0; i < a->ninst_; i++)
if (a->inst_[i] != b->inst_[i])
- return a->inst_[i] < b->inst_[i];
- return false; // they're equal
+ return false;
+ return true; // they're equal
}
- // The two public members are required by msvc. 4 and 8 are default values.
- // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx
- static const size_t bucket_size = 4;
- static const size_t min_buckets = 8;
-#endif // STL_MSVC
};
-#ifdef STL_MSVC
- typedef unordered_set<State*, StateHash> StateSet;
-#else // !STL_MSVC
typedef unordered_set<State*, StateHash, StateEqual> StateSet;
-#endif // STL_MSVC
-
private:
// Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.)
@@ -645,7 +617,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) {
fprintf(stderr, " -> FullMatchState\n");
return FullMatchState;
}
- // Fall through.
+ FALLTHROUGH_INTENDED;
default:
// Record iff id is the head of its list, which must
// be the case if id-1 is the last of *its* list. :)
@@ -726,7 +698,12 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) {
mutex_.AssertHeld();
// Look in the cache for a pre-existing state.
- State state = {inst, ninst, flag};
+ // We have to initialise the struct like this because otherwise
+ // MSVC will complain about the flexible array member. :(
+ State state;
+ state.inst_ = inst;
+ state.ninst_ = ninst;
+ state.flag_ = flag;
StateSet::iterator it = state_cache_.find(&state);
if (it != state_cache_.end()) {
if (DebugDFA)
@@ -770,16 +747,15 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) {
// Clear the cache. Must hold cache_mutex_.w or be in destructor.
void DFA::ClearCache() {
- // In case state_cache_ doesn't support deleting entries
- // during iteration, copy into a vector and then delete.
- vector<State*> v;
- v.reserve(state_cache_.size());
- for (StateSet::iterator it = state_cache_.begin();
- it != state_cache_.end(); ++it)
- v.push_back(*it);
+ StateSet::iterator begin = state_cache_.begin();
+ StateSet::iterator end = state_cache_.end();
+ while (begin != end) {
+ StateSet::iterator tmp = begin;
+ ++begin;
+ // Deallocate the blob of memory that we allocated in DFA::CachedState().
+ delete[] reinterpret_cast<const char*>(*tmp);
+ }
state_cache_.clear();
- for (size_t i = 0; i < v.size(); i++)
- delete[] reinterpret_cast<const char*>(v[i]);
}
// Copies insts in state s to the work queue q.
@@ -1716,7 +1692,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
State* s = RunStateOnByte(info->start, i);
if (s == NULL) {
// Synchronize with "quick check" above.
- info->firstbyte.store(firstbyte, std::memory_order_release);
+ info->firstbyte.store(kFbUnknown, std::memory_order_release);
return false;
}
if (s == info->start)
@@ -1809,15 +1785,19 @@ DFA* Prog::GetDFA(MatchKind kind) {
return dfa;
// For a forward DFA, half the memory goes to each DFA.
+ // However, if it is a "many match" DFA, then there is
+ // no counterpart with which the memory must be shared.
+ //
// For a reverse DFA, all the memory goes to the
// "longest match" DFA, because RE2 never does reverse
// "first match" searches.
- int64 m = dfa_mem_/2;
+ int64 m = dfa_mem_;
if (reversed_) {
- if (kind == kLongestMatch || kind == kManyMatch)
- m = dfa_mem_;
- else
- m = 0;
+ DCHECK_EQ(kind, kLongestMatch);
+ } else if (kind == kFirstMatch || kind == kLongestMatch) {
+ m /= 2;
+ } else {
+ DCHECK_EQ(kind, kManyMatch);
}
dfa = new DFA(this, kind, m);
diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h
index f4b2be4..1035a12 100644
--- a/re2/filtered_re2.h
+++ b/re2/filtered_re2.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_FILTERED_RE2_H_
+#define RE2_FILTERED_RE2_H_
+
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
@@ -18,9 +21,6 @@
// indices of strings that were found in the text to get the actual
// regexp matches.
-#ifndef RE2_FILTERED_RE2_H_
-#define RE2_FILTERED_RE2_H_
-
#include <vector>
#include "re2/re2.h"
diff --git a/re2/nfa.cc b/re2/nfa.cc
index f084e60..1de5aa7 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -288,7 +288,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
case kInstByteRange:
if (!ip->Matches(c))
goto Next;
- // Fallthrough intended.
+ FALLTHROUGH_INTENDED;
case kInstMatch:
// Save state; will pick up at next byte.
diff --git a/re2/onepass.cc b/re2/onepass.cc
index a6e3321..da90a86 100644
--- a/re2/onepass.cc
+++ b/re2/onepass.cc
@@ -191,13 +191,10 @@ static void ApplyCaptures(uint32 cond, const char* p,
cap[i] = p;
}
-// Compute a node pointer.
-// Basically (OneState*)(nodes + statesize*nodeindex)
-// but the version with the C++ casts overflows 80 characters (and is ugly).
-static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
+// Computes the OneState* for the given nodeindex.
+static inline OneState* IndexToNode(uint8* nodes, int statesize,
int nodeindex) {
- return reinterpret_cast<OneState*>(
- const_cast<uint8*>(nodes + statesize*nodeindex));
+ return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
}
bool Prog::SearchOnePass(const StringPiece& text,
@@ -233,13 +230,10 @@ bool Prog::SearchOnePass(const StringPiece& text,
if (anchor_end())
kind = kFullMatch;
- // State and act are marked volatile to
- // keep the compiler from re-ordering the
- // memory accesses walking over the NFA.
- // This is worth about 5%.
- volatile OneState* state = onepass_start_;
- volatile uint8* nodes = onepass_nodes_;
- volatile uint32 statesize = onepass_statesize_;
+ uint8* nodes = onepass_nodes_;
+ int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32);
+ // start() is always mapped to the zeroth OneState.
+ OneState* state = IndexToNode(nodes, statesize, 0);
uint8* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
@@ -374,7 +368,7 @@ struct InstCond {
// Constructs and saves corresponding one-pass NFA on success.
bool Prog::IsOnePass() {
if (did_onepass_)
- return onepass_start_ != NULL;
+ return onepass_nodes_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
@@ -385,7 +379,7 @@ bool Prog::IsOnePass() {
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + inst_count(kInstByteRange);
- int statesize = sizeof(OneState) + bytemap_range_*sizeof(uint32);
+ int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
@@ -401,18 +395,20 @@ bool Prog::IsOnePass() {
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
- uint8* nodes = new uint8[maxnodes*statesize];
- uint8* nodep = nodes;
+ // Originally, nodes was a uint8[maxnodes*statesize], but that was
+ // unnecessarily optimistic: why allocate a large amount of memory
+ // upfront for a large program when it is unlikely to be one-pass?
+ vector<uint8> nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
- nodep += statesize;
int nalloc = 1;
+ nodes.insert(nodes.end(), statesize, 0);
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
- OneState* node = IndexToNode(nodes, statesize, nodeindex);
+ OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
@@ -452,14 +448,16 @@ bool Prog::IsOnePass() {
if (nalloc >= maxnodes) {
if (Debug)
LOG(ERROR) << StringPrintf(
- "Not OnePass: hit node limit %d > %d", nalloc, maxnodes);
+ "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
- nodep += statesize;
- nodebyid[ip->out()] = nextindex;
- nalloc++;
AddQ(&tovisit, ip->out());
+ nodebyid[ip->out()] = nalloc;
+ nalloc++;
+ nodes.insert(nodes.end(), statesize, 0);
+ // Update node because it might have been invalidated.
+ node = IndexToNode(nodes.data(), statesize, nodeindex);
}
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
@@ -587,7 +585,7 @@ bool Prog::IsOnePass() {
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
- OneState* node = IndexToNode(nodes, statesize, nodeindex);
+ OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
@@ -602,16 +600,9 @@ bool Prog::IsOnePass() {
LOG(ERROR) << "nodes:\n" << dump;
}
- // Overallocated earlier; cut down to actual size.
- nodep = new uint8[nalloc*statesize];
- memmove(nodep, nodes, nalloc*statesize);
- delete[] nodes;
- nodes = nodep;
-
- onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
- onepass_nodes_ = nodes;
- onepass_statesize_ = statesize;
dfa_mem_ -= nalloc*statesize;
+ onepass_nodes_ = new uint8[nalloc*statesize];
+ memmove(onepass_nodes_, nodes.data(), nalloc*statesize);
delete[] stack;
delete[] nodebyid;
@@ -620,7 +611,6 @@ bool Prog::IsOnePass() {
fail:
delete[] stack;
delete[] nodebyid;
- delete[] nodes;
return false;
}
diff --git a/re2/parse.cc b/re2/parse.cc
index f55dc8c..9cd9cc1 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -284,7 +284,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) {
case EvenOddSkip: // even <-> odd but only applies to every other
if ((r - f->lo) % 2)
return r;
- // fall through
+ FALLTHROUGH_INTENDED;
case EvenOdd: // even <-> odd
if (r%2 == 0)
return r + 1;
@@ -293,7 +293,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) {
case OddEvenSkip: // odd <-> even but only applies to every other
if ((r - f->lo) % 2)
return r;
- // fall through
+ FALLTHROUGH_INTENDED;
case OddEven: // odd <-> even
if (r%2 == 1)
return r + 1;
@@ -1354,7 +1354,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// Single non-zero octal digit is a backreference; not supported.
if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7')
goto BadEscape;
- // fall through
+ FALLTHROUGH_INTENDED;
case '0':
// consume up to three octal digits; already have one.
code = c - '0';
diff --git a/re2/prefilter.h b/re2/prefilter.h
index 531c189..e58efe8 100644
--- a/re2/prefilter.h
+++ b/re2/prefilter.h
@@ -2,13 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_PREFILTER_H_
+#define RE2_PREFILTER_H_
+
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
-#ifndef RE2_PREFILTER_H_
-#define RE2_PREFILTER_H_
-
#include "util/util.h"
namespace re2 {
diff --git a/re2/prefilter_tree.h b/re2/prefilter_tree.h
index abea55d..a8ec589 100644
--- a/re2/prefilter_tree.h
+++ b/re2/prefilter_tree.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_PREFILTER_TREE_H_
+#define RE2_PREFILTER_TREE_H_
+
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added tp PrefilterTree, and then PrefilterTree is used to find all
@@ -12,9 +15,6 @@
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
-//
-#ifndef RE2_PREFILTER_TREE_H_
-#define RE2_PREFILTER_TREE_H_
#include "util/util.h"
#include "util/sparse_array.h"
diff --git a/re2/prog.cc b/re2/prog.cc
index 9098dd4..5d8dd6c 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -6,6 +6,7 @@
// Tested by compile_test.cc
#include "util/util.h"
+#include "util/bitmap.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
@@ -101,13 +102,12 @@ Prog::Prog()
bytemap_range_(0),
first_byte_(-1),
flags_(0),
- onepass_statesize_(0),
+ list_count_(0),
inst_(NULL),
+ onepass_nodes_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
- dfa_mem_(0),
- onepass_nodes_(NULL),
- onepass_start_(NULL) {
+ dfa_mem_(0) {
}
Prog::~Prog() {
@@ -313,70 +313,197 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
return flags;
}
+// ByteMapBuilder implements a coloring algorithm.
+//
+// The first phase is a series of "mark and merge" batches: we mark one or more
+// [lo-hi] ranges, then merge them into our internal state. Batching is not for
+// performance; rather, it means that the ranges are treated indistinguishably.
+//
+// Internally, the ranges are represented using a bitmap that stores the splits
+// and a vector that stores the colors; both of them are indexed by the ranges'
+// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
+// hi (if not already split), then recolor each range in between. The color map
+// (i.e. from the old color to the new color) is maintained for the lifetime of
+// the batch and so underpins this somewhat obscure approach to set operations.
+//
+// The second phase builds the bytemap from our internal state: we recolor each
+// range, then store the new color (which is now the byte class) in each of the
+// corresponding array elements. Finally, we output the number of byte classes.
+class ByteMapBuilder {
+ public:
+ ByteMapBuilder() {
+ // Initial state: the [0-255] range has color 256.
+ // This will avoid problems during the second phase,
+ // in which we assign byte classes numbered from 0.
+ splits_.Set(255);
+ colors_.resize(256);
+ colors_[255] = 256;
+ nextcolor_ = 257;
+ }
+
+ void Mark(int lo, int hi);
+ void Merge();
+ void Build(uint8* bytemap, int* bytemap_range);
+
+ private:
+ int Recolor(int oldcolor);
+
+ Bitmap256 splits_;
+ vector<int> colors_;
+ int nextcolor_;
+ vector<pair<int, int>> colormap_;
+ vector<pair<int, int>> ranges_;
+
+ DISALLOW_COPY_AND_ASSIGN(ByteMapBuilder);
+};
+
+void ByteMapBuilder::Mark(int lo, int hi) {
+ DCHECK_GE(lo, 0);
+ DCHECK_GE(hi, 0);
+ DCHECK_LE(lo, 255);
+ DCHECK_LE(hi, 255);
+ DCHECK_LE(lo, hi);
+
+ // Ignore any [0-255] ranges. They cause us to recolor every range, which
+ // has no effect on the eventual result and is therefore a waste of time.
+ if (lo == 0 && hi == 255)
+ return;
+
+ ranges_.emplace_back(lo, hi);
+}
+
+void ByteMapBuilder::Merge() {
+ for (vector<pair<int, int>>::const_iterator it = ranges_.begin();
+ it != ranges_.end();
+ ++it) {
+ int lo = it->first-1;
+ int hi = it->second;
+
+ if (0 <= lo && !splits_.Test(lo)) {
+ splits_.Set(lo);
+ int next = splits_.FindNextSetBit(lo+1);
+ colors_[lo] = colors_[next];
+ }
+ if (!splits_.Test(hi)) {
+ splits_.Set(hi);
+ int next = splits_.FindNextSetBit(hi+1);
+ colors_[hi] = colors_[next];
+ }
+
+ int c = lo+1;
+ while (c < 256) {
+ int next = splits_.FindNextSetBit(c);
+ colors_[next] = Recolor(colors_[next]);
+ if (next == hi)
+ break;
+ c = next+1;
+ }
+ }
+ colormap_.clear();
+ ranges_.clear();
+}
+
+void ByteMapBuilder::Build(uint8* bytemap, int* bytemap_range) {
+ // Assign byte classes numbered from 0.
+ nextcolor_ = 0;
+
+ int c = 0;
+ while (c < 256) {
+ int next = splits_.FindNextSetBit(c);
+ uint8 b = static_cast<uint8>(Recolor(colors_[next]));
+ while (c <= next) {
+ bytemap[c] = b;
+ c++;
+ }
+ }
+
+ *bytemap_range = nextcolor_;
+}
+
+int ByteMapBuilder::Recolor(int oldcolor) {
+ // Yes, this is a linear search. There can be at most 256
+ // colors and there will typically be far fewer than that.
+ // Also, we need to consider keys *and* values in order to
+ // avoid recoloring a given range more than once per batch.
+ vector<pair<int, int>>::const_iterator it =
+ std::find_if(colormap_.begin(), colormap_.end(),
+ [&](const pair<int, int>& kv) -> bool {
+ return kv.first == oldcolor || kv.second == oldcolor;
+ });
+ if (it != colormap_.end())
+ return it->second;
+ int newcolor = nextcolor_;
+ nextcolor_++;
+ colormap_.emplace_back(oldcolor, newcolor);
+ return newcolor;
+}
+
void Prog::ComputeByteMap() {
- // Fill in byte map with byte classes for the program.
+ // Fill in bytemap with byte classes for the program.
// Ranges of bytes that are treated indistinguishably
- // are mapped to a single byte class.
- Bitmap<256> v;
+ // will be mapped to a single byte class.
+ ByteMapBuilder builder;
+ // Don't repeat the work for ^ and $.
+ bool marked_line_boundaries = false;
// Don't repeat the work for \b and \B.
- bool done_word_boundaries = false;
+ bool marked_word_boundaries = false;
for (int id = 0; id < static_cast<int>(size()); id++) {
Inst* ip = inst(id);
if (ip->opcode() == kInstByteRange) {
int lo = ip->lo();
int hi = ip->hi();
- if (0 < lo)
- v.Set(lo - 1);
- v.Set(hi);
+ builder.Mark(lo, hi);
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
- if (lo < 'a')
- lo = 'a';
- if (hi > 'z')
- hi = 'z';
- if (lo <= hi) {
- v.Set(lo + 'A' - 'a' - 1);
- v.Set(hi + 'A' - 'a');
- }
+ int foldlo = lo;
+ int foldhi = hi;
+ if (foldlo < 'a')
+ foldlo = 'a';
+ if (foldhi > 'z')
+ foldhi = 'z';
+ if (foldlo <= foldhi)
+ builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
}
+ // If this Inst is not the last Inst in its list AND the next Inst is
+ // also a ByteRange AND the Insts have the same out, defer the merge.
+ if (!ip->last() &&
+ inst(id+1)->opcode() == kInstByteRange &&
+ ip->out() == inst(id+1)->out())
+ continue;
+ builder.Merge();
} else if (ip->opcode() == kInstEmptyWidth) {
- if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine)) {
- v.Set('\n' - 1);
- v.Set('\n');
+ if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
+ !marked_line_boundaries) {
+ builder.Mark('\n', '\n');
+ builder.Merge();
+ marked_line_boundaries = true;
}
- if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
- if (done_word_boundaries)
- continue;
- int j;
- for (int i = 0; i < 256; i = j) {
- for (j = i + 1; j < 256 &&
- Prog::IsWordChar(static_cast<uint8>(i)) ==
- Prog::IsWordChar(static_cast<uint8>(j));
- j++)
- ;
- if (0 < i)
- v.Set(i - 1);
- v.Set(j - 1);
+ if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
+ !marked_word_boundaries) {
+ // We require two batches here: the first for ranges that are word
+ // characters, the second for ranges that are not word characters.
+ for (bool isword : {true, false}) {
+ int j;
+ for (int i = 0; i < 256; i = j) {
+ for (j = i + 1; j < 256 &&
+ Prog::IsWordChar(static_cast<uint8>(i)) ==
+ Prog::IsWordChar(static_cast<uint8>(j));
+ j++)
+ ;
+ if (Prog::IsWordChar(static_cast<uint8>(i)) == isword)
+ builder.Mark(i, j - 1);
+ }
+ builder.Merge();
}
- done_word_boundaries = true;
+ marked_word_boundaries = true;
}
}
}
- COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
- uint8 n = 0;
- uint32 bits = 0;
- for (int i = 0; i < 256; i++) {
- if ((i & 31) == 0)
- bits = v.Word(i >> 5);
- bytemap_[i] = n;
- n += bits & 1;
- bits >>= 1;
- }
- bytemap_range_ = bytemap_[255] + 1;
+ builder.Build(bytemap_, &bytemap_range_);
- if (0) { // For debugging: use trivial byte map.
+ if (0) { // For debugging: use trivial bytemap.
for (int i = 0; i < 256; i++)
bytemap_[i] = static_cast<uint8>(i);
bytemap_range_ = 256;
@@ -536,7 +663,7 @@ void Prog::EmitList(int root, SparseArray<int>* rootmap, vector<Inst>* flat,
flat->back().set_opcode(kInstAltMatch);
flat->back().set_out(static_cast<int>(flat->size()));
flat->back().out1_ = static_cast<uint32>(flat->size())+1;
- // Fall through.
+ FALLTHROUGH_INTENDED;
case kInstAlt:
stk->push_back(ip->out1());
diff --git a/re2/prog.h b/re2/prog.h
index d5c8616..e1c7249 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -2,13 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_PROG_H_
+#define RE2_PROG_H_
+
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
-#ifndef RE2_PROG_H__
-#define RE2_PROG_H__
-
#include "util/util.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
@@ -16,38 +16,6 @@
namespace re2 {
-// Simple fixed-size bitmap.
-template<int Bits>
-class Bitmap {
- public:
- Bitmap() { Reset(); }
- int Size() { return Bits; }
-
- void Reset() {
- for (int i = 0; i < Words; i++)
- w_[i] = 0;
- }
- bool Get(int k) const {
- return w_[k >> WordLog] & (1<<(k & 31));
- }
- void Set(int k) {
- w_[k >> WordLog] |= 1<<(k & 31);
- }
- void Clear(int k) {
- w_[k >> WordLog] &= ~(1<<(k & 31));
- }
- uint32 Word(int i) const {
- return w_[i];
- }
-
- private:
- static const int WordLog = 5;
- static const int Words = (Bits+31)/32;
- uint32 w_[Words];
- DISALLOW_COPY_AND_ASSIGN(Bitmap);
-};
-
-
// Opcodes for Inst
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
@@ -291,7 +259,7 @@ class Prog {
// for testing purposes. Returns number of states.
int BuildEntireDFA(MatchKind kind);
- // Compute byte map.
+ // Compute bytemap.
void ComputeByteMap();
// Computes whether all matches must begin with the same first
@@ -385,22 +353,19 @@ class Prog {
int bytemap_range_; // bytemap_[x] < bytemap_range_
int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags
- int onepass_statesize_; // byte size of each OneState* node
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
Inst* inst_; // pointer to instruction array
+ uint8* onepass_nodes_; // data for OnePass nodes
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
std::atomic<DFA*> dfa_first_; // DFA cached for kFirstMatch
std::atomic<DFA*> dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
- uint8 bytemap_[256]; // map from input bytes to byte classes
-
- uint8* onepass_nodes_; // data for OnePass nodes
- OneState* onepass_start_; // start node for OnePass program
+ uint8 bytemap_[256]; // map from input bytes to byte classes
std::once_flag first_byte_once_;
@@ -409,4 +374,4 @@ class Prog {
} // namespace re2
-#endif // RE2_PROG_H__
+#endif // RE2_PROG_H_
diff --git a/re2/re2.cc b/re2/re2.cc
index 6bc5edd..49388c7 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -962,6 +962,13 @@ bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
return true;
}
+bool RE2::Arg::parse_schar(const char* str, int n, void* dest) {
+ if (n != 1) return false;
+ if (dest == NULL) return true;
+ *(reinterpret_cast<signed char*>(dest)) = str[0];
+ return true;
+}
+
bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
@@ -1074,8 +1081,8 @@ bool RE2::Arg::parse_short_radix(const char* str,
void* dest,
int radix) {
long r;
- if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
- if ((short)r != r) return false; // Out of range
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((short)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<short*>(dest)) = (short)r;
return true;
@@ -1086,10 +1093,10 @@ bool RE2::Arg::parse_ushort_radix(const char* str,
void* dest,
int radix) {
unsigned long r;
- if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
- if ((ushort)r != r) return false; // Out of range
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((unsigned short)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r;
+ *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
return true;
}
@@ -1098,10 +1105,10 @@ bool RE2::Arg::parse_int_radix(const char* str,
void* dest,
int radix) {
long r;
- if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
- if ((int)r != r) return false; // Out of range
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((int)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<int*>(dest)) = r;
+ *(reinterpret_cast<int*>(dest)) = (int)r;
return true;
}
@@ -1110,14 +1117,13 @@ bool RE2::Arg::parse_uint_radix(const char* str,
void* dest,
int radix) {
unsigned long r;
- if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
- if ((uint)r != r) return false; // Out of range
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((unsigned int)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<unsigned int*>(dest)) = r;
+ *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
return true;
}
-#if RE2_HAVE_LONGLONG
bool RE2::Arg::parse_longlong_radix(const char* str,
int n,
void* dest,
@@ -1156,7 +1162,6 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str,
*(reinterpret_cast<uint64*>(dest)) = r;
return true;
}
-#endif
static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
if (n == 0) return false;
@@ -1190,30 +1195,29 @@ bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
return parse_double_float(str, n, true, dest);
}
-
-#define DEFINE_INTEGER_PARSERS(name) \
+#define DEFINE_INTEGER_PARSER(name) \
bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 10); \
- } \
+ return parse_##name##_radix(str, n, dest, 10); \
+ } \
bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 16); \
- } \
+ return parse_##name##_radix(str, n, dest, 16); \
+ } \
bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 8); \
- } \
+ return parse_##name##_radix(str, n, dest, 8); \
+ } \
bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 0); \
+ return parse_##name##_radix(str, n, dest, 0); \
}
-DEFINE_INTEGER_PARSERS(short);
-DEFINE_INTEGER_PARSERS(ushort);
-DEFINE_INTEGER_PARSERS(int);
-DEFINE_INTEGER_PARSERS(uint);
-DEFINE_INTEGER_PARSERS(long);
-DEFINE_INTEGER_PARSERS(ulong);
-DEFINE_INTEGER_PARSERS(longlong);
-DEFINE_INTEGER_PARSERS(ulonglong);
+DEFINE_INTEGER_PARSER(short);
+DEFINE_INTEGER_PARSER(ushort);
+DEFINE_INTEGER_PARSER(int);
+DEFINE_INTEGER_PARSER(uint);
+DEFINE_INTEGER_PARSER(long);
+DEFINE_INTEGER_PARSER(ulong);
+DEFINE_INTEGER_PARSER(longlong);
+DEFINE_INTEGER_PARSER(ulonglong);
-#undef DEFINE_INTEGER_PARSERS
+#undef DEFINE_INTEGER_PARSER
} // namespace re2
diff --git a/re2/re2.h b/re2/re2.h
index d9139b2..cc35736 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_RE2_H
-#define RE2_RE2_H
+#ifndef RE2_RE2_H_
+#define RE2_RE2_H_
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
@@ -185,10 +185,6 @@
#include <string>
#include "re2/stringpiece.h"
-#ifndef RE2_HAVE_LONGLONG
-#define RE2_HAVE_LONGLONG 1
-#endif
-
namespace re2 {
using std::string;
@@ -686,10 +682,8 @@ class RE2 {
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
- #if RE2_HAVE_LONGLONG
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
- #endif
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
@@ -697,10 +691,8 @@ class RE2 {
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
- #if RE2_HAVE_LONGLONG
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
- #endif
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
@@ -708,10 +700,8 @@ class RE2 {
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
- #if RE2_HAVE_LONGLONG
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
- #endif
private:
void Init(const StringPiece& pattern, const Options& options);
@@ -783,28 +773,26 @@ class RE2::Arg {
typedef bool (*Parser)(const char* str, int n, void* dest);
// Type-specific parsers
-#define MAKE_PARSER(type,name) \
- Arg(type* p) : arg_(p), parser_(name) { } \
- Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
-
+#define MAKE_PARSER(type, name) \
+ Arg(type* p) : arg_(p), parser_(name) {} \
+ Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char);
- MAKE_PARSER(signed char, parse_char);
+ MAKE_PARSER(signed char, parse_schar);
MAKE_PARSER(unsigned char, parse_uchar);
+ MAKE_PARSER(float, parse_float);
+ MAKE_PARSER(double, parse_double);
+ MAKE_PARSER(string, parse_string);
+ MAKE_PARSER(StringPiece, parse_stringpiece);
+
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
- #if RE2_HAVE_LONGLONG
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
- #endif
- MAKE_PARSER(float, parse_float);
- MAKE_PARSER(double, parse_double);
- MAKE_PARSER(string, parse_string);
- MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
@@ -823,21 +811,23 @@ class RE2::Arg {
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
+ static bool parse_schar (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
-#define DECLARE_INTEGER_PARSER(name) \
- private: \
- static bool parse_ ## name(const char* str, int n, void* dest); \
- static bool parse_ ## name ## _radix( \
- const char* str, int n, void* dest, int radix); \
- public: \
- static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
- static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
- static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
+#define DECLARE_INTEGER_PARSER(name) \
+ private: \
+ static bool parse_##name(const char* str, int n, void* dest); \
+ static bool parse_##name##_radix(const char* str, int n, void* dest, \
+ int radix); \
+ \
+ public: \
+ static bool parse_##name##_hex(const char* str, int n, void* dest); \
+ static bool parse_##name##_octal(const char* str, int n, void* dest); \
+ static bool parse_##name##_cradix(const char* str, int n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
@@ -845,12 +835,11 @@ class RE2::Arg {
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
- #if RE2_HAVE_LONGLONG
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
- #endif
#undef DECLARE_INTEGER_PARSER
+
};
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
@@ -861,13 +850,16 @@ inline bool RE2::Arg::Parse(const char* str, int n) const {
}
// This part of the parser, appropriate only for ints, deals with bases
-#define MAKE_INTEGER_PARSER(type, name) \
- inline RE2::Arg RE2::Hex(type* ptr) { \
- return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
- inline RE2::Arg RE2::Octal(type* ptr) { \
- return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
- inline RE2::Arg RE2::CRadix(type* ptr) { \
- return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
+#define MAKE_INTEGER_PARSER(type, name) \
+ inline RE2::Arg RE2::Hex(type* ptr) { \
+ return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
+ } \
+ inline RE2::Arg RE2::Octal(type* ptr) { \
+ return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
+ } \
+ inline RE2::Arg RE2::CRadix(type* ptr) { \
+ return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
+ }
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
@@ -875,15 +867,62 @@ MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
-#if RE2_HAVE_LONGLONG
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
-#endif
#undef MAKE_INTEGER_PARSER
+#ifndef SWIG
+// Helper for writing global or static RE2s safely.
+// Write
+// static LazyRE2 re = {".*"};
+// and then use *re instead of writing
+// static RE2 re(".*");
+// The former is more careful about multithreaded
+// situations than the latter.
+//
+// N.B. This class never deletes the RE2 object that
+// it constructs: that's a feature, so that it can be used
+// for global and function static variables.
+class LazyRE2 {
+ private:
+ struct NoArg {};
+
+ public:
+ typedef RE2 element_type; // support std::pointer_traits
+
+ // Constructor omitted to preserve braced initialization in C++98.
+
+ // Pretend to be a pointer to Type (never NULL due to on-demand creation):
+ RE2& operator*() const { return *get(); }
+ RE2* operator->() const { return get(); }
+
+ // Named accessor/initializer:
+ RE2* get() const {
+ std::call_once(once_, [this]() { LazyRE2::Init(this); });
+ return ptr_;
+ }
+
+ // All data fields must be public to support {"foo"} initialization.
+ const char* pattern_;
+ RE2::CannedOptions options_;
+ NoArg barrier_against_excess_initializers_;
+
+ mutable RE2* ptr_;
+ mutable std::once_flag once_;
+
+ private:
+ static void Init(const LazyRE2* lazy_re2) {
+ lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
+ }
+
+ void operator=(const LazyRE2&); // disallowed
+};
+#endif // SWIG
+
} // namespace re2
using re2::RE2;
+using re2::LazyRE2;
-#endif /* RE2_RE2_H */
+#endif // RE2_RE2_H_
diff --git a/re2/regexp.h b/re2/regexp.h
index 54720eb..607dc93 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_REGEXP_H_
+#define RE2_REGEXP_H_
+
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
@@ -83,9 +86,6 @@
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
-#ifndef RE2_REGEXP_H__
-#define RE2_REGEXP_H__
-
#include "util/util.h"
#include "re2/stringpiece.h"
@@ -630,4 +630,4 @@ inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
} // namespace re2
-#endif // RE2_REGEXP_H__
+#endif // RE2_REGEXP_H_
diff --git a/re2/set.h b/re2/set.h
index 5bcdb70..0e6d4b3 100644
--- a/re2/set.h
+++ b/re2/set.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_SET_H
-#define RE2_SET_H
+#ifndef RE2_SET_H_
+#define RE2_SET_H_
#include <utility>
#include <vector>
@@ -52,4 +52,4 @@ class RE2::Set {
} // namespace re2
-#endif // RE2_SET_H
+#endif // RE2_SET_H_
diff --git a/re2/stringpiece.h b/re2/stringpiece.h
index 1479d1a..0bf5b0c 100644
--- a/re2/stringpiece.h
+++ b/re2/stringpiece.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_STRINGPIECE_H_
+#define RE2_STRINGPIECE_H_
+
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
@@ -16,9 +19,6 @@
//
// Arghh! I wish C++ literals were "string".
-#ifndef STRINGS_STRINGPIECE_H__
-#define STRINGS_STRINGPIECE_H__
-
#include <string.h>
#include <algorithm>
#include <cstddef>
@@ -182,4 +182,4 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
// allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
-#endif // STRINGS_STRINGPIECE_H__
+#endif // RE2_STRINGPIECE_H_
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 0c598e0..cd8406d 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -143,63 +143,89 @@ TEST(TestRegexpCompileToProg, Simple) {
EXPECT_EQ(failed, 0);
}
-// The distinct byte ranges involved in the Latin-1 dot ([^\n]).
-static struct Latin1ByteRange {
- int lo;
- int hi;
-} latin1ranges[] = {
- { 0x00, 0x09 },
- { 0x0A, 0x0A },
- { 0x0B, 0xFF },
-};
-
-TEST(TestCompile, Latin1Ranges) {
- Regexp* re = Regexp::Parse(".", Regexp::PerlX|Regexp::Latin1, NULL);
+static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
+ string* bytemap) {
+ Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
+
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
- EXPECT_EQ(prog->bytemap_range(), arraysize(latin1ranges));
- for (int i = 0; i < arraysize(latin1ranges); i++)
- for (int j = latin1ranges[i].lo; j <= latin1ranges[i].hi; j++)
- EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
+ *bytemap = prog->DumpByteMap();
delete prog;
+
re->Decref();
}
-// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
-// Once, erroneously split between 0x3f and 0x40 because it is
-// a 6-bit boundary.
-static struct UTF8ByteRange {
- int lo;
- int hi;
-} utf8ranges[] = {
- { 0x00, 0x09 },
- { 0x0A, 0x0A },
- { 0x0B, 0x7F },
- { 0x80, 0x8F },
- { 0x90, 0x9F },
- { 0xA0, 0xBF },
- { 0xC0, 0xC1 },
- { 0xC2, 0xDF },
- { 0xE0, 0xE0 },
- { 0xE1, 0xEF },
- { 0xF0, 0xF0 },
- { 0xF1, 0xF3 },
- { 0xF4, 0xF4 },
- { 0xF5, 0xFF },
-};
+TEST(TestCompile, Latin1Ranges) {
+ // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
+
+ string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, OtherByteMapTests) {
+ string bytemap;
+
+ // Test that "absent" ranges are mapped to the same byte class.
+ DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-46] -> 1\n"
+ "[47-60] -> 0\n"
+ "[61-66] -> 1\n"
+ "[67-ff] -> 0\n",
+ bytemap);
+
+ // Test the byte classes for \b.
+ DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-5a] -> 1\n"
+ "[5b-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-60] -> 0\n"
+ "[61-7a] -> 1\n"
+ "[7b-ff] -> 0\n",
+ bytemap);
+
+ // Bug in the ASCII case-folding optimization created too many byte classes.
+ DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-ff] -> 0\n",
+ bytemap);
+}
TEST(TestCompile, UTF8Ranges) {
- Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
- EXPECT_TRUE(re != NULL);
- Prog* prog = re->CompileToProg(0);
- EXPECT_TRUE(prog != NULL);
- EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
- for (int i = 0; i < arraysize(utf8ranges); i++)
- for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
- EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
- delete prog;
- re->Decref();
+ // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+ // Once, erroneously split between 0x3f and 0x40 because it is
+ // a 6-bit boundary.
+
+ string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-7f] -> 0\n"
+ "[80-8f] -> 2\n"
+ "[90-9f] -> 3\n"
+ "[a0-bf] -> 4\n"
+ "[c0-c1] -> 1\n"
+ "[c2-df] -> 5\n"
+ "[e0-e0] -> 6\n"
+ "[e1-ef] -> 7\n"
+ "[f0-f0] -> 8\n"
+ "[f1-f3] -> 9\n"
+ "[f4-f4] -> 10\n"
+ "[f5-ff] -> 1\n",
+ bytemap);
}
TEST(TestCompile, InsufficientMemory) {
diff --git a/re2/testing/exhaustive_tester.h b/re2/testing/exhaustive_tester.h
index 1facb97..a8f39eb 100644
--- a/re2/testing/exhaustive_tester.h
+++ b/re2/testing/exhaustive_tester.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
-#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
+#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_
+#define RE2_TESTING_EXHAUSTIVE_TESTER_H_
#include <string>
#include <vector>
@@ -92,4 +92,4 @@ void EgrepTest(int maxatoms, int maxops, const string& alphabet,
} // namespace re2
-#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__
+#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_
diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc
index d843ffa..06c58f1 100644
--- a/re2/testing/re2_arg_test.cc
+++ b/re2/testing/re2_arg_test.cc
@@ -106,27 +106,27 @@ const int kNumStrings = arraysize(kSuccessTable);
} \
}
-TEST(REArgTest, Int16Test) {
+TEST(RE2ArgTest, Int16Test) {
PARSE_FOR_TYPE(int16, 0);
}
-TEST(REArgTest, Uint16Test) {
+TEST(RE2ArgTest, Uint16Test) {
PARSE_FOR_TYPE(uint16, 1);
}
-TEST(REArgTest, IntTest) {
+TEST(RE2ArgTest, IntTest) {
PARSE_FOR_TYPE(int, 2);
}
-TEST(REArgTest, UInt32Test) {
+TEST(RE2ArgTest, Uint32Test) {
PARSE_FOR_TYPE(uint32, 3);
}
-TEST(REArgTest, Iint64Test) {
+TEST(RE2ArgTest, Int64Test) {
PARSE_FOR_TYPE(int64, 4);
}
-TEST(REArgTest, Uint64Test) {
+TEST(RE2ArgTest, Uint64Test) {
PARSE_FOR_TYPE(uint64, 5);
}
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 81d5fef..830b3f7 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -1411,6 +1411,18 @@ TEST(RE2, UnicodeClasses) {
EXPECT_EQ("鋒", c);
}
+TEST(RE2, LazyRE2) {
+ // Test with and without options.
+ static LazyRE2 a = {"a"};
+ static LazyRE2 b = {"b", RE2::Latin1};
+
+ EXPECT_EQ("a", a->pattern());
+ EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding());
+
+ EXPECT_EQ("b", b->pattern());
+ EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding());
+}
+
// Bug reported by saito. 2009/02/17
TEST(RE2, NullVsEmptyString) {
RE2 re(".*");
diff --git a/re2/testing/regexp_generator.h b/re2/testing/regexp_generator.h
index 3ba0d70..06ea4c4 100644
--- a/re2/testing/regexp_generator.h
+++ b/re2/testing/regexp_generator.h
@@ -2,12 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_TESTING_REGEXP_GENERATOR_H_
+#define RE2_TESTING_REGEXP_GENERATOR_H_
+
// Regular expression generator: generates all possible
// regular expressions within given parameters (see below for details).
-#ifndef RE2_TESTING_REGEXP_GENERATOR_H__
-#define RE2_TESTING_REGEXP_GENERATOR_H__
-
#include <string>
#include <vector>
#include "util/random.h"
@@ -67,4 +67,4 @@ vector<string> Split(const StringPiece& sep, const StringPiece& s);
} // namespace re2
-#endif // RE2_TESTING_REGEXP_GENERATOR_H__
+#endif // RE2_TESTING_REGEXP_GENERATOR_H_
diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h
index 52e5e22..ff5a711 100644
--- a/re2/testing/string_generator.h
+++ b/re2/testing/string_generator.h
@@ -2,13 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_TESTING_STRING_GENERATOR_H_
+#define RE2_TESTING_STRING_GENERATOR_H_
+
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
-#ifndef RE2_TESTING_STRING_GENERATOR_H__
-#define RE2_TESTING_STRING_GENERATOR_H__
-
#include <string>
#include <vector>
#include "util/util.h"
@@ -55,4 +55,4 @@ class StringGenerator {
} // namespace re2
-#endif // RE2_TESTING_STRING_GENERATOR_H__
+#endif // RE2_TESTING_STRING_GENERATOR_H_
diff --git a/re2/testing/tester.h b/re2/testing/tester.h
index e9fae85..07291d2 100644
--- a/re2/testing/tester.h
+++ b/re2/testing/tester.h
@@ -2,12 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_TESTING_TESTER_H_
+#define RE2_TESTING_TESTER_H_
+
// Comparative tester for regular expression matching.
// Checks all implementations against each other.
-#ifndef RE2_TESTING_TESTER_H__
-#define RE2_TESTING_TESTER_H__
-
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@@ -118,4 +118,4 @@ bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
} // namespace re2
-#endif // RE2_TESTING_TESTER_H__
+#endif // RE2_TESTING_TESTER_H_
diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h
index 1671140..164ca41 100644
--- a/re2/unicode_casefold.h
+++ b/re2/unicode_casefold.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_UNICODE_CASEFOLD_H_
+#define RE2_UNICODE_CASEFOLD_H_
+
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
@@ -36,9 +39,6 @@
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
-#ifndef RE2_UNICODE_CASEFOLD_H__
-#define RE2_UNICODE_CASEFOLD_H__
-
#include "util/util.h"
namespace re2 {
@@ -72,4 +72,4 @@ extern Rune ApplyFold(const CaseFold *f, Rune r);
} // namespace re2
-#endif // RE2_UNICODE_CASEFOLD_H__
+#endif // RE2_UNICODE_CASEFOLD_H_
diff --git a/re2/unicode_groups.h b/re2/unicode_groups.h
index fc1c253..d61cd83 100644
--- a/re2/unicode_groups.h
+++ b/re2/unicode_groups.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_UNICODE_GROUPS_H_
+#define RE2_UNICODE_GROUPS_H_
+
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
@@ -15,9 +18,6 @@
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
-#ifndef RE2_UNICODE_GROUPS_H__
-#define RE2_UNICODE_GROUPS_H__
-
#include "util/util.h"
namespace re2 {
@@ -61,4 +61,4 @@ extern const int num_perl_groups;
} // namespace re2
-#endif // RE2_UNICODE_GROUPS_H__
+#endif // RE2_UNICODE_GROUPS_H_
diff --git a/re2/walker-inl.h b/re2/walker-inl.h
index bdcf7f5..6a1113a 100644
--- a/re2/walker-inl.h
+++ b/re2/walker-inl.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef RE2_WALKER_INL_H_
+#define RE2_WALKER_INL_H_
+
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
@@ -10,9 +13,6 @@
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
-#ifndef RE2_WALKER_INL_H__
-#define RE2_WALKER_INL_H__
-
#include "re2/regexp.h"
namespace re2 {
@@ -187,7 +187,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
- // Fall through.
+ FALLTHROUGH_INTENDED;
}
default: {
if (re->nsub_ > 0) {
@@ -241,4 +241,4 @@ template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
} // namespace re2
-#endif // RE2_WALKER_INL_H__
+#endif // RE2_WALKER_INL_H_
diff --git a/runtests b/runtests
index aadcb92..ce7c18d 100755
--- a/runtests
+++ b/runtests
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/usr/bin/env sh
success=true
for i
diff --git a/util/benchmark.cc b/util/benchmark.cc
index b77e22d..20b6765 100644
--- a/util/benchmark.cc
+++ b/util/benchmark.cc
@@ -44,7 +44,11 @@ static int64 nsec() {
return ticks.QuadPart;
#else
struct timespec tp;
+#ifdef CLOCK_PROCESS_CPUTIME_ID
+ if(clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &tp) < 0)
+#else
if(clock_gettime(CLOCK_REALTIME, &tp) < 0)
+#endif
return -1;
return (int64)tp.tv_sec*1000*1000*1000 + tp.tv_nsec;
#endif
diff --git a/util/benchmark.h b/util/benchmark.h
index 31bbd53..694565f 100644
--- a/util/benchmark.h
+++ b/util/benchmark.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_UTIL_BENCHMARK_H__
-#define RE2_UTIL_BENCHMARK_H__
+#ifndef UTIL_BENCHMARK_H_
+#define UTIL_BENCHMARK_H_
namespace testing {
struct Benchmark {
@@ -14,7 +14,7 @@ struct Benchmark {
int hi;
int threadlo;
int threadhi;
-
+
void Register();
Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }
Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }
@@ -38,4 +38,4 @@ int NumCPUs();
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
-#endif // RE2_UTIL_BENCHMARK_H__
+#endif // UTIL_BENCHMARK_H_
diff --git a/util/bitmap.h b/util/bitmap.h
new file mode 100644
index 0000000..8a93d81
--- /dev/null
+++ b/util/bitmap.h
@@ -0,0 +1,92 @@
+// Copyright 2016 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef UTIL_BITMAP_H_
+#define UTIL_BITMAP_H_
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#include "util/util.h"
+
+namespace re2 {
+
+class Bitmap256 {
+ public:
+ Bitmap256() {
+ memset(words_, 0, sizeof words_);
+ }
+
+ // Tests the bit with index c.
+ bool Test(int c) const {
+ DCHECK_GE(c, 0);
+ DCHECK_LE(c, 255);
+
+ return (words_[c / 64] & (1ULL << (c % 64))) != 0;
+ }
+
+ // Sets the bit with index c.
+ void Set(int c) {
+ DCHECK_GE(c, 0);
+ DCHECK_LE(c, 255);
+
+ words_[c / 64] |= (1ULL << (c % 64));
+ }
+
+ // Finds the next non-zero bit with index >= c.
+ // Returns -1 if no such bit exists.
+ int FindNextSetBit(int c) const;
+
+ private:
+ // Finds the least significant non-zero bit in n.
+ static int FindLSBSet(uint64 n) {
+ DCHECK_NE(n, 0);
+
+#if defined(__GNUC__)
+ return __builtin_ctzll(n);
+#elif defined(_MSC_VER)
+ unsigned long c;
+ _BitScanForward64(&c, n);
+ return static_cast<int>(c);
+#else
+#error "bit scan forward not implemented"
+#endif
+ }
+
+ uint64 words_[4];
+};
+
+int Bitmap256::FindNextSetBit(int c) const {
+ DCHECK_GE(c, 0);
+ DCHECK_LE(c, 255);
+
+ // Check the word that contains the bit. Mask out any lower bits.
+ int i = c / 64;
+ uint64 word = words_[i] & (~0ULL << (c % 64));
+ if (word != 0)
+ return (i * 64) + FindLSBSet(word);
+
+ // Check any following words.
+ i++;
+ switch (i) {
+ case 1:
+ if (words_[1] != 0)
+ return (1 * 64) + FindLSBSet(words_[1]);
+ FALLTHROUGH_INTENDED;
+ case 2:
+ if (words_[2] != 0)
+ return (2 * 64) + FindLSBSet(words_[2]);
+ FALLTHROUGH_INTENDED;
+ case 3:
+ if (words_[3] != 0)
+ return (3 * 64) + FindLSBSet(words_[3]);
+ FALLTHROUGH_INTENDED;
+ default:
+ return -1;
+ }
+}
+
+} // namespace re2
+
+#endif // UTIL_BITMAP_H_
diff --git a/util/flags.h b/util/flags.h
index 98d5c06..1fd5c91 100644
--- a/util/flags.h
+++ b/util/flags.h
@@ -2,14 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef UTIL_FLAGS_H_
+#define UTIL_FLAGS_H_
+
// Simplified version of Google's command line flags.
// Does not support parsing the command line.
// If you want to do that, see
// https://gflags.github.io/gflags/
-#ifndef RE2_UTIL_FLAGS_H__
-#define RE2_UTIL_FLAGS_H__
-
#define DEFINE_flag(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
@@ -24,4 +24,4 @@
#define DECLARE_int32(name) DECLARE_flag(int32, name)
#define DECLARE_string(name) DECLARE_flag(string, name)
-#endif // RE2_UTIL_FLAGS_H__
+#endif // UTIL_FLAGS_H_
diff --git a/util/logging.h b/util/logging.h
index feac199..1573b18 100644
--- a/util/logging.h
+++ b/util/logging.h
@@ -2,10 +2,10 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// Simplified version of Google's logging.
+#ifndef UTIL_LOGGING_H_
+#define UTIL_LOGGING_H_
-#ifndef RE2_UTIL_LOGGING_H__
-#define RE2_UTIL_LOGGING_H__
+// Simplified version of Google's logging.
#include <stdio.h> /* for fwrite */
#include <sstream>
@@ -106,4 +106,4 @@ class LogMessageFatal : public LogMessage {
#pragma warning(pop)
#endif
-#endif // RE2_UTIL_LOGGING_H__
+#endif // UTIL_LOGGING_H_
diff --git a/util/mutex.h b/util/mutex.h
index 8a13226..81121a4 100644
--- a/util/mutex.h
+++ b/util/mutex.h
@@ -2,14 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef UTIL_MUTEX_H_
+#define UTIL_MUTEX_H_
+
/*
* A simple mutex wrapper, supporting locks and read-write locks.
* You should assume the locks are *not* re-entrant.
*/
-#ifndef RE2_UTIL_MUTEX_H_
-#define RE2_UTIL_MUTEX_H_
-
#include <stdlib.h>
#if !defined(_WIN32)
@@ -210,4 +210,4 @@ class WriterMutexLock {
} // namespace re2
-#endif /* #define RE2_UTIL_MUTEX_H_ */
+#endif // UTIL_MUTEX_H_
diff --git a/util/pcre.cc b/util/pcre.cc
index 9a3f32d..87affdc 100644
--- a/util/pcre.cc
+++ b/util/pcre.cc
@@ -754,6 +754,13 @@ bool PCRE::Arg::parse_char(const char* str, int n, void* dest) {
return true;
}
+bool PCRE::Arg::parse_schar(const char* str, int n, void* dest) {
+ if (n != 1) return false;
+ if (dest == NULL) return true;
+ *(reinterpret_cast<signed char*>(dest)) = str[0];
+ return true;
+}
+
bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
@@ -838,8 +845,8 @@ bool PCRE::Arg::parse_short_radix(const char* str,
void* dest,
int radix) {
long r;
- if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
- if ((short)r != r) return false; // Out of range
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((short)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<short*>(dest)) = (short)r;
return true;
@@ -850,10 +857,10 @@ bool PCRE::Arg::parse_ushort_radix(const char* str,
void* dest,
int radix) {
unsigned long r;
- if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
- if ((ushort)r != r) return false; // Out of range
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((unsigned short)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<unsigned short*>(dest)) = (ushort)r;
+ *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
return true;
}
@@ -862,10 +869,10 @@ bool PCRE::Arg::parse_int_radix(const char* str,
void* dest,
int radix) {
long r;
- if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
- if ((int)r != r) return false; // Out of range
+ if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((int)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<int*>(dest)) = r;
+ *(reinterpret_cast<int*>(dest)) = (int)r;
return true;
}
@@ -874,10 +881,10 @@ bool PCRE::Arg::parse_uint_radix(const char* str,
void* dest,
int radix) {
unsigned long r;
- if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
- if ((uint)r != r) return false; // Out of range
+ if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
+ if ((unsigned int)r != r) return false; // Out of range
if (dest == NULL) return true;
- *(reinterpret_cast<unsigned int*>(dest)) = r;
+ *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
return true;
}
@@ -970,30 +977,29 @@ bool PCRE::Arg::parse_float(const char* str, int n, void* dest) {
return true;
}
-
-#define DEFINE_INTEGER_PARSERS(name) \
+#define DEFINE_INTEGER_PARSER(name) \
bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 10); \
- } \
+ return parse_##name##_radix(str, n, dest, 10); \
+ } \
bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 16); \
- } \
+ return parse_##name##_radix(str, n, dest, 16); \
+ } \
bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 8); \
- } \
+ return parse_##name##_radix(str, n, dest, 8); \
+ } \
bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
- return parse_##name##_radix(str, n, dest, 0); \
+ return parse_##name##_radix(str, n, dest, 0); \
}
-DEFINE_INTEGER_PARSERS(short);
-DEFINE_INTEGER_PARSERS(ushort);
-DEFINE_INTEGER_PARSERS(int);
-DEFINE_INTEGER_PARSERS(uint);
-DEFINE_INTEGER_PARSERS(long);
-DEFINE_INTEGER_PARSERS(ulong);
-DEFINE_INTEGER_PARSERS(longlong);
-DEFINE_INTEGER_PARSERS(ulonglong);
+DEFINE_INTEGER_PARSER(short);
+DEFINE_INTEGER_PARSER(ushort);
+DEFINE_INTEGER_PARSER(int);
+DEFINE_INTEGER_PARSER(uint);
+DEFINE_INTEGER_PARSER(long);
+DEFINE_INTEGER_PARSER(ulong);
+DEFINE_INTEGER_PARSER(longlong);
+DEFINE_INTEGER_PARSER(ulonglong);
-#undef DEFINE_INTEGER_PARSERS
+#undef DEFINE_INTEGER_PARSER
} // namespace re2
diff --git a/util/pcre.h b/util/pcre.h
index 20b10c0..9ccdf35 100644
--- a/util/pcre.h
+++ b/util/pcre.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef UTIL_PCRE_H_
+#define UTIL_PCRE_H_
+
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
// The main changes are the addition of the HitLimit method and
// compilation as PCRE in namespace re2.
@@ -570,13 +573,18 @@ class PCRE::Arg {
typedef bool (*Parser)(const char* str, int n, void* dest);
// Type-specific parsers
-#define MAKE_PARSER(type,name) \
- Arg(type* p) : arg_(p), parser_(name) { } \
- Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
-
+#define MAKE_PARSER(type, name) \
+ Arg(type* p) : arg_(p), parser_(name) {} \
+ Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char);
+ MAKE_PARSER(signed char, parse_schar);
MAKE_PARSER(unsigned char, parse_uchar);
+ MAKE_PARSER(float, parse_float);
+ MAKE_PARSER(double, parse_double);
+ MAKE_PARSER(string, parse_string);
+ MAKE_PARSER(StringPiece, parse_stringpiece);
+
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
@@ -585,10 +593,6 @@ class PCRE::Arg {
MAKE_PARSER(unsigned long, parse_ulong);
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
- MAKE_PARSER(float, parse_float);
- MAKE_PARSER(double, parse_double);
- MAKE_PARSER(string, parse_string);
- MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
@@ -608,21 +612,23 @@ class PCRE::Arg {
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
+ static bool parse_schar (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
-#define DECLARE_INTEGER_PARSER(name) \
- private: \
- static bool parse_ ## name(const char* str, int n, void* dest); \
- static bool parse_ ## name ## _radix( \
- const char* str, int n, void* dest, int radix); \
- public: \
- static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
- static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
- static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
+#define DECLARE_INTEGER_PARSER(name) \
+ private: \
+ static bool parse_##name(const char* str, int n, void* dest); \
+ static bool parse_##name##_radix(const char* str, int n, void* dest, \
+ int radix); \
+ \
+ public: \
+ static bool parse_##name##_hex(const char* str, int n, void* dest); \
+ static bool parse_##name##_octal(const char* str, int n, void* dest); \
+ static bool parse_##name##_cradix(const char* str, int n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
@@ -634,6 +640,7 @@ class PCRE::Arg {
DECLARE_INTEGER_PARSER(ulonglong);
#undef DECLARE_INTEGER_PARSER
+
};
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
@@ -644,13 +651,16 @@ inline bool PCRE::Arg::Parse(const char* str, int n) const {
}
// This part of the parser, appropriate only for ints, deals with bases
-#define MAKE_INTEGER_PARSER(type, name) \
- inline PCRE::Arg Hex(type* ptr) { \
- return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \
- inline PCRE::Arg Octal(type* ptr) { \
- return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \
- inline PCRE::Arg CRadix(type* ptr) { \
- return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); }
+#define MAKE_INTEGER_PARSER(type, name) \
+ inline PCRE::Arg Hex(type* ptr) { \
+ return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
+ } \
+ inline PCRE::Arg Octal(type* ptr) { \
+ return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
+ } \
+ inline PCRE::Arg CRadix(type* ptr) { \
+ return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
+ }
MAKE_INTEGER_PARSER(short, short);
MAKE_INTEGER_PARSER(unsigned short, ushort);
@@ -664,3 +674,5 @@ MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
#undef MAKE_INTEGER_PARSER
} // namespace re2
+
+#endif // UTIL_PCRE_H_
diff --git a/util/random.h b/util/random.h
index 6c6e701..6c67b2c 100644
--- a/util/random.h
+++ b/util/random.h
@@ -2,10 +2,10 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// Modified from Google perftools's tcmalloc_unittest.cc.
+#ifndef UTIL_RANDOM_H_
+#define UTIL_RANDOM_H_
-#ifndef RE2_UTIL_RANDOM_H__
-#define RE2_UTIL_RANDOM_H__
+// Modified from Google perftools's tcmalloc_unittest.cc.
#include "util/util.h"
@@ -26,4 +26,4 @@ class ACMRandom {
} // namespace re2
-#endif // RE2_UTIL_RANDOM_H__
+#endif // UTIL_RANDOM_H_
diff --git a/util/sparse_array.h b/util/sparse_array.h
index 8f71fa0..d37a10a 100644
--- a/util/sparse_array.h
+++ b/util/sparse_array.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef UTIL_SPARSE_ARRAY_H_
+#define UTIL_SPARSE_ARRAY_H_
+
// DESCRIPTION
//
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
@@ -89,9 +92,6 @@
// immediately become inaccessible, but they are only guaranteed to be
// destroyed when the SparseArray destructor is called.
-#ifndef RE2_UTIL_SPARSE_ARRAY_H__
-#define RE2_UTIL_SPARSE_ARRAY_H__
-
#include "util/util.h"
namespace re2 {
@@ -462,4 +462,4 @@ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
} // namespace re2
-#endif // RE2_UTIL_SPARSE_ARRAY_H__
+#endif // UTIL_SPARSE_ARRAY_H_
diff --git a/util/sparse_set.h b/util/sparse_set.h
index 9dd41ee..537a094 100644
--- a/util/sparse_set.h
+++ b/util/sparse_set.h
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#ifndef UTIL_SPARSE_SET_H_
+#define UTIL_SPARSE_SET_H_
+
// DESCRIPTION
//
// SparseSet<T>(m) is a set of integers in [0, m).
@@ -44,9 +47,6 @@
//
// See sparse_array.h for implementation details
-#ifndef RE2_UTIL_SPARSE_SET_H__
-#define RE2_UTIL_SPARSE_SET_H__
-
#include "util/util.h"
namespace re2 {
@@ -182,4 +182,4 @@ class SparseSet {
} // namespace re2
-#endif // RE2_UTIL_SPARSE_SET_H__
+#endif // UTIL_SPARSE_SET_H_
diff --git a/util/test.h b/util/test.h
index 3701eab..4bdd343 100644
--- a/util/test.h
+++ b/util/test.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_UTIL_TEST_H__
-#define RE2_UTIL_TEST_H__
+#ifndef UTIL_TEST_H_
+#define UTIL_TEST_H_
#include "util/util.h"
#include "util/flags.h"
@@ -42,4 +42,4 @@ class MallocCounter {
};
} // namespace testing
-#endif // RE2_UTIL_TEST_H__
+#endif // UTIL_TEST_H_
diff --git a/util/thread.h b/util/thread.h
index fb67bdc..f9ecaf6 100644
--- a/util/thread.h
+++ b/util/thread.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_UTIL_THREAD_H__
-#define RE2_UTIL_THREAD_H__
+#ifndef UTIL_THREAD_H_
+#define UTIL_THREAD_H_
#ifdef _WIN32
#include <windows.h>
@@ -30,4 +30,4 @@ class Thread {
bool joinable_;
};
-#endif // RE2_UTIL_THREAD_H__
+#endif // UTIL_THREAD_H_
diff --git a/util/utf.h b/util/utf.h
index 06ff8f0..85b4297 100644
--- a/util/utf.h
+++ b/util/utf.h
@@ -14,8 +14,9 @@
* This file and rune.cc have been converted to compile as C++ code
* in name space re2.
*/
-#ifndef RE2_UTIL_UTF_H__
-#define RE2_UTIL_UTF_H__
+
+#ifndef UTIL_UTF_H_
+#define UTIL_UTF_H_
#include <stdint.h>
@@ -40,4 +41,4 @@ char* utfrune(const char*, Rune);
} // namespace re2
-#endif // RE2_UTIL_UTF_H__
+#endif // UTIL_UTF_H_
diff --git a/util/util.h b/util/util.h
index 1e78aa3..27c075f 100644
--- a/util/util.h
+++ b/util/util.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#ifndef RE2_UTIL_UTIL_H__
-#define RE2_UTIL_UTIL_H__
+#ifndef UTIL_UTIL_H_
+#define UTIL_UTIL_H_
// C
#include <stdio.h>
@@ -33,6 +33,7 @@
#include <atomic>
#include <mutex> // For std::call_once
#include <unordered_set>
+#include <initializer_list>
// Use std names.
using std::set;
@@ -58,6 +59,8 @@ using std::unordered_set;
#define strtoull _strtoui64
#define vsnprintf vsnprintf_s
+#pragma warning(disable: 4200) // zero-sized array
+
#endif
namespace re2 {
@@ -71,9 +74,7 @@ typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
-typedef unsigned long ulong;
typedef unsigned int uint;
-typedef unsigned short ushort;
// Prevent the compiler from complaining about or optimizing away variables
// that appear unused.
@@ -101,6 +102,10 @@ template<bool> struct CompileAssert {};
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))
+#ifndef FALLTHROUGH_INTENDED
+#define FALLTHROUGH_INTENDED do { } while (0)
+#endif
+
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
@@ -138,4 +143,4 @@ bool RunningOnValgrind();
#include "util/mutex.h"
#include "util/utf.h"
-#endif // RE2_UTIL_UTIL_H__
+#endif // UTIL_UTIL_H_
diff --git a/util/valgrind.h b/util/valgrind.h
index ca10b1a..2200a22 100644
--- a/util/valgrind.h
+++ b/util/valgrind.h
@@ -55,6 +55,8 @@
----------------------------------------------------------------
*/
+#ifndef UTIL_VALGRIND_H_
+#define UTIL_VALGRIND_H_
/* This file is for inclusion into client (your!) code.
@@ -70,9 +72,6 @@
problem, you can compile with the NVALGRIND symbol defined (gcc
-DNVALGRIND) so that client requests are not even compiled in. */
-#ifndef __VALGRIND_H
-#define __VALGRIND_H
-
#include <stdarg.h>
/* Nb: this file might be included in a file compiled with -ansi. So
@@ -4514,4 +4513,4 @@ VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
#undef PLAT_ppc32_aix5
#undef PLAT_ppc64_aix5
-#endif /* __VALGRIND_H */
+#endif // UTIL_VALGRIND_H_