summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDongHun Kwak <dh0128.kwak@samsung.com>2018-12-05 10:36:21 +0900
committerDongHun Kwak <dh0128.kwak@samsung.com>2018-12-05 10:36:24 +0900
commit7615b8c36db26d7f3731eb4bdac2e85ba4984413 (patch)
tree2d83f6b8f90ded941a6b5c8cf554945bd90f567e
parent41cf41822489f66bc95b5e0572f5eee10a524274 (diff)
downloadre2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.tar.gz
re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.tar.bz2
re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.zip
Imported Upstream version 20181201upstream/20181201
Change-Id: I4e012361ab9520e396e02cc612d950a1698f0be3 Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
-rw-r--r--re2/fuzzing/re2_fuzzer.cc35
-rw-r--r--re2/nfa.cc66
-rw-r--r--re2/prefilter.cc11
-rw-r--r--re2/prefilter_tree.cc1
-rw-r--r--re2/re2.cc28
-rw-r--r--re2/re2.h19
-rw-r--r--re2/testing/re2_test.cc21
-rw-r--r--util/pcre.h4
8 files changed, 120 insertions, 65 deletions
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index 3ce4d1b..28ccaf4 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -5,8 +5,11 @@
#include <stddef.h>
#include <stdint.h>
#include <map>
+#include <memory>
+#include <queue>
#include <string>
+#include "re2/prefilter.h"
#include "re2/re2.h"
using re2::StringPiece;
@@ -21,17 +24,45 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
return;
// Don't waste time fuzzing high-size programs.
- // (They can cause bug reports due to fuzzer timeouts.)
+ // They can cause bug reports due to fuzzer timeouts.
int size = re.ProgramSize();
if (size > 9999)
return;
+ int rsize = re.ReverseProgramSize();
+ if (rsize > 9999)
+ return;
// Don't waste time fuzzing high-fanout programs.
- // (They can also cause bug reports due to fuzzer timeouts.)
+ // They can cause bug reports due to fuzzer timeouts.
std::map<int, int> histogram;
int fanout = re.ProgramFanout(&histogram);
if (fanout > 9)
return;
+ int rfanout = re.ReverseProgramFanout(&histogram);
+ if (rfanout > 9)
+ return;
+
+ // Don't waste time fuzzing programs with large substrings.
+ // They can cause bug reports due to fuzzer timeouts when they
+ // are repetitions (e.g. hundreds of NUL bytes) and matching is
+ // unanchored. And they aren't interesting for fuzzing purposes.
+ std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
+ if (prefilter == nullptr)
+ return;
+ std::queue<re2::Prefilter*> nodes;
+ nodes.push(prefilter.get());
+ while (!nodes.empty()) {
+ re2::Prefilter* node = nodes.front();
+ nodes.pop();
+ if (node->op() == re2::Prefilter::ATOM) {
+ if (node->atom().size() > 9)
+ return;
+ } else if (node->op() == re2::Prefilter::AND ||
+ node->op() == re2::Prefilter::OR) {
+ for (re2::Prefilter* sub : *node->subs())
+ nodes.push(sub);
+ }
+ }
StringPiece sp1, sp2, sp3, sp4;
string s1, s2, s3, s4;
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 1a3e0af..b2d5c0a 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -95,20 +95,20 @@ class NFA {
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
- // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+ // context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
- void AddToThreadq(Threadq* q, int id0, int c, int flag,
+ void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
+ // context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
- // flag is the bitwise OR of Bol, Eol, etc., specifying whether
- // ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
- inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
+ int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+ const char* p);
// Returns text version of capture information, for debugging.
string FormatCapture(const char** capture);
@@ -204,9 +204,9 @@ void NFA::CopyCapture(const char** dst, const char** src) {
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
-// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
-void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
+void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0) {
if (id0 == 0)
return;
@@ -318,7 +318,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
stk[nstk++] = AddState(id+1);
// Continue on if we have all the right flag bits.
- if (ip->empty() & ~flag)
+ if (ip->empty() & ~Prog::EmptyFlags(context, p))
break;
a = AddState(ip->out());
goto Loop;
@@ -328,13 +328,13 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
+// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
-// flag is the bitwise OR of Bol, Eol, etc., specifying whether
-// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
-int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+ const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
@@ -360,7 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
break;
case kInstByteRange:
- AddToThreadq(nextq, ip->out(), c, flag, p, t);
+ AddToThreadq(nextq, ip->out(), c, context, p, t);
break;
case kInstAltMatch:
@@ -500,38 +500,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
- int wasword = 0;
-
- if (text.begin() > context.begin())
- wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
- // Check for empty-width specials.
- int flag = 0;
-
- // ^ and \A
- if (p == context.begin())
- flag |= kEmptyBeginText | kEmptyBeginLine;
- else if (p <= context.end() && p[-1] == '\n')
- flag |= kEmptyBeginLine;
-
- // $ and \z
- if (p == context.end())
- flag |= kEmptyEndText | kEmptyEndLine;
- else if (p < context.end() && p[0] == '\n')
- flag |= kEmptyEndLine;
-
- // \b and \B
- int isword = 0;
- if (p < context.end())
- isword = Prog::IsWordChar(p[0] & 0xFF);
-
- if (isword != wasword)
- flag |= kEmptyWordBoundary;
- else
- flag |= kEmptyNonWordBoundary;
-
if (ExtraDebug) {
int c = 0;
if (p == context.begin())
@@ -541,7 +512,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
else if (p < text.end())
c = p[0] & 0xFF;
- fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
+ fprintf(stderr, "%c:", c);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
@@ -552,7 +523,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
}
// This is a no-op the first time around the loop because runq is empty.
- int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
+ int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
@@ -604,17 +575,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
p = text.end();
- isword = 0;
- } else {
- isword = Prog::IsWordChar(p[0] & 0xFF);
}
- flag = Prog::EmptyFlags(context, p);
}
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
- AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
+ AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
+ t);
Decref(t);
}
@@ -624,8 +592,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
fprintf(stderr, "dead\n");
break;
}
-
- wasword = isword;
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index e34aaf0..b657357 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -214,7 +214,7 @@ class Prefilter::Info {
static Info* Quest(Info* a);
static Info* EmptyString();
static Info* NoMatch();
- static Info* AnyChar();
+ static Info* AnyCharOrAnyByte();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
@@ -417,8 +417,8 @@ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
return info;
}
-// Constructs Info for dot (any character).
-Prefilter::Info* Prefilter::Info::AnyChar() {
+// Constructs Info for dot (any character) or \C (any byte).
+Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
@@ -461,7 +461,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
// If the class is too large, it's okay to overestimate.
if (cc->size() > 10)
- return AnyChar();
+ return AnyCharOrAnyByte();
Prefilter::Info *a = new Prefilter::Info();
for (CCIter i = cc->begin(); i != cc->end(); ++i)
@@ -622,8 +622,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
break;
case kRegexpAnyChar:
+ case kRegexpAnyByte:
// Claim nothing, except that it's not empty.
- info = AnyChar();
+ info = AnyCharOrAnyByte();
break;
case kRegexpCharClass:
diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc
index 376c6cd..a07de40 100644
--- a/re2/prefilter_tree.cc
+++ b/re2/prefilter_tree.cc
@@ -138,6 +138,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const {
return false;
case Prefilter::ALL:
+ case Prefilter::NONE:
return false;
case Prefilter::ATOM:
diff --git a/re2/re2.cc b/re2/re2.cc
index 89c67ec..f3f96ae 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -262,11 +262,18 @@ int RE2::ProgramSize() const {
return prog_->size();
}
-int RE2::ProgramFanout(std::map<int, int>* histogram) const {
+int RE2::ReverseProgramSize() const {
if (prog_ == NULL)
return -1;
- SparseArray<int> fanout(prog_->size());
- prog_->Fanout(&fanout);
+ Prog* prog = ReverseProg();
+ if (prog == NULL)
+ return -1;
+ return prog->size();
+}
+
+static int Fanout(Prog* prog, std::map<int, int>* histogram) {
+ SparseArray<int> fanout(prog->size());
+ prog->Fanout(&fanout);
histogram->clear();
for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
// TODO(junyer): Optimise this?
@@ -279,6 +286,21 @@ int RE2::ProgramFanout(std::map<int, int>* histogram) const {
return histogram->rbegin()->first;
}
+int RE2::ProgramFanout(std::map<int, int>* histogram) const {
+ if (prog_ == NULL)
+ return -1;
+ return Fanout(prog_, histogram);
+}
+
+int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
+ if (prog_ == NULL)
+ return -1;
+ Prog* prog = ReverseProg();
+ if (prog == NULL)
+ return -1;
+ return Fanout(prog, histogram);
+}
+
// Returns num_captures_, computing it if needed, or -1 if the
// regexp wasn't valid on construction.
int RE2::NumberOfCapturingGroups() const {
diff --git a/re2/re2.h b/re2/re2.h
index 2a012e1..c93de56 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -53,9 +53,19 @@
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
-// MATCHING WITH SUB-STRING EXTRACTION:
-//
-// You can supply extra pointer arguments to extract matched subpieces.
+// MATCHING WITH SUBSTRING EXTRACTION:
+//
+// You can supply extra pointer arguments to extract matched substrings.
+// On match failure, none of the pointees will have been modified.
+// On match success, the substrings will be converted (as necessary) and
+// their values will be assigned to their pointees until all conversions
+// have succeeded or one conversion has failed.
+// On conversion failure, the pointees will be in an indeterminate state
+// because the caller has no way of knowing which conversion failed.
+// However, conversion cannot fail for types like string and StringPiece
+// that do not inspect the substring contents. Hence, in the common case
+// where all of the pointees are of such types, failure is always due to
+// match failure and thus none of the pointees will have been modified.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
@@ -278,11 +288,13 @@ class RE2 {
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
+ int ReverseProgramSize() const;
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
int ProgramFanout(std::map<int, int>* histogram) const;
+ int ReverseProgramFanout(std::map<int, int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
@@ -489,6 +501,7 @@ class RE2 {
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with
// submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar",
// submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL.
+ // Caveat: submatch[] may be clobbered even on match failure.
//
// Don't ask for more match information than you will use:
// runs much faster with nsubmatch == 1 than nsubmatch > 1, and
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index b847325..cae956c 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -461,6 +461,10 @@ TEST(ProgramSize, BigProgram) {
ASSERT_GT(re_simple.ProgramSize(), 0);
ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
+
+ ASSERT_GT(re_simple.ReverseProgramSize(), 0);
+ ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize());
+ ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize());
}
TEST(ProgramFanout, BigProgram) {
@@ -486,6 +490,23 @@ TEST(ProgramFanout, BigProgram) {
// 13 is the largest non-empty bucket and has 1000 elements.
ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
ASSERT_EQ(1000, histogram[13]);
+
+ // 2 is the largest non-empty bucket and has 3 elements.
+ // This differs from the others due to how reverse `.' works.
+ ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
+ ASSERT_EQ(3, histogram[2]);
+
+ // 5 is the largest non-empty bucket and has 10 elements.
+ ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
+ ASSERT_EQ(10, histogram[5]);
+
+ // 9 is the largest non-empty bucket and has 100 elements.
+ ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram));
+ ASSERT_EQ(100, histogram[9]);
+
+ // 12 is the largest non-empty bucket and has 1000 elements.
+ ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram));
+ ASSERT_EQ(1000, histogram[12]);
}
// Issue 956519: handling empty character sets was
diff --git a/util/pcre.h b/util/pcre.h
index 7c6403d..10ec4f2 100644
--- a/util/pcre.h
+++ b/util/pcre.h
@@ -61,9 +61,9 @@
// CHECK(PCRE::FullMatch(utf8_string, re));
//
// -----------------------------------------------------------------------
-// MATCHING WITH SUB-STRING EXTRACTION:
+// MATCHING WITH SUBSTRING EXTRACTION:
//
-// You can supply extra pointer arguments to extract matched subpieces.
+// You can supply extra pointer arguments to extract matched substrings.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;