Imported Upstream version 20181201upstream/20181201

Change-Id: I4e012361ab9520e396e02cc612d950a1698f0be3 Signed-off-by: DongHun Kwak <dh0128.kwak@samsung.com>
author: DongHun Kwak <dh0128.kwak@samsung.com> 2018-12-05 10:36:21 +0900
committer: DongHun Kwak <dh0128.kwak@samsung.com> 2018-12-05 10:36:24 +0900
commit: 7615b8c36db26d7f3731eb4bdac2e85ba4984413 (patch)
tree: 2d83f6b8f90ded941a6b5c8cf554945bd90f567e
parent: 41cf41822489f66bc95b5e0572f5eee10a524274 (diff)
download: re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.tar.gz
re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.tar.bz2
re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.zip
8 files changed, 120 insertions, 65 deletions
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index 3ce4d1b..28ccaf4 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -5,8 +5,11 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <map>
+#include <memory>
+#include <queue>
 #include <string>
 
+#include "re2/prefilter.h"
 #include "re2/re2.h"
 
 using re2::StringPiece;
@@ -21,17 +24,45 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
     return;
 
   // Don't waste time fuzzing high-size programs.
-  // (They can cause bug reports due to fuzzer timeouts.)
+  // They can cause bug reports due to fuzzer timeouts.
   int size = re.ProgramSize();
   if (size > 9999)
     return;
+  int rsize = re.ReverseProgramSize();
+  if (rsize > 9999)
+    return;
 
   // Don't waste time fuzzing high-fanout programs.
-  // (They can also cause bug reports due to fuzzer timeouts.)
+  // They can cause bug reports due to fuzzer timeouts.
   std::map<int, int> histogram;
   int fanout = re.ProgramFanout(&histogram);
   if (fanout > 9)
     return;
+  int rfanout = re.ReverseProgramFanout(&histogram);
+  if (rfanout > 9)
+    return;
+
+  // Don't waste time fuzzing programs with large substrings.
+  // They can cause bug reports due to fuzzer timeouts when they
+  // are repetitions (e.g. hundreds of NUL bytes) and matching is
+  // unanchored. And they aren't interesting for fuzzing purposes.
+  std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
+  if (prefilter == nullptr)
+    return;
+  std::queue<re2::Prefilter*> nodes;
+  nodes.push(prefilter.get());
+  while (!nodes.empty()) {
+    re2::Prefilter* node = nodes.front();
+    nodes.pop();
+    if (node->op() == re2::Prefilter::ATOM) {
+      if (node->atom().size() > 9)
+        return;
+    } else if (node->op() == re2::Prefilter::AND ||
+               node->op() == re2::Prefilter::OR) {
+      for (re2::Prefilter* sub : *node->subs())
+        nodes.push(sub);
+    }
+  }
 
   StringPiece sp1, sp2, sp3, sp4;
   string s1, s2, s3, s4;
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 1a3e0af..b2d5c0a 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -95,20 +95,20 @@ class NFA {
 
   // Follows all empty arrows from id0 and enqueues all the states reached.
   // Enqueues only the ByteRange instructions that match byte c.
-  // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+  // context is used (with p) for evaluating empty-width specials.
   // p is the current input position, and t0 is the current thread.
-  void AddToThreadq(Threadq* q, int id0, int c, int flag,
+  void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
                     const char* p, Thread* t0);
 
   // Run runq on byte c, appending new states to nextq.
   // Updates matched_ and match_ as new, better matches are found.
+  // context is used (with p) for evaluating empty-width specials.
   // p is the position of byte c in the input string for AddToThreadq;
   // p-1 will be used when processing Match instructions.
-  // flag is the bitwise OR of Bol, Eol, etc., specifying whether
-  // ^, $ and \b match the current input position (after c).
   // Frees all the threads on runq.
   // If there is a shortcut to the end, returns that shortcut.
-  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
+  int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+           const char* p);
 
   // Returns text version of capture information, for debugging.
   string FormatCapture(const char** capture);
@@ -204,9 +204,9 @@ void NFA::CopyCapture(const char** dst, const char** src) {
 
 // Follows all empty arrows from id0 and enqueues all the states reached.
 // Enqueues only the ByteRange instructions that match byte c.
-// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
+// context is used (with p) for evaluating empty-width specials.
 // p is the current input position, and t0 is the current thread.
-void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
+void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
                        const char* p, Thread* t0) {
   if (id0 == 0)
     return;
@@ -318,7 +318,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
         stk[nstk++] = AddState(id+1);
 
       // Continue on if we have all the right flag bits.
-      if (ip->empty() & ~flag)
+      if (ip->empty() & ~Prog::EmptyFlags(context, p))
         break;
       a = AddState(ip->out());
       goto Loop;
@@ -328,13 +328,13 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
 
 // Run runq on byte c, appending new states to nextq.
 // Updates matched_ and match_ as new, better matches are found.
+// context is used (with p) for evaluating empty-width specials.
 // p is the position of byte c in the input string for AddToThreadq;
 // p-1 will be used when processing Match instructions.
-// flag is the bitwise OR of Bol, Eol, etc., specifying whether
-// ^, $ and \b match the current input position (after c).
 // Frees all the threads on runq.
 // If there is a shortcut to the end, returns that shortcut.
-int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+              const char* p) {
   nextq->clear();
 
   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
@@ -360,7 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
         break;
 
       case kInstByteRange:
-        AddToThreadq(nextq, ip->out(), c, flag, p, t);
+        AddToThreadq(nextq, ip->out(), c, context, p, t);
         break;
 
       case kInstAltMatch:
@@ -500,38 +500,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
   runq->clear();
   nextq->clear();
   memset(&match_[0], 0, ncapture_*sizeof match_[0]);
-  int wasword = 0;
-
-  if (text.begin() > context.begin())
-    wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
 
   // Loop over the text, stepping the machine.
   for (const char* p = text.begin();; p++) {
-    // Check for empty-width specials.
-    int flag = 0;
-
-    // ^ and \A
-    if (p == context.begin())
-      flag |= kEmptyBeginText | kEmptyBeginLine;
-    else if (p <= context.end() && p[-1] == '\n')
-      flag |= kEmptyBeginLine;
-
-    // $ and \z
-    if (p == context.end())
-      flag |= kEmptyEndText | kEmptyEndLine;
-    else if (p < context.end() && p[0] == '\n')
-      flag |= kEmptyEndLine;
-
-    // \b and \B
-    int isword = 0;
-    if (p < context.end())
-      isword = Prog::IsWordChar(p[0] & 0xFF);
-
-    if (isword != wasword)
-      flag |= kEmptyWordBoundary;
-    else
-      flag |= kEmptyNonWordBoundary;
-
     if (ExtraDebug) {
       int c = 0;
       if (p == context.begin())
@@ -541,7 +512,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
       else if (p < text.end())
         c = p[0] & 0xFF;
 
-      fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
+      fprintf(stderr, "%c:", c);
       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
         Thread* t = i->second;
         if (t == NULL)
@@ -552,7 +523,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
     }
 
     // This is a no-op the first time around the loop because runq is empty.
-    int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
+    int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
     DCHECK_EQ(runq->size(), 0);
     using std::swap;
     swap(nextq, runq);
@@ -604,17 +575,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
         p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
         if (p == NULL) {
           p = text.end();
-          isword = 0;
-        } else {
-          isword = Prog::IsWordChar(p[0] & 0xFF);
         }
-        flag = Prog::EmptyFlags(context, p);
       }
 
       Thread* t = AllocThread();
       CopyCapture(t->capture, match_);
       t->capture[0] = p;
-      AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
+      AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
+                   t);
       Decref(t);
     }
 
@@ -624,8 +592,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
         fprintf(stderr, "dead\n");
       break;
     }
-
-    wasword = isword;
   }
 
   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index e34aaf0..b657357 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -214,7 +214,7 @@ class Prefilter::Info {
   static Info* Quest(Info* a);
   static Info* EmptyString();
   static Info* NoMatch();
-  static Info* AnyChar();
+  static Info* AnyCharOrAnyByte();
   static Info* CClass(CharClass* cc, bool latin1);
   static Info* Literal(Rune r);
   static Info* LiteralLatin1(Rune r);
@@ -417,8 +417,8 @@ Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
   return info;
 }
 
-// Constructs Info for dot (any character).
-Prefilter::Info* Prefilter::Info::AnyChar() {
+// Constructs Info for dot (any character) or \C (any byte).
+Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
   Prefilter::Info* info = new Prefilter::Info();
   info->match_ = new Prefilter(ALL);
   return info;
@@ -461,7 +461,7 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
 
   // If the class is too large, it's okay to overestimate.
   if (cc->size() > 10)
-    return AnyChar();
+    return AnyCharOrAnyByte();
 
   Prefilter::Info *a = new Prefilter::Info();
   for (CCIter i = cc->begin(); i != cc->end(); ++i)
@@ -622,8 +622,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
       break;
 
     case kRegexpAnyChar:
+    case kRegexpAnyByte:
       // Claim nothing, except that it's not empty.
-      info = AnyChar();
+      info = AnyCharOrAnyByte();
       break;
 
     case kRegexpCharClass:
diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc
index 376c6cd..a07de40 100644
--- a/re2/prefilter_tree.cc
+++ b/re2/prefilter_tree.cc
@@ -138,6 +138,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const {
       return false;
 
     case Prefilter::ALL:
+    case Prefilter::NONE:
       return false;
 
     case Prefilter::ATOM:
diff --git a/re2/re2.cc b/re2/re2.cc
index 89c67ec..f3f96ae 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -262,11 +262,18 @@ int RE2::ProgramSize() const {
   return prog_->size();
 }
 
-int RE2::ProgramFanout(std::map<int, int>* histogram) const {
+int RE2::ReverseProgramSize() const {
   if (prog_ == NULL)
     return -1;
-  SparseArray<int> fanout(prog_->size());
-  prog_->Fanout(&fanout);
+  Prog* prog = ReverseProg();
+  if (prog == NULL)
+    return -1;
+  return prog->size();
+}
+
+static int Fanout(Prog* prog, std::map<int, int>* histogram) {
+  SparseArray<int> fanout(prog->size());
+  prog->Fanout(&fanout);
   histogram->clear();
   for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
     // TODO(junyer): Optimise this?
@@ -279,6 +286,21 @@ int RE2::ProgramFanout(std::map<int, int>* histogram) const {
   return histogram->rbegin()->first;
 }
 
+int RE2::ProgramFanout(std::map<int, int>* histogram) const {
+  if (prog_ == NULL)
+    return -1;
+  return Fanout(prog_, histogram);
+}
+
+int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
+  if (prog_ == NULL)
+    return -1;
+  Prog* prog = ReverseProg();
+  if (prog == NULL)
+    return -1;
+  return Fanout(prog, histogram);
+}
+
 // Returns num_captures_, computing it if needed, or -1 if the
 // regexp wasn't valid on construction.
 int RE2::NumberOfCapturingGroups() const {
diff --git a/re2/re2.h b/re2/re2.h
index 2a012e1..c93de56 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -53,9 +53,19 @@
 //    CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
 //
 // -----------------------------------------------------------------------
-// MATCHING WITH SUB-STRING EXTRACTION:
-//
-// You can supply extra pointer arguments to extract matched subpieces.
+// MATCHING WITH SUBSTRING EXTRACTION:
+//
+// You can supply extra pointer arguments to extract matched substrings.
+// On match failure, none of the pointees will have been modified.
+// On match success, the substrings will be converted (as necessary) and
+// their values will be assigned to their pointees until all conversions
+// have succeeded or one conversion has failed.
+// On conversion failure, the pointees will be in an indeterminate state
+// because the caller has no way of knowing which conversion failed.
+// However, conversion cannot fail for types like string and StringPiece
+// that do not inspect the substring contents. Hence, in the common case
+// where all of the pointees are of such types, failure is always due to
+// match failure and thus none of the pointees will have been modified.
 //
 // Example: extracts "ruby" into "s" and 1234 into "i"
 //    int i;
@@ -278,11 +288,13 @@ class RE2 {
   // Returns the program size, a very approximate measure of a regexp's "cost".
   // Larger numbers are more expensive than smaller numbers.
   int ProgramSize() const;
+  int ReverseProgramSize() const;
 
   // EXPERIMENTAL! SUBJECT TO CHANGE!
   // Outputs the program fanout as a histogram bucketed by powers of 2.
   // Returns the number of the largest non-empty bucket.
   int ProgramFanout(std::map<int, int>* histogram) const;
+  int ReverseProgramFanout(std::map<int, int>* histogram) const;
 
   // Returns the underlying Regexp; not for general use.
   // Returns entire_regexp_ so that callers don't need
@@ -489,6 +501,7 @@ class RE2 {
   // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with
   // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar",
   // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL.
+  // Caveat: submatch[] may be clobbered even on match failure.
   //
   // Don't ask for more match information than you will use:
   // runs much faster with nsubmatch == 1 than nsubmatch > 1, and
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index b847325..cae956c 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -461,6 +461,10 @@ TEST(ProgramSize, BigProgram) {
   ASSERT_GT(re_simple.ProgramSize(), 0);
   ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize());
   ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize());
+
+  ASSERT_GT(re_simple.ReverseProgramSize(), 0);
+  ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize());
+  ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize());
 }
 
 TEST(ProgramFanout, BigProgram) {
@@ -486,6 +490,23 @@ TEST(ProgramFanout, BigProgram) {
   // 13 is the largest non-empty bucket and has 1000 elements.
   ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
   ASSERT_EQ(1000, histogram[13]);
+
+  // 2 is the largest non-empty bucket and has 3 elements.
+  // This differs from the others due to how reverse `.' works.
+  ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(3, histogram[2]);
+
+  // 5 is the largest non-empty bucket and has 10 elements.
+  ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(10, histogram[5]);
+
+  // 9 is the largest non-empty bucket and has 100 elements.
+  ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(100, histogram[9]);
+
+  // 12 is the largest non-empty bucket and has 1000 elements.
+  ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram));
+  ASSERT_EQ(1000, histogram[12]);
 }
 
 // Issue 956519: handling empty character sets was
diff --git a/util/pcre.h b/util/pcre.h
index 7c6403d..10ec4f2 100644
--- a/util/pcre.h
+++ b/util/pcre.h
@@ -61,9 +61,9 @@
 //    CHECK(PCRE::FullMatch(utf8_string, re));
 //
 // -----------------------------------------------------------------------
-// MATCHING WITH SUB-STRING EXTRACTION:
+// MATCHING WITH SUBSTRING EXTRACTION:
 //
-// You can supply extra pointer arguments to extract matched subpieces.
+// You can supply extra pointer arguments to extract matched substrings.
 //
 // Example: extracts "ruby" into "s" and 1234 into "i"
 //    int i;
author	DongHun Kwak <dh0128.kwak@samsung.com>	2018-12-05 10:36:21 +0900
committer	DongHun Kwak <dh0128.kwak@samsung.com>	2018-12-05 10:36:24 +0900
commit	7615b8c36db26d7f3731eb4bdac2e85ba4984413 (patch)
tree	2d83f6b8f90ded941a6b5c8cf554945bd90f567e
parent	41cf41822489f66bc95b5e0572f5eee10a524274 (diff)
download	re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.tar.gz re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.tar.bz2 re2-7615b8c36db26d7f3731eb4bdac2e85ba4984413.zip