summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.bazelrc23
-rwxr-xr-x.github/bazel.sh4
-rwxr-xr-x.github/cmake.sh4
-rw-r--r--.github/workflows/ci-bazel.yml4
-rw-r--r--.github/workflows/ci-cmake.yml55
-rw-r--r--.github/workflows/ci.yml34
-rw-r--r--.github/workflows/pr.yml4
-rw-r--r--.github/workflows/python.yml224
-rw-r--r--BUILD278
-rw-r--r--BUILD.bazel394
-rw-r--r--CMakeLists.txt158
-rw-r--r--MODULE.bazel27
-rw-r--r--Makefile112
-rw-r--r--README5
-rw-r--r--WORKSPACE.bazel (renamed from WORKSPACE)2
-rw-r--r--WORKSPACE.bzlmod7
-rw-r--r--app/BUILD.bazel24
-rw-r--r--app/_re2.cc94
-rw-r--r--app/_re2.d.ts23
-rw-r--r--app/app.ts111
-rwxr-xr-xapp/build.sh44
-rw-r--r--app/index.html5
-rw-r--r--app/package.json14
-rw-r--r--app/rollup.config.js28
-rw-r--r--app/tsconfig.json17
-rw-r--r--doc/README.xkcd1
-rw-r--r--doc/syntax.html4
-rw-r--r--doc/syntax.txt4
-rw-r--r--doc/xkcd.pngbin26496 -> 0 bytes
-rw-r--r--libre2.symbols3
-rw-r--r--libre2.symbols.darwin3
-rw-r--r--python/BUILD.bazel36
l---------python/LICENSE1
-rw-r--r--python/README1
-rw-r--r--python/_re2.cc338
-rw-r--r--python/re2.py582
-rw-r--r--python/re2_test.py482
-rw-r--r--python/setup.py117
-rw-r--r--re2.pc.in (renamed from re2.pc)7
-rw-r--r--re2/bitmap256.cc44
-rw-r--r--re2/bitmap256.h31
-rw-r--r--re2/bitstate.cc46
-rw-r--r--re2/compile.cc15
-rw-r--r--re2/dfa.cc228
-rw-r--r--re2/filtered_re2.cc19
-rw-r--r--re2/filtered_re2.h9
-rw-r--r--re2/fuzzing/compiler-rt/LICENSE219
-rw-r--r--re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h397
-rw-r--r--re2/fuzzing/re2_fuzzer.cc51
-rw-r--r--re2/mimics_pcre.cc1
-rw-r--r--re2/nfa.cc71
-rw-r--r--re2/onepass.cc52
-rw-r--r--re2/parse.cc274
-rw-r--r--re2/prefilter.cc56
-rw-r--r--re2/prefilter.h63
-rw-r--r--re2/prefilter_tree.cc201
-rw-r--r--re2/prefilter_tree.h43
-rw-r--r--re2/prog.cc43
-rw-r--r--re2/prog.h44
-rw-r--r--re2/re2.cc248
-rw-r--r--re2/re2.h221
-rw-r--r--re2/regexp.cc60
-rw-r--r--re2/regexp.h19
-rw-r--r--re2/set.cc12
-rw-r--r--re2/set.h7
-rw-r--r--re2/simplify.cc28
-rw-r--r--re2/stringpiece.cc65
-rw-r--r--re2/stringpiece.h202
-rw-r--r--re2/testing/backtrack.cc47
-rw-r--r--re2/testing/charclass_test.cc44
-rw-r--r--re2/testing/compile_test.cc9
-rw-r--r--re2/testing/dfa_test.cc66
-rw-r--r--re2/testing/dump.cc16
-rw-r--r--re2/testing/exhaustive1_test.cc2
-rw-r--r--re2/testing/exhaustive2_test.cc2
-rw-r--r--re2/testing/exhaustive3_test.cc2
-rw-r--r--re2/testing/exhaustive_test.cc2
-rw-r--r--re2/testing/exhaustive_tester.cc68
-rw-r--r--re2/testing/exhaustive_tester.h1
-rw-r--r--re2/testing/filtered_re2_test.cc20
-rw-r--r--re2/testing/mimics_pcre_test.cc5
-rw-r--r--re2/testing/null_walker.cc2
-rw-r--r--re2/testing/parse_test.cc43
-rw-r--r--re2/testing/possible_match_test.cc29
-rw-r--r--re2/testing/random_test.cc25
-rw-r--r--re2/testing/re2_arg_test.cc27
-rw-r--r--re2/testing/re2_test.cc96
-rw-r--r--re2/testing/regexp_benchmark.cc289
-rw-r--r--re2/testing/regexp_generator.cc32
-rw-r--r--re2/testing/regexp_generator.h7
-rw-r--r--re2/testing/regexp_test.cc2
-rw-r--r--re2/testing/required_prefix_test.cc9
-rw-r--r--re2/testing/search_test.cc5
-rw-r--r--re2/testing/set_test.cc2
-rw-r--r--re2/testing/simplify_test.cc21
-rw-r--r--re2/testing/string_generator.cc6
-rw-r--r--re2/testing/string_generator.h11
-rw-r--r--re2/testing/string_generator_test.cc4
-rw-r--r--re2/testing/tester.cc155
-rw-r--r--re2/testing/tester.h28
-rw-r--r--re2/tostring.cc15
-rw-r--r--re2/unicode.py4
-rw-r--r--re2/unicode_casefold.cc16
-rw-r--r--re2/unicode_casefold.h1
-rw-r--r--re2/unicode_groups.cc238
-rw-r--r--re2/unicode_groups.h1
-rw-r--r--re2/walker-inl.h3
-rw-r--r--re2Config.cmake.in6
-rw-r--r--util/benchmark.cc131
-rw-r--r--util/benchmark.h156
-rw-r--r--util/flags.h26
-rw-r--r--util/fuzz.cc21
-rw-r--r--util/logging.h4
-rw-r--r--util/mix.h41
-rw-r--r--util/mutex.h148
-rw-r--r--util/pcre.cc165
-rw-r--r--util/pcre.h118
-rw-r--r--util/rune.cc8
-rw-r--r--util/strutil.cc123
-rw-r--r--util/strutil.h5
-rw-r--r--util/test.cc34
-rw-r--r--util/test.h50
-rw-r--r--util/util.h42
123 files changed, 4797 insertions, 3679 deletions
diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 0000000..540fb57
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1,23 @@
+# Copyright 2022 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Enable Bzlmod. This will be the default eventually...
+build --enable_bzlmod
+# Enable layering check features. Useful on Clang only.
+build --features=layering_check
+# Enable parse headers features. Enforcing that headers are self-contained.
+build --features=parse_headers
+
+# Abseil requires C++14 at minimum.
+# Previously, the flag was set via `BAZEL_CXXOPTS`. On macOS, we also had to set
+# `BAZEL_USE_CPP_ONLY_TOOLCHAIN` since Bazel wouldn't respect the former without
+# the latter. However, the latter stopped Bazel from using Xcode and `-framework
+# Foundation`, which CCTZ (vendored into Abseil) requires.
+build --enable_platform_specific_config
+build:linux --cxxopt=-std=c++14
+build:macos --cxxopt=-std=c++14
+build:windows --cxxopt=/std:c++14
+
+# Print test logs for failed tests.
+test --test_output=errors
diff --git a/.github/bazel.sh b/.github/bazel.sh
index fbe92e6..7295ec6 100755
--- a/.github/bazel.sh
+++ b/.github/bazel.sh
@@ -3,7 +3,7 @@ set -eux
bazel clean
bazel build --compilation_mode=dbg -- //:all
-bazel test --compilation_mode=dbg --test_output=errors -- //:all \
+bazel test --compilation_mode=dbg -- //:all \
-//:dfa_test \
-//:exhaustive1_test \
-//:exhaustive2_test \
@@ -13,7 +13,7 @@ bazel test --compilation_mode=dbg --test_output=errors -- //:all \
bazel clean
bazel build --compilation_mode=opt -- //:all
-bazel test --compilation_mode=opt --test_output=errors -- //:all \
+bazel test --compilation_mode=opt -- //:all \
-//:dfa_test \
-//:exhaustive1_test \
-//:exhaustive2_test \
diff --git a/.github/cmake.sh b/.github/cmake.sh
index 145a843..782334e 100755
--- a/.github/cmake.sh
+++ b/.github/cmake.sh
@@ -1,11 +1,11 @@
#!/bin/bash
set -eux
-cmake . -D CMAKE_BUILD_TYPE=Debug
+cmake . -D CMAKE_BUILD_TYPE=Debug -D RE2_BUILD_TESTING=ON "$@"
cmake --build . --config Debug --clean-first
ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random'
-cmake . -D CMAKE_BUILD_TYPE=Release
+cmake . -D CMAKE_BUILD_TYPE=Release -D RE2_BUILD_TESTING=ON "$@"
cmake --build . --config Release --clean-first
ctest -C Release --output-on-failure -E 'dfa|exhaustive|random'
diff --git a/.github/workflows/ci-bazel.yml b/.github/workflows/ci-bazel.yml
index 681034d..013b52c 100644
--- a/.github/workflows/ci-bazel.yml
+++ b/.github/workflows/ci-bazel.yml
@@ -12,6 +12,8 @@ jobs:
env:
BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
+ # TODO(junyer): Use `v2` whenever a new release is tagged.
+ - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51
- run: .github/bazel.sh
shell: bash
diff --git a/.github/workflows/ci-cmake.yml b/.github/workflows/ci-cmake.yml
index 585c386..d2d03af 100644
--- a/.github/workflows/ci-cmake.yml
+++ b/.github/workflows/ci-cmake.yml
@@ -3,13 +3,58 @@ on:
push:
branches: [main]
jobs:
- build:
- runs-on: ${{ matrix.os }}
+ build-linux:
+ runs-on: ubuntu-latest
+ # The Benchmark package on Ubuntu 22.04 LTS is problematic whereas this
+ # Docker container is based on Debian bookworm and has a newer version.
+ container: gcc:13
strategy:
fail-fast: false
matrix:
- os: [macos-latest, ubuntu-latest, windows-latest]
+ build_shared_libs: [OFF, ON]
steps:
- - uses: actions/checkout@v2
- - run: .github/cmake.sh
+ - uses: actions/checkout@v3
+ - name: Install CMake
+ run: |
+ apt update -y
+ apt install -y cmake
+ shell: bash
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ apt update -y
+ apt install -y libabsl-dev libgtest-dev libbenchmark-dev
+ shell: bash
+ - run: .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }}
+ shell: bash
+ build-macos:
+ runs-on: macos-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ build_shared_libs: [OFF, ON]
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ brew update
+ brew install abseil googletest google-benchmark
+ shell: bash
+ - run: .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }}
+ shell: bash
+ build-windows:
+ runs-on: windows-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ build_shared_libs: [OFF, ON]
+ steps:
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ vcpkg update
+ vcpkg install abseil gtest benchmark
+ shell: bash
+ - run: |
+ .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }} \
+ -D CMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake
shell: bash
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 44a773b..44ac9dc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -8,7 +8,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- ver: [11, 14, 17, 20]
+ ver: [17, 20]
env:
CC: clang
CXX: clang++
@@ -19,7 +19,12 @@ jobs:
# (The other two flags are the default provided for CXXFLAGS in Makefile.)
CXXFLAGS: -O3 -g -std=c++${{ matrix.ver }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ brew update
+ brew install abseil googletest google-benchmark
+ shell: bash
- run: make && make test
shell: bash
build-clang:
@@ -27,31 +32,42 @@ jobs:
strategy:
fail-fast: false
matrix:
- ver: [9, 10, 11, 12, 13]
+ ver: [15, 16, 17]
env:
CC: clang-${{ matrix.ver }}
CXX: clang++-${{ matrix.ver }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Install Clang ${{ matrix.ver }}
run: |
+ # Avoid `Conflicts: python3-lldb-x.y` between packages.
+ sudo apt purge -y python3-lldb-14
wget https://apt.llvm.org/llvm.sh
chmod +x ./llvm.sh
sudo ./llvm.sh ${{ matrix.ver }}
shell: bash
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ sudo apt update -y
+ sudo apt install -y libabsl-dev libgtest-dev libbenchmark-dev
+ shell: bash
- run: make && make test
shell: bash
build-gcc:
runs-on: ubuntu-latest
- container: gcc:${{ matrix.ver }}
strategy:
fail-fast: false
matrix:
- ver: [6, 7, 8, 9, 10, 11]
+ ver: [11, 12, 13]
env:
- CC: gcc
- CXX: g++
+ CC: gcc-${{ matrix.ver }}
+ CXX: g++-${{ matrix.ver }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
+ - name: Install Abseil, GoogleTest and Benchmark
+ run: |
+ sudo apt update -y
+ sudo apt install -y libabsl-dev libgtest-dev libbenchmark-dev
+ shell: bash
- run: make && make test
shell: bash
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 8ede73b..860da62 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -7,8 +7,8 @@ jobs:
close:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
- - uses: actions/github-script@v5
+ - uses: actions/checkout@v3
+ - uses: actions/github-script@v6
with:
script: |
const fs = require('fs');
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
new file mode 100644
index 0000000..d505eaa
--- /dev/null
+++ b/.github/workflows/python.yml
@@ -0,0 +1,224 @@
+name: Python
+on:
+ workflow_dispatch:
+ inputs:
+ build:
+ required: true
+ type: number
+jobs:
+ wheel-linux:
+ name: Linux ${{ matrix.os }}, ${{ matrix.arch.name }}, Python ${{ matrix.ver }}
+ runs-on: ${{ matrix.arch.runs-on }}
+ container:
+ image: quay.io/pypa/${{ matrix.os }}_${{ matrix.arch.python-name }}
+ options: --init
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: X64, python-name: x86_64, runs-on: [ubuntu-latest] }
+ - { name: ARM64, python-name: aarch64, runs-on: [self-hosted, linux, arm64] }
+ os: [manylinux2014, manylinux_2_28]
+ ver: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ # TODO(junyer): Use `v2` whenever a new release is tagged.
+ - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51
+ - name: Prepare Python ${{ matrix.ver }} environment
+ run: |
+ ln -sf /usr/local/bin/python${{ matrix.ver }} /usr/local/bin/python
+ ln -sf /usr/local/bin/python${{ matrix.ver }} /usr/local/bin/python3
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel auditwheel
+ python -m pip install --upgrade absl-py
+ shell: bash
+ - name: Build wheel
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ # TODO(junyer): Get rid of this hack whenever @rules_python no longer
+ # fails due to Bazel running as root. (It sounds more likely than the
+ # Docker container changing to be built with the `USER` instruction.)
+ useradd "${GITHUB_ACTOR}"
+ chown -R "${GITHUB_ACTOR}" ..
+ su -c 'python -m build --wheel' "${GITHUB_ACTOR}"
+ chown -R "${USER}" ..
+ python -m auditwheel repair --wheel-dir=. dist/*
+ shell: bash
+ working-directory: python
+ - name: Test wheel
+ run: |
+ python -m pip install google_re2-*.whl
+ python re2_test.py
+ shell: bash
+ working-directory: python
+ - uses: actions/upload-artifact@v3
+ with:
+ name: ${{ hashFiles('python/google_re2-*.whl') }}
+ path: python/google_re2-*.whl
+ retention-days: 1
+ wheel-macos:
+ name: macOS ${{ matrix.os }}, ${{ matrix.arch.name }}, Python ${{ matrix.ver }}
+ runs-on: macos-${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: X64, bazel-name: x86_64, python-name: x86_64 }
+ - { name: ARM64, bazel-name: arm64, python-name: arm64 }
+ os: [11, 12, 13]
+ ver: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ BAZEL_CPU: darwin_${{ matrix.arch.bazel-name }}
+ PLAT_NAME: macosx-${{ matrix.os }}.0-${{ matrix.arch.python-name }}
+ # Stop macOS from reporting the system version as 10.x.
+ # Otherwise, Python refuses to install the built wheel!
+ SYSTEM_VERSION_COMPAT: 0
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ # TODO(junyer): Use `v2` whenever a new release is tagged.
+ - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.ver }}
+ - name: Prepare Python ${{ matrix.ver }} environment
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel delocate
+ python -m pip install --upgrade absl-py
+ shell: bash
+ - name: Build wheel
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ python -m build --wheel
+ python -m delocate.cmd.delocate_wheel --wheel-dir=. dist/*
+ shell: bash
+ working-directory: python
+ - if: matrix.arch.name == runner.arch
+ name: Test wheel
+ run: |
+ python -m pip install google_re2-*.whl
+ python re2_test.py
+ shell: bash
+ working-directory: python
+ - uses: actions/upload-artifact@v3
+ with:
+ name: ${{ hashFiles('python/google_re2-*.whl') }}
+ path: python/google_re2-*.whl
+ retention-days: 1
+ wheel-windows:
+ name: Windows, ${{ matrix.arch.name }}, Python ${{ matrix.ver }}
+ runs-on: windows-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: X86, bazel-name: x64_x86, python-name: win32 }
+ - { name: X64, bazel-name: x64, python-name: win_amd64 }
+ ver: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ env:
+ BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ BAZEL_CPU: ${{ matrix.arch.bazel-name }}_windows
+ PLAT_NAME: ${{ matrix.arch.python-name }}
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ # Avoid the Chocolatey install of Bazel getting in the way;
+ # `bazelbuild/setup-bazelisk` doesn't work for some reason.
+ - run: |
+ choco uninstall -y bazel
+ choco install -y bazelisk
+ shell: bash
+ # Lowercase the architecture name for `actions/setup-python`.
+ - run: |
+ ARCHITECTURE=${{ matrix.arch.name }}
+ echo "architecture=${ARCHITECTURE,,}" >> "${GITHUB_ENV}"
+ shell: bash
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.ver }}
+ architecture: ${{ env.architecture }}
+ - name: Prepare Python ${{ matrix.ver }} environment
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel delvewheel
+ python -m pip install --upgrade absl-py
+ shell: bash
+ - name: Build wheel
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ python -m build --wheel
+ python -m delvewheel repair --wheel-dir=. dist/*
+ shell: bash
+ working-directory: python
+ - name: Test wheel
+ run: |
+ python -m pip install google_re2-*.whl
+ python re2_test.py
+ shell: bash
+ working-directory: python
+ - uses: actions/upload-artifact@v3
+ with:
+ name: ${{ hashFiles('python/google_re2-*.whl') }}
+ path: python/google_re2-*.whl
+ retention-days: 1
+ publish:
+ needs:
+ - wheel-linux
+ - wheel-macos
+ - wheel-windows
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ # Stash the timestamp for the commit SHA that triggered the workflow.
+ - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}"
+ shell: bash
+ - uses: actions/setup-python@v4
+ with:
+ python-version: '3.x'
+ - name: Prepare Python 3.x environment
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install --upgrade build wheel
+ shell: bash
+ - if: inputs.build == 1
+ name: Build source
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ python -m build --sdist
+ shell: bash
+ working-directory: python
+ - uses: actions/download-artifact@v3
+ with:
+ path: python
+ - name: Set build number to ${{ inputs.build }}
+ env:
+ SOURCE_DATE_EPOCH: ${{ env.timestamp }}
+ run: |
+ mkdir -p dist
+ for WHL in */google_re2-*.whl; do
+ python -m wheel unpack "${WHL}"
+ python -m wheel pack --dest-dir=dist --build-number=${{ inputs.build }} google_re2-*
+ rm -rf google_re2-*
+ done
+ shell: bash
+ working-directory: python
+ - if: inputs.build >= 1
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ password: ${{ secrets.PYPI_API_TOKEN }}
+ packages_dir: python/dist
diff --git a/BUILD b/BUILD
deleted file mode 100644
index 00330b6..0000000
--- a/BUILD
+++ /dev/null
@@ -1,278 +0,0 @@
-# Copyright 2009 The RE2 Authors. All Rights Reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-# Bazel (http://bazel.io/) BUILD file for RE2.
-
-licenses(["notice"])
-
-exports_files(["LICENSE"])
-
-config_setting(
- name = "macos",
- values = {"cpu": "darwin"},
-)
-
-config_setting(
- name = "wasm",
- values = {"cpu": "wasm32"},
-)
-
-config_setting(
- name = "windows",
- values = {"cpu": "x64_windows"},
-)
-
-cc_library(
- name = "re2",
- srcs = [
- "re2/bitmap256.h",
- "re2/bitstate.cc",
- "re2/compile.cc",
- "re2/dfa.cc",
- "re2/filtered_re2.cc",
- "re2/mimics_pcre.cc",
- "re2/nfa.cc",
- "re2/onepass.cc",
- "re2/parse.cc",
- "re2/perl_groups.cc",
- "re2/pod_array.h",
- "re2/prefilter.cc",
- "re2/prefilter.h",
- "re2/prefilter_tree.cc",
- "re2/prefilter_tree.h",
- "re2/prog.cc",
- "re2/prog.h",
- "re2/re2.cc",
- "re2/regexp.cc",
- "re2/regexp.h",
- "re2/set.cc",
- "re2/simplify.cc",
- "re2/sparse_array.h",
- "re2/sparse_set.h",
- "re2/stringpiece.cc",
- "re2/tostring.cc",
- "re2/unicode_casefold.cc",
- "re2/unicode_casefold.h",
- "re2/unicode_groups.cc",
- "re2/unicode_groups.h",
- "re2/walker-inl.h",
- "util/logging.h",
- "util/mix.h",
- "util/mutex.h",
- "util/rune.cc",
- "util/strutil.cc",
- "util/strutil.h",
- "util/utf.h",
- "util/util.h",
- ],
- hdrs = [
- "re2/filtered_re2.h",
- "re2/re2.h",
- "re2/set.h",
- "re2/stringpiece.h",
- ],
- copts = select({
- ":wasm": [],
- ":windows": [],
- "//conditions:default": ["-pthread"],
- }),
- linkopts = select({
- # macOS doesn't need `-pthread' when linking and it appears that
- # older versions of Clang will warn about the unused command line
- # argument, so just don't pass it.
- ":macos": [],
- ":wasm": [],
- ":windows": [],
- "//conditions:default": ["-pthread"],
- }),
- visibility = ["//visibility:public"],
-)
-
-cc_library(
- name = "testing",
- testonly = 1,
- srcs = [
- "re2/testing/backtrack.cc",
- "re2/testing/dump.cc",
- "re2/testing/exhaustive_tester.cc",
- "re2/testing/null_walker.cc",
- "re2/testing/regexp_generator.cc",
- "re2/testing/string_generator.cc",
- "re2/testing/tester.cc",
- "util/pcre.cc",
- ],
- hdrs = [
- "re2/testing/exhaustive_tester.h",
- "re2/testing/regexp_generator.h",
- "re2/testing/string_generator.h",
- "re2/testing/tester.h",
- "util/benchmark.h",
- "util/flags.h",
- "util/malloc_counter.h",
- "util/pcre.h",
- "util/test.h",
- ],
- deps = [":re2"],
-)
-
-cc_library(
- name = "test",
- testonly = 1,
- srcs = ["util/test.cc"],
- deps = [":testing"],
-)
-
-cc_test(
- name = "charclass_test",
- size = "small",
- srcs = ["re2/testing/charclass_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "compile_test",
- size = "small",
- srcs = ["re2/testing/compile_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "filtered_re2_test",
- size = "small",
- srcs = ["re2/testing/filtered_re2_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "mimics_pcre_test",
- size = "small",
- srcs = ["re2/testing/mimics_pcre_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "parse_test",
- size = "small",
- srcs = ["re2/testing/parse_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "possible_match_test",
- size = "small",
- srcs = ["re2/testing/possible_match_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "re2_arg_test",
- size = "small",
- srcs = ["re2/testing/re2_arg_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "re2_test",
- size = "small",
- srcs = ["re2/testing/re2_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "regexp_test",
- size = "small",
- srcs = ["re2/testing/regexp_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "required_prefix_test",
- size = "small",
- srcs = ["re2/testing/required_prefix_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "search_test",
- size = "small",
- srcs = ["re2/testing/search_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "set_test",
- size = "small",
- srcs = ["re2/testing/set_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "simplify_test",
- size = "small",
- srcs = ["re2/testing/simplify_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "string_generator_test",
- size = "small",
- srcs = ["re2/testing/string_generator_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "dfa_test",
- size = "large",
- srcs = ["re2/testing/dfa_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "exhaustive1_test",
- size = "large",
- srcs = ["re2/testing/exhaustive1_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "exhaustive2_test",
- size = "large",
- srcs = ["re2/testing/exhaustive2_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "exhaustive3_test",
- size = "large",
- srcs = ["re2/testing/exhaustive3_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "exhaustive_test",
- size = "large",
- srcs = ["re2/testing/exhaustive_test.cc"],
- deps = [":test"],
-)
-
-cc_test(
- name = "random_test",
- size = "large",
- srcs = ["re2/testing/random_test.cc"],
- deps = [":test"],
-)
-
-cc_library(
- name = "benchmark",
- testonly = 1,
- srcs = ["util/benchmark.cc"],
- deps = [":testing"],
-)
-
-cc_binary(
- name = "regexp_benchmark",
- testonly = 1,
- srcs = ["re2/testing/regexp_benchmark.cc"],
- deps = [":benchmark"],
-)
diff --git a/BUILD.bazel b/BUILD.bazel
new file mode 100644
index 0000000..6122a3f
--- /dev/null
+++ b/BUILD.bazel
@@ -0,0 +1,394 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) BUILD file for RE2.
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+ name = "re2",
+ srcs = [
+ "re2/bitmap256.cc",
+ "re2/bitmap256.h",
+ "re2/bitstate.cc",
+ "re2/compile.cc",
+ "re2/dfa.cc",
+ "re2/filtered_re2.cc",
+ "re2/mimics_pcre.cc",
+ "re2/nfa.cc",
+ "re2/onepass.cc",
+ "re2/parse.cc",
+ "re2/perl_groups.cc",
+ "re2/pod_array.h",
+ "re2/prefilter.cc",
+ "re2/prefilter.h",
+ "re2/prefilter_tree.cc",
+ "re2/prefilter_tree.h",
+ "re2/prog.cc",
+ "re2/prog.h",
+ "re2/re2.cc",
+ "re2/regexp.cc",
+ "re2/regexp.h",
+ "re2/set.cc",
+ "re2/simplify.cc",
+ "re2/sparse_array.h",
+ "re2/sparse_set.h",
+ "re2/tostring.cc",
+ "re2/unicode_casefold.cc",
+ "re2/unicode_casefold.h",
+ "re2/unicode_groups.cc",
+ "re2/unicode_groups.h",
+ "re2/walker-inl.h",
+ "util/logging.h",
+ "util/rune.cc",
+ "util/strutil.cc",
+ "util/strutil.h",
+ "util/utf.h",
+ ],
+ hdrs = [
+ "re2/filtered_re2.h",
+ "re2/re2.h",
+ "re2/set.h",
+ "re2/stringpiece.h",
+ ],
+ copts = select({
+ "@platforms//os:wasi": [],
+ "@platforms//os:windows": [],
+ "//conditions:default": ["-pthread"],
+ }),
+ linkopts = select({
+ # macOS doesn't need `-pthread' when linking and it appears that
+ # older versions of Clang will warn about the unused command line
+ # argument, so just don't pass it.
+ "@platforms//os:macos": [],
+ "@platforms//os:wasi": [],
+ "@platforms//os:windows": [],
+ "//conditions:default": ["-pthread"],
+ }),
+ visibility = ["//visibility:public"],
+ deps = [
+ "@com_google_absl//absl/base",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/container:fixed_array",
+ "@com_google_absl//absl/container:flat_hash_map",
+ "@com_google_absl//absl/container:flat_hash_set",
+ "@com_google_absl//absl/container:inlined_vector",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/types:optional",
+ "@com_google_absl//absl/types:span",
+ ],
+)
+
+cc_library(
+ name = "testing",
+ testonly = 1,
+ srcs = [
+ "re2/testing/backtrack.cc",
+ "re2/testing/dump.cc",
+ "re2/testing/exhaustive_tester.cc",
+ "re2/testing/null_walker.cc",
+ "re2/testing/regexp_generator.cc",
+ "re2/testing/string_generator.cc",
+ "re2/testing/tester.cc",
+ "util/pcre.cc",
+ ],
+ hdrs = [
+ "re2/testing/exhaustive_tester.h",
+ "re2/testing/regexp_generator.h",
+ "re2/testing/string_generator.h",
+ "re2/testing/tester.h",
+ "util/malloc_counter.h",
+ "util/pcre.h",
+
+ # Exposed for testing only.
+ "re2/bitmap256.h",
+ "re2/pod_array.h",
+ "re2/prefilter.h",
+ "re2/prefilter_tree.h",
+ "re2/prog.h",
+ "re2/regexp.h",
+ "re2/sparse_array.h",
+ "re2/sparse_set.h",
+ "re2/unicode_casefold.h",
+ "re2/unicode_groups.h",
+ "re2/walker-inl.h",
+ "util/logging.h",
+ "util/strutil.h",
+ "util/utf.h",
+ ],
+ visibility = [":__subpackages__"],
+ deps = [
+ ":re2",
+ "@com_google_absl//absl/base",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ ],
+)
+
+cc_test(
+ name = "charclass_test",
+ size = "small",
+ srcs = ["re2/testing/charclass_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "compile_test",
+ size = "small",
+ srcs = ["re2/testing/compile_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "filtered_re2_test",
+ size = "small",
+ srcs = ["re2/testing/filtered_re2_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "mimics_pcre_test",
+ size = "small",
+ srcs = ["re2/testing/mimics_pcre_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "parse_test",
+ size = "small",
+ srcs = ["re2/testing/parse_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "possible_match_test",
+ size = "small",
+ srcs = ["re2/testing/possible_match_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "re2_arg_test",
+ size = "small",
+ srcs = ["re2/testing/re2_arg_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "re2_test",
+ size = "small",
+ srcs = ["re2/testing/re2_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "regexp_test",
+ size = "small",
+ srcs = ["re2/testing/regexp_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "required_prefix_test",
+ size = "small",
+ srcs = ["re2/testing/required_prefix_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "search_test",
+ size = "small",
+ srcs = ["re2/testing/search_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "set_test",
+ size = "small",
+ srcs = ["re2/testing/set_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "simplify_test",
+ size = "small",
+ srcs = ["re2/testing/simplify_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "string_generator_test",
+ size = "small",
+ srcs = ["re2/testing/string_generator_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "dfa_test",
+ size = "large",
+ srcs = ["re2/testing/dfa_test.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive1_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive1_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive2_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive2_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive3_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive3_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "exhaustive_test",
+ size = "large",
+ srcs = ["re2/testing/exhaustive_test.cc"],
+ deps = [
+ ":testing",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "random_test",
+ size = "large",
+ srcs = ["re2/testing/random_test.cc"],
+ deps = [
+ ":testing",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings:str_format",
+ "@googletest//:gtest",
+ "@googletest//:gtest_main",
+ ],
+)
+
+cc_binary(
+ name = "regexp_benchmark",
+ testonly = 1,
+ srcs = ["re2/testing/regexp_benchmark.cc"],
+ deps = [
+ ":re2",
+ ":testing",
+ "@com_google_absl//absl/container:flat_hash_map",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/synchronization",
+ "@google_benchmark//:benchmark_main",
+ ],
+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62d4995..bdac5af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,38 +2,43 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
-# Old enough to support Ubuntu Xenial.
-cmake_minimum_required(VERSION 3.5.1)
+# https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md
+cmake_minimum_required(VERSION 3.13)
project(RE2 CXX)
include(CMakePackageConfigHelpers)
include(CTest)
include(GNUInstallDirs)
-if(NOT CMAKE_CXX_STANDARD)
- set(CMAKE_CXX_STANDARD 11)
- set(CMAKE_CXX_STANDARD_REQUIRED ON)
-endif()
-
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
-option(USEPCRE "use PCRE in tests and benchmarks" OFF)
+option(RE2_USE_ICU "build against ICU for full Unicode properties support" OFF)
+
+# For historical reasons, this is just "USEPCRE", not "RE2_USE_PCRE".
+option(USEPCRE "build against PCRE for testing and benchmarking" OFF)
+
+# See https://groups.google.com/g/re2-dev/c/P6_NM0YIWvA for details.
+# This has no effect unless RE2 is being built for an Apple platform
+# such as macOS or iOS.
+option(RE2_BUILD_FRAMEWORK "build RE2 as a framework" OFF)
# CMake seems to have no way to enable/disable testing per subproject,
# so we provide an option similar to BUILD_TESTING, but just for RE2.
-option(RE2_BUILD_TESTING "enable testing for RE2" ON)
+option(RE2_BUILD_TESTING "enable testing for RE2" OFF)
+
+# The pkg-config Requires: field.
+set(REQUIRES)
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
-set(SONAME 9)
+set(SONAME 11)
set(EXTRA_TARGET_LINK_LIBRARIES)
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
- if(MSVC_VERSION LESS 1900)
- message(FATAL_ERROR "you need Visual Studio 2015 or later")
+ if(MSVC_VERSION LESS 1920)
+ message(FATAL_ERROR "you need Visual Studio 2019 or later")
endif()
if(BUILD_SHARED_LIBS)
- # See http://www.kitware.com/blog/home/post/939 for details.
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
# CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX,
@@ -47,17 +52,53 @@ endif()
if(WIN32)
add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX)
add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS)
-elseif(UNIX)
+endif()
+
+if(UNIX)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
endif()
+set(ABSL_DEPS
+ absl_base
+ absl_core_headers
+ absl_fixed_array
+ absl_flags
+ absl_flat_hash_map
+ absl_flat_hash_set
+ absl_inlined_vector
+ absl_optional
+ absl_span
+ absl_str_format
+ absl_strings
+ absl_synchronization
+ )
+
+# If a top-level project has called add_subdirectory(abseil-cpp) already (possibly
+# indirectly), let that take precedence over any copy of Abseil that might have
+# been installed on the system. And likewise for ICU, GoogleTest and Benchmark.
+if(NOT TARGET absl::base)
+ find_package(absl REQUIRED)
+endif()
+list(APPEND REQUIRES ${ABSL_DEPS})
+
+if(RE2_USE_ICU)
+ if(NOT TARGET ICU::uc)
+ find_package(ICU REQUIRED COMPONENTS uc)
+ endif()
+ add_definitions(-DRE2_USE_ICU)
+ list(APPEND REQUIRES icu-uc)
+endif()
+
if(USEPCRE)
add_definitions(-DUSEPCRE)
list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre)
endif()
+list(JOIN REQUIRES " " REQUIRES)
+
set(RE2_SOURCES
+ re2/bitmap256.cc
re2/bitstate.cc
re2/compile.cc
re2/dfa.cc
@@ -74,7 +115,6 @@ set(RE2_SOURCES
re2/regexp.cc
re2/set.cc
re2/simplify.cc
- re2/stringpiece.cc
re2/tostring.cc
re2/unicode_casefold.cc
re2/unicode_groups.cc
@@ -82,16 +122,50 @@ set(RE2_SOURCES
util/strutil.cc
)
+set(RE2_HEADERS
+ re2/filtered_re2.h
+ re2/re2.h
+ re2/set.h
+ re2/stringpiece.h
+ )
+
add_library(re2 ${RE2_SOURCES})
+target_compile_features(re2 PUBLIC cxx_std_14)
target_include_directories(re2 PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+# CMake gives "set_target_properties called with incorrect number of arguments."
+# errors if we don't quote ${RE2_HEADERS}, so quote it despite prevailing style.
+set_target_properties(re2 PROPERTIES PUBLIC_HEADER "${RE2_HEADERS}")
set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0)
add_library(re2::re2 ALIAS re2)
+if(APPLE AND RE2_BUILD_FRAMEWORK)
+ set_target_properties(re2 PROPERTIES
+ FRAMEWORK TRUE
+ FRAMEWORK_VERSION A
+ MACOSX_FRAMEWORK_IDENTIFIER com.googlesource.code.re2)
+endif()
+
if(UNIX)
target_link_libraries(re2 PUBLIC Threads::Threads)
endif()
+foreach(dep ${ABSL_DEPS})
+ string(REGEX REPLACE "^absl_" "absl::" dep ${dep})
+ target_link_libraries(re2 PUBLIC ${dep})
+endforeach()
+
+if(RE2_USE_ICU)
+ target_link_libraries(re2 PUBLIC ICU::uc)
+endif()
+
if(RE2_BUILD_TESTING)
+ if(NOT TARGET GTest::gtest)
+ find_package(GTest REQUIRED)
+ endif()
+ if(NOT TARGET benchmark::benchmark)
+ find_package(benchmark REQUIRED)
+ endif()
+
set(TESTING_SOURCES
re2/testing/backtrack.cc
re2/testing/dump.cc
@@ -103,8 +177,12 @@ if(RE2_BUILD_TESTING)
util/pcre.cc
)
- add_library(testing STATIC ${TESTING_SOURCES})
- target_link_libraries(testing PUBLIC re2)
+ add_library(testing ${TESTING_SOURCES})
+ if(BUILD_SHARED_LIBS AND WIN32)
+ target_compile_definitions(testing PRIVATE -DRE2_BUILD_TESTING_DLL)
+ endif()
+ target_compile_features(testing PUBLIC cxx_std_14)
+ target_link_libraries(testing PUBLIC re2 GTest::gtest)
set(TEST_TARGETS
charclass_test
@@ -135,43 +213,51 @@ if(RE2_BUILD_TESTING)
)
foreach(target ${TEST_TARGETS})
- add_executable(${target} re2/testing/${target}.cc util/test.cc)
- target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES})
+ add_executable(${target} re2/testing/${target}.cc)
+ if(BUILD_SHARED_LIBS AND WIN32)
+ target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL)
+ endif()
+ target_compile_features(${target} PUBLIC cxx_std_14)
+ target_link_libraries(${target} PUBLIC testing GTest::gtest_main ${EXTRA_TARGET_LINK_LIBRARIES})
add_test(NAME ${target} COMMAND ${target})
endforeach()
foreach(target ${BENCHMARK_TARGETS})
- add_executable(${target} re2/testing/${target}.cc util/benchmark.cc)
- target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES})
+ add_executable(${target} re2/testing/${target}.cc)
+ if(BUILD_SHARED_LIBS AND WIN32)
+ target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL)
+ endif()
+ target_compile_features(${target} PUBLIC cxx_std_14)
+ target_link_libraries(${target} PUBLIC testing benchmark::benchmark_main ${EXTRA_TARGET_LINK_LIBRARIES})
endforeach()
endif()
-set(RE2_HEADERS
- re2/filtered_re2.h
- re2/re2.h
- re2/set.h
- re2/stringpiece.h
- )
-
-install(FILES ${RE2_HEADERS}
- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2)
-install(TARGETS re2 EXPORT re2Targets
+install(TARGETS re2
+ EXPORT re2Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ FRAMEWORK DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(EXPORT re2Targets
- DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 NAMESPACE re2::)
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2
+ NAMESPACE re2::)
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/re2Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake
- INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2
- )
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2)
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake
VERSION ${SONAME}.0.0
- COMPATIBILITY SameMajorVersion
- )
+ COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake
${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/re2.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/re2.pc
+ @ONLY)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2.pc
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 0000000..87a5576
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,27 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) MODULE file for RE2.
+
+module(
+ name = "re2",
+ version = "2023-11-01",
+ compatibility_level = 1,
+)
+
+bazel_dep(name = "platforms", version = "0.0.8")
+bazel_dep(name = "rules_cc", version = "0.0.9")
+bazel_dep(name = "abseil-cpp", version = "20230802.0", repo_name = "com_google_absl")
+bazel_dep(name = "rules_python", version = "0.26.0")
+bazel_dep(name = "pybind11_bazel", version = "2.11.1")
+
+python_configure = use_extension("@pybind11_bazel//:python_configure.bzl", "extension")
+python_configure.toolchain(python_version = "3") # ignored when non-root module
+use_repo(python_configure, "local_config_python", "pybind11")
+
+# These dependencies will be ignored when the `re2` module is not
+# the root module (or when `--ignore_dev_dependency` is enabled).
+bazel_dep(name = "google_benchmark", version = "1.8.3", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0.bcr.1", dev_dependency = True)
+bazel_dep(name = "abseil-py", version = "1.4.0", dev_dependency = True)
diff --git a/Makefile b/Makefile
index c7c8145..017ab55 100644
--- a/Makefile
+++ b/Makefile
@@ -2,12 +2,34 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
+# Build against Abseil.
+ABSL_DEPS=\
+ absl_base\
+ absl_core_headers\
+ absl_fixed_array\
+ absl_flags\
+ absl_flat_hash_map\
+ absl_flat_hash_set\
+ absl_inlined_vector\
+ absl_optional\
+ absl_span\
+ absl_str_format\
+ absl_strings\
+ absl_synchronization\
+
+PKG_CONFIG?=pkg-config
+CCABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --cflags)
+# GCC barfs on `-Wl` whereas Clang doesn't mind, but it's unclear what
+# causes it to manifest on Ubuntu 22.04 LTS, so filter it out for now.
+# The same filtering is needed for `static-testinstall` and `shared-testinstall`.
+LDABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --libs | sed -e 's/-Wl / /g')
+
# To build against ICU for full Unicode properties support,
# uncomment the next two lines:
-# CCICU=$(shell pkg-config icu-uc --cflags) -DRE2_USE_ICU
-# LDICU=$(shell pkg-config icu-uc --libs)
+# CCICU=$(shell $(PKG_CONFIG) icu-uc --cflags) -DRE2_USE_ICU
+# LDICU=$(shell $(PKG_CONFIG) icu-uc --libs)
-# To build against PCRE for testing or benchmarking,
+# To build against PCRE for testing and benchmarking,
# uncomment the next two lines:
# CCPCRE=-I/usr/local/include -DUSEPCRE
# LDPCRE=-L/usr/local/lib -lpcre
@@ -17,8 +39,8 @@ CXX?=g++
CXXFLAGS?=-O3 -g
LDFLAGS?=
# required
-RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE)
-RE2_LDFLAGS?=-pthread $(LDICU) $(LDPCRE)
+RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCABSL) $(CCICU) $(CCPCRE)
+RE2_LDFLAGS?=-pthread $(LDABSL) $(LDICU) $(LDPCRE)
AR?=ar
ARFLAGS?=rsc
NM?=nm
@@ -42,9 +64,15 @@ else
SED_INPLACE=sed -i
endif
+# The pkg-config Requires: field.
+REQUIRES=$(ABSL_DEPS)
+ifdef LDICU
+REQUIRES+=icu-uc
+endif
+
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
-SONAME=9
+SONAME=11
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
# access for Unicode data), uncomment the following line:
@@ -55,17 +83,17 @@ ifeq ($(shell uname),Darwin)
SOEXT=dylib
SOEXTVER=$(SONAME).$(SOEXT)
SOEXTVER00=$(SONAME).0.0.$(SOEXT)
-MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin
else ifeq ($(shell uname),SunOS)
SOEXT=so
SOEXTVER=$(SOEXT).$(SONAME)
SOEXTVER00=$(SOEXT).$(SONAME).0.0
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER) $(RE2_LDFLAGS) $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER)
else
SOEXT=so
SOEXTVER=$(SOEXT).$(SONAME)
SOEXTVER00=$(SOEXT).$(SONAME).0.0
-MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS)
+MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols
endif
.PHONY: all
@@ -78,17 +106,11 @@ INSTALL_HFILES=\
re2/stringpiece.h\
HFILES=\
- util/benchmark.h\
- util/flags.h\
util/logging.h\
util/malloc_counter.h\
- util/mix.h\
- util/mutex.h\
util/pcre.h\
util/strutil.h\
- util/test.h\
util/utf.h\
- util/util.h\
re2/bitmap256.h\
re2/filtered_re2.h\
re2/pod_array.h\
@@ -112,6 +134,7 @@ HFILES=\
OFILES=\
obj/util/rune.o\
obj/util/strutil.o\
+ obj/re2/bitmap256.o\
obj/re2/bitstate.o\
obj/re2/compile.o\
obj/re2/dfa.o\
@@ -128,7 +151,6 @@ OFILES=\
obj/re2/regexp.o\
obj/re2/set.o\
obj/re2/simplify.o\
- obj/re2/stringpiece.o\
obj/re2/tostring.o\
obj/re2/unicode_casefold.o\
obj/re2/unicode_groups.o\
@@ -205,38 +227,32 @@ obj/dbg/libre2.a: $(DOFILES)
.PRECIOUS: obj/so/libre2.$(SOEXT)
obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin
@mkdir -p obj/so
- $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES)
+ $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) $(RE2_LDFLAGS) $(LDFLAGS)
ln -sf libre2.$(SOEXTVER) $@
.PRECIOUS: obj/dbg/test/%
-obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
+obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES)
@mkdir -p obj/dbg/test
- $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+ $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) -lgtest -lgtest_main obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
.PRECIOUS: obj/test/%
-obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
+obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES)
@mkdir -p obj/test
- $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+ $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -lgtest -lgtest_main obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
# Test the shared lib, falling back to the static lib for private symbols
.PRECIOUS: obj/so/test/%
-obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
+obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES)
@mkdir -p obj/so/test
- $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+ $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -lgtest -lgtest_main -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
-# Filter out dump.o because testing::TempDir() isn't available for it.
-obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
+obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES)
@mkdir -p obj/test
- $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(filter-out obj/re2/testing/dump.o, $(TESTOFILES)) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
-
-# re2_fuzzer is a target for fuzzers like libFuzzer and AFL. This fake fuzzing
-# is simply a way to check that the target builds and then to run it against a
-# fixed set of inputs. To perform real fuzzing, refer to the documentation for
-# libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/).
-obj/test/re2_fuzzer: CXXFLAGS:=-I./re2/fuzzing/compiler-rt/include $(CXXFLAGS)
-obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o
+ $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) -lgtest -lbenchmark -lbenchmark_main obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+
+obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o
@mkdir -p obj/test
- $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
+ $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
ifdef REBUILD_TABLES
.PRECIOUS: re2/perl_groups.cc
@@ -316,9 +332,11 @@ shared-install: obj/so/libre2.$(SOEXT) common-install
common-install:
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
- $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc
- $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
- $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(INSTALL_DATA) re2.pc.in $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@CMAKE_INSTALL_FULL_INCLUDEDIR@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@CMAKE_INSTALL_FULL_LIBDIR@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@REQUIRES@#$(REQUIRES)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
+ $(SED_INPLACE) -e "s#@SONAME@#$(SONAME)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
.PHONY: testinstall
testinstall: static-testinstall shared-testinstall
@@ -327,27 +345,29 @@ testinstall: static-testinstall shared-testinstall
@echo
.PHONY: static-testinstall
-static-testinstall: CXXFLAGS:=-pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
-static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS)
static-testinstall:
- @mkdir -p obj
- @cp testinstall.cc obj
ifeq ($(shell uname),Darwin)
@echo Skipping test for libre2.a on Darwin.
else ifeq ($(shell uname),SunOS)
@echo Skipping test for libre2.a on SunOS.
else
- (cd obj && $(CXX) testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS))
+ @mkdir -p obj
+ @cp testinstall.cc obj/static-testinstall.cc
+ (cd obj && export PKG_CONFIG_PATH=$(DESTDIR)$(libdir)/pkgconfig; \
+ $(CXX) static-testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS) \
+ $$($(PKG_CONFIG) re2 --cflags) \
+ $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g' | sed -e 's/-lre2/-l:libre2.a/'))
obj/static-testinstall
endif
.PHONY: shared-testinstall
-shared-testinstall: CXXFLAGS:=-pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
-shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS)
shared-testinstall:
@mkdir -p obj
- @cp testinstall.cc obj
- (cd obj && $(CXX) testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS))
+ @cp testinstall.cc obj/shared-testinstall.cc
+ (cd obj && export PKG_CONFIG_PATH=$(DESTDIR)$(libdir)/pkgconfig; \
+ $(CXX) shared-testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS) \
+ $$($(PKG_CONFIG) re2 --cflags) \
+ $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g'))
ifeq ($(shell uname),Darwin)
DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/shared-testinstall
else
diff --git a/README b/README
index caee6af..469d6f3 100644
--- a/README
+++ b/README
@@ -10,6 +10,11 @@ make test
make install
make testinstall
+Building RE2 requires Abseil (https://github.com/abseil/abseil-cpp)
+to be installed on your system. Building the tests and benchmarks for
+RE2 additionally requires GoogleTest (https://github.com/google/googletest)
+and Benchmark (https://github.com/google/benchmark) to be installed.
+
There is a fair amount of documentation (including code snippets) in
the re2.h header file.
diff --git a/WORKSPACE b/WORKSPACE.bazel
index b35619c..fa514a8 100644
--- a/WORKSPACE
+++ b/WORKSPACE.bazel
@@ -2,6 +2,6 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
-# Bazel (http://bazel.io/) WORKSPACE file for RE2.
+# Bazel (http://bazel.build/) WORKSPACE file for RE2.
workspace(name = "com_googlesource_code_re2")
diff --git a/WORKSPACE.bzlmod b/WORKSPACE.bzlmod
new file mode 100644
index 0000000..fa514a8
--- /dev/null
+++ b/WORKSPACE.bzlmod
@@ -0,0 +1,7 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) WORKSPACE file for RE2.
+
+workspace(name = "com_googlesource_code_re2")
diff --git a/app/BUILD.bazel b/app/BUILD.bazel
new file mode 100644
index 0000000..cb510af
--- /dev/null
+++ b/app/BUILD.bazel
@@ -0,0 +1,24 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) BUILD file for RE2 app.
+
+cc_binary(
+ name = "_re2.js",
+ testonly = 1,
+ srcs = ["_re2.cc"],
+ linkopts = [
+ "--bind",
+ "-sENVIRONMENT=web",
+ "-sSINGLE_FILE=1",
+ "-sMODULARIZE=1",
+ "-sEXPORT_ES6=1",
+ "-sEXPORT_NAME=loadModule",
+ "-sUSE_PTHREADS=0",
+ ],
+ deps = [
+ "//:re2",
+ "//:testing",
+ ],
+)
diff --git a/app/_re2.cc b/app/_re2.cc
new file mode 100644
index 0000000..a63313e
--- /dev/null
+++ b/app/_re2.cc
@@ -0,0 +1,94 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <memory>
+#include <string>
+
+#include <emscripten/bind.h>
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+namespace re2_app {
+
+// Everything the app reports about one pattern. GetInfo() returns this by
+// value and the embind value_object registration exposes every field to
+// JavaScript, so each member carries a default: GetInfo() returns early on
+// parse/compile failure and must not hand indeterminate values across.
+struct Info {
+  std::string pattern;  // the pattern exactly as passed to GetInfo()
+  std::string error;    // non-empty when parsing or compiling failed
+  std::string prefix;
+  bool prefix_foldcase = false;
+  std::string accel_prefix;
+  bool accel_prefix_foldcase = false;
+  int num_captures = 0;
+  bool is_one_pass = false;
+  bool can_bit_state = false;
+  std::string bytecode;  // result of Prog::Dump()
+  std::string bytemap;   // result of Prog::DumpByteMap()
+};
+
+// Parses `pattern` with the parse flags of a default RE2::Options, compiles it
+// to a forward Prog and collects the details shown by the app. If parsing or
+// compilation fails, `error` is set and the program details are not filled in.
+Info GetInfo(const std::string& pattern) {
+  Info result;
+  result.pattern = pattern;
+
+  RE2::Options opts;
+  const auto parse_flags =
+      static_cast<re2::Regexp::ParseFlags>(opts.ParseFlags());
+  re2::RegexpStatus parse_status;
+  re2::Regexp* parsed =
+      re2::Regexp::Parse(pattern, parse_flags, &parse_status);
+  if (parsed == nullptr) {
+    result.error = "failed to parse pattern: " + parse_status.Text();
+    return result;
+  }
+
+  // RequiredPrefix() hands back the prefix, its case-folding flag and a suffix
+  // regexp. When it reports no prefix, the whole regexp stands in as the
+  // suffix, with a reference of its own so the two Decref() calls below
+  // always balance.
+  std::string literal;
+  bool foldcase;
+  re2::Regexp* rest;
+  if (!parsed->RequiredPrefix(&literal, &foldcase, &rest)) {
+    rest = parsed->Incref();
+  } else {
+    result.prefix = literal;
+    result.prefix_foldcase = foldcase;
+  }
+
+  std::unique_ptr<re2::Prog> compiled(rest->CompileToProg(opts.max_mem()));
+  if (compiled == nullptr) {
+    result.error = "failed to compile forward Prog";
+  } else {
+    if (parsed->RequiredPrefixForAccel(&literal, &foldcase)) {
+      result.accel_prefix = literal;
+      result.accel_prefix_foldcase = foldcase;
+    }
+    result.num_captures = rest->NumCaptures();
+    result.is_one_pass = compiled->IsOnePass();
+    result.can_bit_state = compiled->CanBitState();
+    result.bytecode = compiled->Dump();
+    result.bytemap = compiled->DumpByteMap();
+  }
+
+  // Single release point: the suffix reference first, then the reference
+  // returned by Parse().
+  rest->Decref();
+  parsed->Decref();
+  return result;
+}
+
+EMSCRIPTEN_BINDINGS(_re2) {  // embind registrations for this module
+ emscripten::value_object<Info>("Info")  // registers Info under the name "Info", one .field() per member
+ .field("pattern", &Info::pattern)
+ .field("error", &Info::error)
+ .field("prefix", &Info::prefix)
+ .field("prefix_foldcase", &Info::prefix_foldcase)
+ .field("accel_prefix", &Info::accel_prefix)
+ .field("accel_prefix_foldcase", &Info::accel_prefix_foldcase)
+ .field("num_captures", &Info::num_captures)
+ .field("is_one_pass", &Info::is_one_pass)
+ .field("can_bit_state", &Info::can_bit_state)
+ .field("bytecode", &Info::bytecode)
+ .field("bytemap", &Info::bytemap);
+
+ emscripten::function("getInfo", &GetInfo);  // registers GetInfo() under the name "getInfo"
+}
+
+} // namespace re2_app
diff --git a/app/_re2.d.ts b/app/_re2.d.ts
new file mode 100644
index 0000000..dff5e49
--- /dev/null
+++ b/app/_re2.d.ts
@@ -0,0 +1,23 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+export type Info = {
+ pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ error: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ prefix_foldcase: boolean,
+ accel_prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ accel_prefix_foldcase: boolean,
+ num_captures: number,
+ is_one_pass: boolean,
+ can_bit_state: boolean,
+ bytecode: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+ bytemap: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string,
+};
+
+export interface MainModule {
+ getInfo(pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string): Info;
+}
+
+export default function loadModule(): Promise<MainModule>;
diff --git a/app/app.ts b/app/app.ts
new file mode 100644
index 0000000..4b9e7bd
--- /dev/null
+++ b/app/app.ts
@@ -0,0 +1,111 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+import {css, html, LitElement, render} from 'lit';
+import {customElement} from 'lit/decorators.js';
+
+import /*default*/ loadModule from './_re2';
+import {Info, MainModule} from './_re2';
+
+var _re2: MainModule;
+loadModule().then((module: MainModule) => {
+ _re2 = module;
+ render(html`<title>re2-dev</title><re2-dev></re2-dev>`, document.body);
+});
+
+/**
+ * <re2-dev>: a text box for a pattern plus a readout of what RE2 reports
+ * about it.
+ *
+ * The pattern is mirrored in the URL fragment: it is read from there on
+ * construction and written back on every change. State lives in plain private
+ * fields, so each change is followed by an explicit requestUpdate().
+ */
+@customElement('re2-dev')
+export class RE2Dev extends LitElement {
+  // Current pattern text, and the analysis for it (null while the pattern is
+  // empty).
+  private _pattern: string = '';
+  private _info: Info|null = null;
+
+  constructor() {
+    super();
+    const fragment = window.location.hash.slice(1);
+    try {
+      this._pattern = decodeURIComponent(fragment);
+    } catch {
+      // decodeURIComponent() throws URIError on a malformed escape such as a
+      // lone '%'. The fragment is user-editable, so use it verbatim rather
+      // than letting element construction fail.
+      this._pattern = fragment;
+    }
+    this._info = this._pattern ? _re2.getInfo(this._pattern) : null;
+    this.requestUpdate();
+  }
+
+  // Re-analyses the pattern when the text box changes and records it in the
+  // URL fragment.
+  private _onChange = (e: Event) => {
+    this._pattern = (e.target as HTMLInputElement).value;
+    this._info = this._pattern ? _re2.getInfo(this._pattern) : null;
+    this.requestUpdate();
+    window.location.hash = '#' + encodeURIComponent(this._pattern);
+  };
+
+  static override styles = css`
+.code {
+ font-family: monospace;
+ white-space: pre-line;
+}
+`;
+
+  // Renders the input box, then either nothing more (no pattern), the error
+  // alone, or the full set of details.
+  override render() {
+    var fragments = [];
+    fragments.push(html`
+<div>
+ <input type="text" size="48" @change=${this._onChange} .value=${this._pattern}>
+</div>
+`);
+
+    if (this._info === null) {
+      return html`${fragments}`;
+    }
+
+    if (this._info.error) {
+      fragments.push(html`
+<br>
+<div>
+ error:
+ <span class="code">${this._info.error}</span>
+</div>
+`);
+      return html`${fragments}`;
+    }
+
+    fragments.push(html`
+<br>
+<div>
+ pattern:
+ <span class="code">${this._info.pattern}</span>
+ <br>
+ prefix:
+ <span class="code">${this._info.prefix}</span>
+ ·
+ _foldcase:
+ <span class="code">${this._info.prefix_foldcase}</span>
+ <br>
+ accel_prefix:
+ <span class="code">${this._info.accel_prefix}</span>
+ ·
+ _foldcase:
+ <span class="code">${this._info.accel_prefix_foldcase}</span>
+ <br>
+ num_captures:
+ <span class="code">${this._info.num_captures}</span>
+ <br>
+ is_one_pass:
+ <span class="code">${this._info.is_one_pass}</span>
+ <br>
+ can_bit_state:
+ <span class="code">${this._info.can_bit_state}</span>
+ <br>
+ <br>
+ bytecode:
+ <br>
+ <span class="code">${this._info.bytecode}</span>
+ <br>
+ bytemap:
+ <br>
+ <span class="code">${this._info.bytemap}</span>
+</div>
+`);
+    return html`${fragments}`;
+  }
+}
+
+declare global {
+ interface HTMLElementTagNameMap {
+ 're2-dev': RE2Dev;
+ }
+}
diff --git a/app/build.sh b/app/build.sh
new file mode 100755
index 0000000..09d931f
--- /dev/null
+++ b/app/build.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+set -eux
+
+SRCDIR=$(readlink --canonicalize $(dirname $0))
+DSTDIR=$(mktemp --directory --tmpdir $(basename $0).XXXXXXXXXX)
+
+BAZEL=/tmp/bazel
+BAZELISK_RELEASE=v1.17.0
+
+if [[ ${UID} -ne 0 ]]; then
+ if [[ -d deploy ]]; then
+ echo -e '\033[1;31m' "** The ${PWD}/deploy directory exists! Refusing to clobber it! **" '\033[0m'
+ exit 1
+ fi
+ mkdir deploy
+ sudo docker run -i -t --pull always --rm -v ${SRCDIR}/..:/src -v ${PWD}:/dst emscripten/emsdk /src/app/$(basename $0)
+ ls -l deploy
+else
+ wget -O ${BAZEL} https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_RELEASE}/bazelisk-linux-amd64
+ chmod +x ${BAZEL}
+
+ cd ${SRCDIR}
+ # Emscripten doesn't support `-fstack-protector`.
+ AR=emar CC=emcc \
+ ${BAZEL} build --compilation_mode=opt \
+ --copt=-fno-stack-protector \
+ -- :all
+ cp ../bazel-bin/app/_re2.js ${DSTDIR}
+ # Clean up the sundry Bazel output directories.
+ ${BAZEL} clean --expunge
+ cp app.ts index.html _re2.d.ts ${DSTDIR}
+ cp package.json rollup.config.js tsconfig.json ${DSTDIR}
+
+ cd ${DSTDIR}
+ npm install
+ npx tsc
+ npx rollup -c rollup.config.js -d deploy
+ mv deploy/* /dst/deploy
+fi
+
+cd ${SRCDIR}
+rm -rf ${DSTDIR}
+
+exit 0
diff --git a/app/index.html b/app/index.html
new file mode 100644
index 0000000..d229e56
--- /dev/null
+++ b/app/index.html
@@ -0,0 +1,5 @@
+<!DOCTYPE html>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style>:root { color-scheme: dark light; }</style>
+<script type="module" src="app.js"></script>
diff --git a/app/package.json b/app/package.json
new file mode 100644
index 0000000..e702789
--- /dev/null
+++ b/app/package.json
@@ -0,0 +1,14 @@
+{
+ "dependencies": {
+ "lit": "*"
+ },
+ "devDependencies": {
+ "@rollup/plugin-node-resolve": "*",
+ "@rollup/plugin-terser": "*",
+ "@web/rollup-plugin-html": "*",
+ "@web/rollup-plugin-import-meta-assets": "*",
+ "rollup": "~2",
+ "tslib": "*",
+ "typescript": "*"
+ }
+}
diff --git a/app/rollup.config.js b/app/rollup.config.js
new file mode 100644
index 0000000..3a20e66
--- /dev/null
+++ b/app/rollup.config.js
@@ -0,0 +1,28 @@
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+import nodeResolve from '@rollup/plugin-node-resolve';
+import terser from '@rollup/plugin-terser';
+import html from '@web/rollup-plugin-html';
+import {importMetaAssets} from '@web/rollup-plugin-import-meta-assets';
+
+export default {
+ input: 'index.html',
+ output: {
+ entryFileNames: '[hash].js',
+ chunkFileNames: '[hash].js',
+ assetFileNames: '[hash][extname]',
+ format: 'es',
+ },
+ preserveEntrySignatures: false,
+ plugins:
+ [
+ html({
+ minify: true,
+ }),
+ nodeResolve(),
+ terser(),
+ importMetaAssets(),
+ ],
+};
diff --git a/app/tsconfig.json b/app/tsconfig.json
new file mode 100644
index 0000000..86cc302
--- /dev/null
+++ b/app/tsconfig.json
@@ -0,0 +1,17 @@
+{
+ "compilerOptions": {
+ "target": "esnext",
+ "module": "esnext",
+ "moduleResolution": "node",
+ "noEmitOnError": true,
+ "lib": ["esnext", "dom"],
+ "strict": true,
+ "esModuleInterop": false,
+ "allowSyntheticDefaultImports": true,
+ "experimentalDecorators": true,
+ "importHelpers": true,
+ "sourceMap": true,
+ "inlineSources": true,
+ "incremental": true
+ }
+}
diff --git a/doc/README.xkcd b/doc/README.xkcd
deleted file mode 100644
index b50a579..0000000
--- a/doc/README.xkcd
+++ /dev/null
@@ -1 +0,0 @@
-xkcd.png is a cropped version of http://xkcd.com/208/
diff --git a/doc/syntax.html b/doc/syntax.html
index f0e0138..6cbda14 100644
--- a/doc/syntax.html
+++ b/doc/syntax.html
@@ -62,7 +62,7 @@
<tr><td colspan=2><b>Grouping:</b></td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
-<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
+<tr><td><code>(?&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
@@ -303,6 +303,7 @@
<tr><td colspan=2>Kaithi</td></tr>
<tr><td colspan=2>Kannada</td></tr>
<tr><td colspan=2>Katakana</td></tr>
+<tr><td colspan=2>Kawi</td></tr>
<tr><td colspan=2>Kayah_Li</td></tr>
<tr><td colspan=2>Kharoshthi</td></tr>
<tr><td colspan=2>Khitan_Small_Script</td></tr>
@@ -337,6 +338,7 @@
<tr><td colspan=2>Multani</td></tr>
<tr><td colspan=2>Myanmar</td></tr>
<tr><td colspan=2>Nabataean</td></tr>
+<tr><td colspan=2>Nag_Mundari</td></tr>
<tr><td colspan=2>Nandinagari</td></tr>
<tr><td colspan=2>New_Tai_Lue</td></tr>
<tr><td colspan=2>Newa</td></tr>
diff --git a/doc/syntax.txt b/doc/syntax.txt
index c12a482..6070efd 100644
--- a/doc/syntax.txt
+++ b/doc/syntax.txt
@@ -51,7 +51,7 @@ x{n}+ exactly «n» «x», possessive NOT SUPPORTED
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
-(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED
+(?<name>re) named & numbered capturing group (submatch)
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
@@ -292,6 +292,7 @@ Javanese
Kaithi
Kannada
Katakana
+Kawi
Kayah_Li
Kharoshthi
Khitan_Small_Script
@@ -326,6 +327,7 @@ Mro
Multani
Myanmar
Nabataean
+Nag_Mundari
Nandinagari
New_Tai_Lue
Newa
diff --git a/doc/xkcd.png b/doc/xkcd.png
deleted file mode 100644
index 6249e8e..0000000
--- a/doc/xkcd.png
+++ /dev/null
Binary files differ
diff --git a/libre2.symbols b/libre2.symbols
index 93b71b4..0cab3d9 100644
--- a/libre2.symbols
+++ b/libre2.symbols
@@ -3,9 +3,6 @@
# re2::RE2*
_ZN3re23RE2*;
_ZNK3re23RE2*;
- # re2::StringPiece*
- _ZN3re211StringPiece*;
- _ZNK3re211StringPiece*;
# re2::operator<<*
_ZN3re2ls*;
# re2::FilteredRE2*
diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin
index 41ac96f..754f45c 100644
--- a/libre2.symbols.darwin
+++ b/libre2.symbols.darwin
@@ -2,9 +2,6 @@
# re2::RE2*
__ZN3re23RE2*
__ZNK3re23RE2*
-# re2::StringPiece*
-__ZN3re211StringPiece*
-__ZNK3re211StringPiece*
# re2::operator<<*
__ZN3re2ls*
# re2::FilteredRE2*
diff --git a/python/BUILD.bazel b/python/BUILD.bazel
new file mode 100644
index 0000000..a05fb6e
--- /dev/null
+++ b/python/BUILD.bazel
@@ -0,0 +1,36 @@
+# Copyright 2009 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Bazel (http://bazel.build/) BUILD file for RE2 Python.
+
+load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
+load("@rules_python//python:defs.bzl", "py_library", "py_test")
+
+# The native extension module: _re2.cc built against the RE2 library and
+# Abseil strings.
+pybind_extension(
+ name = "_re2",
+ srcs = ["_re2.cc"],
+ deps = [
+ "//:re2",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+# The pure-Python wrapper module; the extension's shared object is listed as
+# runtime data of the library.
+py_library(
+ name = "re2",
+ srcs = ["re2.py"],
+ data = [":_re2.so"],
+ imports = ["."],
+ visibility = ["//visibility:public"],
+)
+
+# Tests for the wrapper module, written with absltest and its parameterized
+# helpers.
+py_test(
+ name = "re2_test",
+ size = "small",
+ srcs = ["re2_test.py"],
+ deps = [
+ ":re2",
+ "@abseil-py//absl/testing:absltest",
+ "@abseil-py//absl/testing:parameterized",
+ ],
+)
diff --git a/python/LICENSE b/python/LICENSE
new file mode 120000
index 0000000..ea5b606
--- /dev/null
+++ b/python/LICENSE
@@ -0,0 +1 @@
+../LICENSE \ No newline at end of file
diff --git a/python/README b/python/README
new file mode 100644
index 0000000..782378f
--- /dev/null
+++ b/python/README
@@ -0,0 +1 @@
+Building requires Python 3 and pybind11 to be installed on your system.
diff --git a/python/_re2.cc b/python/_re2.cc
new file mode 100644
index 0000000..8564f8a
--- /dev/null
+++ b/python/_re2.cc
@@ -0,0 +1,338 @@
+// Copyright 2019 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include "absl/strings/string_view.h"
+#include "re2/filtered_re2.h"
+#include "re2/re2.h"
+#include "re2/set.h"
+
+#ifdef _WIN32
+#include <basetsd.h>
+#define ssize_t SSIZE_T
+#endif
+
+namespace re2_python {
+
+// This is conventional.
+namespace py = pybind11;
+
+// Views the memory described by a py::buffer_info as an absl::string_view.
+//
+// With pybind11, a py::buffer is merely a py::object that supports the buffer
+// interface/protocol; the actual bytes only become accessible through an
+// explicitly requested py::buffer_info. Under the hood, the py::buffer_info
+// manages a reference count to the py::buffer, so it must be constructed and
+// subsequently destructed while holding the GIL.
+static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
+  const ssize_t length = bytes.size;
+  return absl::string_view(reinterpret_cast<char*>(bytes.ptr), length);
+}
+
+// Returns the number of bytes to step over for the character whose first byte
+// is at ptr. The answer is looked up from the high nibble of that byte:
+// 1 for 0x0 through 0xB, 2 for 0xC and 0xD, 3 for 0xE and 4 for 0xF.
+static inline int OneCharLen(const char* ptr) {
+  static const char kLengths[16] = {1, 1, 1, 1, 1, 1, 1, 1,
+                                    1, 1, 1, 1, 2, 2, 3, 4};
+  const int high_nibble = (*ptr & 0xFF) >> 4;
+  return kLengths[high_nibble];
+}
+
+// Helper function for when Python encodes str to bytes and then needs to
+// convert str offsets to bytes offsets: starting at byte offset pos of the
+// buffer, steps over at most len characters (stopping early once the end of
+// the buffer is reached) and returns the number of bytes stepped over.
+// Assumes that text is valid UTF-8.
+ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
+  auto bytes = buffer.request();
+  auto text = FromBytes(bytes);
+  const char* const begin = text.data() + pos;
+  const char* const limit = text.data() + text.size();
+  const char* cursor = begin;
+  for (; len > 0 && cursor < limit; --len) {
+    cursor += OneCharLen(cursor);
+  }
+  return cursor - begin;
+}
+
+// Helper function for when Python decodes bytes to str and then needs to
+// convert bytes offsets to str offsets: returns the number of characters
+// stepped over between byte offsets pos and endpos of the buffer.
+// Assumes that text is valid UTF-8.
+ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
+  auto bytes = buffer.request();
+  auto text = FromBytes(bytes);
+  const char* const limit = text.data() + endpos;
+  ssize_t count = 0;
+  for (const char* cursor = text.data() + pos; cursor < limit;
+       cursor += OneCharLen(cursor)) {
+    ++count;
+  }
+  return count;
+}
+
+// Constructs an RE2 from the pattern bytes held in buffer and the given
+// options. The caller is expected to check ok() on the result.
+std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
+                                 const RE2::Options& options) {
+  auto bytes = buffer.request();
+  return std::make_unique<RE2>(FromBytes(bytes), options);
+}
+
+py::bytes RE2ErrorShim(const RE2& self) {
+ // Return std::string as bytes. That is, without decoding to str.
+ return self.error();
+}
+
+// Returns the (name, index) pairs of the named capturing groups of self,
+// with each name as bytes rather than str.
+std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
+    const RE2& self) {
+  std::vector<std::pair<py::bytes, int>> named;
+  // There cannot be more named groups than groups overall.
+  named.reserve(self.NumberOfCapturingGroups());
+  for (const auto& entry : self.NamedCapturingGroups()) {
+    named.emplace_back(entry.first, entry.second);
+  }
+  return named;
+}
+
+// Returns the histogram that RE2::ProgramFanout() fills in for self, as a
+// value instead of through an output parameter.
+std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
+  std::vector<int> fanout;
+  self.ProgramFanout(&fanout);
+  return fanout;
+}
+
+// Returns the histogram that RE2::ReverseProgramFanout() fills in for self,
+// as a value instead of through an output parameter.
+std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
+  std::vector<int> fanout;
+  self.ReverseProgramFanout(&fanout);
+  return fanout;
+}
+
+// Calls RE2::PossibleMatchRange() on self and returns (ok, min, max), where
+// ok is the call's result and min/max are the strings it filled in.
+std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
+    const RE2& self, int maxlen) {
+  std::string lo;
+  std::string hi;
+  const bool ok = self.PossibleMatchRange(&lo, &hi, maxlen);
+  // Return std::string as bytes. That is, without decoding to str.
+  return std::make_tuple(ok, py::bytes(lo), py::bytes(hi));
+}
+
+// Matches self against the bytes held in buffer, between byte offsets pos
+// and endpos, using the given anchoring. Returns one (begin, end) pair of
+// byte offsets into the buffer per group, with group 0 being the whole
+// match; a group without a match is reported as (-1, -1). When there is no
+// match at all, every pair is (-1, -1).
+std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
+ RE2::Anchor anchor,
+ py::buffer buffer,
+ ssize_t pos,
+ ssize_t endpos) {
+ auto bytes = buffer.request();
+ auto text = FromBytes(bytes);
+ const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
+ std::vector<absl::string_view> groups;
+ groups.resize(num_groups);
+ // Everything from here on touches only C++ objects, so the GIL is released
+ // for the rest of the function. `bytes` was declared before `release_gil`
+ // and is therefore destroyed after it, i.e. once the GIL is held again.
+ py::gil_scoped_release release_gil;
+ if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
+ // Ensure that groups are null before converting to spans!
+ for (auto& it : groups) {
+ it = absl::string_view();
+ }
+ }
+ std::vector<std::pair<ssize_t, ssize_t>> spans;
+ spans.reserve(num_groups);
+ for (const auto& it : groups) {
+ // A null data pointer marks a group that has no match.
+ if (it.data() == NULL) {
+ spans.emplace_back(-1, -1);
+ } else {
+ // Convert the view into offsets relative to the start of the text.
+ spans.emplace_back(it.data() - text.data(),
+ it.data() - text.data() + it.size());
+ }
+ }
+ return spans;
+}
+
+// Returns RE2::QuoteMeta() of the bytes held in buffer. The resulting
+// std::string is returned as bytes. That is, without decoding to str.
+py::bytes RE2QuoteMetaShim(py::buffer buffer) {
+  auto bytes = buffer.request();
+  return py::bytes(RE2::QuoteMeta(FromBytes(bytes)));
+}
+
+// Wraps RE2::Set so that patterns and text can be supplied as Python buffers.
+class Set {
+ public:
+  Set(RE2::Anchor anchor, const RE2::Options& options)
+      : set_(options, anchor) {}
+
+  ~Set() = default;
+
+  // Not copyable or movable.
+  Set(const Set&) = delete;
+  Set& operator=(const Set&) = delete;
+
+  // Adds the pattern held in buffer to the set. Returns the index of the
+  // pattern, or -1 on error.
+  int Add(py::buffer buffer) {
+    auto bytes = buffer.request();
+    return set_.Add(FromBytes(bytes), /*error=*/NULL);
+  }
+
+  // Compiles the set. Compiling can fail, in which case false is returned.
+  bool Compile() {
+    const bool compiled = set_.Compile();
+    return compiled;
+  }
+
+  // Returns the indices of the patterns that match the text held in buffer.
+  std::vector<int> Match(py::buffer buffer) const {
+    auto bytes = buffer.request();
+    auto text = FromBytes(bytes);
+    std::vector<int> matched;
+    // Matching touches only C++ objects, so the GIL is released for it.
+    // `bytes` was declared first and is thus destroyed with the GIL held.
+    py::gil_scoped_release release_gil;
+    set_.Match(text, &matched);
+    return matched;
+  }
+
+ private:
+  RE2::Set set_;
+};
+
+// Wraps re2::FilteredRE2 together with the RE2::Set of literals that is used
+// to find out which of the filter's atoms occur in a piece of text.
+class Filter {
+ public:
+  Filter() = default;
+  ~Filter() = default;
+
+  // Not copyable or movable.
+  Filter(const Filter&) = delete;
+  Filter& operator=(const Filter&) = delete;
+
+  // Adds the pattern held in buffer, to be compiled with options. Returns the
+  // index written by FilteredRE2::Add(), or -1 if no index was written.
+  int Add(py::buffer buffer, const RE2::Options& options) {
+    auto bytes = buffer.request();
+    auto pattern = FromBytes(bytes);
+    int index = -1;  // not clobbered on error
+    filter_.Add(pattern, options, &index);
+    return index;
+  }
+
+  // Compiles the filter, then builds and compiles a case-insensitive,
+  // unanchored set of literals from the atoms that the filter produced.
+  // Returns whether compiling that set succeeded.
+  bool Compile() {
+    std::vector<std::string> atoms;
+    filter_.Compile(&atoms);
+    RE2::Options options;
+    options.set_literal(true);
+    options.set_case_sensitive(false);
+    set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
+    for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
+      // Atom indices must line up with set indices for Match() to work.
+      if (set_->Add(atoms[i], /*error=*/NULL) != i) {
+        // Should never happen: the atom is a literal!
+        py::pybind11_fail("set_->Add() failed");
+      }
+    }
+    // Compiling can fail.
+    return set_->Compile();
+  }
+
+  // Returns the indices of the patterns that match the text held in buffer
+  // or, when potential is true, of the patterns that could match it judging
+  // by the atoms found. Fails with an exception if Compile() has not been
+  // called yet.
+  std::vector<int> Match(py::buffer buffer, bool potential) const {
+    // set_ exists only once Compile() has run. Fail loudly, and while the GIL
+    // is still held, instead of dereferencing a null pointer below.
+    if (set_ == nullptr) {
+      py::pybind11_fail("Match() called before compiling");
+    }
+    auto bytes = buffer.request();
+    auto text = FromBytes(bytes);
+    std::vector<int> atoms;
+    // The rest touches only C++ objects, so the GIL is released for it.
+    py::gil_scoped_release release_gil;
+    set_->Match(text, &atoms);
+    std::vector<int> matches;
+    if (potential) {
+      filter_.AllPotentials(atoms, &matches);
+    } else {
+      filter_.AllMatches(text, atoms, &matches);
+    }
+    return matches;
+  }
+
+  // Returns the RE2 object that the filter holds for index.
+  const RE2& GetRE2(int index) const {
+    return filter_.GetRE2(index);
+  }
+
+ private:
+  re2::FilteredRE2 filter_;
+  std::unique_ptr<RE2::Set> set_;  // null until Compile() has been called
+};
+
+// Defines the _re2 extension module: two module-level offset helpers, the
+// RE2 class with its nested Anchor enum and Options class (which in turn
+// nests the Encoding enum), and the Set and Filter wrapper classes.
+PYBIND11_MODULE(_re2, module) {
+ module.def("CharLenToBytes", &CharLenToBytes);
+ module.def("BytesToCharLen", &BytesToCharLen);
+
+ // CLASSES
+ // class RE2
+ // enum Anchor
+ // class Options
+ // enum Encoding
+ // class Set
+ // class Filter
+ // All of the types are declared up front, before any of their members are
+ // defined below.
+ py::class_<RE2> re2(module, "RE2");
+ py::enum_<RE2::Anchor> anchor(re2, "Anchor");
+ py::class_<RE2::Options> options(re2, "Options");
+ py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
+ py::class_<Set> set(module, "Set");
+ py::class_<Filter> filter(module, "Filter");
+
+ anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
+ anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
+ anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);
+
+ encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
+ encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);
+
+ // Each option is exposed as a read/write property backed by the matching
+ // getter and setter of RE2::Options.
+ options.def(py::init<>())
+ .def_property("max_mem", //
+ &RE2::Options::max_mem, //
+ &RE2::Options::set_max_mem) //
+ .def_property("encoding", //
+ &RE2::Options::encoding, //
+ &RE2::Options::set_encoding) //
+ .def_property("posix_syntax", //
+ &RE2::Options::posix_syntax, //
+ &RE2::Options::set_posix_syntax) //
+ .def_property("longest_match", //
+ &RE2::Options::longest_match, //
+ &RE2::Options::set_longest_match) //
+ .def_property("log_errors", //
+ &RE2::Options::log_errors, //
+ &RE2::Options::set_log_errors) //
+ .def_property("literal", //
+ &RE2::Options::literal, //
+ &RE2::Options::set_literal) //
+ .def_property("never_nl", //
+ &RE2::Options::never_nl, //
+ &RE2::Options::set_never_nl) //
+ .def_property("dot_nl", //
+ &RE2::Options::dot_nl, //
+ &RE2::Options::set_dot_nl) //
+ .def_property("never_capture", //
+ &RE2::Options::never_capture, //
+ &RE2::Options::set_never_capture) //
+ .def_property("case_sensitive", //
+ &RE2::Options::case_sensitive, //
+ &RE2::Options::set_case_sensitive) //
+ .def_property("perl_classes", //
+ &RE2::Options::perl_classes, //
+ &RE2::Options::set_perl_classes) //
+ .def_property("word_boundary", //
+ &RE2::Options::word_boundary, //
+ &RE2::Options::set_word_boundary) //
+ .def_property("one_line", //
+ &RE2::Options::one_line, //
+ &RE2::Options::set_one_line); //
+
+ // Methods bound to a ...Shim function go through the adapters defined
+ // earlier in this file; the rest bind the RE2 member functions directly.
+ re2.def(py::init(&RE2InitShim))
+ .def("ok", &RE2::ok)
+ .def("error", &RE2ErrorShim)
+ .def("options", &RE2::options)
+ .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
+ .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
+ .def("ProgramSize", &RE2::ProgramSize)
+ .def("ReverseProgramSize", &RE2::ReverseProgramSize)
+ .def("ProgramFanout", &RE2ProgramFanoutShim)
+ .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
+ .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
+ .def("Match", &RE2MatchShim)
+ .def_static("QuoteMeta", &RE2QuoteMetaShim);
+
+ set.def(py::init<RE2::Anchor, const RE2::Options&>())
+ .def("Add", &Set::Add)
+ .def("Compile", &Set::Compile)
+ .def("Match", &Set::Match);
+
+ // GetRE2 returns a reference to an object that the Filter holds, hence the
+ // reference_internal return value policy.
+ filter.def(py::init<>())
+ .def("Add", &Filter::Add)
+ .def("Compile", &Filter::Compile)
+ .def("Match", &Filter::Match)
+ .def("GetRE2", &Filter::GetRE2,
+ py::return_value_policy::reference_internal);
+}
+
+} // namespace re2_python
diff --git a/python/re2.py b/python/re2.py
new file mode 100644
index 0000000..8a6d985
--- /dev/null
+++ b/python/re2.py
@@ -0,0 +1,582 @@
+# Copyright 2019 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+r"""A drop-in replacement for the re module.
+
+It uses RE2 under the hood, of course, so various PCRE features
+(e.g. backreferences, look-around assertions) are not supported.
+See https://github.com/google/re2/wiki/Syntax for the canonical
+reference, but known syntactic "gotchas" relative to Python are:
+
+ * PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
+ but calls it \Z. You must rewrite \Z to \z in pattern strings.
+
+Known differences between this module's API and the re module's API:
+
+ * The error class does not provide any error information as attributes.
+ * The Options class replaces the re module's flags with RE2's options as
+ gettable/settable properties. Please see re2.h for their documentation.
+ * The pattern string and the input string do not have to be the same type.
+ Any str will be encoded to UTF-8.
+ * The pattern string cannot be str if the options specify Latin-1 encoding.
+
+This module's LRU cache contains a maximum of 128 regular expression objects.
+Each regular expression object's underlying RE2 object uses a maximum of 8MiB
+of memory (by default). Hence, this module's LRU cache uses a maximum of 1GiB
+of memory (by default), but in most cases, it should use much less than that.
+"""
+
+import codecs
+import functools
+import itertools
+
+import _re2
+
+
+class error(Exception):
+  """Raised by this module when a pattern, Set or Filter operation fails."""
+
+
+class Options(_re2.RE2.Options):
+
+ __slots__ = ()
+
+ NAMES = (
+ 'max_mem',
+ 'encoding',
+ 'posix_syntax',
+ 'longest_match',
+ 'log_errors',
+ 'literal',
+ 'never_nl',
+ 'dot_nl',
+ 'never_capture',
+ 'case_sensitive',
+ 'perl_classes',
+ 'word_boundary',
+ 'one_line',
+ )
+
+
+def compile(pattern, options=None):
+ if isinstance(pattern, _Regexp):
+ if options:
+ raise error('pattern is already compiled, so '
+ 'options may not be specified')
+ pattern = pattern._pattern
+ options = options or Options()
+ values = tuple(getattr(options, name) for name in Options.NAMES)
+ return _Regexp._make(pattern, values)
+
+
+def search(pattern, text, options=None):
+ return compile(pattern, options=options).search(text)
+
+
+def match(pattern, text, options=None):
+ return compile(pattern, options=options).match(text)
+
+
+def fullmatch(pattern, text, options=None):
+ return compile(pattern, options=options).fullmatch(text)
+
+
+def finditer(pattern, text, options=None):
+ return compile(pattern, options=options).finditer(text)
+
+
+def findall(pattern, text, options=None):
+ return compile(pattern, options=options).findall(text)
+
+
+def split(pattern, text, maxsplit=0, options=None):
+ return compile(pattern, options=options).split(text, maxsplit)
+
+
+def subn(pattern, repl, text, count=0, options=None):
+ return compile(pattern, options=options).subn(repl, text, count)
+
+
+def sub(pattern, repl, text, count=0, options=None):
+ return compile(pattern, options=options).sub(repl, text, count)
+
+
+def _encode(t):
+ return t.encode(encoding='utf-8')
+
+
+def _decode(b):
+ return b.decode(encoding='utf-8')
+
+
+def escape(pattern):
+ if isinstance(pattern, str):
+ encoded_pattern = _encode(pattern)
+ escaped = _re2.RE2.QuoteMeta(encoded_pattern)
+ decoded_escaped = _decode(escaped)
+ return decoded_escaped
+ else:
+ escaped = _re2.RE2.QuoteMeta(pattern)
+ return escaped
+
+
+def purge():
+ return _Regexp._make.cache_clear()
+
+
+_Anchor = _re2.RE2.Anchor
+_NULL_SPAN = (-1, -1)
+
+
+# A compiled regular expression: the pattern as it was given plus the
+# underlying _re2.RE2 object. Instances are normally obtained via compile(),
+# which goes through the cached _make().
+class _Regexp(object):
+
+ __slots__ = ('_pattern', '_regexp')
+
+ # Builds (or fetches from the LRU cache) the regexp for a pattern and a
+ # tuple of option values given in Options.NAMES order.
+ @classmethod
+ @functools.lru_cache(typed=True)
+ def _make(cls, pattern, values):
+ options = Options()
+ for name, value in zip(Options.NAMES, values):
+ setattr(options, name, value)
+ return cls(pattern, options)
+
+ # Compiles pattern with options. A str pattern is encoded to UTF-8, which is
+ # why it is rejected when the options ask for Latin-1. Raises error with the
+ # message from RE2 when the pattern does not compile.
+ def __init__(self, pattern, options):
+ self._pattern = pattern
+ if isinstance(self._pattern, str):
+ if options.encoding == Options.Encoding.LATIN1:
+ raise error('string type of pattern is str, but '
+ 'encoding specified in options is LATIN1')
+ encoded_pattern = _encode(self._pattern)
+ self._regexp = _re2.RE2(encoded_pattern, options)
+ else:
+ self._regexp = _re2.RE2(self._pattern, options)
+ if not self._regexp.ok():
+ raise error(self._regexp.error())
+
+ # Pickling support: the state is the pattern plus a dict of option values.
+ def __getstate__(self):
+ options = {name: getattr(self.options, name) for name in Options.NAMES}
+ return self._pattern, options
+
+ # Unpickling support: rebuilds the regexp through the cached _make() and
+ # takes over its fields.
+ def __setstate__(self, state):
+ pattern, options = state
+ values = tuple(options[name] for name in Options.NAMES)
+ other = _Regexp._make(pattern, values)
+ self._pattern = other._pattern
+ self._regexp = other._regexp
+
+ # Generator behind search/match/fullmatch/finditer: yields one _Match per
+ # successive match of the regexp in text[pos:endpos] with the given anchor.
+ # pos and endpos are clamped to [0, len(text)]; nothing is yielded when
+ # pos > endpos. For str text, matching happens on the UTF-8 encoding and the
+ # resulting bytes offsets are converted back to str offsets.
+ def _match(self, anchor, text, pos=None, endpos=None):
+ pos = 0 if pos is None else max(0, min(pos, len(text)))
+ endpos = len(text) if endpos is None else max(0, min(endpos, len(text)))
+ if pos > endpos:
+ return
+ if isinstance(text, str):
+ encoded_text = _encode(text)
+ encoded_pos = _re2.CharLenToBytes(encoded_text, 0, pos)
+ if endpos == len(text):
+ # This is the common case.
+ encoded_endpos = len(encoded_text)
+ else:
+ encoded_endpos = encoded_pos + _re2.CharLenToBytes(
+ encoded_text, encoded_pos, endpos - pos)
+ # Maps bytes offsets to str offsets; last_offset is the largest bytes
+ # offset that has been converted so far.
+ decoded_offsets = {0: 0}
+ last_offset = 0
+ while True:
+ spans = self._regexp.Match(anchor, encoded_text, encoded_pos,
+ encoded_endpos)
+ if spans[0] == _NULL_SPAN:
+ break
+
+ # This algorithm is linear in the length of encoded_text. Specifically,
+ # no matter how many groups there are for a given regular expression or
+ # how many iterations through the loop there are for a given generator,
+ # this algorithm uses a single, straightforward pass over encoded_text.
+ offsets = sorted(set(itertools.chain(*spans)))
+ # -1 comes from the null spans of groups that did not match.
+ if offsets[0] == -1:
+ offsets = offsets[1:]
+ # Discard the rest of the items because they are useless now - and we
+ # could accumulate one item per str offset in the pathological case!
+ decoded_offsets = {last_offset: decoded_offsets[last_offset]}
+ for offset in offsets:
+ decoded_offsets[offset] = (
+ decoded_offsets[last_offset] +
+ _re2.BytesToCharLen(encoded_text, last_offset, offset))
+ last_offset = offset
+
+ def decode(span):
+ if span == _NULL_SPAN:
+ return span
+ return decoded_offsets[span[0]], decoded_offsets[span[1]]
+
+ decoded_spans = [decode(span) for span in spans]
+ yield _Match(self, text, pos, endpos, decoded_spans)
+ if encoded_pos == encoded_endpos:
+ break
+ elif encoded_pos == spans[0][1]:
+ # We matched the empty string at encoded_pos and would be stuck, so
+ # in order to make forward progress, increment the str offset.
+ encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1)
+ else:
+ encoded_pos = spans[0][1]
+ else:
+ while True:
+ spans = self._regexp.Match(anchor, text, pos, endpos)
+ if spans[0] == _NULL_SPAN:
+ break
+ yield _Match(self, text, pos, endpos, spans)
+ if pos == endpos:
+ break
+ elif pos == spans[0][1]:
+ # We matched the empty string at pos and would be stuck, so in order
+ # to make forward progress, increment the bytes offset.
+ pos += 1
+ else:
+ pos = spans[0][1]
+
+ # Returns the first unanchored match in text, or None.
+ def search(self, text, pos=None, endpos=None):
+ return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None)
+
+ # Returns the first match anchored at the start, or None.
+ def match(self, text, pos=None, endpos=None):
+ return next(self._match(_Anchor.ANCHOR_START, text, pos, endpos), None)
+
+ # Returns the first match anchored at both ends, or None.
+ def fullmatch(self, text, pos=None, endpos=None):
+ return next(self._match(_Anchor.ANCHOR_BOTH, text, pos, endpos), None)
+
+ # Returns a generator over all unanchored matches in text.
+ def finditer(self, text, pos=None, endpos=None):
+ return self._match(_Anchor.UNANCHORED, text, pos, endpos)
+
+ # Returns a list with one item per match: the whole match when there are no
+ # groups, the single group when there is one, otherwise a tuple of all the
+ # groups. Groups that did not match are reported as an empty string of the
+ # same type as text.
+ def findall(self, text, pos=None, endpos=None):
+ empty = type(text)()
+ items = []
+ for match in self.finditer(text, pos, endpos):
+ if not self.groups:
+ item = match.group()
+ elif self.groups == 1:
+ item = match.groups(default=empty)[0]
+ else:
+ item = match.groups(default=empty)
+ items.append(item)
+ return items
+
+ # Shared by split() and subn(): cuts text at up to maxsplit matches (all
+ # matches when maxsplit is 0, none when it is negative) and, after each
+ # piece of unmatched text, inserts whatever cb(match) returns. Returns the
+ # list of pieces and the number of matches that were used.
+ def _split(self, cb, text, maxsplit=0):
+ if maxsplit < 0:
+ return [text], 0
+ elif maxsplit > 0:
+ matchiter = itertools.islice(self.finditer(text), maxsplit)
+ else:
+ matchiter = self.finditer(text)
+ pieces = []
+ end = 0
+ numsplit = 0
+ for match in matchiter:
+ pieces.append(text[end:match.start()])
+ pieces.extend(cb(match))
+ end = match.end()
+ numsplit += 1
+ pieces.append(text[end:])
+ return pieces, numsplit
+
+ # Splits text at the matches; the groups of each match are included in the
+ # result between the pieces.
+ def split(self, text, maxsplit=0):
+ cb = lambda match: [match[group] for group in range(1, self.groups + 1)]
+ pieces, _ = self._split(cb, text, maxsplit)
+ return pieces
+
+ # Replaces matches in text with repl, which is either a callable taking the
+ # match or a template for match.expand(). Returns the new text and the
+ # number of replacements made.
+ def subn(self, repl, text, count=0):
+ cb = lambda match: [repl(match) if callable(repl) else match.expand(repl)]
+ empty = type(text)()
+ pieces, numsplit = self._split(cb, text, count)
+ joined_pieces = empty.join(pieces)
+ return joined_pieces, numsplit
+
+ # Like subn(), but returns only the new text.
+ def sub(self, repl, text, count=0):
+ joined_pieces, _ = self.subn(repl, text, count)
+ return joined_pieces
+
+ # The pattern exactly as it was passed in (str or bytes).
+ @property
+ def pattern(self):
+ return self._pattern
+
+ @property
+ def options(self):
+ return self._regexp.options()
+
+ # The number of capturing groups in the pattern.
+ @property
+ def groups(self):
+ return self._regexp.NumberOfCapturingGroups()
+
+ # A dict from group name to group number. The names are decoded to str
+ # when the pattern is a str and are left as bytes otherwise.
+ @property
+ def groupindex(self):
+ groups = self._regexp.NamedCapturingGroups()
+ if isinstance(self._pattern, str):
+ decoded_groups = [(_decode(group), index) for group, index in groups]
+ return dict(decoded_groups)
+ else:
+ return dict(groups)
+
+ @property
+ def programsize(self):
+ return self._regexp.ProgramSize()
+
+ @property
+ def reverseprogramsize(self):
+ return self._regexp.ReverseProgramSize()
+
+ @property
+ def programfanout(self):
+ return self._regexp.ProgramFanout()
+
+ @property
+ def reverseprogramfanout(self):
+ return self._regexp.ReverseProgramFanout()
+
+ # Returns the (min, max) pair computed by RE2's PossibleMatchRange() for
+ # maxlen; raises error when RE2 reports failure.
+ def possiblematchrange(self, maxlen):
+ ok, min, max = self._regexp.PossibleMatchRange(maxlen)
+ if not ok:
+ raise error('failed to compute match range')
+ return min, max
+
+
+class _Match(object):
+
+ __slots__ = ('_regexp', '_text', '_pos', '_endpos', '_spans')
+
+ def __init__(self, regexp, text, pos, endpos, spans):
+ self._regexp = regexp
+ self._text = text
+ self._pos = pos
+ self._endpos = endpos
+ self._spans = spans
+
+ # Python prioritises three-digit octal numbers over group escapes.
+ # For example, \100 should not be handled the same way as \g<10>0.
+ _OCTAL_RE = compile('\\\\[0-7][0-7][0-7]')
+
+ # Python supports \1 through \99 (inclusive) and \g<...> syntax.
+ _GROUP_RE = compile('\\\\[1-9][0-9]?|\\\\g<\\w+>')
+
+ @classmethod
+ @functools.lru_cache(typed=True)
+ def _split(cls, template):
+ if isinstance(template, str):
+ backslash = '\\'
+ else:
+ backslash = b'\\'
+ empty = type(template)()
+ pieces = [empty]
+ index = template.find(backslash)
+ while index != -1:
+ piece, template = template[:index], template[index:]
+ pieces[-1] += piece
+ octal_match = cls._OCTAL_RE.match(template)
+ group_match = cls._GROUP_RE.match(template)
+ if (not octal_match) and group_match:
+ index = group_match.end()
+ piece, template = template[:index], template[index:]
+ pieces.extend((piece, empty))
+ else:
+ # 2 isn't enough for \o, \x, \N, \u and \U escapes, but none of those
+ # should contain backslashes, so break them here and then fix them at
+ # the beginning of the next loop iteration or right before returning.
+ index = 2
+ piece, template = template[:index], template[index:]
+ pieces[-1] += piece
+ index = template.find(backslash)
+ pieces[-1] += template
+ return pieces
+
+ def expand(self, template):
+ if isinstance(template, str):
+ unescape = codecs.unicode_escape_decode
+ else:
+ unescape = codecs.escape_decode
+ empty = type(template)()
+ # Make a copy so that we don't clobber the cached pieces!
+ pieces = list(self._split(template))
+ for index, piece in enumerate(pieces):
+ if not index % 2:
+ pieces[index], _ = unescape(piece)
+ else:
+ if len(piece) <= 3: # \1 through \99 (inclusive)
+ group = int(piece[1:])
+ else: # \g<...>
+ group = piece[3:-1]
+ try:
+ group = int(group)
+ except ValueError:
+ pass
+ pieces[index] = self.__getitem__(group) or empty
+ joined_pieces = empty.join(pieces)
+ return joined_pieces
+
+ def __getitem__(self, group):
+ if not isinstance(group, int):
+ try:
+ group = self._regexp.groupindex[group]
+ except KeyError:
+ raise IndexError('bad group name')
+ if not 0 <= group <= self._regexp.groups:
+ raise IndexError('bad group index')
+ span = self._spans[group]
+ if span == _NULL_SPAN:
+ return None
+ return self._text[span[0]:span[1]]
+
+ def group(self, *groups):
+ if not groups:
+ groups = (0,)
+ items = (self.__getitem__(group) for group in groups)
+ return next(items) if len(groups) == 1 else tuple(items)
+
+ def groups(self, default=None):
+ items = []
+ for group in range(1, self._regexp.groups + 1):
+ item = self.__getitem__(group)
+ items.append(default if item is None else item)
+ return tuple(items)
+
+ def groupdict(self, default=None):
+ items = []
+ for group, index in self._regexp.groupindex.items():
+ item = self.__getitem__(index)
+ items.append((group, default) if item is None else (group, item))
+ return dict(items)
+
+ def start(self, group=0):
+ if not 0 <= group <= self._regexp.groups:
+ raise IndexError('bad group index')
+ return self._spans[group][0]
+
+ def end(self, group=0):
+ if not 0 <= group <= self._regexp.groups:
+ raise IndexError('bad group index')
+ return self._spans[group][1]
+
+ def span(self, group=0):
+ if not 0 <= group <= self._regexp.groups:
+ raise IndexError('bad group index')
+ return self._spans[group]
+
+ @property
+ def re(self):
+ return self._regexp
+
+ @property
+ def string(self):
+ return self._text
+
+ @property
+ def pos(self):
+ return self._pos
+
+ @property
+ def endpos(self):
+ return self._endpos
+
+ @property
+ def lastindex(self):
+ max_end = -1
+ max_group = None
+ # We look for the rightmost right parenthesis by keeping the first group
+ # that ends at max_end because that is the leftmost/outermost group when
+ # there are nested groups!
+ for group in range(1, self._regexp.groups + 1):
+ end = self._spans[group][1]
+ if max_end < end:
+ max_end = end
+ max_group = group
+ return max_group
+
+ @property
+ def lastgroup(self):
+ max_group = self.lastindex
+ if not max_group:
+ return None
+ for group, index in self._regexp.groupindex.items():
+ if max_group == index:
+ return group
+ return None
+
+
+class Set(object):
+ """A Pythonic wrapper around RE2::Set."""
+
+ __slots__ = ('_set')
+
+ def __init__(self, anchor, options=None):
+ options = options or Options()
+ self._set = _re2.Set(anchor, options)
+
+ @classmethod
+ def SearchSet(cls, options=None):
+ return cls(_Anchor.UNANCHORED, options=options)
+
+ @classmethod
+ def MatchSet(cls, options=None):
+ return cls(_Anchor.ANCHOR_START, options=options)
+
+ @classmethod
+ def FullMatchSet(cls, options=None):
+ return cls(_Anchor.ANCHOR_BOTH, options=options)
+
+ def Add(self, pattern):
+ if isinstance(pattern, str):
+ encoded_pattern = _encode(pattern)
+ index = self._set.Add(encoded_pattern)
+ else:
+ index = self._set.Add(pattern)
+ if index == -1:
+ raise error('failed to add %r to Set' % pattern)
+ return index
+
+ def Compile(self):
+ if not self._set.Compile():
+ raise error('failed to compile Set')
+
+ def Match(self, text):
+ if isinstance(text, str):
+ encoded_text = _encode(text)
+ matches = self._set.Match(encoded_text)
+ else:
+ matches = self._set.Match(text)
+ return matches or None
+
+
+class Filter(object):
+ """A Pythonic wrapper around FilteredRE2."""
+
+ __slots__ = ('_filter', '_patterns')
+
+ def __init__(self):
+ self._filter = _re2.Filter()
+ self._patterns = []
+
+ def Add(self, pattern, options=None):
+ options = options or Options()
+ if isinstance(pattern, str):
+ encoded_pattern = _encode(pattern)
+ index = self._filter.Add(encoded_pattern, options)
+ else:
+ index = self._filter.Add(pattern, options)
+ if index == -1:
+ raise error('failed to add %r to Filter' % pattern)
+ self._patterns.append(pattern)
+ return index
+
+ def Compile(self):
+ if not self._filter.Compile():
+ raise error('failed to compile Filter')
+
+ def Match(self, text, potential=False):
+ if isinstance(text, str):
+ encoded_text = _encode(text)
+ matches = self._filter.Match(encoded_text, potential)
+ else:
+ matches = self._filter.Match(text, potential)
+ return matches or None
+
+ def re(self, index):
+ if not 0 <= index < len(self._patterns):
+ raise IndexError('bad index')
+ proxy = object.__new__(_Regexp)
+ proxy._pattern = self._patterns[index]
+ proxy._regexp = self._filter.GetRE2(index)
+ return proxy
diff --git a/python/re2_test.py b/python/re2_test.py
new file mode 100644
index 0000000..86aa9ae
--- /dev/null
+++ b/python/re2_test.py
@@ -0,0 +1,482 @@
+# Copyright 2019 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+"""Tests for google3.third_party.re2.python.re2."""
+
+import collections
+import pickle
+import re
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import re2
+
+
+class OptionsTest(parameterized.TestCase):
+ # Checks that every option named in re2.Options.NAMES can be set and then
+ # read back with the value that was set.
+
+ @parameterized.parameters(*re2.Options.NAMES)
+ def test_option(self, name):
+ options = re2.Options()
+ value = getattr(options, name)
+ # Replace the current value with a different value of the same type.
+ if isinstance(value, re2.Options.Encoding):
+ # Pick the first enum member that differs from the current one.
+ value = next(v for v in type(value).__members__.values() if v != value)
+ # bool is tested before int because bool is a subclass of int.
+ elif isinstance(value, bool):
+ value = not value
+ elif isinstance(value, int):
+ value = value + 1
+ else:
+ # An option of any other type means this test needs to be extended.
+ raise TypeError('option {!r}: {!r} {!r}'.format(name, type(value), value))
+ setattr(options, name, value)
+ self.assertEqual(value, getattr(options, name))
+
+
+class Re2CompileTest(parameterized.TestCase):
+ """Contains tests that apply to the re2 module only.
+
+ We disagree with Python on the string types of group names,
+ so there is no point attempting to verify consistency.
+ """
+
+ # Columns: pattern, expected number of groups, expected (name, index) pairs
+ # for the named groups. Group names use the same string type as the pattern.
+ @parameterized.parameters(
+ (u'(foo*)(?P<bar>qux+)', 2, [(u'bar', 2)]),
+ (b'(foo*)(?P<bar>qux+)', 2, [(b'bar', 2)]),
+ (u'(foo*)(?P<中文>qux+)', 2, [(u'中文', 2)]),
+ )
+ def test_compile(self, pattern, expected_groups, expected_groupindex):
+ regexp = re2.compile(pattern)
+ # Compiling the same pattern, or an already compiled regexp, must return
+ # the identical object.
+ self.assertIs(regexp, re2.compile(pattern)) # cached
+ self.assertIs(regexp, re2.compile(regexp)) # cached
+ # Passing options together with an already compiled regexp is an error.
+ with self.assertRaisesRegex(re2.error,
+ ('pattern is already compiled, so '
+ 'options may not be specified')):
+ options = re2.Options()
+ options.log_errors = not options.log_errors
+ re2.compile(regexp, options=options)
+ self.assertIsNotNone(regexp.options)
+ self.assertEqual(expected_groups, regexp.groups)
+ self.assertDictEqual(dict(expected_groupindex), regexp.groupindex)
+
+ def test_compile_with_options(self):
+ # With max_mem set to 100, compiling '.{1000}' is expected to fail with
+ # 'pattern too large'.
+ options = re2.Options()
+ options.max_mem = 100
+ with self.assertRaisesRegex(re2.error, 'pattern too large'):
+ re2.compile('.{1000}', options=options)
+
+ def test_programsize_reverseprogramsize(self):
+ # Pins the forward and reverse program sizes reported for 'a+b'.
+ regexp = re2.compile('a+b')
+ self.assertEqual(7, regexp.programsize)
+ self.assertEqual(7, regexp.reverseprogramsize)
+
+ def test_programfanout_reverseprogramfanout(self):
+ # Pins the forward and reverse fanout lists reported for 'a+b'.
+ regexp = re2.compile('a+b')
+ self.assertListEqual([1, 1], regexp.programfanout)
+ self.assertListEqual([3], regexp.reverseprogramfanout)
+
+ # Columns: pattern, maxlen, expected (min, max) pair. A row whose expected
+ # value is None expects possiblematchrange() to raise re2.error instead.
+ @parameterized.parameters(
+ (u'abc', 0, None),
+ (b'abc', 0, None),
+ (u'abc', 10, (b'abc', b'abc')),
+ (b'abc', 10, (b'abc', b'abc')),
+ (u'ab*c', 10, (b'ab', b'ac')),
+ (b'ab*c', 10, (b'ab', b'ac')),
+ (u'ab+c', 10, (b'abb', b'abc')),
+ (b'ab+c', 10, (b'abb', b'abc')),
+ (u'ab?c', 10, (b'abc', b'ac')),
+ (b'ab?c', 10, (b'abc', b'ac')),
+ (u'.*', 10, (b'', b'\xf4\xbf\xbf\xc0')),
+ (b'.*', 10, None),
+ (u'\\C*', 10, None),
+ (b'\\C*', 10, None),
+ )
+ def test_possiblematchrange(self, pattern, maxlen, expected_min_max):
+ # For brevity, the string type of pattern determines the encoding.
+ # It would otherwise be possible to have bytes with UTF8, but as per
+ # the module docstring, it isn't permitted to have str with LATIN1.
+ options = re2.Options()
+ if isinstance(pattern, str):
+ options.encoding = re2.Options.Encoding.UTF8
+ else:
+ options.encoding = re2.Options.Encoding.LATIN1
+ regexp = re2.compile(pattern, options=options)
+ # The expected range is always given as bytes, whatever the pattern type.
+ if expected_min_max:
+ self.assertEqual(expected_min_max, regexp.possiblematchrange(maxlen))
+ else:
+ with self.assertRaisesRegex(re2.error, 'failed to compute match range'):
+ regexp.possiblematchrange(maxlen)
+
+
+# One row of shared test data:
+#   pattern, text: the inputs, both str or both bytes;
+#   spans: expected span of group 0 followed by each capturing group, where
+#     (-1, -1) marks a group that did not take part in the match, or None
+#     when no match is expected at all;
+#   search, match, fullmatch: whether the corresponding function is expected
+#     to find a match for this row.
+Params = collections.namedtuple(
+ 'Params', ('pattern', 'text', 'spans', 'search', 'match', 'fullmatch'))
+
+# Every case appears twice, once as str and once as bytes.
+PARAMS = [
+ Params(u'\\d+', u'Hello, world.', None, False, False, False),
+ Params(b'\\d+', b'Hello, world.', None, False, False, False),
+ Params(u'\\s+', u'Hello, world.', [(6, 7)], True, False, False),
+ Params(b'\\s+', b'Hello, world.', [(6, 7)], True, False, False),
+ Params(u'\\w+', u'Hello, world.', [(0, 5)], True, True, False),
+ Params(b'\\w+', b'Hello, world.', [(0, 5)], True, True, False),
+ Params(u'(\\d+)?', u'Hello, world.', [(0, 0), (-1, -1)], True, True, False),
+ Params(b'(\\d+)?', b'Hello, world.', [(0, 0), (-1, -1)], True, True, False),
+ Params(u'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?',
+ u'youtube_ads', [(0, 7), (-1, -1)], True, True, False),
+ Params(b'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?',
+ b'youtube_ads', [(0, 7), (-1, -1)], True, True, False),
+]
+
+
+# Replacement callback used by the sub()/subn() test rows: returns the whole
+# match (group 0) upper-cased.
+def upper(match):
+ return match.group().upper()
+
+
+class ReRegexpTest(parameterized.TestCase):
+ """Contains tests that apply to the re and re2 modules."""
+
+ # Module under test. Re2RegexpTest subclasses this class and sets MODULE to
+ # re2, so every test here runs against both modules.
+ MODULE = re
+
+ # A compiled regexp must survive a pickle round trip with its pattern intact.
+ @parameterized.parameters((p.pattern,) for p in PARAMS)
+ def test_pickle(self, pattern):
+ regexp = self.MODULE.compile(pattern)
+ rick = pickle.loads(pickle.dumps(regexp))
+ self.assertEqual(regexp.pattern, rick.pattern)
+
+ # Spans are only expected for rows whose search flag is set.
+ @parameterized.parameters(
+ (p.pattern, p.text, (p.spans if p.search else None)) for p in PARAMS)
+ def test_search(self, pattern, text, expected_spans):
+ match = self.MODULE.search(pattern, text)
+ if expected_spans is None:
+ self.assertIsNone(match)
+ else:
+ # Collect the span of group 0 and of every capturing group.
+ spans = [match.span(group) for group in range(match.re.groups + 1)]
+ self.assertListEqual(expected_spans, spans)
+
+ def test_search_with_pos_and_endpos(self):
+ regexp = self.MODULE.compile(u'.+') # empty string NOT allowed
+ text = u'I \u2665 RE2!'
+ # Note that len(text) is the position of the empty string at the end of
+ # text, so range() stops at len(text) + 1 in order to include len(text).
+ for pos in range(len(text) + 1):
+ for endpos in range(pos, len(text) + 1):
+ match = regexp.search(text, pos=pos, endpos=endpos)
+ if pos == endpos:
+ # An empty window cannot satisfy '.+'.
+ self.assertIsNone(match)
+ else:
+ # text has no newline, so '.+' must consume the whole window.
+ self.assertEqual(pos, match.pos)
+ self.assertEqual(endpos, match.endpos)
+ self.assertEqual(pos, match.start())
+ self.assertEqual(endpos, match.end())
+ self.assertTupleEqual((pos, endpos), match.span())
+
+ def test_search_with_bogus_pos_and_endpos(self):
+ regexp = self.MODULE.compile(u'.*') # empty string allowed
+ text = u'I \u2665 RE2!'
+
+ # Out-of-range values are expected to be clamped to [0, len(text)];
+ # len(text) is 8 here.
+ match = regexp.search(text, pos=-100)
+ self.assertEqual(0, match.pos)
+ match = regexp.search(text, pos=100)
+ self.assertEqual(8, match.pos)
+
+ match = regexp.search(text, endpos=-100)
+ self.assertEqual(0, match.endpos)
+ match = regexp.search(text, endpos=100)
+ self.assertEqual(8, match.endpos)
+
+ # A pos that ends up beyond endpos yields no match at all.
+ match = regexp.search(text, pos=100, endpos=-100)
+ self.assertIsNone(match)
+
+ # Spans are only expected for rows whose match flag is set.
+ @parameterized.parameters(
+ (p.pattern, p.text, (p.spans if p.match else None)) for p in PARAMS)
+ def test_match(self, pattern, text, expected_spans):
+ match = self.MODULE.match(pattern, text)
+ if expected_spans is None:
+ self.assertIsNone(match)
+ else:
+ spans = [match.span(group) for group in range(match.re.groups + 1)]
+ self.assertListEqual(expected_spans, spans)
+
+ # Spans are only expected for rows whose fullmatch flag is set.
+ @parameterized.parameters(
+ (p.pattern, p.text, (p.spans if p.fullmatch else None)) for p in PARAMS)
+ def test_fullmatch(self, pattern, text, expected_spans):
+ match = self.MODULE.fullmatch(pattern, text)
+ if expected_spans is None:
+ self.assertIsNone(match)
+ else:
+ spans = [match.span(group) for group in range(match.re.groups + 1)]
+ self.assertListEqual(expected_spans, spans)
+
+ # Columns: pattern, text, expected span of each successive match. Several
+ # rows expect empty matches, including one at the very end of the text.
+ @parameterized.parameters(
+ (u'', u'', [(0, 0)]),
+ (b'', b'', [(0, 0)]),
+ (u'', u'x', [(0, 0), (1, 1)]),
+ (b'', b'x', [(0, 0), (1, 1)]),
+ (u'', u'xy', [(0, 0), (1, 1), (2, 2)]),
+ (b'', b'xy', [(0, 0), (1, 1), (2, 2)]),
+ (u'.', u'xy', [(0, 1), (1, 2)]),
+ (b'.', b'xy', [(0, 1), (1, 2)]),
+ (u'x', u'xy', [(0, 1)]),
+ (b'x', b'xy', [(0, 1)]),
+ (u'y', u'xy', [(1, 2)]),
+ (b'y', b'xy', [(1, 2)]),
+ (u'z', u'xy', []),
+ (b'z', b'xy', []),
+ (u'\\w*', u'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12),
+ (13, 13)]),
+ (b'\\w*', b'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12),
+ (13, 13)]),
+ )
+ def test_finditer(self, pattern, text, expected_matches):
+ matches = [match.span() for match in self.MODULE.finditer(pattern, text)]
+ self.assertListEqual(expected_matches, matches)
+
+ # Columns: pattern, text, expected findall() result. Without groups the
+ # whole match is expected; with one group, that group; with several
+ # groups, a tuple per match in which an unmatched group is the empty string.
+ @parameterized.parameters(
+ (u'\\w\\w+', u'Hello, world.', [u'Hello', u'world']),
+ (b'\\w\\w+', b'Hello, world.', [b'Hello', b'world']),
+ (u'(\\w)\\w+', u'Hello, world.', [u'H', u'w']),
+ (b'(\\w)\\w+', b'Hello, world.', [b'H', b'w']),
+ (u'(\\w)(\\w+)', u'Hello, world.', [(u'H', u'ello'), (u'w', u'orld')]),
+ (b'(\\w)(\\w+)', b'Hello, world.', [(b'H', b'ello'), (b'w', b'orld')]),
+ (u'(\\w)(\\w+)?', u'Hello, w.', [(u'H', u'ello'), (u'w', u'')]),
+ (b'(\\w)(\\w+)?', b'Hello, w.', [(b'H', b'ello'), (b'w', b'')]),
+ )
+ def test_findall(self, pattern, text, expected_matches):
+ matches = self.MODULE.findall(pattern, text)
+ self.assertListEqual(expected_matches, matches)
+
+ # Columns: pattern, text, maxsplit, expected pieces. The rows expect a
+ # negative maxsplit to leave the text unsplit, and a capturing group to put
+ # the separators into the result.
+ @parameterized.parameters(
+ (u'\\W+', u'Hello, world.', -1, [u'Hello, world.']),
+ (b'\\W+', b'Hello, world.', -1, [b'Hello, world.']),
+ (u'\\W+', u'Hello, world.', 0, [u'Hello', u'world', u'']),
+ (b'\\W+', b'Hello, world.', 0, [b'Hello', b'world', b'']),
+ (u'\\W+', u'Hello, world.', 1, [u'Hello', u'world.']),
+ (b'\\W+', b'Hello, world.', 1, [b'Hello', b'world.']),
+ (u'(\\W+)', u'Hello, world.', -1, [u'Hello, world.']),
+ (b'(\\W+)', b'Hello, world.', -1, [b'Hello, world.']),
+ (u'(\\W+)', u'Hello, world.', 0, [u'Hello', u', ', u'world', u'.', u'']),
+ (b'(\\W+)', b'Hello, world.', 0, [b'Hello', b', ', b'world', b'.', b'']),
+ (u'(\\W+)', u'Hello, world.', 1, [u'Hello', u', ', u'world.']),
+ (b'(\\W+)', b'Hello, world.', 1, [b'Hello', b', ', b'world.']),
+ )
+ def test_split(self, pattern, text, maxsplit, expected_pieces):
+ pieces = self.MODULE.split(pattern, text, maxsplit)
+ self.assertListEqual(expected_pieces, pieces)
+
+ # Columns: pattern, repl (callable or template), text, count, expected
+ # result, expected number of substitutions. The rows expect a negative
+ # count to substitute nothing.
+ @parameterized.parameters(
+ (u'\\w+', upper, u'Hello, world.', -1, u'Hello, world.', 0),
+ (b'\\w+', upper, b'Hello, world.', -1, b'Hello, world.', 0),
+ (u'\\w+', upper, u'Hello, world.', 0, u'HELLO, WORLD.', 2),
+ (b'\\w+', upper, b'Hello, world.', 0, b'HELLO, WORLD.', 2),
+ (u'\\w+', upper, u'Hello, world.', 1, u'HELLO, world.', 1),
+ (b'\\w+', upper, b'Hello, world.', 1, b'HELLO, world.', 1),
+ (u'\\w+', u'MEEP', u'Hello, world.', -1, u'Hello, world.', 0),
+ (b'\\w+', b'MEEP', b'Hello, world.', -1, b'Hello, world.', 0),
+ (u'\\w+', u'MEEP', u'Hello, world.', 0, u'MEEP, MEEP.', 2),
+ (b'\\w+', b'MEEP', b'Hello, world.', 0, b'MEEP, MEEP.', 2),
+ (u'\\w+', u'MEEP', u'Hello, world.', 1, u'MEEP, world.', 1),
+ (b'\\w+', b'MEEP', b'Hello, world.', 1, b'MEEP, world.', 1),
+ (u'\\\\', u'\\\\\\\\', u'Hello,\\world.', 0, u'Hello,\\\\world.', 1),
+ (b'\\\\', b'\\\\\\\\', b'Hello,\\world.', 0, b'Hello,\\\\world.', 1),
+ )
+ def test_subn_sub(self, pattern, repl, text, count, expected_joined_pieces,
+ expected_numsplit):
+ joined_pieces, numsplit = self.MODULE.subn(pattern, repl, text, count)
+ self.assertEqual(expected_joined_pieces, joined_pieces)
+ self.assertEqual(expected_numsplit, numsplit)
+
+ # sub() must produce the same text as subn(), without the count.
+ joined_pieces = self.MODULE.sub(pattern, repl, text, count)
+ self.assertEqual(expected_joined_pieces, joined_pieces)
+
+
+class Re2RegexpTest(ReRegexpTest):
+ """Contains tests that apply to the re2 module only."""
+
+ # Re-runs every inherited ReRegexpTest test against re2 instead of re.
+ MODULE = re2
+
+ def test_compile_with_latin1_encoding(self):
+ # A str pattern combined with LATIN1 encoding must be rejected.
+ options = re2.Options()
+ options.encoding = re2.Options.Encoding.LATIN1
+ with self.assertRaisesRegex(re2.error,
+ ('string type of pattern is str, but '
+ 'encoding specified in options is LATIN1')):
+ re2.compile(u'.?', options=options)
+
+ # ... whereas this is fine, of course.
+ re2.compile(b'.?', options=options)
+
+ # The str row expects spans counted in characters; the bytes row expects
+ # spans counted in bytes, where each U+0CA0 takes three UTF-8 bytes.
+ @parameterized.parameters(
+ (u'\\p{Lo}', u'\u0ca0_\u0ca0', [(0, 1), (2, 3)]),
+ (b'\\p{Lo}', b'\xe0\xb2\xa0_\xe0\xb2\xa0', [(0, 3), (4, 7)]),
+ )
+ def test_finditer_with_utf8(self, pattern, text, expected_matches):
+ matches = [match.span() for match in self.MODULE.finditer(pattern, text)]
+ self.assertListEqual(expected_matches, matches)
+
+ def test_purge(self):
+ # Compiling populates the cache behind re2._Regexp._make; purge() must
+ # empty it again.
+ re2.compile('Goodbye, world.')
+ self.assertGreater(re2._Regexp._make.cache_info().currsize, 0)
+ re2.purge()
+ self.assertEqual(re2._Regexp._make.cache_info().currsize, 0)
+
+
+class Re2EscapeTest(parameterized.TestCase):
+ """Contains tests that apply to the re2 module only.
+
+ We disagree with Python on the escaping of some characters,
+ so there is no point attempting to verify consistency.
+ """
+
+ # Columns: input, expected escaped form. The result keeps the string type
+ # of the input, and '*', '+' and '?' each gain a leading backslash.
+ @parameterized.parameters(
+ (u'a*b+c?', u'a\\*b\\+c\\?'),
+ (b'a*b+c?', b'a\\*b\\+c\\?'),
+ )
+ def test_escape(self, pattern, expected_escaped):
+ escaped = re2.escape(pattern)
+ self.assertEqual(expected_escaped, escaped)
+
+
+class ReMatchTest(parameterized.TestCase):
+ """Contains tests that apply to the re and re2 modules."""
+
+ # Module under test. Re2MatchTest subclasses this class and sets MODULE to
+ # re2, so every test here runs against both modules.
+ MODULE = re
+
+ def test_expand(self):
+ pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
+ text = u'I \u2665 RE2!\n'
+ match = self.MODULE.search(pattern, text)
+
+ # Numbered, \g<number> and \g<name> references must expand identically.
+ self.assertEqual(u'\u2665\n!', match.expand(u'\\1\\n\\2'))
+ self.assertEqual(u'\u2665\n!', match.expand(u'\\g<1>\\n\\g<2>'))
+ self.assertEqual(u'\u2665\n!', match.expand(u'\\g<S>\\n\\g<P>'))
+ # An escaped backslash stays literal and does not start a group reference.
+ self.assertEqual(u'\\1\\2\n\u2665!', match.expand(u'\\\\1\\\\2\\n\\1\\2'))
+
+ def test_expand_with_octal(self):
+ # Nine empty groups precede the word group, which makes it group 10.
+ pattern = u'()()()()()()()()()(\\w+)'
+ text = u'Hello, world.'
+ match = self.MODULE.search(pattern, text)
+
+ self.assertEqual(u'Hello\n', match.expand(u'\\g<0>\\n'))
+ self.assertEqual(u'Hello\n', match.expand(u'\\g<10>\\n'))
+
+ # A backslash followed by 0 starts an octal escape of at most three
+ # digits; a fourth digit is expected to be literal text.
+ self.assertEqual(u'\x00\n', match.expand(u'\\0\\n'))
+ self.assertEqual(u'\x00\n', match.expand(u'\\00\\n'))
+ self.assertEqual(u'\x00\n', match.expand(u'\\000\\n'))
+ self.assertEqual(u'\x000\n', match.expand(u'\\0000\\n'))
+
+ # One or two digits are a group reference; three octal digits are an
+ # octal escape (octal 100 is '@').
+ self.assertEqual(u'\n', match.expand(u'\\1\\n'))
+ self.assertEqual(u'Hello\n', match.expand(u'\\10\\n'))
+ self.assertEqual(u'@\n', match.expand(u'\\100\\n'))
+ self.assertEqual(u'@0\n', match.expand(u'\\1000\\n'))
+
+ def test_getitem_group_groups_groupdict(self):
+ pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
+ text = u'Hello, world.\nI \u2665 RE2!\nGoodbye, world.\n'
+ match = self.MODULE.search(pattern, text)
+
+ # Subscript access by group number and by group name.
+ self.assertEqual(u'\u2665 RE2!', match[0])
+ self.assertEqual(u'\u2665', match[1])
+ self.assertEqual(u'!', match[2])
+ self.assertEqual(u'\u2665', match[u'S'])
+ self.assertEqual(u'!', match[u'P'])
+
+ # group() with zero or one argument must agree with subscript access.
+ self.assertEqual(u'\u2665 RE2!', match.group())
+ self.assertEqual(u'\u2665 RE2!', match.group(0))
+ self.assertEqual(u'\u2665', match.group(1))
+ self.assertEqual(u'!', match.group(2))
+ self.assertEqual(u'\u2665', match.group(u'S'))
+ self.assertEqual(u'!', match.group(u'P'))
+
+ # Multi-argument group(), groups() and groupdict().
+ self.assertTupleEqual((u'\u2665', u'!'), match.group(1, 2))
+ self.assertTupleEqual((u'\u2665', u'!'), match.group(u'S', u'P'))
+ self.assertTupleEqual((u'\u2665', u'!'), match.groups())
+ self.assertDictEqual({u'S': u'\u2665', u'P': u'!'}, match.groupdict())
+
+ def test_bogus_group_start_end_and_span(self):
+ pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
+ text = u'I \u2665 RE2!\n'
+ match = self.MODULE.search(pattern, text)
+
+ # The pattern has two groups, so -1, 3 and the unknown name 'X' must all
+ # raise IndexError.
+ self.assertRaises(IndexError, match.group, -1)
+ self.assertRaises(IndexError, match.group, 3)
+ self.assertRaises(IndexError, match.group, 'X')
+
+ self.assertRaises(IndexError, match.start, -1)
+ self.assertRaises(IndexError, match.start, 3)
+
+ self.assertRaises(IndexError, match.end, -1)
+ self.assertRaises(IndexError, match.end, 3)
+
+ self.assertRaises(IndexError, match.span, -1)
+ self.assertRaises(IndexError, match.span, 3)
+
+ # Columns: pattern, text, expected lastindex, expected lastgroup. A None
+ # lastindex means no match is expected at all; a None lastgroup alongside
+ # a lastindex means the last matched group is expected to be unnamed.
+ @parameterized.parameters(
+ (u'((a)(b))((c)(d))', u'foo bar qux', None, None),
+ (u'(?P<one>(a)(b))((c)(d))', u'foo abcd qux', 4, None),
+ (u'(?P<one>(a)(b))(?P<four>(c)(d))', u'foo abcd qux', 4, 'four'),
+ )
+ def test_lastindex_lastgroup(self, pattern, text, expected_lastindex,
+ expected_lastgroup):
+ match = self.MODULE.search(pattern, text)
+ if expected_lastindex is None:
+ self.assertIsNone(match)
+ else:
+ self.assertEqual(expected_lastindex, match.lastindex)
+ self.assertEqual(expected_lastgroup, match.lastgroup)
+
+
+class Re2MatchTest(ReMatchTest):
+ """Contains tests that apply to the re2 module only."""
+
+ # Defines no tests of its own: it re-runs every inherited ReMatchTest test
+ # against re2 instead of re.
+ MODULE = re2
+
+
+class SetTest(absltest.TestCase):
+  """Exercises re2.Set through its SearchSet, MatchSet and FullMatchSet constructors.
+
+  Each test adds the same three patterns (indices 0, 1 and 2), checks that a
+  malformed pattern is rejected, compiles the set and then matches the same
+  text; only the expected result differs per constructor.
+  """
+
+  def test_search(self):
+    s = re2.Set.SearchSet()
+    self.assertEqual(0, s.Add('\\d+'))
+    self.assertEqual(1, s.Add('\\s+'))
+    self.assertEqual(2, s.Add('\\w+'))
+    # A malformed pattern must be rejected with re2.error.
+    self.assertRaises(re2.error, s.Add, '(MEEP')
+    s.Compile()
+    # assertCountEqual is the Python 3 unittest name for the order-insensitive
+    # comparison; assertItemsEqual is the Python 2 name and is not part of
+    # Python 3's unittest.TestCase.
+    self.assertCountEqual([1, 2], s.Match('Hello, world.'))
+
+  def test_match(self):
+    s = re2.Set.MatchSet()
+    self.assertEqual(0, s.Add('\\d+'))
+    self.assertEqual(1, s.Add('\\s+'))
+    self.assertEqual(2, s.Add('\\w+'))
+    self.assertRaises(re2.error, s.Add, '(MEEP')
+    s.Compile()
+    # Only '\\w+' is expected to match at the start of the text.
+    self.assertCountEqual([2], s.Match('Hello, world.'))
+
+  def test_fullmatch(self):
+    s = re2.Set.FullMatchSet()
+    self.assertEqual(0, s.Add('\\d+'))
+    self.assertEqual(1, s.Add('\\s+'))
+    self.assertEqual(2, s.Add('\\w+'))
+    self.assertRaises(re2.error, s.Add, '(MEEP')
+    s.Compile()
+    # No pattern covers the whole text, and Match() reports that as None.
+    self.assertIsNone(s.Match('Hello, world.'))
+
+
+class FilterTest(absltest.TestCase):
+  """Exercises re2.Filter: Add(), Compile(), Match() and re()."""
+
+  def test_match(self):
+    f = re2.Filter()
+    self.assertEqual(0, f.Add('Hello, \\w+\\.'))
+    self.assertEqual(1, f.Add('\\w+, world\\.'))
+    self.assertEqual(2, f.Add('Goodbye, \\w+\\.'))
+    # A malformed pattern must be rejected with re2.error.
+    self.assertRaises(re2.error, f.Add, '(MEEP')
+    f.Compile()
+    # assertCountEqual is the Python 3 unittest name for the order-insensitive
+    # comparison; assertItemsEqual is the Python 2 name and is not part of
+    # Python 3's unittest.TestCase.
+    #
+    # With potential=True the upper-cased text still reports patterns 0 and 1;
+    # without it, only the correctly cased text matches.
+    self.assertCountEqual([0, 1], f.Match('Hello, world.', potential=True))
+    self.assertCountEqual([0, 1], f.Match('HELLO, WORLD.', potential=True))
+    self.assertCountEqual([0, 1], f.Match('Hello, world.'))
+    self.assertIsNone(f.Match('HELLO, WORLD.'))
+
+    # Three patterns were added, so valid indices for re() are 0 through 2.
+    self.assertRaises(IndexError, f.re, -1)
+    self.assertRaises(IndexError, f.re, 3)
+    self.assertEqual('Goodbye, \\w+\\.', f.re(2).pattern)
+    # Verify whether the underlying RE2 object is usable.
+    self.assertEqual(0, f.re(2).groups)
+
+
+if __name__ == '__main__':
+ absltest.main()
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..3bd11ed
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,117 @@
+# Copyright 2019 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import os
+import setuptools
+import setuptools.command.build_ext
+import shutil
+import sys
+
+long_description = r"""A drop-in replacement for the re module.
+
+It uses RE2 under the hood, of course, so various PCRE features
+(e.g. backreferences, look-around assertions) are not supported.
+See https://github.com/google/re2/wiki/Syntax for the canonical
+reference, but known syntactic "gotchas" relative to Python are:
+
+ * PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
+ but calls it \Z. You must rewrite \Z to \z in pattern strings.
+
+Known differences between this module's API and the re module's API:
+
+ * The error class does not provide any error information as attributes.
+ * The Options class replaces the re module's flags with RE2's options as
+ gettable/settable properties. Please see re2.h for their documentation.
+ * The pattern string and the input string do not have to be the same type.
+ Any str will be encoded to UTF-8.
+ * The pattern string cannot be str if the options specify Latin-1 encoding.
+
+Known issues with regard to building the C++ extension:
+
+ * Building requires RE2 to be installed on your system.
+ On Debian, for example, install the libre2-dev package.
+ * Building requires pybind11 to be installed on your system OR venv.
+ On Debian, for example, install the pybind11-dev package.
+ For a venv, install the pybind11 package from PyPI.
+ * Building on macOS is known to work, but has been known to fail.
+ For example, the system Python may not know which compiler flags
+ to set when building bindings for software installed by Homebrew;
+ see https://docs.brew.sh/Homebrew-and-Python#brewed-python-modules.
+ * Building on Windows has not been tested yet and will probably fail.
+"""
+
+
+class BuildExt(setuptools.command.build_ext.build_ext):
+  """build_ext command that builds the extension with Bazel on GitHub Actions.
+
+  When GITHUB_ACTIONS is not set in the environment, the stock build_ext
+  behaviour is used unchanged.
+  """
+
+  def build_extension(self, ext):
+    if 'GITHUB_ACTIONS' not in os.environ:
+      return super().build_extension(ext)
+
+    # For @pybind11_bazel's `python_configure()`.
+    os.environ['PYTHON_BIN_PATH'] = sys.executable
+
+    # Assemble `bazel build [--cpu=...] --compilation_mode=opt -- :all`;
+    # --cpu is only passed when BAZEL_CPU is set.
+    build_cmd = ['bazel', 'build']
+    if 'BAZEL_CPU' in os.environ:
+      build_cmd.append('--cpu={}'.format(os.environ['BAZEL_CPU'].lower()))
+    build_cmd.extend(['--compilation_mode=opt', '--', ':all'])
+    self.spawn(build_cmd)
+
+    # This ensures that f'_re2.{importlib.machinery.EXTENSION_SUFFIXES[0]}'
+    # is the filename in the destination directory, which is what's needed.
+    destination = self.get_ext_fullpath(ext.name)
+    shutil.copyfile('../bazel-bin/python/_re2.so', destination)
+
+    # Discard Bazel's outputs once the built extension has been copied out.
+    self.spawn(['bazel', 'clean', '--expunge'])
+
+
+def options():
+  """Returns the per-command options mapping passed to setuptools.setup().
+
+  The 'bdist_wheel' entry carries 'plat_name' only when the PLAT_NAME
+  environment variable is set; otherwise it is an empty dict.
+  """
+  wheel_options = {}
+  if 'PLAT_NAME' in os.environ:
+    wheel_options['plat_name'] = os.environ['PLAT_NAME']
+  return {'bdist_wheel': wheel_options}
+
+
+def include_dirs():
+  """Yields pybind11's include directory if pybind11 can be imported.
+
+  Yields nothing when the import raises ModuleNotFoundError.
+  """
+  try:
+    import pybind11
+    pybind11_include = pybind11.get_include()
+    yield pybind11_include
+  except ModuleNotFoundError:
+    return
+
+
+# The C++ extension: built from _re2.cc, linked against the re2 library, with
+# pybind11's include directory added when pybind11 is importable.
+ext_module = setuptools.Extension(
+ name='_re2',
+ sources=['_re2.cc'],
+ include_dirs=list(include_dirs()),
+ libraries=['re2'],
+ extra_compile_args=['-fvisibility=hidden'],
+)
+
+# Package metadata: ships the pure-Python re2 module together with the _re2
+# extension defined above.
+setuptools.setup(
+ name='google-re2',
+ version='1.1',
+ description='RE2 Python bindings',
+ long_description=long_description,
+ long_description_content_type='text/plain',
+ author='The RE2 Authors',
+ author_email='re2-dev@googlegroups.com',
+ url='https://github.com/google/re2',
+ py_modules=['re2'],
+ ext_modules=[ext_module],
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Intended Audience :: Developers',
+ 'License :: OSI Approved :: BSD License',
+ 'Programming Language :: C++',
+ 'Programming Language :: Python :: 3.8',
+ ],
+ options=options(),
+ # Route the build_ext command through the BuildExt class defined above.
+ cmdclass={'build_ext': BuildExt},
+ # PEP 440 compatible-release clause: equivalent to >= 3.8, == 3.*.
+ python_requires='~=3.8',
+)
diff --git a/re2.pc b/re2.pc.in
index a590ab8..c6182d8 100644
--- a/re2.pc
+++ b/re2.pc.in
@@ -1,8 +1,9 @@
-includedir=@includedir@
-libdir=@libdir@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
Name: re2
Description: RE2 is a fast, safe, thread-friendly regular expression engine.
-Version: 0.0.0
+Requires: @REQUIRES@
+Version: @SONAME@.0.0
Cflags: -pthread -I${includedir}
Libs: -pthread -L${libdir} -lre2
diff --git a/re2/bitmap256.cc b/re2/bitmap256.cc
new file mode 100644
index 0000000..f6fbca3
--- /dev/null
+++ b/re2/bitmap256.cc
@@ -0,0 +1,44 @@
+// Copyright 2023 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/bitmap256.h"
+
+#include <stdint.h>
+
+#include "absl/base/macros.h"
+#include "util/logging.h"
+
+namespace re2 {
+
+// Returns the index of the lowest set bit at position c or above, or -1 if
+// there is none. c must be in [0, 255].
+int Bitmap256::FindNextSetBit(int c) const {
+  DCHECK_GE(c, 0);
+  DCHECK_LE(c, 255);
+
+  // Start with the word that holds bit c, ignoring the bits below c.
+  int i = c / 64;
+  const uint64_t masked = words_[i] & (~uint64_t{0} << (c % 64));
+  if (masked != 0)
+    return (i * 64) + FindLSBSet(masked);
+
+  // Otherwise the answer, if any, is the lowest set bit of the first
+  // nonzero word among those that follow.
+  for (i++; i < 4; i++) {
+    if (words_[i] != 0)
+      return (i * 64) + FindLSBSet(words_[i]);
+  }
+  return -1;
+}
+
+} // namespace re2
diff --git a/re2/bitmap256.h b/re2/bitmap256.h
index 4899379..293b31d 100644
--- a/re2/bitmap256.h
+++ b/re2/bitmap256.h
@@ -11,7 +11,6 @@
#include <stdint.h>
#include <string.h>
-#include "util/util.h"
#include "util/logging.h"
namespace re2 {
@@ -82,36 +81,6 @@ class Bitmap256 {
uint64_t words_[4];
};
-int Bitmap256::FindNextSetBit(int c) const {
- DCHECK_GE(c, 0);
- DCHECK_LE(c, 255);
-
- // Check the word that contains the bit. Mask out any lower bits.
- int i = c / 64;
- uint64_t word = words_[i] & (~uint64_t{0} << (c % 64));
- if (word != 0)
- return (i * 64) + FindLSBSet(word);
-
- // Check any following words.
- i++;
- switch (i) {
- case 1:
- if (words_[1] != 0)
- return (1 * 64) + FindLSBSet(words_[1]);
- FALLTHROUGH_INTENDED;
- case 2:
- if (words_[2] != 0)
- return (2 * 64) + FindLSBSet(words_[2]);
- FALLTHROUGH_INTENDED;
- case 3:
- if (words_[3] != 0)
- return (3 * 64) + FindLSBSet(words_[3]);
- FALLTHROUGH_INTENDED;
- default:
- return -1;
- }
-}
-
} // namespace re2
#endif // RE2_BITMAP256_H_
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index 877e548..38a0b87 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -42,9 +42,8 @@ class BitState {
// The usual Search prototype.
// Can only call Search once per BitState.
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch);
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool longest, absl::string_view* submatch, int nsubmatch);
private:
inline bool ShouldVisit(int id, const char* p);
@@ -53,14 +52,14 @@ class BitState {
bool TrySearch(int id, const char* p);
// Search parameters
- Prog* prog_; // program being run
- StringPiece text_; // text being searched
- StringPiece context_; // greater context of text being searched
- bool anchored_; // whether search is anchored at text.begin()
- bool longest_; // whether search wants leftmost-longest match
- bool endmatch_; // whether match must end at text.end()
- StringPiece* submatch_; // submatches to fill in
- int nsubmatch_; // # of submatches to fill in
+ Prog* prog_; // program being run
+ absl::string_view text_; // text being searched
+ absl::string_view context_; // greater context of text being searched
+ bool anchored_; // whether search is anchored at text.begin()
+ bool longest_; // whether search wants leftmost-longest match
+ bool endmatch_; // whether match must end at text.end()
+ absl::string_view* submatch_; // submatches to fill in
+ int nsubmatch_; // # of submatches to fill in
// Search state
static constexpr int kVisitedBits = 64;
@@ -256,9 +255,9 @@ bool BitState::TrySearch(int id0, const char* p0) {
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
for (int i = 0; i < nsubmatch_; i++)
- submatch_[i] =
- StringPiece(cap_[2 * i],
- static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
+ submatch_[i] = absl::string_view(
+ cap_[2 * i],
+ static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
// If going for first match, we're done.
@@ -285,9 +284,9 @@ bool BitState::TrySearch(int id0, const char* p0) {
}
// Search text (within context) for prog_.
-bool BitState::Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch) {
+bool BitState::Search(absl::string_view text, absl::string_view context,
+ bool anchored, bool longest, absl::string_view* submatch,
+ int nsubmatch) {
// Search parameters.
text_ = text;
context_ = context;
@@ -303,7 +302,7 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
submatch_ = submatch;
nsubmatch_ = nsubmatch;
for (int i = 0; i < nsubmatch_; i++)
- submatch_[i] = StringPiece();
+ submatch_[i] = absl::string_view();
// Allocate scratch space.
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
@@ -353,16 +352,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
}
// Bit-state search.
-bool Prog::SearchBitState(const StringPiece& text,
- const StringPiece& context,
- Anchor anchor,
- MatchKind kind,
- StringPiece* match,
- int nmatch) {
+bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind,
+ absl::string_view* match, int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
- StringPiece sp0;
+ absl::string_view sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
diff --git a/re2/compile.cc b/re2/compile.cc
index 61d801a..aa79887 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -10,9 +10,10 @@
#include <stdint.h>
#include <string.h>
-#include <unordered_map>
#include <utility>
+#include "absl/base/macros.h"
+#include "absl/container/flat_hash_map.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
@@ -211,7 +212,7 @@ class Compiler : public Regexp::Walker<Frag> {
int64_t max_mem_; // Total memory budget.
- std::unordered_map<uint64_t, int> rune_cache_;
+ absl::flat_hash_map<uint64_t, int> rune_cache_;
Frag rune_range_;
RE2::Anchor anchor_; // anchor mode for RE2::Set
@@ -478,7 +479,7 @@ static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
int next) {
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
- std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
+ absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
if (it != rune_cache_.end())
return it->second;
int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
@@ -789,8 +790,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
// Should not be called.
Frag Compiler::Copy(Frag arg) {
// We're using WalkExponential; there should be no copying.
- LOG(DFATAL) << "Compiler::Copy called!";
failed_ = true;
+ LOG(DFATAL) << "Compiler::Copy called!";
return NoMatch();
}
@@ -916,8 +917,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
CharClass* cc = re->cc();
if (cc->empty()) {
// This can't happen.
- LOG(DFATAL) << "No ranges in char class";
failed_ = true;
+ LOG(DFATAL) << "No ranges in char class";
return NoMatch();
}
@@ -974,8 +975,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
case kRegexpNoWordBoundary:
return EmptyWidth(kEmptyNonWordBoundary);
}
- LOG(DFATAL) << "Missing case in Compiler: " << re->op();
failed_ = true;
+ LOG(DFATAL) << "Missing case in Compiler: " << re->op();
return NoMatch();
}
@@ -1243,7 +1244,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
// Make sure DFA has enough memory to operate,
// since we're not going to fall back to the NFA.
bool dfa_failed = false;
- StringPiece sp = "hello, world";
+ absl::string_view sp = "hello, world";
prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
NULL, &dfa_failed, NULL);
if (dfa_failed) {
diff --git a/re2/dfa.cc b/re2/dfa.cc
index d47c7d5..41fc61d 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -28,23 +28,25 @@
#include <algorithm>
#include <atomic>
#include <deque>
-#include <mutex>
#include <new>
#include <string>
-#include <unordered_map>
-#include <unordered_set>
#include <utility>
#include <vector>
+#include "absl/base/call_once.h"
+#include "absl/base/macros.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
#include "util/logging.h"
-#include "util/mix.h"
-#include "util/mutex.h"
#include "util/strutil.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/sparse_set.h"
-#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for DFA::State::next_.
#ifdef _MSC_VER
@@ -88,9 +90,9 @@ class DFA {
// returning the leftmost end of the match instead of the rightmost one.
// If the DFA cannot complete the search (for example, if it is out of
// memory), it sets *failed and returns false.
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool want_earliest_match, bool run_forward,
- bool* failed, const char** ep, SparseSet* matches);
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool want_earliest_match, bool run_forward, bool* failed,
+ const char** ep, SparseSet* matches);
// Builds out all states for the entire DFA.
// If cb is not empty, it receives one callback per state built.
@@ -114,21 +116,26 @@ class DFA {
struct State {
inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
+ template <typename H>
+ friend H AbslHashValue(H h, const State& a) {
+ const absl::Span<const int> ainst(a.inst_, a.ninst_);
+ return H::combine(std::move(h), a.flag_, ainst);
+ }
+
+ friend bool operator==(const State& a, const State& b) {
+ const absl::Span<const int> ainst(a.inst_, a.ninst_);
+ const absl::Span<const int> binst(b.inst_, b.ninst_);
+ return &a == &b || (a.flag_ == b.flag_ && ainst == binst);
+ }
+
int* inst_; // Instruction pointers in the state.
int ninst_; // # of inst_ pointers.
uint32_t flag_; // Empty string bitfield flags in effect on the way
// into this state, along with kFlagMatch if this
// is a matching state.
-// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1).
-// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932)
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1
- std::atomic<State*> next_[0]; // Outgoing arrows from State,
-#else
std::atomic<State*> next_[]; // Outgoing arrows from State,
-#endif
-
- // one per input byte class
+ // one per input byte class
};
enum {
@@ -143,11 +150,7 @@ class DFA {
struct StateHash {
size_t operator()(const State* a) const {
DCHECK(a != NULL);
- HashMix mix(a->flag_);
- for (int i = 0; i < a->ninst_; i++)
- mix.Mix(a->inst_[i]);
- mix.Mix(0);
- return mix.get();
+ return absl::Hash<State>()(*a);
}
};
@@ -155,24 +158,15 @@ class DFA {
bool operator()(const State* a, const State* b) const {
DCHECK(a != NULL);
DCHECK(b != NULL);
- if (a == b)
- return true;
- if (a->flag_ != b->flag_)
- return false;
- if (a->ninst_ != b->ninst_)
- return false;
- for (int i = 0; i < a->ninst_; i++)
- if (a->inst_[i] != b->inst_[i])
- return false;
- return true;
+ return *a == *b;
}
};
- typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
+ typedef absl::flat_hash_set<State*, StateHash, StateEqual> StateSet;
private:
// Make it easier to swap in a scalable reader-writer mutex.
- using CacheMutex = Mutex;
+ using CacheMutex = absl::Mutex;
enum {
// Indices into start_ for unanchored searches.
@@ -238,7 +232,7 @@ class DFA {
// Search parameters
struct SearchParams {
- SearchParams(const StringPiece& text, const StringPiece& context,
+ SearchParams(absl::string_view text, absl::string_view context,
RWLocker* cache_lock)
: text(text),
context(context),
@@ -252,8 +246,8 @@ class DFA {
ep(NULL),
matches(NULL) {}
- StringPiece text;
- StringPiece context;
+ absl::string_view text;
+ absl::string_view context;
bool anchored;
bool can_prefix_accel;
bool want_earliest_match;
@@ -325,7 +319,7 @@ class DFA {
Prog::MatchKind kind_; // The kind of DFA.
bool init_failed_; // initialization failed (out of memory)
- Mutex mutex_; // mutex_ >= cache_mutex_.r
+ absl::Mutex mutex_; // mutex_ >= cache_mutex_.r
// Scratch areas, protected by mutex_.
Workq* q0_; // Two pre-allocated work queues.
@@ -428,7 +422,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
q1_(NULL),
mem_budget_(max_mem) {
if (ExtraDebug)
- fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str());
+ absl::FPrintF(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored());
int nmark = 0;
if (kind_ == Prog::kLongestMatch)
nmark = prog_->size();
@@ -498,7 +492,7 @@ std::string DFA::DumpWorkq(Workq* q) {
s += "|";
sep = "";
} else {
- s += StringPrintf("%s%d", sep, *it);
+ s += absl::StrFormat("%s%d", sep, *it);
sep = ",";
}
}
@@ -515,7 +509,7 @@ std::string DFA::DumpState(State* state) {
return "*";
std::string s;
const char* sep = "";
- s += StringPrintf("(%p)", state);
+ s += absl::StrFormat("(%p)", state);
for (int i = 0; i < state->ninst_; i++) {
if (state->inst_[i] == Mark) {
s += "|";
@@ -524,11 +518,11 @@ std::string DFA::DumpState(State* state) {
s += "||";
sep = "";
} else {
- s += StringPrintf("%s%d", sep, state->inst_[i]);
+ s += absl::StrFormat("%s%d", sep, state->inst_[i]);
sep = ",";
}
}
- s += StringPrintf(" flag=%#x", state->flag_);
+ s += absl::StrFormat(" flag=%#x", state->flag_);
return s;
}
@@ -596,16 +590,35 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
//mutex_.AssertHeld();
// Construct array of instruction ids for the new state.
- // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
- // those are the only operators with any effect in
- // RunWorkqOnEmptyString or RunWorkqOnByte.
+ // In some cases, kInstAltMatch may trigger an upgrade to FullMatchState.
+ // Otherwise, "compress" q down to list heads for storage; StateToWorkq()
+ // will "decompress" it for computation by exploring from each list head.
+ //
+ // Historically, only kInstByteRange, kInstEmptyWidth and kInstMatch were
+ // useful to keep, but it turned out that kInstAlt was necessary to keep:
+ //
+ // > [*] kInstAlt would seem useless to record in a state, since
+ // > we've already followed both its arrows and saved all the
+ // > interesting states we can reach from there. The problem
+ // > is that one of the empty-width instructions might lead
+ // > back to the same kInstAlt (if an empty-width operator is starred),
+ // > producing a different evaluation order depending on whether
+ // > we keep the kInstAlt to begin with. Sigh.
+ // > A specific case that this affects is /(^|a)+/ matching "a".
+ // > If we don't save the kInstAlt, we will match the whole "a" (0,1)
+ // > but in fact the correct leftmost-first match is the leading "" (0,0).
+ //
+ // Recall that flattening transformed the Prog from "tree" form to "list"
+ // form: in the former, kInstAlt existed explicitly... and abundantly; in
+ // the latter, it's implied between the instructions that compose a list.
+ // Thus, because the information wasn't lost, the bug doesn't remanifest.
PODArray<int> inst(q->size());
int n = 0;
uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions
bool sawmatch = false; // whether queue contains guaranteed kInstMatch
bool sawmark = false; // whether queue contains a Mark
if (ExtraDebug)
- fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
+ absl::FPrintF(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q), flag);
for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
int id = *it;
if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
@@ -630,10 +643,10 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
(kind_ != Prog::kLongestMatch || !sawmark) &&
(flag & kFlagMatch)) {
if (ExtraDebug)
- fprintf(stderr, " -> FullMatchState\n");
+ absl::FPrintF(stderr, " -> FullMatchState\n");
return FullMatchState;
}
- FALLTHROUGH_INTENDED;
+ ABSL_FALLTHROUGH_INTENDED;
default:
// Record iff id is the head of its list, which must
// be the case if id-1 is the last of *its* list. :)
@@ -676,7 +689,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// if the state is *not* a matching state.
if (n == 0 && flag == 0) {
if (ExtraDebug)
- fprintf(stderr, " -> DeadState\n");
+ absl::FPrintF(stderr, " -> DeadState\n");
return DeadState;
}
@@ -740,25 +753,29 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
StateSet::iterator it = state_cache_.find(&state);
if (it != state_cache_.end()) {
if (ExtraDebug)
- fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
+ absl::FPrintF(stderr, " -cached-> %s\n", DumpState(*it));
return *it;
}
// Must have enough memory for new state.
// In addition to what we're going to allocate,
- // the state cache hash table seems to incur about 40 bytes per
- // State*, empirically.
- const int kStateCacheOverhead = 40;
+ // the state cache hash table seems to incur about 18 bytes per
+ // State*. Worst case for non-small sets is it being half full, where each
+ // value present takes up 1 byte hash sample plus the pointer itself.
+ const int kStateCacheOverhead = 18;
int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
- int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
- ninst*sizeof(int);
- if (mem_budget_ < mem + kStateCacheOverhead) {
+ int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>);
+ int instmem = ninst*sizeof(int);
+ if (mem_budget_ < mem + instmem + kStateCacheOverhead) {
mem_budget_ = -1;
return NULL;
}
- mem_budget_ -= mem + kStateCacheOverhead;
+ mem_budget_ -= mem + instmem + kStateCacheOverhead;
// Allocate new state along with room for next_ and inst_.
+ // inst_ is stored separately since it's colder; this also
+ // means that the States for a given DFA are the same size
+ // class, so the allocator can hopefully pack them better.
char* space = std::allocator<char>().allocate(mem);
State* s = new (space) State;
(void) new (s->next_) std::atomic<State*>[nnext];
@@ -766,12 +783,13 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
for (int i = 0; i < nnext; i++)
(void) new (s->next_ + i) std::atomic<State*>(NULL);
- s->inst_ = new (s->next_ + nnext) int[ninst];
- memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
+ s->inst_ = std::allocator<int>().allocate(ninst);
+ (void) new (s->inst_) int[ninst];
+ memmove(s->inst_, inst, instmem);
s->ninst_ = ninst;
s->flag_ = flag;
if (ExtraDebug)
- fprintf(stderr, " -> %s\n", DumpState(s).c_str());
+ absl::FPrintF(stderr, " -> %s\n", DumpState(s));
// Put state in cache and return it.
state_cache_.insert(s);
@@ -785,12 +803,12 @@ void DFA::ClearCache() {
while (begin != end) {
StateSet::iterator tmp = begin;
++begin;
+ // Deallocate the instruction array, which is stored separately as above.
+ std::allocator<int>().deallocate((*tmp)->inst_, (*tmp)->ninst_);
// Deallocate the blob of memory that we allocated in DFA::CachedState().
// We recompute mem in order to benefit from sized delete where possible.
- int ninst = (*tmp)->ninst_;
int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot
- int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
- ninst*sizeof(int);
+ int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>);
std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem);
}
state_cache_.clear();
@@ -985,8 +1003,8 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
}
if (ExtraDebug)
- fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n",
- DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch);
+ absl::FPrintF(stderr, "%s on %d[%#x] -> %s [%d]\n",
+ DumpWorkq(oldq), c, flag, DumpWorkq(newq), *ismatch);
}
// Processes input byte c in state, returning new state.
@@ -994,7 +1012,7 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
// Keep only one RunStateOnByte going
// even if the DFA is being run by multiple threads.
- MutexLock l(&mutex_);
+ absl::MutexLock l(&mutex_);
return RunStateOnByte(state, c);
}
@@ -1134,9 +1152,9 @@ DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
mu_->ReaderLock();
}
-// This function is marked as NO_THREAD_SAFETY_ANALYSIS because
+// This function is marked as ABSL_NO_THREAD_SAFETY_ANALYSIS because
// the annotations don't support lock upgrade.
-void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
+void DFA::RWLocker::LockForWriting() ABSL_NO_THREAD_SAFETY_ANALYSIS {
if (!writing_) {
mu_->ReaderUnlock();
mu_->WriterLock();
@@ -1246,7 +1264,7 @@ DFA::StateSaver::~StateSaver() {
DFA::State* DFA::StateSaver::Restore() {
if (is_special_)
return special_;
- MutexLock l(&dfa_->mutex_);
+ absl::MutexLock l(&dfa_->mutex_);
State* s = dfa_->CachedState(inst_, ninst_, flag_);
if (s == NULL)
LOG(DFATAL) << "StateSaver failed to restore state.";
@@ -1342,13 +1360,13 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
State* s = start;
if (ExtraDebug)
- fprintf(stderr, "@stx: %s\n", DumpState(s).c_str());
+ absl::FPrintF(stderr, "@stx: %s\n", DumpState(s));
if (s->IsMatch()) {
matched = true;
lastmatch = p;
if (ExtraDebug)
- fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str());
+ absl::FPrintF(stderr, "match @stx! [%s]\n", DumpState(s));
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
for (int i = s->ninst_ - 1; i >= 0; i--) {
int id = s->inst_[i];
@@ -1365,7 +1383,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
while (p != ep) {
if (ExtraDebug)
- fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
+ absl::FPrintF(stderr, "@%d: %s\n", p - bp, DumpState(s));
if (can_prefix_accel && s == start) {
// In start state, only way out is to find the prefix,
@@ -1465,7 +1483,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
else
lastmatch = p + 1;
if (ExtraDebug)
- fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str());
+ absl::FPrintF(stderr, "match @%d! [%s]\n", lastmatch - bp, DumpState(s));
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
for (int i = s->ninst_ - 1; i >= 0; i--) {
int id = s->inst_[i];
@@ -1484,7 +1502,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
// Process one more byte to see if it triggers a match.
// (Remember, matches are delayed one byte.)
if (ExtraDebug)
- fprintf(stderr, "@etx: %s\n", DumpState(s).c_str());
+ absl::FPrintF(stderr, "@etx: %s\n", DumpState(s));
int lastbyte;
if (run_forward) {
@@ -1532,7 +1550,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) {
matched = true;
lastmatch = p;
if (ExtraDebug)
- fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str());
+ absl::FPrintF(stderr, "match @etx! [%s]\n", DumpState(s));
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
for (int i = s->ninst_ - 1; i >= 0; i--) {
int id = s->inst_[i];
@@ -1623,8 +1641,8 @@ bool DFA::FastSearchLoop(SearchParams* params) {
// state for the DFA search loop. Fills in params and returns true on success.
// Returns false on failure.
bool DFA::AnalyzeSearch(SearchParams* params) {
- const StringPiece& text = params->text;
- const StringPiece& context = params->context;
+ absl::string_view text = params->text;
+ absl::string_view context = params->context;
// Sanity check: make sure that text lies within context.
if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
@@ -1675,8 +1693,8 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
if (!AnalyzeSearchHelper(params, info, flags)) {
ResetCache(params->cache_lock);
if (!AnalyzeSearchHelper(params, info, flags)) {
- LOG(DFATAL) << "Failed to analyze start state.";
params->failed = true;
+ LOG(DFATAL) << "Failed to analyze start state.";
return false;
}
}
@@ -1694,9 +1712,9 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
params->can_prefix_accel = true;
if (ExtraDebug)
- fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
- params->anchored, params->run_forward, flags,
- DumpState(params->start).c_str(), params->can_prefix_accel);
+ absl::FPrintF(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
+ params->anchored, params->run_forward, flags,
+ DumpState(params->start), params->can_prefix_accel);
return true;
}
@@ -1709,7 +1727,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
if (start != NULL)
return true;
- MutexLock l(&mutex_);
+ absl::MutexLock l(&mutex_);
start = info->start.load(std::memory_order_relaxed);
if (start != NULL)
return true;
@@ -1728,14 +1746,9 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
}
// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
-bool DFA::Search(const StringPiece& text,
- const StringPiece& context,
- bool anchored,
- bool want_earliest_match,
- bool run_forward,
- bool* failed,
- const char** epp,
- SparseSet* matches) {
+bool DFA::Search(absl::string_view text, absl::string_view context,
+ bool anchored, bool want_earliest_match, bool run_forward,
+ bool* failed, const char** epp, SparseSet* matches) {
*epp = NULL;
if (!ok()) {
*failed = true;
@@ -1744,9 +1757,9 @@ bool DFA::Search(const StringPiece& text,
*failed = false;
if (ExtraDebug) {
- fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
- fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
- std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_);
+ absl::FPrintF(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored());
+ absl::FPrintF(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
+ text, anchored, want_earliest_match, run_forward, kind_);
}
RWLocker l(&cache_mutex_);
@@ -1770,7 +1783,7 @@ bool DFA::Search(const StringPiece& text,
return true;
}
if (ExtraDebug)
- fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
+ absl::FPrintF(stderr, "start %s\n", DumpState(params.start));
bool ret = FastSearchLoop(&params);
if (params.failed) {
*failed = true;
@@ -1789,17 +1802,17 @@ DFA* Prog::GetDFA(MatchKind kind) {
// "longest match" DFA, because RE2 never does reverse
// "first match" searches.
if (kind == kFirstMatch) {
- std::call_once(dfa_first_once_, [](Prog* prog) {
+ absl::call_once(dfa_first_once_, [](Prog* prog) {
prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
}, this);
return dfa_first_;
} else if (kind == kManyMatch) {
- std::call_once(dfa_first_once_, [](Prog* prog) {
+ absl::call_once(dfa_first_once_, [](Prog* prog) {
prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_);
}, this);
return dfa_first_;
} else {
- std::call_once(dfa_longest_once_, [](Prog* prog) {
+ absl::call_once(dfa_longest_once_, [](Prog* prog) {
if (!prog->reversed_)
prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2);
else
@@ -1823,12 +1836,11 @@ void Prog::DeleteDFA(DFA* dfa) {
//
// This is the only external interface (class DFA only exists in this file).
//
-bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
- Anchor anchor, MatchKind kind, StringPiece* match0,
+bool Prog::SearchDFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match0,
bool* failed, SparseSet* matches) {
*failed = false;
- StringPiece context = const_context;
if (context.data() == NULL)
context = text;
bool caret = anchor_start();
@@ -1889,10 +1901,10 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
if (match0) {
if (reversed_)
*match0 =
- StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep));
+ absl::string_view(ep, static_cast<size_t>(text.data() + text.size() - ep));
else
*match0 =
- StringPiece(text.data(), static_cast<size_t>(ep - text.data()));
+ absl::string_view(text.data(), static_cast<size_t>(ep - text.data()));
}
return true;
}
@@ -1905,7 +1917,7 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
// Pick out start state for unanchored search
// at beginning of text.
RWLocker l(&cache_mutex_);
- SearchParams params(StringPiece(), StringPiece(), &l);
+ SearchParams params(absl::string_view(), absl::string_view(), &l);
params.anchored = false;
if (!AnalyzeSearch(&params) ||
params.start == NULL ||
@@ -1915,7 +1927,7 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
// Add start state to work queue.
// Note that any State* that we handle here must point into the cache,
// so we can simply depend on pointer-as-a-number hashing and equality.
- std::unordered_map<State*, int> m;
+ absl::flat_hash_map<State*, int> m;
std::deque<State*> q;
m.emplace(params.start, static_cast<int>(m.size()));
q.push_back(params.start);
@@ -1989,11 +2001,11 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
// Also note that previously_visited_states[UnseenStatePtr] will, in the STL
// tradition, implicitly insert a '0' value at first use. We take advantage
// of that property below.
- std::unordered_map<State*, int> previously_visited_states;
+ absl::flat_hash_map<State*, int> previously_visited_states;
// Pick out start state for anchored search at beginning of text.
RWLocker l(&cache_mutex_);
- SearchParams params(StringPiece(), StringPiece(), &l);
+ SearchParams params(absl::string_view(), absl::string_view(), &l);
params.anchored = true;
if (!AnalyzeSearch(&params))
return false;
@@ -2033,7 +2045,7 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
// Build minimum prefix.
State* s = params.start;
min->clear();
- MutexLock lock(&mutex_);
+ absl::MutexLock lock(&mutex_);
for (int i = 0; i < maxlen; i++) {
if (previously_visited_states[s] > kMaxEltRepetitions)
break;
diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc
index 5df9745..49cf686 100644
--- a/re2/filtered_re2.cc
+++ b/re2/filtered_re2.cc
@@ -8,7 +8,6 @@
#include <string>
#include <utility>
-#include "util/util.h"
#include "util/logging.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
@@ -46,7 +45,7 @@ FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
return *this;
}
-RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
+RE2::ErrorCode FilteredRE2::Add(absl::string_view pattern,
const RE2::Options& options, int* id) {
RE2* re = new RE2(pattern, options);
RE2::ErrorCode code = re->error_code();
@@ -85,14 +84,14 @@ void FilteredRE2::Compile(std::vector<std::string>* atoms) {
compiled_ = true;
}
-int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
+int FilteredRE2::SlowFirstMatch(absl::string_view text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return static_cast<int>(i);
return -1;
}
-int FilteredRE2::FirstMatch(const StringPiece& text,
+int FilteredRE2::FirstMatch(absl::string_view text,
const std::vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile.";
@@ -106,10 +105,9 @@ int FilteredRE2::FirstMatch(const StringPiece& text,
return -1;
}
-bool FilteredRE2::AllMatches(
- const StringPiece& text,
- const std::vector<int>& atoms,
- std::vector<int>* matching_regexps) const {
+bool FilteredRE2::AllMatches(absl::string_view text,
+ const std::vector<int>& atoms,
+ std::vector<int>* matching_regexps) const {
matching_regexps->clear();
std::vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
@@ -119,9 +117,8 @@ bool FilteredRE2::AllMatches(
return !matching_regexps->empty();
}
-void FilteredRE2::AllPotentials(
- const std::vector<int>& atoms,
- std::vector<int>* potential_regexps) const {
+void FilteredRE2::AllPotentials(const std::vector<int>& atoms,
+ std::vector<int>* potential_regexps) const {
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
}
diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h
index dd618c7..a9abd69 100644
--- a/re2/filtered_re2.h
+++ b/re2/filtered_re2.h
@@ -25,6 +25,7 @@
#include <string>
#include <vector>
+#include "absl/strings/string_view.h"
#include "re2/re2.h"
namespace re2 {
@@ -47,7 +48,7 @@ class FilteredRE2 {
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
- RE2::ErrorCode Add(const StringPiece& pattern,
+ RE2::ErrorCode Add(absl::string_view pattern,
const RE2::Options& options,
int* id);
@@ -63,17 +64,17 @@ class FilteredRE2 {
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
- int SlowFirstMatch(const StringPiece& text) const;
+ int SlowFirstMatch(absl::string_view text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
- int FirstMatch(const StringPiece& text,
+ int FirstMatch(absl::string_view text,
const std::vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matched_regexps.
- bool AllMatches(const StringPiece& text,
+ bool AllMatches(absl::string_view text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
diff --git a/re2/fuzzing/compiler-rt/LICENSE b/re2/fuzzing/compiler-rt/LICENSE
deleted file mode 100644
index f9dc506..0000000
--- a/re2/fuzzing/compiler-rt/LICENSE
+++ /dev/null
@@ -1,219 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
---- LLVM Exceptions to the Apache 2.0 License ----
-
-As an exception, if, as a result of your compiling your source code, portions
-of this Software are embedded into an Object form of such source code, you
-may redistribute such embedded portions in such Object form without complying
-with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
-
-In addition, if you combine or link compiled forms of this Software with
-software that is licensed under the GPLv2 ("Combined Software") and if a
-court of competent jurisdiction determines that the patent provision (Section
-3), the indemnity provision (Section 9) or other Section of the License
-conflicts with the conditions of the GPLv2, you may retroactively and
-prospectively choose to deem waived or otherwise exclude such Section(s) of
-the License, but only in their entirety and only with respect to the Combined
-Software.
-
diff --git a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h
deleted file mode 100644
index 71cb427..0000000
--- a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h
+++ /dev/null
@@ -1,397 +0,0 @@
-//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// A single header library providing an utility class to break up an array of
-// bytes. Whenever run on the same input, provides the same output, as long as
-// its methods are called in the same order, with the same arguments.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
-#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
-
-#include <algorithm>
-#include <array>
-#include <climits>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <initializer_list>
-#include <limits>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-// In addition to the comments below, the API is also briefly documented at
-// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider
-class FuzzedDataProvider {
- public:
- // |data| is an array of length |size| that the FuzzedDataProvider wraps to
- // provide more granular access. |data| must outlive the FuzzedDataProvider.
- FuzzedDataProvider(const uint8_t *data, size_t size)
- : data_ptr_(data), remaining_bytes_(size) {}
- ~FuzzedDataProvider() = default;
-
- // See the implementation below (after the class definition) for more verbose
- // comments for each of the methods.
-
- // Methods returning std::vector of bytes. These are the most popular choice
- // when splitting fuzzing input into pieces, as every piece is put into a
- // separate buffer (i.e. ASan would catch any under-/overflow) and the memory
- // will be released automatically.
- template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes);
- template <typename T>
- std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0);
- template <typename T> std::vector<T> ConsumeRemainingBytes();
-
- // Methods returning strings. Use only when you need a std::string or a null
- // terminated C-string. Otherwise, prefer the methods returning std::vector.
- std::string ConsumeBytesAsString(size_t num_bytes);
- std::string ConsumeRandomLengthString(size_t max_length);
- std::string ConsumeRandomLengthString();
- std::string ConsumeRemainingBytesAsString();
-
- // Methods returning integer values.
- template <typename T> T ConsumeIntegral();
- template <typename T> T ConsumeIntegralInRange(T min, T max);
-
- // Methods returning floating point values.
- template <typename T> T ConsumeFloatingPoint();
- template <typename T> T ConsumeFloatingPointInRange(T min, T max);
-
- // 0 <= return value <= 1.
- template <typename T> T ConsumeProbability();
-
- bool ConsumeBool();
-
- // Returns a value chosen from the given enum.
- template <typename T> T ConsumeEnum();
-
- // Returns a value from the given array.
- template <typename T, size_t size> T PickValueInArray(const T (&array)[size]);
- template <typename T, size_t size>
- T PickValueInArray(const std::array<T, size> &array);
- template <typename T> T PickValueInArray(std::initializer_list<const T> list);
-
- // Writes data to the given destination and returns number of bytes written.
- size_t ConsumeData(void *destination, size_t num_bytes);
-
- // Reports the remaining bytes available for fuzzed input.
- size_t remaining_bytes() { return remaining_bytes_; }
-
- private:
- FuzzedDataProvider(const FuzzedDataProvider &) = delete;
- FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;
-
- void CopyAndAdvance(void *destination, size_t num_bytes);
-
- void Advance(size_t num_bytes);
-
- template <typename T>
- std::vector<T> ConsumeBytes(size_t size, size_t num_bytes);
-
- template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value);
-
- const uint8_t *data_ptr_;
- size_t remaining_bytes_;
-};
-
-// Returns a std::vector containing |num_bytes| of input data. If fewer than
-// |num_bytes| of data remain, returns a shorter std::vector containing all
-// of the data that's left. Can be used with any byte sized type, such as
-// char, unsigned char, uint8_t, etc.
-template <typename T>
-std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes) {
- num_bytes = std::min(num_bytes, remaining_bytes_);
- return ConsumeBytes<T>(num_bytes, num_bytes);
-}
-
-// Similar to |ConsumeBytes|, but also appends the terminator value at the end
-// of the resulting vector. Useful, when a mutable null-terminated C-string is
-// needed, for example. But that is a rare case. Better avoid it, if possible,
-// and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods.
-template <typename T>
-std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes,
- T terminator) {
- num_bytes = std::min(num_bytes, remaining_bytes_);
- std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes);
- result.back() = terminator;
- return result;
-}
-
-// Returns a std::vector containing all remaining bytes of the input data.
-template <typename T>
-std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes() {
- return ConsumeBytes<T>(remaining_bytes_);
-}
-
-// Returns a std::string containing |num_bytes| of input data. Using this and
-// |.c_str()| on the resulting string is the best way to get an immutable
-// null-terminated C string. If fewer than |num_bytes| of data remain, returns
-// a shorter std::string containing all of the data that's left.
-inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) {
- static_assert(sizeof(std::string::value_type) == sizeof(uint8_t),
- "ConsumeBytesAsString cannot convert the data to a string.");
-
- num_bytes = std::min(num_bytes, remaining_bytes_);
- std::string result(
- reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes);
- Advance(num_bytes);
- return result;
-}
-
-// Returns a std::string of length from 0 to |max_length|. When it runs out of
-// input data, returns what remains of the input. Designed to be more stable
-// with respect to a fuzzer inserting characters than just picking a random
-// length and then consuming that many bytes with |ConsumeBytes|.
-inline std::string
-FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) {
- // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
- // followed by anything else to the end of the string. As a result of this
- // logic, a fuzzer can insert characters into the string, and the string
- // will be lengthened to include those new characters, resulting in a more
- // stable fuzzer than picking the length of a string independently from
- // picking its contents.
- std::string result;
-
- // Reserve the anticipated capaticity to prevent several reallocations.
- result.reserve(std::min(max_length, remaining_bytes_));
- for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
- char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
- Advance(1);
- if (next == '\\' && remaining_bytes_ != 0) {
- next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
- Advance(1);
- if (next != '\\')
- break;
- }
- result += next;
- }
-
- result.shrink_to_fit();
- return result;
-}
-
-// Returns a std::string of length from 0 to |remaining_bytes_|.
-inline std::string FuzzedDataProvider::ConsumeRandomLengthString() {
- return ConsumeRandomLengthString(remaining_bytes_);
-}
-
-// Returns a std::string containing all remaining bytes of the input data.
-// Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
-// object.
-inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() {
- return ConsumeBytesAsString(remaining_bytes_);
-}
-
-// Returns a number in the range [Type's min, Type's max]. The value might
-// not be uniformly distributed in the given range. If there's no input data
-// left, always returns |min|.
-template <typename T> T FuzzedDataProvider::ConsumeIntegral() {
- return ConsumeIntegralInRange(std::numeric_limits<T>::min(),
- std::numeric_limits<T>::max());
-}
-
-// Returns a number in the range [min, max] by consuming bytes from the
-// input data. The value might not be uniformly distributed in the given
-// range. If there's no input data left, always returns |min|. |min| must
-// be less than or equal to |max|.
-template <typename T>
-T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) {
- static_assert(std::is_integral<T>::value, "An integral type is required.");
- static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
-
- if (min > max)
- abort();
-
- // Use the biggest type possible to hold the range and the result.
- uint64_t range = static_cast<uint64_t>(max) - min;
- uint64_t result = 0;
- size_t offset = 0;
-
- while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
- remaining_bytes_ != 0) {
- // Pull bytes off the end of the seed data. Experimentally, this seems to
- // allow the fuzzer to more easily explore the input space. This makes
- // sense, since it works by modifying inputs that caused new code to run,
- // and this data is often used to encode length of data read by
- // |ConsumeBytes|. Separating out read lengths makes it easier modify the
- // contents of the data that is actually read.
- --remaining_bytes_;
- result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
- offset += CHAR_BIT;
- }
-
- // Avoid division by 0, in case |range + 1| results in overflow.
- if (range != std::numeric_limits<decltype(range)>::max())
- result = result % (range + 1);
-
- return static_cast<T>(min + result);
-}
-
-// Returns a floating point value in the range [Type's lowest, Type's max] by
-// consuming bytes from the input data. If there's no input data left, always
-// returns approximately 0.
-template <typename T> T FuzzedDataProvider::ConsumeFloatingPoint() {
- return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(),
- std::numeric_limits<T>::max());
-}
-
-// Returns a floating point value in the given range by consuming bytes from
-// the input data. If there's no input data left, returns |min|. Note that
-// |min| must be less than or equal to |max|.
-template <typename T>
-T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) {
- if (min > max)
- abort();
-
- T range = .0;
- T result = min;
- constexpr T zero(.0);
- if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) {
- // The diff |max - min| would overflow the given floating point type. Use
- // the half of the diff as the range and consume a bool to decide whether
- // the result is in the first of the second part of the diff.
- range = (max / 2.0) - (min / 2.0);
- if (ConsumeBool()) {
- result += range;
- }
- } else {
- range = max - min;
- }
-
- return result + range * ConsumeProbability<T>();
-}
-
-// Returns a floating point number in the range [0.0, 1.0]. If there's no
-// input data left, always returns 0.
-template <typename T> T FuzzedDataProvider::ConsumeProbability() {
- static_assert(std::is_floating_point<T>::value,
- "A floating point type is required.");
-
- // Use different integral types for different floating point types in order
- // to provide better density of the resulting values.
- using IntegralType =
- typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t,
- uint64_t>::type;
-
- T result = static_cast<T>(ConsumeIntegral<IntegralType>());
- result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
- return result;
-}
-
-// Reads one byte and returns a bool, or false when no data remains.
-inline bool FuzzedDataProvider::ConsumeBool() {
- return 1 & ConsumeIntegral<uint8_t>();
-}
-
-// Returns an enum value. The enum must start at 0 and be contiguous. It must
-// also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
-// enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
-template <typename T> T FuzzedDataProvider::ConsumeEnum() {
- static_assert(std::is_enum<T>::value, "|T| must be an enum type.");
- return static_cast<T>(
- ConsumeIntegralInRange<uint32_t>(0, static_cast<uint32_t>(T::kMaxValue)));
-}
-
-// Returns a copy of the value selected from the given fixed-size |array|.
-template <typename T, size_t size>
-T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) {
- static_assert(size > 0, "The array must be non empty.");
- return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
-}
-
-template <typename T, size_t size>
-T FuzzedDataProvider::PickValueInArray(const std::array<T, size> &array) {
- static_assert(size > 0, "The array must be non empty.");
- return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
-}
-
-template <typename T>
-T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list) {
- // TODO(Dor1s): switch to static_assert once C++14 is allowed.
- if (!list.size())
- abort();
-
- return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1));
-}
-
-// Writes |num_bytes| of input data to the given destination pointer. If there
-// is not enough data left, writes all remaining bytes. Return value is the
-// number of bytes written.
-// In general, it's better to avoid using this function, but it may be useful
-// in cases when it's necessary to fill a certain buffer or object with
-// fuzzing data.
-inline size_t FuzzedDataProvider::ConsumeData(void *destination,
- size_t num_bytes) {
- num_bytes = std::min(num_bytes, remaining_bytes_);
- CopyAndAdvance(destination, num_bytes);
- return num_bytes;
-}
-
-// Private methods.
-inline void FuzzedDataProvider::CopyAndAdvance(void *destination,
- size_t num_bytes) {
- std::memcpy(destination, data_ptr_, num_bytes);
- Advance(num_bytes);
-}
-
-inline void FuzzedDataProvider::Advance(size_t num_bytes) {
- if (num_bytes > remaining_bytes_)
- abort();
-
- data_ptr_ += num_bytes;
- remaining_bytes_ -= num_bytes;
-}
-
-template <typename T>
-std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) {
- static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");
-
- // The point of using the size-based constructor below is to increase the
- // odds of having a vector object with capacity being equal to the length.
- // That part is always implementation specific, but at least both libc++ and
- // libstdc++ allocate the requested number of bytes in that constructor,
- // which seems to be a natural choice for other implementations as well.
- // To increase the odds even more, we also call |shrink_to_fit| below.
- std::vector<T> result(size);
- if (size == 0) {
- if (num_bytes != 0)
- abort();
- return result;
- }
-
- CopyAndAdvance(result.data(), num_bytes);
-
- // Even though |shrink_to_fit| is also implementation specific, we expect it
- // to provide an additional assurance in case vector's constructor allocated
- // a buffer which is larger than the actual amount of data we put inside it.
- result.shrink_to_fit();
- return result;
-}
-
-template <typename TS, typename TU>
-TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) {
- static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
- static_assert(!std::numeric_limits<TU>::is_signed,
- "Source type must be unsigned.");
-
- // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream.
- if (std::numeric_limits<TS>::is_modulo)
- return static_cast<TS>(value);
-
- // Avoid using implementation-defined unsigned to signed conversions.
- // To learn more, see https://stackoverflow.com/questions/13150449.
- if (value <= std::numeric_limits<TS>::max()) {
- return static_cast<TS>(value);
- } else {
- constexpr auto TS_min = std::numeric_limits<TS>::min();
- return TS_min + static_cast<TS>(value - TS_min);
- }
-}
-
-#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index 3082a76..9a7af08 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -9,12 +9,12 @@
#include <string>
#include <vector>
+#include "re2/filtered_re2.h"
#include "re2/re2.h"
#include "re2/regexp.h"
+#include "re2/set.h"
#include "re2/walker-inl.h"
-using re2::StringPiece;
-
// NOT static, NOT signed.
uint8_t dummy = 0;
@@ -95,8 +95,8 @@ class SubstringWalker : public re2::Regexp::Walker<int> {
SubstringWalker& operator=(const SubstringWalker&) = delete;
};
-void TestOneInput(StringPiece pattern, const RE2::Options& options,
- StringPiece text) {
+void TestOneInput(absl::string_view pattern, const RE2::Options& options,
+ RE2::Anchor anchor, absl::string_view text) {
// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
// Otherwise, we will waste time on inputs that have long runs of various
// character classes. The fuzzer has shown itself to be easily capable of
@@ -105,7 +105,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
// counted repetition is involved - whereas the marginal benefit is zero.
// Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
// three-element character classes when case-insensitive and using UTF-8.
- // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
+ // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
int char_class = 0;
int backslash_p = 0; // very expensive, so handle specially
for (size_t i = 0; i < pattern.size(); i++) {
@@ -131,6 +131,9 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
if (backslash_p > 1)
return;
+ // Iterate just once when fuzzing. Otherwise, we easily get bogged down
+ // and coverage is unlikely to improve despite significant expense.
+ RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
// The default is 1000. Even 100 turned out to be too generous
// for fuzzing, empirically speaking, so let's try 10 instead.
re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
@@ -173,7 +176,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
if (re.NumberOfCapturingGroups() == 0) {
// Avoid early return due to too many arguments.
- StringPiece sp = text;
+ absl::string_view sp = text;
RE2::FullMatch(sp, re);
RE2::PartialMatch(sp, re);
RE2::Consume(&sp, re);
@@ -182,7 +185,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
} else {
// Okay, we have at least one capturing group...
// Try conversion for variously typed arguments.
- StringPiece sp = text;
+ absl::string_view sp = text;
short s;
RE2::FullMatch(sp, re, &s);
long l;
@@ -206,6 +209,30 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options,
dummy += re.NamedCapturingGroups().size();
dummy += re.CapturingGroupNames().size();
dummy += RE2::QuoteMeta(pattern).size();
+ dummy += re.Regexp()->ToString().size();
+
+ RE2::Set set(options, anchor);
+ int index = set.Add(pattern, /*error=*/NULL); // -1 on error
+ if (index != -1 && set.Compile()) {
+ std::vector<int> matches;
+ set.Match(text, &matches);
+ }
+
+ re2::FilteredRE2 filter;
+ index = -1; // not clobbered on error
+ filter.Add(pattern, options, &index);
+ if (index != -1) {
+ std::vector<std::string> atoms;
+ filter.Compile(&atoms);
+ // Pretend that all atoms match, which
+ // triggers the AND-OR tree maximally.
+ std::vector<int> matched_atoms;
+ matched_atoms.reserve(atoms.size());
+ for (size_t i = 0; i < atoms.size(); ++i)
+ matched_atoms.push_back(static_cast<int>(i));
+ std::vector<int> matches;
+ filter.AllMatches(text, matched_atoms, &matches);
+ }
}
// Entry point for libFuzzer.
@@ -239,9 +266,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
options.set_word_boundary(fdp.ConsumeBool());
options.set_one_line(fdp.ConsumeBool());
+ // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
+ // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
+ RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
+ RE2::UNANCHORED,
+ RE2::ANCHOR_START,
+ RE2::ANCHOR_BOTH,
+ });
+
std::string pattern = fdp.ConsumeRandomLengthString(999);
std::string text = fdp.ConsumeRandomLengthString(999);
- TestOneInput(pattern, options, text);
+ TestOneInput(pattern, options, anchor, text);
return 0;
}
diff --git a/re2/mimics_pcre.cc b/re2/mimics_pcre.cc
index b1d6a51..ac0c69d 100644
--- a/re2/mimics_pcre.cc
+++ b/re2/mimics_pcre.cc
@@ -22,7 +22,6 @@
//
// Regexp::MimicsPCRE checks for any of these conditions.
-#include "util/util.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
diff --git a/re2/nfa.cc b/re2/nfa.cc
index c7339f8..a655884 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -32,8 +32,8 @@
#include <utility>
#include <vector>
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@@ -60,9 +60,8 @@ class NFA {
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch);
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool longest, absl::string_view* submatch, int nsubmatch);
private:
struct Thread {
@@ -92,7 +91,7 @@ class NFA {
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
- void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
+ void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
@@ -102,7 +101,7 @@ class NFA {
// p-1 will be used when processing Match instructions.
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
- int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+ int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
const char* p);
// Returns text version of capture information, for debugging.
@@ -192,7 +191,7 @@ void NFA::Decref(Thread* t) {
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
-void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
+void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
const char* p, Thread* t0) {
if (id0 == 0)
return;
@@ -225,7 +224,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
continue;
if (q->has_index(id)) {
if (ExtraDebug)
- fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
+ absl::FPrintF(stderr, " [%d%s]\n", id, FormatCapture(t0->capture));
continue;
}
@@ -288,7 +287,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
t = Incref(t0);
*tp = t;
if (ExtraDebug)
- fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
+ absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture));
if (ip->hint() == 0)
break;
@@ -300,7 +299,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
t = Incref(t0);
*tp = t;
if (ExtraDebug)
- fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str());
+ absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture));
Next:
if (ip->last())
@@ -328,7 +327,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
// p-1 will be used when processing Match instructions.
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
-int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
const char* p) {
nextq->clear();
@@ -435,23 +434,22 @@ std::string NFA::FormatCapture(const char** capture) {
if (capture[i] == NULL)
s += "(?,?)";
else if (capture[i+1] == NULL)
- s += StringPrintf("(%td,?)",
- capture[i] - btext_);
+ s += absl::StrFormat("(%d,?)",
+ capture[i] - btext_);
else
- s += StringPrintf("(%td,%td)",
- capture[i] - btext_,
- capture[i+1] - btext_);
+ s += absl::StrFormat("(%d,%d)",
+ capture[i] - btext_,
+ capture[i+1] - btext_);
}
return s;
}
-bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch) {
+bool NFA::Search(absl::string_view text, absl::string_view context,
+ bool anchored, bool longest, absl::string_view* submatch,
+ int nsubmatch) {
if (start_ == 0)
return false;
- StringPiece context = const_context;
if (context.data() == NULL)
context = text;
@@ -497,8 +495,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
etext_ = text.data() + text.size();
if (ExtraDebug)
- fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
- std::string(text).c_str(), std::string(context).c_str(), anchored, longest);
+ absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
+ text, context, anchored, longest);
// Set up search.
Threadq* runq = &q0_;
@@ -517,14 +515,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
else if (p < etext_)
c = p[0] & 0xFF;
- fprintf(stderr, "%c:", c);
+ absl::FPrintF(stderr, "%c:", c);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->value();
if (t == NULL)
continue;
- fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
+ absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture));
}
- fprintf(stderr, "\n");
+ absl::FPrintF(stderr, "\n");
}
// This is a no-op the first time around the loop because runq is empty.
@@ -592,7 +590,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (ExtraDebug)
- fprintf(stderr, "dead\n");
+ absl::FPrintF(stderr, "dead\n");
break;
}
@@ -616,27 +614,26 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
- submatch[i] =
- StringPiece(match_[2 * i],
- static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
+ submatch[i] = absl::string_view(
+ match_[2 * i],
+ static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
if (ExtraDebug)
- fprintf(stderr, "match (%td,%td)\n",
- match_[0] - btext_,
- match_[1] - btext_);
+ absl::FPrintF(stderr, "match (%d,%d)\n",
+ match_[0] - btext_,
+ match_[1] - btext_);
return true;
}
return false;
}
-bool
-Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch) {
+bool Prog::SearchNFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch) {
if (ExtraDebug)
Dump();
NFA nfa(this);
- StringPiece sp;
+ absl::string_view sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
diff --git a/re2/onepass.cc b/re2/onepass.cc
index 2639746..7931cf9 100644
--- a/re2/onepass.cc
+++ b/re2/onepass.cc
@@ -57,14 +57,14 @@
#include <string>
#include <vector>
-#include "util/util.h"
+#include "absl/container/fixed_array.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/sparse_set.h"
-#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for OneState::action.
#ifdef _MSC_VER
@@ -189,7 +189,7 @@ void OnePass_Checks() {
"kMaxCap disagrees with kMaxOnePassCapture");
}
-static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
+static bool Satisfy(uint32_t cond, absl::string_view context, const char* p) {
uint32_t satisfied = Prog::EmptyFlags(context, p);
if (cond & kEmptyAllFlags & ~satisfied)
return false;
@@ -211,10 +211,9 @@ static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
}
-bool Prog::SearchOnePass(const StringPiece& text,
- const StringPiece& const_context,
+bool Prog::SearchOnePass(absl::string_view text, absl::string_view context,
Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch) {
+ absl::string_view* match, int nmatch) {
if (anchor != kAnchored && kind != kFullMatch) {
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
return false;
@@ -234,7 +233,6 @@ bool Prog::SearchOnePass(const StringPiece& text,
for (int i = 0; i < ncap; i++)
matchcap[i] = NULL;
- StringPiece context = const_context;
if (context.data() == NULL)
context = text;
if (anchor_start() && BeginPtr(context) != BeginPtr(text))
@@ -339,13 +337,12 @@ done:
if (!matched)
return false;
for (int i = 0; i < nmatch; i++)
- match[i] =
- StringPiece(matchcap[2 * i],
- static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
+ match[i] = absl::string_view(
+ matchcap[2 * i],
+ static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
return true;
}
-
// Analysis to determine whether a given regexp program is one-pass.
// If ip is not on workq, adds ip to work queue and returns true.
@@ -404,16 +401,17 @@ bool Prog::IsOnePass() {
int stacksize = inst_count(kInstCapture) +
inst_count(kInstEmptyWidth) +
inst_count(kInstNop) + 1; // + 1 for start inst
- PODArray<InstCond> stack(stacksize);
+ absl::FixedArray<InstCond, 64> stack_storage(stacksize);
+ InstCond* stack = stack_storage.data();
int size = this->size();
- PODArray<int> nodebyid(size); // indexed by ip
- memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]);
+ absl::FixedArray<int, 128> nodebyid_storage(size, -1); // indexed by ip
+ int* nodebyid = nodebyid_storage.data();
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
// unnecessarily optimistic: why allocate a large amount of memory
// upfront for a large program when it is unlikely to be one-pass?
- std::vector<uint8_t> nodes;
+ absl::InlinedVector<uint8_t, 2048> nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
@@ -462,7 +460,7 @@ bool Prog::IsOnePass() {
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (ExtraDebug)
- LOG(ERROR) << StringPrintf(
+ LOG(ERROR) << absl::StrFormat(
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
goto fail;
}
@@ -487,7 +485,7 @@ bool Prog::IsOnePass() {
node->action[b] = newact;
} else if (act != newact) {
if (ExtraDebug)
- LOG(ERROR) << StringPrintf(
+ LOG(ERROR) << absl::StrFormat(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
@@ -508,7 +506,7 @@ bool Prog::IsOnePass() {
node->action[b] = newact;
} else if (act != newact) {
if (ExtraDebug)
- LOG(ERROR) << StringPrintf(
+ LOG(ERROR) << absl::StrFormat(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
@@ -549,7 +547,7 @@ bool Prog::IsOnePass() {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (ExtraDebug)
- LOG(ERROR) << StringPrintf(
+ LOG(ERROR) << absl::StrFormat(
"Not OnePass: multiple paths %d -> %d", *it, ip->out());
goto fail;
}
@@ -560,7 +558,7 @@ bool Prog::IsOnePass() {
if (matched) {
// (3) is violated
if (ExtraDebug)
- LOG(ERROR) << StringPrintf(
+ LOG(ERROR) << absl::StrFormat(
"Not OnePass: multiple matches from %d", *it);
goto fail;
}
@@ -597,15 +595,15 @@ bool Prog::IsOnePass() {
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
- dump += StringPrintf("node %d id=%d: matchcond=%#x\n",
- nodeindex, id, node->matchcond);
+ dump += absl::StrFormat("node %d id=%d: matchcond=%#x\n",
+ nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
- dump += StringPrintf(" %d cond %#x -> %d id=%d\n",
- i, node->action[i] & 0xFFFF,
- node->action[i] >> kIndexShift,
- idmap[node->action[i] >> kIndexShift]);
+ dump += absl::StrFormat(" %d cond %#x -> %d id=%d\n",
+ i, node->action[i] & 0xFFFF,
+ node->action[i] >> kIndexShift,
+ idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << "nodes:\n" << dump;
diff --git a/re2/parse.cc b/re2/parse.cc
index 85f16f0..655cb9a 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -25,13 +25,12 @@
#include <string>
#include <vector>
-#include "util/util.h"
+#include "absl/base/macros.h"
+#include "absl/strings/ascii.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/regexp.h"
-#include "re2/stringpiece.h"
#include "re2/unicode_casefold.h"
#include "re2/unicode_groups.h"
#include "re2/walker-inl.h"
@@ -70,7 +69,7 @@ void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
class Regexp::ParseState {
public:
- ParseState(ParseFlags flags, const StringPiece& whole_regexp,
+ ParseState(ParseFlags flags, absl::string_view whole_regexp,
RegexpStatus* status);
~ParseState();
@@ -107,18 +106,18 @@ class Regexp::ParseState {
// Pushes a repeat operator regexp onto the stack.
// A valid argument for the operator must already be on the stack.
// s is the name of the operator, for use in error messages.
- bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);
+ bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy);
// Pushes a repetition regexp onto the stack.
// A valid argument for the operator must already be on the stack.
- bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);
+ bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy);
// Checks whether a particular regexp op is a marker.
bool IsMarker(RegexpOp op);
// Processes a left parenthesis in the input.
// Pushes a marker onto the stack.
- bool DoLeftParen(const StringPiece& name);
+ bool DoLeftParen(absl::string_view name);
bool DoLeftParenNoCapture();
// Processes a vertical bar in the input.
@@ -142,24 +141,23 @@ class Regexp::ParseState {
// Parse a character class into *out_re.
// Removes parsed text from s.
- bool ParseCharClass(StringPiece* s, Regexp** out_re,
+ bool ParseCharClass(absl::string_view* s, Regexp** out_re,
RegexpStatus* status);
// Parse a character class character into *rp.
// Removes parsed text from s.
- bool ParseCCCharacter(StringPiece* s, Rune *rp,
- const StringPiece& whole_class,
+ bool ParseCCCharacter(absl::string_view* s, Rune* rp,
+ absl::string_view whole_class,
RegexpStatus* status);
// Parse a character class range into rr.
// Removes parsed text from s.
- bool ParseCCRange(StringPiece* s, RuneRange* rr,
- const StringPiece& whole_class,
+ bool ParseCCRange(absl::string_view* s, RuneRange* rr,
+ absl::string_view whole_class,
RegexpStatus* status);
// Parse a Perl flag set or non-capturing group from s.
- bool ParsePerlFlags(StringPiece* s);
-
+ bool ParsePerlFlags(absl::string_view* s);
// Finishes the current concatenation,
// collapsing it into a single regexp on the stack.
@@ -177,7 +175,7 @@ class Regexp::ParseState {
private:
ParseFlags flags_;
- StringPiece whole_regexp_;
+ absl::string_view whole_regexp_;
RegexpStatus* status_;
Regexp* stacktop_;
int ncap_; // number of capturing parens seen
@@ -192,7 +190,7 @@ const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
Regexp::ParseState::ParseState(ParseFlags flags,
- const StringPiece& whole_regexp,
+ absl::string_view whole_regexp,
RegexpStatus* status)
: flags_(flags), whole_regexp_(whole_regexp),
status_(status), stacktop_(NULL), ncap_(0) {
@@ -269,7 +267,7 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) {
// Searches the case folding tables and returns the CaseFold* that contains r.
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
// If there isn't one, returns NULL.
-const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {
+const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) {
const CaseFold* ef = f + n;
// Binary search for entry containing r.
@@ -297,7 +295,7 @@ const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {
}
// Returns the result of applying the fold f to the rune r.
-Rune ApplyFold(const CaseFold *f, Rune r) {
+Rune ApplyFold(const CaseFold* f, Rune r) {
switch (f->delta) {
default:
return r + f->delta;
@@ -305,7 +303,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) {
case EvenOddSkip: // even <-> odd but only applies to every other
if ((r - f->lo) % 2)
return r;
- FALLTHROUGH_INTENDED;
+ ABSL_FALLTHROUGH_INTENDED;
case EvenOdd: // even <-> odd
if (r%2 == 0)
return r + 1;
@@ -314,7 +312,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) {
case OddEvenSkip: // odd <-> even but only applies to every other
if ((r - f->lo) % 2)
return r;
- FALLTHROUGH_INTENDED;
+ ABSL_FALLTHROUGH_INTENDED;
case OddEven: // odd <-> even
if (r%2 == 1)
return r + 1;
@@ -472,7 +470,7 @@ bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
// Pushes a repeat operator regexp onto the stack.
// A valid argument for the operator must already be on the stack.
// The char c is the name of the operator, for use in error messages.
-bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,
+bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
bool nongreedy) {
if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
status_->set_code(kRegexpRepeatArgument);
@@ -565,8 +563,7 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
// Pushes a repetition regexp onto the stack.
// A valid argument for the operator must already be on the stack.
-bool Regexp::ParseState::PushRepetition(int min, int max,
- const StringPiece& s,
+bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
bool nongreedy) {
if ((max != -1 && max < min) ||
min > maximum_repeat_count ||
@@ -609,7 +606,7 @@ bool Regexp::ParseState::IsMarker(RegexpOp op) {
// Processes a left parenthesis in the input.
// Pushes a marker onto the stack.
-bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {
+bool Regexp::ParseState::DoLeftParen(absl::string_view name) {
Regexp* re = new Regexp(kLeftParen, flags_);
re->cap_ = ++ncap_;
if (name.data() != NULL)
@@ -774,8 +771,8 @@ Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
-Rune* Regexp::LeadingString(Regexp* re, int *nrune,
- Regexp::ParseFlags *flags) {
+Rune* Regexp::LeadingString(Regexp* re, int* nrune,
+ Regexp::ParseFlags* flags) {
while (re->op() == kRegexpConcat && re->nsub() > 0)
re = re->sub()[0];
@@ -806,7 +803,7 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) {
Regexp* stk[4];
size_t d = 0;
while (re->op() == kRegexpConcat) {
- if (d < arraysize(stk))
+ if (d < ABSL_ARRAYSIZE(stk))
stk[d++] = re;
re = re->sub()[0];
}
@@ -1325,15 +1322,15 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
// Parses a decimal integer, storing it in *np.
// Sets *s to span the remainder of the string.
-static bool ParseInteger(StringPiece* s, int* np) {
- if (s->empty() || !isdigit((*s)[0] & 0xFF))
+static bool ParseInteger(absl::string_view* s, int* np) {
+ if (s->empty() || !absl::ascii_isdigit((*s)[0] & 0xFF))
return false;
// Disallow leading zeros.
- if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))
+ if (s->size() >= 2 && (*s)[0] == '0' && absl::ascii_isdigit((*s)[1] & 0xFF))
return false;
int n = 0;
int c;
- while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) {
+ while (!s->empty() && absl::ascii_isdigit(c = (*s)[0] & 0xFF)) {
// Avoid overflow.
if (n >= 100000000)
return false;
@@ -1351,10 +1348,10 @@ static bool ParseInteger(StringPiece* s, int* np) {
// sets *hi to -1 to signify this.
// {,2} is NOT a valid suffix.
// The Maybe in the name signifies that the regexp parse
-// doesn't fail even if ParseRepetition does, so the StringPiece
+// doesn't fail even if ParseRepetition does, so the string_view
// s must NOT be edited unless MaybeParseRepetition returns true.
-static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
- StringPiece s = *sp;
+static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) {
+ absl::string_view s = *sp;
if (s.empty() || s[0] != '{')
return false;
s.remove_prefix(1); // '{'
@@ -1385,12 +1382,13 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
return true;
}
-// Removes the next Rune from the StringPiece and stores it in *r.
+// Removes the next Rune from the string_view and stores it in *r.
// Returns number of bytes removed from sp.
// Behaves as though there is a terminating NUL at the end of sp.
// Argument order is backwards from usual Google style
// but consistent with chartorune.
-static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
+static int StringViewToRune(Rune* r, absl::string_view* sp,
+ RegexpStatus* status) {
// fullrune() takes int, not size_t. However, it just looks
// at the leading byte and treats any length >= 4 the same.
if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
@@ -1411,18 +1409,18 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
if (status != NULL) {
status->set_code(kRegexpBadUTF8);
- status->set_error_arg(StringPiece());
+ status->set_error_arg(absl::string_view());
}
return -1;
}
// Returns whether name is valid UTF-8.
// If not, sets status to kRegexpBadUTF8.
-static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
- StringPiece t = s;
+static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) {
+ absl::string_view t = s;
Rune r;
while (!t.empty()) {
- if (StringPieceToRune(&r, &t, status) < 0)
+ if (StringViewToRune(&r, &t, status) < 0)
return false;
}
return true;
@@ -1450,28 +1448,28 @@ static int UnHex(int c) {
// Parse an escape sequence (e.g., \n, \{).
// Sets *s to span the remainder of the string.
// Sets *rp to the named character.
-static bool ParseEscape(StringPiece* s, Rune* rp,
+static bool ParseEscape(absl::string_view* s, Rune* rp,
RegexpStatus* status, int rune_max) {
const char* begin = s->data();
if (s->empty() || (*s)[0] != '\\') {
// Should not happen - caller always checks.
status->set_code(kRegexpInternalError);
- status->set_error_arg(StringPiece());
+ status->set_error_arg(absl::string_view());
return false;
}
if (s->size() == 1) {
status->set_code(kRegexpTrailingBackslash);
- status->set_error_arg(StringPiece());
+ status->set_error_arg(absl::string_view());
return false;
}
Rune c, c1;
s->remove_prefix(1); // backslash
- if (StringPieceToRune(&c, s, status) < 0)
+ if (StringViewToRune(&c, s, status) < 0)
return false;
int code;
switch (c) {
default:
- if (c < Runeself && !isalpha(c) && !isdigit(c)) {
+ if (c < Runeself && !absl::ascii_isalnum(c)) {
// Escaped non-word characters are always themselves.
// PCRE is not quite so rigorous: it accepts things like
// \q, but we don't. We once rejected \_, but too many
@@ -1492,7 +1490,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// Single non-zero octal digit is a backreference; not supported.
if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
goto BadEscape;
- FALLTHROUGH_INTENDED;
+ ABSL_FALLTHROUGH_INTENDED;
case '0':
// consume up to three octal digits; already have one.
code = c - '0';
@@ -1516,7 +1514,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
case 'x':
if (s->empty())
goto BadEscape;
- if (StringPieceToRune(&c, s, status) < 0)
+ if (StringViewToRune(&c, s, status) < 0)
return false;
if (c == '{') {
// Any number of digits in braces.
@@ -1525,7 +1523,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// Perl accepts any text at all; it ignores all text
// after the first non-hex digit. We require only hex digits,
// and at least one.
- if (StringPieceToRune(&c, s, status) < 0)
+ if (StringViewToRune(&c, s, status) < 0)
return false;
int nhex = 0;
code = 0;
@@ -1536,7 +1534,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
goto BadEscape;
if (s->empty())
goto BadEscape;
- if (StringPieceToRune(&c, s, status) < 0)
+ if (StringViewToRune(&c, s, status) < 0)
return false;
}
if (c != '}' || nhex == 0)
@@ -1547,7 +1545,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// Easy case: two hex digits.
if (s->empty())
goto BadEscape;
- if (StringPieceToRune(&c1, s, status) < 0)
+ if (StringViewToRune(&c1, s, status) < 0)
return false;
if (!IsHex(c) || !IsHex(c1))
goto BadEscape;
@@ -1589,13 +1587,11 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// return true;
}
- LOG(DFATAL) << "Not reached in ParseEscape.";
-
BadEscape:
// Unrecognized escape sequence.
status->set_code(kRegexpBadEscape);
status->set_error_arg(
- StringPiece(begin, static_cast<size_t>(s->data() - begin)));
+ absl::string_view(begin, static_cast<size_t>(s->data() - begin)));
return false;
}
@@ -1623,21 +1619,21 @@ void CharClassBuilder::AddRangeFlags(
}
// Look for a group with the given name.
-static const UGroup* LookupGroup(const StringPiece& name,
- const UGroup *groups, int ngroups) {
+static const UGroup* LookupGroup(absl::string_view name,
+ const UGroup* groups, int ngroups) {
// Simple name lookup.
for (int i = 0; i < ngroups; i++)
- if (StringPiece(groups[i].name) == name)
+ if (absl::string_view(groups[i].name) == name)
return &groups[i];
return NULL;
}
// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
-static const UGroup* LookupPosixGroup(const StringPiece& name) {
+static const UGroup* LookupPosixGroup(absl::string_view name) {
return LookupGroup(name, posix_groups, num_posix_groups);
}
-static const UGroup* LookupPerlGroup(const StringPiece& name) {
+static const UGroup* LookupPerlGroup(absl::string_view name) {
return LookupGroup(name, perl_groups, num_perl_groups);
}
@@ -1648,16 +1644,16 @@ static URange32 any32[] = { { 65536, Runemax } };
static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
// Look for a Unicode group with the given name (e.g., "Han")
-static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
+static const UGroup* LookupUnicodeGroup(absl::string_view name) {
// Special case: "Any" means any.
- if (name == StringPiece("Any"))
+ if (name == absl::string_view("Any"))
return &anygroup;
return LookupGroup(name, unicode_groups, num_unicode_groups);
}
#endif
// Add a UGroup or its negation to the character class.
-static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
+static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
Regexp::ParseFlags parse_flags) {
if (sign == +1) {
for (int i = 0; i < g->nr16; i++) {
@@ -1707,16 +1703,17 @@ static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
// not the Perl empty-string classes (\b \B \A \Z \z).
// On success, sets *s to span the remainder of the string
// and returns the corresponding UGroup.
-// The StringPiece must *NOT* be edited unless the call succeeds.
-const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
+// The string_view must *NOT* be edited unless the call succeeds.
+const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
+ Regexp::ParseFlags parse_flags) {
if (!(parse_flags & Regexp::PerlClasses))
return NULL;
if (s->size() < 2 || (*s)[0] != '\\')
return NULL;
- // Could use StringPieceToRune, but there aren't
+ // Could use StringViewToRune, but there aren't
// any non-ASCII Perl group names.
- StringPiece name(s->data(), 2);
- const UGroup *g = LookupPerlGroup(name);
+ absl::string_view name(s->data(), 2);
+ const UGroup* g = LookupPerlGroup(name);
if (g == NULL)
return NULL;
s->remove_prefix(name.size());
@@ -1731,9 +1728,9 @@ enum ParseStatus {
// Maybe parses a Unicode character group like \p{Han} or \P{Han}
// (the latter is a negated group).
-ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
- CharClassBuilder *cc,
- RegexpStatus* status) {
+ParseStatus ParseUnicodeGroup(absl::string_view* s,
+ Regexp::ParseFlags parse_flags,
+ CharClassBuilder* cc, RegexpStatus* status) {
// Decide whether to parse.
if (!(parse_flags & Regexp::UnicodeGroups))
return kParseNothing;
@@ -1747,34 +1744,34 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
int sign = +1; // -1 = negated char class
if (c == 'P')
sign = -sign;
- StringPiece seq = *s; // \p{Han} or \pL
- StringPiece name; // Han or L
+ absl::string_view seq = *s; // \p{Han} or \pL
+ absl::string_view name; // Han or L
s->remove_prefix(2); // '\\', 'p'
- if (!StringPieceToRune(&c, s, status))
+ if (!StringViewToRune(&c, s, status))
return kParseError;
if (c != '{') {
// Name is the bit of string we just skipped over for c.
const char* p = seq.data() + 2;
- name = StringPiece(p, static_cast<size_t>(s->data() - p));
+ name = absl::string_view(p, static_cast<size_t>(s->data() - p));
} else {
// Name is in braces. Look for closing }
size_t end = s->find('}', 0);
- if (end == StringPiece::npos) {
+ if (end == absl::string_view::npos) {
if (!IsValidUTF8(seq, status))
return kParseError;
status->set_code(kRegexpBadCharRange);
status->set_error_arg(seq);
return kParseError;
}
- name = StringPiece(s->data(), end); // without '}'
+ name = absl::string_view(s->data(), end); // without '}'
s->remove_prefix(end + 1); // with '}'
if (!IsValidUTF8(name, status))
return kParseError;
}
// Chop seq where s now begins.
- seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data()));
+ seq = absl::string_view(seq.data(), static_cast<size_t>(s->data() - seq.data()));
if (!name.empty() && name[0] == '^') {
sign = -sign;
@@ -1783,7 +1780,7 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
#if !defined(RE2_USE_ICU)
// Look up the group in the RE2 Unicode data.
- const UGroup *g = LookupUnicodeGroup(name);
+ const UGroup* g = LookupUnicodeGroup(name);
if (g == NULL) {
status->set_code(kRegexpBadCharRange);
status->set_error_arg(seq);
@@ -1821,9 +1818,9 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
// Parses a character class name like [:alnum:].
// Sets *s to span the remainder of the string.
// Adds the ranges corresponding to the class to ranges.
-static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
- CharClassBuilder *cc,
- RegexpStatus* status) {
+static ParseStatus ParseCCName(absl::string_view* s,
+ Regexp::ParseFlags parse_flags,
+ CharClassBuilder* cc, RegexpStatus* status) {
// Check begins with [:
const char* p = s->data();
const char* ep = s->data() + s->size();
@@ -1841,9 +1838,9 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
// Got it. Check that it's valid.
q += 2;
- StringPiece name(p, static_cast<size_t>(q - p));
+ absl::string_view name(p, static_cast<size_t>(q - p));
- const UGroup *g = LookupPosixGroup(name);
+ const UGroup* g = LookupPosixGroup(name);
if (g == NULL) {
status->set_code(kRegexpBadCharRange);
status->set_error_arg(name);
@@ -1859,8 +1856,8 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
// There are fewer special characters here than in the rest of the regexp.
// Sets *s to span the remainder of the string.
// Sets *rp to the character.
-bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
- const StringPiece& whole_class,
+bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
+ absl::string_view whole_class,
RegexpStatus* status) {
if (s->empty()) {
status->set_code(kRegexpMissingBracket);
@@ -1874,7 +1871,7 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
return ParseEscape(s, rp, status, rune_max_);
// Otherwise take the next rune.
- return StringPieceToRune(rp, s, status) >= 0;
+ return StringViewToRune(rp, s, status) >= 0;
}
// Parses a character class character, or, if the character
@@ -1882,10 +1879,10 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
// For single characters, rr->lo == rr->hi.
// Sets *s to span the remainder of the string.
// Sets *rp to the character.
-bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
- const StringPiece& whole_class,
+bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
+ absl::string_view whole_class,
RegexpStatus* status) {
- StringPiece os = *s;
+ absl::string_view os = *s;
if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
return false;
// [a-] means (a|-), so check for final ].
@@ -1895,8 +1892,8 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
return false;
if (rr->hi < rr->lo) {
status->set_code(kRegexpBadCharRange);
- status->set_error_arg(
- StringPiece(os.data(), static_cast<size_t>(s->data() - os.data())));
+ status->set_error_arg(absl::string_view(
+ os.data(), static_cast<size_t>(s->data() - os.data())));
return false;
}
} else {
@@ -1908,14 +1905,13 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,
// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
// Sets *s to span the remainder of the string.
// Sets *out_re to the regexp for the class.
-bool Regexp::ParseState::ParseCharClass(StringPiece* s,
- Regexp** out_re,
+bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
RegexpStatus* status) {
- StringPiece whole_class = *s;
+ absl::string_view whole_class = *s;
if (s->empty() || (*s)[0] != '[') {
// Caller checked this.
status->set_code(kRegexpInternalError);
- status->set_error_arg(StringPiece());
+ status->set_error_arg(absl::string_view());
return false;
}
bool negated = false;
@@ -1937,16 +1933,16 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
// Except that Perl allows - anywhere.
if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
(s->size() == 1 || (*s)[1] != ']')) {
- StringPiece t = *s;
+ absl::string_view t = *s;
t.remove_prefix(1); // '-'
Rune r;
- int n = StringPieceToRune(&r, &t, status);
+ int n = StringViewToRune(&r, &t, status);
if (n < 0) {
re->Decref();
return false;
}
status->set_code(kRegexpBadCharRange);
- status->set_error_arg(StringPiece(s->data(), 1+n));
+ status->set_error_arg(absl::string_view(s->data(), 1+n));
re->Decref();
return false;
}
@@ -1981,7 +1977,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
}
// Look for Perl character class symbols (extension).
- const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
+ const UGroup* g = MaybeParsePerlCCEscape(s, flags_);
if (g != NULL) {
AddUGroup(re->ccb_, g, g->sign, flags_);
continue;
@@ -2016,7 +2012,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
}
// Returns whether name is a valid capture name.
-static bool IsValidCaptureName(const StringPiece& name) {
+static bool IsValidCaptureName(absl::string_view name) {
if (name.empty())
return false;
@@ -2030,17 +2026,17 @@ static bool IsValidCaptureName(const StringPiece& name) {
// if they start doing that for capture names, we won't follow suit.
static const CharClass* const cc = []() {
CharClassBuilder ccb;
- for (StringPiece group :
+ for (absl::string_view group :
{"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups),
+1, Regexp::NoParseFlags);
return ccb.GetCharClass();
}();
- StringPiece t = name;
+ absl::string_view t = name;
Rune r;
while (!t.empty()) {
- if (StringPieceToRune(&r, &t, NULL) < 0)
+ if (StringViewToRune(&r, &t, NULL) < 0)
return false;
if (cc->Contains(r))
continue;
@@ -2054,18 +2050,16 @@ static bool IsValidCaptureName(const StringPiece& name) {
// The caller must check that s begins with "(?".
// Returns true on success. If the Perl flag is not
// well-formed or not supported, sets status_ and returns false.
-bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
- StringPiece t = *s;
+bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
+ absl::string_view t = *s;
// Caller is supposed to check this.
if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
- LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
status_->set_code(kRegexpInternalError);
+ LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
return false;
}
- t.remove_prefix(2); // "(?"
-
// Check for named captures, first introduced in Python's regexp library.
// As usual, there are three slightly different syntaxes:
//
@@ -2079,22 +2073,23 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
// support all three as well. EcmaScript 4 uses only the Python form.
//
// In both the open source world (via Code Search) and the
- // Google source tree, (?P<expr>name) is the dominant form,
- // so that's the one we implement. One is enough.
- if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
+ // Google source tree, (?P<name>expr) and (?<name>expr) are the
+ // dominant forms of named captures and both are supported.
+ if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
+ (t.size() > 3 && t[2] == '<')) {
// Pull out name.
- size_t end = t.find('>', 2);
- if (end == StringPiece::npos) {
- if (!IsValidUTF8(*s, status_))
+ size_t begin = t[2] == 'P' ? 4 : 3;
+ size_t end = t.find('>', begin);
+ if (end == absl::string_view::npos) {
+ if (!IsValidUTF8(t, status_))
return false;
status_->set_code(kRegexpBadNamedCapture);
- status_->set_error_arg(*s);
+ status_->set_error_arg(t);
return false;
}
- // t is "P<name>...", t[end] == '>'
- StringPiece capture(t.data()-2, end+3); // "(?P<name>"
- StringPiece name(t.data()+2, end-2); // "name"
+ absl::string_view capture(t.data(), end+1);
+ absl::string_view name(t.data()+begin, end-begin);
if (!IsValidUTF8(name, status_))
return false;
if (!IsValidCaptureName(name)) {
@@ -2108,11 +2103,12 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
return false;
}
- s->remove_prefix(
- static_cast<size_t>(capture.data() + capture.size() - s->data()));
+ s->remove_prefix(capture.size());
return true;
}
+ t.remove_prefix(2); // "(?"
+
bool negated = false;
bool sawflags = false;
int nflags = flags_;
@@ -2120,7 +2116,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
for (bool done = false; !done; ) {
if (t.empty())
goto BadPerlOp;
- if (StringPieceToRune(&c, &t, status_) < 0)
+ if (StringViewToRune(&c, &t, status_) < 0)
return false;
switch (c) {
default:
@@ -2193,7 +2189,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
BadPerlOp:
status_->set_code(kRegexpBadPerlOp);
status_->set_error_arg(
- StringPiece(s->data(), static_cast<size_t>(t.data() - s->data())));
+ absl::string_view(s->data(), static_cast<size_t>(t.data() - s->data())));
return false;
}
@@ -2201,7 +2197,7 @@ BadPerlOp:
// into UTF8 encoding in string.
// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
// deprecated and because it rejects code points 0x80-0x9F.
-void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
+void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) {
char buf[UTFmax];
utf->clear();
@@ -2216,7 +2212,7 @@ void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
// returning the corresponding Regexp tree.
// The caller must Decref the return value when done with it.
// Returns NULL on error.
-Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
+Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
RegexpStatus* status) {
// Make status non-NULL (easier on everyone else).
RegexpStatus xstatus;
@@ -2224,7 +2220,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
status = &xstatus;
ParseState ps(global_flags, s, status);
- StringPiece t = s;
+ absl::string_view t = s;
// Convert regexp to UTF-8 (easier on the rest of the parser).
if (global_flags & Latin1) {
@@ -2238,7 +2234,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
// Special parse loop for literal string.
while (!t.empty()) {
Rune r;
- if (StringPieceToRune(&r, &t, status) < 0)
+ if (StringViewToRune(&r, &t, status) < 0)
return NULL;
if (!ps.PushLiteral(r))
return NULL;
@@ -2246,13 +2242,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
return ps.DoFinish();
}
- StringPiece lastunary = StringPiece();
+ absl::string_view lastunary = absl::string_view();
while (!t.empty()) {
- StringPiece isunary = StringPiece();
+ absl::string_view isunary = absl::string_view();
switch (t[0]) {
default: {
Rune r;
- if (StringPieceToRune(&r, &t, status) < 0)
+ if (StringViewToRune(&r, &t, status) < 0)
return NULL;
if (!ps.PushLiteral(r))
return NULL;
@@ -2271,7 +2267,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
if (!ps.DoLeftParenNoCapture())
return NULL;
} else {
- if (!ps.DoLeftParen(StringPiece()))
+ if (!ps.DoLeftParen(absl::string_view()))
return NULL;
}
t.remove_prefix(1); // '('
@@ -2327,7 +2323,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
op = kRegexpQuest;
goto Rep;
Rep:
- StringPiece opstr = t;
+ absl::string_view opstr = t;
bool nongreedy = false;
t.remove_prefix(1); // '*' or '+' or '?'
if (ps.flags() & PerlX) {
@@ -2340,14 +2336,14 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
// a** is a syntax error, not a double-star.
// (and a++ means something else entirely, which we don't support!)
status->set_code(kRegexpRepeatOp);
- status->set_error_arg(StringPiece(
+ status->set_error_arg(absl::string_view(
lastunary.data(),
static_cast<size_t>(t.data() - lastunary.data())));
return NULL;
}
}
- opstr = StringPiece(opstr.data(),
- static_cast<size_t>(t.data() - opstr.data()));
+ opstr = absl::string_view(opstr.data(),
+ static_cast<size_t>(t.data() - opstr.data()));
if (!ps.PushRepeatOp(op, opstr, nongreedy))
return NULL;
isunary = opstr;
@@ -2356,7 +2352,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
case '{': { // Counted repetition.
int lo, hi;
- StringPiece opstr = t;
+ absl::string_view opstr = t;
if (!MaybeParseRepetition(&t, &lo, &hi)) {
// Treat like a literal.
if (!ps.PushLiteral('{'))
@@ -2373,14 +2369,14 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
if (!lastunary.empty()) {
// Not allowed to stack repetition operators.
status->set_code(kRegexpRepeatOp);
- status->set_error_arg(StringPiece(
+ status->set_error_arg(absl::string_view(
lastunary.data(),
static_cast<size_t>(t.data() - lastunary.data())));
return NULL;
}
}
- opstr = StringPiece(opstr.data(),
- static_cast<size_t>(t.data() - opstr.data()));
+ opstr = absl::string_view(opstr.data(),
+ static_cast<size_t>(t.data() - opstr.data()));
if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
return NULL;
isunary = opstr;
@@ -2430,7 +2426,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
break;
}
Rune r;
- if (StringPieceToRune(&r, &t, status) < 0)
+ if (StringViewToRune(&r, &t, status) < 0)
return NULL;
if (!ps.PushLiteral(r))
return NULL;
@@ -2456,7 +2452,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
}
}
- const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
+ const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags());
if (g != NULL) {
Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
re->ccb_ = new CharClassBuilder;
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index a47b312..3c7886f 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -7,11 +7,11 @@
#include <stddef.h>
#include <stdint.h>
#include <string>
+#include <utility>
#include <vector>
-#include "util/util.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "util/utf.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
@@ -21,9 +21,6 @@ namespace re2 {
static const bool ExtraDebug = false;
-typedef std::set<std::string>::iterator SSIter;
-typedef std::set<std::string>::const_iterator ConstSSIter;
-
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
op_ = op;
@@ -140,7 +137,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
-static void SimplifyStringSet(std::set<std::string>* ss) {
+void Prefilter::SimplifyStringSet(SSet* ss) {
// Now make sure that the strings aren't redundant. For example, if
// we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so delete "abc". This
@@ -149,13 +146,19 @@ static void SimplifyStringSet(std::set<std::string>* ss) {
// candidate for match, so further matching "abc" is redundant.
// Note that we must ignore "" because find() would find it at the
// start of everything and thus we would end up erasing everything.
- for (SSIter i = ss->begin(); i != ss->end(); ++i) {
- if (i->empty())
- continue;
+ //
+ // The SSet sorts strings by length, then lexicographically. Note that
+ // smaller strings appear first and all strings must be unique. These
+ // observations let us skip string comparisons when possible.
+ SSIter i = ss->begin();
+ if (i != ss->end() && i->empty()) {
+ ++i;
+ }
+ for (; i != ss->end(); ++i) {
SSIter j = i;
++j;
while (j != ss->end()) {
- if (j->find(*i) != std::string::npos) {
+ if (j->size() > i->size() && j->find(*i) != std::string::npos) {
j = ss->erase(j);
continue;
}
@@ -164,7 +167,7 @@ static void SimplifyStringSet(std::set<std::string>* ss) {
}
}
-Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) {
+Prefilter* Prefilter::OrStrings(SSet* ss) {
Prefilter* or_prefilter = new Prefilter(NONE);
SimplifyStringSet(ss);
for (SSIter i = ss->begin(); i != ss->end(); ++i)
@@ -226,14 +229,14 @@ class Prefilter::Info {
// Caller takes ownership of the Prefilter.
Prefilter* TakeMatch();
- std::set<std::string>& exact() { return exact_; }
+ SSet& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
- std::set<std::string> exact_;
+ SSet exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
@@ -286,18 +289,7 @@ std::string Prefilter::Info::ToString() {
return "";
}
-// Add the strings from src to dst.
-static void CopyIn(const std::set<std::string>& src,
- std::set<std::string>* dst) {
- for (ConstSSIter i = src.begin(); i != src.end(); ++i)
- dst->insert(*i);
-}
-
-// Add the cross-product of a and b to dst.
-// (For each string i in a and j in b, add i+j.)
-static void CrossProduct(const std::set<std::string>& a,
- const std::set<std::string>& b,
- std::set<std::string>* dst) {
+void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
@@ -343,8 +335,14 @@ Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
Info *ab = new Info();
if (a->is_exact_ && b->is_exact_) {
- CopyIn(a->exact_, &ab->exact_);
- CopyIn(b->exact_, &ab->exact_);
+ // Avoid string copies by moving the larger exact_ set into
+ // ab directly, then merge in the smaller set.
+ if (a->exact_.size() < b->exact_.size()) {
+ using std::swap;
+ swap(a, b);
+ }
+ ab->exact_ = std::move(a->exact_);
+ ab->exact_.insert(b->exact_.begin(), b->exact_.end());
ab->is_exact_ = true;
} else {
// Either a or b has is_exact_ = false. If the other
@@ -532,8 +530,8 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
switch (re->op()) {
default:
case kRegexpRepeat:
- LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
+ LOG(DFATAL) << "Bad regexp op " << re->op();
break;
case kRegexpNoMatch:
@@ -665,7 +663,7 @@ std::string Prefilter::DebugString() const {
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
- return StringPrintf("op%d", op_);
+ return absl::StrFormat("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
diff --git a/re2/prefilter.h b/re2/prefilter.h
index 4fedeb4..018691d 100644
--- a/re2/prefilter.h
+++ b/re2/prefilter.h
@@ -13,7 +13,6 @@
#include <string>
#include <vector>
-#include "util/util.h"
#include "util/logging.h"
namespace re2 {
@@ -60,8 +59,59 @@ class Prefilter {
std::string DebugString() const;
private:
+ template <typename H>
+ friend H AbslHashValue(H h, const Prefilter& a) {
+ h = H::combine(std::move(h), a.op_);
+ if (a.op_ == ATOM) {
+ h = H::combine(std::move(h), a.atom_);
+ } else if (a.op_ == AND || a.op_ == OR) {
+ h = H::combine(std::move(h), a.subs_->size());
+ for (size_t i = 0; i < a.subs_->size(); ++i) {
+ h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_);
+ }
+ }
+ return h;
+ }
+
+ friend bool operator==(const Prefilter& a, const Prefilter& b) {
+ if (&a == &b) {
+ return true;
+ }
+ if (a.op_ != b.op_) {
+ return false;
+ }
+ if (a.op_ == ATOM) {
+ if (a.atom_ != b.atom_) {
+ return false;
+ }
+ } else if (a.op_ == AND || a.op_ == OR) {
+ if (a.subs_->size() != b.subs_->size()) {
+ return false;
+ }
+ for (size_t i = 0; i < a.subs_->size(); ++i) {
+ if ((*a.subs_)[i]->unique_id_ != (*b.subs_)[i]->unique_id_) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ // A comparator used to store exact strings. We compare by length,
+ // then lexicographically. This ordering makes it easier to reduce the
+ // set of strings in SimplifyStringSet.
+ struct LengthThenLex {
+ bool operator()(const std::string& a, const std::string& b) const {
+ return (a.size() < b.size()) || (a.size() == b.size() && a < b);
+ }
+ };
+
class Info;
+ using SSet = std::set<std::string, LengthThenLex>;
+ using SSIter = SSet::iterator;
+ using ConstSSIter = SSet::const_iterator;
+
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
@@ -77,12 +127,21 @@ class Prefilter {
static Prefilter* FromString(const std::string& str);
- static Prefilter* OrStrings(std::set<std::string>* ss);
+ static Prefilter* OrStrings(SSet* ss);
static Info* BuildInfo(Regexp* re);
Prefilter* Simplify();
+ // Removes redundant strings from the set. A string is redundant if
+ // any of the other strings appear as a substring. The empty string
+ // is a special case, which is ignored.
+ static void SimplifyStringSet(SSet* ss);
+
+ // Adds the cross-product of a and b to dst.
+ // (For each string i in a and j in b, add i+j.)
+ static void CrossProduct(const SSet& a, const SSet& b, SSet* dst);
+
// Kind of Prefilter.
Op op_;
diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc
index fdf4e08..3afb241 100644
--- a/re2/prefilter_tree.cc
+++ b/re2/prefilter_tree.cc
@@ -6,16 +6,14 @@
#include <stddef.h>
#include <algorithm>
-#include <map>
+#include <cmath>
#include <memory>
-#include <set>
#include <string>
#include <utility>
#include <vector>
-#include "util/util.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
@@ -36,9 +34,6 @@ PrefilterTree::PrefilterTree(int min_atom_len)
PrefilterTree::~PrefilterTree() {
for (size_t i = 0; i < prefilter_vec_.size(); i++)
delete prefilter_vec_[i];
-
- for (size_t i = 0; i < entries_.size(); i++)
- delete entries_[i].parents;
}
void PrefilterTree::Add(Prefilter* prefilter) {
@@ -67,65 +62,18 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
compiled_ = true;
- // TODO(junyer): Use std::unordered_set<Prefilter*> instead?
- NodeMap nodes;
+ NodeSet nodes;
AssignUniqueIds(&nodes, atom_vec);
-
- // Identify nodes that are too common among prefilters and are
- // triggering too many parents. Then get rid of them if possible.
- // Note that getting rid of a prefilter node simply means they are
- // no longer necessary for their parent to trigger; that is, we do
- // not miss out on any regexps triggering by getting rid of a
- // prefilter node.
- for (size_t i = 0; i < entries_.size(); i++) {
- StdIntMap* parents = entries_[i].parents;
- if (parents->size() > 8) {
- // This one triggers too many things. If all the parents are AND
- // nodes and have other things guarding them, then get rid of
- // this trigger. TODO(vsri): Adjust the threshold appropriately,
- // make it a function of total number of nodes?
- bool have_other_guard = true;
- for (StdIntMap::iterator it = parents->begin();
- it != parents->end(); ++it) {
- have_other_guard = have_other_guard &&
- (entries_[it->first].propagate_up_at_count > 1);
- }
-
- if (have_other_guard) {
- for (StdIntMap::iterator it = parents->begin();
- it != parents->end(); ++it)
- entries_[it->first].propagate_up_at_count -= 1;
-
- parents->clear(); // Forget the parents
- }
- }
- }
-
if (ExtraDebug)
PrintDebugInfo(&nodes);
}
-Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
- std::string node_string = NodeString(node);
- NodeMap::iterator iter = nodes->find(node_string);
- if (iter == nodes->end())
- return NULL;
- return (*iter).second;
-}
-
-std::string PrefilterTree::NodeString(Prefilter* node) const {
- // Adding the operation disambiguates AND/OR/atom nodes.
- std::string s = StringPrintf("%d", node->op()) + ":";
- if (node->op() == Prefilter::ATOM) {
- s += node->atom();
- } else {
- for (size_t i = 0; i < node->subs()->size(); i++) {
- if (i > 0)
- s += ',';
- s += StringPrintf("%d", (*node->subs())[i]->unique_id());
- }
+Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
+ NodeSet::const_iterator iter = nodes->find(node);
+ if (iter != nodes->end()) {
+ return *iter;
}
- return s;
+ return NULL;
}
bool PrefilterTree::KeepNode(Prefilter* node) const {
@@ -165,7 +113,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const {
}
}
-void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
+void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
std::vector<std::string>* atom_vec) {
atom_vec->clear();
@@ -205,9 +153,9 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
- // Any further nodes that have the same node string
+ // Any further nodes that have the same atom/subs
// will find this node as the canonical node.
- nodes->emplace(NodeString(node), node);
+ nodes->emplace(node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
@@ -217,65 +165,42 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
node->set_unique_id(canonical->unique_id());
}
}
- entries_.resize(nodes->size());
-
- // Create parent StdIntMap for the entries.
- for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
- Prefilter* prefilter = v[i];
- if (prefilter == NULL)
- continue;
-
- if (CanonicalNode(nodes, prefilter) != prefilter)
- continue;
-
- Entry* entry = &entries_[prefilter->unique_id()];
- entry->parents = new StdIntMap();
- }
+ entries_.resize(unique_id);
// Fill the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
-
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
-
- Entry* entry = &entries_[prefilter->unique_id()];
-
+ int id = prefilter->unique_id();
switch (prefilter->op()) {
default:
- case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
- entry->propagate_up_at_count = 1;
+ entries_[id].propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
- std::set<int> uniq_child;
+ // For each child, we append our id to the child's list of
+ // parent ids... unless we happen to have done so already.
+ // The number of appends is the number of unique children,
+ // which allows correct upward propagation from AND nodes.
+ int up_count = 0;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
- Prefilter* child = (*prefilter->subs())[j];
- Prefilter* canonical = CanonicalNode(nodes, child);
- if (canonical == NULL) {
- LOG(DFATAL) << "Null canonical node";
- return;
- }
- int child_id = canonical->unique_id();
- uniq_child.insert(child_id);
- // To the child, we want to add to parent indices.
- Entry* child_entry = &entries_[child_id];
- if (child_entry->parents->find(prefilter->unique_id()) ==
- child_entry->parents->end()) {
- (*child_entry->parents)[prefilter->unique_id()] = 1;
+ int child_id = (*prefilter->subs())[j]->unique_id();
+ std::vector<int>& parents = entries_[child_id].parents;
+ if (parents.empty() || parents.back() != id) {
+ parents.push_back(id);
+ up_count++;
}
}
- entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
- ? static_cast<int>(uniq_child.size())
- : 1;
-
+ entries_[id].propagate_up_at_count =
+ prefilter->op() == Prefilter::AND ? up_count : 1;
break;
}
}
@@ -290,6 +215,52 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
Entry* entry = &entries_[id];
entry->regexps.push_back(static_cast<int>(i));
}
+
+ // Lastly, using probability-based heuristics, we identify nodes
+ // that trigger too many parents and then we try to prune edges.
+ // We use logarithms below to avoid the likelihood of underflow.
+ double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
+ // Hoisted this above the loop so that we don't thrash the heap.
+ std::vector<std::pair<size_t, int>> entries_by_num_edges;
+ for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
+ Prefilter* prefilter = v[i];
+ // Pruning applies only to AND nodes because it "just" reduces
+ // precision; applied to OR nodes, it would break correctness.
+ if (prefilter == NULL || prefilter->op() != Prefilter::AND)
+ continue;
+ if (CanonicalNode(nodes, prefilter) != prefilter)
+ continue;
+ int id = prefilter->unique_id();
+
+ // Sort the current node's children by the numbers of parents.
+ entries_by_num_edges.clear();
+ for (size_t j = 0; j < prefilter->subs()->size(); j++) {
+ int child_id = (*prefilter->subs())[j]->unique_id();
+ const std::vector<int>& parents = entries_[child_id].parents;
+ entries_by_num_edges.emplace_back(parents.size(), child_id);
+ }
+ std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
+
+ // A running estimate of how many regexps will be triggered by
+ // pruning the remaining children's edges to the current node.
+ // Our nominal target is one, so the threshold is log(1) == 0;
+ // pruning occurs iff the child has more than nine edges left.
+ double log_num_triggered = log_num_regexps;
+ for (const auto& pair : entries_by_num_edges) {
+ int child_id = pair.second;
+ std::vector<int>& parents = entries_[child_id].parents;
+ if (log_num_triggered > 0.) {
+ log_num_triggered += std::log(parents.size());
+ log_num_triggered -= log_num_regexps;
+ } else if (parents.size() > 9) {
+ auto it = std::find(parents.begin(), parents.end(), id);
+ if (it != parents.end()) {
+ parents.erase(it);
+ entries_[id].propagate_up_at_count--;
+ }
+ }
+ }
+ }
}
// Functions for triggering during search.
@@ -313,7 +284,7 @@ void PrefilterTree::RegexpsGivenStrings(
for (size_t j = 0; j < matched_atoms.size(); j++)
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
PropagateMatch(matched_atom_ids, &regexps_map);
- for (IntMap::iterator it = regexps_map.begin();
+ for (IntMap::const_iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
@@ -329,17 +300,14 @@ void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
- for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
+ for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
regexps->set(entry.regexps[i], 1);
int c;
// Pass trigger up to parents.
- for (StdIntMap::iterator it = entry.parents->begin();
- it != entry.parents->end();
- ++it) {
- int j = it->first;
+ for (int j : entry.parents) {
const Entry& parent = entries_[j];
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
@@ -364,23 +332,22 @@ void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}
-void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
+void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
for (size_t i = 0; i < entries_.size(); i++) {
- StdIntMap* parents = entries_[i].parents;
+ const std::vector<int>& parents = entries_[i].parents;
const std::vector<int>& regexps = entries_[i].regexps;
LOG(ERROR) << "EntryId: " << i
- << " N: " << parents->size() << " R: " << regexps.size();
- for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
- LOG(ERROR) << it->first;
+ << " N: " << parents.size() << " R: " << regexps.size();
+ for (int parent : parents)
+ LOG(ERROR) << parent;
}
- LOG(ERROR) << "Map:";
- for (NodeMap::const_iterator iter = nodes->begin();
+ LOG(ERROR) << "Set:";
+ for (NodeSet::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
- LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
- << " Str: " << (*iter).first;
+ LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
}
std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
@@ -395,7 +362,7 @@ std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
node_string += ',';
- node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
+ node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
node_string += ":";
node_string += DebugNodeString((*node->subs())[i]);
}
diff --git a/re2/prefilter_tree.h b/re2/prefilter_tree.h
index 5d73074..71e7a29 100644
--- a/re2/prefilter_tree.h
+++ b/re2/prefilter_tree.h
@@ -16,13 +16,13 @@
// atoms) that the user of this class should use to do the string
// matching.
-#include <map>
#include <string>
#include <vector>
-#include "util/util.h"
+#include "absl/container/flat_hash_set.h"
#include "re2/prefilter.h"
#include "re2/sparse_array.h"
+#include "util/logging.h"
namespace re2 {
@@ -58,9 +58,25 @@ class PrefilterTree {
void PrintPrefilter(int regexpid);
private:
- typedef SparseArray<int> IntMap;
- typedef std::map<int, int> StdIntMap;
- typedef std::map<std::string, Prefilter*> NodeMap;
+ using IntMap = SparseArray<int>;
+
+ struct PrefilterHash {
+ size_t operator()(const Prefilter* a) const {
+ DCHECK(a != NULL);
+ return absl::Hash<Prefilter>()(*a);
+ }
+ };
+
+ struct PrefilterEqual {
+ bool operator()(const Prefilter* a, const Prefilter* b) const {
+ DCHECK(a != NULL);
+ DCHECK(b != NULL);
+ return *a == *b;
+ }
+ };
+
+ using NodeSet =
+ absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>;
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
@@ -77,7 +93,7 @@ class PrefilterTree {
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
- StdIntMap* parents;
+ std::vector<int> parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
@@ -90,25 +106,22 @@ class PrefilterTree {
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
- void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
+ void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;
- // Returns the prefilter node that has the same NodeString as this
- // node. For the canonical node, returns node.
- Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
-
- // A string that uniquely identifies the node. Assumes that the
- // children of node has already been assigned unique ids.
- std::string NodeString(Prefilter* node) const;
+ // Returns the prefilter node that has the same atom/subs as this
+ // node. For the canonical node, returns node. Assumes that the
+ // children of node have already been assigned unique ids.
+ Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node);
// Recursively constructs a readable prefilter string.
std::string DebugNodeString(Prefilter* node) const;
// Used for debugging.
- void PrintDebugInfo(NodeMap* nodes);
+ void PrintDebugInfo(NodeSet* nodes);
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
diff --git a/re2/prog.cc b/re2/prog.cc
index a700d35..6cadcfa 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -19,11 +19,10 @@
#include <memory>
#include <utility>
-#include "util/util.h"
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/bitmap256.h"
-#include "re2/stringpiece.h"
namespace re2 {
@@ -74,34 +73,34 @@ void Prog::Inst::InitFail() {
std::string Prog::Inst::Dump() {
switch (opcode()) {
default:
- return StringPrintf("opcode %d", static_cast<int>(opcode()));
+ return absl::StrFormat("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
- return StringPrintf("alt -> %d | %d", out(), out1_);
+ return absl::StrFormat("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
- return StringPrintf("altmatch -> %d | %d", out(), out1_);
+ return absl::StrFormat("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
- return StringPrintf("byte%s [%02x-%02x] %d -> %d",
- foldcase() ? "/i" : "",
- lo_, hi_, hint(), out());
+ return absl::StrFormat("byte%s [%02x-%02x] %d -> %d",
+ foldcase() ? "/i" : "",
+ lo_, hi_, hint(), out());
case kInstCapture:
- return StringPrintf("capture %d -> %d", cap_, out());
+ return absl::StrFormat("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
- return StringPrintf("emptywidth %#x -> %d",
- static_cast<int>(empty_), out());
+ return absl::StrFormat("emptywidth %#x -> %d",
+ static_cast<int>(empty_), out());
case kInstMatch:
- return StringPrintf("match! %d", match_id());
+ return absl::StrFormat("match! %d", match_id());
case kInstNop:
- return StringPrintf("nop -> %d", out());
+ return absl::StrFormat("nop -> %d", out());
case kInstFail:
- return StringPrintf("fail");
+ return absl::StrFormat("fail");
}
}
@@ -143,7 +142,7 @@ static std::string ProgToString(Prog* prog, Workq* q) {
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
- s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
+ s += absl::StrFormat("%d. %s\n", id, ip->Dump());
AddToQueue(q, ip->out());
if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
AddToQueue(q, ip->out1());
@@ -156,9 +155,9 @@ static std::string FlattenedProgToString(Prog* prog, int start) {
for (int id = start; id < prog->size(); id++) {
Prog::Inst* ip = prog->inst(id);
if (ip->last())
- s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
+ s += absl::StrFormat("%d. %s\n", id, ip->Dump());
else
- s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
+ s += absl::StrFormat("%d+ %s\n", id, ip->Dump());
}
return s;
}
@@ -189,7 +188,7 @@ std::string Prog::DumpByteMap() {
while (c < 256-1 && bytemap_[c+1] == b)
c++;
int hi = c;
- map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
+ map += absl::StrFormat("[%02x-%02x] -> %d\n", lo, hi, b);
}
return map;
}
@@ -284,7 +283,7 @@ void Prog::Optimize() {
}
}
-uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
+uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) {
int flags = 0;
// ^ and \A
@@ -511,7 +510,7 @@ void Prog::ComputeByteMap() {
builder.Build(bytemap_, &bytemap_range_);
- if (0) { // For debugging, use trivial bytemap.
+ if ((0)) { // For debugging, use trivial bytemap.
LOG(ERROR) << "Using trivial bytemap.";
for (int i = 0; i < 256; i++)
bytemap_[i] = static_cast<uint8_t>(i);
@@ -813,7 +812,7 @@ void Prog::EmitList(int root, SparseArray<int>* rootmap,
flat->back().set_opcode(kInstAltMatch);
flat->back().set_out(static_cast<int>(flat->size()));
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
- FALLTHROUGH_INTENDED;
+ ABSL_FALLTHROUGH_INTENDED;
case kInstAlt:
stk->push_back(ip->out1());
diff --git a/re2/prog.h b/re2/prog.h
index 4af012a..41923f3 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -11,12 +11,12 @@
#include <stdint.h>
#include <functional>
-#include <mutex>
#include <string>
#include <vector>
#include <type_traits>
-#include "util/util.h"
+#include "absl/base/call_once.h"
+#include "absl/strings/string_view.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/re2.h"
@@ -249,7 +249,7 @@ class Prog {
// Returns the set of kEmpty flags that are in effect at
// position p within context.
- static uint32_t EmptyFlags(const StringPiece& context, const char* p);
+ static uint32_t EmptyFlags(absl::string_view context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
@@ -274,15 +274,15 @@ class Prog {
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
- // Matching text == StringPiece(NULL, 0) is treated as any other empty
+ // Matching text == absl::string_view() is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
- bool SearchNFA(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
+ bool SearchNFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
@@ -290,8 +290,8 @@ class Prog {
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
- bool SearchDFA(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind, StringPiece* match0,
+ bool SearchDFA(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match0,
bool* failed, SparseSet* matches);
// The callback issued after building each DFA state with BuildEntireDFA().
@@ -321,16 +321,16 @@ class Prog {
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
- bool SearchOnePass(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
+ bool SearchOnePass(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the list count and the text size.
bool CanBitState() { return list_heads_.data() != NULL; }
- bool SearchBitState(const StringPiece& text, const StringPiece& context,
- Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
+ bool SearchBitState(absl::string_view text, absl::string_view context,
+ Anchor anchor, MatchKind kind, absl::string_view* match,
+ int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
@@ -340,10 +340,9 @@ class Prog {
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
- bool UnsafeSearchBacktrack(const StringPiece& text,
- const StringPiece& context,
+ bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
Anchor anchor, MatchKind kind,
- StringPiece* match, int nmatch);
+ absl::string_view* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
@@ -361,7 +360,6 @@ class Prog {
// Returns true on success, false on error.
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
- // EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
@@ -445,8 +443,8 @@ class Prog {
uint8_t bytemap_[256]; // map from input bytes to byte classes
- std::once_flag dfa_first_once_;
- std::once_flag dfa_longest_once_;
+ absl::once_flag dfa_first_once_;
+ absl::once_flag dfa_longest_once_;
Prog(const Prog&) = delete;
Prog& operator=(const Prog&) = delete;
@@ -456,10 +454,10 @@ class Prog {
// that don't allow comparisons between different objects - not even if
// those objects are views into the same string! Thus, we provide these
// conversion functions for convenience.
-static inline const char* BeginPtr(const StringPiece& s) {
+static inline const char* BeginPtr(absl::string_view s) {
return s.data();
}
-static inline const char* EndPtr(const StringPiece& s) {
+static inline const char* EndPtr(absl::string_view s) {
return s.data() + s.size();
}
diff --git a/re2/re2.cc b/re2/re2.cc
index c027133..bc713cf 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -21,12 +21,14 @@
#include <algorithm>
#include <atomic>
#include <iterator>
-#include <mutex>
#include <string>
#include <utility>
#include <vector>
-#include "util/util.h"
+#include "absl/base/macros.h"
+#include "absl/container/fixed_array.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
@@ -36,6 +38,13 @@
namespace re2 {
+// Controls the maximum count permitted by GlobalReplace(); -1 is unlimited.
+static int maximum_global_replace_count = -1;
+
+void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) {
+ maximum_global_replace_count = i;
+}
+
// Maximum number of args we can set
static const int kMaxArgs = 16;
static const int kVecSize = 1+kMaxArgs;
@@ -43,11 +52,11 @@ static const int kVecSize = 1+kMaxArgs;
const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
RE2::Options::Options(RE2::CannedOptions opt)
- : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
+ : max_mem_(kDefaultMaxMem),
+ encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
posix_syntax_(opt == RE2::POSIX),
longest_match_(opt == RE2::POSIX),
log_errors_(opt != RE2::Quiet),
- max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
dot_nl_(false),
@@ -58,11 +67,30 @@ RE2::Options::Options(RE2::CannedOptions opt)
one_line_(false) {
}
-// static empty objects for use as const references.
-// To avoid global constructors, allocated in RE2::Init().
-static const std::string* empty_string;
-static const std::map<std::string, int>* empty_named_groups;
-static const std::map<int, std::string>* empty_group_names;
+// Empty objects for use as const references.
+// Statically allocating the storage and then
+// lazily constructing the objects (in a once
+// in RE2::Init()) avoids global constructors
+// and the false positives (thanks, Valgrind)
+// about memory leaks at program termination.
+struct EmptyStorage {
+ std::string empty_string;
+ std::map<std::string, int> empty_named_groups;
+ std::map<int, std::string> empty_group_names;
+};
+alignas(EmptyStorage) static char empty_storage[sizeof(EmptyStorage)];
+
+static inline std::string* empty_string() {
+ return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_string;
+}
+
+static inline std::map<std::string, int>* empty_named_groups() {
+ return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_named_groups;
+}
+
+static inline std::map<int, std::string>* empty_group_names() {
+ return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_group_names;
+}
// Converts from Regexp error code to RE2 error code.
// Maybe some day they will diverge. In any event, this
@@ -103,7 +131,7 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
return RE2::ErrorInternal;
}
-static std::string trunc(const StringPiece& pattern) {
+static std::string trunc(absl::string_view pattern) {
if (pattern.size() < 100)
return std::string(pattern);
return std::string(pattern.substr(0, 100)) + "...";
@@ -118,11 +146,11 @@ RE2::RE2(const std::string& pattern) {
Init(pattern, DefaultOptions);
}
-RE2::RE2(const StringPiece& pattern) {
+RE2::RE2(absl::string_view pattern) {
Init(pattern, DefaultOptions);
}
-RE2::RE2(const StringPiece& pattern, const Options& options) {
+RE2::RE2(absl::string_view pattern, const Options& options) {
Init(pattern, options);
}
@@ -170,26 +198,26 @@ int RE2::Options::ParseFlags() const {
return flags;
}
-void RE2::Init(const StringPiece& pattern, const Options& options) {
- static std::once_flag empty_once;
- std::call_once(empty_once, []() {
- empty_string = new std::string;
- empty_named_groups = new std::map<std::string, int>;
- empty_group_names = new std::map<int, std::string>;
+void RE2::Init(absl::string_view pattern, const Options& options) {
+ static absl::once_flag empty_once;
+ absl::call_once(empty_once, []() {
+ (void) new (empty_storage) EmptyStorage;
});
- pattern_.assign(pattern.data(), pattern.size());
+ pattern_ = new std::string(pattern);
options_.Copy(options);
entire_regexp_ = NULL;
- error_ = empty_string;
- error_code_ = NoError;
- error_arg_.clear();
- prefix_.clear();
- prefix_foldcase_ = false;
suffix_regexp_ = NULL;
- prog_ = NULL;
+ error_ = empty_string();
+ error_arg_ = empty_string();
+
num_captures_ = -1;
+ error_code_ = NoError;
+ longest_match_ = options_.longest_match();
is_one_pass_ = false;
+ prefix_foldcase_ = false;
+ prefix_.clear();
+ prog_ = NULL;
rprog_ = NULL;
named_groups_ = NULL;
@@ -197,25 +225,29 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
RegexpStatus status;
entire_regexp_ = Regexp::Parse(
- pattern_,
+ *pattern_,
static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
&status);
if (entire_regexp_ == NULL) {
if (options_.log_errors()) {
- LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
+ LOG(ERROR) << "Error parsing '" << trunc(*pattern_) << "': "
<< status.Text();
}
error_ = new std::string(status.Text());
error_code_ = RegexpErrorToRE2(status.code());
- error_arg_ = std::string(status.error_arg());
+ error_arg_ = new std::string(status.error_arg());
return;
}
+ bool foldcase;
re2::Regexp* suffix;
- if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
+ if (entire_regexp_->RequiredPrefix(&prefix_, &foldcase, &suffix)) {
+ prefix_foldcase_ = foldcase;
suffix_regexp_ = suffix;
- else
+ }
+ else {
suffix_regexp_ = entire_regexp_->Incref();
+ }
// Two thirds of the memory goes to the forward Prog,
// one third to the reverse prog, because the forward
@@ -223,7 +255,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
if (prog_ == NULL) {
if (options_.log_errors())
- LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
+ LOG(ERROR) << "Error compiling '" << trunc(*pattern_) << "'";
error_ = new std::string("pattern too large - compile failed");
error_code_ = RE2::ErrorPatternTooLarge;
return;
@@ -231,7 +263,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
// We used to compute this lazily, but it's used during the
// typical control flow for a match call, so we now compute
- // it eagerly, which avoids the overhead of std::once_flag.
+ // it eagerly, which avoids the overhead of absl::once_flag.
num_captures_ = suffix_regexp_->NumCaptures();
// Could delay this until the first match call that
@@ -244,12 +276,13 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
// Returns rprog_, computing it if needed.
re2::Prog* RE2::ReverseProg() const {
- std::call_once(rprog_once_, [](const RE2* re) {
+ absl::call_once(rprog_once_, [](const RE2* re) {
re->rprog_ =
re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
if (re->rprog_ == NULL) {
if (re->options_.log_errors())
- LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
+ LOG(ERROR) << "Error reverse compiling '" << trunc(*re->pattern_)
+ << "'";
// We no longer touch error_ and error_code_ because failing to compile
// the reverse Prog is not a showstopper: falling back to NFA execution
// is fine. More importantly, an RE2 object is supposed to be logically
@@ -261,18 +294,21 @@ re2::Prog* RE2::ReverseProg() const {
}
RE2::~RE2() {
+ if (group_names_ != empty_group_names())
+ delete group_names_;
+ if (named_groups_ != empty_named_groups())
+ delete named_groups_;
+ delete rprog_;
+ delete prog_;
+ if (error_arg_ != empty_string())
+ delete error_arg_;
+ if (error_ != empty_string())
+ delete error_;
if (suffix_regexp_)
suffix_regexp_->Decref();
if (entire_regexp_)
entire_regexp_->Decref();
- delete prog_;
- delete rprog_;
- if (error_ != empty_string)
- delete error_;
- if (named_groups_ != NULL && named_groups_ != empty_named_groups)
- delete named_groups_;
- if (group_names_ != NULL && group_names_ != empty_group_names)
- delete group_names_;
+ delete pattern_;
}
int RE2::ProgramSize() const {
@@ -348,39 +384,39 @@ int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
// Returns named_groups_, computing it if needed.
const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
- std::call_once(named_groups_once_, [](const RE2* re) {
+ absl::call_once(named_groups_once_, [](const RE2* re) {
if (re->suffix_regexp_ != NULL)
re->named_groups_ = re->suffix_regexp_->NamedCaptures();
if (re->named_groups_ == NULL)
- re->named_groups_ = empty_named_groups;
+ re->named_groups_ = empty_named_groups();
}, this);
return *named_groups_;
}
// Returns group_names_, computing it if needed.
const std::map<int, std::string>& RE2::CapturingGroupNames() const {
- std::call_once(group_names_once_, [](const RE2* re) {
+ absl::call_once(group_names_once_, [](const RE2* re) {
if (re->suffix_regexp_ != NULL)
re->group_names_ = re->suffix_regexp_->CaptureNames();
if (re->group_names_ == NULL)
- re->group_names_ = empty_group_names;
+ re->group_names_ = empty_group_names();
}, this);
return *group_names_;
}
/***** Convenience interfaces *****/
-bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
+bool RE2::FullMatchN(absl::string_view text, const RE2& re,
const Arg* const args[], int n) {
return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
}
-bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
+bool RE2::PartialMatchN(absl::string_view text, const RE2& re,
const Arg* const args[], int n) {
return re.DoMatch(text, UNANCHORED, NULL, args, n);
}
-bool RE2::ConsumeN(StringPiece* input, const RE2& re,
+bool RE2::ConsumeN(absl::string_view* input, const RE2& re,
const Arg* const args[], int n) {
size_t consumed;
if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
@@ -391,7 +427,7 @@ bool RE2::ConsumeN(StringPiece* input, const RE2& re,
}
}
-bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
+bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re,
const Arg* const args[], int n) {
size_t consumed;
if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
@@ -404,12 +440,12 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
bool RE2::Replace(std::string* str,
const RE2& re,
- const StringPiece& rewrite) {
- StringPiece vec[kVecSize];
+ absl::string_view rewrite) {
+ absl::string_view vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
return false;
- if (nvec > static_cast<int>(arraysize(vec)))
+ if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
return false;
if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
return false;
@@ -426,12 +462,12 @@ bool RE2::Replace(std::string* str,
int RE2::GlobalReplace(std::string* str,
const RE2& re,
- const StringPiece& rewrite) {
- StringPiece vec[kVecSize];
+ absl::string_view rewrite) {
+ absl::string_view vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
return false;
- if (nvec > static_cast<int>(arraysize(vec)))
+ if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
return false;
const char* p = str->data();
@@ -439,13 +475,10 @@ int RE2::GlobalReplace(std::string* str,
const char* lastend = NULL;
std::string out;
int count = 0;
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
- // Iterate just once when fuzzing. Otherwise, we easily get bogged down
- // and coverage is unlikely to improve despite significant expense.
- while (p == str->data()) {
-#else
while (p <= ep) {
-#endif
+ if (maximum_global_replace_count != -1 &&
+ count >= maximum_global_replace_count)
+ break;
if (!re.Match(*str, static_cast<size_t>(p - str->data()),
str->size(), UNANCHORED, vec, nvec))
break;
@@ -497,15 +530,15 @@ int RE2::GlobalReplace(std::string* str,
return count;
}
-bool RE2::Extract(const StringPiece& text,
+bool RE2::Extract(absl::string_view text,
const RE2& re,
- const StringPiece& rewrite,
+ absl::string_view rewrite,
std::string* out) {
- StringPiece vec[kVecSize];
+ absl::string_view vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
return false;
- if (nvec > static_cast<int>(arraysize(vec)))
+ if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
return false;
if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
return false;
@@ -514,7 +547,7 @@ bool RE2::Extract(const StringPiece& text,
return re.Rewrite(out, rewrite, vec, nvec);
}
-std::string RE2::QuoteMeta(const StringPiece& unquoted) {
+std::string RE2::QuoteMeta(absl::string_view unquoted) {
std::string result;
result.reserve(unquoted.size() << 1);
@@ -613,11 +646,11 @@ static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
/***** Actual matching and rewriting code *****/
-bool RE2::Match(const StringPiece& text,
+bool RE2::Match(absl::string_view text,
size_t startpos,
size_t endpos,
Anchor re_anchor,
- StringPiece* submatch,
+ absl::string_view* submatch,
int nsubmatch) const {
if (!ok()) {
if (options_.log_errors())
@@ -634,7 +667,7 @@ bool RE2::Match(const StringPiece& text,
return false;
}
- StringPiece subtext = text;
+ absl::string_view subtext = text;
subtext.remove_prefix(startpos);
subtext.remove_suffix(text.size() - endpos);
@@ -642,8 +675,8 @@ bool RE2::Match(const StringPiece& text,
// Don't ask for the location if we won't use it.
// SearchDFA can do extra optimizations in that case.
- StringPiece match;
- StringPiece* matchp = &match;
+ absl::string_view match;
+ absl::string_view* matchp = &match;
if (nsubmatch == 0)
matchp = NULL;
@@ -686,9 +719,8 @@ bool RE2::Match(const StringPiece& text,
}
Prog::Anchor anchor = Prog::kUnanchored;
- Prog::MatchKind kind = Prog::kFirstMatch;
- if (options_.longest_match())
- kind = Prog::kLongestMatch;
+ Prog::MatchKind kind =
+ longest_match_ ? Prog::kLongestMatch : Prog::kFirstMatch;
bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture;
bool can_bit_state = prog_->CanBitState();
@@ -720,7 +752,7 @@ bool RE2::Match(const StringPiece& text,
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
- << "pattern length " << pattern_.size() << ", "
+ << "pattern length " << pattern_->size() << ", "
<< "program size " << prog->size() << ", "
<< "list count " << prog->list_count() << ", "
<< "bytemap range " << prog->bytemap_range();
@@ -740,7 +772,7 @@ bool RE2::Match(const StringPiece& text,
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
- << "pattern length " << pattern_.size() << ", "
+ << "pattern length " << pattern_->size() << ", "
<< "program size " << prog_->size() << ", "
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
@@ -766,7 +798,7 @@ bool RE2::Match(const StringPiece& text,
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
- << "pattern length " << pattern_.size() << ", "
+ << "pattern length " << pattern_->size() << ", "
<< "program size " << prog->size() << ", "
<< "list count " << prog->list_count() << ", "
<< "bytemap range " << prog->bytemap_range();
@@ -809,7 +841,7 @@ bool RE2::Match(const StringPiece& text,
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
- << "pattern length " << pattern_.size() << ", "
+ << "pattern length " << pattern_->size() << ", "
<< "program size " << prog_->size() << ", "
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
@@ -827,7 +859,7 @@ bool RE2::Match(const StringPiece& text,
if (ncap == 1)
submatch[0] = match;
} else {
- StringPiece subtext1;
+ absl::string_view subtext1;
if (skipped_test) {
// DFA ran out of memory or was skipped:
// need to search in entire original text.
@@ -865,17 +897,17 @@ bool RE2::Match(const StringPiece& text,
// Adjust overall match for required prefix that we stripped off.
if (prefixlen > 0 && nsubmatch > 0)
- submatch[0] = StringPiece(submatch[0].data() - prefixlen,
- submatch[0].size() + prefixlen);
+ submatch[0] = absl::string_view(submatch[0].data() - prefixlen,
+ submatch[0].size() + prefixlen);
// Zero submatches that don't exist in the regexp.
for (int i = ncap; i < nsubmatch; i++)
- submatch[i] = StringPiece();
+ submatch[i] = absl::string_view();
return true;
}
-// Internal matcher - like Match() but takes Args not StringPieces.
-bool RE2::DoMatch(const StringPiece& text,
+// Internal matcher - like Match() but takes Args not string_views.
+bool RE2::DoMatch(absl::string_view text,
Anchor re_anchor,
size_t* consumed,
const Arg* const* args,
@@ -898,19 +930,10 @@ bool RE2::DoMatch(const StringPiece& text,
else
nvec = n+1;
- StringPiece* vec;
- StringPiece stkvec[kVecSize];
- StringPiece* heapvec = NULL;
-
- if (nvec <= static_cast<int>(arraysize(stkvec))) {
- vec = stkvec;
- } else {
- vec = new StringPiece[nvec];
- heapvec = vec;
- }
+ absl::FixedArray<absl::string_view, kVecSize> vec_storage(nvec);
+ absl::string_view* vec = vec_storage.data();
if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
- delete[] heapvec;
return false;
}
@@ -919,27 +942,24 @@ bool RE2::DoMatch(const StringPiece& text,
if (n == 0 || args == NULL) {
// We are not interested in results
- delete[] heapvec;
return true;
}
// If we got here, we must have matched the whole pattern.
for (int i = 0; i < n; i++) {
- const StringPiece& s = vec[i+1];
+ absl::string_view s = vec[i+1];
if (!args[i]->Parse(s.data(), s.size())) {
// TODO: Should we indicate what the error was?
- delete[] heapvec;
return false;
}
}
- delete[] heapvec;
return true;
}
// Checks that the rewrite string is well-formed with respect to this
// regular expression.
-bool RE2::CheckRewriteString(const StringPiece& rewrite,
+bool RE2::CheckRewriteString(absl::string_view rewrite,
std::string* error) const {
int max_token = -1;
for (const char *s = rewrite.data(), *end = s + rewrite.size();
@@ -956,7 +976,7 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite,
if (c == '\\') {
continue;
}
- if (!isdigit(c)) {
+ if (!absl::ascii_isdigit(c)) {
*error = "Rewrite schema error: "
"'\\' must be followed by a digit or '\\'.";
return false;
@@ -968,7 +988,7 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite,
}
if (max_token > NumberOfCapturingGroups()) {
- *error = StringPrintf(
+ *error = absl::StrFormat(
"Rewrite schema requests %d matches, but the regexp only has %d "
"parenthesized subexpressions.",
max_token, NumberOfCapturingGroups());
@@ -979,14 +999,14 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite,
// Returns the maximum submatch needed for the rewrite to be done by Replace().
// E.g. if rewrite == "foo \\2,\\1", returns 2.
-int RE2::MaxSubmatch(const StringPiece& rewrite) {
+int RE2::MaxSubmatch(absl::string_view rewrite) {
int max = 0;
for (const char *s = rewrite.data(), *end = s + rewrite.size();
s < end; s++) {
if (*s == '\\') {
s++;
int c = (s < end) ? *s : -1;
- if (isdigit(c)) {
+ if (absl::ascii_isdigit(c)) {
int n = (c - '0');
if (n > max)
max = n;
@@ -996,11 +1016,11 @@ int RE2::MaxSubmatch(const StringPiece& rewrite) {
return max;
}
-// Append the "rewrite" string, with backslash subsitutions from "vec",
+// Append the "rewrite" string, with backslash substitutions from "vec",
// to string "out".
bool RE2::Rewrite(std::string* out,
- const StringPiece& rewrite,
- const StringPiece* vec,
+ absl::string_view rewrite,
+ const absl::string_view* vec,
int veclen) const {
for (const char *s = rewrite.data(), *end = s + rewrite.size();
s < end; s++) {
@@ -1010,7 +1030,7 @@ bool RE2::Rewrite(std::string* out,
}
s++;
int c = (s < end) ? *s : -1;
- if (isdigit(c)) {
+ if (absl::ascii_isdigit(c)) {
int n = (c - '0');
if (n >= veclen) {
if (options_.log_errors()) {
@@ -1019,7 +1039,7 @@ bool RE2::Rewrite(std::string* out,
}
return false;
}
- StringPiece snip = vec[n];
+ absl::string_view snip = vec[n];
if (!snip.empty())
out->append(snip.data(), snip.size());
} else if (c == '\\') {
@@ -1051,9 +1071,9 @@ bool Parse(const char* str, size_t n, std::string* dest) {
}
template <>
-bool Parse(const char* str, size_t n, StringPiece* dest) {
+bool Parse(const char* str, size_t n, absl::string_view* dest) {
if (dest == NULL) return true;
- *dest = StringPiece(str, n);
+ *dest = absl::string_view(str, n);
return true;
}
@@ -1091,13 +1111,13 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
size_t* np, bool accept_spaces) {
size_t n = *np;
if (n == 0) return "";
- if (n > 0 && isspace(*str)) {
+ if (n > 0 && absl::ascii_isspace(*str)) {
// We are less forgiving than the strtoxxx() routines and do not
// allow leading spaces. We do allow leading spaces for floats.
if (!accept_spaces) {
return "";
}
- while (n > 0 && isspace(*str)) {
+ while (n > 0 && absl::ascii_isspace(*str)) {
n--;
str++;
}
diff --git a/re2/re2.h b/re2/re2.h
index df32ce3..68fbed1 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -66,17 +66,17 @@
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
-// MATCHING WITH SUBSTRING EXTRACTION:
+// SUBMATCH EXTRACTION:
//
-// You can supply extra pointer arguments to extract matched substrings.
+// You can supply extra pointer arguments to extract submatches.
// On match failure, none of the pointees will have been modified.
-// On match success, the substrings will be converted (as necessary) and
+// On match success, the submatches will be converted (as necessary) and
// their values will be assigned to their pointees until all conversions
// have succeeded or one conversion has failed.
// On conversion failure, the pointees will be in an indeterminate state
// because the caller has no way of knowing which conversion failed.
-// However, conversion cannot fail for types like string and StringPiece
-// that do not inspect the substring contents. Hence, in the common case
+// However, conversion cannot fail for types like string and string_view
+// that do not inspect the submatch contents. Hence, in the common case
// where all of the pointees are of such types, failure is always due to
// match failure and thus none of the pointees will have been modified.
//
@@ -85,6 +85,11 @@
// std::string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
+// Example: extracts "ruby" into "s" and no value into "i"
+// absl::optional<int> i;
+// std::string s;
+// CHECK(RE2::FullMatch("ruby", "(\\w+)(?::(\\d+))?", &s, &i));
+//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
@@ -100,10 +105,10 @@
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
-// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
+// NOTE(rsc): Asking for submatches slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
-// than PCRE), as do matches without substring extraction.
+// than PCRE), as do matches without submatch extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
@@ -140,12 +145,12 @@
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
-// them as they match. This requires use of the "StringPiece" type,
+// them as they match. This requires use of the string_view type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
-// std::string contents = ...; // Fill string somehow
-// StringPiece input(contents); // Wrap a StringPiece around it
+// std::string contents = ...; // Fill string somehow
+// absl::string_view input(contents); // Wrap a string_view around it
//
// std::string var;
// int value;
@@ -206,7 +211,6 @@
#include <stdint.h>
#include <algorithm>
#include <map>
-#include <mutex>
#include <string>
#include <type_traits>
#include <vector>
@@ -215,6 +219,9 @@
#include <TargetConditionals.h>
#endif
+#include "absl/base/call_once.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
#include "re2/stringpiece.h"
namespace re2 {
@@ -273,22 +280,34 @@ class RE2 {
// Need to have the const char* and const std::string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
- // Otherwise the StringPiece form would be sufficient.
-#ifndef SWIG
+ // Otherwise the absl::string_view form would be sufficient.
RE2(const char* pattern);
RE2(const std::string& pattern);
-#endif
- RE2(const StringPiece& pattern);
- RE2(const StringPiece& pattern, const Options& options);
+ RE2(absl::string_view pattern);
+ RE2(absl::string_view pattern, const Options& options);
~RE2();
+ // Not copyable.
+ // RE2 objects are expensive. You should probably use std::shared_ptr<RE2>
+ // instead. If you really must copy, RE2(first.pattern(), first.options())
+ // effectively does so: it produces a second object that mimics the first.
+ RE2(const RE2&) = delete;
+ RE2& operator=(const RE2&) = delete;
+ // Not movable.
+ // RE2 objects are thread-safe and logically immutable. You should probably
+ // use std::unique_ptr<RE2> instead. Otherwise, consider std::deque<RE2> if
+ // direct emplacement into a container is desired. If you really must move,
+ // be prepared to submit a design document along with your feature request.
+ RE2(RE2&&) = delete;
+ RE2& operator=(RE2&&) = delete;
+
// Returns whether RE2 was created properly.
bool ok() const { return error_code() == NoError; }
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
- const std::string& pattern() const { return pattern_; }
+ const std::string& pattern() const { return *pattern_; }
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
@@ -300,7 +319,7 @@ class RE2 {
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
- const std::string& error_arg() const { return error_arg_; }
+ const std::string& error_arg() const { return *error_arg_; }
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
@@ -324,16 +343,15 @@ class RE2 {
// the functions whose names are the prefix before the 'N'. It is sometimes
// useful to invoke them directly, but the syntax is awkward, so the 'N'-less
// versions should be preferred.
- static bool FullMatchN(const StringPiece& text, const RE2& re,
+ static bool FullMatchN(absl::string_view text, const RE2& re,
const Arg* const args[], int n);
- static bool PartialMatchN(const StringPiece& text, const RE2& re,
+ static bool PartialMatchN(absl::string_view text, const RE2& re,
const Arg* const args[], int n);
- static bool ConsumeN(StringPiece* input, const RE2& re,
+ static bool ConsumeN(absl::string_view* input, const RE2& re,
const Arg* const args[], int n);
- static bool FindAndConsumeN(StringPiece* input, const RE2& re,
+ static bool FindAndConsumeN(absl::string_view* input, const RE2& re,
const Arg* const args[], int n);
-#ifndef SWIG
private:
template <typename F, typename SP>
static inline bool Apply(F f, SP sp, const RE2& re) {
@@ -363,10 +381,11 @@ class RE2 {
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
- // std::string (matched piece is copied to string)
- // StringPiece (StringPiece is mutated to point to matched piece)
- // T (where "bool T::ParseFrom(const char*, size_t)" exists)
- // (void*)NULL (the corresponding matched sub-pattern is not copied)
+ // std::string (matched piece is copied to string)
+ // absl::string_view (string_view is mutated to point to matched piece)
+ // absl::optional<T> (T is a supported numeric or string type as above)
+ // T ("bool T::ParseFrom(const char*, size_t)" must exist)
+ // (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "re" fully - from the beginning to the end of "text".
@@ -378,13 +397,16 @@ class RE2 {
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
- // matched string is assigned the empty string. Therefore, the
- // following will return false (because the empty string is not a
- // valid number):
+ // matched string is assigned the null string. Therefore, the
+ // following returns false because the null string - absence of
+ // a string (not even the empty string) - is not a valid number:
+ //
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
+ //
+ // Use absl::optional<int> instead to handle this case correctly.
template <typename... A>
- static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
+ static bool FullMatch(absl::string_view text, const RE2& re, A&&... a) {
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
}
@@ -400,7 +422,7 @@ class RE2 {
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
template <typename... A>
- static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
+ static bool PartialMatch(absl::string_view text, const RE2& re, A&&... a) {
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
}
@@ -418,7 +440,7 @@ class RE2 {
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
template <typename... A>
- static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
+ static bool Consume(absl::string_view* input, const RE2& re, A&&... a) {
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
}
@@ -436,10 +458,9 @@ class RE2 {
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
template <typename... A>
- static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
+ static bool FindAndConsume(absl::string_view* input, const RE2& re, A&&... a) {
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
}
-#endif
// Replace the first match of "re" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
@@ -456,7 +477,7 @@ class RE2 {
// false otherwise.
static bool Replace(std::string* str,
const RE2& re,
- const StringPiece& rewrite);
+ absl::string_view rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
@@ -473,7 +494,7 @@ class RE2 {
// Returns the number of replacements made.
static int GlobalReplace(std::string* str,
const RE2& re,
- const StringPiece& rewrite);
+ absl::string_view rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
@@ -483,9 +504,9 @@ class RE2 {
// successfully; if no match occurs, the string is left unaffected.
//
// REQUIRES: "text" must not alias any part of "*out".
- static bool Extract(const StringPiece& text,
+ static bool Extract(absl::string_view text,
const RE2& re,
- const StringPiece& rewrite,
+ absl::string_view rewrite,
std::string* out);
// Escapes all potentially meaningful regexp characters in
@@ -494,7 +515,7 @@ class RE2 {
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
- static std::string QuoteMeta(const StringPiece& unquoted);
+ static std::string QuoteMeta(absl::string_view unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
@@ -522,7 +543,7 @@ class RE2 {
ANCHOR_BOTH // Anchor at start and end
};
- // Return the number of capturing subpatterns, or -1 if the
+ // Return the number of capturing sub-patterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const { return num_captures_; }
@@ -555,15 +576,15 @@ class RE2 {
// Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
- // Passing text == StringPiece(NULL, 0) will be handled like any other
+ // Passing text == absl::string_view() will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, submatch[i].data() == NULL.
- bool Match(const StringPiece& text,
+ bool Match(absl::string_view text,
size_t startpos,
size_t endpos,
Anchor re_anchor,
- StringPiece* submatch,
+ absl::string_view* submatch,
int nsubmatch) const;
// Check that the given rewrite string is suitable for use with this
@@ -574,21 +595,21 @@ class RE2 {
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
- bool CheckRewriteString(const StringPiece& rewrite,
+ bool CheckRewriteString(absl::string_view rewrite,
std::string* error) const;
// Returns the maximum submatch needed for the rewrite to be done by
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
- static int MaxSubmatch(const StringPiece& rewrite);
+ static int MaxSubmatch(absl::string_view rewrite);
- // Append the "rewrite" string, with backslash subsitutions from "vec",
+ // Append the "rewrite" string, with backslash substitutions from "vec",
// to string "out".
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be successful.
bool Rewrite(std::string* out,
- const StringPiece& rewrite,
- const StringPiece* vec,
+ absl::string_view rewrite,
+ const absl::string_view* vec,
int veclen) const;
// Constructor options
@@ -653,11 +674,11 @@ class RE2 {
};
Options() :
+ max_mem_(kDefaultMaxMem),
encoding_(EncodingUTF8),
posix_syntax_(false),
longest_match_(false),
log_errors_(true),
- max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
dot_nl_(false),
@@ -670,6 +691,9 @@ class RE2 {
/*implicit*/ Options(CannedOptions);
+ int64_t max_mem() const { return max_mem_; }
+ void set_max_mem(int64_t m) { max_mem_ = m; }
+
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
@@ -682,9 +706,6 @@ class RE2 {
bool log_errors() const { return log_errors_; }
void set_log_errors(bool b) { log_errors_ = b; }
- int64_t max_mem() const { return max_mem_; }
- void set_max_mem(int64_t m) { max_mem_ = m; }
-
bool literal() const { return literal_; }
void set_literal(bool b) { literal_ = b; }
@@ -716,11 +737,11 @@ class RE2 {
int ParseFlags() const;
private:
+ int64_t max_mem_;
Encoding encoding_;
bool posix_syntax_;
bool longest_match_;
bool log_errors_;
- int64_t max_mem_;
bool literal_;
bool never_nl_;
bool dot_nl_;
@@ -742,10 +763,14 @@ class RE2 {
template <typename T>
static Arg Octal(T* ptr);
+ // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited.
+ // FOR FUZZING ONLY.
+ static void FUZZING_ONLY_set_maximum_global_replace_count(int i);
+
private:
- void Init(const StringPiece& pattern, const Options& options);
+ void Init(absl::string_view pattern, const Options& options);
- bool DoMatch(const StringPiece& text,
+ bool DoMatch(absl::string_view text,
Anchor re_anchor,
size_t* consumed,
const Arg* const args[],
@@ -753,18 +778,23 @@ class RE2 {
re2::Prog* ReverseProg() const;
- std::string pattern_; // string regular expression
- Options options_; // option flags
- re2::Regexp* entire_regexp_; // parsed regular expression
- const std::string* error_; // error indicator (or points to empty string)
- ErrorCode error_code_; // error code
- std::string error_arg_; // fragment of regexp showing error
- std::string prefix_; // required prefix (before suffix_regexp_)
- bool prefix_foldcase_; // prefix_ is ASCII case-insensitive
- re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed
- re2::Prog* prog_; // compiled program for regexp
- int num_captures_; // number of capturing groups
- bool is_one_pass_; // can use prog_->SearchOnePass?
+ // First cache line is relatively cold fields.
+ const std::string* pattern_; // string regular expression
+ Options options_; // option flags
+ re2::Regexp* entire_regexp_; // parsed regular expression
+ re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed
+ const std::string* error_; // error indicator (or points to empty string)
+ const std::string* error_arg_; // fragment of regexp showing error (or ditto)
+
+ // Second cache line is relatively hot fields.
+ // These are ordered oddly to pack everything.
+ int num_captures_; // number of capturing groups
+ ErrorCode error_code_ : 29; // error code (29 bits is more than enough)
+ bool longest_match_ : 1; // cached copy of options_.longest_match()
+ bool is_one_pass_ : 1; // can use prog_->SearchOnePass?
+ bool prefix_foldcase_ : 1; // prefix_ is ASCII case-insensitive
+ std::string prefix_; // required prefix (before suffix_regexp_)
+ re2::Prog* prog_; // compiled program for regexp
// Reverse Prog for DFA execution only
mutable re2::Prog* rprog_;
@@ -773,12 +803,9 @@ class RE2 {
// Map from capture indices to names
mutable const std::map<int, std::string>* group_names_;
- mutable std::once_flag rprog_once_;
- mutable std::once_flag named_groups_once_;
- mutable std::once_flag group_names_once_;
-
- RE2(const RE2&) = delete;
- RE2& operator=(const RE2&) = delete;
+ mutable absl::once_flag rprog_once_;
+ mutable absl::once_flag named_groups_once_;
+ mutable absl::once_flag group_names_once_;
};
/***** Implementation details *****/
@@ -789,7 +816,7 @@ namespace re2_internal {
template <typename T> struct Parse3ary : public std::false_type {};
template <> struct Parse3ary<void> : public std::true_type {};
template <> struct Parse3ary<std::string> : public std::true_type {};
-template <> struct Parse3ary<StringPiece> : public std::true_type {};
+template <> struct Parse3ary<absl::string_view> : public std::true_type {};
template <> struct Parse3ary<char> : public std::true_type {};
template <> struct Parse3ary<signed char> : public std::true_type {};
template <> struct Parse3ary<unsigned char> : public std::true_type {};
@@ -813,6 +840,42 @@ template <> struct Parse4ary<unsigned long long> : public std::true_type {};
template <typename T>
bool Parse(const char* str, size_t n, T* dest, int radix);
+// Support absl::optional<T> for all T with a stock parser.
+template <typename T> struct Parse3ary<absl::optional<T>> : public Parse3ary<T> {};
+template <typename T> struct Parse4ary<absl::optional<T>> : public Parse4ary<T> {};
+
+template <typename T>
+bool Parse(const char* str, size_t n, absl::optional<T>* dest) {
+ if (str == NULL) {
+ if (dest != NULL)
+ dest->reset();
+ return true;
+ }
+ T tmp;
+ if (Parse(str, n, &tmp)) {
+ if (dest != NULL)
+ dest->emplace(std::move(tmp));
+ return true;
+ }
+ return false;
+}
+
+template <typename T>
+bool Parse(const char* str, size_t n, absl::optional<T>* dest, int radix) {
+ if (str == NULL) {
+ if (dest != NULL)
+ dest->reset();
+ return true;
+ }
+ T tmp;
+ if (Parse(str, n, &tmp, radix)) {
+ if (dest != NULL)
+ dest->emplace(std::move(tmp));
+ return true;
+ }
+ return false;
+}
+
} // namespace re2_internal
class RE2::Arg {
@@ -908,9 +971,8 @@ inline RE2::Arg RE2::Octal(T* ptr) {
});
}
-#ifndef SWIG
// Silence warnings about missing initializers for members of LazyRE2.
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6
+#if !defined(__clang__) && defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
@@ -940,7 +1002,7 @@ class LazyRE2 {
// Named accessor/initializer:
RE2* get() const {
- std::call_once(once_, &LazyRE2::Init, this);
+ absl::call_once(once_, &LazyRE2::Init, this);
return ptr_;
}
@@ -950,7 +1012,7 @@ class LazyRE2 {
NoArg barrier_against_excess_initializers_;
mutable RE2* ptr_;
- mutable std::once_flag once_;
+ mutable absl::once_flag once_;
private:
static void Init(const LazyRE2* lazy_re2) {
@@ -959,7 +1021,6 @@ class LazyRE2 {
void operator=(const LazyRE2&); // disallowed
};
-#endif
namespace hooks {
diff --git a/re2/regexp.cc b/re2/regexp.cc
index ca1318b..4ea81cf 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -12,16 +12,16 @@
#include <string.h>
#include <algorithm>
#include <map>
-#include <mutex>
#include <string>
#include <vector>
-#include "util/util.h"
+#include "absl/base/call_once.h"
+#include "absl/base/macros.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/synchronization/mutex.h"
#include "util/logging.h"
-#include "util/mutex.h"
#include "util/utf.h"
#include "re2/pod_array.h"
-#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
@@ -74,35 +74,45 @@ bool Regexp::QuickDestroy() {
return false;
}
-// Lazily allocated.
-static Mutex* ref_mutex;
-static std::map<Regexp*, int>* ref_map;
+// Similar to EmptyStorage in re2.cc.
+struct RefStorage {
+ absl::Mutex ref_mutex;
+ absl::flat_hash_map<Regexp*, int> ref_map;
+};
+alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
+
+static inline absl::Mutex* ref_mutex() {
+ return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
+}
+
+static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
+ return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
+}
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
- MutexLock l(ref_mutex);
- return (*ref_map)[this];
+ absl::MutexLock l(ref_mutex());
+ return (*ref_map())[this];
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
- static std::once_flag ref_once;
- std::call_once(ref_once, []() {
- ref_mutex = new Mutex;
- ref_map = new std::map<Regexp*, int>;
+ static absl::once_flag ref_once;
+ absl::call_once(ref_once, []() {
+ (void) new (ref_storage) RefStorage;
});
// Store ref count in overflow map.
- MutexLock l(ref_mutex);
+ absl::MutexLock l(ref_mutex());
if (ref_ == kMaxRef) {
// already overflowed
- (*ref_map)[this]++;
+ (*ref_map())[this]++;
} else {
// overflowing now
- (*ref_map)[this] = kMaxRef;
+ (*ref_map())[this] = kMaxRef;
ref_ = kMaxRef;
}
return this;
@@ -116,13 +126,13 @@ Regexp* Regexp::Incref() {
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
- MutexLock l(ref_mutex);
- int r = (*ref_map)[this] - 1;
+ absl::MutexLock l(ref_mutex());
+ int r = (*ref_map())[this] - 1;
if (r < kMaxRef) {
ref_ = static_cast<uint16_t>(r);
- ref_map->erase(this);
+ ref_map()->erase(this);
} else {
- (*ref_map)[this] = r;
+ (*ref_map())[this] = r;
}
return;
}
@@ -390,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) {
a->max() == b->max();
case kRegexpCapture:
- return a->cap() == b->cap() && a->name() == b->name();
+ if (a->name() == NULL || b->name() == NULL) {
+ // One pointer is null, so the other pointer should also be null.
+ return a->cap() == b->cap() && a->name() == b->name();
+ } else {
+ // Neither pointer is null, so compare the pointees for equality.
+ return a->cap() == b->cap() && *a->name() == *b->name();
+ }
case kRegexpHaveMatch:
return a->match_id() == b->match_id();
@@ -509,7 +525,7 @@ static const char *kErrorStrings[] = {
};
std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
- if (code < 0 || code >= arraysize(kErrorStrings))
+ if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
code = kRegexpInternalError;
return kErrorStrings[code];
}
diff --git a/re2/regexp.h b/re2/regexp.h
index b6446f9..df49894 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -92,10 +92,9 @@
#include <set>
#include <string>
-#include "util/util.h"
+#include "absl/strings/string_view.h"
#include "util/logging.h"
#include "util/utf.h"
-#include "re2/stringpiece.h"
namespace re2 {
@@ -195,10 +194,10 @@ class RegexpStatus {
~RegexpStatus() { delete tmp_; }
void set_code(RegexpStatusCode code) { code_ = code; }
- void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
+ void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; }
void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
RegexpStatusCode code() const { return code_; }
- const StringPiece& error_arg() const { return error_arg_; }
+ absl::string_view error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
@@ -213,9 +212,9 @@ class RegexpStatus {
std::string Text() const;
private:
- RegexpStatusCode code_; // Kind of error
- StringPiece error_arg_; // Piece of regexp containing syntax error.
- std::string* tmp_; // Temporary storage, possibly where error_arg_ is.
+ RegexpStatusCode code_; // Kind of error.
+ absl::string_view error_arg_; // Piece of regexp containing syntax error.
+ std::string* tmp_; // Temporary storage, possibly for error_arg_.
RegexpStatus(const RegexpStatus&) = delete;
RegexpStatus& operator=(const RegexpStatus&) = delete;
@@ -352,7 +351,7 @@ class Regexp {
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
- static Regexp* Parse(const StringPiece& s, ParseFlags flags,
+ static Regexp* Parse(absl::string_view s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
@@ -369,7 +368,7 @@ class Regexp {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
- static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+ static bool SimplifyRegexp(absl::string_view src, ParseFlags flags,
std::string* dst, RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
@@ -467,7 +466,7 @@ class Regexp {
class ParseState;
friend class ParseState;
- friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
+ friend bool ParseCharClass(absl::string_view* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
diff --git a/re2/set.cc b/re2/set.cc
index 1870566..b9c918e 100644
--- a/re2/set.cc
+++ b/re2/set.cc
@@ -9,13 +9,11 @@
#include <memory>
#include <utility>
-#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
-#include "re2/stringpiece.h"
namespace re2 {
@@ -52,7 +50,7 @@ RE2::Set& RE2::Set::operator=(Set&& other) {
return *this;
}
-int RE2::Set::Add(const StringPiece& pattern, std::string* error) {
+int RE2::Set::Add(absl::string_view pattern, std::string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
return -1;
@@ -121,16 +119,16 @@ bool RE2::Set::Compile() {
return prog_ != nullptr;
}
-bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
+bool RE2::Set::Match(absl::string_view text, std::vector<int>* v) const {
return Match(text, v, NULL);
}
-bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
+bool RE2::Set::Match(absl::string_view text, std::vector<int>* v,
ErrorInfo* error_info) const {
if (!compiled_) {
- LOG(DFATAL) << "RE2::Set::Match() called before compiling";
if (error_info != NULL)
error_info->kind = kNotCompiled;
+ LOG(DFATAL) << "RE2::Set::Match() called before compiling";
return false;
}
#ifdef RE2_HAVE_THREAD_LOCAL
@@ -161,9 +159,9 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
}
if (v != NULL) {
if (matches->empty()) {
- LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
if (error_info != NULL)
error_info->kind = kInconsistent;
+ LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
return false;
}
v->assign(matches->begin(), matches->end());
diff --git a/re2/set.h b/re2/set.h
index 8d64f30..3fe419b 100644
--- a/re2/set.h
+++ b/re2/set.h
@@ -10,6 +10,7 @@
#include <utility>
#include <vector>
+#include "absl/strings/string_view.h"
#include "re2/re2.h"
namespace re2 {
@@ -50,7 +51,7 @@ class RE2::Set {
// Indices are assigned in sequential order starting from 0.
// Errors do not increment the index; if error is not NULL, *error will hold
// the error message from the parser.
- int Add(const StringPiece& pattern, std::string* error);
+ int Add(absl::string_view pattern, std::string* error);
// Compiles the set in preparation for matching.
// Returns false if the compiler runs out of memory.
@@ -61,12 +62,12 @@ class RE2::Set {
// Returns true if text matches at least one of the regexps in the set.
// Fills v (if not NULL) with the indices of the matching regexps.
// Callers must not expect v to be sorted.
- bool Match(const StringPiece& text, std::vector<int>* v) const;
+ bool Match(absl::string_view text, std::vector<int>* v) const;
// As above, but populates error_info (if not NULL) when none of the regexps
// in the set matched. This can inform callers when DFA execution fails, for
// example, because they might wish to handle that case differently.
- bool Match(const StringPiece& text, std::vector<int>* v,
+ bool Match(absl::string_view text, std::vector<int>* v,
ErrorInfo* error_info) const;
private:
diff --git a/re2/simplify.cc b/re2/simplify.cc
index 663d5fc..cea100b 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc
@@ -6,9 +6,9 @@
// to use simple extended regular expression features.
// Also sort and simplify character classes.
+#include <algorithm>
#include <string>
-#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
@@ -20,7 +20,7 @@ namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
-bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
std::string* dst, RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
@@ -371,8 +371,8 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
break;
default:
- LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
nre->Decref();
+ LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
return;
}
@@ -432,8 +432,8 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
}
default:
- LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
nre->Decref();
+ LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
return;
}
@@ -580,6 +580,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
return re;
}
+// Returns true if re is an empty-width op.
+static bool IsEmptyOp(Regexp* re) {
+ return (re->op() == kRegexpBeginLine ||
+ re->op() == kRegexpEndLine ||
+ re->op() == kRegexpWordBoundary ||
+ re->op() == kRegexpNoWordBoundary ||
+ re->op() == kRegexpBeginText ||
+ re->op() == kRegexpEndText);
+}
+
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
@@ -588,6 +598,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags f) {
+ // For an empty-width op OR a concatenation or alternation of empty-width
+ // ops, cap the repetition count at 1.
+ if (IsEmptyOp(re) ||
+ ((re->op() == kRegexpConcat ||
+ re->op() == kRegexpAlternate) &&
+ std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
+ min = std::min(min, 1);
+ max = std::min(max, 1);
+ }
+
// x{n,} means at least n matches of x.
if (max == -1) {
// Special case: x{0,} is x*
diff --git a/re2/stringpiece.cc b/re2/stringpiece.cc
deleted file mode 100644
index ef2e287..0000000
--- a/re2/stringpiece.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2004 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "re2/stringpiece.h"
-
-#include <ostream>
-
-#include "util/util.h"
-
-namespace re2 {
-
-const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
-
-StringPiece::size_type StringPiece::copy(char* buf, size_type n,
- size_type pos) const {
- size_type ret = std::min(size_ - pos, n);
- memcpy(buf, data_ + pos, ret);
- return ret;
-}
-
-StringPiece StringPiece::substr(size_type pos, size_type n) const {
- if (pos > size_) pos = size_;
- if (n > size_ - pos) n = size_ - pos;
- return StringPiece(data_ + pos, n);
-}
-
-StringPiece::size_type StringPiece::find(const StringPiece& s,
- size_type pos) const {
- if (pos > size_) return npos;
- const_pointer result = std::search(data_ + pos, data_ + size_,
- s.data_, s.data_ + s.size_);
- size_type xpos = result - data_;
- return xpos + s.size_ <= size_ ? xpos : npos;
-}
-
-StringPiece::size_type StringPiece::find(char c, size_type pos) const {
- if (size_ <= 0 || pos >= size_) return npos;
- const_pointer result = std::find(data_ + pos, data_ + size_, c);
- return result != data_ + size_ ? result - data_ : npos;
-}
-
-StringPiece::size_type StringPiece::rfind(const StringPiece& s,
- size_type pos) const {
- if (size_ < s.size_) return npos;
- if (s.size_ == 0) return std::min(size_, pos);
- const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_;
- const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_);
- return result != last ? result - data_ : npos;
-}
-
-StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
- if (size_ <= 0) return npos;
- for (size_t i = std::min(pos + 1, size_); i != 0;) {
- if (data_[--i] == c) return i;
- }
- return npos;
-}
-
-std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
- o.write(p.data(), p.size());
- return o;
-}
-
-} // namespace re2
diff --git a/re2/stringpiece.h b/re2/stringpiece.h
index 1d9c2d3..e9367bf 100644
--- a/re2/stringpiece.h
+++ b/re2/stringpiece.h
@@ -1,209 +1,17 @@
-// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
+// Copyright 2022 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_STRINGPIECE_H_
#define RE2_STRINGPIECE_H_
-// A string-like object that points to a sized piece of memory.
-//
-// Functions or methods may use const StringPiece& parameters to accept either
-// a "const char*" or a "string" value that will be implicitly converted to
-// a StringPiece. The implicit conversion means that it is often appropriate
-// to include this .h file in other files rather than forward-declaring
-// StringPiece as would be appropriate for most other Google classes.
-//
-// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
-// conversions from "const char*" to "string" and back again.
-//
-//
-// Arghh! I wish C++ literals were "string".
-
-// Doing this simplifies the logic below.
-#ifndef __has_include
-#define __has_include(x) 0
-#endif
-
-#include <stddef.h>
-#include <string.h>
-#include <algorithm>
-#include <iosfwd>
-#include <iterator>
-#include <string>
-#if __has_include(<string_view>) && __cplusplus >= 201703L
-#include <string_view>
-#endif
+#include "absl/strings/string_view.h"
namespace re2 {
-class StringPiece {
- public:
- typedef std::char_traits<char> traits_type;
- typedef char value_type;
- typedef char* pointer;
- typedef const char* const_pointer;
- typedef char& reference;
- typedef const char& const_reference;
- typedef const char* const_iterator;
- typedef const_iterator iterator;
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
- typedef const_reverse_iterator reverse_iterator;
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
- static const size_type npos = static_cast<size_type>(-1);
-
- // We provide non-explicit singleton constructors so users can pass
- // in a "const char*" or a "string" wherever a "StringPiece" is
- // expected.
- StringPiece()
- : data_(NULL), size_(0) {}
-#if __has_include(<string_view>) && __cplusplus >= 201703L
- StringPiece(const std::string_view& str)
- : data_(str.data()), size_(str.size()) {}
-#endif
- StringPiece(const std::string& str)
- : data_(str.data()), size_(str.size()) {}
- StringPiece(const char* str)
- : data_(str), size_(str == NULL ? 0 : strlen(str)) {}
- StringPiece(const char* str, size_type len)
- : data_(str), size_(len) {}
-
- const_iterator begin() const { return data_; }
- const_iterator end() const { return data_ + size_; }
- const_reverse_iterator rbegin() const {
- return const_reverse_iterator(data_ + size_);
- }
- const_reverse_iterator rend() const {
- return const_reverse_iterator(data_);
- }
-
- size_type size() const { return size_; }
- size_type length() const { return size_; }
- bool empty() const { return size_ == 0; }
-
- const_reference operator[](size_type i) const { return data_[i]; }
- const_pointer data() const { return data_; }
-
- void remove_prefix(size_type n) {
- data_ += n;
- size_ -= n;
- }
-
- void remove_suffix(size_type n) {
- size_ -= n;
- }
-
- void set(const char* str) {
- data_ = str;
- size_ = str == NULL ? 0 : strlen(str);
- }
-
- void set(const char* str, size_type len) {
- data_ = str;
- size_ = len;
- }
-
- // Converts to `std::basic_string`.
- template <typename A>
- explicit operator std::basic_string<char, traits_type, A>() const {
- if (!data_) return {};
- return std::basic_string<char, traits_type, A>(data_, size_);
- }
-
- std::string as_string() const {
- return std::string(data_, size_);
- }
-
- // We also define ToString() here, since many other string-like
- // interfaces name the routine that converts to a C++ string
- // "ToString", and it's confusing to have the method that does that
- // for a StringPiece be called "as_string()". We also leave the
- // "as_string()" method defined here for existing code.
- std::string ToString() const {
- return std::string(data_, size_);
- }
-
- void CopyToString(std::string* target) const {
- target->assign(data_, size_);
- }
-
- void AppendToString(std::string* target) const {
- target->append(data_, size_);
- }
-
- size_type copy(char* buf, size_type n, size_type pos = 0) const;
- StringPiece substr(size_type pos = 0, size_type n = npos) const;
-
- int compare(const StringPiece& x) const {
- size_type min_size = std::min(size(), x.size());
- if (min_size > 0) {
- int r = memcmp(data(), x.data(), min_size);
- if (r < 0) return -1;
- if (r > 0) return 1;
- }
- if (size() < x.size()) return -1;
- if (size() > x.size()) return 1;
- return 0;
- }
-
- // Does "this" start with "x"?
- bool starts_with(const StringPiece& x) const {
- return x.empty() ||
- (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
- }
-
- // Does "this" end with "x"?
- bool ends_with(const StringPiece& x) const {
- return x.empty() ||
- (size() >= x.size() &&
- memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
- }
-
- bool contains(const StringPiece& s) const {
- return find(s) != npos;
- }
-
- size_type find(const StringPiece& s, size_type pos = 0) const;
- size_type find(char c, size_type pos = 0) const;
- size_type rfind(const StringPiece& s, size_type pos = npos) const;
- size_type rfind(char c, size_type pos = npos) const;
-
- private:
- const_pointer data_;
- size_type size_;
-};
-
-inline bool operator==(const StringPiece& x, const StringPiece& y) {
- StringPiece::size_type len = x.size();
- if (len != y.size()) return false;
- return x.data() == y.data() || len == 0 ||
- memcmp(x.data(), y.data(), len) == 0;
-}
-
-inline bool operator!=(const StringPiece& x, const StringPiece& y) {
- return !(x == y);
-}
-
-inline bool operator<(const StringPiece& x, const StringPiece& y) {
- StringPiece::size_type min_size = std::min(x.size(), y.size());
- int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
- return (r < 0) || (r == 0 && x.size() < y.size());
-}
-
-inline bool operator>(const StringPiece& x, const StringPiece& y) {
- return y < x;
-}
-
-inline bool operator<=(const StringPiece& x, const StringPiece& y) {
- return !(x > y);
-}
-
-inline bool operator>=(const StringPiece& x, const StringPiece& y) {
- return !(x < y);
-}
-
-// Allow StringPiece to be logged.
-std::ostream& operator<<(std::ostream& o, const StringPiece& p);
+// Until RE2 requires C++17 and uses std::string_view, allow users to
+// continue to #include "re2/stringpiece.h" and use re2::StringPiece.
+using StringPiece = absl::string_view;
} // namespace re2
diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc
index 920a453..90071bb 100644
--- a/re2/testing/backtrack.cc
+++ b/re2/testing/backtrack.cc
@@ -27,7 +27,7 @@
#include <stdint.h>
#include <string.h>
-#include "util/util.h"
+#include "absl/base/macros.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
@@ -55,9 +55,8 @@ class Backtracker {
public:
explicit Backtracker(Prog* prog);
- bool Search(const StringPiece& text, const StringPiece& context,
- bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch);
+ bool Search(absl::string_view text, absl::string_view context, bool anchored,
+ bool longest, absl::string_view* submatch, int nsubmatch);
private:
// Explores from instruction id at string position p looking for a match.
@@ -69,14 +68,14 @@ class Backtracker {
bool Try(int id, const char* p);
// Search parameters
- Prog* prog_; // program being run
- StringPiece text_; // text being searched
- StringPiece context_; // greater context of text being searched
- bool anchored_; // whether search is anchored at text.begin()
- bool longest_; // whether search wants leftmost-longest match
- bool endmatch_; // whether search must end at text.end()
- StringPiece *submatch_; // submatches to fill in
- int nsubmatch_; // # of submatches to fill in
+ Prog* prog_; // program being run
+ absl::string_view text_; // text being searched
+ absl::string_view context_; // greater context of text being searched
+ bool anchored_; // whether search is anchored at text.begin()
+ bool longest_; // whether search wants leftmost-longest match
+ bool endmatch_; // whether search must end at text.end()
+ absl::string_view* submatch_; // submatches to fill in
+ int nsubmatch_; // # of submatches to fill in
// Search state
const char* cap_[64]; // capture registers
@@ -96,9 +95,9 @@ Backtracker::Backtracker(Prog* prog)
}
// Runs a backtracking search.
-bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
+bool Backtracker::Search(absl::string_view text, absl::string_view context,
bool anchored, bool longest,
- StringPiece* submatch, int nsubmatch) {
+ absl::string_view* submatch, int nsubmatch) {
text_ = text;
context_ = context;
if (context_.data() == NULL)
@@ -112,17 +111,17 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
endmatch_ = prog_->anchor_end();
submatch_ = submatch;
nsubmatch_ = nsubmatch;
- CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_)));
+ CHECK_LT(2*nsubmatch_, static_cast<int>(ABSL_ARRAYSIZE(cap_)));
memset(cap_, 0, sizeof cap_);
// We use submatch_[0] for our own bookkeeping,
// so it had better exist.
- StringPiece sp0;
+ absl::string_view sp0;
if (nsubmatch < 1) {
submatch_ = &sp0;
nsubmatch_ = 1;
}
- submatch_[0] = StringPiece();
+ submatch_[0] = absl::string_view();
// Allocate new visited_ bitmap -- size is proportional
// to text, so have to reallocate on each call to Search.
@@ -203,7 +202,7 @@ bool Backtracker::Try(int id, const char* p) {
case kInstCapture:
if (0 <= ip->cap() &&
- ip->cap() < static_cast<int>(arraysize(cap_))) {
+ ip->cap() < static_cast<int>(ABSL_ARRAYSIZE(cap_))) {
// Capture p to register, but save old value.
const char* q = cap_[ip->cap()];
cap_[ip->cap()] = p;
@@ -232,7 +231,7 @@ bool Backtracker::Try(int id, const char* p) {
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
// First match so far - or better match.
for (int i = 0; i < nsubmatch_; i++)
- submatch_[i] = StringPiece(
+ submatch_[i] = absl::string_view(
cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
return true;
@@ -243,16 +242,14 @@ bool Backtracker::Try(int id, const char* p) {
}
// Runs a backtracking search.
-bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
- const StringPiece& context,
- Anchor anchor,
- MatchKind kind,
- StringPiece* match,
+bool Prog::UnsafeSearchBacktrack(absl::string_view text,
+ absl::string_view context, Anchor anchor,
+ MatchKind kind, absl::string_view* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
- StringPiece sp0;
+ absl::string_view sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
diff --git a/re2/testing/charclass_test.cc b/re2/testing/charclass_test.cc
index 9c2a32f..ad95d6c 100644
--- a/re2/testing/charclass_test.cc
+++ b/re2/testing/charclass_test.cc
@@ -6,7 +6,9 @@
#include <stdio.h>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "util/utf.h"
#include "re2/regexp.h"
@@ -88,25 +90,25 @@ static CCTest tests[] = {
template <typename CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
if (t == NULL) {
- printf("\t%s:", desc);
+ absl::PrintF("\t%s:", desc);
} else {
- printf("\n");
- printf("CharClass added: [%s]", desc);
+ absl::PrintF("\n");
+ absl::PrintF("CharClass added: [%s]", desc);
for (int k = 0; t->add[k].lo >= 0; k++)
- printf(" %d-%d", t->add[k].lo, t->add[k].hi);
- printf("\n");
+ absl::PrintF(" %d-%d", t->add[k].lo, t->add[k].hi);
+ absl::PrintF("\n");
if (t->remove >= 0)
- printf("Removed > %d\n", t->remove);
- printf("\twant:");
+ absl::PrintF("Removed > %d\n", t->remove);
+ absl::PrintF("\twant:");
for (int k = 0; t->final[k].lo >= 0; k++)
- printf(" %d-%d", t->final[k].lo, t->final[k].hi);
- printf("\n");
- printf("\thave:");
+ absl::PrintF(" %d-%d", t->final[k].lo, t->final[k].hi);
+ absl::PrintF("\n");
+ absl::PrintF("\thave:");
}
for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
- printf(" %d-%d", it->lo, it->hi);
- printf("\n");
+ absl::PrintF(" %d-%d", it->lo, it->hi);
+ absl::PrintF("\n");
}
bool ShouldContain(CCTest *t, int x) {
@@ -155,7 +157,7 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
}
if (cc->size() != size) {
Broke(desc, t, cc);
- printf("wrong size: want %d have %d\n", size, cc->size());
+ absl::PrintF("wrong size: want %d have %d\n", size, cc->size());
return false;
}
@@ -164,8 +166,8 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
j = Runemax;
if (ShouldContain(t, j) != cc->Contains(j)) {
Broke(desc, t, cc);
- printf("want contains(%d)=%d, got %d\n",
- j, ShouldContain(t, j), cc->Contains(j));
+ absl::PrintF("want contains(%d)=%d, got %d\n",
+ j, ShouldContain(t, j), cc->Contains(j));
return false;
}
}
@@ -177,16 +179,16 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
if (ShouldContain(t, j) == ncc->Contains(j)) {
Broke(desc, t, cc);
Broke("ncc", NULL, ncc);
- printf("want ncc contains(%d)!=%d, got %d\n",
- j, ShouldContain(t, j), ncc->Contains(j));
+ absl::PrintF("want ncc contains(%d)!=%d, got %d\n",
+ j, ShouldContain(t, j), ncc->Contains(j));
Delete(ncc);
return false;
}
if (ncc->size() != Runemax+1 - cc->size()) {
Broke(desc, t, cc);
Broke("ncc", NULL, ncc);
- printf("ncc size should be %d is %d\n",
- Runemax+1 - cc->size(), ncc->size());
+ absl::PrintF("ncc size should be %d is %d\n",
+ Runemax+1 - cc->size(), ncc->size());
Delete(ncc);
return false;
}
@@ -197,7 +199,7 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
TEST(TestCharClassBuilder, Adds) {
int nfail = 0;
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
CharClassBuilder ccb;
CCTest* t = &tests[i];
for (int j = 0; t->add[j].lo >= 0; j++)
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 4718830..f6899d3 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -6,7 +6,8 @@
#include <string>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/prog.h"
@@ -127,7 +128,7 @@ static Test tests[] = {
TEST(TestRegexpCompileToProg, Simple) {
int failed = 0;
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
const re2::Test& t = tests[i];
Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
if (re == NULL) {
@@ -156,7 +157,7 @@ TEST(TestRegexpCompileToProg, Simple) {
EXPECT_EQ(failed, 0);
}
-static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
+static void DumpByteMap(absl::string_view pattern, Regexp::ParseFlags flags,
std::string* bytemap) {
Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
@@ -257,7 +258,7 @@ TEST(TestCompile, InsufficientMemory) {
re->Decref();
}
-static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
+static void Dump(absl::string_view pattern, Regexp::ParseFlags flags,
std::string* forward, std::string* reverse) {
Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc
index 842daaf..b0759f7 100644
--- a/re2/testing/dfa_test.cc
+++ b/re2/testing/dfa_test.cc
@@ -7,11 +7,12 @@
#include <thread>
#include <vector>
-#include "util/test.h"
-#include "util/flags.h"
+#include "absl/base/macros.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "util/malloc_counter.h"
-#include "util/strutil.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
@@ -20,9 +21,9 @@
static const bool UsingMallocCounter = false;
-DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)");
-DEFINE_FLAG(int, repeat, 2, "Repetition count.");
-DEFINE_FLAG(int, threads, 4, "number of threads");
+ABSL_FLAG(int, size, 8, "log2(number of DFA nodes)");
+ABSL_FLAG(int, repeat, 2, "Repetition count.");
+ABSL_FLAG(int, threads, 4, "number of threads");
namespace re2 {
@@ -50,7 +51,7 @@ static void DoBuild(Prog* prog) {
TEST(Multithreaded, BuildEntireDFA) {
// Create regexp with 2^FLAGS_size states in DFA.
std::string s = "a";
- for (int i = 0; i < GetFlag(FLAGS_size); i++)
+ for (int i = 0; i < absl::GetFlag(FLAGS_size); i++)
s += "[ab]";
s += "b";
Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
@@ -68,14 +69,14 @@ TEST(Multithreaded, BuildEntireDFA) {
}
// Build the DFA simultaneously in a bunch of threads.
- for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
+ for (int i = 0; i < absl::GetFlag(FLAGS_repeat); i++) {
Prog* prog = re->CompileToProg(0);
ASSERT_TRUE(prog != NULL);
std::vector<std::thread> threads;
- for (int j = 0; j < GetFlag(FLAGS_threads); j++)
+ for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
threads.emplace_back(DoBuild, prog);
- for (int j = 0; j < GetFlag(FLAGS_threads); j++)
+ for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
threads[j].join();
// One more compile, to make sure everything is okay.
@@ -154,7 +155,7 @@ TEST(SingleThreaded, SearchDFA) {
// Empirically, n = 18 is a good compromise between the two.
const int n = 18;
- Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
+ Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n),
Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
@@ -172,12 +173,14 @@ TEST(SingleThreaded, SearchDFA) {
for (int i = 0; i < 10; i++) {
bool matched = false;
bool failed = false;
- matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
- Prog::kFirstMatch, NULL, &failed, NULL);
+ matched =
+ prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored,
+ Prog::kFirstMatch, NULL, &failed, NULL);
ASSERT_FALSE(failed);
ASSERT_TRUE(matched);
- matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
- Prog::kFirstMatch, NULL, &failed, NULL);
+ matched =
+ prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored,
+ Prog::kFirstMatch, NULL, &failed, NULL);
ASSERT_FALSE(failed);
ASSERT_FALSE(matched);
}
@@ -201,17 +204,19 @@ TEST(SingleThreaded, SearchDFA) {
// Helper function: searches for match, which should match,
// and no_match, which should not.
-static void DoSearch(Prog* prog, const StringPiece& match,
- const StringPiece& no_match) {
+static void DoSearch(Prog* prog, absl::string_view match,
+ absl::string_view no_match) {
for (int i = 0; i < 2; i++) {
bool matched = false;
bool failed = false;
- matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
- Prog::kFirstMatch, NULL, &failed, NULL);
+ matched =
+ prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored,
+ Prog::kFirstMatch, NULL, &failed, NULL);
ASSERT_FALSE(failed);
ASSERT_TRUE(matched);
- matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
- Prog::kFirstMatch, NULL, &failed, NULL);
+ matched =
+ prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored,
+ Prog::kFirstMatch, NULL, &failed, NULL);
ASSERT_FALSE(failed);
ASSERT_FALSE(matched);
}
@@ -224,7 +229,7 @@ TEST(Multithreaded, SearchDFA) {
// Same as single-threaded test above.
const int n = 18;
- Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
+ Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n),
Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
std::string no_match = DeBruijnString(n);
@@ -243,14 +248,14 @@ TEST(Multithreaded, SearchDFA) {
// Run the search simultaneously in a bunch of threads.
// Reuse same flags for Multithreaded.BuildDFA above.
- for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
+ for (int i = 0; i < absl::GetFlag(FLAGS_repeat); i++) {
Prog* prog = re->CompileToProg(1<<n);
ASSERT_TRUE(prog != NULL);
std::vector<std::thread> threads;
- for (int j = 0; j < GetFlag(FLAGS_threads); j++)
+ for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
threads.emplace_back(DoSearch, prog, match, no_match);
- for (int j = 0; j < GetFlag(FLAGS_threads); j++)
+ for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
threads[j].join();
delete prog;
@@ -281,15 +286,16 @@ ReverseTest reverse_tests[] = {
TEST(DFA, ReverseMatch) {
int nfail = 0;
- for (size_t i = 0; i < arraysize(reverse_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(reverse_tests); i++) {
const ReverseTest& t = reverse_tests[i];
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
Prog* prog = re->CompileToReverseProg(0);
ASSERT_TRUE(prog != NULL);
bool failed = false;
- bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
- Prog::kFirstMatch, NULL, &failed, NULL);
+ bool matched =
+ prog->SearchDFA(t.text, absl::string_view(), Prog::kUnanchored,
+ Prog::kFirstMatch, NULL, &failed, NULL);
if (matched != t.match) {
LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
nfail++;
@@ -336,7 +342,7 @@ CallbackTest callback_tests[] = {
TEST(DFA, Callback) {
int nfail = 0;
- for (size_t i = 0; i < arraysize(callback_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(callback_tests); i++) {
const CallbackTest& t = callback_tests[i];
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
@@ -349,7 +355,7 @@ TEST(DFA, Callback) {
dump += " ";
dump += match ? "[[" : "[";
for (int b = 0; b < prog->bytemap_range() + 1; b++)
- dump += StringPrintf("%d,", next[b]);
+ dump += absl::StrFormat("%d,", next[b]);
dump.pop_back();
dump += match ? "]]" : "]";
});
diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc
index cad0910..5cddd23 100644
--- a/re2/testing/dump.cc
+++ b/re2/testing/dump.cc
@@ -18,11 +18,11 @@
#include <string>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "util/utf.h"
-#include "re2/stringpiece.h"
#include "re2/regexp.h"
namespace re2 {
@@ -55,8 +55,8 @@ static const char* kOpcodeNames[] = {
// Create string representation of regexp with explicit structure.
// Nothing pretty, just for testing.
static void DumpRegexpAppending(Regexp* re, std::string* s) {
- if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
- *s += StringPrintf("op%d", re->op());
+ if (re->op() < 0 || re->op() >= ABSL_ARRAYSIZE(kOpcodeNames)) {
+ *s += absl::StrFormat("op%d", re->op());
} else {
switch (re->op()) {
default:
@@ -129,7 +129,7 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) {
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpRepeat:
- s->append(StringPrintf("%d,%d ", re->min(), re->max()));
+ s->append(absl::StrFormat("%d,%d ", re->min(), re->max()));
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCharClass: {
@@ -139,9 +139,9 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) {
RuneRange rr = *it;
s->append(sep);
if (rr.lo == rr.hi)
- s->append(StringPrintf("%#x", rr.lo));
+ s->append(absl::StrFormat("%#x", rr.lo));
else
- s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
+ s->append(absl::StrFormat("%#x-%#x", rr.lo, rr.hi));
sep = " ";
}
break;
diff --git a/re2/testing/exhaustive1_test.cc b/re2/testing/exhaustive1_test.cc
index eef2dae..9337989 100644
--- a/re2/testing/exhaustive1_test.cc
+++ b/re2/testing/exhaustive1_test.cc
@@ -7,7 +7,7 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
diff --git a/re2/testing/exhaustive2_test.cc b/re2/testing/exhaustive2_test.cc
index ae89ece..14f629d 100644
--- a/re2/testing/exhaustive2_test.cc
+++ b/re2/testing/exhaustive2_test.cc
@@ -9,7 +9,7 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
diff --git a/re2/testing/exhaustive3_test.cc b/re2/testing/exhaustive3_test.cc
index 1fe46b6..de703c0 100644
--- a/re2/testing/exhaustive3_test.cc
+++ b/re2/testing/exhaustive3_test.cc
@@ -9,7 +9,7 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "util/utf.h"
#include "re2/testing/exhaustive_tester.h"
diff --git a/re2/testing/exhaustive_test.cc b/re2/testing/exhaustive_test.cc
index 514fd90..5e586f1 100644
--- a/re2/testing/exhaustive_test.cc
+++ b/re2/testing/exhaustive_test.cc
@@ -4,7 +4,7 @@
// Exhaustive testing of regular expression matching.
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
diff --git a/re2/testing/exhaustive_tester.cc b/re2/testing/exhaustive_tester.cc
index b0409c3..a57f700 100644
--- a/re2/testing/exhaustive_tester.cc
+++ b/re2/testing/exhaustive_tester.cc
@@ -13,10 +13,11 @@
#include <stdio.h>
-#include "util/test.h"
-#include "util/flags.h"
+#include "absl/base/macros.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/testing/exhaustive_tester.h"
#include "re2/testing/tester.h"
@@ -25,15 +26,15 @@
#define LOGGING 0
#endif
-DEFINE_FLAG(bool, show_regexps, false, "show regexps during testing");
+ABSL_FLAG(bool, show_regexps, false, "show regexps during testing");
-DEFINE_FLAG(int, max_bad_regexp_inputs, 1,
- "Stop testing a regular expression after finding this many "
- "strings that break it.");
+ABSL_FLAG(int, max_bad_regexp_inputs, 1,
+ "Stop testing a regular expression after finding this many "
+ "strings that break it.");
namespace re2 {
-static char* escape(const StringPiece& sp) {
+static char* escape(absl::string_view sp) {
static char buf[512];
char* p = buf;
*p++ = '\"';
@@ -55,20 +56,21 @@ static char* escape(const StringPiece& sp) {
return buf;
}
-static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
+static void PrintResult(const RE2& re, absl::string_view input,
+ RE2::Anchor anchor, absl::string_view* m, int n) {
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
- printf("-");
+ absl::PrintF("-");
return;
}
for (int i = 0; i < n; i++) {
if (i > 0)
- printf(" ");
+ absl::PrintF(" ");
if (m[i].data() == NULL)
- printf("-");
+ absl::PrintF("-");
else
- printf("%td-%td",
- BeginPtr(m[i]) - BeginPtr(input),
- EndPtr(m[i]) - BeginPtr(input));
+ absl::PrintF("%d-%d",
+ BeginPtr(m[i]) - BeginPtr(input),
+ EndPtr(m[i]) - BeginPtr(input));
}
}
@@ -79,11 +81,13 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
regexps_++;
std::string regexp = const_regexp;
if (!topwrapper_.empty()) {
- regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
+ auto fmt = absl::ParsedFormat<'s'>::New(topwrapper_);
+ CHECK(fmt != nullptr);
+ regexp = absl::StrFormat(*fmt, regexp);
}
- if (GetFlag(FLAGS_show_regexps)) {
- printf("\r%s", regexp.c_str());
+ if (absl::GetFlag(FLAGS_show_regexps)) {
+ absl::PrintF("\r%s", regexp);
fflush(stdout);
}
@@ -93,32 +97,32 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
if (randomstrings_)
LOG(ERROR) << "Cannot log with random strings.";
if (regexps_ == 1) { // first
- printf("strings\n");
+ absl::PrintF("strings\n");
strgen_.Reset();
while (strgen_.HasNext())
- printf("%s\n", escape(strgen_.Next()));
- printf("regexps\n");
+ absl::PrintF("%s\n", escape(strgen_.Next()));
+ absl::PrintF("regexps\n");
}
- printf("%s\n", escape(regexp));
+ absl::PrintF("%s\n", escape(regexp));
RE2 re(regexp);
RE2::Options longest;
longest.set_longest_match(true);
RE2 relongest(regexp, longest);
int ngroup = re.NumberOfCapturingGroups()+1;
- StringPiece* group = new StringPiece[ngroup];
+ absl::string_view* group = new absl::string_view[ngroup];
strgen_.Reset();
while (strgen_.HasNext()) {
- StringPiece input = strgen_.Next();
+ absl::string_view input = strgen_.Next();
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
- printf(";");
+ absl::PrintF(";");
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
- printf(";");
+ absl::PrintF(";");
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
- printf(";");
+ absl::PrintF(";");
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
- printf("\n");
+ absl::PrintF("\n");
}
delete[] group;
return;
@@ -137,7 +141,7 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
tests_++;
if (!tester.TestInput(strgen_.Next())) {
failures_++;
- if (++bad_inputs >= GetFlag(FLAGS_max_bad_regexp_inputs))
+ if (++bad_inputs >= absl::GetFlag(FLAGS_max_bad_regexp_inputs))
break;
}
}
@@ -164,8 +168,8 @@ void ExhaustiveTest(int maxatoms, int maxops,
topwrapper);
t.Generate();
if (!LOGGING) {
- printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
- t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
+ absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n",
+ t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size());
}
EXPECT_EQ(0, t.failures());
}
@@ -177,7 +181,7 @@ void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
const std::string& wrapper) {
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
- for (size_t i = 0; i < arraysize(tops); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tops); i++) {
ExhaustiveTest(maxatoms, maxops,
Split("", alphabet),
RegexpGenerator::EgrepOps(),
diff --git a/re2/testing/exhaustive_tester.h b/re2/testing/exhaustive_tester.h
index 3a14282..906be0c 100644
--- a/re2/testing/exhaustive_tester.h
+++ b/re2/testing/exhaustive_tester.h
@@ -9,7 +9,6 @@
#include <string>
#include <vector>
-#include "util/util.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index c788fda..a8d2dfc 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -9,7 +9,8 @@
#include <vector>
#include <utility>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/filtered_re2.h"
#include "re2/re2.h"
@@ -106,12 +107,13 @@ AtomTest atom_tests[] = {
// substring in an OR are removed; that is, only the shortest
// substring is kept.
"SubstrAtomRemovesSuperStrInOr", {
- "(abc123|abc|ghi789|abc1234).*[x-z]+",
+ "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {
"abc",
"ghi789",
+ "xyz",
"abcd",
"yyy",
"yyyzzz",
@@ -184,14 +186,14 @@ bool CheckExpectedAtoms(const char* atoms[],
TEST(FilteredRE2Test, AtomTests) {
int nfail = 0;
- for (size_t i = 0; i < arraysize(atom_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(atom_tests); i++) {
FilterTestVars v;
AtomTest* t = &atom_tests[i];
size_t nregexp, natom;
- for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
- for (natom = 0; natom < arraysize(t->atoms); natom++)
+ for (natom = 0; natom < ABSL_ARRAYSIZE(t->atoms); natom++)
if (t->atoms[natom] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
@@ -223,7 +225,7 @@ TEST(FilteredRE2Test, MatchEmptyPattern) {
// the index we use for the test is for the correct test.
EXPECT_EQ("CheckEmptyPattern", std::string(t->testname));
size_t nregexp;
- for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
@@ -240,7 +242,7 @@ TEST(FilteredRE2Test, MatchTests) {
// for this test.
EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname));
size_t nregexp;
- for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
+ for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
@@ -287,8 +289,8 @@ TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
FilterTestVars v(0); // override the minimum atom length
const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
const char* atoms[] = {"", "-r", "add=;aa", "}"};
- AddRegexpsAndCompile(regexps, arraysize(regexps), &v);
- EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms),
+ AddRegexpsAndCompile(regexps, ABSL_ARRAYSIZE(regexps), &v);
+ EXPECT_TRUE(CheckExpectedAtoms(atoms, ABSL_ARRAYSIZE(atoms),
"EmptyStringInStringSetBug", &v));
}
diff --git a/re2/testing/mimics_pcre_test.cc b/re2/testing/mimics_pcre_test.cc
index 01ab41e..829659d 100644
--- a/re2/testing/mimics_pcre_test.cc
+++ b/re2/testing/mimics_pcre_test.cc
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@@ -58,7 +59,7 @@ static PCRETest tests[] = {
};
TEST(MimicsPCRE, SimpleTests) {
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
const PCRETest& t = tests[i];
for (size_t j = 0; j < 2; j++) {
Regexp::ParseFlags flags = Regexp::LikePerl;
diff --git a/re2/testing/null_walker.cc b/re2/testing/null_walker.cc
index 2bdea02..745364b 100644
--- a/re2/testing/null_walker.cc
+++ b/re2/testing/null_walker.cc
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index e571127..0ee5561 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -6,7 +6,8 @@
#include <string>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/regexp.h"
@@ -165,6 +166,8 @@ static Test tests[] = {
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
{ "(?P<中文>a)", "cap{中文:lit{a}}" },
+ { "(?<name>a)", "cap{name:lit{a}}" },
+ { "(?<中文>a)", "cap{中文:lit{a}}" },
// Case-folded literals
{ "[Aa]", "litfold{a}" },
@@ -262,7 +265,7 @@ void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
// Test that regexps parse to expected structures.
TEST(TestParse, SimpleRegexps) {
- TestParse(tests, arraysize(tests), kTestFlags, "simple");
+ TestParse(tests, ABSL_ARRAYSIZE(tests), kTestFlags, "simple");
}
Test foldcase_tests[] = {
@@ -279,7 +282,7 @@ Test foldcase_tests[] = {
// Test that parsing with FoldCase works.
TEST(TestParse, FoldCase) {
- TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
+ TestParse(foldcase_tests, ABSL_ARRAYSIZE(foldcase_tests), Regexp::FoldCase, "foldcase");
}
Test literal_tests[] = {
@@ -288,7 +291,7 @@ Test literal_tests[] = {
// Test that parsing with Literal works.
TEST(TestParse, Literal) {
- TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
+ TestParse(literal_tests, ABSL_ARRAYSIZE(literal_tests), Regexp::Literal, "literal");
}
Test matchnl_tests[] = {
@@ -301,7 +304,7 @@ Test matchnl_tests[] = {
// Test that parsing with MatchNL works.
// (Also tested above during simple cases.)
TEST(TestParse, MatchNL) {
- TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
+ TestParse(matchnl_tests, ABSL_ARRAYSIZE(matchnl_tests), Regexp::MatchNL, "with MatchNL");
}
Test nomatchnl_tests[] = {
@@ -313,7 +316,7 @@ Test nomatchnl_tests[] = {
// Test that parsing without MatchNL works.
TEST(TestParse, NoMatchNL) {
- TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
+ TestParse(nomatchnl_tests, ABSL_ARRAYSIZE(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
}
Test prefix_tests[] = {
@@ -357,7 +360,7 @@ Test prefix_tests[] = {
// Test that prefix factoring works.
TEST(TestParse, Prefix) {
- TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
+ TestParse(prefix_tests, ABSL_ARRAYSIZE(prefix_tests), Regexp::PerlX, "prefix");
}
Test nested_tests[] = {
@@ -373,7 +376,7 @@ Test nested_tests[] = {
// Test that nested repetition works.
TEST(TestParse, Nested) {
- TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested");
+ TestParse(nested_tests, ABSL_ARRAYSIZE(nested_tests), Regexp::PerlX, "nested");
}
// Invalid regular expressions
@@ -395,6 +398,11 @@ const char* badtests[] = {
"(?P<name",
"(?P<x y>a)",
"(?P<>a)",
+ "(?<name>a",
+ "(?<name>",
+ "(?<name",
+ "(?<x y>a)",
+ "(?<>a)",
"[a-Z]",
"(?i)[a-Z]",
"a{100000}",
@@ -415,6 +423,7 @@ const char* only_perl[] = {
"\\Q\\\\\\\\\\E",
"(?:a)",
"(?P<name>a)",
+ "(?<name>a)",
};
// Valid in POSIX, bad in Perl.
@@ -428,20 +437,20 @@ const char* only_posix[] = {
// Test that parser rejects bad regexps.
TEST(TestParse, InvalidRegexps) {
- for (size_t i = 0; i < arraysize(badtests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(badtests); i++) {
ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
<< " " << badtests[i];
ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
<< " " << badtests[i];
}
- for (size_t i = 0; i < arraysize(only_posix); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(only_posix); i++) {
ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
<< " " << only_posix[i];
Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
ASSERT_TRUE(re != NULL) << " " << only_posix[i];
re->Decref();
}
- for (size_t i = 0; i < arraysize(only_perl); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(only_perl); i++) {
ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
<< " " << only_perl[i];
Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
@@ -452,7 +461,7 @@ TEST(TestParse, InvalidRegexps) {
// Test that ToString produces original regexp or equivalent one.
TEST(TestToString, EquivalentParse) {
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
RegexpStatus status;
Regexp::ParseFlags f = kTestFlags;
if (tests[i].flags != 0) {
@@ -504,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) {
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P<space bar>");
+
+ re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?<name");
+
+ re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?<space bar>");
}
} // namespace re2
diff --git a/re2/testing/possible_match_test.cc b/re2/testing/possible_match_test.cc
index 0ec90ae..fe199c6 100644
--- a/re2/testing/possible_match_test.cc
+++ b/re2/testing/possible_match_test.cc
@@ -6,9 +6,10 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "absl/strings/escaping.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
@@ -107,12 +108,12 @@ static PrefixTest tests[] = {
};
TEST(PossibleMatchRange, HandWritten) {
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
for (size_t j = 0; j < 2; j++) {
const PrefixTest& t = tests[i];
std::string min, max;
if (j == 0) {
- LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
+ LOG(INFO) << "Checking regexp=" << absl::CEscape(t.regexp);
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(0);
@@ -142,26 +143,26 @@ TEST(PossibleMatchRange, Failures) {
// are no valid UTF-8 strings beginning with byte 0xFF.
EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
EXPECT_FALSE(RE2(".*", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
EXPECT_FALSE(RE2("\\C*").
PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
// Fails because it's a malformed regexp.
EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
- << "min=" << CEscape(min) << ", max=" << CEscape(max);
+ << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
}
// Exhaustive test: generate all regexps within parameters,
@@ -201,7 +202,7 @@ class PossibleMatchTester : public RegexpGenerator {
void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
regexps_++;
- VLOG(3) << CEscape(regexp);
+ VLOG(3) << absl::CEscape(regexp);
RE2 re(regexp, RE2::Latin1);
ASSERT_EQ(re.error(), "");
@@ -213,12 +214,12 @@ void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
// complicated expressions.
if(strstr(regexp.c_str(), "\\C*"))
return;
- LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
+ LOG(QFATAL) << "PossibleMatchRange failed on: " << absl::CEscape(regexp);
}
strgen_.Reset();
while (strgen_.HasNext()) {
- const StringPiece& s = strgen_.Next();
+ absl::string_view s = strgen_.Next();
tests_++;
if (!RE2::FullMatch(s, re))
continue;
diff --git a/re2/testing/random_test.cc b/re2/testing/random_test.cc
index 44712eb..d076b39 100644
--- a/re2/testing/random_test.cc
+++ b/re2/testing/random_test.cc
@@ -8,14 +8,15 @@
#include <string>
#include <vector>
-#include "util/test.h"
-#include "util/flags.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "re2/testing/exhaustive_tester.h"
-DEFINE_FLAG(int, regexpseed, 404, "Random regexp seed.");
-DEFINE_FLAG(int, regexpcount, 100, "How many random regexps to generate.");
-DEFINE_FLAG(int, stringseed, 200, "Random string seed.");
-DEFINE_FLAG(int, stringcount, 100, "How many random strings to generate.");
+ABSL_FLAG(int, regexpseed, 404, "Random regexp seed.");
+ABSL_FLAG(int, regexpcount, 100, "How many random regexps to generate.");
+ABSL_FLAG(int, stringseed, 200, "Random string seed.");
+ABSL_FLAG(int, stringcount, 100, "How many random strings to generate.");
namespace re2 {
@@ -38,12 +39,12 @@ static void RandomTest(int maxatoms, int maxops,
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper, "");
- t.RandomStrings(GetFlag(FLAGS_stringseed),
- GetFlag(FLAGS_stringcount));
- t.GenerateRandom(GetFlag(FLAGS_regexpseed),
- GetFlag(FLAGS_regexpcount));
- printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
- t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
+ t.RandomStrings(absl::GetFlag(FLAGS_stringseed),
+ absl::GetFlag(FLAGS_stringcount));
+ t.GenerateRandom(absl::GetFlag(FLAGS_regexpseed),
+ absl::GetFlag(FLAGS_regexpcount));
+ absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n",
+ t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size());
EXPECT_EQ(0, t.failures());
}
diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc
index f62e17c..4b00be3 100644
--- a/re2/testing/re2_arg_test.cc
+++ b/re2/testing/re2_arg_test.cc
@@ -10,7 +10,8 @@
#include <stdint.h>
#include <string.h>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/re2.h"
@@ -87,7 +88,7 @@ const SuccessTable kSuccessTable[] = {
{ "18446744073709551616", 0, { false, false, false, false, false, false }},
};
-const int kNumStrings = arraysize(kSuccessTable);
+const int kNumStrings = ABSL_ARRAYSIZE(kSuccessTable);
// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ
// macro outside of a TEST block and this seems to be the only way to
@@ -157,4 +158,26 @@ TEST(RE2ArgTest, ParseFromTest) {
#endif
}
+TEST(RE2ArgTest, OptionalDoubleTest) {
+ absl::optional<double> opt;
+ RE2::Arg arg(&opt);
+ EXPECT_TRUE(arg.Parse(NULL, 0));
+ EXPECT_FALSE(opt.has_value());
+ EXPECT_FALSE(arg.Parse("", 0));
+ EXPECT_TRUE(arg.Parse("28.30", 5));
+ EXPECT_TRUE(opt.has_value());
+ EXPECT_EQ(*opt, 28.30);
+}
+
+TEST(RE2ArgTest, OptionalIntWithCRadixTest) {
+ absl::optional<int> opt;
+ RE2::Arg arg = RE2::CRadix(&opt);
+ EXPECT_TRUE(arg.Parse(NULL, 0));
+ EXPECT_FALSE(opt.has_value());
+ EXPECT_FALSE(arg.Parse("", 0));
+ EXPECT_TRUE(arg.Parse("0xb0e", 5));
+ EXPECT_TRUE(opt.has_value());
+ EXPECT_EQ(*opt, 2830);
+}
+
} // namespace re2
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index b1f7d73..151525f 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -18,9 +18,10 @@
#include <unistd.h> /* for sysconf */
#endif
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/re2.h"
#include "re2/regexp.h"
@@ -238,7 +239,7 @@ TEST(RE2, Consume) {
std::string word;
std::string s(" aaa b!@#$@#$cccc");
- StringPiece input(s);
+ absl::string_view input(s);
ASSERT_TRUE(RE2::Consume(&input, r, &word));
ASSERT_EQ(word, "aaa") << " input: " << input;
@@ -249,7 +250,7 @@ TEST(RE2, Consume) {
TEST(RE2, ConsumeN) {
const std::string s(" one two three 4");
- StringPiece input(s);
+ absl::string_view input(s);
RE2::Arg argv[2];
const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
@@ -276,7 +277,7 @@ TEST(RE2, FindAndConsume) {
std::string word;
std::string s(" aaa b!@#$@#$cccc");
- StringPiece input(s);
+ absl::string_view input(s);
ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word));
ASSERT_EQ(word, "aaa");
@@ -296,7 +297,7 @@ TEST(RE2, FindAndConsume) {
TEST(RE2, FindAndConsumeN) {
const std::string s(" one two three 4");
- StringPiece input(s);
+ absl::string_view input(s);
RE2::Arg argv[2];
const RE2::Arg* const args[2] = { &argv[0], &argv[1] };
@@ -345,17 +346,17 @@ TEST(RE2, MatchNumberPeculiarity) {
TEST(RE2, Match) {
RE2 re("((\\w+):([0-9]+))"); // extracts host and port
- StringPiece group[4];
+ absl::string_view group[4];
// No match.
- StringPiece s = "zyzzyva";
+ absl::string_view s = "zyzzyva";
ASSERT_FALSE(
- re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group)));
+ re.Match(s, 0, s.size(), RE2::UNANCHORED, group, ABSL_ARRAYSIZE(group)));
// Matches and extracts.
s = "a chrisr:9000 here";
ASSERT_TRUE(
- re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group)));
+ re.Match(s, 0, s.size(), RE2::UNANCHORED, group, ABSL_ARRAYSIZE(group)));
ASSERT_EQ(group[0], "chrisr:9000");
ASSERT_EQ(group[1], "chrisr:9000");
ASSERT_EQ(group[2], "chrisr");
@@ -528,7 +529,7 @@ TEST(EmptyCharset, Fuzz) {
"[^\\D\\d]",
"[^\\D[:digit:]]"
};
- for (size_t i = 0; i < arraysize(empties); i++)
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(empties); i++)
ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
}
@@ -542,8 +543,8 @@ TEST(EmptyCharset, BitstateAssumptions) {
"((((()))))" "([^\\S\\s]|[^\\S\\s])?",
"((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
};
- StringPiece group[6];
- for (size_t i = 0; i < arraysize(nop_empties); i++)
+ absl::string_view group[6];
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(nop_empties); i++)
ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6));
}
@@ -672,15 +673,15 @@ TEST(RE2, FullMatchIntegerArg) {
TEST(RE2, FullMatchStringArg) {
std::string s;
- // String-arg
+ // string-arg
ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &s));
ASSERT_EQ(s, std::string("ell"));
}
-TEST(RE2, FullMatchStringPieceArg) {
+TEST(RE2, FullMatchStringViewArg) {
int i;
- // StringPiece-arg
- StringPiece sp;
+ absl::string_view sp;
+ // string_view-arg
ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i));
ASSERT_EQ(sp.size(), 4);
ASSERT_TRUE(memcmp(sp.data(), "ruby", 4) == 0);
@@ -742,7 +743,7 @@ TEST(RE2, FullMatchTypedNullArg) {
// Ignore non-void* NULL arg
ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL));
- ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
+ ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (absl::string_view*)NULL));
ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL));
ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL));
ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL));
@@ -777,7 +778,8 @@ TEST(RE2, NULTerminated) {
v[pagesize - 1] = '1';
x = 0;
- ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
+ ASSERT_TRUE(
+ RE2::FullMatch(absl::string_view(v + pagesize - 1, 1), "(.*)", &x));
ASSERT_EQ(x, 1);
#endif
}
@@ -914,10 +916,10 @@ TEST(RE2, FloatingPointFullMatchTypes) {
// implementation of strtof(3). And apparently MSVC too. Sigh.
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v));
- ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f);
+ ASSERT_EQ(v, 0.1f) << absl::StrFormat("%.8g != %.8g", v, 0.1f);
ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v));
ASSERT_EQ(v, 6700000000081920.1f)
- << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f);
+ << absl::StrFormat("%.8g != %.8g", v, 6700000000081920.1f);
#endif
}
{
@@ -929,10 +931,10 @@ TEST(RE2, FloatingPointFullMatchTypes) {
ASSERT_EQ(v, double(1e23));
ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v));
- ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1);
+ ASSERT_EQ(v, 0.1) << absl::StrFormat("%.17g != %.17g", v, 0.1);
ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v));
ASSERT_EQ(v, 1.0000000596046448)
- << StringPrintf("%.17g != %.17g", v, 1.0000000596046448);
+ << absl::StrFormat("%.17g != %.17g", v, 1.0000000596046448);
}
}
@@ -1242,21 +1244,21 @@ TEST(RE2, DeepRecursion) {
// not implementing case-folding.
TEST(CaseInsensitive, MatchAndConsume) {
std::string text = "A fish named *Wanda*";
- StringPiece sp(text);
- StringPiece result;
+ absl::string_view sp(text);
+ absl::string_view result;
EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result));
EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
}
-// RE2 should permit implicit conversions from string, StringPiece, const char*,
+// RE2 should permit implicit conversions from string, string_view, const char*,
// and C string literals.
TEST(RE2, ImplicitConversions) {
std::string re_string(".");
- StringPiece re_stringpiece(".");
- const char* re_cstring = ".";
+ absl::string_view re_string_view(".");
+ const char* re_c_string = ".";
EXPECT_TRUE(RE2::PartialMatch("e", re_string));
- EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
- EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
+ EXPECT_TRUE(RE2::PartialMatch("e", re_string_view));
+ EXPECT_TRUE(RE2::PartialMatch("e", re_c_string));
EXPECT_TRUE(RE2::PartialMatch("e", "."));
}
@@ -1309,7 +1311,7 @@ static struct ErrorTest {
{ "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" },
};
TEST(RE2, ErrorCodeAndArg) {
- for (size_t i = 0; i < arraysize(error_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(error_tests); i++) {
RE2 re(error_tests[i].regexp, RE2::Quiet);
EXPECT_FALSE(re.ok());
EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error();
@@ -1332,13 +1334,13 @@ static struct NeverTest {
TEST(RE2, NeverNewline) {
RE2::Options opt;
opt.set_never_nl(true);
- for (size_t i = 0; i < arraysize(never_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(never_tests); i++) {
const NeverTest& t = never_tests[i];
RE2 re(t.regexp, opt);
if (t.match == NULL) {
EXPECT_FALSE(re.PartialMatch(t.text, re));
} else {
- StringPiece m;
+ absl::string_view m;
EXPECT_TRUE(re.PartialMatch(t.text, re, &m));
EXPECT_EQ(m, t.match);
}
@@ -1371,7 +1373,7 @@ TEST(RE2, BitstateCaptureBug) {
RE2::Options opt;
opt.set_max_mem(20000);
RE2 re("(_________$)", opt);
- StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
+ absl::string_view s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
}
@@ -1450,10 +1452,10 @@ TEST(RE2, NullVsEmptyString) {
RE2 re(".*");
EXPECT_TRUE(re.ok());
- StringPiece null;
+ absl::string_view null;
EXPECT_TRUE(RE2::FullMatch(null, re));
- StringPiece empty("");
+ absl::string_view empty("");
EXPECT_TRUE(RE2::FullMatch(empty, re));
}
@@ -1465,25 +1467,25 @@ TEST(RE2, NullVsEmptyStringSubmatches) {
EXPECT_TRUE(re.ok());
// matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
- StringPiece matches[4];
+ absl::string_view matches[4];
- for (size_t i = 0; i < arraysize(matches); i++)
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
matches[i] = "bar";
- StringPiece null;
+ absl::string_view null;
EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
- matches, arraysize(matches)));
- for (size_t i = 0; i < arraysize(matches); i++) {
+ matches, ABSL_ARRAYSIZE(matches)));
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) {
EXPECT_TRUE(matches[i].data() == NULL); // always null
EXPECT_TRUE(matches[i].empty());
}
- for (size_t i = 0; i < arraysize(matches); i++)
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++)
matches[i] = "bar";
- StringPiece empty("");
+ absl::string_view empty("");
EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
- matches, arraysize(matches)));
+ matches, ABSL_ARRAYSIZE(matches)));
EXPECT_TRUE(matches[0].data() != NULL); // empty, not null
EXPECT_TRUE(matches[0].empty());
EXPECT_TRUE(matches[1].data() != NULL); // empty, not null
@@ -1497,7 +1499,7 @@ TEST(RE2, NullVsEmptyStringSubmatches) {
// Issue 1816809
TEST(RE2, Bug1816809) {
RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
- StringPiece piece("llx-3;llx4");
+ absl::string_view piece("llx-3;llx4");
std::string x;
EXPECT_TRUE(RE2::Consume(&piece, re, &x));
}
@@ -1615,7 +1617,7 @@ TEST(RE2, Bug26356109) {
ASSERT_TRUE(re.ok());
std::string s = "abc";
- StringPiece m;
+ absl::string_view m;
ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'";
@@ -1645,7 +1647,7 @@ TEST(RE2, Issue310) {
// (?:|a)* matched more text than (?:|a)+ did.
std::string s = "aaa";
- StringPiece m;
+ absl::string_view m;
RE2 star("(?:|a)*");
ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index 3eeb098..5352b31 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -9,19 +9,18 @@
#include <stdlib.h>
#include <string>
#include <thread>
-#include <unordered_map>
#include <utility>
-#include "util/benchmark.h"
-#include "util/test.h"
-#include "util/flags.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
+#include "absl/synchronization/mutex.h"
+#include "benchmark/benchmark.h"
#include "util/logging.h"
#include "util/malloc_counter.h"
-#include "util/strutil.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
-#include "util/mutex.h"
#include "util/pcre.h"
namespace re2 {
@@ -41,7 +40,7 @@ void Test() {
CHECK(prog->IsOnePass());
CHECK(prog->CanBitState());
const char* text = "650-253-0001";
- StringPiece sp[4];
+ absl::string_view sp[4];
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
CHECK_EQ(sp[0], "650-253-0001");
CHECK_EQ(sp[1], "650");
@@ -61,22 +60,22 @@ void MemoryUsage() {
CHECK(re);
// Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly,
// because LOG(INFO) might do a big allocation before they get evaluated.
- fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "Regexp: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
mc.Reset();
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->IsOnePass());
CHECK(prog->CanBitState());
- fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "Prog: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
mc.Reset();
- StringPiece sp[4];
+ absl::string_view sp[4];
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
- fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "Search: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
delete prog;
re->Decref();
}
@@ -85,22 +84,22 @@ void MemoryUsage() {
MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
PCRE re(regexp, PCRE::UTF8);
- fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "RE: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
PCRE::FullMatch(text, re);
- fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "RE: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
}
{
MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
PCRE* re = new PCRE(regexp, PCRE::UTF8);
- fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "PCRE*: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
PCRE::FullMatch(text, *re);
- fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "PCRE*: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
delete re;
}
@@ -108,15 +107,15 @@ void MemoryUsage() {
MallocCounter mc(MallocCounter::THIS_THREAD_ONLY);
RE2 re(regexp);
- fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "RE2: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
RE2::FullMatch(text, re);
- fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n",
- mc.HeapGrowth(), mc.PeakHeapGrowth());
+ absl::FPrintF(stderr, "RE2: %7d bytes (peak=%d)\n",
+ mc.HeapGrowth(), mc.PeakHeapGrowth());
}
- fprintf(stderr, "sizeof: PCRE=%zd RE2=%zd Prog=%zd Inst=%zd\n",
- sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst));
+ absl::FPrintF(stderr, "sizeof: PCRE=%d RE2=%d Prog=%d Inst=%d\n",
+ sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst));
}
int NumCPUs() {
@@ -128,7 +127,7 @@ int NumCPUs() {
// and not interesting.
typedef void SearchImpl(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match);
SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE,
@@ -136,7 +135,7 @@ SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE,
SearchCachedBitState, SearchCachedPCRE, SearchCachedRE2;
typedef void ParseImpl(benchmark::State& state, const char* regexp,
- const StringPiece& text);
+ absl::string_view text);
ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, Parse1PCRE, Parse1RE2,
Parse1Backtrack, Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState,
@@ -318,8 +317,8 @@ void FindAndConsume(benchmark::State& state) {
s.append("Hello World");
RE2 re("((Hello World))");
for (auto _ : state) {
- StringPiece t = s;
- StringPiece u;
+ absl::string_view t = s;
+ absl::string_view u;
CHECK(RE2::FindAndConsume(&t, re, &u));
CHECK_EQ(u, "Hello World");
}
@@ -442,7 +441,7 @@ BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCP
// Benchmark: use regexp to find phone number.
void SearchDigits(benchmark::State& state, SearchImpl* search) {
- StringPiece s("650-253-0001");
+ absl::string_view s("650-253-0001");
search(state, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true);
state.SetItemsProcessed(state.iterations());
}
@@ -467,7 +466,7 @@ BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs());
void Parse3Digits(benchmark::State& state,
void (*parse3)(benchmark::State&, const char*,
- const StringPiece&)) {
+ absl::string_view)) {
parse3(state, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001");
state.SetItemsProcessed(state.iterations());
}
@@ -506,7 +505,7 @@ BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs());
void Parse3DigitDs(benchmark::State& state,
void (*parse3)(benchmark::State&, const char*,
- const StringPiece&)) {
+ absl::string_view)) {
parse3(state, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001");
state.SetItemsProcessed(state.iterations());
}
@@ -547,7 +546,7 @@ BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs());
void Parse1Split(benchmark::State& state,
void (*parse1)(benchmark::State&, const char*,
- const StringPiece&)) {
+ absl::string_view)) {
parse1(state, "[0-9]+-(.*)", "650-253-0001");
state.SetItemsProcessed(state.iterations());
}
@@ -584,7 +583,7 @@ BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs());
void Parse1SplitHard(benchmark::State& state,
void (*run)(benchmark::State&, const char*,
- const StringPiece&)) {
+ absl::string_view)) {
run(state, "[0-9]+.(.*)", "650-253-0001");
state.SetItemsProcessed(state.iterations());
}
@@ -619,7 +618,7 @@ BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs());
void Parse1SplitBig1(benchmark::State& state,
void (*run)(benchmark::State&, const char*,
- const StringPiece&)) {
+ absl::string_view)) {
std::string s;
s.append(100000, 'x');
s.append("650-253-0001");
@@ -639,7 +638,7 @@ BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs());
void Parse1SplitBig2(benchmark::State& state,
void (*run)(benchmark::State&, const char*,
- const StringPiece&)) {
+ absl::string_view)) {
std::string s;
s.append("650-253-");
s.append(100000, '0');
@@ -756,20 +755,20 @@ void RunBuild(benchmark::State& state, const std::string& regexp,
} // namespace re2
-DEFINE_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)",
- "regexp for compile benchmarks");
+ABSL_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)",
+ "regexp for compile benchmarks");
namespace re2 {
-void BM_PCRE_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompilePCRE); }
-void BM_Regexp_Parse(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), ParseRegexp); }
-void BM_Regexp_Simplify(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyRegexp); }
-void BM_CompileToProg(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileToProg); }
-void BM_CompileByteMap(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileByteMap); }
-void BM_Regexp_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRegexp); }
-void BM_Regexp_SimplifyCompile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp); }
-void BM_Regexp_NullWalk(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), NullWalkRegexp); }
-void BM_RE2_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRE2); }
+void BM_PCRE_Compile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompilePCRE); }
+void BM_Regexp_Parse(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), ParseRegexp); }
+void BM_Regexp_Simplify(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), SimplifyRegexp); }
+void BM_CompileToProg(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileToProg); }
+void BM_CompileByteMap(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileByteMap); }
+void BM_Regexp_Compile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileRegexp); }
+void BM_Regexp_SimplifyCompile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp); }
+void BM_Regexp_NullWalk(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), NullWalkRegexp); }
+void BM_RE2_Compile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileRE2); }
#ifdef USEPCRE
BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs());
@@ -859,7 +858,7 @@ DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA)
// Anchored says whether to run an anchored search.
void SearchDFA(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
@@ -867,8 +866,8 @@ void SearchDFA(benchmark::State& state, const char* regexp,
Prog* prog = re->CompileToProg(0);
CHECK(prog);
bool failed = false;
- CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch,
- NULL, &failed, NULL),
+ CHECK_EQ(prog->SearchDFA(text, absl::string_view(), anchor,
+ Prog::kFirstMatch, NULL, &failed, NULL),
expect_match);
CHECK(!failed);
delete prog;
@@ -877,15 +876,15 @@ void SearchDFA(benchmark::State& state, const char* regexp,
}
void SearchNFA(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
- CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch,
- NULL, 0),
+ CHECK_EQ(prog->SearchNFA(text, absl::string_view(), anchor,
+ Prog::kFirstMatch, NULL, 0),
expect_match);
delete prog;
re->Decref();
@@ -893,7 +892,7 @@ void SearchNFA(benchmark::State& state, const char* regexp,
}
void SearchOnePass(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
@@ -909,7 +908,7 @@ void SearchOnePass(benchmark::State& state, const char* regexp,
}
void SearchBitState(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
@@ -925,7 +924,7 @@ void SearchBitState(benchmark::State& state, const char* regexp,
}
void SearchPCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
for (auto _ : state) {
PCRE re(regexp, PCRE::UTF8);
@@ -938,7 +937,7 @@ void SearchPCRE(benchmark::State& state, const char* regexp,
}
void SearchRE2(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
for (auto _ : state) {
RE2 re(regexp);
@@ -955,9 +954,9 @@ void SearchRE2(benchmark::State& state, const char* regexp,
// search time without the per-regexp overhead.
Prog* GetCachedProg(const char* regexp) {
- static auto& mutex = *new Mutex;
- MutexLock lock(&mutex);
- static auto& cache = *new std::unordered_map<std::string, Prog*>;
+ static auto& mutex = *new absl::Mutex;
+ absl::MutexLock lock(&mutex);
+ static auto& cache = *new absl::flat_hash_map<std::string, Prog*>;
Prog* prog = cache[regexp];
if (prog == NULL) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
@@ -973,9 +972,9 @@ Prog* GetCachedProg(const char* regexp) {
}
PCRE* GetCachedPCRE(const char* regexp) {
- static auto& mutex = *new Mutex;
- MutexLock lock(&mutex);
- static auto& cache = *new std::unordered_map<std::string, PCRE*>;
+ static auto& mutex = *new absl::Mutex;
+ absl::MutexLock lock(&mutex);
+ static auto& cache = *new absl::flat_hash_map<std::string, PCRE*>;
PCRE* re = cache[regexp];
if (re == NULL) {
re = new PCRE(regexp, PCRE::UTF8);
@@ -986,9 +985,9 @@ PCRE* GetCachedPCRE(const char* regexp) {
}
RE2* GetCachedRE2(const char* regexp) {
- static auto& mutex = *new Mutex;
- MutexLock lock(&mutex);
- static auto& cache = *new std::unordered_map<std::string, RE2*>;
+ static auto& mutex = *new absl::Mutex;
+ absl::MutexLock lock(&mutex);
+ static auto& cache = *new absl::flat_hash_map<std::string, RE2*>;
RE2* re = cache[regexp];
if (re == NULL) {
re = new RE2(regexp);
@@ -999,31 +998,31 @@ RE2* GetCachedRE2(const char* regexp) {
}
void SearchCachedDFA(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
Prog* prog = GetCachedProg(regexp);
for (auto _ : state) {
bool failed = false;
- CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch,
- NULL, &failed, NULL),
+ CHECK_EQ(prog->SearchDFA(text, absl::string_view(), anchor,
+ Prog::kFirstMatch, NULL, &failed, NULL),
expect_match);
CHECK(!failed);
}
}
void SearchCachedNFA(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
Prog* prog = GetCachedProg(regexp);
for (auto _ : state) {
- CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch,
- NULL, 0),
+ CHECK_EQ(prog->SearchNFA(text, absl::string_view(), anchor,
+ Prog::kFirstMatch, NULL, 0),
expect_match);
}
}
void SearchCachedOnePass(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
Prog* prog = GetCachedProg(regexp);
CHECK(prog->IsOnePass());
@@ -1034,7 +1033,7 @@ void SearchCachedOnePass(benchmark::State& state, const char* regexp,
}
void SearchCachedBitState(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
Prog* prog = GetCachedProg(regexp);
CHECK(prog->CanBitState());
@@ -1045,7 +1044,7 @@ void SearchCachedBitState(benchmark::State& state, const char* regexp,
}
void SearchCachedPCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
PCRE& re = *GetCachedPCRE(regexp);
for (auto _ : state) {
@@ -1057,7 +1056,7 @@ void SearchCachedPCRE(benchmark::State& state, const char* regexp,
}
void SearchCachedRE2(benchmark::State& state, const char* regexp,
- const StringPiece& text, Prog::Anchor anchor,
+ absl::string_view text, Prog::Anchor anchor,
bool expect_match) {
RE2& re = *GetCachedRE2(regexp);
for (auto _ : state) {
@@ -1072,14 +1071,14 @@ void SearchCachedRE2(benchmark::State& state, const char* regexp,
// extracting three submatches. Expects match always.
void Parse3NFA(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
- StringPiece sp[4]; // 4 because sp[0] is whole match.
- CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored,
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
+ CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
Prog::kFullMatch, sp, 4));
delete prog;
re->Decref();
@@ -1087,14 +1086,14 @@ void Parse3NFA(benchmark::State& state, const char* regexp,
}
void Parse3OnePass(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->IsOnePass());
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
delete prog;
re->Decref();
@@ -1102,14 +1101,14 @@ void Parse3OnePass(benchmark::State& state, const char* regexp,
}
void Parse3BitState(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->CanBitState());
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
delete prog;
re->Decref();
@@ -1117,13 +1116,13 @@ void Parse3BitState(benchmark::State& state, const char* regexp,
}
void Parse3Backtrack(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
delete prog;
re->Decref();
@@ -1131,77 +1130,77 @@ void Parse3Backtrack(benchmark::State& state, const char* regexp,
}
void Parse3PCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
PCRE re(regexp, PCRE::UTF8);
CHECK_EQ(re.error(), "");
- StringPiece sp1, sp2, sp3;
+ absl::string_view sp1, sp2, sp3;
CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3));
}
}
void Parse3RE2(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
RE2 re(regexp);
CHECK_EQ(re.error(), "");
- StringPiece sp1, sp2, sp3;
+ absl::string_view sp1, sp2, sp3;
CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3));
}
}
void Parse3CachedNFA(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
for (auto _ : state) {
- CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored,
+ CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
Prog::kFullMatch, sp, 4));
}
}
void Parse3CachedOnePass(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
CHECK(prog->IsOnePass());
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
for (auto _ : state) {
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
}
}
void Parse3CachedBitState(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
CHECK(prog->CanBitState());
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
for (auto _ : state) {
CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
}
}
void Parse3CachedBacktrack(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
- StringPiece sp[4]; // 4 because sp[0] is whole match.
+ absl::string_view sp[4]; // 4 because sp[0] is whole match.
for (auto _ : state) {
CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4));
}
}
void Parse3CachedPCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
PCRE& re = *GetCachedPCRE(regexp);
- StringPiece sp1, sp2, sp3;
+ absl::string_view sp1, sp2, sp3;
for (auto _ : state) {
CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3));
}
}
void Parse3CachedRE2(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
RE2& re = *GetCachedRE2(regexp);
- StringPiece sp1, sp2, sp3;
+ absl::string_view sp1, sp2, sp3;
for (auto _ : state) {
CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3));
}
@@ -1211,14 +1210,14 @@ void Parse3CachedRE2(benchmark::State& state, const char* regexp,
// extracting three submatches. Expects match always.
void Parse1NFA(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
- StringPiece sp[2]; // 2 because sp[0] is whole match.
- CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored,
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
+ CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
Prog::kFullMatch, sp, 2));
delete prog;
re->Decref();
@@ -1226,14 +1225,14 @@ void Parse1NFA(benchmark::State& state, const char* regexp,
}
void Parse1OnePass(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->IsOnePass());
- StringPiece sp[2]; // 2 because sp[0] is whole match.
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
delete prog;
re->Decref();
@@ -1241,14 +1240,14 @@ void Parse1OnePass(benchmark::State& state, const char* regexp,
}
void Parse1BitState(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->CanBitState());
- StringPiece sp[2]; // 2 because sp[0] is whole match.
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
delete prog;
re->Decref();
@@ -1256,114 +1255,114 @@ void Parse1BitState(benchmark::State& state, const char* regexp,
}
void Parse1PCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
PCRE re(regexp, PCRE::UTF8);
CHECK_EQ(re.error(), "");
- StringPiece sp1;
+ absl::string_view sp1;
CHECK(PCRE::FullMatch(text, re, &sp1));
}
}
void Parse1RE2(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
for (auto _ : state) {
RE2 re(regexp);
CHECK_EQ(re.error(), "");
- StringPiece sp1;
+ absl::string_view sp1;
CHECK(RE2::FullMatch(text, re, &sp1));
}
}
void Parse1CachedNFA(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
- StringPiece sp[2]; // 2 because sp[0] is whole match.
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
for (auto _ : state) {
- CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored,
+ CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored,
Prog::kFullMatch, sp, 2));
}
}
void Parse1CachedOnePass(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
CHECK(prog->IsOnePass());
- StringPiece sp[2]; // 2 because sp[0] is whole match.
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
for (auto _ : state) {
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
}
}
void Parse1CachedBitState(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
CHECK(prog->CanBitState());
- StringPiece sp[2]; // 2 because sp[0] is whole match.
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
for (auto _ : state) {
CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
}
}
void Parse1CachedBacktrack(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
Prog* prog = GetCachedProg(regexp);
- StringPiece sp[2]; // 2 because sp[0] is whole match.
+ absl::string_view sp[2]; // 2 because sp[0] is whole match.
for (auto _ : state) {
CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2));
}
}
void Parse1CachedPCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
PCRE& re = *GetCachedPCRE(regexp);
- StringPiece sp1;
+ absl::string_view sp1;
for (auto _ : state) {
CHECK(PCRE::FullMatch(text, re, &sp1));
}
}
void Parse1CachedRE2(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
RE2& re = *GetCachedRE2(regexp);
- StringPiece sp1;
+ absl::string_view sp1;
for (auto _ : state) {
CHECK(RE2::FullMatch(text, re, &sp1));
}
}
void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
PCRE& re = *GetCachedPCRE(regexp);
for (auto _ : state) {
- StringPiece sp1, sp2;
+ absl::string_view sp1, sp2;
CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2));
}
}
void SearchParse2CachedRE2(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
RE2& re = *GetCachedRE2(regexp);
for (auto _ : state) {
- StringPiece sp1, sp2;
+ absl::string_view sp1, sp2;
CHECK(RE2::PartialMatch(text, re, &sp1, &sp2));
}
}
void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
PCRE& re = *GetCachedPCRE(regexp);
for (auto _ : state) {
- StringPiece sp1;
+ absl::string_view sp1;
CHECK(PCRE::PartialMatch(text, re, &sp1));
}
}
void SearchParse1CachedRE2(benchmark::State& state, const char* regexp,
- const StringPiece& text) {
+ absl::string_view text) {
RE2& re = *GetCachedRE2(regexp);
for (auto _ : state) {
- StringPiece sp1;
+ absl::string_view sp1;
CHECK(RE2::PartialMatch(text, re, &sp1));
}
}
@@ -1409,7 +1408,7 @@ static std::string http_text =
"alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1";
void HTTPPartialMatchPCRE(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
for (auto _ : state) {
PCRE::PartialMatch(http_text, re, &a);
@@ -1417,7 +1416,7 @@ void HTTPPartialMatchPCRE(benchmark::State& state) {
}
void HTTPPartialMatchRE2(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
for (auto _ : state) {
RE2::PartialMatch(http_text, re, &a);
@@ -1433,7 +1432,7 @@ static std::string smallhttp_text =
"GET /abc HTTP/1.1";
void SmallHTTPPartialMatchPCRE(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
for (auto _ : state) {
PCRE::PartialMatch(smallhttp_text, re, &a);
@@ -1441,7 +1440,7 @@ void SmallHTTPPartialMatchPCRE(benchmark::State& state) {
}
void SmallHTTPPartialMatchRE2(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP");
for (auto _ : state) {
RE2::PartialMatch(smallhttp_text, re, &a);
@@ -1454,7 +1453,7 @@ BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs());
BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs());
void DotMatchPCRE(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
PCRE re("(?-s)^(.+)");
for (auto _ : state) {
PCRE::PartialMatch(http_text, re, &a);
@@ -1462,7 +1461,7 @@ void DotMatchPCRE(benchmark::State& state) {
}
void DotMatchRE2(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
RE2 re("(?-s)^(.+)");
for (auto _ : state) {
RE2::PartialMatch(http_text, re, &a);
@@ -1475,7 +1474,7 @@ BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs());
BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs());
void ASCIIMatchPCRE(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
PCRE re("(?-s)^([ -~]+)");
for (auto _ : state) {
PCRE::PartialMatch(http_text, re, &a);
@@ -1483,7 +1482,7 @@ void ASCIIMatchPCRE(benchmark::State& state) {
}
void ASCIIMatchRE2(benchmark::State& state) {
- StringPiece a;
+ absl::string_view a;
RE2 re("(?-s)^([ -~]+)");
for (auto _ : state) {
RE2::PartialMatch(http_text, re, &a);
diff --git a/re2/testing/regexp_generator.cc b/re2/testing/regexp_generator.cc
index 3eeda25..b1761ed 100644
--- a/re2/testing/regexp_generator.cc
+++ b/re2/testing/regexp_generator.cc
@@ -29,9 +29,11 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_format.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "util/utf.h"
#include "re2/testing/regexp_generator.h"
@@ -47,7 +49,7 @@ const std::vector<std::string>& RegexpGenerator::EgrepOps() {
"%s?",
"%s\\C*",
};
- static std::vector<std::string> v(ops, ops + arraysize(ops));
+ static std::vector<std::string> v(ops, ops + ABSL_ARRAYSIZE(ops));
return v;
}
@@ -199,19 +201,21 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
regexps.push(post[i]);
break;
case 1: {
+ auto fmt = absl::ParsedFormat<'s'>::New(post[i]);
+ CHECK(fmt != nullptr);
std::string a = regexps.top();
regexps.pop();
- regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
+ regexps.push("(?:" + absl::StrFormat(*fmt, a) + ")");
break;
}
case 2: {
+ auto fmt = absl::ParsedFormat<'s', 's'>::New(post[i]);
+ CHECK(fmt != nullptr);
std::string b = regexps.top();
regexps.pop();
std::string a = regexps.top();
regexps.pop();
- regexps.push("(?:" +
- StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
- ")");
+ regexps.push("(?:" + absl::StrFormat(*fmt, a, b) + ")");
break;
}
}
@@ -219,13 +223,13 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
if (regexps.size() != 1) {
// Internal error - should never happen.
- printf("Bad regexp program:\n");
+ absl::PrintF("Bad regexp program:\n");
for (size_t i = 0; i < post.size(); i++) {
- printf(" %s\n", CEscape(post[i]).c_str());
+ absl::PrintF(" %s\n", absl::CEscape(post[i]));
}
- printf("Stack after running program:\n");
+ absl::PrintF("Stack after running program:\n");
while (!regexps.empty()) {
- printf(" %s\n", CEscape(regexps.top()).c_str());
+ absl::PrintF(" %s\n", absl::CEscape(regexps.top()));
regexps.pop();
}
LOG(FATAL) << "Bad regexp program.";
@@ -238,7 +242,7 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
}
// Split s into a vector of strings, one for each UTF-8 character.
-std::vector<std::string> Explode(const StringPiece& s) {
+std::vector<std::string> Explode(absl::string_view s) {
std::vector<std::string> v;
for (const char *q = s.data(); q < s.data() + s.size(); ) {
@@ -253,7 +257,7 @@ std::vector<std::string> Explode(const StringPiece& s) {
// Split string everywhere a substring is found, returning
// vector of pieces.
-std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
+std::vector<std::string> Split(absl::string_view sep, absl::string_view s) {
std::vector<std::string> v;
if (sep.empty())
@@ -261,7 +265,7 @@ std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
const char *p = s.data();
for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
- if (StringPiece(q, sep.size()) == sep) {
+ if (absl::string_view(q, sep.size()) == sep) {
v.push_back(std::string(p, q - p));
p = q + sep.size();
q = p - 1; // -1 for ++ in loop
diff --git a/re2/testing/regexp_generator.h b/re2/testing/regexp_generator.h
index 7d72aff..e1be1a9 100644
--- a/re2/testing/regexp_generator.h
+++ b/re2/testing/regexp_generator.h
@@ -13,8 +13,7 @@
#include <string>
#include <vector>
-#include "util/util.h"
-#include "re2/stringpiece.h"
+#include "absl/strings/string_view.h"
namespace re2 {
@@ -66,11 +65,11 @@ class RegexpGenerator {
// Helpers for preparing arguments to RegexpGenerator constructor.
// Returns one string for each character in s.
-std::vector<std::string> Explode(const StringPiece& s);
+std::vector<std::string> Explode(absl::string_view s);
// Splits string everywhere sep is found, returning
// vector of pieces.
-std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s);
+std::vector<std::string> Split(absl::string_view sep, absl::string_view s);
} // namespace re2
diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc
index f7e7e92..ef8f59d 100644
--- a/re2/testing/regexp_test.cc
+++ b/re2/testing/regexp_test.cc
@@ -9,7 +9,7 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/regexp.h"
diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc
index 60a11f8..231fd34 100644
--- a/re2/testing/required_prefix_test.cc
+++ b/re2/testing/required_prefix_test.cc
@@ -4,7 +4,8 @@
#include <string>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@@ -44,7 +45,7 @@ static PrefixTest tests[] = {
};
TEST(RequiredPrefix, SimpleTests) {
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
const PrefixTest& t = tests[i];
for (size_t j = 0; j < 2; j++) {
Regexp::ParseFlags flags = Regexp::LikePerl;
@@ -106,7 +107,7 @@ static PrefixTest for_accel_tests[] = {
};
TEST(RequiredPrefixForAccel, SimpleTests) {
- for (size_t i = 0; i < arraysize(for_accel_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(for_accel_tests); i++) {
const PrefixTest& t = for_accel_tests[i];
for (size_t j = 0; j < 2; j++) {
Regexp::ParseFlags flags = Regexp::LikePerl;
@@ -171,7 +172,7 @@ static const char* prefix_accel_tests[] = {
};
TEST(PrefixAccel, SimpleTests) {
- for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(prefix_accel_tests); i++) {
const char* pattern = prefix_accel_tests[i];
Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc
index 5d86dbf..166652a 100644
--- a/re2/testing/search_test.cc
+++ b/re2/testing/search_test.cc
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/testing/tester.h"
@@ -314,7 +315,7 @@ RegexpTest simple_tests[] = {
TEST(Regexp, SearchTests) {
int failures = 0;
- for (size_t i = 0; i < arraysize(simple_tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(simple_tests); i++) {
const RegexpTest& t = simple_tests[i];
if (!TestRegexpOnText(t.regexp, t.text))
failures++;
diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc
index 5a760c4..fdbc0b2 100644
--- a/re2/testing/set_test.cc
+++ b/re2/testing/set_test.cc
@@ -7,7 +7,7 @@
#include <vector>
#include <utility>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/re2.h"
#include "re2/set.h"
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index 9dcd4ac..5b683f5 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -7,7 +7,8 @@
#include <string.h>
#include <string>
-#include "util/test.h"
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/regexp.h"
@@ -139,6 +140,22 @@ static Test tests[] = {
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },
+ // For an empty-width op OR a concatenation or alternation of empty-width
+ // ops, test that the repetition count is capped at 1.
+ { "(?:^){0,}", "^*" }, // x{0,} -> x*
+ { "(?:$){28,}", "$+" }, // x{N,} -> x{1,} -> x+
+ { "(?-m:^){0,30}", "(?-m:^)?" }, // x{0,N} -> x{0,1} -> x?
+ { "(?-m:$){28,30}", "(?-m:$)" }, // x{N,M} -> x{1,1} -> x
+ { "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" },
+ { "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" },
+ // NonGreedy should also be handled.
+ { "(?:^){0,}?", "^*?" },
+ { "(?:$){28,}?", "$+?" },
+ { "(?-m:^){0,30}?", "(?-m:^)??" },
+ { "(?-m:$){28,30}?", "(?-m:$)" },
+ { "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" },
+ { "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" },
+
// Test that coalescing occurs and that the resulting repeats are simplified.
// Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
{ "a*a*", "a*" },
@@ -245,7 +262,7 @@ static Test tests[] = {
};
TEST(TestSimplify, SimpleRegexps) {
- for (size_t i = 0; i < arraysize(tests); i++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
RegexpStatus status;
VLOG(1) << "Testing " << tests[i].regexp;
Regexp* re = Regexp::Parse(tests[i].regexp,
diff --git a/re2/testing/string_generator.cc b/re2/testing/string_generator.cc
index 44837fe..1891b14 100644
--- a/re2/testing/string_generator.cc
+++ b/re2/testing/string_generator.cc
@@ -11,7 +11,7 @@
#include <string>
#include <vector>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "util/logging.h"
#include "re2/testing/string_generator.h"
@@ -81,11 +81,11 @@ bool StringGenerator::RandomDigits() {
// currently described by digits_. Calls IncrementDigits
// after computing the string, so that it knows the answer
// for subsequent HasNext() calls.
-const StringPiece& StringGenerator::Next() {
+absl::string_view StringGenerator::Next() {
CHECK(hasnext_);
if (generate_null_) {
generate_null_ = false;
- sp_ = StringPiece();
+ sp_ = absl::string_view();
return sp_;
}
s_.clear();
diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h
index 73fbb51..0d6f5fc 100644
--- a/re2/testing/string_generator.h
+++ b/re2/testing/string_generator.h
@@ -14,8 +14,7 @@
#include <string>
#include <vector>
-#include "util/util.h"
-#include "re2/stringpiece.h"
+#include "absl/strings/string_view.h"
namespace re2 {
@@ -24,7 +23,7 @@ class StringGenerator {
StringGenerator(int maxlen, const std::vector<std::string>& alphabet);
~StringGenerator() {}
- const StringPiece& Next();
+ absl::string_view Next();
bool HasNext() { return hasnext_; }
// Resets generator to start sequence over.
@@ -45,11 +44,11 @@ class StringGenerator {
std::vector<std::string> alphabet_; // Alphabet, one string per letter.
// Iteration state.
- StringPiece sp_; // Last StringPiece returned by Next().
- std::string s_; // String data in last StringPiece returned by Next().
+ absl::string_view sp_; // Last string_view returned by Next().
+ std::string s_; // String data in last string_view returned by Next().
bool hasnext_; // Whether Next() can be called again.
std::vector<int> digits_; // Alphabet indices for next string.
- bool generate_null_; // Whether to generate a NULL StringPiece next.
+ bool generate_null_; // Whether to generate a NULL string_view next.
bool random_; // Whether generated strings are random.
int nrandom_; // Number of random strings left to generate.
std::minstd_rand0 rng_; // Random number generator.
diff --git a/re2/testing/string_generator_test.cc b/re2/testing/string_generator_test.cc
index d0f84f4..b1273d9 100644
--- a/re2/testing/string_generator_test.cc
+++ b/re2/testing/string_generator_test.cc
@@ -7,7 +7,7 @@
#include <stdint.h>
#include <string>
-#include "util/test.h"
+#include "gtest/gtest.h"
#include "util/utf.h"
#include "re2/testing/string_generator.h"
#include "re2/testing/regexp_generator.h"
@@ -41,7 +41,7 @@ static void RunTest(int len, const std::string& alphabet, bool donull) {
if (donull) {
g.GenerateNULL();
EXPECT_TRUE(g.HasNext());
- StringPiece sp = g.Next();
+ absl::string_view sp = g.Next();
EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
EXPECT_EQ(sp.size(), 0);
}
diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc
index b0c22f2..a094cb4 100644
--- a/re2/testing/tester.cc
+++ b/re2/testing/tester.cc
@@ -9,24 +9,25 @@
#include <string.h>
#include <string>
-#include "util/util.h"
-#include "util/flags.h"
+#include "absl/base/macros.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "re2/testing/tester.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
-DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
-DEFINE_FLAG(bool, log_okay, false, "log successful runs");
-DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
+ABSL_FLAG(bool, dump_prog, false, "dump regexp program");
+ABSL_FLAG(bool, log_okay, false, "log successful runs");
+ABSL_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
-DEFINE_FLAG(int, max_regexp_failures, 100,
- "maximum number of regexp test failures (-1 = unlimited)");
+ABSL_FLAG(int, max_regexp_failures, 100,
+ "maximum number of regexp test failures (-1 = unlimited)");
-DEFINE_FLAG(std::string, regexp_engines, "",
- "pattern to select regexp engines to test");
+ABSL_FLAG(std::string, regexp_engines, "",
+ "pattern to select regexp engines to test");
namespace re2 {
@@ -50,7 +51,7 @@ const char* engine_names[kEngineMax] = {
// Returns the name of the engine.
static const char* EngineName(Engine e) {
CHECK_GE(e, 0);
- CHECK_LT(e, arraysize(engine_names));
+ CHECK_LT(e, ABSL_ARRAYSIZE(engine_names));
CHECK(engine_names[e] != NULL);
return engine_names[e];
}
@@ -63,11 +64,11 @@ static uint32_t Engines() {
if (did_parse)
return cached_engines;
- if (GetFlag(FLAGS_regexp_engines).empty()) {
+ if (absl::GetFlag(FLAGS_regexp_engines).empty()) {
cached_engines = ~0;
} else {
for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
- if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
+ if (absl::GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
cached_engines |= 1<<i;
}
@@ -97,7 +98,7 @@ struct TestInstance::Result {
void ClearSubmatch() {
for (int i = 0; i < kMaxSubmatch; i++)
- submatch[i] = StringPiece();
+ submatch[i] = absl::string_view();
}
bool skipped; // test skipped: wasn't applicable
@@ -105,24 +106,24 @@ struct TestInstance::Result {
bool untrusted; // don't really trust the answer
bool have_submatch; // computed all submatch info
bool have_submatch0; // computed just submatch[0]
- StringPiece submatch[kMaxSubmatch];
+ absl::string_view submatch[kMaxSubmatch];
};
typedef TestInstance::Result Result;
// Formats a single capture range s in text in the form (a,b)
// where a and b are the starting and ending offsets of s in text.
-static std::string FormatCapture(const StringPiece& text,
- const StringPiece& s) {
+static std::string FormatCapture(absl::string_view text,
+ absl::string_view s) {
if (s.data() == NULL)
return "(?,?)";
- return StringPrintf("(%td,%td)",
- BeginPtr(s) - BeginPtr(text),
- EndPtr(s) - BeginPtr(text));
+ return absl::StrFormat("(%d,%d)",
+ BeginPtr(s) - BeginPtr(text),
+ EndPtr(s) - BeginPtr(text));
}
// Returns whether text contains non-ASCII (>= 0x80) bytes.
-static bool NonASCII(const StringPiece& text) {
+static bool NonASCII(absl::string_view text) {
for (size_t i = 0; i < text.size(); i++)
if ((uint8_t)text[i] >= 0x80)
return true;
@@ -174,15 +175,15 @@ static ParseMode parse_modes[] = {
};
static std::string FormatMode(Regexp::ParseFlags flags) {
- for (size_t i = 0; i < arraysize(parse_modes); i++)
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(parse_modes); i++)
if (parse_modes[i].parse_flags == flags)
return parse_modes[i].desc;
- return StringPrintf("%#x", static_cast<uint32_t>(flags));
+ return absl::StrFormat("%#x", static_cast<uint32_t>(flags));
}
// Constructs and saves all the matching engines that
// will be required for the given tests.
-TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
+TestInstance::TestInstance(absl::string_view regexp_str, Prog::MatchKind kind,
Regexp::ParseFlags flags)
: regexp_str_(regexp_str),
kind_(kind),
@@ -195,14 +196,14 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
re_(NULL),
re2_(NULL) {
- VLOG(1) << CEscape(regexp_str);
+ VLOG(1) << absl::CEscape(regexp_str);
// Compile regexp to prog.
// Always required - needed for backtracking (reference implementation).
RegexpStatus status;
regexp_ = Regexp::Parse(regexp_str, flags, &status);
if (regexp_ == NULL) {
- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+ LOG(INFO) << "Cannot parse: " << absl::CEscape(regexp_str_)
<< " mode: " << FormatMode(flags);
error_ = true;
return;
@@ -210,14 +211,14 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
num_captures_ = regexp_->NumCaptures();
prog_ = regexp_->CompileToProg(0);
if (prog_ == NULL) {
- LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
+ LOG(INFO) << "Cannot compile: " << absl::CEscape(regexp_str_);
error_ = true;
return;
}
- if (GetFlag(FLAGS_dump_prog)) {
+ if (absl::GetFlag(FLAGS_dump_prog)) {
LOG(INFO) << "Prog for "
<< " regexp "
- << CEscape(regexp_str_)
+ << absl::CEscape(regexp_str_)
<< " (" << FormatKind(kind_)
<< ", " << FormatMode(flags_)
<< ")\n"
@@ -228,11 +229,11 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
rprog_ = regexp_->CompileToReverseProg(0);
if (rprog_ == NULL) {
- LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
+ LOG(INFO) << "Cannot reverse compile: " << absl::CEscape(regexp_str_);
error_ = true;
return;
}
- if (GetFlag(FLAGS_dump_rprog))
+ if (absl::GetFlag(FLAGS_dump_rprog))
LOG(INFO) << rprog_->Dump();
}
@@ -256,7 +257,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
options.set_longest_match(true);
re2_ = new RE2(re, options);
if (!re2_->error().empty()) {
- LOG(INFO) << "Cannot RE2: " << CEscape(re);
+ LOG(INFO) << "Cannot RE2: " << absl::CEscape(re);
error_ = true;
return;
}
@@ -282,7 +283,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
// add one more layer of parens.
re_ = new PCRE("("+re+")", o);
if (!re_->error().empty()) {
- LOG(INFO) << "Cannot PCRE: " << CEscape(re);
+ LOG(INFO) << "Cannot PCRE: " << absl::CEscape(re);
error_ = true;
return;
}
@@ -301,11 +302,9 @@ TestInstance::~TestInstance() {
// Runs a single search using the named engine type.
// This interface hides all the irregularities of the various
// engine interfaces from the rest of this file.
-void TestInstance::RunSearch(Engine type,
- const StringPiece& orig_text,
- const StringPiece& orig_context,
- Prog::Anchor anchor,
- Result* result) {
+void TestInstance::RunSearch(Engine type, absl::string_view orig_text,
+ absl::string_view orig_context,
+ Prog::Anchor anchor, Result* result) {
if (regexp_ == NULL) {
result->skipped = true;
return;
@@ -314,8 +313,8 @@ void TestInstance::RunSearch(Engine type,
if (nsubmatch > kMaxSubmatch)
nsubmatch = kMaxSubmatch;
- StringPiece text = orig_text;
- StringPiece context = orig_context;
+ absl::string_view text = orig_text;
+ absl::string_view context = orig_context;
switch (type) {
default:
@@ -368,8 +367,8 @@ void TestInstance::RunSearch(Engine type,
result->submatch,
&result->skipped, NULL)) {
LOG(ERROR) << "Reverse DFA inconsistency: "
- << CEscape(regexp_str_)
- << " on " << CEscape(text);
+ << absl::CEscape(regexp_str_)
+ << " on " << absl::CEscape(text);
result->matched = false;
}
}
@@ -438,19 +437,19 @@ void TestInstance::RunSearch(Engine type,
// whitespace, not just vertical tab. Regexp::MimicsPCRE() is
// unable to handle all cases of this, unfortunately, so just
// catch them here. :(
- if (regexp_str_.find("\\v") != StringPiece::npos &&
- (text.find('\n') != StringPiece::npos ||
- text.find('\f') != StringPiece::npos ||
- text.find('\r') != StringPiece::npos)) {
+ if (regexp_str_.find("\\v") != absl::string_view::npos &&
+ (text.find('\n') != absl::string_view::npos ||
+ text.find('\f') != absl::string_view::npos ||
+ text.find('\r') != absl::string_view::npos)) {
result->skipped = true;
break;
}
// PCRE 8.34 or so started allowing vertical tab to match \s,
// following a change made in Perl 5.18. RE2 does not.
- if ((regexp_str_.find("\\s") != StringPiece::npos ||
- regexp_str_.find("\\S") != StringPiece::npos) &&
- text.find('\v') != StringPiece::npos) {
+ if ((regexp_str_.find("\\s") != absl::string_view::npos ||
+ regexp_str_.find("\\S") != absl::string_view::npos) &&
+ text.find('\v') != absl::string_view::npos) {
result->skipped = true;
break;
}
@@ -513,7 +512,7 @@ static bool ResultOkay(const Result& r, const Result& correct) {
}
// Runs a single test.
-bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+bool TestInstance::RunCase(absl::string_view text, absl::string_view context,
Prog::Anchor anchor) {
// Backtracking is the gold standard.
Result correct;
@@ -521,12 +520,12 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
if (correct.skipped) {
if (regexp_ == NULL)
return true;
- LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+ LOG(ERROR) << "Skipped backtracking! " << absl::CEscape(regexp_str_)
<< " " << FormatMode(flags_);
return false;
}
- VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
- << " text " << CEscape(text)
+ VLOG(1) << "Try: regexp " << absl::CEscape(regexp_str_)
+ << " text " << absl::CEscape(text)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
@@ -541,7 +540,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
Result r;
RunSearch(i, text, context, anchor, &r);
if (ResultOkay(r, correct)) {
- if (GetFlag(FLAGS_log_okay))
+ if (absl::GetFlag(FLAGS_log_okay))
LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
continue;
}
@@ -571,14 +570,14 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
if (r.submatch[i].data() != correct.submatch[i].data() ||
r.submatch[i].size() != correct.submatch[i].size()) {
LOG(INFO) <<
- StringPrintf(" $%d: should be %s is %s",
- i,
- FormatCapture(text, correct.submatch[i]).c_str(),
- FormatCapture(text, r.submatch[i]).c_str());
+ absl::StrFormat(" $%d: should be %s is %s",
+ i,
+ FormatCapture(text, correct.submatch[i]),
+ FormatCapture(text, r.submatch[i]));
} else {
LOG(INFO) <<
- StringPrintf(" $%d: %s ok", i,
- FormatCapture(text, r.submatch[i]).c_str());
+ absl::StrFormat(" $%d: %s ok", i,
+ FormatCapture(text, r.submatch[i]));
}
}
}
@@ -586,7 +585,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
if (!all_okay) {
// This will be initialised once (after flags have been initialised)
// and that is desirable because we want to enforce a global limit.
- static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
+ static int max_regexp_failures = absl::GetFlag(FLAGS_max_regexp_failures);
if (max_regexp_failures > 0 && --max_regexp_failures == 0)
LOG(QFATAL) << "Too many regexp failures.";
}
@@ -595,22 +594,22 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
}
void TestInstance::LogMatch(const char* prefix, Engine e,
- const StringPiece& text, const StringPiece& context,
+ absl::string_view text, absl::string_view context,
Prog::Anchor anchor) {
LOG(INFO) << prefix
<< EngineName(e)
<< " regexp "
- << CEscape(regexp_str_)
+ << absl::CEscape(regexp_str_)
<< " "
- << CEscape(regexp_->ToString())
+ << absl::CEscape(regexp_->ToString())
<< " text "
- << CEscape(text)
+ << absl::CEscape(text)
<< " ("
<< BeginPtr(text) - BeginPtr(context)
<< ","
<< EndPtr(text) - BeginPtr(context)
<< ") of context "
- << CEscape(context)
+ << absl::CEscape(context)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
@@ -624,10 +623,10 @@ static Prog::MatchKind kinds[] = {
};
// Test all possible match kinds and parse modes.
-Tester::Tester(const StringPiece& regexp) {
+Tester::Tester(absl::string_view regexp) {
error_ = false;
- for (size_t i = 0; i < arraysize(kinds); i++) {
- for (size_t j = 0; j < arraysize(parse_modes); j++) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(kinds); i++) {
+ for (size_t j = 0; j < ABSL_ARRAYSIZE(parse_modes); j++) {
TestInstance* t = new TestInstance(regexp, kinds[i],
parse_modes[j].parse_flags);
error_ |= t->error();
@@ -641,8 +640,8 @@ Tester::~Tester() {
delete v_[i];
}
-bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
- Prog::Anchor anchor) {
+bool Tester::TestCase(absl::string_view text, absl::string_view context,
+ Prog::Anchor anchor) {
bool okay = true;
for (size_t i = 0; i < v_.size(); i++)
okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
@@ -654,10 +653,10 @@ static Prog::Anchor anchors[] = {
Prog::kUnanchored
};
-bool Tester::TestInput(const StringPiece& text) {
+bool Tester::TestInput(absl::string_view text) {
bool okay = TestInputInContext(text, text);
if (!text.empty()) {
- StringPiece sp;
+ absl::string_view sp;
sp = text;
sp.remove_prefix(1);
okay &= TestInputInContext(sp, text);
@@ -668,16 +667,16 @@ bool Tester::TestInput(const StringPiece& text) {
return okay;
}
-bool Tester::TestInputInContext(const StringPiece& text,
- const StringPiece& context) {
+bool Tester::TestInputInContext(absl::string_view text,
+ absl::string_view context) {
bool okay = true;
- for (size_t i = 0; i < arraysize(anchors); i++)
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(anchors); i++)
okay &= TestCase(text, context, anchors[i]);
return okay;
}
-bool TestRegexpOnText(const StringPiece& regexp,
- const StringPiece& text) {
+bool TestRegexpOnText(absl::string_view regexp,
+ absl::string_view text) {
Tester t(regexp);
return t.TestInput(text);
}
diff --git a/re2/testing/tester.h b/re2/testing/tester.h
index 47d0c43..59be5ea 100644
--- a/re2/testing/tester.h
+++ b/re2/testing/tester.h
@@ -10,7 +10,7 @@
#include <vector>
-#include "re2/stringpiece.h"
+#include "absl/strings/string_view.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/re2.h"
@@ -51,7 +51,7 @@ class TestInstance {
public:
struct Result;
- TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
+ TestInstance(absl::string_view regexp, Prog::MatchKind kind,
Regexp::ParseFlags flags);
~TestInstance();
Regexp::ParseFlags flags() { return flags_; }
@@ -59,20 +59,18 @@ class TestInstance {
// Runs a single test case: search in text, which is in context,
// using the given anchoring.
- bool RunCase(const StringPiece& text, const StringPiece& context,
+ bool RunCase(absl::string_view text, absl::string_view context,
Prog::Anchor anchor);
private:
// Runs a single search using the named engine type.
- void RunSearch(Engine type,
- const StringPiece& text, const StringPiece& context,
- Prog::Anchor anchor,
- Result *result);
+ void RunSearch(Engine type, absl::string_view text, absl::string_view context,
+ Prog::Anchor anchor, Result* result);
- void LogMatch(const char* prefix, Engine e, const StringPiece& text,
- const StringPiece& context, Prog::Anchor anchor);
+ void LogMatch(const char* prefix, Engine e, absl::string_view text,
+ absl::string_view context, Prog::Anchor anchor);
- const StringPiece regexp_str_; // regexp being tested
+ absl::string_view regexp_str_; // regexp being tested
Prog::MatchKind kind_; // kind of match
Regexp::ParseFlags flags_; // flags for parsing regexp_str_
bool error_; // error during constructor?
@@ -91,21 +89,21 @@ class TestInstance {
// A group of TestInstances for all possible configurations.
class Tester {
public:
- explicit Tester(const StringPiece& regexp);
+ explicit Tester(absl::string_view regexp);
~Tester();
bool error() { return error_; }
// Runs a single test case: search in text, which is in context,
// using the given anchoring.
- bool TestCase(const StringPiece& text, const StringPiece& context,
+ bool TestCase(absl::string_view text, absl::string_view context,
Prog::Anchor anchor);
// Run TestCase(text, text, anchor) for all anchoring modes.
- bool TestInput(const StringPiece& text);
+ bool TestInput(absl::string_view text);
// Run TestCase(text, context, anchor) for all anchoring modes.
- bool TestInputInContext(const StringPiece& text, const StringPiece& context);
+ bool TestInputInContext(absl::string_view text, absl::string_view context);
private:
bool error_;
@@ -116,7 +114,7 @@ class Tester {
};
// Run all possible tests using regexp and text.
-bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
+bool TestRegexpOnText(absl::string_view regexp, absl::string_view text);
} // namespace re2
diff --git a/re2/tostring.cc b/re2/tostring.cc
index 9c1c038..33179fd 100644
--- a/re2/tostring.cc
+++ b/re2/tostring.cc
@@ -8,9 +8,8 @@
#include <string.h>
#include <string>
-#include "util/util.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
-#include "util/strutil.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
@@ -216,11 +215,11 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
case kRegexpRepeat:
if (re->max() == -1)
- t_->append(StringPrintf("{%d,}", re->min()));
+ t_->append(absl::StrFormat("{%d,}", re->min()));
else if (re->min() == re->max())
- t_->append(StringPrintf("{%d}", re->min()));
+ t_->append(absl::StrFormat("{%d}", re->min()));
else
- t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
+ t_->append(absl::StrFormat("{%d,%d}", re->min(), re->max()));
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
@@ -291,7 +290,7 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
// There's no syntax accepted by the parser to generate
// this node (it is generated by RE2::Set) so make something
// up that is readable but won't compile.
- t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
+ t_->append(absl::StrFormat("(?HaveMatch:%d)", re->match_id()));
break;
}
@@ -332,10 +331,10 @@ static void AppendCCChar(std::string* t, Rune r) {
}
if (r < 0x100) {
- *t += StringPrintf("\\x%02x", static_cast<int>(r));
+ *t += absl::StrFormat("\\x%02x", static_cast<int>(r));
return;
}
- *t += StringPrintf("\\x{%x}", static_cast<int>(r));
+ *t += absl::StrFormat("\\x{%x}", static_cast<int>(r));
}
static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
diff --git a/re2/unicode.py b/re2/unicode.py
index 727bea5..9173407 100644
--- a/re2/unicode.py
+++ b/re2/unicode.py
@@ -10,10 +10,10 @@ from __future__ import print_function
import os
import re
-from six.moves import urllib
+import urllib.request
# Directory or URL where Unicode tables reside.
-_UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd"
+_UNICODE_DIR = "https://www.unicode.org/Public/15.1.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF
diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc
index d9de282..297d0c8 100644
--- a/re2/unicode_casefold.cc
+++ b/re2/unicode_casefold.cc
@@ -7,7 +7,7 @@
namespace re2 {
-// 1424 groups, 2878 pairs, 367 ranges
+// 1427 groups, 2884 pairs, 372 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
@@ -141,11 +141,13 @@ const CaseFold unicode_casefold[] = {
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
+ { 912, 912, 7235 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
+ { 944, 944, 7219 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
@@ -278,9 +280,11 @@ const CaseFold unicode_casefold[] = {
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
+ { 8147, 8147, -7235 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
+ { 8163, 8163, -7219 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
@@ -354,6 +358,7 @@ const CaseFold unicode_casefold[] = {
{ 42997, 42998, OddEven },
{ 43859, 43859, -928 },
{ 43888, 43967, -38864 },
+ { 64261, 64262, OddEven },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
@@ -377,9 +382,9 @@ const CaseFold unicode_casefold[] = {
{ 125184, 125217, 34 },
{ 125218, 125251, -34 },
};
-const int num_unicode_casefold = 367;
+const int num_unicode_casefold = 372;
-// 1424 groups, 1454 pairs, 205 ranges
+// 1427 groups, 1457 pairs, 208 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
@@ -515,8 +520,10 @@ const CaseFold unicode_tolower[] = {
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
+ { 8147, 8147, -7235 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
+ { 8163, 8163, -7219 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
@@ -575,6 +582,7 @@ const CaseFold unicode_tolower[] = {
{ 42966, 42968, EvenOddSkip },
{ 42997, 42997, OddEven },
{ 43888, 43967, -38864 },
+ { 64261, 64261, OddEven },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
{ 66736, 66771, 40 },
@@ -587,7 +595,7 @@ const CaseFold unicode_tolower[] = {
{ 93760, 93791, 32 },
{ 125184, 125217, 34 },
};
-const int num_unicode_tolower = 205;
+const int num_unicode_tolower = 208;
diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h
index 8bdbb42..4acad68 100644
--- a/re2/unicode_casefold.h
+++ b/re2/unicode_casefold.h
@@ -41,7 +41,6 @@
#include <stdint.h>
-#include "util/util.h"
#include "util/utf.h"
namespace re2 {
diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc
index 2a8d7da..b2a7ba6 100644
--- a/re2/unicode_groups.cc
+++ b/re2/unicode_groups.cc
@@ -29,7 +29,7 @@ static const URange16 C_range16[] = {
static const URange32 C_range32[] = {
{ 69821, 69821 },
{ 69837, 69837 },
- { 78896, 78904 },
+ { 78896, 78911 },
{ 113824, 113827 },
{ 119155, 119162 },
{ 917505, 917505 },
@@ -60,7 +60,7 @@ static const URange16 Cf_range16[] = {
static const URange32 Cf_range32[] = {
{ 69821, 69821 },
{ 69837, 69837 },
- { 78896, 78904 },
+ { 78896, 78911 },
{ 113824, 113827 },
{ 119155, 119162 },
{ 917505, 917505 },
@@ -548,6 +548,7 @@ static const URange32 L_range32[] = {
{ 70108, 70108 },
{ 70144, 70161 },
{ 70163, 70187 },
+ { 70207, 70208 },
{ 70272, 70278 },
{ 70280, 70280 },
{ 70282, 70285 },
@@ -610,11 +611,15 @@ static const URange32 L_range32[] = {
{ 73066, 73097 },
{ 73112, 73112 },
{ 73440, 73458 },
+ { 73474, 73474 },
+ { 73476, 73488 },
+ { 73490, 73523 },
{ 73648, 73648 },
{ 73728, 74649 },
{ 74880, 75075 },
{ 77712, 77808 },
- { 77824, 78894 },
+ { 77824, 78895 },
+ { 78913, 78918 },
{ 82944, 83526 },
{ 92160, 92728 },
{ 92736, 92766 },
@@ -637,7 +642,9 @@ static const URange32 L_range32[] = {
{ 110581, 110587 },
{ 110589, 110590 },
{ 110592, 110882 },
+ { 110898, 110898 },
{ 110928, 110930 },
+ { 110933, 110933 },
{ 110948, 110951 },
{ 110960, 111355 },
{ 113664, 113770 },
@@ -675,11 +682,14 @@ static const URange32 L_range32[] = {
{ 120746, 120770 },
{ 120772, 120779 },
{ 122624, 122654 },
+ { 122661, 122666 },
+ { 122928, 122989 },
{ 123136, 123180 },
{ 123191, 123197 },
{ 123214, 123214 },
{ 123536, 123565 },
{ 123584, 123627 },
+ { 124112, 124139 },
{ 124896, 124902 },
{ 124904, 124907 },
{ 124909, 124910 },
@@ -721,12 +731,14 @@ static const URange32 L_range32[] = {
{ 126629, 126633 },
{ 126635, 126651 },
{ 131072, 173791 },
- { 173824, 177976 },
+ { 173824, 177977 },
{ 177984, 178205 },
{ 178208, 183969 },
{ 183984, 191456 },
+ { 191472, 192093 },
{ 194560, 195101 },
{ 196608, 201546 },
+ { 201552, 205743 },
};
static const URange16 Ll_range16[] = {
{ 97, 122 },
@@ -1387,6 +1399,7 @@ static const URange32 Ll_range32[] = {
{ 120779, 120779 },
{ 122624, 122633 },
{ 122635, 122654 },
+ { 122661, 122666 },
{ 125218, 125251 },
};
static const URange16 Lm_range16[] = {
@@ -1459,7 +1472,9 @@ static const URange32 Lm_range32[] = {
{ 110576, 110579 },
{ 110581, 110587 },
{ 110589, 110590 },
+ { 122928, 122989 },
{ 123191, 123197 },
+ { 124139, 124139 },
{ 125259, 125259 },
};
static const URange16 Lo_range16[] = {
@@ -1829,6 +1844,7 @@ static const URange32 Lo_range32[] = {
{ 70108, 70108 },
{ 70144, 70161 },
{ 70163, 70187 },
+ { 70207, 70208 },
{ 70272, 70278 },
{ 70280, 70280 },
{ 70282, 70285 },
@@ -1890,11 +1906,15 @@ static const URange32 Lo_range32[] = {
{ 73066, 73097 },
{ 73112, 73112 },
{ 73440, 73458 },
+ { 73474, 73474 },
+ { 73476, 73488 },
+ { 73490, 73523 },
{ 73648, 73648 },
{ 73728, 74649 },
{ 74880, 75075 },
{ 77712, 77808 },
- { 77824, 78894 },
+ { 77824, 78895 },
+ { 78913, 78918 },
{ 82944, 83526 },
{ 92160, 92728 },
{ 92736, 92766 },
@@ -1909,7 +1929,9 @@ static const URange32 Lo_range32[] = {
{ 100352, 101589 },
{ 101632, 101640 },
{ 110592, 110882 },
+ { 110898, 110898 },
{ 110928, 110930 },
+ { 110933, 110933 },
{ 110948, 110951 },
{ 110960, 111355 },
{ 113664, 113770 },
@@ -1921,6 +1943,7 @@ static const URange32 Lo_range32[] = {
{ 123214, 123214 },
{ 123536, 123565 },
{ 123584, 123627 },
+ { 124112, 124138 },
{ 124896, 124902 },
{ 124904, 124907 },
{ 124909, 124910 },
@@ -1960,12 +1983,14 @@ static const URange32 Lo_range32[] = {
{ 126629, 126633 },
{ 126635, 126651 },
{ 131072, 173791 },
- { 173824, 177976 },
+ { 173824, 177977 },
{ 177984, 178205 },
{ 178208, 183969 },
{ 183984, 191456 },
+ { 191472, 192093 },
{ 194560, 195101 },
{ 196608, 201546 },
+ { 201552, 205743 },
};
static const URange16 Lt_range16[] = {
{ 453, 453 },
@@ -2710,6 +2735,7 @@ static const URange16 M_range16[] = {
{ 3274, 3277 },
{ 3285, 3286 },
{ 3298, 3299 },
+ { 3315, 3315 },
{ 3328, 3331 },
{ 3387, 3388 },
{ 3390, 3396 },
@@ -2728,7 +2754,7 @@ static const URange16 M_range16[] = {
{ 3655, 3662 },
{ 3761, 3761 },
{ 3764, 3772 },
- { 3784, 3789 },
+ { 3784, 3790 },
{ 3864, 3865 },
{ 3893, 3893 },
{ 3895, 3895 },
@@ -2832,6 +2858,7 @@ static const URange32 M_range32[] = {
{ 68325, 68326 },
{ 68900, 68903 },
{ 69291, 69292 },
+ { 69373, 69375 },
{ 69446, 69456 },
{ 69506, 69509 },
{ 69632, 69634 },
@@ -2851,6 +2878,7 @@ static const URange32 M_range32[] = {
{ 70094, 70095 },
{ 70188, 70199 },
{ 70206, 70206 },
+ { 70209, 70209 },
{ 70367, 70378 },
{ 70400, 70403 },
{ 70459, 70460 },
@@ -2898,6 +2926,12 @@ static const URange32 M_range32[] = {
{ 73104, 73105 },
{ 73107, 73111 },
{ 73459, 73462 },
+ { 73472, 73473 },
+ { 73475, 73475 },
+ { 73524, 73530 },
+ { 73534, 73538 },
+ { 78912, 78912 },
+ { 78919, 78933 },
{ 92912, 92916 },
{ 92976, 92982 },
{ 94031, 94031 },
@@ -2925,9 +2959,11 @@ static const URange32 M_range32[] = {
{ 122907, 122913 },
{ 122915, 122916 },
{ 122918, 122922 },
+ { 123023, 123023 },
{ 123184, 123190 },
{ 123566, 123566 },
{ 123628, 123631 },
+ { 124140, 124143 },
{ 125136, 125142 },
{ 125252, 125258 },
{ 917760, 917999 },
@@ -2968,6 +3004,7 @@ static const URange16 Mc_range16[] = {
{ 3271, 3272 },
{ 3274, 3275 },
{ 3285, 3286 },
+ { 3315, 3315 },
{ 3330, 3331 },
{ 3390, 3392 },
{ 3398, 3400 },
@@ -3108,6 +3145,10 @@ static const URange32 Mc_range32[] = {
{ 73107, 73108 },
{ 73110, 73110 },
{ 73461, 73462 },
+ { 73475, 73475 },
+ { 73524, 73525 },
+ { 73534, 73535 },
+ { 73537, 73537 },
{ 94033, 94087 },
{ 94192, 94193 },
{ 119141, 119142 },
@@ -3213,7 +3254,7 @@ static const URange16 Mn_range16[] = {
{ 3655, 3662 },
{ 3761, 3761 },
{ 3764, 3772 },
- { 3784, 3789 },
+ { 3784, 3790 },
{ 3864, 3865 },
{ 3893, 3893 },
{ 3895, 3895 },
@@ -3346,6 +3387,7 @@ static const URange32 Mn_range32[] = {
{ 68325, 68326 },
{ 68900, 68903 },
{ 69291, 69292 },
+ { 69373, 69375 },
{ 69446, 69456 },
{ 69506, 69509 },
{ 69633, 69633 },
@@ -3368,6 +3410,7 @@ static const URange32 Mn_range32[] = {
{ 70196, 70196 },
{ 70198, 70199 },
{ 70206, 70206 },
+ { 70209, 70209 },
{ 70367, 70367 },
{ 70371, 70378 },
{ 70400, 70401 },
@@ -3429,6 +3472,12 @@ static const URange32 Mn_range32[] = {
{ 73109, 73109 },
{ 73111, 73111 },
{ 73459, 73460 },
+ { 73472, 73473 },
+ { 73526, 73530 },
+ { 73536, 73536 },
+ { 73538, 73538 },
+ { 78912, 78912 },
+ { 78919, 78933 },
{ 92912, 92916 },
{ 92976, 92982 },
{ 94031, 94031 },
@@ -3453,9 +3502,11 @@ static const URange32 Mn_range32[] = {
{ 122907, 122913 },
{ 122915, 122916 },
{ 122918, 122922 },
+ { 123023, 123023 },
{ 123184, 123190 },
{ 123566, 123566 },
{ 123628, 123631 },
+ { 124140, 124143 },
{ 125136, 125142 },
{ 125252, 125258 },
{ 917760, 917999 },
@@ -3576,6 +3627,7 @@ static const URange32 N_range32[] = {
{ 72784, 72812 },
{ 73040, 73049 },
{ 73120, 73129 },
+ { 73552, 73561 },
{ 73664, 73684 },
{ 74752, 74862 },
{ 92768, 92777 },
@@ -3583,11 +3635,13 @@ static const URange32 N_range32[] = {
{ 93008, 93017 },
{ 93019, 93025 },
{ 93824, 93846 },
+ { 119488, 119507 },
{ 119520, 119539 },
{ 119648, 119672 },
{ 120782, 120831 },
{ 123200, 123209 },
{ 123632, 123641 },
+ { 124144, 124153 },
{ 125127, 125135 },
{ 125264, 125273 },
{ 126065, 126123 },
@@ -3655,12 +3709,14 @@ static const URange32 Nd_range32[] = {
{ 72784, 72793 },
{ 73040, 73049 },
{ 73120, 73129 },
+ { 73552, 73561 },
{ 92768, 92777 },
{ 92864, 92873 },
{ 93008, 93017 },
{ 120782, 120831 },
{ 123200, 123209 },
{ 123632, 123641 },
+ { 124144, 124153 },
{ 125264, 125273 },
{ 130032, 130041 },
};
@@ -3745,6 +3801,7 @@ static const URange32 No_range32[] = {
{ 73664, 73684 },
{ 93019, 93025 },
{ 93824, 93846 },
+ { 119488, 119507 },
{ 119520, 119539 },
{ 119648, 119672 },
{ 125127, 125135 },
@@ -3932,9 +3989,11 @@ static const URange32 P_range32[] = {
{ 72255, 72262 },
{ 72346, 72348 },
{ 72350, 72354 },
+ { 72448, 72457 },
{ 72769, 72773 },
{ 72816, 72817 },
{ 73463, 73464 },
+ { 73539, 73551 },
{ 73727, 73727 },
{ 74864, 74868 },
{ 77809, 77810 },
@@ -4255,9 +4314,11 @@ static const URange32 Po_range32[] = {
{ 72255, 72262 },
{ 72346, 72348 },
{ 72350, 72354 },
+ { 72448, 72457 },
{ 72769, 72773 },
{ 72816, 72817 },
{ 73463, 73464 },
+ { 73539, 73551 },
{ 73727, 73727 },
{ 74864, 74868 },
{ 77809, 77810 },
@@ -4460,7 +4521,7 @@ static const URange16 S_range16[] = {
{ 11904, 11929 },
{ 11931, 12019 },
{ 12032, 12245 },
- { 12272, 12283 },
+ { 12272, 12287 },
{ 12292, 12292 },
{ 12306, 12307 },
{ 12320, 12320 },
@@ -4470,6 +4531,7 @@ static const URange16 S_range16[] = {
{ 12688, 12689 },
{ 12694, 12703 },
{ 12736, 12771 },
+ { 12783, 12783 },
{ 12800, 12830 },
{ 12842, 12871 },
{ 12880, 12880 },
@@ -4564,10 +4626,10 @@ static const URange32 S_range32[] = {
{ 127568, 127569 },
{ 127584, 127589 },
{ 127744, 128727 },
- { 128733, 128748 },
+ { 128732, 128748 },
{ 128752, 128764 },
- { 128768, 128883 },
- { 128896, 128984 },
+ { 128768, 128886 },
+ { 128891, 128985 },
{ 128992, 129003 },
{ 129008, 129008 },
{ 129024, 129035 },
@@ -4578,15 +4640,13 @@ static const URange32 S_range32[] = {
{ 129200, 129201 },
{ 129280, 129619 },
{ 129632, 129645 },
- { 129648, 129652 },
- { 129656, 129660 },
- { 129664, 129670 },
- { 129680, 129708 },
- { 129712, 129722 },
- { 129728, 129733 },
- { 129744, 129753 },
- { 129760, 129767 },
- { 129776, 129782 },
+ { 129648, 129660 },
+ { 129664, 129672 },
+ { 129680, 129725 },
+ { 129727, 129733 },
+ { 129742, 129755 },
+ { 129760, 129768 },
+ { 129776, 129784 },
{ 129792, 129938 },
{ 129940, 129994 },
};
@@ -4805,7 +4865,7 @@ static const URange16 So_range16[] = {
{ 11904, 11929 },
{ 11931, 12019 },
{ 12032, 12245 },
- { 12272, 12283 },
+ { 12272, 12287 },
{ 12292, 12292 },
{ 12306, 12307 },
{ 12320, 12320 },
@@ -4814,6 +4874,7 @@ static const URange16 So_range16[] = {
{ 12688, 12689 },
{ 12694, 12703 },
{ 12736, 12771 },
+ { 12783, 12783 },
{ 12800, 12830 },
{ 12842, 12871 },
{ 12880, 12880 },
@@ -4882,10 +4943,10 @@ static const URange32 So_range32[] = {
{ 127584, 127589 },
{ 127744, 127994 },
{ 128000, 128727 },
- { 128733, 128748 },
+ { 128732, 128748 },
{ 128752, 128764 },
- { 128768, 128883 },
- { 128896, 128984 },
+ { 128768, 128886 },
+ { 128891, 128985 },
{ 128992, 129003 },
{ 129008, 129008 },
{ 129024, 129035 },
@@ -4896,15 +4957,13 @@ static const URange32 So_range32[] = {
{ 129200, 129201 },
{ 129280, 129619 },
{ 129632, 129645 },
- { 129648, 129652 },
- { 129656, 129660 },
- { 129664, 129670 },
- { 129680, 129708 },
- { 129712, 129722 },
- { 129728, 129733 },
- { 129744, 129753 },
- { 129760, 129767 },
- { 129776, 129782 },
+ { 129648, 129660 },
+ { 129664, 129672 },
+ { 129680, 129725 },
+ { 129727, 129733 },
+ { 129742, 129755 },
+ { 129760, 129768 },
+ { 129776, 129784 },
{ 129792, 129938 },
{ 129940, 129994 },
};
@@ -4972,6 +5031,7 @@ static const URange16 Arabic_range16[] = {
};
static const URange32 Arabic_range32[] = {
{ 69216, 69246 },
+ { 69373, 69375 },
{ 126464, 126467 },
{ 126469, 126495 },
{ 126497, 126498 },
@@ -5164,8 +5224,7 @@ static const URange16 Common_range16[] = {
{ 11126, 11157 },
{ 11159, 11263 },
{ 11776, 11869 },
- { 12272, 12283 },
- { 12288, 12292 },
+ { 12272, 12292 },
{ 12294, 12294 },
{ 12296, 12320 },
{ 12336, 12343 },
@@ -5175,6 +5234,7 @@ static const URange16 Common_range16[] = {
{ 12539, 12540 },
{ 12688, 12703 },
{ 12736, 12771 },
+ { 12783, 12783 },
{ 12832, 12895 },
{ 12927, 13007 },
{ 13055, 13055 },
@@ -5218,6 +5278,7 @@ static const URange32 Common_range32[] = {
{ 119171, 119172 },
{ 119180, 119209 },
{ 119214, 119274 },
+ { 119488, 119507 },
{ 119520, 119539 },
{ 119552, 119638 },
{ 119648, 119672 },
@@ -5258,10 +5319,10 @@ static const URange32 Common_range32[] = {
{ 127568, 127569 },
{ 127584, 127589 },
{ 127744, 128727 },
- { 128733, 128748 },
+ { 128732, 128748 },
{ 128752, 128764 },
- { 128768, 128883 },
- { 128896, 128984 },
+ { 128768, 128886 },
+ { 128891, 128985 },
{ 128992, 129003 },
{ 129008, 129008 },
{ 129024, 129035 },
@@ -5272,15 +5333,13 @@ static const URange32 Common_range32[] = {
{ 129200, 129201 },
{ 129280, 129619 },
{ 129632, 129645 },
- { 129648, 129652 },
- { 129656, 129660 },
- { 129664, 129670 },
- { 129680, 129708 },
- { 129712, 129722 },
- { 129728, 129733 },
- { 129744, 129753 },
- { 129760, 129767 },
- { 129776, 129782 },
+ { 129648, 129660 },
+ { 129664, 129672 },
+ { 129680, 129725 },
+ { 129727, 129733 },
+ { 129742, 129755 },
+ { 129760, 129768 },
+ { 129776, 129784 },
{ 129792, 129938 },
{ 129940, 129994 },
{ 130032, 130041 },
@@ -5319,6 +5378,10 @@ static const URange16 Cyrillic_range16[] = {
{ 42560, 42655 },
{ 65070, 65071 },
};
+static const URange32 Cyrillic_range32[] = {
+ { 122928, 122989 },
+ { 123023, 123023 },
+};
static const URange32 Deseret_range32[] = {
{ 66560, 66639 },
};
@@ -5328,6 +5391,9 @@ static const URange16 Devanagari_range16[] = {
{ 2406, 2431 },
{ 43232, 43263 },
};
+static const URange32 Devanagari_range32[] = {
+ { 72448, 72457 },
+};
static const URange32 Dives_Akuru_range32[] = {
{ 71936, 71942 },
{ 71945, 71945 },
@@ -5349,8 +5415,7 @@ static const URange32 Duployan_range32[] = {
{ 113820, 113823 },
};
static const URange32 Egyptian_Hieroglyphs_range32[] = {
- { 77824, 78894 },
- { 78896, 78904 },
+ { 77824, 78933 },
};
static const URange32 Elbasan_range32[] = {
{ 66816, 66855 },
@@ -5539,12 +5604,14 @@ static const URange32 Han_range32[] = {
{ 94178, 94179 },
{ 94192, 94193 },
{ 131072, 173791 },
- { 173824, 177976 },
+ { 173824, 177977 },
{ 177984, 178205 },
{ 178208, 183969 },
{ 183984, 191456 },
+ { 191472, 192093 },
{ 194560, 195101 },
{ 196608, 201546 },
+ { 201552, 205743 },
};
static const URange16 Hangul_range16[] = {
{ 4352, 4607 },
@@ -5591,6 +5658,7 @@ static const URange16 Hiragana_range16[] = {
};
static const URange32 Hiragana_range32[] = {
{ 110593, 110879 },
+ { 110898, 110898 },
{ 110928, 110930 },
{ 127488, 127488 },
};
@@ -5661,7 +5729,7 @@ static const URange16 Kannada_range16[] = {
{ 3293, 3294 },
{ 3296, 3299 },
{ 3302, 3311 },
- { 3313, 3314 },
+ { 3313, 3315 },
};
static const URange16 Katakana_range16[] = {
{ 12449, 12538 },
@@ -5678,8 +5746,14 @@ static const URange32 Katakana_range32[] = {
{ 110589, 110590 },
{ 110592, 110592 },
{ 110880, 110882 },
+ { 110933, 110933 },
{ 110948, 110951 },
};
+static const URange32 Kawi_range32[] = {
+ { 73472, 73488 },
+ { 73490, 73530 },
+ { 73534, 73561 },
+};
static const URange16 Kayah_Li_range16[] = {
{ 43264, 43309 },
{ 43311, 43311 },
@@ -5706,7 +5780,7 @@ static const URange16 Khmer_range16[] = {
};
static const URange32 Khojki_range32[] = {
{ 70144, 70161 },
- { 70163, 70206 },
+ { 70163, 70209 },
};
static const URange32 Khudawadi_range32[] = {
{ 70320, 70378 },
@@ -5721,7 +5795,7 @@ static const URange16 Lao_range16[] = {
{ 3751, 3773 },
{ 3776, 3780 },
{ 3782, 3782 },
- { 3784, 3789 },
+ { 3784, 3790 },
{ 3792, 3801 },
{ 3804, 3807 },
};
@@ -5766,6 +5840,7 @@ static const URange32 Latin_range32[] = {
{ 67463, 67504 },
{ 67506, 67514 },
{ 122624, 122654 },
+ { 122661, 122666 },
};
static const URange16 Lepcha_range16[] = {
{ 7168, 7223 },
@@ -5903,6 +5978,9 @@ static const URange32 Nabataean_range32[] = {
{ 67712, 67742 },
{ 67751, 67759 },
};
+static const URange32 Nag_Mundari_range32[] = {
+ { 124112, 124153 },
+};
static const URange32 Nandinagari_range32[] = {
{ 72096, 72103 },
{ 72106, 72151 },
@@ -6229,12 +6307,12 @@ static const URange16 Yi_range16[] = {
static const URange32 Zanabazar_Square_range32[] = {
{ 72192, 72263 },
};
-// 4038 16-bit ranges, 1712 32-bit ranges
+// 4042 16-bit ranges, 1778 32-bit ranges
const UGroup unicode_groups[] = {
{ "Adlam", +1, 0, 0, Adlam_range32, 3 },
{ "Ahom", +1, 0, 0, Ahom_range32, 3 },
{ "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 },
- { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 },
+ { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 36 },
{ "Armenian", +1, Armenian_range16, 4, 0, 0 },
{ "Avestan", +1, 0, 0, Avestan_range32, 2 },
{ "Balinese", +1, Balinese_range16, 2, 0, 0 },
@@ -6259,19 +6337,19 @@ const UGroup unicode_groups[] = {
{ "Cherokee", +1, Cherokee_range16, 3, 0, 0 },
{ "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 },
{ "Co", +1, Co_range16, 1, Co_range32, 2 },
- { "Common", +1, Common_range16, 91, Common_range32, 83 },
+ { "Common", +1, Common_range16, 91, Common_range32, 82 },
{ "Coptic", +1, Coptic_range16, 3, 0, 0 },
{ "Cs", +1, Cs_range16, 1, 0, 0 },
{ "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 },
{ "Cypriot", +1, 0, 0, Cypriot_range32, 6 },
{ "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 },
- { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 },
+ { "Cyrillic", +1, Cyrillic_range16, 8, Cyrillic_range32, 2 },
{ "Deseret", +1, 0, 0, Deseret_range32, 1 },
- { "Devanagari", +1, Devanagari_range16, 4, 0, 0 },
+ { "Devanagari", +1, Devanagari_range16, 4, Devanagari_range32, 1 },
{ "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 },
{ "Dogra", +1, 0, 0, Dogra_range32, 1 },
{ "Duployan", +1, 0, 0, Duployan_range32, 5 },
- { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 },
+ { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 },
{ "Elbasan", +1, 0, 0, Elbasan_range32, 1 },
{ "Elymaic", +1, 0, 0, Elymaic_range32, 1 },
{ "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 },
@@ -6283,13 +6361,13 @@ const UGroup unicode_groups[] = {
{ "Gujarati", +1, Gujarati_range16, 14, 0, 0 },
{ "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 },
{ "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 },
- { "Han", +1, Han_range16, 11, Han_range32, 9 },
+ { "Han", +1, Han_range16, 11, Han_range32, 11 },
{ "Hangul", +1, Hangul_range16, 14, 0, 0 },
{ "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 },
{ "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 },
{ "Hatran", +1, 0, 0, Hatran_range32, 3 },
{ "Hebrew", +1, Hebrew_range16, 9, 0, 0 },
- { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 },
+ { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 4 },
{ "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 },
{ "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 },
{ "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 },
@@ -6297,29 +6375,30 @@ const UGroup unicode_groups[] = {
{ "Javanese", +1, Javanese_range16, 3, 0, 0 },
{ "Kaithi", +1, 0, 0, Kaithi_range32, 2 },
{ "Kannada", +1, Kannada_range16, 13, 0, 0 },
- { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 6 },
+ { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 7 },
+ { "Kawi", +1, 0, 0, Kawi_range32, 3 },
{ "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 },
{ "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 },
{ "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 },
{ "Khmer", +1, Khmer_range16, 4, 0, 0 },
{ "Khojki", +1, 0, 0, Khojki_range32, 2 },
{ "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 },
- { "L", +1, L_range16, 380, L_range32, 268 },
+ { "L", +1, L_range16, 380, L_range32, 280 },
{ "Lao", +1, Lao_range16, 11, 0, 0 },
- { "Latin", +1, Latin_range16, 34, Latin_range32, 4 },
+ { "Latin", +1, Latin_range16, 34, Latin_range32, 5 },
{ "Lepcha", +1, Lepcha_range16, 3, 0, 0 },
{ "Limbu", +1, Limbu_range16, 5, 0, 0 },
{ "Linear_A", +1, 0, 0, Linear_A_range32, 3 },
{ "Linear_B", +1, 0, 0, Linear_B_range32, 7 },
{ "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 },
- { "Ll", +1, Ll_range16, 617, Ll_range32, 40 },
- { "Lm", +1, Lm_range16, 57, Lm_range32, 12 },
- { "Lo", +1, Lo_range16, 290, Lo_range32, 211 },
+ { "Ll", +1, Ll_range16, 617, Ll_range32, 41 },
+ { "Lm", +1, Lm_range16, 57, Lm_range32, 14 },
+ { "Lo", +1, Lo_range16, 290, Lo_range32, 221 },
{ "Lt", +1, Lt_range16, 10, 0, 0 },
{ "Lu", +1, Lu_range16, 605, Lu_range32, 41 },
{ "Lycian", +1, 0, 0, Lycian_range32, 1 },
{ "Lydian", +1, 0, 0, Lydian_range32, 2 },
- { "M", +1, M_range16, 189, M_range32, 110 },
+ { "M", +1, M_range16, 190, M_range32, 120 },
{ "Mahajani", +1, 0, 0, Mahajani_range32, 1 },
{ "Makasar", +1, 0, 0, Makasar_range32, 1 },
{ "Malayalam", +1, Malayalam_range16, 7, 0, 0 },
@@ -6327,7 +6406,7 @@ const UGroup unicode_groups[] = {
{ "Manichaean", +1, 0, 0, Manichaean_range32, 2 },
{ "Marchen", +1, 0, 0, Marchen_range32, 3 },
{ "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 },
- { "Mc", +1, Mc_range16, 111, Mc_range32, 66 },
+ { "Mc", +1, Mc_range16, 112, Mc_range32, 70 },
{ "Me", +1, Me_range16, 5, 0, 0 },
{ "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 },
{ "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 },
@@ -6335,21 +6414,22 @@ const UGroup unicode_groups[] = {
{ "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 },
{ "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 },
{ "Miao", +1, 0, 0, Miao_range32, 3 },
- { "Mn", +1, Mn_range16, 212, Mn_range32, 124 },
+ { "Mn", +1, Mn_range16, 212, Mn_range32, 134 },
{ "Modi", +1, 0, 0, Modi_range32, 2 },
{ "Mongolian", +1, Mongolian_range16, 5, Mongolian_range32, 1 },
{ "Mro", +1, 0, 0, Mro_range32, 3 },
{ "Multani", +1, 0, 0, Multani_range32, 5 },
{ "Myanmar", +1, Myanmar_range16, 3, 0, 0 },
- { "N", +1, N_range16, 67, N_range32, 67 },
+ { "N", +1, N_range16, 67, N_range32, 70 },
{ "Nabataean", +1, 0, 0, Nabataean_range32, 2 },
+ { "Nag_Mundari", +1, 0, 0, Nag_Mundari_range32, 1 },
{ "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 },
- { "Nd", +1, Nd_range16, 37, Nd_range32, 25 },
+ { "Nd", +1, Nd_range16, 37, Nd_range32, 27 },
{ "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 },
{ "Newa", +1, 0, 0, Newa_range32, 2 },
{ "Nko", +1, Nko_range16, 2, 0, 0 },
{ "Nl", +1, Nl_range16, 7, Nl_range32, 5 },
- { "No", +1, No_range16, 29, No_range32, 42 },
+ { "No", +1, No_range16, 29, No_range32, 43 },
{ "Nushu", +1, 0, 0, Nushu_range32, 2 },
{ "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 },
{ "Ogham", +1, Ogham_range16, 1, 0, 0 },
@@ -6366,7 +6446,7 @@ const UGroup unicode_groups[] = {
{ "Oriya", +1, Oriya_range16, 14, 0, 0 },
{ "Osage", +1, 0, 0, Osage_range32, 2 },
{ "Osmanya", +1, 0, 0, Osmanya_range32, 2 },
- { "P", +1, P_range16, 133, P_range32, 56 },
+ { "P", +1, P_range16, 133, P_range32, 58 },
{ "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 },
{ "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 },
{ "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 },
@@ -6377,12 +6457,12 @@ const UGroup unicode_groups[] = {
{ "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 },
{ "Phoenician", +1, 0, 0, Phoenician_range32, 2 },
{ "Pi", +1, Pi_range16, 11, 0, 0 },
- { "Po", +1, Po_range16, 130, Po_range32, 55 },
+ { "Po", +1, Po_range16, 130, Po_range32, 57 },
{ "Ps", +1, Ps_range16, 79, 0, 0 },
{ "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 },
{ "Rejang", +1, Rejang_range16, 2, 0, 0 },
{ "Runic", +1, Runic_range16, 2, 0, 0 },
- { "S", +1, S_range16, 151, S_range32, 83 },
+ { "S", +1, S_range16, 152, S_range32, 81 },
{ "Samaritan", +1, Samaritan_range16, 2, 0, 0 },
{ "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 },
{ "Sc", +1, Sc_range16, 18, Sc_range32, 3 },
@@ -6393,7 +6473,7 @@ const UGroup unicode_groups[] = {
{ "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 },
{ "Sk", +1, Sk_range16, 30, Sk_range32, 1 },
{ "Sm", +1, Sm_range16, 53, Sm_range32, 11 },
- { "So", +1, So_range16, 114, So_range32, 72 },
+ { "So", +1, So_range16, 115, So_range32, 70 },
{ "Sogdian", +1, 0, 0, Sogdian_range32, 1 },
{ "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 },
{ "Soyombo", +1, 0, 0, Soyombo_range32, 1 },
@@ -6429,7 +6509,7 @@ const UGroup unicode_groups[] = {
{ "Zp", +1, Zp_range16, 1, 0, 0 },
{ "Zs", +1, Zs_range16, 7, 0, 0 },
};
-const int num_unicode_groups = 197;
+const int num_unicode_groups = 199;
} // namespace re2
diff --git a/re2/unicode_groups.h b/re2/unicode_groups.h
index 75f55da..6dc6532 100644
--- a/re2/unicode_groups.h
+++ b/re2/unicode_groups.h
@@ -20,7 +20,6 @@
#include <stdint.h>
-#include "util/util.h"
#include "util/utf.h"
namespace re2 {
diff --git a/re2/walker-inl.h b/re2/walker-inl.h
index 4d064a0..45763a7 100644
--- a/re2/walker-inl.h
+++ b/re2/walker-inl.h
@@ -15,6 +15,7 @@
#include <stack>
+#include "absl/base/macros.h"
#include "util/logging.h"
#include "re2/regexp.h"
@@ -190,7 +191,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
- FALLTHROUGH_INTENDED;
+ ABSL_FALLTHROUGH_INTENDED;
}
default: {
if (re->nsub_ > 0) {
diff --git a/re2Config.cmake.in b/re2Config.cmake.in
index 7698107..6a177c6 100644
--- a/re2Config.cmake.in
+++ b/re2Config.cmake.in
@@ -13,6 +13,12 @@ if(UNIX)
find_dependency(Threads REQUIRED)
endif()
+find_dependency(absl REQUIRED)
+
+if(@RE2_USE_ICU@)
+ find_dependency(ICU REQUIRED COMPONENTS uc)
+endif()
+
check_required_components(re2)
if(TARGET re2::re2)
diff --git a/util/benchmark.cc b/util/benchmark.cc
deleted file mode 100644
index e39c334..0000000
--- a/util/benchmark.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <chrono>
-
-#include "util/benchmark.h"
-#include "util/flags.h"
-#include "re2/re2.h"
-
-#ifdef _WIN32
-#define snprintf _snprintf
-#endif
-
-using ::testing::Benchmark;
-
-static Benchmark* benchmarks[10000];
-static int nbenchmarks;
-
-void Benchmark::Register() {
- lo_ = std::max(1, lo_);
- hi_ = std::max(lo_, hi_);
- benchmarks[nbenchmarks++] = this;
-}
-
-static int64_t nsec() {
- return std::chrono::duration_cast<std::chrono::nanoseconds>(
- std::chrono::steady_clock::now().time_since_epoch())
- .count();
-}
-
-static int64_t t0;
-static int64_t ns;
-static int64_t bytes;
-static int64_t items;
-
-void StartBenchmarkTiming() {
- if (t0 == 0) {
- t0 = nsec();
- }
-}
-
-void StopBenchmarkTiming() {
- if (t0 != 0) {
- ns += nsec() - t0;
- t0 = 0;
- }
-}
-
-void SetBenchmarkBytesProcessed(int64_t b) { bytes = b; }
-
-void SetBenchmarkItemsProcessed(int64_t i) { items = i; }
-
-static void RunFunc(Benchmark* b, int iters, int arg) {
- t0 = nsec();
- ns = 0;
- bytes = 0;
- items = 0;
- b->func()(iters, arg);
- StopBenchmarkTiming();
-}
-
-static int round(int n) {
- int base = 1;
- while (base * 10 < n) base *= 10;
- if (n < 2 * base) return 2 * base;
- if (n < 5 * base) return 5 * base;
- return 10 * base;
-}
-
-static void RunBench(Benchmark* b, int arg) {
- int iters, last;
-
- // Run once just in case it's expensive.
- iters = 1;
- RunFunc(b, iters, arg);
- while (ns < (int)1e9 && iters < (int)1e9) {
- last = iters;
- if (ns / iters == 0) {
- iters = (int)1e9;
- } else {
- iters = (int)1e9 / static_cast<int>(ns / iters);
- }
- iters = std::max(last + 1, std::min(iters + iters / 2, 100 * last));
- iters = round(iters);
- RunFunc(b, iters, arg);
- }
-
- char mb[100];
- char suf[100];
- mb[0] = '\0';
- suf[0] = '\0';
- if (ns > 0 && bytes > 0)
- snprintf(mb, sizeof mb, "\t%7.2f MB/s",
- ((double)bytes / 1e6) / ((double)ns / 1e9));
- if (b->has_arg()) {
- if (arg >= (1 << 20)) {
- snprintf(suf, sizeof suf, "/%dM", arg / (1 << 20));
- } else if (arg >= (1 << 10)) {
- snprintf(suf, sizeof suf, "/%dK", arg / (1 << 10));
- } else {
- snprintf(suf, sizeof suf, "/%d", arg);
- }
- }
- printf("%s%s\t%8d\t%10lld ns/op%s\n", b->name(), suf, iters,
- (long long)ns / iters, mb);
- fflush(stdout);
-}
-
-static bool WantBench(const char* name, int argc, const char** argv) {
- if (argc == 1) return true;
- for (int i = 1; i < argc; i++) {
- if (RE2::PartialMatch(name, argv[i]))
- return true;
- }
- return false;
-}
-
-int main(int argc, const char** argv) {
- for (int i = 0; i < nbenchmarks; i++) {
- Benchmark* b = benchmarks[i];
- if (!WantBench(b->name(), argc, argv))
- continue;
- for (int arg = b->lo(); arg <= b->hi(); arg <<= 1)
- RunBench(b, arg);
- }
-}
diff --git a/util/benchmark.h b/util/benchmark.h
deleted file mode 100644
index d97b49e..0000000
--- a/util/benchmark.h
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef UTIL_BENCHMARK_H_
-#define UTIL_BENCHMARK_H_
-
-#include <stdint.h>
-#include <functional>
-
-#include "util/logging.h"
-#include "util/util.h"
-
-// Globals for the old benchmark API.
-void StartBenchmarkTiming();
-void StopBenchmarkTiming();
-void SetBenchmarkBytesProcessed(int64_t b);
-void SetBenchmarkItemsProcessed(int64_t i);
-
-namespace benchmark {
-
-// The new benchmark API implemented as a layer over the old benchmark API.
-// (Please refer to https://github.com/google/benchmark for documentation.)
-class State {
- private:
- class Iterator {
- public:
- // Benchmark code looks like this:
- //
- // for (auto _ : state) {
- // // ...
- // }
- //
- // We try to avoid compiler warnings about such variables being unused.
- struct ATTRIBUTE_UNUSED Value {};
-
- explicit Iterator(int64_t iters) : iters_(iters) {}
-
- bool operator!=(const Iterator& that) const {
- if (iters_ != that.iters_) {
- return true;
- } else {
- // We are about to stop the loop, so stop timing.
- StopBenchmarkTiming();
- return false;
- }
- }
-
- Value operator*() const {
- return Value();
- }
-
- Iterator& operator++() {
- --iters_;
- return *this;
- }
-
- private:
- int64_t iters_;
- };
-
- public:
- explicit State(int64_t iters)
- : iters_(iters), arg_(0), has_arg_(false) {}
-
- State(int64_t iters, int64_t arg)
- : iters_(iters), arg_(arg), has_arg_(true) {}
-
- Iterator begin() {
- // We are about to start the loop, so start timing.
- StartBenchmarkTiming();
- return Iterator(iters_);
- }
-
- Iterator end() {
- return Iterator(0);
- }
-
- void SetBytesProcessed(int64_t b) { SetBenchmarkBytesProcessed(b); }
- void SetItemsProcessed(int64_t i) { SetBenchmarkItemsProcessed(i); }
- int64_t iterations() const { return iters_; }
- // Pretend to support multiple arguments.
- int64_t range(int pos) const { CHECK(has_arg_); return arg_; }
-
- private:
- int64_t iters_;
- int64_t arg_;
- bool has_arg_;
-
- State(const State&) = delete;
- State& operator=(const State&) = delete;
-};
-
-} // namespace benchmark
-
-namespace testing {
-
-class Benchmark {
- public:
- Benchmark(const char* name, void (*func)(benchmark::State&))
- : name_(name),
- func_([func](int iters, int arg) {
- benchmark::State state(iters);
- func(state);
- }),
- lo_(0),
- hi_(0),
- has_arg_(false) {
- Register();
- }
-
- Benchmark(const char* name, void (*func)(benchmark::State&), int lo, int hi)
- : name_(name),
- func_([func](int iters, int arg) {
- benchmark::State state(iters, arg);
- func(state);
- }),
- lo_(lo),
- hi_(hi),
- has_arg_(true) {
- Register();
- }
-
- // Pretend to support multiple threads.
- Benchmark* ThreadRange(int lo, int hi) { return this; }
-
- const char* name() const { return name_; }
- const std::function<void(int, int)>& func() const { return func_; }
- int lo() const { return lo_; }
- int hi() const { return hi_; }
- bool has_arg() const { return has_arg_; }
-
- private:
- void Register();
-
- const char* name_;
- std::function<void(int, int)> func_;
- int lo_;
- int hi_;
- bool has_arg_;
-
- Benchmark(const Benchmark&) = delete;
- Benchmark& operator=(const Benchmark&) = delete;
-};
-
-} // namespace testing
-
-#define BENCHMARK(f) \
- ::testing::Benchmark* _benchmark_##f = \
- (new ::testing::Benchmark(#f, f))
-
-#define BENCHMARK_RANGE(f, lo, hi) \
- ::testing::Benchmark* _benchmark_##f = \
- (new ::testing::Benchmark(#f, f, lo, hi))
-
-#endif // UTIL_BENCHMARK_H_
diff --git a/util/flags.h b/util/flags.h
deleted file mode 100644
index 3386b72..0000000
--- a/util/flags.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef UTIL_FLAGS_H_
-#define UTIL_FLAGS_H_
-
-// Simplified version of Google's command line flags.
-// Does not support parsing the command line.
-// If you want to do that, see
-// https://gflags.github.io/gflags/
-
-#define DEFINE_FLAG(type, name, deflt, desc) \
- namespace re2 { type FLAGS_##name = deflt; }
-
-#define DECLARE_FLAG(type, name) \
- namespace re2 { extern type FLAGS_##name; }
-
-namespace re2 {
-template <typename T>
-T GetFlag(const T& flag) {
- return flag;
-}
-} // namespace re2
-
-#endif // UTIL_FLAGS_H_
diff --git a/util/fuzz.cc b/util/fuzz.cc
deleted file mode 100644
index 9cac118..0000000
--- a/util/fuzz.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2016 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-// Entry point for libFuzzer.
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
-
-int main(int argc, char** argv) {
- uint8_t data[32];
- for (int i = 0; i < 32; i++) {
- for (int j = 0; j < 32; j++) {
- data[j] = random() & 0xFF;
- }
- LLVMFuzzerTestOneInput(data, 32);
- }
- return 0;
-}
diff --git a/util/logging.h b/util/logging.h
index 5b2217f..946962b 100644
--- a/util/logging.h
+++ b/util/logging.h
@@ -13,7 +13,7 @@
#include <ostream>
#include <sstream>
-#include "util/util.h"
+#include "absl/base/attributes.h"
// Debug-only checking.
#define DCHECK(condition) assert(condition)
@@ -93,7 +93,7 @@ class LogMessageFatal : public LogMessage {
public:
LogMessageFatal(const char* file, int line)
: LogMessage(file, line) {}
- ATTRIBUTE_NORETURN ~LogMessageFatal() {
+ ABSL_ATTRIBUTE_NORETURN ~LogMessageFatal() {
Flush();
abort();
}
diff --git a/util/mix.h b/util/mix.h
deleted file mode 100644
index d85c172..0000000
--- a/util/mix.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2016 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef UTIL_MIX_H_
-#define UTIL_MIX_H_
-
-#include <stddef.h>
-#include <limits>
-
-namespace re2 {
-
-// Silence "truncation of constant value" warning for kMul in 32-bit mode.
-// Since this is a header file, push and then pop to limit the scope.
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable: 4309)
-#endif
-
-class HashMix {
- public:
- HashMix() : hash_(1) {}
- explicit HashMix(size_t val) : hash_(val + 83) {}
- void Mix(size_t val) {
- static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
- hash_ *= kMul;
- hash_ = ((hash_ << 19) |
- (hash_ >> (std::numeric_limits<size_t>::digits - 19))) + val;
- }
- size_t get() const { return hash_; }
- private:
- size_t hash_;
-};
-
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-} // namespace re2
-
-#endif // UTIL_MIX_H_
diff --git a/util/mutex.h b/util/mutex.h
deleted file mode 100644
index 158046b..0000000
--- a/util/mutex.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2007 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef UTIL_MUTEX_H_
-#define UTIL_MUTEX_H_
-
-/*
- * A simple mutex wrapper, supporting locks and read-write locks.
- * You should assume the locks are *not* re-entrant.
- */
-
-#ifdef _WIN32
-// Requires Windows Vista or Windows Server 2008 at minimum.
-#include <windows.h>
-#if defined(WINVER) && WINVER >= 0x0600
-#define MUTEX_IS_WIN32_SRWLOCK
-#endif
-#else
-#ifndef _POSIX_C_SOURCE
-#define _POSIX_C_SOURCE 200809L
-#endif
-#include <unistd.h>
-#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
-#define MUTEX_IS_PTHREAD_RWLOCK
-#endif
-#endif
-
-#if defined(MUTEX_IS_WIN32_SRWLOCK)
-typedef SRWLOCK MutexType;
-#elif defined(MUTEX_IS_PTHREAD_RWLOCK)
-#include <pthread.h>
-#include <stdlib.h>
-typedef pthread_rwlock_t MutexType;
-#else
-#include <mutex>
-typedef std::mutex MutexType;
-#endif
-
-namespace re2 {
-
-class Mutex {
- public:
- inline Mutex();
- inline ~Mutex();
- inline void Lock(); // Block if needed until free then acquire exclusively
- inline void Unlock(); // Release a lock acquired via Lock()
- // Note that on systems that don't support read-write locks, these may
- // be implemented as synonyms to Lock() and Unlock(). So you can use
- // these for efficiency, but don't use them anyplace where being able
- // to do shared reads is necessary to avoid deadlock.
- inline void ReaderLock(); // Block until free or shared then acquire a share
- inline void ReaderUnlock(); // Release a read share of this Mutex
- inline void WriterLock() { Lock(); } // Acquire an exclusive lock
- inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
-
- private:
- MutexType mutex_;
-
- // Catch the error of writing Mutex when intending MutexLock.
- Mutex(Mutex *ignored);
-
- Mutex(const Mutex&) = delete;
- Mutex& operator=(const Mutex&) = delete;
-};
-
-#if defined(MUTEX_IS_WIN32_SRWLOCK)
-
-Mutex::Mutex() : mutex_(SRWLOCK_INIT) { }
-Mutex::~Mutex() { }
-void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); }
-void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); }
-void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); }
-void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); }
-
-#elif defined(MUTEX_IS_PTHREAD_RWLOCK)
-
-#define SAFE_PTHREAD(fncall) \
- do { \
- if ((fncall) != 0) abort(); \
- } while (0)
-
-Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
-Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
-void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
-void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
-void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
-void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
-
-#undef SAFE_PTHREAD
-
-#else
-
-Mutex::Mutex() { }
-Mutex::~Mutex() { }
-void Mutex::Lock() { mutex_.lock(); }
-void Mutex::Unlock() { mutex_.unlock(); }
-void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex.
-void Mutex::ReaderUnlock() { Unlock(); }
-
-#endif
-
-// --------------------------------------------------------------------------
-// Some helper classes
-
-// MutexLock(mu) acquires mu when constructed and releases it when destroyed.
-class MutexLock {
- public:
- explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
- ~MutexLock() { mu_->Unlock(); }
- private:
- Mutex * const mu_;
-
- MutexLock(const MutexLock&) = delete;
- MutexLock& operator=(const MutexLock&) = delete;
-};
-
-// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
-class ReaderMutexLock {
- public:
- explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); }
- ~ReaderMutexLock() { mu_->ReaderUnlock(); }
- private:
- Mutex * const mu_;
-
- ReaderMutexLock(const ReaderMutexLock&) = delete;
- ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
-};
-
-class WriterMutexLock {
- public:
- explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); }
- ~WriterMutexLock() { mu_->WriterUnlock(); }
- private:
- Mutex * const mu_;
-
- WriterMutexLock(const WriterMutexLock&) = delete;
- WriterMutexLock& operator=(const WriterMutexLock&) = delete;
-};
-
-// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
-#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
-#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
-#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
-
-} // namespace re2
-
-#endif // UTIL_MUTEX_H_
diff --git a/util/pcre.cc b/util/pcre.cc
index b689851..f54cb28 100644
--- a/util/pcre.cc
+++ b/util/pcre.cc
@@ -15,14 +15,13 @@
#include <string>
#include <utility>
-#include "util/util.h"
-#include "util/flags.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_format.h"
#include "util/logging.h"
#include "util/pcre.h"
-#include "util/strutil.h"
// Silence warnings about the wacky formatting in the operator() functions.
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6
+#if !defined(__clang__) && defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
#endif
@@ -33,10 +32,10 @@
// not exceed main thread stacks. Note that other threads
// often have smaller stacks, and therefore tightening
// regexp_stack_limit may frequently be necessary.
-DEFINE_FLAG(int, regexp_stack_limit, 256 << 10,
- "default PCRE stack limit (bytes)");
-DEFINE_FLAG(int, regexp_match_limit, 1000000,
- "default PCRE match limit (function calls)");
+ABSL_FLAG(int, regexp_stack_limit, 256 << 10,
+ "default PCRE stack limit (bytes)");
+ABSL_FLAG(int, regexp_match_limit, 1000000,
+ "default PCRE match limit (function calls)");
#ifndef USEPCRE
@@ -191,24 +190,11 @@ pcre* PCRE::Compile(Anchor anchor) {
/***** Convenience interfaces *****/
-bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
- const PCRE& re,
- const Arg& a0,
- const Arg& a1,
- const Arg& a2,
- const Arg& a3,
- const Arg& a4,
- const Arg& a5,
- const Arg& a6,
- const Arg& a7,
- const Arg& a8,
- const Arg& a9,
- const Arg& a10,
- const Arg& a11,
- const Arg& a12,
- const Arg& a13,
- const Arg& a14,
- const Arg& a15) const {
+bool PCRE::FullMatchFunctor::operator()(
+ absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
+ const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+ const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+ const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
const Arg* args[kMaxArgs];
int n = 0;
if (&a0 == &no_more_args) goto done; args[n++] = &a0;
@@ -234,24 +220,11 @@ done:
return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
}
-bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
- const PCRE& re,
- const Arg& a0,
- const Arg& a1,
- const Arg& a2,
- const Arg& a3,
- const Arg& a4,
- const Arg& a5,
- const Arg& a6,
- const Arg& a7,
- const Arg& a8,
- const Arg& a9,
- const Arg& a10,
- const Arg& a11,
- const Arg& a12,
- const Arg& a13,
- const Arg& a14,
- const Arg& a15) const {
+bool PCRE::PartialMatchFunctor::operator()(
+ absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
+ const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+ const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+ const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
const Arg* args[kMaxArgs];
int n = 0;
if (&a0 == &no_more_args) goto done; args[n++] = &a0;
@@ -277,24 +250,11 @@ done:
return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
}
-bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
- const PCRE& pattern,
- const Arg& a0,
- const Arg& a1,
- const Arg& a2,
- const Arg& a3,
- const Arg& a4,
- const Arg& a5,
- const Arg& a6,
- const Arg& a7,
- const Arg& a8,
- const Arg& a9,
- const Arg& a10,
- const Arg& a11,
- const Arg& a12,
- const Arg& a13,
- const Arg& a14,
- const Arg& a15) const {
+bool PCRE::ConsumeFunctor::operator()(
+ absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
+ const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+ const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+ const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
const Arg* args[kMaxArgs];
int n = 0;
if (&a0 == &no_more_args) goto done; args[n++] = &a0;
@@ -326,24 +286,11 @@ done:
}
}
-bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
- const PCRE& pattern,
- const Arg& a0,
- const Arg& a1,
- const Arg& a2,
- const Arg& a3,
- const Arg& a4,
- const Arg& a5,
- const Arg& a6,
- const Arg& a7,
- const Arg& a8,
- const Arg& a9,
- const Arg& a10,
- const Arg& a11,
- const Arg& a12,
- const Arg& a13,
- const Arg& a14,
- const Arg& a15) const {
+bool PCRE::FindAndConsumeFunctor::operator()(
+ absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
+ const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
+ const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
+ const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
const Arg* args[kMaxArgs];
int n = 0;
if (&a0 == &no_more_args) goto done; args[n++] = &a0;
@@ -375,9 +322,8 @@ done:
}
}
-bool PCRE::Replace(std::string *str,
- const PCRE& pattern,
- const StringPiece& rewrite) {
+bool PCRE::Replace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite) {
int vec[kVecSize] = {};
int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
if (matches == 0)
@@ -393,9 +339,8 @@ bool PCRE::Replace(std::string *str,
return true;
}
-int PCRE::GlobalReplace(std::string *str,
- const PCRE& pattern,
- const StringPiece& rewrite) {
+int PCRE::GlobalReplace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite) {
int count = 0;
int vec[kVecSize] = {};
std::string out;
@@ -451,10 +396,8 @@ int PCRE::GlobalReplace(std::string *str,
return count;
}
-bool PCRE::Extract(const StringPiece &text,
- const PCRE& pattern,
- const StringPiece &rewrite,
- std::string *out) {
+bool PCRE::Extract(absl::string_view text, const PCRE& pattern,
+ absl::string_view rewrite, std::string* out) {
int vec[kVecSize] = {};
int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
if (matches == 0)
@@ -463,7 +406,7 @@ bool PCRE::Extract(const StringPiece &text,
return pattern.Rewrite(out, rewrite, text, vec, matches);
}
-std::string PCRE::QuoteMeta(const StringPiece& unquoted) {
+std::string PCRE::QuoteMeta(absl::string_view unquoted) {
std::string result;
result.reserve(unquoted.size() << 1);
@@ -508,12 +451,8 @@ void PCRE::ClearHitLimit() {
hit_limit_ = 0;
}
-int PCRE::TryMatch(const StringPiece& text,
- size_t startpos,
- Anchor anchor,
- bool empty_ok,
- int *vec,
- int vecsize) const {
+int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
+ bool empty_ok, int* vec, int vecsize) const {
pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
if (re == NULL) {
PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
@@ -522,12 +461,12 @@ int PCRE::TryMatch(const StringPiece& text,
int match_limit = match_limit_;
if (match_limit <= 0) {
- match_limit = GetFlag(FLAGS_regexp_match_limit);
+ match_limit = absl::GetFlag(FLAGS_regexp_match_limit);
}
int stack_limit = stack_limit_;
if (stack_limit <= 0) {
- stack_limit = GetFlag(FLAGS_regexp_stack_limit);
+ stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit);
}
pcre_extra extra = { 0 };
@@ -604,12 +543,8 @@ int PCRE::TryMatch(const StringPiece& text,
return rc;
}
-bool PCRE::DoMatchImpl(const StringPiece& text,
- Anchor anchor,
- size_t* consumed,
- const Arg* const* args,
- int n,
- int* vec,
+bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
+ const Arg* const* args, int n, int* vec,
int vecsize) const {
assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
if (NumberOfCapturingGroups() < n) {
@@ -654,11 +589,8 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
return true;
}
-bool PCRE::DoMatch(const StringPiece& text,
- Anchor anchor,
- size_t* consumed,
- const Arg* const args[],
- int n) const {
+bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
+ const Arg* const args[], int n) const {
assert(n >= 0);
const int vecsize = (1 + n) * 3; // results + PCRE workspace
// (as for kVecSize)
@@ -668,8 +600,8 @@ bool PCRE::DoMatch(const StringPiece& text,
return b;
}
-bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite,
- const StringPiece &text, int *vec, int veclen) const {
+bool PCRE::Rewrite(std::string* out, absl::string_view rewrite,
+ absl::string_view text, int* vec, int veclen) const {
int number_of_capturing_groups = NumberOfCapturingGroups();
for (const char *s = rewrite.data(), *end = s + rewrite.size();
s < end; s++) {
@@ -704,7 +636,7 @@ bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite,
return true;
}
-bool PCRE::CheckRewriteString(const StringPiece& rewrite,
+bool PCRE::CheckRewriteString(absl::string_view rewrite,
std::string* error) const {
int max_token = -1;
for (const char *s = rewrite.data(), *end = s + rewrite.size();
@@ -733,7 +665,7 @@ bool PCRE::CheckRewriteString(const StringPiece& rewrite,
}
if (max_token > NumberOfCapturingGroups()) {
- *error = StringPrintf(
+ *error = absl::StrFormat(
"Rewrite schema requests %d matches, but the regexp only has %d "
"parenthesized subexpressions.",
max_token, NumberOfCapturingGroups());
@@ -742,7 +674,6 @@ bool PCRE::CheckRewriteString(const StringPiece& rewrite,
return true;
}
-
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction.
int PCRE::NumberOfCapturingGroups() const {
@@ -774,9 +705,9 @@ bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
return true;
}
-bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
+bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
- *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
+ *(reinterpret_cast<absl::string_view*>(dest)) = absl::string_view(str, n);
return true;
}
diff --git a/util/pcre.h b/util/pcre.h
index 896b0bd..846f300 100644
--- a/util/pcre.h
+++ b/util/pcre.h
@@ -120,12 +120,12 @@
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
-// them as they match. This requires use of the "StringPiece" type,
+// them as they match. This requires use of the string_view type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
-// std::string contents = ...; // Fill string somehow
-// StringPiece input(contents); // Wrap a StringPiece around it
+// std::string contents = ...; // Fill string somehow
+// absl::string_view input(contents); // Wrap a string_view around it
//
// std::string var;
// int value;
@@ -161,8 +161,7 @@
// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d));
// will leave 64 in a, b, c, and d.
-#include "util/util.h"
-#include "re2/stringpiece.h"
+#include "absl/strings/string_view.h"
#ifdef USEPCRE
#include <pcre.h>
@@ -176,6 +175,16 @@ const bool UsingPCRE = false;
} // namespace re2
#endif
+// To produce a DLL, CMake can automatically export code symbols,
+// but not data symbols, so we have to annotate those manually...
+#if defined(RE2_BUILD_TESTING_DLL)
+#define RE2_TESTING_DLL __declspec(dllexport)
+#elif defined(RE2_CONSUME_TESTING_DLL)
+#define RE2_TESTING_DLL __declspec(dllimport)
+#else
+#define RE2_TESTING_DLL
+#endif
+
namespace re2 {
class PCRE_Options;
@@ -191,7 +200,7 @@ class PCRE {
// Marks end of arg list.
// ONLY USE IN OPTIONAL ARG DEFAULTS.
// DO NOT PASS EXPLICITLY.
- static Arg no_more_args;
+ RE2_TESTING_DLL static Arg no_more_args;
// Options are same value as those in pcre. We provide them here
// to avoid users needing to include pcre.h and also to isolate
@@ -246,10 +255,10 @@ class PCRE {
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
- // std::string (matched piece is copied to string)
- // StringPiece (StringPiece is mutated to point to matched piece)
- // T (where "bool T::ParseFrom(const char*, size_t)" exists)
- // (void*)NULL (the corresponding matched sub-pattern is not copied)
+ // std::string (matched piece is copied to string)
+ // absl::string_view (string_view is mutated to point to matched piece)
+ // T ("bool T::ParseFrom(const char*, size_t)" must exist)
+ // (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
@@ -267,7 +276,7 @@ class PCRE {
// int number;
// PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
struct FullMatchFunctor {
- bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
+ bool operator ()(absl::string_view text, const PCRE& re, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
@@ -286,12 +295,12 @@ class PCRE {
const Arg& ptr16 = no_more_args) const;
};
- static const FullMatchFunctor FullMatch;
+ RE2_TESTING_DLL static const FullMatchFunctor FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
struct PartialMatchFunctor {
- bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
+ bool operator ()(absl::string_view text, const PCRE& re, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
@@ -310,13 +319,13 @@ class PCRE {
const Arg& ptr16 = no_more_args) const;
};
- static const PartialMatchFunctor PartialMatch;
+ RE2_TESTING_DLL static const PartialMatchFunctor PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
struct ConsumeFunctor {
- bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args
+ bool operator ()(absl::string_view* input, const PCRE& pattern, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
@@ -335,14 +344,14 @@ class PCRE {
const Arg& ptr16 = no_more_args) const;
};
- static const ConsumeFunctor Consume;
+ RE2_TESTING_DLL static const ConsumeFunctor Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
struct FindAndConsumeFunctor {
- bool operator ()(StringPiece* input, const PCRE& pattern,
+ bool operator ()(absl::string_view* input, const PCRE& pattern, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
@@ -361,7 +370,7 @@ class PCRE {
const Arg& ptr16 = no_more_args) const;
};
- static const FindAndConsumeFunctor FindAndConsume;
+ RE2_TESTING_DLL static const FindAndConsumeFunctor FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
@@ -376,9 +385,8 @@ class PCRE {
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
- static bool Replace(std::string *str,
- const PCRE& pattern,
- const StringPiece& rewrite);
+ static bool Replace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite);
// Like Replace(), except replaces all occurrences of the pattern in
// the string with the rewrite. Replacements are not subject to
@@ -390,9 +398,8 @@ class PCRE {
// will leave "s" containing "yada dada doo"
//
// Returns the number of replacements made.
- static int GlobalReplace(std::string *str,
- const PCRE& pattern,
- const StringPiece& rewrite);
+ static int GlobalReplace(std::string* str, const PCRE& pattern,
+ absl::string_view rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
@@ -400,10 +407,8 @@ class PCRE {
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
- static bool Extract(const StringPiece &text,
- const PCRE& pattern,
- const StringPiece &rewrite,
- std::string *out);
+ static bool Extract(absl::string_view text, const PCRE& pattern,
+ absl::string_view rewrite, std::string* out);
// Check that the given @p rewrite string is suitable for use with
// this PCRE. It checks that:
@@ -418,8 +423,7 @@ class PCRE {
// @param error An error message is recorded here, iff we return false.
// Otherwise, it is unchanged.
// @return true, iff @p rewrite is suitable for use with the PCRE.
- bool CheckRewriteString(const StringPiece& rewrite,
- std::string* error) const;
+ bool CheckRewriteString(absl::string_view rewrite, std::string* error) const;
// Returns a copy of 'unquoted' with all potentially meaningful
// regexp characters backslash-escaped. The returned string, used
@@ -428,7 +432,7 @@ class PCRE {
// 1.5-2.0?
// becomes:
// 1\.5\-2\.0\?
- static std::string QuoteMeta(const StringPiece& unquoted);
+ static std::string QuoteMeta(absl::string_view unquoted);
/***** Generic matching interface (not so nice to use) *****/
@@ -441,9 +445,7 @@ class PCRE {
// General matching routine. Stores the length of the match in
// "*consumed" if successful.
- bool DoMatch(const StringPiece& text,
- Anchor anchor,
- size_t* consumed,
+ bool DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
const Arg* const* args, int n) const;
// Return the number of capturing subpatterns, or -1 if the
@@ -465,29 +467,17 @@ class PCRE {
// against "foo", "bar", and "baz" respectively.
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
// But the values for all subpattern are filled in into "vec".
- int TryMatch(const StringPiece& text,
- size_t startpos,
- Anchor anchor,
- bool empty_ok,
- int *vec,
- int vecsize) const;
-
- // Append the "rewrite" string, with backslash subsitutions from "text"
+ int TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
+ bool empty_ok, int* vec, int vecsize) const;
+
+ // Append the "rewrite" string, with backslash substitutions from "text"
// and "vec", to string "out".
- bool Rewrite(std::string *out,
- const StringPiece &rewrite,
- const StringPiece &text,
- int *vec,
- int veclen) const;
+ bool Rewrite(std::string* out, absl::string_view rewrite,
+ absl::string_view text, int* vec, int veclen) const;
// internal implementation for DoMatch
- bool DoMatchImpl(const StringPiece& text,
- Anchor anchor,
- size_t* consumed,
- const Arg* const args[],
- int n,
- int* vec,
- int vecsize) const;
+ bool DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
+ const Arg* const args[], int n, int* vec, int vecsize) const;
// Compile the regexp for the specified anchoring mode
pcre* Compile(Anchor anchor);
@@ -500,7 +490,7 @@ class PCRE {
bool report_errors_; // Silences error logging if false
int match_limit_; // Limit on execution resources
int stack_limit_; // Limit on stack resources (bytes)
- mutable int32_t hit_limit_; // Hit limit during execution (bool)
+ mutable int hit_limit_; // Hit limit during execution (bool)
PCRE(const PCRE&) = delete;
PCRE& operator=(const PCRE&) = delete;
@@ -586,7 +576,7 @@ class PCRE::Arg {
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(std::string, parse_string);
- MAKE_PARSER(StringPiece, parse_stringpiece);
+ MAKE_PARSER(absl::string_view, parse_string_view);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
@@ -613,14 +603,14 @@ class PCRE::Arg {
void* arg_;
Parser parser_;
- static bool parse_null (const char* str, size_t n, void* dest);
- static bool parse_char (const char* str, size_t n, void* dest);
- static bool parse_schar (const char* str, size_t n, void* dest);
- static bool parse_uchar (const char* str, size_t n, void* dest);
- static bool parse_float (const char* str, size_t n, void* dest);
- static bool parse_double (const char* str, size_t n, void* dest);
- static bool parse_string (const char* str, size_t n, void* dest);
- static bool parse_stringpiece (const char* str, size_t n, void* dest);
+ static bool parse_null (const char* str, size_t n, void* dest);
+ static bool parse_char (const char* str, size_t n, void* dest);
+ static bool parse_schar (const char* str, size_t n, void* dest);
+ static bool parse_uchar (const char* str, size_t n, void* dest);
+ static bool parse_float (const char* str, size_t n, void* dest);
+ static bool parse_double (const char* str, size_t n, void* dest);
+ static bool parse_string (const char* str, size_t n, void* dest);
+ static bool parse_string_view (const char* str, size_t n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
diff --git a/util/rune.cc b/util/rune.cc
index 4f625ea..a40e756 100644
--- a/util/rune.cc
+++ b/util/rune.cc
@@ -51,7 +51,7 @@ int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
- long l;
+ Rune l;
/*
* one character sequence
@@ -127,7 +127,7 @@ int
runetochar(char *str, const Rune *rune)
{
/* Runes are signed, so convert to unsigned for range check. */
- unsigned long c;
+ unsigned int c;
/*
* one character sequence
@@ -212,7 +212,7 @@ int
utflen(const char *s)
{
int c;
- long n;
+ int n;
Rune rune;
n = 0;
@@ -232,7 +232,7 @@ utflen(const char *s)
char*
utfrune(const char *s, Rune c)
{
- long c1;
+ int c1;
Rune r;
int n;
diff --git a/util/strutil.cc b/util/strutil.cc
index fb7e6b1..da06f85 100644
--- a/util/strutil.cc
+++ b/util/strutil.cc
@@ -2,79 +2,10 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-#include <stdarg.h>
-#include <stdio.h>
-
#include "util/strutil.h"
-#ifdef _WIN32
-#define snprintf _snprintf
-#define vsnprintf _vsnprintf
-#endif
-
namespace re2 {
-// ----------------------------------------------------------------------
-// CEscapeString()
-// Copies 'src' to 'dest', escaping dangerous characters using
-// C-style escape sequences. 'src' and 'dest' should not overlap.
-// Returns the number of bytes written to 'dest' (not including the \0)
-// or (size_t)-1 if there was insufficient space.
-// ----------------------------------------------------------------------
-static size_t CEscapeString(const char* src, size_t src_len,
- char* dest, size_t dest_len) {
- const char* src_end = src + src_len;
- size_t used = 0;
-
- for (; src < src_end; src++) {
- if (dest_len - used < 2) // space for two-character escape
- return (size_t)-1;
-
- unsigned char c = *src;
- switch (c) {
- case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
- case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
- case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
- case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
- case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
- case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
- default:
- // Note that if we emit \xNN and the src character after that is a hex
- // digit then that digit must be escaped too to prevent it being
- // interpreted as part of the character code by C.
- if (c < ' ' || c > '~') {
- if (dest_len - used < 5) // space for four-character escape + \0
- return (size_t)-1;
- snprintf(dest + used, 5, "\\%03o", c);
- used += 4;
- } else {
- dest[used++] = c; break;
- }
- }
- }
-
- if (dest_len - used < 1) // make sure that there is room for \0
- return (size_t)-1;
-
- dest[used] = '\0'; // doesn't count towards return value though
- return used;
-}
-
-// ----------------------------------------------------------------------
-// CEscape()
-// Copies 'src' to result, escaping dangerous characters using
-// C-style escape sequences. 'src' and 'dest' should not overlap.
-// ----------------------------------------------------------------------
-std::string CEscape(const StringPiece& src) {
- const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion
- char* dest = new char[dest_len];
- const size_t used = CEscapeString(src.data(), src.size(),
- dest, dest_len);
- std::string s = std::string(dest, used);
- delete[] dest;
- return s;
-}
-
void PrefixSuccessor(std::string* prefix) {
// We can increment the last character in the string and be done
// unless that character is 255, in which case we have to erase the
@@ -92,58 +23,4 @@ void PrefixSuccessor(std::string* prefix) {
}
}
-static void StringAppendV(std::string* dst, const char* format, va_list ap) {
- // First try with a small fixed size buffer
- char space[1024];
-
- // It's possible for methods that use a va_list to invalidate
- // the data in it upon use. The fix is to make a copy
- // of the structure before using it and use that copy instead.
- va_list backup_ap;
- va_copy(backup_ap, ap);
- int result = vsnprintf(space, sizeof(space), format, backup_ap);
- va_end(backup_ap);
-
- if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
- // It fit
- dst->append(space, result);
- return;
- }
-
- // Repeatedly increase buffer size until it fits
- int length = sizeof(space);
- while (true) {
- if (result < 0) {
- // Older behavior: just try doubling the buffer size
- length *= 2;
- } else {
- // We need exactly "result+1" characters
- length = result+1;
- }
- char* buf = new char[length];
-
- // Restore the va_list before we use it again
- va_copy(backup_ap, ap);
- result = vsnprintf(buf, length, format, backup_ap);
- va_end(backup_ap);
-
- if ((result >= 0) && (result < length)) {
- // It fit
- dst->append(buf, result);
- delete[] buf;
- return;
- }
- delete[] buf;
- }
-}
-
-std::string StringPrintf(const char* format, ...) {
- va_list ap;
- va_start(ap, format);
- std::string result;
- StringAppendV(&result, format, ap);
- va_end(ap);
- return result;
-}
-
} // namespace re2
diff --git a/util/strutil.h b/util/strutil.h
index a69908a..f5d87a5 100644
--- a/util/strutil.h
+++ b/util/strutil.h
@@ -7,14 +7,9 @@
#include <string>
-#include "re2/stringpiece.h"
-#include "util/util.h"
-
namespace re2 {
-std::string CEscape(const StringPiece& src);
void PrefixSuccessor(std::string* prefix);
-std::string StringPrintf(const char* format, ...);
} // namespace re2
diff --git a/util/test.cc b/util/test.cc
deleted file mode 100644
index 028616b..0000000
--- a/util/test.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <stdio.h>
-#include <string>
-
-#include "util/test.h"
-
-namespace testing {
-std::string TempDir() { return "/tmp/"; }
-} // namespace testing
-
-struct Test {
- void (*fn)(void);
- const char *name;
-};
-
-static Test tests[10000];
-static int ntests;
-
-void RegisterTest(void (*fn)(void), const char *name) {
- tests[ntests].fn = fn;
- tests[ntests++].name = name;
-}
-
-int main(int argc, char** argv) {
- for (int i = 0; i < ntests; i++) {
- printf("%s\n", tests[i].name);
- tests[i].fn();
- }
- printf("PASS\n");
- return 0;
-}
diff --git a/util/test.h b/util/test.h
deleted file mode 100644
index 54e6f8f..0000000
--- a/util/test.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef UTIL_TEST_H_
-#define UTIL_TEST_H_
-
-#include "util/util.h"
-#include "util/logging.h"
-
-namespace testing {
-std::string TempDir();
-} // namespace testing
-
-#define TEST(x, y) \
- void x##y(void); \
- TestRegisterer r##x##y(x##y, # x "." # y); \
- void x##y(void)
-
-void RegisterTest(void (*)(void), const char*);
-
-class TestRegisterer {
- public:
- TestRegisterer(void (*fn)(void), const char *s) {
- RegisterTest(fn, s);
- }
-};
-
-// fatal assertions
-#define ASSERT_TRUE CHECK
-#define ASSERT_FALSE(x) CHECK(!(x))
-#define ASSERT_EQ CHECK_EQ
-#define ASSERT_NE CHECK_NE
-#define ASSERT_LT CHECK_LT
-#define ASSERT_LE CHECK_LE
-#define ASSERT_GT CHECK_GT
-#define ASSERT_GE CHECK_GE
-
-// nonfatal assertions
-// TODO(rsc): Do a better job?
-#define EXPECT_TRUE CHECK
-#define EXPECT_FALSE(x) CHECK(!(x))
-#define EXPECT_EQ CHECK_EQ
-#define EXPECT_NE CHECK_NE
-#define EXPECT_LT CHECK_LT
-#define EXPECT_LE CHECK_LE
-#define EXPECT_GT CHECK_GT
-#define EXPECT_GE CHECK_GE
-
-#endif // UTIL_TEST_H_
diff --git a/util/util.h b/util/util.h
deleted file mode 100644
index 56e46c1..0000000
--- a/util/util.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2009 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef UTIL_UTIL_H_
-#define UTIL_UTIL_H_
-
-#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
-
-#ifndef ATTRIBUTE_NORETURN
-#if defined(__GNUC__)
-#define ATTRIBUTE_NORETURN __attribute__((noreturn))
-#elif defined(_MSC_VER)
-#define ATTRIBUTE_NORETURN __declspec(noreturn)
-#else
-#define ATTRIBUTE_NORETURN
-#endif
-#endif
-
-#ifndef ATTRIBUTE_UNUSED
-#if defined(__GNUC__)
-#define ATTRIBUTE_UNUSED __attribute__((unused))
-#else
-#define ATTRIBUTE_UNUSED
-#endif
-#endif
-
-#ifndef FALLTHROUGH_INTENDED
-#if defined(__clang__)
-#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
-#elif defined(__GNUC__) && __GNUC__ >= 7
-#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
-#else
-#define FALLTHROUGH_INTENDED do {} while (0)
-#endif
-#endif
-
-#ifndef NO_THREAD_SAFETY_ANALYSIS
-#define NO_THREAD_SAFETY_ANALYSIS
-#endif
-
-#endif // UTIL_UTIL_H_