diff options
123 files changed, 4797 insertions, 3679 deletions
diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000..540fb57 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,23 @@ +# Copyright 2022 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Enable Bzlmod. This will be the default eventually... +build --enable_bzlmod +# Enable layering check features. Useful on Clang only. +build --features=layering_check +# Enable parse headers features. Enforcing that headers are self-contained. +build --features=parse_headers + +# Abseil requires C++14 at minimum. +# Previously, the flag was set via `BAZEL_CXXOPTS`. On macOS, we also had to set +# `BAZEL_USE_CPP_ONLY_TOOLCHAIN` since Bazel wouldn't respect the former without +# the latter. However, the latter stopped Bazel from using Xcode and `-framework +# Foundation`, which CCTZ (vendored into Abseil) requires. +build --enable_platform_specific_config +build:linux --cxxopt=-std=c++14 +build:macos --cxxopt=-std=c++14 +build:windows --cxxopt=/std:c++14 + +# Print test logs for failed tests. +test --test_output=errors diff --git a/.github/bazel.sh b/.github/bazel.sh index fbe92e6..7295ec6 100755 --- a/.github/bazel.sh +++ b/.github/bazel.sh @@ -3,7 +3,7 @@ set -eux bazel clean bazel build --compilation_mode=dbg -- //:all -bazel test --compilation_mode=dbg --test_output=errors -- //:all \ +bazel test --compilation_mode=dbg -- //:all \ -//:dfa_test \ -//:exhaustive1_test \ -//:exhaustive2_test \ @@ -13,7 +13,7 @@ bazel test --compilation_mode=dbg --test_output=errors -- //:all \ bazel clean bazel build --compilation_mode=opt -- //:all -bazel test --compilation_mode=opt --test_output=errors -- //:all \ +bazel test --compilation_mode=opt -- //:all \ -//:dfa_test \ -//:exhaustive1_test \ -//:exhaustive2_test \ diff --git a/.github/cmake.sh b/.github/cmake.sh index 145a843..782334e 100755 --- a/.github/cmake.sh +++ b/.github/cmake.sh @@ -1,11 +1,11 @@ #!/bin/bash set -eux -cmake . 
-D CMAKE_BUILD_TYPE=Debug +cmake . -D CMAKE_BUILD_TYPE=Debug -D RE2_BUILD_TESTING=ON "$@" cmake --build . --config Debug --clean-first ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random' -cmake . -D CMAKE_BUILD_TYPE=Release +cmake . -D CMAKE_BUILD_TYPE=Release -D RE2_BUILD_TESTING=ON "$@" cmake --build . --config Release --clean-first ctest -C Release --output-on-failure -E 'dfa|exhaustive|random' diff --git a/.github/workflows/ci-bazel.yml b/.github/workflows/ci-bazel.yml index 681034d..013b52c 100644 --- a/.github/workflows/ci-bazel.yml +++ b/.github/workflows/ci-bazel.yml @@ -12,6 +12,8 @@ jobs: env: BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 + # TODO(junyer): Use `v2` whenever a new release is tagged. + - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51 - run: .github/bazel.sh shell: bash diff --git a/.github/workflows/ci-cmake.yml b/.github/workflows/ci-cmake.yml index 585c386..d2d03af 100644 --- a/.github/workflows/ci-cmake.yml +++ b/.github/workflows/ci-cmake.yml @@ -3,13 +3,58 @@ on: push: branches: [main] jobs: - build: - runs-on: ${{ matrix.os }} + build-linux: + runs-on: ubuntu-latest + # The Benchmark package on Ubuntu 22.04 LTS is problematic whereas this + # Docker container is based on Debian bookworm and has a newer version. 
+ container: gcc:13 strategy: fail-fast: false matrix: - os: [macos-latest, ubuntu-latest, windows-latest] + build_shared_libs: [OFF, ON] steps: - - uses: actions/checkout@v2 - - run: .github/cmake.sh + - uses: actions/checkout@v3 + - name: Install CMake + run: | + apt update -y + apt install -y cmake + shell: bash + - name: Install Abseil, GoogleTest and Benchmark + run: | + apt update -y + apt install -y libabsl-dev libgtest-dev libbenchmark-dev + shell: bash + - run: .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }} + shell: bash + build-macos: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + build_shared_libs: [OFF, ON] + steps: + - uses: actions/checkout@v3 + - name: Install Abseil, GoogleTest and Benchmark + run: | + brew update + brew install abseil googletest google-benchmark + shell: bash + - run: .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }} + shell: bash + build-windows: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + build_shared_libs: [OFF, ON] + steps: + - uses: actions/checkout@v3 + - name: Install Abseil, GoogleTest and Benchmark + run: | + vcpkg update + vcpkg install abseil gtest benchmark + shell: bash + - run: | + .github/cmake.sh -D BUILD_SHARED_LIBS=${{ matrix.build_shared_libs }} \ + -D CMAKE_TOOLCHAIN_FILE=C:/vcpkg/scripts/buildsystems/vcpkg.cmake shell: bash diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 44a773b..44ac9dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ jobs: strategy: fail-fast: false matrix: - ver: [11, 14, 17, 20] + ver: [17, 20] env: CC: clang CXX: clang++ @@ -19,7 +19,12 @@ jobs: # (The other two flags are the default provided for CXXFLAGS in Makefile.) 
CXXFLAGS: -O3 -g -std=c++${{ matrix.ver }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 + - name: Install Abseil, GoogleTest and Benchmark + run: | + brew update + brew install abseil googletest google-benchmark + shell: bash - run: make && make test shell: bash build-clang: @@ -27,31 +32,42 @@ jobs: strategy: fail-fast: false matrix: - ver: [9, 10, 11, 12, 13] + ver: [15, 16, 17] env: CC: clang-${{ matrix.ver }} CXX: clang++-${{ matrix.ver }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install Clang ${{ matrix.ver }} run: | + # Avoid `Conflicts: python3-lldb-x.y` between packages. + sudo apt purge -y python3-lldb-14 wget https://apt.llvm.org/llvm.sh chmod +x ./llvm.sh sudo ./llvm.sh ${{ matrix.ver }} shell: bash + - name: Install Abseil, GoogleTest and Benchmark + run: | + sudo apt update -y + sudo apt install -y libabsl-dev libgtest-dev libbenchmark-dev + shell: bash - run: make && make test shell: bash build-gcc: runs-on: ubuntu-latest - container: gcc:${{ matrix.ver }} strategy: fail-fast: false matrix: - ver: [6, 7, 8, 9, 10, 11] + ver: [11, 12, 13] env: - CC: gcc - CXX: g++ + CC: gcc-${{ matrix.ver }} + CXX: g++-${{ matrix.ver }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 + - name: Install Abseil, GoogleTest and Benchmark + run: | + sudo apt update -y + sudo apt install -y libabsl-dev libgtest-dev libbenchmark-dev + shell: bash - run: make && make test shell: bash diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 8ede73b..860da62 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -7,8 +7,8 @@ jobs: close: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/github-script@v5 + - uses: actions/checkout@v3 + - uses: actions/github-script@v6 with: script: | const fs = require('fs'); diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 0000000..d505eaa --- /dev/null +++ 
b/.github/workflows/python.yml @@ -0,0 +1,224 @@ +name: Python +on: + workflow_dispatch: + inputs: + build: + required: true + type: number +jobs: + wheel-linux: + name: Linux ${{ matrix.os }}, ${{ matrix.arch.name }}, Python ${{ matrix.ver }} + runs-on: ${{ matrix.arch.runs-on }} + container: + image: quay.io/pypa/${{ matrix.os }}_${{ matrix.arch.python-name }} + options: --init + strategy: + fail-fast: false + matrix: + arch: + - { name: X64, python-name: x86_64, runs-on: [ubuntu-latest] } + - { name: ARM64, python-name: aarch64, runs-on: [self-hosted, linux, arm64] } + os: [manylinux2014, manylinux_2_28] + ver: ['3.8', '3.9', '3.10', '3.11', '3.12'] + env: + BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v3 + # Stash the timestamp for the commit SHA that triggered the workflow. + - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" + shell: bash + # TODO(junyer): Use `v2` whenever a new release is tagged. + - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51 + - name: Prepare Python ${{ matrix.ver }} environment + run: | + ln -sf /usr/local/bin/python${{ matrix.ver }} /usr/local/bin/python + ln -sf /usr/local/bin/python${{ matrix.ver }} /usr/local/bin/python3 + python -m pip install --upgrade pip + python -m pip install --upgrade build wheel auditwheel + python -m pip install --upgrade absl-py + shell: bash + - name: Build wheel + env: + SOURCE_DATE_EPOCH: ${{ env.timestamp }} + run: | + # TODO(junyer): Get rid of this hack whenever @rules_python no longer + # fails due to Bazel running as root. (It sounds more likely than the + # Docker container changing to be built with the `USER` instruction.) + useradd "${GITHUB_ACTOR}" + chown -R "${GITHUB_ACTOR}" .. + su -c 'python -m build --wheel' "${GITHUB_ACTOR}" + chown -R "${USER}" .. + python -m auditwheel repair --wheel-dir=. 
dist/* + shell: bash + working-directory: python + - name: Test wheel + run: | + python -m pip install google_re2-*.whl + python re2_test.py + shell: bash + working-directory: python + - uses: actions/upload-artifact@v3 + with: + name: ${{ hashFiles('python/google_re2-*.whl') }} + path: python/google_re2-*.whl + retention-days: 1 + wheel-macos: + name: macOS ${{ matrix.os }}, ${{ matrix.arch.name }}, Python ${{ matrix.ver }} + runs-on: macos-${{ matrix.os }} + strategy: + fail-fast: false + matrix: + arch: + - { name: X64, bazel-name: x86_64, python-name: x86_64 } + - { name: ARM64, bazel-name: arm64, python-name: arm64 } + os: [11, 12, 13] + ver: ['3.8', '3.9', '3.10', '3.11', '3.12'] + env: + BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BAZEL_CPU: darwin_${{ matrix.arch.bazel-name }} + PLAT_NAME: macosx-${{ matrix.os }}.0-${{ matrix.arch.python-name }} + # Stop macOS from reporting the system version as 10.x. + # Otherwise, Python refuses to install the built wheel! + SYSTEM_VERSION_COMPAT: 0 + steps: + - uses: actions/checkout@v3 + # Stash the timestamp for the commit SHA that triggered the workflow. + - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" + shell: bash + # TODO(junyer): Use `v2` whenever a new release is tagged. + - uses: bazelbuild/setup-bazelisk@6244971d4f7ba9aca943c2f3ede2bbd813fcca51 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.ver }} + - name: Prepare Python ${{ matrix.ver }} environment + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build wheel delocate + python -m pip install --upgrade absl-py + shell: bash + - name: Build wheel + env: + SOURCE_DATE_EPOCH: ${{ env.timestamp }} + run: | + python -m build --wheel + python -m delocate.cmd.delocate_wheel --wheel-dir=. 
dist/* + shell: bash + working-directory: python + - if: matrix.arch.name == runner.arch + name: Test wheel + run: | + python -m pip install google_re2-*.whl + python re2_test.py + shell: bash + working-directory: python + - uses: actions/upload-artifact@v3 + with: + name: ${{ hashFiles('python/google_re2-*.whl') }} + path: python/google_re2-*.whl + retention-days: 1 + wheel-windows: + name: Windows, ${{ matrix.arch.name }}, Python ${{ matrix.ver }} + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + arch: + - { name: X86, bazel-name: x64_x86, python-name: win32 } + - { name: X64, bazel-name: x64, python-name: win_amd64 } + ver: ['3.8', '3.9', '3.10', '3.11', '3.12'] + env: + BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BAZEL_CPU: ${{ matrix.arch.bazel-name }}_windows + PLAT_NAME: ${{ matrix.arch.python-name }} + steps: + - uses: actions/checkout@v3 + # Stash the timestamp for the commit SHA that triggered the workflow. + - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" + shell: bash + # Avoid the Chocolatey install of Bazel getting in the way; + # `bazelbuild/setup-bazelisk` doesn't work for some reason. + - run: | + choco uninstall -y bazel + choco install -y bazelisk + shell: bash + # Lowercase the architecture name for `actions/setup-python`. + - run: | + ARCHITECTURE=${{ matrix.arch.name }} + echo "architecture=${ARCHITECTURE,,}" >> "${GITHUB_ENV}" + shell: bash + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.ver }} + architecture: ${{ env.architecture }} + - name: Prepare Python ${{ matrix.ver }} environment + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build wheel delvewheel + python -m pip install --upgrade absl-py + shell: bash + - name: Build wheel + env: + SOURCE_DATE_EPOCH: ${{ env.timestamp }} + run: | + python -m build --wheel + python -m delvewheel repair --wheel-dir=. 
dist/* + shell: bash + working-directory: python + - name: Test wheel + run: | + python -m pip install google_re2-*.whl + python re2_test.py + shell: bash + working-directory: python + - uses: actions/upload-artifact@v3 + with: + name: ${{ hashFiles('python/google_re2-*.whl') }} + path: python/google_re2-*.whl + retention-days: 1 + publish: + needs: + - wheel-linux + - wheel-macos + - wheel-windows + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + # Stash the timestamp for the commit SHA that triggered the workflow. + - run: echo "timestamp=$(git log -1 --pretty=%ct)" >> "${GITHUB_ENV}" + shell: bash + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Prepare Python 3.x environment + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade build wheel + shell: bash + - if: inputs.build == 1 + name: Build source + env: + SOURCE_DATE_EPOCH: ${{ env.timestamp }} + run: | + python -m build --sdist + shell: bash + working-directory: python + - uses: actions/download-artifact@v3 + with: + path: python + - name: Set build number to ${{ inputs.build }} + env: + SOURCE_DATE_EPOCH: ${{ env.timestamp }} + run: | + mkdir -p dist + for WHL in */google_re2-*.whl; do + python -m wheel unpack "${WHL}" + python -m wheel pack --dest-dir=dist --build-number=${{ inputs.build }} google_re2-* + rm -rf google_re2-* + done + shell: bash + working-directory: python + - if: inputs.build >= 1 + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + packages_dir: python/dist @@ -1,278 +0,0 @@ -# Copyright 2009 The RE2 Authors. All Rights Reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - -# Bazel (http://bazel.io/) BUILD file for RE2. 
- -licenses(["notice"]) - -exports_files(["LICENSE"]) - -config_setting( - name = "macos", - values = {"cpu": "darwin"}, -) - -config_setting( - name = "wasm", - values = {"cpu": "wasm32"}, -) - -config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, -) - -cc_library( - name = "re2", - srcs = [ - "re2/bitmap256.h", - "re2/bitstate.cc", - "re2/compile.cc", - "re2/dfa.cc", - "re2/filtered_re2.cc", - "re2/mimics_pcre.cc", - "re2/nfa.cc", - "re2/onepass.cc", - "re2/parse.cc", - "re2/perl_groups.cc", - "re2/pod_array.h", - "re2/prefilter.cc", - "re2/prefilter.h", - "re2/prefilter_tree.cc", - "re2/prefilter_tree.h", - "re2/prog.cc", - "re2/prog.h", - "re2/re2.cc", - "re2/regexp.cc", - "re2/regexp.h", - "re2/set.cc", - "re2/simplify.cc", - "re2/sparse_array.h", - "re2/sparse_set.h", - "re2/stringpiece.cc", - "re2/tostring.cc", - "re2/unicode_casefold.cc", - "re2/unicode_casefold.h", - "re2/unicode_groups.cc", - "re2/unicode_groups.h", - "re2/walker-inl.h", - "util/logging.h", - "util/mix.h", - "util/mutex.h", - "util/rune.cc", - "util/strutil.cc", - "util/strutil.h", - "util/utf.h", - "util/util.h", - ], - hdrs = [ - "re2/filtered_re2.h", - "re2/re2.h", - "re2/set.h", - "re2/stringpiece.h", - ], - copts = select({ - ":wasm": [], - ":windows": [], - "//conditions:default": ["-pthread"], - }), - linkopts = select({ - # macOS doesn't need `-pthread' when linking and it appears that - # older versions of Clang will warn about the unused command line - # argument, so just don't pass it. 
- ":macos": [], - ":wasm": [], - ":windows": [], - "//conditions:default": ["-pthread"], - }), - visibility = ["//visibility:public"], -) - -cc_library( - name = "testing", - testonly = 1, - srcs = [ - "re2/testing/backtrack.cc", - "re2/testing/dump.cc", - "re2/testing/exhaustive_tester.cc", - "re2/testing/null_walker.cc", - "re2/testing/regexp_generator.cc", - "re2/testing/string_generator.cc", - "re2/testing/tester.cc", - "util/pcre.cc", - ], - hdrs = [ - "re2/testing/exhaustive_tester.h", - "re2/testing/regexp_generator.h", - "re2/testing/string_generator.h", - "re2/testing/tester.h", - "util/benchmark.h", - "util/flags.h", - "util/malloc_counter.h", - "util/pcre.h", - "util/test.h", - ], - deps = [":re2"], -) - -cc_library( - name = "test", - testonly = 1, - srcs = ["util/test.cc"], - deps = [":testing"], -) - -cc_test( - name = "charclass_test", - size = "small", - srcs = ["re2/testing/charclass_test.cc"], - deps = [":test"], -) - -cc_test( - name = "compile_test", - size = "small", - srcs = ["re2/testing/compile_test.cc"], - deps = [":test"], -) - -cc_test( - name = "filtered_re2_test", - size = "small", - srcs = ["re2/testing/filtered_re2_test.cc"], - deps = [":test"], -) - -cc_test( - name = "mimics_pcre_test", - size = "small", - srcs = ["re2/testing/mimics_pcre_test.cc"], - deps = [":test"], -) - -cc_test( - name = "parse_test", - size = "small", - srcs = ["re2/testing/parse_test.cc"], - deps = [":test"], -) - -cc_test( - name = "possible_match_test", - size = "small", - srcs = ["re2/testing/possible_match_test.cc"], - deps = [":test"], -) - -cc_test( - name = "re2_arg_test", - size = "small", - srcs = ["re2/testing/re2_arg_test.cc"], - deps = [":test"], -) - -cc_test( - name = "re2_test", - size = "small", - srcs = ["re2/testing/re2_test.cc"], - deps = [":test"], -) - -cc_test( - name = "regexp_test", - size = "small", - srcs = ["re2/testing/regexp_test.cc"], - deps = [":test"], -) - -cc_test( - name = "required_prefix_test", - size = "small", - srcs = 
["re2/testing/required_prefix_test.cc"], - deps = [":test"], -) - -cc_test( - name = "search_test", - size = "small", - srcs = ["re2/testing/search_test.cc"], - deps = [":test"], -) - -cc_test( - name = "set_test", - size = "small", - srcs = ["re2/testing/set_test.cc"], - deps = [":test"], -) - -cc_test( - name = "simplify_test", - size = "small", - srcs = ["re2/testing/simplify_test.cc"], - deps = [":test"], -) - -cc_test( - name = "string_generator_test", - size = "small", - srcs = ["re2/testing/string_generator_test.cc"], - deps = [":test"], -) - -cc_test( - name = "dfa_test", - size = "large", - srcs = ["re2/testing/dfa_test.cc"], - deps = [":test"], -) - -cc_test( - name = "exhaustive1_test", - size = "large", - srcs = ["re2/testing/exhaustive1_test.cc"], - deps = [":test"], -) - -cc_test( - name = "exhaustive2_test", - size = "large", - srcs = ["re2/testing/exhaustive2_test.cc"], - deps = [":test"], -) - -cc_test( - name = "exhaustive3_test", - size = "large", - srcs = ["re2/testing/exhaustive3_test.cc"], - deps = [":test"], -) - -cc_test( - name = "exhaustive_test", - size = "large", - srcs = ["re2/testing/exhaustive_test.cc"], - deps = [":test"], -) - -cc_test( - name = "random_test", - size = "large", - srcs = ["re2/testing/random_test.cc"], - deps = [":test"], -) - -cc_library( - name = "benchmark", - testonly = 1, - srcs = ["util/benchmark.cc"], - deps = [":testing"], -) - -cc_binary( - name = "regexp_benchmark", - testonly = 1, - srcs = ["re2/testing/regexp_benchmark.cc"], - deps = [":benchmark"], -) diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 0000000..6122a3f --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,394 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.build/) BUILD file for RE2. 
+ +licenses(["notice"]) + +exports_files(["LICENSE"]) + +cc_library( + name = "re2", + srcs = [ + "re2/bitmap256.cc", + "re2/bitmap256.h", + "re2/bitstate.cc", + "re2/compile.cc", + "re2/dfa.cc", + "re2/filtered_re2.cc", + "re2/mimics_pcre.cc", + "re2/nfa.cc", + "re2/onepass.cc", + "re2/parse.cc", + "re2/perl_groups.cc", + "re2/pod_array.h", + "re2/prefilter.cc", + "re2/prefilter.h", + "re2/prefilter_tree.cc", + "re2/prefilter_tree.h", + "re2/prog.cc", + "re2/prog.h", + "re2/re2.cc", + "re2/regexp.cc", + "re2/regexp.h", + "re2/set.cc", + "re2/simplify.cc", + "re2/sparse_array.h", + "re2/sparse_set.h", + "re2/tostring.cc", + "re2/unicode_casefold.cc", + "re2/unicode_casefold.h", + "re2/unicode_groups.cc", + "re2/unicode_groups.h", + "re2/walker-inl.h", + "util/logging.h", + "util/rune.cc", + "util/strutil.cc", + "util/strutil.h", + "util/utf.h", + ], + hdrs = [ + "re2/filtered_re2.h", + "re2/re2.h", + "re2/set.h", + "re2/stringpiece.h", + ], + copts = select({ + "@platforms//os:wasi": [], + "@platforms//os:windows": [], + "//conditions:default": ["-pthread"], + }), + linkopts = select({ + # macOS doesn't need `-pthread' when linking and it appears that + # older versions of Clang will warn about the unused command line + # argument, so just don't pass it. 
+ "@platforms//os:macos": [], + "@platforms//os:wasi": [], + "@platforms//os:windows": [], + "//conditions:default": ["-pthread"], + }), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/base", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:fixed_array", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "testing", + testonly = 1, + srcs = [ + "re2/testing/backtrack.cc", + "re2/testing/dump.cc", + "re2/testing/exhaustive_tester.cc", + "re2/testing/null_walker.cc", + "re2/testing/regexp_generator.cc", + "re2/testing/string_generator.cc", + "re2/testing/tester.cc", + "util/pcre.cc", + ], + hdrs = [ + "re2/testing/exhaustive_tester.h", + "re2/testing/regexp_generator.h", + "re2/testing/string_generator.h", + "re2/testing/tester.h", + "util/malloc_counter.h", + "util/pcre.h", + + # Exposed for testing only. 
+ "re2/bitmap256.h", + "re2/pod_array.h", + "re2/prefilter.h", + "re2/prefilter_tree.h", + "re2/prog.h", + "re2/regexp.h", + "re2/sparse_array.h", + "re2/sparse_set.h", + "re2/unicode_casefold.h", + "re2/unicode_groups.h", + "re2/walker-inl.h", + "util/logging.h", + "util/strutil.h", + "util/utf.h", + ], + visibility = [":__subpackages__"], + deps = [ + ":re2", + "@com_google_absl//absl/base", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@googletest//:gtest", + ], +) + +cc_test( + name = "charclass_test", + size = "small", + srcs = ["re2/testing/charclass_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings:str_format", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "compile_test", + size = "small", + srcs = ["re2/testing/compile_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "filtered_re2_test", + size = "small", + srcs = ["re2/testing/filtered_re2_test.cc"], + deps = [ + ":re2", + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "mimics_pcre_test", + size = "small", + srcs = ["re2/testing/mimics_pcre_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "parse_test", + size = "small", + srcs = ["re2/testing/parse_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "possible_match_test", + size = "small", + srcs = ["re2/testing/possible_match_test.cc"], + deps = [ + ":re2", + ":testing", + 
"@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "re2_arg_test", + size = "small", + srcs = ["re2/testing/re2_arg_test.cc"], + deps = [ + ":re2", + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "re2_test", + size = "small", + srcs = ["re2/testing/re2_test.cc"], + deps = [ + ":re2", + ":testing", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings:str_format", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "regexp_test", + size = "small", + srcs = ["re2/testing/regexp_test.cc"], + deps = [ + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "required_prefix_test", + size = "small", + srcs = ["re2/testing/required_prefix_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "search_test", + size = "small", + srcs = ["re2/testing/search_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "set_test", + size = "small", + srcs = ["re2/testing/set_test.cc"], + deps = [ + ":re2", + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "simplify_test", + size = "small", + srcs = ["re2/testing/simplify_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/base:core_headers", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "string_generator_test", + size = "small", + srcs = ["re2/testing/string_generator_test.cc"], + deps = [ + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "dfa_test", + size = "large", + 
srcs = ["re2/testing/dfa_test.cc"], + deps = [ + ":re2", + ":testing", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/strings:str_format", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "exhaustive1_test", + size = "large", + srcs = ["re2/testing/exhaustive1_test.cc"], + deps = [ + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "exhaustive2_test", + size = "large", + srcs = ["re2/testing/exhaustive2_test.cc"], + deps = [ + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "exhaustive3_test", + size = "large", + srcs = ["re2/testing/exhaustive3_test.cc"], + deps = [ + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "exhaustive_test", + size = "large", + srcs = ["re2/testing/exhaustive_test.cc"], + deps = [ + ":testing", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_test( + name = "random_test", + size = "large", + srcs = ["re2/testing/random_test.cc"], + deps = [ + ":testing", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/strings:str_format", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) + +cc_binary( + name = "regexp_benchmark", + testonly = 1, + srcs = ["re2/testing/regexp_benchmark.cc"], + deps = [ + ":re2", + ":testing", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@google_benchmark//:benchmark_main", + ], +) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62d4995..bdac5af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,38 +2,43 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -# Old enough to support Ubuntu Xenial. 
-cmake_minimum_required(VERSION 3.5.1) +# https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md +cmake_minimum_required(VERSION 3.13) project(RE2 CXX) include(CMakePackageConfigHelpers) include(CTest) include(GNUInstallDirs) -if(NOT CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) - set(CMAKE_CXX_STANDARD_REQUIRED ON) -endif() - option(BUILD_SHARED_LIBS "build shared libraries" OFF) -option(USEPCRE "use PCRE in tests and benchmarks" OFF) +option(RE2_USE_ICU "build against ICU for full Unicode properties support" OFF) + +# For historical reasons, this is just "USEPCRE", not "RE2_USE_PCRE". +option(USEPCRE "build against PCRE for testing and benchmarking" OFF) + +# See https://groups.google.com/g/re2-dev/c/P6_NM0YIWvA for details. +# This has no effect unless RE2 is being built for an Apple platform +# such as macOS or iOS. +option(RE2_BUILD_FRAMEWORK "build RE2 as a framework" OFF) # CMake seems to have no way to enable/disable testing per subproject, # so we provide an option similar to BUILD_TESTING, but just for RE2. -option(RE2_BUILD_TESTING "enable testing for RE2" ON) +option(RE2_BUILD_TESTING "enable testing for RE2" OFF) + +# The pkg-config Requires: field. +set(REQUIRES) # ABI version # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html -set(SONAME 9) +set(SONAME 11) set(EXTRA_TARGET_LINK_LIBRARIES) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - if(MSVC_VERSION LESS 1900) - message(FATAL_ERROR "you need Visual Studio 2015 or later") + if(MSVC_VERSION LESS 1920) + message(FATAL_ERROR "you need Visual Studio 2019 or later") endif() if(BUILD_SHARED_LIBS) - # See http://www.kitware.com/blog/home/post/939 for details. 
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() # CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX, @@ -47,17 +52,53 @@ endif() if(WIN32) add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX) add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS) -elseif(UNIX) +endif() + +if(UNIX) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) endif() +set(ABSL_DEPS + absl_base + absl_core_headers + absl_fixed_array + absl_flags + absl_flat_hash_map + absl_flat_hash_set + absl_inlined_vector + absl_optional + absl_span + absl_str_format + absl_strings + absl_synchronization + ) + +# If a top-level project has called add_subdirectory(abseil-cpp) already (possibly +# indirectly), let that take precedence over any copy of Abseil that might have +# been installed on the system. And likewise for ICU, GoogleTest and Benchmark. +if(NOT TARGET absl::base) + find_package(absl REQUIRED) +endif() +list(APPEND REQUIRES ${ABSL_DEPS}) + +if(RE2_USE_ICU) + if(NOT TARGET ICU::uc) + find_package(ICU REQUIRED COMPONENTS uc) + endif() + add_definitions(-DRE2_USE_ICU) + list(APPEND REQUIRES icu-uc) +endif() + if(USEPCRE) add_definitions(-DUSEPCRE) list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre) endif() +list(JOIN REQUIRES " " REQUIRES) + set(RE2_SOURCES + re2/bitmap256.cc re2/bitstate.cc re2/compile.cc re2/dfa.cc @@ -74,7 +115,6 @@ set(RE2_SOURCES re2/regexp.cc re2/set.cc re2/simplify.cc - re2/stringpiece.cc re2/tostring.cc re2/unicode_casefold.cc re2/unicode_groups.cc @@ -82,16 +122,50 @@ set(RE2_SOURCES util/strutil.cc ) +set(RE2_HEADERS + re2/filtered_re2.h + re2/re2.h + re2/set.h + re2/stringpiece.h + ) + add_library(re2 ${RE2_SOURCES}) +target_compile_features(re2 PUBLIC cxx_std_14) target_include_directories(re2 PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>) +# CMake gives "set_target_properties called with incorrect number of arguments." +# errors if we don't quote ${RE2_HEADERS}, so quote it despite prevailing style. 
+set_target_properties(re2 PROPERTIES PUBLIC_HEADER "${RE2_HEADERS}") set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0) add_library(re2::re2 ALIAS re2) +if(APPLE AND RE2_BUILD_FRAMEWORK) + set_target_properties(re2 PROPERTIES + FRAMEWORK TRUE + FRAMEWORK_VERSION A + MACOSX_FRAMEWORK_IDENTIFIER com.googlesource.code.re2) +endif() + if(UNIX) target_link_libraries(re2 PUBLIC Threads::Threads) endif() +foreach(dep ${ABSL_DEPS}) + string(REGEX REPLACE "^absl_" "absl::" dep ${dep}) + target_link_libraries(re2 PUBLIC ${dep}) +endforeach() + +if(RE2_USE_ICU) + target_link_libraries(re2 PUBLIC ICU::uc) +endif() + if(RE2_BUILD_TESTING) + if(NOT TARGET GTest::gtest) + find_package(GTest REQUIRED) + endif() + if(NOT TARGET benchmark::benchmark) + find_package(benchmark REQUIRED) + endif() + set(TESTING_SOURCES re2/testing/backtrack.cc re2/testing/dump.cc @@ -103,8 +177,12 @@ if(RE2_BUILD_TESTING) util/pcre.cc ) - add_library(testing STATIC ${TESTING_SOURCES}) - target_link_libraries(testing PUBLIC re2) + add_library(testing ${TESTING_SOURCES}) + if(BUILD_SHARED_LIBS AND WIN32) + target_compile_definitions(testing PRIVATE -DRE2_BUILD_TESTING_DLL) + endif() + target_compile_features(testing PUBLIC cxx_std_14) + target_link_libraries(testing PUBLIC re2 GTest::gtest) set(TEST_TARGETS charclass_test @@ -135,43 +213,51 @@ if(RE2_BUILD_TESTING) ) foreach(target ${TEST_TARGETS}) - add_executable(${target} re2/testing/${target}.cc util/test.cc) - target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES}) + add_executable(${target} re2/testing/${target}.cc) + if(BUILD_SHARED_LIBS AND WIN32) + target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL) + endif() + target_compile_features(${target} PUBLIC cxx_std_14) + target_link_libraries(${target} PUBLIC testing GTest::gtest_main ${EXTRA_TARGET_LINK_LIBRARIES}) add_test(NAME ${target} COMMAND ${target}) endforeach() foreach(target ${BENCHMARK_TARGETS}) - add_executable(${target} 
re2/testing/${target}.cc util/benchmark.cc) - target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES}) + add_executable(${target} re2/testing/${target}.cc) + if(BUILD_SHARED_LIBS AND WIN32) + target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL) + endif() + target_compile_features(${target} PUBLIC cxx_std_14) + target_link_libraries(${target} PUBLIC testing benchmark::benchmark_main ${EXTRA_TARGET_LINK_LIBRARIES}) endforeach() endif() -set(RE2_HEADERS - re2/filtered_re2.h - re2/re2.h - re2/set.h - re2/stringpiece.h - ) - -install(FILES ${RE2_HEADERS} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2) -install(TARGETS re2 EXPORT re2Targets +install(TARGETS re2 + EXPORT re2Targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + FRAMEWORK DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2 INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install(EXPORT re2Targets - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 NAMESPACE re2::) + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 + NAMESPACE re2::) configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/re2Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 - ) + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake VERSION ${SONAME}.0.0 - COMPATIBILITY SameMajorVersion - ) + COMPATIBILITY SameMajorVersion) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake ${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/re2.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/re2.pc + @ONLY) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2.pc + DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) diff --git a/MODULE.bazel b/MODULE.bazel new file mode 
100644 index 0000000..87a5576 --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,27 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.build/) MODULE file for RE2. + +module( + name = "re2", + version = "2023-11-01", + compatibility_level = 1, +) + +bazel_dep(name = "platforms", version = "0.0.8") +bazel_dep(name = "rules_cc", version = "0.0.9") +bazel_dep(name = "abseil-cpp", version = "20230802.0", repo_name = "com_google_absl") +bazel_dep(name = "rules_python", version = "0.26.0") +bazel_dep(name = "pybind11_bazel", version = "2.11.1") + +python_configure = use_extension("@pybind11_bazel//:python_configure.bzl", "extension") +python_configure.toolchain(python_version = "3") # ignored when non-root module +use_repo(python_configure, "local_config_python", "pybind11") + +# These dependencies will be ignored when the `re2` module is not +# the root module (or when `--ignore_dev_dependency` is enabled). +bazel_dep(name = "google_benchmark", version = "1.8.3", dev_dependency = True) +bazel_dep(name = "googletest", version = "1.14.0.bcr.1", dev_dependency = True) +bazel_dep(name = "abseil-py", version = "1.4.0", dev_dependency = True) @@ -2,12 +2,34 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. +# Build against Abseil. +ABSL_DEPS=\ + absl_base\ + absl_core_headers\ + absl_fixed_array\ + absl_flags\ + absl_flat_hash_map\ + absl_flat_hash_set\ + absl_inlined_vector\ + absl_optional\ + absl_span\ + absl_str_format\ + absl_strings\ + absl_synchronization\ + +PKG_CONFIG?=pkg-config +CCABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --cflags) +# GCC barfs on `-Wl` whereas Clang doesn't mind, but it's unclear what +# causes it to manifest on Ubuntu 22.04 LTS, so filter it out for now. +# Similar is needed for `static-testinstall` and `shared-testinstall`. 
+LDABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --libs | sed -e 's/-Wl / /g') + # To build against ICU for full Unicode properties support, # uncomment the next two lines: -# CCICU=$(shell pkg-config icu-uc --cflags) -DRE2_USE_ICU -# LDICU=$(shell pkg-config icu-uc --libs) +# CCICU=$(shell $(PKG_CONFIG) icu-uc --cflags) -DRE2_USE_ICU +# LDICU=$(shell $(PKG_CONFIG) icu-uc --libs) -# To build against PCRE for testing or benchmarking, +# To build against PCRE for testing and benchmarking, # uncomment the next two lines: # CCPCRE=-I/usr/local/include -DUSEPCRE # LDPCRE=-L/usr/local/lib -lpcre @@ -17,8 +39,8 @@ CXX?=g++ CXXFLAGS?=-O3 -g LDFLAGS?= # required -RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE) -RE2_LDFLAGS?=-pthread $(LDICU) $(LDPCRE) +RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCABSL) $(CCICU) $(CCPCRE) +RE2_LDFLAGS?=-pthread $(LDABSL) $(LDICU) $(LDPCRE) AR?=ar ARFLAGS?=rsc NM?=nm @@ -42,9 +64,15 @@ else SED_INPLACE=sed -i endif +# The pkg-config Requires: field. 
+REQUIRES=$(ABSL_DEPS) +ifdef LDICU +REQUIRES+=icu-uc +endif + # ABI version # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html -SONAME=9 +SONAME=11 # To rebuild the Tables generated by Perl and Python scripts (requires Internet # access for Unicode data), uncomment the following line: @@ -55,17 +83,17 @@ ifeq ($(shell uname),Darwin) SOEXT=dylib SOEXTVER=$(SONAME).$(SOEXT) SOEXTVER00=$(SONAME).0.0.$(SOEXT) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin else ifeq ($(shell uname),SunOS) SOEXT=so SOEXTVER=$(SOEXT).$(SONAME) SOEXTVER00=$(SOEXT).$(SONAME).0.0 -MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER) $(RE2_LDFLAGS) $(LDFLAGS) +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER) else SOEXT=so SOEXTVER=$(SOEXT).$(SONAME) SOEXTVER00=$(SOEXT).$(SONAME).0.0 -MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS) +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols endif .PHONY: all @@ -78,17 +106,11 @@ INSTALL_HFILES=\ re2/stringpiece.h\ HFILES=\ - util/benchmark.h\ - util/flags.h\ util/logging.h\ util/malloc_counter.h\ - util/mix.h\ - util/mutex.h\ util/pcre.h\ util/strutil.h\ - util/test.h\ util/utf.h\ - util/util.h\ re2/bitmap256.h\ re2/filtered_re2.h\ re2/pod_array.h\ @@ -112,6 +134,7 @@ HFILES=\ OFILES=\ obj/util/rune.o\ obj/util/strutil.o\ + obj/re2/bitmap256.o\ obj/re2/bitstate.o\ obj/re2/compile.o\ obj/re2/dfa.o\ @@ -128,7 +151,6 @@ OFILES=\ obj/re2/regexp.o\ obj/re2/set.o\ obj/re2/simplify.o\ - obj/re2/stringpiece.o\ 
obj/re2/tostring.o\ obj/re2/unicode_casefold.o\ obj/re2/unicode_groups.o\ @@ -205,38 +227,32 @@ obj/dbg/libre2.a: $(DOFILES) .PRECIOUS: obj/so/libre2.$(SOEXT) obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin @mkdir -p obj/so - $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) + $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) $(RE2_LDFLAGS) $(LDFLAGS) ln -sf libre2.$(SOEXTVER) $@ .PRECIOUS: obj/dbg/test/% -obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o +obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) @mkdir -p obj/dbg/test - $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) -lgtest -lgtest_main obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) .PRECIOUS: obj/test/% -obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o +obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) @mkdir -p obj/test - $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -lgtest -lgtest_main obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) # Test the shared lib, falling back to the static lib for private symbols .PRECIOUS: obj/so/test/% -obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o +obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) @mkdir -p obj/so/test - $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -lgtest -lgtest_main -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) -# Filter out dump.o because testing::TempDir() isn't available for it. 
-obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o +obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) @mkdir -p obj/test - $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(filter-out obj/re2/testing/dump.o, $(TESTOFILES)) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) - -# re2_fuzzer is a target for fuzzers like libFuzzer and AFL. This fake fuzzing -# is simply a way to check that the target builds and then to run it against a -# fixed set of inputs. To perform real fuzzing, refer to the documentation for -# libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/). -obj/test/re2_fuzzer: CXXFLAGS:=-I./re2/fuzzing/compiler-rt/include $(CXXFLAGS) -obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o + $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) -lgtest -lbenchmark -lbenchmark_main obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + +obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o @mkdir -p obj/test - $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) ifdef REBUILD_TABLES .PRECIOUS: re2/perl_groups.cc @@ -316,9 +332,11 @@ shared-install: obj/so/libre2.$(SOEXT) common-install common-install: mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 - $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc - $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc - $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(INSTALL_DATA) re2.pc.in $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@CMAKE_INSTALL_FULL_INCLUDEDIR@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e 
"s#@CMAKE_INSTALL_FULL_LIBDIR@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@REQUIRES@#$(REQUIRES)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@SONAME@#$(SONAME)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc .PHONY: testinstall testinstall: static-testinstall shared-testinstall @@ -327,27 +345,29 @@ testinstall: static-testinstall shared-testinstall @echo .PHONY: static-testinstall -static-testinstall: CXXFLAGS:=-pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) -static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) static-testinstall: - @mkdir -p obj - @cp testinstall.cc obj ifeq ($(shell uname),Darwin) @echo Skipping test for libre2.a on Darwin. else ifeq ($(shell uname),SunOS) @echo Skipping test for libre2.a on SunOS. else - (cd obj && $(CXX) testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS)) + @mkdir -p obj + @cp testinstall.cc obj/static-testinstall.cc + (cd obj && export PKG_CONFIG_PATH=$(DESTDIR)$(libdir)/pkgconfig; \ + $(CXX) static-testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS) \ + $$($(PKG_CONFIG) re2 --cflags) \ + $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g' | sed -e 's/-lre2/-l:libre2.a/')) obj/static-testinstall endif .PHONY: shared-testinstall -shared-testinstall: CXXFLAGS:=-pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) -shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) shared-testinstall: @mkdir -p obj - @cp testinstall.cc obj - (cd obj && $(CXX) testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS)) + @cp testinstall.cc obj/shared-testinstall.cc + (cd obj && export PKG_CONFIG_PATH=$(DESTDIR)$(libdir)/pkgconfig; \ + $(CXX) shared-testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS) \ + $$($(PKG_CONFIG) re2 --cflags) \ + $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g')) ifeq ($(shell uname),Darwin) DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/shared-testinstall else @@ -10,6 +10,11 
@@ make test make install make testinstall +Building RE2 requires Abseil (https://github.com/abseil/abseil-cpp) +to be installed on your system. Building the testing for RE2 requires +GoogleTest (https://github.com/google/googletest) and Benchmark +(https://github.com/google/benchmark) to be installed as well. + There is a fair amount of documentation (including code snippets) in the re2.h header file. diff --git a/WORKSPACE b/WORKSPACE.bazel index b35619c..fa514a8 100644 --- a/WORKSPACE +++ b/WORKSPACE.bazel @@ -2,6 +2,6 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -# Bazel (http://bazel.io/) WORKSPACE file for RE2. +# Bazel (http://bazel.build/) WORKSPACE file for RE2. workspace(name = "com_googlesource_code_re2") diff --git a/WORKSPACE.bzlmod b/WORKSPACE.bzlmod new file mode 100644 index 0000000..fa514a8 --- /dev/null +++ b/WORKSPACE.bzlmod @@ -0,0 +1,7 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.build/) WORKSPACE file for RE2. + +workspace(name = "com_googlesource_code_re2") diff --git a/app/BUILD.bazel b/app/BUILD.bazel new file mode 100644 index 0000000..cb510af --- /dev/null +++ b/app/BUILD.bazel @@ -0,0 +1,24 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.build/) BUILD file for RE2 app. 
+ +cc_binary( + name = "_re2.js", + testonly = 1, + srcs = ["_re2.cc"], + linkopts = [ + "--bind", + "-sENVIRONMENT=web", + "-sSINGLE_FILE=1", + "-sMODULARIZE=1", + "-sEXPORT_ES6=1", + "-sEXPORT_NAME=loadModule", + "-sUSE_PTHREADS=0", + ], + deps = [ + "//:re2", + "//:testing", + ], +) diff --git a/app/_re2.cc b/app/_re2.cc new file mode 100644 index 0000000..a63313e --- /dev/null +++ b/app/_re2.cc @@ -0,0 +1,94 @@ +// Copyright 2022 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <memory> +#include <string> + +#include <emscripten/bind.h> +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +namespace re2_app { + +struct Info { + std::string pattern; + std::string error; + std::string prefix; + bool prefix_foldcase = false; + std::string accel_prefix; + bool accel_prefix_foldcase = false; + int num_captures; + bool is_one_pass; + bool can_bit_state; + std::string bytecode; + std::string bytemap; +}; + +Info GetInfo(const std::string& pattern) { + Info info; + info.pattern = pattern; + + RE2::Options options; + re2::RegexpStatus status; + re2::Regexp* regexp = re2::Regexp::Parse( + pattern, static_cast<re2::Regexp::ParseFlags>(options.ParseFlags()), + &status); + if (regexp == nullptr) { + info.error = "failed to parse pattern: " + status.Text(); + return info; + } + + std::string prefix; + bool prefix_foldcase; + re2::Regexp* suffix; + if (regexp->RequiredPrefix(&prefix, &prefix_foldcase, &suffix)) { + info.prefix = prefix; + info.prefix_foldcase = prefix_foldcase; + } else { + suffix = regexp->Incref(); + } + + std::unique_ptr<re2::Prog> prog(suffix->CompileToProg(options.max_mem())); + if (prog == nullptr) { + info.error = "failed to compile forward Prog"; + suffix->Decref(); + regexp->Decref(); + return info; + } + + if (regexp->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) { + info.accel_prefix = prefix; + 
info.accel_prefix_foldcase = prefix_foldcase; + } + + info.num_captures = suffix->NumCaptures(); + info.is_one_pass = prog->IsOnePass(); + info.can_bit_state = prog->CanBitState(); + info.bytecode = prog->Dump(); + info.bytemap = prog->DumpByteMap(); + + suffix->Decref(); + regexp->Decref(); + return info; +} + +EMSCRIPTEN_BINDINGS(_re2) { + emscripten::value_object<Info>("Info") + .field("pattern", &Info::pattern) + .field("error", &Info::error) + .field("prefix", &Info::prefix) + .field("prefix_foldcase", &Info::prefix_foldcase) + .field("accel_prefix", &Info::accel_prefix) + .field("accel_prefix_foldcase", &Info::accel_prefix_foldcase) + .field("num_captures", &Info::num_captures) + .field("is_one_pass", &Info::is_one_pass) + .field("can_bit_state", &Info::can_bit_state) + .field("bytecode", &Info::bytecode) + .field("bytemap", &Info::bytemap); + + emscripten::function("getInfo", &GetInfo); +} + +} // namespace re2_app diff --git a/app/_re2.d.ts b/app/_re2.d.ts new file mode 100644 index 0000000..dff5e49 --- /dev/null +++ b/app/_re2.d.ts @@ -0,0 +1,23 @@ +// Copyright 2022 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +export type Info = { + pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string, + error: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string, + prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string, + prefix_foldcase: boolean, + accel_prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string, + accel_prefix_foldcase: boolean, + num_captures: number, + is_one_pass: boolean, + can_bit_state: boolean, + bytecode: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string, + bytemap: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string, +}; + +export interface MainModule { + getInfo(pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string): Info; +} + +export default function loadModule(): Promise<MainModule>; diff --git a/app/app.ts b/app/app.ts new file mode 100644 index 0000000..4b9e7bd --- /dev/null +++ b/app/app.ts @@ -0,0 +1,111 @@ +// Copyright 2022 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +import {css, html, LitElement, render} from 'lit'; +import {customElement} from 'lit/decorators.js'; + +import /*default*/ loadModule from './_re2'; +import {Info, MainModule} from './_re2'; + +var _re2: MainModule; +loadModule().then((module: MainModule) => { + _re2 = module; + render(html`<title>re2-dev</title><re2-dev></re2-dev>`, document.body); +}); + +@customElement('re2-dev') +export class RE2Dev extends LitElement { + private _pattern: string = ''; + private _info: Info|null = null; + + constructor() { + super(); + this._pattern = decodeURIComponent(window.location.hash.slice(1)); + this._info = this._pattern ? _re2.getInfo(this._pattern) : null; + this.requestUpdate(); + } + + private _onChange = (e: Event) => { + this._pattern = (e.target as HTMLInputElement).value; + this._info = this._pattern ? 
_re2.getInfo(this._pattern) : null; + this.requestUpdate(); + window.location.hash = '#' + encodeURIComponent(this._pattern); + }; + + static override styles = css` +.code { + font-family: monospace; + white-space: pre-line; +} +`; + + override render() { + var fragments = []; + fragments.push(html` +<div> + <input type="text" size="48" @change=${this._onChange} .value=${this._pattern}> +</div> +`); + + if (this._info === null) { + return html`${fragments}`; + } + + if (this._info.error) { + fragments.push(html` +<br> +<div> + error: + <span class="code">${this._info.error}</span> +</div> +`); + return html`${fragments}`; + } + + fragments.push(html` +<br> +<div> + pattern: + <span class="code">${this._info.pattern}</span> + <br> + prefix: + <span class="code">${this._info.prefix}</span> + · + _foldcase: + <span class="code">${this._info.prefix_foldcase}</span> + <br> + accel_prefix: + <span class="code">${this._info.accel_prefix}</span> + · + _foldcase: + <span class="code">${this._info.accel_prefix_foldcase}</span> + <br> + num_captures: + <span class="code">${this._info.num_captures}</span> + <br> + is_one_pass: + <span class="code">${this._info.is_one_pass}</span> + <br> + can_bit_state: + <span class="code">${this._info.can_bit_state}</span> + <br> + <br> + bytecode: + <br> + <span class="code">${this._info.bytecode}</span> + <br> + bytemap: + <br> + <span class="code">${this._info.bytemap}</span> +</div> +`); + return html`${fragments}`; + } +} + +declare global { + interface HTMLElementTagNameMap { + 're2-dev': RE2Dev; + } +} diff --git a/app/build.sh b/app/build.sh new file mode 100755 index 0000000..09d931f --- /dev/null +++ b/app/build.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -eux + +SRCDIR=$(readlink --canonicalize $(dirname $0)) +DSTDIR=$(mktemp --directory --tmpdir $(basename $0).XXXXXXXXXX) + +BAZEL=/tmp/bazel +BAZELISK_RELEASE=v1.17.0 + +if [[ ${UID} -ne 0 ]]; then + if [[ -d deploy ]]; then + echo -e '\033[1;31m' "** The ${PWD}/deploy directory exists! 
Refusing to clobber it! **" '\033[0m' + exit 1 + fi + mkdir deploy + sudo docker run -i -t --pull always --rm -v ${SRCDIR}/..:/src -v ${PWD}:/dst emscripten/emsdk /src/app/$(basename $0) + ls -l deploy +else + wget -O ${BAZEL} https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_RELEASE}/bazelisk-linux-amd64 + chmod +x ${BAZEL} + + cd ${SRCDIR} + # Emscripten doesn't support `-fstack-protector`. + AR=emar CC=emcc \ + ${BAZEL} build --compilation_mode=opt \ + --copt=-fno-stack-protector \ + -- :all + cp ../bazel-bin/app/_re2.js ${DSTDIR} + # Clean up the sundry Bazel output directories. + ${BAZEL} clean --expunge + cp app.ts index.html _re2.d.ts ${DSTDIR} + cp package.json rollup.config.js tsconfig.json ${DSTDIR} + + cd ${DSTDIR} + npm install + npx tsc + npx rollup -c rollup.config.js -d deploy + mv deploy/* /dst/deploy +fi + +cd ${SRCDIR} +rm -rf ${DSTDIR} + +exit 0 diff --git a/app/index.html b/app/index.html new file mode 100644 index 0000000..d229e56 --- /dev/null +++ b/app/index.html @@ -0,0 +1,5 @@ +<!DOCTYPE html> +<meta charset="utf-8"> +<meta name="viewport" content="width=device-width, initial-scale=1"> +<style>:root { color-scheme: dark light; }</style> +<script type="module" src="app.js"></script> diff --git a/app/package.json b/app/package.json new file mode 100644 index 0000000..e702789 --- /dev/null +++ b/app/package.json @@ -0,0 +1,14 @@ +{ + "dependencies": { + "lit": "*" + }, + "devDependencies": { + "@rollup/plugin-node-resolve": "*", + "@rollup/plugin-terser": "*", + "@web/rollup-plugin-html": "*", + "@web/rollup-plugin-import-meta-assets": "*", + "rollup": "~2", + "tslib": "*", + "typescript": "*" + } +} diff --git a/app/rollup.config.js b/app/rollup.config.js new file mode 100644 index 0000000..3a20e66 --- /dev/null +++ b/app/rollup.config.js @@ -0,0 +1,28 @@ +// Copyright 2022 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +import nodeResolve from '@rollup/plugin-node-resolve'; +import terser from '@rollup/plugin-terser'; +import html from '@web/rollup-plugin-html'; +import {importMetaAssets} from '@web/rollup-plugin-import-meta-assets'; + +export default { + input: 'index.html', + output: { + entryFileNames: '[hash].js', + chunkFileNames: '[hash].js', + assetFileNames: '[hash][extname]', + format: 'es', + }, + preserveEntrySignatures: false, + plugins: + [ + html({ + minify: true, + }), + nodeResolve(), + terser(), + importMetaAssets(), + ], +}; diff --git a/app/tsconfig.json b/app/tsconfig.json new file mode 100644 index 0000000..86cc302 --- /dev/null +++ b/app/tsconfig.json @@ -0,0 +1,17 @@ +{ + "compilerOptions": { + "target": "esnext", + "module": "esnext", + "moduleResolution": "node", + "noEmitOnError": true, + "lib": ["esnext", "dom"], + "strict": true, + "esModuleInterop": false, + "allowSyntheticDefaultImports": true, + "experimentalDecorators": true, + "importHelpers": true, + "sourceMap": true, + "inlineSources": true, + "incremental": true + } +} diff --git a/doc/README.xkcd b/doc/README.xkcd deleted file mode 100644 index b50a579..0000000 --- a/doc/README.xkcd +++ /dev/null @@ -1 +0,0 @@ -xkcd.png is a cropped version of http://xkcd.com/208/ diff --git a/doc/syntax.html b/doc/syntax.html index f0e0138..6cbda14 100644 --- a/doc/syntax.html +++ b/doc/syntax.html @@ -62,7 +62,7 @@ <tr><td colspan=2><b>Grouping:</b></td></tr> <tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr> <tr><td><code>(?P<name>re)</code></td><td>named & numbered capturing group (submatch)</td></tr> -<tr><td><code><font color=#808080>(?<name>re)</font></code></td><td>named & numbered capturing group (submatch) </td></tr> +<tr><td><code>(?<name>re)</code></td><td>named & numbered capturing group (submatch)</td></tr> <tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named & numbered capturing group (submatch) </td></tr> 
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr> <tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr> @@ -303,6 +303,7 @@ <tr><td colspan=2>Kaithi</td></tr> <tr><td colspan=2>Kannada</td></tr> <tr><td colspan=2>Katakana</td></tr> +<tr><td colspan=2>Kawi</td></tr> <tr><td colspan=2>Kayah_Li</td></tr> <tr><td colspan=2>Kharoshthi</td></tr> <tr><td colspan=2>Khitan_Small_Script</td></tr> @@ -337,6 +338,7 @@ <tr><td colspan=2>Multani</td></tr> <tr><td colspan=2>Myanmar</td></tr> <tr><td colspan=2>Nabataean</td></tr> +<tr><td colspan=2>Nag_Mundari</td></tr> <tr><td colspan=2>Nandinagari</td></tr> <tr><td colspan=2>New_Tai_Lue</td></tr> <tr><td colspan=2>Newa</td></tr> diff --git a/doc/syntax.txt b/doc/syntax.txt index c12a482..6070efd 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -51,7 +51,7 @@ x{n}+ exactly «n» «x», possessive NOT SUPPORTED Grouping: (re) numbered capturing group (submatch) (?P<name>re) named & numbered capturing group (submatch) -(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED +(?<name>re) named & numbered capturing group (submatch) (?'name're) named & numbered capturing group (submatch) NOT SUPPORTED (?:re) non-capturing group (?flags) set flags within current group; non-capturing @@ -292,6 +292,7 @@ Javanese Kaithi Kannada Katakana +Kawi Kayah_Li Kharoshthi Khitan_Small_Script @@ -326,6 +327,7 @@ Mro Multani Myanmar Nabataean +Nag_Mundari Nandinagari New_Tai_Lue Newa diff --git a/doc/xkcd.png b/doc/xkcd.png Binary files differdeleted file mode 100644 index 6249e8e..0000000 --- a/doc/xkcd.png +++ /dev/null diff --git a/libre2.symbols b/libre2.symbols index 93b71b4..0cab3d9 100644 --- a/libre2.symbols +++ b/libre2.symbols @@ -3,9 +3,6 @@ # re2::RE2* _ZN3re23RE2*; _ZNK3re23RE2*; - # re2::StringPiece* - _ZN3re211StringPiece*; - _ZNK3re211StringPiece*; # re2::operator<<* _ZN3re2ls*; # re2::FilteredRE2* diff --git a/libre2.symbols.darwin b/libre2.symbols.darwin index 
41ac96f..754f45c 100644 --- a/libre2.symbols.darwin +++ b/libre2.symbols.darwin @@ -2,9 +2,6 @@ # re2::RE2* __ZN3re23RE2* __ZNK3re23RE2* -# re2::StringPiece* -__ZN3re211StringPiece* -__ZNK3re211StringPiece* # re2::operator<<* __ZN3re2ls* # re2::FilteredRE2* diff --git a/python/BUILD.bazel b/python/BUILD.bazel new file mode 100644 index 0000000..a05fb6e --- /dev/null +++ b/python/BUILD.bazel @@ -0,0 +1,36 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Bazel (http://bazel.build/) BUILD file for RE2 Python. + +load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") +load("@rules_python//python:defs.bzl", "py_library", "py_test") + +pybind_extension( + name = "_re2", + srcs = ["_re2.cc"], + deps = [ + "//:re2", + "@com_google_absl//absl/strings", + ], +) + +py_library( + name = "re2", + srcs = ["re2.py"], + data = [":_re2.so"], + imports = ["."], + visibility = ["//visibility:public"], +) + +py_test( + name = "re2_test", + size = "small", + srcs = ["re2_test.py"], + deps = [ + ":re2", + "@abseil-py//absl/testing:absltest", + "@abseil-py//absl/testing:parameterized", + ], +) diff --git a/python/LICENSE b/python/LICENSE new file mode 120000 index 0000000..ea5b606 --- /dev/null +++ b/python/LICENSE @@ -0,0 +1 @@ +../LICENSE
\ No newline at end of file diff --git a/python/README b/python/README new file mode 100644 index 0000000..782378f --- /dev/null +++ b/python/README @@ -0,0 +1 @@ +Building requires Python 3 and pybind11 to be installed on your system. diff --git a/python/_re2.cc b/python/_re2.cc new file mode 100644 index 0000000..8564f8a --- /dev/null +++ b/python/_re2.cc @@ -0,0 +1,338 @@ +// Copyright 2019 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <memory> +#include <string> +#include <tuple> +#include <utility> +#include <vector> + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include "absl/strings/string_view.h" +#include "re2/filtered_re2.h" +#include "re2/re2.h" +#include "re2/set.h" + +#ifdef _WIN32 +#include <basetsd.h> +#define ssize_t SSIZE_T +#endif + +namespace re2_python { + +// This is conventional. +namespace py = pybind11; + +// In terms of the pybind11 API, a py::buffer is merely a py::object that +// supports the buffer interface/protocol and you must explicitly request +// a py::buffer_info in order to access the actual bytes. Under the hood, +// the py::buffer_info manages a reference count to the py::buffer, so it +// must be constructed and subsequently destructed while holding the GIL. +static inline absl::string_view FromBytes(const py::buffer_info& bytes) { + char* data = reinterpret_cast<char*>(bytes.ptr); + ssize_t size = bytes.size; + return absl::string_view(data, size); +} + +static inline int OneCharLen(const char* ptr) { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*ptr & 0xFF) >> 4]; +} + +// Helper function for when Python encodes str to bytes and then needs to +// convert str offsets to bytes offsets. Assumes that text is valid UTF-8. 
+ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) { + auto bytes = buffer.request(); + auto text = FromBytes(bytes); + auto ptr = text.data() + pos; + auto end = text.data() + text.size(); + while (ptr < end && len > 0) { + ptr += OneCharLen(ptr); + --len; + } + return ptr - (text.data() + pos); +} + +// Helper function for when Python decodes bytes to str and then needs to +// convert bytes offsets to str offsets. Assumes that text is valid UTF-8. +ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) { + auto bytes = buffer.request(); + auto text = FromBytes(bytes); + auto ptr = text.data() + pos; + auto end = text.data() + endpos; + ssize_t len = 0; + while (ptr < end) { + ptr += OneCharLen(ptr); + ++len; + } + return len; +} + +std::unique_ptr<RE2> RE2InitShim(py::buffer buffer, + const RE2::Options& options) { + auto bytes = buffer.request(); + auto pattern = FromBytes(bytes); + return std::make_unique<RE2>(pattern, options); +} + +py::bytes RE2ErrorShim(const RE2& self) { + // Return std::string as bytes. That is, without decoding to str. + return self.error(); +} + +std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim( + const RE2& self) { + const int num_groups = self.NumberOfCapturingGroups(); + std::vector<std::pair<py::bytes, int>> groups; + groups.reserve(num_groups); + for (const auto& it : self.NamedCapturingGroups()) { + groups.emplace_back(it.first, it.second); + } + return groups; +} + +std::vector<int> RE2ProgramFanoutShim(const RE2& self) { + std::vector<int> histogram; + self.ProgramFanout(&histogram); + return histogram; +} + +std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) { + std::vector<int> histogram; + self.ReverseProgramFanout(&histogram); + return histogram; +} + +std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim( + const RE2& self, int maxlen) { + std::string min, max; + // Return std::string as bytes. That is, without decoding to str. 
+ return {self.PossibleMatchRange(&min, &max, maxlen), min, max}; +} + +std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self, + RE2::Anchor anchor, + py::buffer buffer, + ssize_t pos, + ssize_t endpos) { + auto bytes = buffer.request(); + auto text = FromBytes(bytes); + const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0 + std::vector<absl::string_view> groups; + groups.resize(num_groups); + py::gil_scoped_release release_gil; + if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) { + // Ensure that groups are null before converting to spans! + for (auto& it : groups) { + it = absl::string_view(); + } + } + std::vector<std::pair<ssize_t, ssize_t>> spans; + spans.reserve(num_groups); + for (const auto& it : groups) { + if (it.data() == NULL) { + spans.emplace_back(-1, -1); + } else { + spans.emplace_back(it.data() - text.data(), + it.data() - text.data() + it.size()); + } + } + return spans; +} + +py::bytes RE2QuoteMetaShim(py::buffer buffer) { + auto bytes = buffer.request(); + auto pattern = FromBytes(bytes); + // Return std::string as bytes. That is, without decoding to str. + return RE2::QuoteMeta(pattern); +} + +class Set { + public: + Set(RE2::Anchor anchor, const RE2::Options& options) + : set_(options, anchor) {} + + ~Set() = default; + + // Not copyable or movable. + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; + + int Add(py::buffer buffer) { + auto bytes = buffer.request(); + auto pattern = FromBytes(bytes); + int index = set_.Add(pattern, /*error=*/NULL); // -1 on error + return index; + } + + bool Compile() { + // Compiling can fail. 
+ return set_.Compile(); + } + + std::vector<int> Match(py::buffer buffer) const { + auto bytes = buffer.request(); + auto text = FromBytes(bytes); + std::vector<int> matches; + py::gil_scoped_release release_gil; + set_.Match(text, &matches); + return matches; + } + + private: + RE2::Set set_; +}; + +class Filter { + public: + Filter() = default; + ~Filter() = default; + + // Not copyable or movable. + Filter(const Filter&) = delete; + Filter& operator=(const Filter&) = delete; + + int Add(py::buffer buffer, const RE2::Options& options) { + auto bytes = buffer.request(); + auto pattern = FromBytes(bytes); + int index = -1; // not clobbered on error + filter_.Add(pattern, options, &index); + return index; + } + + bool Compile() { + std::vector<std::string> atoms; + filter_.Compile(&atoms); + RE2::Options options; + options.set_literal(true); + options.set_case_sensitive(false); + set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED); + for (int i = 0; i < static_cast<int>(atoms.size()); ++i) { + if (set_->Add(atoms[i], /*error=*/NULL) != i) { + // Should never happen: the atom is a literal! + py::pybind11_fail("set_->Add() failed"); + } + } + // Compiling can fail. 
+ return set_->Compile(); + } + + std::vector<int> Match(py::buffer buffer, bool potential) const { + auto bytes = buffer.request(); + auto text = FromBytes(bytes); + std::vector<int> atoms; + py::gil_scoped_release release_gil; + set_->Match(text, &atoms); + std::vector<int> matches; + if (potential) { + filter_.AllPotentials(atoms, &matches); + } else { + filter_.AllMatches(text, atoms, &matches); + } + return matches; + } + + const RE2& GetRE2(int index) const { + return filter_.GetRE2(index); + } + + private: + re2::FilteredRE2 filter_; + std::unique_ptr<RE2::Set> set_; +}; + +PYBIND11_MODULE(_re2, module) { + module.def("CharLenToBytes", &CharLenToBytes); + module.def("BytesToCharLen", &BytesToCharLen); + + // CLASSES + // class RE2 + // enum Anchor + // class Options + // enum Encoding + // class Set + // class Filter + py::class_<RE2> re2(module, "RE2"); + py::enum_<RE2::Anchor> anchor(re2, "Anchor"); + py::class_<RE2::Options> options(re2, "Options"); + py::enum_<RE2::Options::Encoding> encoding(options, "Encoding"); + py::class_<Set> set(module, "Set"); + py::class_<Filter> filter(module, "Filter"); + + anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED); + anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START); + anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH); + + encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8); + encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1); + + options.def(py::init<>()) + .def_property("max_mem", // + &RE2::Options::max_mem, // + &RE2::Options::set_max_mem) // + .def_property("encoding", // + &RE2::Options::encoding, // + &RE2::Options::set_encoding) // + .def_property("posix_syntax", // + &RE2::Options::posix_syntax, // + &RE2::Options::set_posix_syntax) // + .def_property("longest_match", // + &RE2::Options::longest_match, // + &RE2::Options::set_longest_match) // + .def_property("log_errors", // + &RE2::Options::log_errors, // + &RE2::Options::set_log_errors) // + .def_property("literal", // + 
&RE2::Options::literal, // + &RE2::Options::set_literal) // + .def_property("never_nl", // + &RE2::Options::never_nl, // + &RE2::Options::set_never_nl) // + .def_property("dot_nl", // + &RE2::Options::dot_nl, // + &RE2::Options::set_dot_nl) // + .def_property("never_capture", // + &RE2::Options::never_capture, // + &RE2::Options::set_never_capture) // + .def_property("case_sensitive", // + &RE2::Options::case_sensitive, // + &RE2::Options::set_case_sensitive) // + .def_property("perl_classes", // + &RE2::Options::perl_classes, // + &RE2::Options::set_perl_classes) // + .def_property("word_boundary", // + &RE2::Options::word_boundary, // + &RE2::Options::set_word_boundary) // + .def_property("one_line", // + &RE2::Options::one_line, // + &RE2::Options::set_one_line); // + + re2.def(py::init(&RE2InitShim)) + .def("ok", &RE2::ok) + .def("error", &RE2ErrorShim) + .def("options", &RE2::options) + .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups) + .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim) + .def("ProgramSize", &RE2::ProgramSize) + .def("ReverseProgramSize", &RE2::ReverseProgramSize) + .def("ProgramFanout", &RE2ProgramFanoutShim) + .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim) + .def("PossibleMatchRange", &RE2PossibleMatchRangeShim) + .def("Match", &RE2MatchShim) + .def_static("QuoteMeta", &RE2QuoteMetaShim); + + set.def(py::init<RE2::Anchor, const RE2::Options&>()) + .def("Add", &Set::Add) + .def("Compile", &Set::Compile) + .def("Match", &Set::Match); + + filter.def(py::init<>()) + .def("Add", &Filter::Add) + .def("Compile", &Filter::Compile) + .def("Match", &Filter::Match) + .def("GetRE2", &Filter::GetRE2, + py::return_value_policy::reference_internal); +} + +} // namespace re2_python diff --git a/python/re2.py b/python/re2.py new file mode 100644 index 0000000..8a6d985 --- /dev/null +++ b/python/re2.py @@ -0,0 +1,582 @@ +# Copyright 2019 The RE2 Authors. All Rights Reserved. 
+# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. +r"""A drop-in replacement for the re module. + +It uses RE2 under the hood, of course, so various PCRE features +(e.g. backreferences, look-around assertions) are not supported. +See https://github.com/google/re2/wiki/Syntax for the canonical +reference, but known syntactic "gotchas" relative to Python are: + + * PCRE supports \Z and \z; RE2 supports \z; Python supports \z, + but calls it \Z. You must rewrite \Z to \z in pattern strings. + +Known differences between this module's API and the re module's API: + + * The error class does not provide any error information as attributes. + * The Options class replaces the re module's flags with RE2's options as + gettable/settable properties. Please see re2.h for their documentation. + * The pattern string and the input string do not have to be the same type. + Any str will be encoded to UTF-8. + * The pattern string cannot be str if the options specify Latin-1 encoding. + +This module's LRU cache contains a maximum of 128 regular expression objects. +Each regular expression object's underlying RE2 object uses a maximum of 8MiB +of memory (by default). Hence, this module's LRU cache uses a maximum of 1GiB +of memory (by default), but in most cases, it should use much less than that. 
+""" + +import codecs +import functools +import itertools + +import _re2 + + +class error(Exception): + pass + + +class Options(_re2.RE2.Options): + + __slots__ = () + + NAMES = ( + 'max_mem', + 'encoding', + 'posix_syntax', + 'longest_match', + 'log_errors', + 'literal', + 'never_nl', + 'dot_nl', + 'never_capture', + 'case_sensitive', + 'perl_classes', + 'word_boundary', + 'one_line', + ) + + +def compile(pattern, options=None): + if isinstance(pattern, _Regexp): + if options: + raise error('pattern is already compiled, so ' + 'options may not be specified') + pattern = pattern._pattern + options = options or Options() + values = tuple(getattr(options, name) for name in Options.NAMES) + return _Regexp._make(pattern, values) + + +def search(pattern, text, options=None): + return compile(pattern, options=options).search(text) + + +def match(pattern, text, options=None): + return compile(pattern, options=options).match(text) + + +def fullmatch(pattern, text, options=None): + return compile(pattern, options=options).fullmatch(text) + + +def finditer(pattern, text, options=None): + return compile(pattern, options=options).finditer(text) + + +def findall(pattern, text, options=None): + return compile(pattern, options=options).findall(text) + + +def split(pattern, text, maxsplit=0, options=None): + return compile(pattern, options=options).split(text, maxsplit) + + +def subn(pattern, repl, text, count=0, options=None): + return compile(pattern, options=options).subn(repl, text, count) + + +def sub(pattern, repl, text, count=0, options=None): + return compile(pattern, options=options).sub(repl, text, count) + + +def _encode(t): + return t.encode(encoding='utf-8') + + +def _decode(b): + return b.decode(encoding='utf-8') + + +def escape(pattern): + if isinstance(pattern, str): + encoded_pattern = _encode(pattern) + escaped = _re2.RE2.QuoteMeta(encoded_pattern) + decoded_escaped = _decode(escaped) + return decoded_escaped + else: + escaped = _re2.RE2.QuoteMeta(pattern) + 
return escaped + + +def purge(): + return _Regexp._make.cache_clear() + + +_Anchor = _re2.RE2.Anchor +_NULL_SPAN = (-1, -1) + + +class _Regexp(object): + + __slots__ = ('_pattern', '_regexp') + + @classmethod + @functools.lru_cache(typed=True) + def _make(cls, pattern, values): + options = Options() + for name, value in zip(Options.NAMES, values): + setattr(options, name, value) + return cls(pattern, options) + + def __init__(self, pattern, options): + self._pattern = pattern + if isinstance(self._pattern, str): + if options.encoding == Options.Encoding.LATIN1: + raise error('string type of pattern is str, but ' + 'encoding specified in options is LATIN1') + encoded_pattern = _encode(self._pattern) + self._regexp = _re2.RE2(encoded_pattern, options) + else: + self._regexp = _re2.RE2(self._pattern, options) + if not self._regexp.ok(): + raise error(self._regexp.error()) + + def __getstate__(self): + options = {name: getattr(self.options, name) for name in Options.NAMES} + return self._pattern, options + + def __setstate__(self, state): + pattern, options = state + values = tuple(options[name] for name in Options.NAMES) + other = _Regexp._make(pattern, values) + self._pattern = other._pattern + self._regexp = other._regexp + + def _match(self, anchor, text, pos=None, endpos=None): + pos = 0 if pos is None else max(0, min(pos, len(text))) + endpos = len(text) if endpos is None else max(0, min(endpos, len(text))) + if pos > endpos: + return + if isinstance(text, str): + encoded_text = _encode(text) + encoded_pos = _re2.CharLenToBytes(encoded_text, 0, pos) + if endpos == len(text): + # This is the common case. 
+ encoded_endpos = len(encoded_text) + else: + encoded_endpos = encoded_pos + _re2.CharLenToBytes( + encoded_text, encoded_pos, endpos - pos) + decoded_offsets = {0: 0} + last_offset = 0 + while True: + spans = self._regexp.Match(anchor, encoded_text, encoded_pos, + encoded_endpos) + if spans[0] == _NULL_SPAN: + break + + # This algorithm is linear in the length of encoded_text. Specifically, + # no matter how many groups there are for a given regular expression or + # how many iterations through the loop there are for a given generator, + # this algorithm uses a single, straightforward pass over encoded_text. + offsets = sorted(set(itertools.chain(*spans))) + if offsets[0] == -1: + offsets = offsets[1:] + # Discard the rest of the items because they are useless now - and we + # could accumulate one item per str offset in the pathological case! + decoded_offsets = {last_offset: decoded_offsets[last_offset]} + for offset in offsets: + decoded_offsets[offset] = ( + decoded_offsets[last_offset] + + _re2.BytesToCharLen(encoded_text, last_offset, offset)) + last_offset = offset + + def decode(span): + if span == _NULL_SPAN: + return span + return decoded_offsets[span[0]], decoded_offsets[span[1]] + + decoded_spans = [decode(span) for span in spans] + yield _Match(self, text, pos, endpos, decoded_spans) + if encoded_pos == encoded_endpos: + break + elif encoded_pos == spans[0][1]: + # We matched the empty string at encoded_pos and would be stuck, so + # in order to make forward progress, increment the str offset. + encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1) + else: + encoded_pos = spans[0][1] + else: + while True: + spans = self._regexp.Match(anchor, text, pos, endpos) + if spans[0] == _NULL_SPAN: + break + yield _Match(self, text, pos, endpos, spans) + if pos == endpos: + break + elif pos == spans[0][1]: + # We matched the empty string at pos and would be stuck, so in order + # to make forward progress, increment the bytes offset. 
+ pos += 1 + else: + pos = spans[0][1] + + def search(self, text, pos=None, endpos=None): + return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None) + + def match(self, text, pos=None, endpos=None): + return next(self._match(_Anchor.ANCHOR_START, text, pos, endpos), None) + + def fullmatch(self, text, pos=None, endpos=None): + return next(self._match(_Anchor.ANCHOR_BOTH, text, pos, endpos), None) + + def finditer(self, text, pos=None, endpos=None): + return self._match(_Anchor.UNANCHORED, text, pos, endpos) + + def findall(self, text, pos=None, endpos=None): + empty = type(text)() + items = [] + for match in self.finditer(text, pos, endpos): + if not self.groups: + item = match.group() + elif self.groups == 1: + item = match.groups(default=empty)[0] + else: + item = match.groups(default=empty) + items.append(item) + return items + + def _split(self, cb, text, maxsplit=0): + if maxsplit < 0: + return [text], 0 + elif maxsplit > 0: + matchiter = itertools.islice(self.finditer(text), maxsplit) + else: + matchiter = self.finditer(text) + pieces = [] + end = 0 + numsplit = 0 + for match in matchiter: + pieces.append(text[end:match.start()]) + pieces.extend(cb(match)) + end = match.end() + numsplit += 1 + pieces.append(text[end:]) + return pieces, numsplit + + def split(self, text, maxsplit=0): + cb = lambda match: [match[group] for group in range(1, self.groups + 1)] + pieces, _ = self._split(cb, text, maxsplit) + return pieces + + def subn(self, repl, text, count=0): + cb = lambda match: [repl(match) if callable(repl) else match.expand(repl)] + empty = type(text)() + pieces, numsplit = self._split(cb, text, count) + joined_pieces = empty.join(pieces) + return joined_pieces, numsplit + + def sub(self, repl, text, count=0): + joined_pieces, _ = self.subn(repl, text, count) + return joined_pieces + + @property + def pattern(self): + return self._pattern + + @property + def options(self): + return self._regexp.options() + + @property + def groups(self): + 
return self._regexp.NumberOfCapturingGroups() + + @property + def groupindex(self): + groups = self._regexp.NamedCapturingGroups() + if isinstance(self._pattern, str): + decoded_groups = [(_decode(group), index) for group, index in groups] + return dict(decoded_groups) + else: + return dict(groups) + + @property + def programsize(self): + return self._regexp.ProgramSize() + + @property + def reverseprogramsize(self): + return self._regexp.ReverseProgramSize() + + @property + def programfanout(self): + return self._regexp.ProgramFanout() + + @property + def reverseprogramfanout(self): + return self._regexp.ReverseProgramFanout() + + def possiblematchrange(self, maxlen): + ok, min, max = self._regexp.PossibleMatchRange(maxlen) + if not ok: + raise error('failed to compute match range') + return min, max + + +class _Match(object): + + __slots__ = ('_regexp', '_text', '_pos', '_endpos', '_spans') + + def __init__(self, regexp, text, pos, endpos, spans): + self._regexp = regexp + self._text = text + self._pos = pos + self._endpos = endpos + self._spans = spans + + # Python prioritises three-digit octal numbers over group escapes. + # For example, \100 should not be handled the same way as \g<10>0. + _OCTAL_RE = compile('\\\\[0-7][0-7][0-7]') + + # Python supports \1 through \99 (inclusive) and \g<...> syntax. 
+ _GROUP_RE = compile('\\\\[1-9][0-9]?|\\\\g<\\w+>') + + @classmethod + @functools.lru_cache(typed=True) + def _split(cls, template): + if isinstance(template, str): + backslash = '\\' + else: + backslash = b'\\' + empty = type(template)() + pieces = [empty] + index = template.find(backslash) + while index != -1: + piece, template = template[:index], template[index:] + pieces[-1] += piece + octal_match = cls._OCTAL_RE.match(template) + group_match = cls._GROUP_RE.match(template) + if (not octal_match) and group_match: + index = group_match.end() + piece, template = template[:index], template[index:] + pieces.extend((piece, empty)) + else: + # 2 isn't enough for \o, \x, \N, \u and \U escapes, but none of those + # should contain backslashes, so break them here and then fix them at + # the beginning of the next loop iteration or right before returning. + index = 2 + piece, template = template[:index], template[index:] + pieces[-1] += piece + index = template.find(backslash) + pieces[-1] += template + return pieces + + def expand(self, template): + if isinstance(template, str): + unescape = codecs.unicode_escape_decode + else: + unescape = codecs.escape_decode + empty = type(template)() + # Make a copy so that we don't clobber the cached pieces! 
+ pieces = list(self._split(template)) + for index, piece in enumerate(pieces): + if not index % 2: + pieces[index], _ = unescape(piece) + else: + if len(piece) <= 3: # \1 through \99 (inclusive) + group = int(piece[1:]) + else: # \g<...> + group = piece[3:-1] + try: + group = int(group) + except ValueError: + pass + pieces[index] = self.__getitem__(group) or empty + joined_pieces = empty.join(pieces) + return joined_pieces + + def __getitem__(self, group): + if not isinstance(group, int): + try: + group = self._regexp.groupindex[group] + except KeyError: + raise IndexError('bad group name') + if not 0 <= group <= self._regexp.groups: + raise IndexError('bad group index') + span = self._spans[group] + if span == _NULL_SPAN: + return None + return self._text[span[0]:span[1]] + + def group(self, *groups): + if not groups: + groups = (0,) + items = (self.__getitem__(group) for group in groups) + return next(items) if len(groups) == 1 else tuple(items) + + def groups(self, default=None): + items = [] + for group in range(1, self._regexp.groups + 1): + item = self.__getitem__(group) + items.append(default if item is None else item) + return tuple(items) + + def groupdict(self, default=None): + items = [] + for group, index in self._regexp.groupindex.items(): + item = self.__getitem__(index) + items.append((group, default) if item is None else (group, item)) + return dict(items) + + def start(self, group=0): + if not 0 <= group <= self._regexp.groups: + raise IndexError('bad group index') + return self._spans[group][0] + + def end(self, group=0): + if not 0 <= group <= self._regexp.groups: + raise IndexError('bad group index') + return self._spans[group][1] + + def span(self, group=0): + if not 0 <= group <= self._regexp.groups: + raise IndexError('bad group index') + return self._spans[group] + + @property + def re(self): + return self._regexp + + @property + def string(self): + return self._text + + @property + def pos(self): + return self._pos + + @property + def 
endpos(self): + return self._endpos + + @property + def lastindex(self): + max_end = -1 + max_group = None + # We look for the rightmost right parenthesis by keeping the first group + # that ends at max_end because that is the leftmost/outermost group when + # there are nested groups! + for group in range(1, self._regexp.groups + 1): + end = self._spans[group][1] + if max_end < end: + max_end = end + max_group = group + return max_group + + @property + def lastgroup(self): + max_group = self.lastindex + if not max_group: + return None + for group, index in self._regexp.groupindex.items(): + if max_group == index: + return group + return None + + +class Set(object): + """A Pythonic wrapper around RE2::Set.""" + + __slots__ = ('_set') + + def __init__(self, anchor, options=None): + options = options or Options() + self._set = _re2.Set(anchor, options) + + @classmethod + def SearchSet(cls, options=None): + return cls(_Anchor.UNANCHORED, options=options) + + @classmethod + def MatchSet(cls, options=None): + return cls(_Anchor.ANCHOR_START, options=options) + + @classmethod + def FullMatchSet(cls, options=None): + return cls(_Anchor.ANCHOR_BOTH, options=options) + + def Add(self, pattern): + if isinstance(pattern, str): + encoded_pattern = _encode(pattern) + index = self._set.Add(encoded_pattern) + else: + index = self._set.Add(pattern) + if index == -1: + raise error('failed to add %r to Set' % pattern) + return index + + def Compile(self): + if not self._set.Compile(): + raise error('failed to compile Set') + + def Match(self, text): + if isinstance(text, str): + encoded_text = _encode(text) + matches = self._set.Match(encoded_text) + else: + matches = self._set.Match(text) + return matches or None + + +class Filter(object): + """A Pythonic wrapper around FilteredRE2.""" + + __slots__ = ('_filter', '_patterns') + + def __init__(self): + self._filter = _re2.Filter() + self._patterns = [] + + def Add(self, pattern, options=None): + options = options or Options() + if 
isinstance(pattern, str): + encoded_pattern = _encode(pattern) + index = self._filter.Add(encoded_pattern, options) + else: + index = self._filter.Add(pattern, options) + if index == -1: + raise error('failed to add %r to Filter' % pattern) + self._patterns.append(pattern) + return index + + def Compile(self): + if not self._filter.Compile(): + raise error('failed to compile Filter') + + def Match(self, text, potential=False): + if isinstance(text, str): + encoded_text = _encode(text) + matches = self._filter.Match(encoded_text, potential) + else: + matches = self._filter.Match(text, potential) + return matches or None + + def re(self, index): + if not 0 <= index < len(self._patterns): + raise IndexError('bad index') + proxy = object.__new__(_Regexp) + proxy._pattern = self._patterns[index] + proxy._regexp = self._filter.GetRE2(index) + return proxy diff --git a/python/re2_test.py b/python/re2_test.py new file mode 100644 index 0000000..86aa9ae --- /dev/null +++ b/python/re2_test.py @@ -0,0 +1,482 @@ +# Copyright 2019 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+"""Tests for google3.third_party.re2.python.re2.""" + +import collections +import pickle +import re + +from absl.testing import absltest +from absl.testing import parameterized +import re2 + + +class OptionsTest(parameterized.TestCase): + + @parameterized.parameters(*re2.Options.NAMES) + def test_option(self, name): + options = re2.Options() + value = getattr(options, name) + if isinstance(value, re2.Options.Encoding): + value = next(v for v in type(value).__members__.values() if v != value) + elif isinstance(value, bool): + value = not value + elif isinstance(value, int): + value = value + 1 + else: + raise TypeError('option {!r}: {!r} {!r}'.format(name, type(value), value)) + setattr(options, name, value) + self.assertEqual(value, getattr(options, name)) + + +class Re2CompileTest(parameterized.TestCase): + """Contains tests that apply to the re2 module only. + + We disagree with Python on the string types of group names, + so there is no point attempting to verify consistency. + """ + + @parameterized.parameters( + (u'(foo*)(?P<bar>qux+)', 2, [(u'bar', 2)]), + (b'(foo*)(?P<bar>qux+)', 2, [(b'bar', 2)]), + (u'(foo*)(?P<ä¸æ–‡>qux+)', 2, [(u'ä¸æ–‡', 2)]), + ) + def test_compile(self, pattern, expected_groups, expected_groupindex): + regexp = re2.compile(pattern) + self.assertIs(regexp, re2.compile(pattern)) # cached + self.assertIs(regexp, re2.compile(regexp)) # cached + with self.assertRaisesRegex(re2.error, + ('pattern is already compiled, so ' + 'options may not be specified')): + options = re2.Options() + options.log_errors = not options.log_errors + re2.compile(regexp, options=options) + self.assertIsNotNone(regexp.options) + self.assertEqual(expected_groups, regexp.groups) + self.assertDictEqual(dict(expected_groupindex), regexp.groupindex) + + def test_compile_with_options(self): + options = re2.Options() + options.max_mem = 100 + with self.assertRaisesRegex(re2.error, 'pattern too large'): + re2.compile('.{1000}', options=options) + + def 
test_programsize_reverseprogramsize(self): + regexp = re2.compile('a+b') + self.assertEqual(7, regexp.programsize) + self.assertEqual(7, regexp.reverseprogramsize) + + def test_programfanout_reverseprogramfanout(self): + regexp = re2.compile('a+b') + self.assertListEqual([1, 1], regexp.programfanout) + self.assertListEqual([3], regexp.reverseprogramfanout) + + @parameterized.parameters( + (u'abc', 0, None), + (b'abc', 0, None), + (u'abc', 10, (b'abc', b'abc')), + (b'abc', 10, (b'abc', b'abc')), + (u'ab*c', 10, (b'ab', b'ac')), + (b'ab*c', 10, (b'ab', b'ac')), + (u'ab+c', 10, (b'abb', b'abc')), + (b'ab+c', 10, (b'abb', b'abc')), + (u'ab?c', 10, (b'abc', b'ac')), + (b'ab?c', 10, (b'abc', b'ac')), + (u'.*', 10, (b'', b'\xf4\xbf\xbf\xc0')), + (b'.*', 10, None), + (u'\\C*', 10, None), + (b'\\C*', 10, None), + ) + def test_possiblematchrange(self, pattern, maxlen, expected_min_max): + # For brevity, the string type of pattern determines the encoding. + # It would otherwise be possible to have bytes with UTF8, but as per + # the module docstring, it isn't permitted to have str with LATIN1. 
+ options = re2.Options() + if isinstance(pattern, str): + options.encoding = re2.Options.Encoding.UTF8 + else: + options.encoding = re2.Options.Encoding.LATIN1 + regexp = re2.compile(pattern, options=options) + if expected_min_max: + self.assertEqual(expected_min_max, regexp.possiblematchrange(maxlen)) + else: + with self.assertRaisesRegex(re2.error, 'failed to compute match range'): + regexp.possiblematchrange(maxlen) + + +Params = collections.namedtuple( + 'Params', ('pattern', 'text', 'spans', 'search', 'match', 'fullmatch')) + +PARAMS = [ + Params(u'\\d+', u'Hello, world.', None, False, False, False), + Params(b'\\d+', b'Hello, world.', None, False, False, False), + Params(u'\\s+', u'Hello, world.', [(6, 7)], True, False, False), + Params(b'\\s+', b'Hello, world.', [(6, 7)], True, False, False), + Params(u'\\w+', u'Hello, world.', [(0, 5)], True, True, False), + Params(b'\\w+', b'Hello, world.', [(0, 5)], True, True, False), + Params(u'(\\d+)?', u'Hello, world.', [(0, 0), (-1, -1)], True, True, False), + Params(b'(\\d+)?', b'Hello, world.', [(0, 0), (-1, -1)], True, True, False), + Params(u'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?', + u'youtube_ads', [(0, 7), (-1, -1)], True, True, False), + Params(b'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?', + b'youtube_ads', [(0, 7), (-1, -1)], True, True, False), +] + + +def upper(match): + return match.group().upper() + + +class ReRegexpTest(parameterized.TestCase): + """Contains tests that apply to the re and re2 modules.""" + + MODULE = re + + @parameterized.parameters((p.pattern,) for p in PARAMS) + def test_pickle(self, pattern): + regexp = self.MODULE.compile(pattern) + rick = pickle.loads(pickle.dumps(regexp)) + self.assertEqual(regexp.pattern, rick.pattern) + + @parameterized.parameters( + (p.pattern, p.text, (p.spans if p.search else None)) for p in PARAMS) + def test_search(self, pattern, text, expected_spans): + match = self.MODULE.search(pattern, text) + if expected_spans is None: + 
self.assertIsNone(match) + else: + spans = [match.span(group) for group in range(match.re.groups + 1)] + self.assertListEqual(expected_spans, spans) + + def test_search_with_pos_and_endpos(self): + regexp = self.MODULE.compile(u'.+') # empty string NOT allowed + text = u'I \u2665 RE2!' + # Note that len(text) is the position of the empty string at the end of + # text, so range() stops at len(text) + 1 in order to include len(text). + for pos in range(len(text) + 1): + for endpos in range(pos, len(text) + 1): + match = regexp.search(text, pos=pos, endpos=endpos) + if pos == endpos: + self.assertIsNone(match) + else: + self.assertEqual(pos, match.pos) + self.assertEqual(endpos, match.endpos) + self.assertEqual(pos, match.start()) + self.assertEqual(endpos, match.end()) + self.assertTupleEqual((pos, endpos), match.span()) + + def test_search_with_bogus_pos_and_endpos(self): + regexp = self.MODULE.compile(u'.*') # empty string allowed + text = u'I \u2665 RE2!' + + match = regexp.search(text, pos=-100) + self.assertEqual(0, match.pos) + match = regexp.search(text, pos=100) + self.assertEqual(8, match.pos) + + match = regexp.search(text, endpos=-100) + self.assertEqual(0, match.endpos) + match = regexp.search(text, endpos=100) + self.assertEqual(8, match.endpos) + + match = regexp.search(text, pos=100, endpos=-100) + self.assertIsNone(match) + + @parameterized.parameters( + (p.pattern, p.text, (p.spans if p.match else None)) for p in PARAMS) + def test_match(self, pattern, text, expected_spans): + match = self.MODULE.match(pattern, text) + if expected_spans is None: + self.assertIsNone(match) + else: + spans = [match.span(group) for group in range(match.re.groups + 1)] + self.assertListEqual(expected_spans, spans) + + @parameterized.parameters( + (p.pattern, p.text, (p.spans if p.fullmatch else None)) for p in PARAMS) + def test_fullmatch(self, pattern, text, expected_spans): + match = self.MODULE.fullmatch(pattern, text) + if expected_spans is None: + 
self.assertIsNone(match) + else: + spans = [match.span(group) for group in range(match.re.groups + 1)] + self.assertListEqual(expected_spans, spans) + + @parameterized.parameters( + (u'', u'', [(0, 0)]), + (b'', b'', [(0, 0)]), + (u'', u'x', [(0, 0), (1, 1)]), + (b'', b'x', [(0, 0), (1, 1)]), + (u'', u'xy', [(0, 0), (1, 1), (2, 2)]), + (b'', b'xy', [(0, 0), (1, 1), (2, 2)]), + (u'.', u'xy', [(0, 1), (1, 2)]), + (b'.', b'xy', [(0, 1), (1, 2)]), + (u'x', u'xy', [(0, 1)]), + (b'x', b'xy', [(0, 1)]), + (u'y', u'xy', [(1, 2)]), + (b'y', b'xy', [(1, 2)]), + (u'z', u'xy', []), + (b'z', b'xy', []), + (u'\\w*', u'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12), + (13, 13)]), + (b'\\w*', b'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12), + (13, 13)]), + ) + def test_finditer(self, pattern, text, expected_matches): + matches = [match.span() for match in self.MODULE.finditer(pattern, text)] + self.assertListEqual(expected_matches, matches) + + @parameterized.parameters( + (u'\\w\\w+', u'Hello, world.', [u'Hello', u'world']), + (b'\\w\\w+', b'Hello, world.', [b'Hello', b'world']), + (u'(\\w)\\w+', u'Hello, world.', [u'H', u'w']), + (b'(\\w)\\w+', b'Hello, world.', [b'H', b'w']), + (u'(\\w)(\\w+)', u'Hello, world.', [(u'H', u'ello'), (u'w', u'orld')]), + (b'(\\w)(\\w+)', b'Hello, world.', [(b'H', b'ello'), (b'w', b'orld')]), + (u'(\\w)(\\w+)?', u'Hello, w.', [(u'H', u'ello'), (u'w', u'')]), + (b'(\\w)(\\w+)?', b'Hello, w.', [(b'H', b'ello'), (b'w', b'')]), + ) + def test_findall(self, pattern, text, expected_matches): + matches = self.MODULE.findall(pattern, text) + self.assertListEqual(expected_matches, matches) + + @parameterized.parameters( + (u'\\W+', u'Hello, world.', -1, [u'Hello, world.']), + (b'\\W+', b'Hello, world.', -1, [b'Hello, world.']), + (u'\\W+', u'Hello, world.', 0, [u'Hello', u'world', u'']), + (b'\\W+', b'Hello, world.', 0, [b'Hello', b'world', b'']), + (u'\\W+', u'Hello, world.', 1, [u'Hello', u'world.']), + (b'\\W+', b'Hello, world.', 
1, [b'Hello', b'world.']), + (u'(\\W+)', u'Hello, world.', -1, [u'Hello, world.']), + (b'(\\W+)', b'Hello, world.', -1, [b'Hello, world.']), + (u'(\\W+)', u'Hello, world.', 0, [u'Hello', u', ', u'world', u'.', u'']), + (b'(\\W+)', b'Hello, world.', 0, [b'Hello', b', ', b'world', b'.', b'']), + (u'(\\W+)', u'Hello, world.', 1, [u'Hello', u', ', u'world.']), + (b'(\\W+)', b'Hello, world.', 1, [b'Hello', b', ', b'world.']), + ) + def test_split(self, pattern, text, maxsplit, expected_pieces): + pieces = self.MODULE.split(pattern, text, maxsplit) + self.assertListEqual(expected_pieces, pieces) + + @parameterized.parameters( + (u'\\w+', upper, u'Hello, world.', -1, u'Hello, world.', 0), + (b'\\w+', upper, b'Hello, world.', -1, b'Hello, world.', 0), + (u'\\w+', upper, u'Hello, world.', 0, u'HELLO, WORLD.', 2), + (b'\\w+', upper, b'Hello, world.', 0, b'HELLO, WORLD.', 2), + (u'\\w+', upper, u'Hello, world.', 1, u'HELLO, world.', 1), + (b'\\w+', upper, b'Hello, world.', 1, b'HELLO, world.', 1), + (u'\\w+', u'MEEP', u'Hello, world.', -1, u'Hello, world.', 0), + (b'\\w+', b'MEEP', b'Hello, world.', -1, b'Hello, world.', 0), + (u'\\w+', u'MEEP', u'Hello, world.', 0, u'MEEP, MEEP.', 2), + (b'\\w+', b'MEEP', b'Hello, world.', 0, b'MEEP, MEEP.', 2), + (u'\\w+', u'MEEP', u'Hello, world.', 1, u'MEEP, world.', 1), + (b'\\w+', b'MEEP', b'Hello, world.', 1, b'MEEP, world.', 1), + (u'\\\\', u'\\\\\\\\', u'Hello,\\world.', 0, u'Hello,\\\\world.', 1), + (b'\\\\', b'\\\\\\\\', b'Hello,\\world.', 0, b'Hello,\\\\world.', 1), + ) + def test_subn_sub(self, pattern, repl, text, count, expected_joined_pieces, + expected_numsplit): + joined_pieces, numsplit = self.MODULE.subn(pattern, repl, text, count) + self.assertEqual(expected_joined_pieces, joined_pieces) + self.assertEqual(expected_numsplit, numsplit) + + joined_pieces = self.MODULE.sub(pattern, repl, text, count) + self.assertEqual(expected_joined_pieces, joined_pieces) + + +class Re2RegexpTest(ReRegexpTest): + """Contains tests that 
apply to the re2 module only.""" + + MODULE = re2 + + def test_compile_with_latin1_encoding(self): + options = re2.Options() + options.encoding = re2.Options.Encoding.LATIN1 + with self.assertRaisesRegex(re2.error, + ('string type of pattern is str, but ' + 'encoding specified in options is LATIN1')): + re2.compile(u'.?', options=options) + + # ... whereas this is fine, of course. + re2.compile(b'.?', options=options) + + @parameterized.parameters( + (u'\\p{Lo}', u'\u0ca0_\u0ca0', [(0, 1), (2, 3)]), + (b'\\p{Lo}', b'\xe0\xb2\xa0_\xe0\xb2\xa0', [(0, 3), (4, 7)]), + ) + def test_finditer_with_utf8(self, pattern, text, expected_matches): + matches = [match.span() for match in self.MODULE.finditer(pattern, text)] + self.assertListEqual(expected_matches, matches) + + def test_purge(self): + re2.compile('Goodbye, world.') + self.assertGreater(re2._Regexp._make.cache_info().currsize, 0) + re2.purge() + self.assertEqual(re2._Regexp._make.cache_info().currsize, 0) + + +class Re2EscapeTest(parameterized.TestCase): + """Contains tests that apply to the re2 module only. + + We disagree with Python on the escaping of some characters, + so there is no point attempting to verify consistency. 
+ """ + + @parameterized.parameters( + (u'a*b+c?', u'a\\*b\\+c\\?'), + (b'a*b+c?', b'a\\*b\\+c\\?'), + ) + def test_escape(self, pattern, expected_escaped): + escaped = re2.escape(pattern) + self.assertEqual(expected_escaped, escaped) + + +class ReMatchTest(parameterized.TestCase): + """Contains tests that apply to the re and re2 modules.""" + + MODULE = re + + def test_expand(self): + pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)' + text = u'I \u2665 RE2!\n' + match = self.MODULE.search(pattern, text) + + self.assertEqual(u'\u2665\n!', match.expand(u'\\1\\n\\2')) + self.assertEqual(u'\u2665\n!', match.expand(u'\\g<1>\\n\\g<2>')) + self.assertEqual(u'\u2665\n!', match.expand(u'\\g<S>\\n\\g<P>')) + self.assertEqual(u'\\1\\2\n\u2665!', match.expand(u'\\\\1\\\\2\\n\\1\\2')) + + def test_expand_with_octal(self): + pattern = u'()()()()()()()()()(\\w+)' + text = u'Hello, world.' + match = self.MODULE.search(pattern, text) + + self.assertEqual(u'Hello\n', match.expand(u'\\g<0>\\n')) + self.assertEqual(u'Hello\n', match.expand(u'\\g<10>\\n')) + + self.assertEqual(u'\x00\n', match.expand(u'\\0\\n')) + self.assertEqual(u'\x00\n', match.expand(u'\\00\\n')) + self.assertEqual(u'\x00\n', match.expand(u'\\000\\n')) + self.assertEqual(u'\x000\n', match.expand(u'\\0000\\n')) + + self.assertEqual(u'\n', match.expand(u'\\1\\n')) + self.assertEqual(u'Hello\n', match.expand(u'\\10\\n')) + self.assertEqual(u'@\n', match.expand(u'\\100\\n')) + self.assertEqual(u'@0\n', match.expand(u'\\1000\\n')) + + def test_getitem_group_groups_groupdict(self): + pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)' + text = u'Hello, world.\nI \u2665 RE2!\nGoodbye, world.\n' + match = self.MODULE.search(pattern, text) + + self.assertEqual(u'\u2665 RE2!', match[0]) + self.assertEqual(u'\u2665', match[1]) + self.assertEqual(u'!', match[2]) + self.assertEqual(u'\u2665', match[u'S']) + self.assertEqual(u'!', match[u'P']) + + self.assertEqual(u'\u2665 RE2!', match.group()) + 
self.assertEqual(u'\u2665 RE2!', match.group(0)) + self.assertEqual(u'\u2665', match.group(1)) + self.assertEqual(u'!', match.group(2)) + self.assertEqual(u'\u2665', match.group(u'S')) + self.assertEqual(u'!', match.group(u'P')) + + self.assertTupleEqual((u'\u2665', u'!'), match.group(1, 2)) + self.assertTupleEqual((u'\u2665', u'!'), match.group(u'S', u'P')) + self.assertTupleEqual((u'\u2665', u'!'), match.groups()) + self.assertDictEqual({u'S': u'\u2665', u'P': u'!'}, match.groupdict()) + + def test_bogus_group_start_end_and_span(self): + pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)' + text = u'I \u2665 RE2!\n' + match = self.MODULE.search(pattern, text) + + self.assertRaises(IndexError, match.group, -1) + self.assertRaises(IndexError, match.group, 3) + self.assertRaises(IndexError, match.group, 'X') + + self.assertRaises(IndexError, match.start, -1) + self.assertRaises(IndexError, match.start, 3) + + self.assertRaises(IndexError, match.end, -1) + self.assertRaises(IndexError, match.end, 3) + + self.assertRaises(IndexError, match.span, -1) + self.assertRaises(IndexError, match.span, 3) + + @parameterized.parameters( + (u'((a)(b))((c)(d))', u'foo bar qux', None, None), + (u'(?P<one>(a)(b))((c)(d))', u'foo abcd qux', 4, None), + (u'(?P<one>(a)(b))(?P<four>(c)(d))', u'foo abcd qux', 4, 'four'), + ) + def test_lastindex_lastgroup(self, pattern, text, expected_lastindex, + expected_lastgroup): + match = self.MODULE.search(pattern, text) + if expected_lastindex is None: + self.assertIsNone(match) + else: + self.assertEqual(expected_lastindex, match.lastindex) + self.assertEqual(expected_lastgroup, match.lastgroup) + + +class Re2MatchTest(ReMatchTest): + """Contains tests that apply to the re2 module only.""" + + MODULE = re2 + + +class SetTest(absltest.TestCase): + + def test_search(self): + s = re2.Set.SearchSet() + self.assertEqual(0, s.Add('\\d+')) + self.assertEqual(1, s.Add('\\s+')) + self.assertEqual(2, s.Add('\\w+')) + self.assertRaises(re2.error, 
s.Add, '(MEEP') + s.Compile() + self.assertItemsEqual([1, 2], s.Match('Hello, world.')) + + def test_match(self): + s = re2.Set.MatchSet() + self.assertEqual(0, s.Add('\\d+')) + self.assertEqual(1, s.Add('\\s+')) + self.assertEqual(2, s.Add('\\w+')) + self.assertRaises(re2.error, s.Add, '(MEEP') + s.Compile() + self.assertItemsEqual([2], s.Match('Hello, world.')) + + def test_fullmatch(self): + s = re2.Set.FullMatchSet() + self.assertEqual(0, s.Add('\\d+')) + self.assertEqual(1, s.Add('\\s+')) + self.assertEqual(2, s.Add('\\w+')) + self.assertRaises(re2.error, s.Add, '(MEEP') + s.Compile() + self.assertIsNone(s.Match('Hello, world.')) + + +class FilterTest(absltest.TestCase): + + def test_match(self): + f = re2.Filter() + self.assertEqual(0, f.Add('Hello, \\w+\\.')) + self.assertEqual(1, f.Add('\\w+, world\\.')) + self.assertEqual(2, f.Add('Goodbye, \\w+\\.')) + self.assertRaises(re2.error, f.Add, '(MEEP') + f.Compile() + self.assertItemsEqual([0, 1], f.Match('Hello, world.', potential=True)) + self.assertItemsEqual([0, 1], f.Match('HELLO, WORLD.', potential=True)) + self.assertItemsEqual([0, 1], f.Match('Hello, world.')) + self.assertIsNone(f.Match('HELLO, WORLD.')) + + self.assertRaises(IndexError, f.re, -1) + self.assertRaises(IndexError, f.re, 3) + self.assertEqual('Goodbye, \\w+\\.', f.re(2).pattern) + # Verify whether the underlying RE2 object is usable. + self.assertEqual(0, f.re(2).groups) + + +if __name__ == '__main__': + absltest.main() diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..3bd11ed --- /dev/null +++ b/python/setup.py @@ -0,0 +1,117 @@ +# Copyright 2019 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import os +import setuptools +import setuptools.command.build_ext +import shutil +import sys + +long_description = r"""A drop-in replacement for the re module. 
+ +It uses RE2 under the hood, of course, so various PCRE features +(e.g. backreferences, look-around assertions) are not supported. +See https://github.com/google/re2/wiki/Syntax for the canonical +reference, but known syntactic "gotchas" relative to Python are: + + * PCRE supports \Z and \z; RE2 supports \z; Python supports \z, + but calls it \Z. You must rewrite \Z to \z in pattern strings. + +Known differences between this module's API and the re module's API: + + * The error class does not provide any error information as attributes. + * The Options class replaces the re module's flags with RE2's options as + gettable/settable properties. Please see re2.h for their documentation. + * The pattern string and the input string do not have to be the same type. + Any str will be encoded to UTF-8. + * The pattern string cannot be str if the options specify Latin-1 encoding. + +Known issues with regard to building the C++ extension: + + * Building requires RE2 to be installed on your system. + On Debian, for example, install the libre2-dev package. + * Building requires pybind11 to be installed on your system OR venv. + On Debian, for example, install the pybind11-dev package. + For a venv, install the pybind11 package from PyPI. + * Building on macOS is known to work, but has been known to fail. + For example, the system Python may not know which compiler flags + to set when building bindings for software installed by Homebrew; + see https://docs.brew.sh/Homebrew-and-Python#brewed-python-modules. + * Building on Windows has not been tested yet and will probably fail. +""" + + +class BuildExt(setuptools.command.build_ext.build_ext): + + def build_extension(self, ext): + if 'GITHUB_ACTIONS' not in os.environ: + return super().build_extension(ext) + + # For @pybind11_bazel's `python_configure()`. 
+ os.environ['PYTHON_BIN_PATH'] = sys.executable + + cmd = ['bazel', 'build'] + try: + cmd.append(f'--cpu={os.environ["BAZEL_CPU"].lower()}') + except KeyError: + pass + cmd += ['--compilation_mode=opt', '--', ':all'] + self.spawn(cmd) + + # This ensures that f'_re2.{importlib.machinery.EXTENSION_SUFFIXES[0]}' + # is the filename in the destination directory, which is what's needed. + shutil.copyfile('../bazel-bin/python/_re2.so', + self.get_ext_fullpath(ext.name)) + + cmd = ['bazel', 'clean', '--expunge'] + self.spawn(cmd) + + +def options(): + bdist_wheel = {} + try: + bdist_wheel['plat_name'] = os.environ['PLAT_NAME'] + except KeyError: + pass + return {'bdist_wheel': bdist_wheel} + + +def include_dirs(): + try: + import pybind11 + yield pybind11.get_include() + except ModuleNotFoundError: + pass + + +ext_module = setuptools.Extension( + name='_re2', + sources=['_re2.cc'], + include_dirs=list(include_dirs()), + libraries=['re2'], + extra_compile_args=['-fvisibility=hidden'], +) + +setuptools.setup( + name='google-re2', + version='1.1', + description='RE2 Python bindings', + long_description=long_description, + long_description_content_type='text/plain', + author='The RE2 Authors', + author_email='re2-dev@googlegroups.com', + url='https://github.com/google/re2', + py_modules=['re2'], + ext_modules=[ext_module], + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: C++', + 'Programming Language :: Python :: 3.8', + ], + options=options(), + cmdclass={'build_ext': BuildExt}, + python_requires='~=3.8', +) @@ -1,8 +1,9 @@ -includedir=@includedir@ -libdir=@libdir@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ Name: re2 Description: RE2 is a fast, safe, thread-friendly regular expression engine. 
-Version: 0.0.0 +Requires: @REQUIRES@ +Version: @SONAME@.0.0 Cflags: -pthread -I${includedir} Libs: -pthread -L${libdir} -lre2 diff --git a/re2/bitmap256.cc b/re2/bitmap256.cc new file mode 100644 index 0000000..f6fbca3 --- /dev/null +++ b/re2/bitmap256.cc @@ -0,0 +1,44 @@ +// Copyright 2023 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re2/bitmap256.h" + +#include <stdint.h> + +#include "absl/base/macros.h" +#include "util/logging.h" + +namespace re2 { + +int Bitmap256::FindNextSetBit(int c) const { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + // Check the word that contains the bit. Mask out any lower bits. + int i = c / 64; + uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); + if (word != 0) + return (i * 64) + FindLSBSet(word); + + // Check any following words. + i++; + switch (i) { + case 1: + if (words_[1] != 0) + return (1 * 64) + FindLSBSet(words_[1]); + ABSL_FALLTHROUGH_INTENDED; + case 2: + if (words_[2] != 0) + return (2 * 64) + FindLSBSet(words_[2]); + ABSL_FALLTHROUGH_INTENDED; + case 3: + if (words_[3] != 0) + return (3 * 64) + FindLSBSet(words_[3]); + ABSL_FALLTHROUGH_INTENDED; + default: + return -1; + } +} + +} // namespace re2 diff --git a/re2/bitmap256.h b/re2/bitmap256.h index 4899379..293b31d 100644 --- a/re2/bitmap256.h +++ b/re2/bitmap256.h @@ -11,7 +11,6 @@ #include <stdint.h> #include <string.h> -#include "util/util.h" #include "util/logging.h" namespace re2 { @@ -82,36 +81,6 @@ class Bitmap256 { uint64_t words_[4]; }; -int Bitmap256::FindNextSetBit(int c) const { - DCHECK_GE(c, 0); - DCHECK_LE(c, 255); - - // Check the word that contains the bit. Mask out any lower bits. - int i = c / 64; - uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); - if (word != 0) - return (i * 64) + FindLSBSet(word); - - // Check any following words. 
- i++; - switch (i) { - case 1: - if (words_[1] != 0) - return (1 * 64) + FindLSBSet(words_[1]); - FALLTHROUGH_INTENDED; - case 2: - if (words_[2] != 0) - return (2 * 64) + FindLSBSet(words_[2]); - FALLTHROUGH_INTENDED; - case 3: - if (words_[3] != 0) - return (3 * 64) + FindLSBSet(words_[3]); - FALLTHROUGH_INTENDED; - default: - return -1; - } -} - } // namespace re2 #endif // RE2_BITMAP256_H_ diff --git a/re2/bitstate.cc b/re2/bitstate.cc index 877e548..38a0b87 100644 --- a/re2/bitstate.cc +++ b/re2/bitstate.cc @@ -42,9 +42,8 @@ class BitState { // The usual Search prototype. // Can only call Search once per BitState. - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch); + bool Search(absl::string_view text, absl::string_view context, bool anchored, + bool longest, absl::string_view* submatch, int nsubmatch); private: inline bool ShouldVisit(int id, const char* p); @@ -53,14 +52,14 @@ class BitState { bool TrySearch(int id, const char* p); // Search parameters - Prog* prog_; // program being run - StringPiece text_; // text being searched - StringPiece context_; // greater context of text being searched - bool anchored_; // whether search is anchored at text.begin() - bool longest_; // whether search wants leftmost-longest match - bool endmatch_; // whether match must end at text.end() - StringPiece* submatch_; // submatches to fill in - int nsubmatch_; // # of submatches to fill in + Prog* prog_; // program being run + absl::string_view text_; // text being searched + absl::string_view context_; // greater context of text being searched + bool anchored_; // whether search is anchored at text.begin() + bool longest_; // whether search wants leftmost-longest match + bool endmatch_; // whether match must end at text.end() + absl::string_view* submatch_; // submatches to fill in + int nsubmatch_; // # of submatches to fill in // Search state static constexpr int kVisitedBits = 64; 
@@ -256,9 +255,9 @@ bool BitState::TrySearch(int id0, const char* p0) { if (submatch_[0].data() == NULL || (longest_ && p > submatch_[0].data() + submatch_[0].size())) { for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = - StringPiece(cap_[2 * i], - static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); + submatch_[i] = absl::string_view( + cap_[2 * i], + static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); } // If going for first match, we're done. @@ -285,9 +284,9 @@ bool BitState::TrySearch(int id0, const char* p0) { } // Search text (within context) for prog_. -bool BitState::Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch) { +bool BitState::Search(absl::string_view text, absl::string_view context, + bool anchored, bool longest, absl::string_view* submatch, + int nsubmatch) { // Search parameters. text_ = text; context_ = context; @@ -303,7 +302,7 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, submatch_ = submatch; nsubmatch_ = nsubmatch; for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece(); + submatch_[i] = absl::string_view(); // Allocate scratch space. int nvisited = prog_->list_count() * static_cast<int>(text.size()+1); @@ -353,16 +352,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, } // Bit-state search. -bool Prog::SearchBitState(const StringPiece& text, - const StringPiece& context, - Anchor anchor, - MatchKind kind, - StringPiece* match, - int nmatch) { +bool Prog::SearchBitState(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, + absl::string_view* match, int nmatch) { // If full match, we ask for an anchored longest match // and then check that match[0] == text. // So make sure match[0] exists. 
- StringPiece sp0; + absl::string_view sp0; if (kind == kFullMatch) { anchor = kAnchored; if (nmatch < 1) { diff --git a/re2/compile.cc b/re2/compile.cc index 61d801a..aa79887 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -10,9 +10,10 @@ #include <stdint.h> #include <string.h> -#include <unordered_map> #include <utility> +#include "absl/base/macros.h" +#include "absl/container/flat_hash_map.h" #include "util/logging.h" #include "util/utf.h" #include "re2/pod_array.h" @@ -211,7 +212,7 @@ class Compiler : public Regexp::Walker<Frag> { int64_t max_mem_; // Total memory budget. - std::unordered_map<uint64_t, int> rune_cache_; + absl::flat_hash_map<uint64_t, int> rune_cache_; Frag rune_range_; RE2::Anchor anchor_; // anchor mode for RE2::Set @@ -478,7 +479,7 @@ static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); - std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key); + absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); @@ -789,8 +790,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // Should not be called. Frag Compiler::Copy(Frag arg) { // We're using WalkExponential; there should be no copying. - LOG(DFATAL) << "Compiler::Copy called!"; failed_ = true; + LOG(DFATAL) << "Compiler::Copy called!"; return NoMatch(); } @@ -916,8 +917,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, CharClass* cc = re->cc(); if (cc->empty()) { // This can't happen. 
- LOG(DFATAL) << "No ranges in char class"; failed_ = true; + LOG(DFATAL) << "No ranges in char class"; return NoMatch(); } @@ -974,8 +975,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, case kRegexpNoWordBoundary: return EmptyWidth(kEmptyNonWordBoundary); } - LOG(DFATAL) << "Missing case in Compiler: " << re->op(); failed_ = true; + LOG(DFATAL) << "Missing case in Compiler: " << re->op(); return NoMatch(); } @@ -1243,7 +1244,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { // Make sure DFA has enough memory to operate, // since we're not going to fall back to the NFA. bool dfa_failed = false; - StringPiece sp = "hello, world"; + absl::string_view sp = "hello, world"; prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, NULL, &dfa_failed, NULL); if (dfa_failed) { @@ -28,23 +28,25 @@ #include <algorithm> #include <atomic> #include <deque> -#include <mutex> #include <new> #include <string> -#include <unordered_map> -#include <unordered_set> #include <utility> #include <vector> +#include "absl/base/call_once.h" +#include "absl/base/macros.h" +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_format.h" +#include "absl/synchronization/mutex.h" +#include "absl/types/span.h" #include "util/logging.h" -#include "util/mix.h" -#include "util/mutex.h" #include "util/strutil.h" #include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/sparse_set.h" -#include "re2/stringpiece.h" // Silence "zero-sized array in struct/union" warning for DFA::State::next_. #ifdef _MSC_VER @@ -88,9 +90,9 @@ class DFA { // returning the leftmost end of the match instead of the rightmost one. // If the DFA cannot complete the search (for example, if it is out of // memory), it sets *failed and returns false. 
- bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool want_earliest_match, bool run_forward, - bool* failed, const char** ep, SparseSet* matches); + bool Search(absl::string_view text, absl::string_view context, bool anchored, + bool want_earliest_match, bool run_forward, bool* failed, + const char** ep, SparseSet* matches); // Builds out all states for the entire DFA. // If cb is not empty, it receives one callback per state built. @@ -114,21 +116,26 @@ class DFA { struct State { inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; } + template <typename H> + friend H AbslHashValue(H h, const State& a) { + const absl::Span<const int> ainst(a.inst_, a.ninst_); + return H::combine(std::move(h), a.flag_, ainst); + } + + friend bool operator==(const State& a, const State& b) { + const absl::Span<const int> ainst(a.inst_, a.ninst_); + const absl::Span<const int> binst(b.inst_, b.ninst_); + return &a == &b || (a.flag_ == b.flag_ && ainst == binst); + } + int* inst_; // Instruction pointers in the state. int ninst_; // # of inst_ pointers. uint32_t flag_; // Empty string bitfield flags in effect on the way // into this state, along with kFlagMatch if this // is a matching state. -// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). 
-// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932) -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1 - std::atomic<State*> next_[0]; // Outgoing arrows from State, -#else std::atomic<State*> next_[]; // Outgoing arrows from State, -#endif - - // one per input byte class + // one per input byte class }; enum { @@ -143,11 +150,7 @@ class DFA { struct StateHash { size_t operator()(const State* a) const { DCHECK(a != NULL); - HashMix mix(a->flag_); - for (int i = 0; i < a->ninst_; i++) - mix.Mix(a->inst_[i]); - mix.Mix(0); - return mix.get(); + return absl::Hash<State>()(*a); } }; @@ -155,24 +158,15 @@ class DFA { bool operator()(const State* a, const State* b) const { DCHECK(a != NULL); DCHECK(b != NULL); - if (a == b) - return true; - if (a->flag_ != b->flag_) - return false; - if (a->ninst_ != b->ninst_) - return false; - for (int i = 0; i < a->ninst_; i++) - if (a->inst_[i] != b->inst_[i]) - return false; - return true; + return *a == *b; } }; - typedef std::unordered_set<State*, StateHash, StateEqual> StateSet; + typedef absl::flat_hash_set<State*, StateHash, StateEqual> StateSet; private: // Make it easier to swap in a scalable reader-writer mutex. - using CacheMutex = Mutex; + using CacheMutex = absl::Mutex; enum { // Indices into start_ for unanchored searches. @@ -238,7 +232,7 @@ class DFA { // Search parameters struct SearchParams { - SearchParams(const StringPiece& text, const StringPiece& context, + SearchParams(absl::string_view text, absl::string_view context, RWLocker* cache_lock) : text(text), context(context), @@ -252,8 +246,8 @@ class DFA { ep(NULL), matches(NULL) {} - StringPiece text; - StringPiece context; + absl::string_view text; + absl::string_view context; bool anchored; bool can_prefix_accel; bool want_earliest_match; @@ -325,7 +319,7 @@ class DFA { Prog::MatchKind kind_; // The kind of DFA. 
bool init_failed_; // initialization failed (out of memory) - Mutex mutex_; // mutex_ >= cache_mutex_.r + absl::Mutex mutex_; // mutex_ >= cache_mutex_.r // Scratch areas, protected by mutex_. Workq* q0_; // Two pre-allocated work queues. @@ -428,7 +422,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) q1_(NULL), mem_budget_(max_mem) { if (ExtraDebug) - fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str()); + absl::FPrintF(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored()); int nmark = 0; if (kind_ == Prog::kLongestMatch) nmark = prog_->size(); @@ -498,7 +492,7 @@ std::string DFA::DumpWorkq(Workq* q) { s += "|"; sep = ""; } else { - s += StringPrintf("%s%d", sep, *it); + s += absl::StrFormat("%s%d", sep, *it); sep = ","; } } @@ -515,7 +509,7 @@ std::string DFA::DumpState(State* state) { return "*"; std::string s; const char* sep = ""; - s += StringPrintf("(%p)", state); + s += absl::StrFormat("(%p)", state); for (int i = 0; i < state->ninst_; i++) { if (state->inst_[i] == Mark) { s += "|"; @@ -524,11 +518,11 @@ std::string DFA::DumpState(State* state) { s += "||"; sep = ""; } else { - s += StringPrintf("%s%d", sep, state->inst_[i]); + s += absl::StrFormat("%s%d", sep, state->inst_[i]); sep = ","; } } - s += StringPrintf(" flag=%#x", state->flag_); + s += absl::StrFormat(" flag=%#x", state->flag_); return s; } @@ -596,16 +590,35 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { //mutex_.AssertHeld(); // Construct array of instruction ids for the new state. - // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: - // those are the only operators with any effect in - // RunWorkqOnEmptyString or RunWorkqOnByte. + // In some cases, kInstAltMatch may trigger an upgrade to FullMatchState. + // Otherwise, "compress" q down to list heads for storage; StateToWorkq() + // will "decompress" it for computation by exploring from each list head. 
+ // + // Historically, only kInstByteRange, kInstEmptyWidth and kInstMatch were + // useful to keep, but it turned out that kInstAlt was necessary to keep: + // + // > [*] kInstAlt would seem useless to record in a state, since + // > we've already followed both its arrows and saved all the + // > interesting states we can reach from there. The problem + // > is that one of the empty-width instructions might lead + // > back to the same kInstAlt (if an empty-width operator is starred), + // > producing a different evaluation order depending on whether + // > we keep the kInstAlt to begin with. Sigh. + // > A specific case that this affects is /(^|a)+/ matching "a". + // > If we don't save the kInstAlt, we will match the whole "a" (0,1) + // > but in fact the correct leftmost-first match is the leading "" (0,0). + // + // Recall that flattening transformed the Prog from "tree" form to "list" + // form: in the former, kInstAlt existed explicitly... and abundantly; in + // the latter, it's implied between the instructions that compose a list. + // Thus, because the information wasn't lost, the bug doesn't remanifest. 
PODArray<int> inst(q->size()); int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch bool sawmark = false; // whether queue contains a Mark if (ExtraDebug) - fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); + absl::FPrintF(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q), flag); for (Workq::iterator it = q->begin(); it != q->end(); ++it) { int id = *it; if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) @@ -630,10 +643,10 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { if (ExtraDebug) - fprintf(stderr, " -> FullMatchState\n"); + absl::FPrintF(stderr, " -> FullMatchState\n"); return FullMatchState; } - FALLTHROUGH_INTENDED; + ABSL_FALLTHROUGH_INTENDED; default: // Record iff id is the head of its list, which must // be the case if id-1 is the last of *its* list. :) @@ -676,7 +689,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // if the state is *not* a matching state. if (n == 0 && flag == 0) { if (ExtraDebug) - fprintf(stderr, " -> DeadState\n"); + absl::FPrintF(stderr, " -> DeadState\n"); return DeadState; } @@ -740,25 +753,29 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { StateSet::iterator it = state_cache_.find(&state); if (it != state_cache_.end()) { if (ExtraDebug) - fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); + absl::FPrintF(stderr, " -cached-> %s\n", DumpState(*it)); return *it; } // Must have enough memory for new state. // In addition to what we're going to allocate, - // the state cache hash table seems to incur about 40 bytes per - // State*, empirically. - const int kStateCacheOverhead = 40; + // the state cache hash table seems to incur about 18 bytes per + // State*. 
Worst case for non-small sets is it being half full, where each + // value present takes up 1 byte hash sample plus the pointer itself. + const int kStateCacheOverhead = 18; int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot - int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + - ninst*sizeof(int); - if (mem_budget_ < mem + kStateCacheOverhead) { + int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>); + int instmem = ninst*sizeof(int); + if (mem_budget_ < mem + instmem + kStateCacheOverhead) { mem_budget_ = -1; return NULL; } - mem_budget_ -= mem + kStateCacheOverhead; + mem_budget_ -= mem + instmem + kStateCacheOverhead; // Allocate new state along with room for next_ and inst_. + // inst_ is stored separately since it's colder; this also + // means that the States for a given DFA are the same size + // class, so the allocator can hopefully pack them better. char* space = std::allocator<char>().allocate(mem); State* s = new (space) State; (void) new (s->next_) std::atomic<State*>[nnext]; @@ -766,12 +783,13 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658) for (int i = 0; i < nnext; i++) (void) new (s->next_ + i) std::atomic<State*>(NULL); - s->inst_ = new (s->next_ + nnext) int[ninst]; - memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); + s->inst_ = std::allocator<int>().allocate(ninst); + (void) new (s->inst_) int[ninst]; + memmove(s->inst_, inst, instmem); s->ninst_ = ninst; s->flag_ = flag; if (ExtraDebug) - fprintf(stderr, " -> %s\n", DumpState(s).c_str()); + absl::FPrintF(stderr, " -> %s\n", DumpState(s)); // Put state in cache and return it. state_cache_.insert(s); @@ -785,12 +803,12 @@ void DFA::ClearCache() { while (begin != end) { StateSet::iterator tmp = begin; ++begin; + // Deallocate the instruction array, which is stored separately as above. 
+ std::allocator<int>().deallocate((*tmp)->inst_, (*tmp)->ninst_); // Deallocate the blob of memory that we allocated in DFA::CachedState(). // We recompute mem in order to benefit from sized delete where possible. - int ninst = (*tmp)->ninst_; int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot - int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>) + - ninst*sizeof(int); + int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>); std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem); } state_cache_.clear(); @@ -985,8 +1003,8 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, } if (ExtraDebug) - fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", - DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); + absl::FPrintF(stderr, "%s on %d[%#x] -> %s [%d]\n", + DumpWorkq(oldq), c, flag, DumpWorkq(newq), *ismatch); } // Processes input byte c in state, returning new state. @@ -994,7 +1012,7 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { // Keep only one RunStateOnByte going // even if the DFA is being run by multiple threads. - MutexLock l(&mutex_); + absl::MutexLock l(&mutex_); return RunStateOnByte(state, c); } @@ -1134,9 +1152,9 @@ DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { mu_->ReaderLock(); } -// This function is marked as NO_THREAD_SAFETY_ANALYSIS because +// This function is marked as ABSL_NO_THREAD_SAFETY_ANALYSIS because // the annotations don't support lock upgrade. 
-void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { +void DFA::RWLocker::LockForWriting() ABSL_NO_THREAD_SAFETY_ANALYSIS { if (!writing_) { mu_->ReaderUnlock(); mu_->WriterLock(); @@ -1246,7 +1264,7 @@ DFA::StateSaver::~StateSaver() { DFA::State* DFA::StateSaver::Restore() { if (is_special_) return special_; - MutexLock l(&dfa_->mutex_); + absl::MutexLock l(&dfa_->mutex_); State* s = dfa_->CachedState(inst_, ninst_, flag_); if (s == NULL) LOG(DFATAL) << "StateSaver failed to restore state."; @@ -1342,13 +1360,13 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { State* s = start; if (ExtraDebug) - fprintf(stderr, "@stx: %s\n", DumpState(s).c_str()); + absl::FPrintF(stderr, "@stx: %s\n", DumpState(s)); if (s->IsMatch()) { matched = true; lastmatch = p; if (ExtraDebug) - fprintf(stderr, "match @stx! [%s]\n", DumpState(s).c_str()); + absl::FPrintF(stderr, "match @stx! [%s]\n", DumpState(s)); if (params->matches != NULL && kind_ == Prog::kManyMatch) { for (int i = s->ninst_ - 1; i >= 0; i--) { int id = s->inst_[i]; @@ -1365,7 +1383,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { while (p != ep) { if (ExtraDebug) - fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); + absl::FPrintF(stderr, "@%d: %s\n", p - bp, DumpState(s)); if (can_prefix_accel && s == start) { // In start state, only way out is to find the prefix, @@ -1465,7 +1483,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { else lastmatch = p + 1; if (ExtraDebug) - fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str()); + absl::FPrintF(stderr, "match @%d! [%s]\n", lastmatch - bp, DumpState(s)); if (params->matches != NULL && kind_ == Prog::kManyMatch) { for (int i = s->ninst_ - 1; i >= 0; i--) { int id = s->inst_[i]; @@ -1484,7 +1502,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { // Process one more byte to see if it triggers a match. // (Remember, matches are delayed one byte.) 
if (ExtraDebug) - fprintf(stderr, "@etx: %s\n", DumpState(s).c_str()); + absl::FPrintF(stderr, "@etx: %s\n", DumpState(s)); int lastbyte; if (run_forward) { @@ -1532,7 +1550,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params) { matched = true; lastmatch = p; if (ExtraDebug) - fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str()); + absl::FPrintF(stderr, "match @etx! [%s]\n", DumpState(s)); if (params->matches != NULL && kind_ == Prog::kManyMatch) { for (int i = s->ninst_ - 1; i >= 0; i--) { int id = s->inst_[i]; @@ -1623,8 +1641,8 @@ bool DFA::FastSearchLoop(SearchParams* params) { // state for the DFA search loop. Fills in params and returns true on success. // Returns false on failure. bool DFA::AnalyzeSearch(SearchParams* params) { - const StringPiece& text = params->text; - const StringPiece& context = params->context; + absl::string_view text = params->text; + absl::string_view context = params->context; // Sanity check: make sure that text lies within context. if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { @@ -1675,8 +1693,8 @@ bool DFA::AnalyzeSearch(SearchParams* params) { if (!AnalyzeSearchHelper(params, info, flags)) { ResetCache(params->cache_lock); if (!AnalyzeSearchHelper(params, info, flags)) { - LOG(DFATAL) << "Failed to analyze start state."; params->failed = true; + LOG(DFATAL) << "Failed to analyze start state."; return false; } } @@ -1694,9 +1712,9 @@ bool DFA::AnalyzeSearch(SearchParams* params) { params->can_prefix_accel = true; if (ExtraDebug) - fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", - params->anchored, params->run_forward, flags, - DumpState(params->start).c_str(), params->can_prefix_accel); + absl::FPrintF(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", + params->anchored, params->run_forward, flags, + DumpState(params->start), params->can_prefix_accel); return true; } @@ -1709,7 +1727,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* 
params, StartInfo* info, if (start != NULL) return true; - MutexLock l(&mutex_); + absl::MutexLock l(&mutex_); start = info->start.load(std::memory_order_relaxed); if (start != NULL) return true; @@ -1728,14 +1746,9 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, } // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop. -bool DFA::Search(const StringPiece& text, - const StringPiece& context, - bool anchored, - bool want_earliest_match, - bool run_forward, - bool* failed, - const char** epp, - SparseSet* matches) { +bool DFA::Search(absl::string_view text, absl::string_view context, + bool anchored, bool want_earliest_match, bool run_forward, + bool* failed, const char** epp, SparseSet* matches) { *epp = NULL; if (!ok()) { *failed = true; @@ -1744,9 +1757,9 @@ bool DFA::Search(const StringPiece& text, *failed = false; if (ExtraDebug) { - fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); - fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", - std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_); + absl::FPrintF(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored()); + absl::FPrintF(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", + text, anchored, want_earliest_match, run_forward, kind_); } RWLocker l(&cache_mutex_); @@ -1770,7 +1783,7 @@ bool DFA::Search(const StringPiece& text, return true; } if (ExtraDebug) - fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); + absl::FPrintF(stderr, "start %s\n", DumpState(params.start)); bool ret = FastSearchLoop(¶ms); if (params.failed) { *failed = true; @@ -1789,17 +1802,17 @@ DFA* Prog::GetDFA(MatchKind kind) { // "longest match" DFA, because RE2 never does reverse // "first match" searches. 
if (kind == kFirstMatch) { - std::call_once(dfa_first_once_, [](Prog* prog) { + absl::call_once(dfa_first_once_, [](Prog* prog) { prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2); }, this); return dfa_first_; } else if (kind == kManyMatch) { - std::call_once(dfa_first_once_, [](Prog* prog) { + absl::call_once(dfa_first_once_, [](Prog* prog) { prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_); }, this); return dfa_first_; } else { - std::call_once(dfa_longest_once_, [](Prog* prog) { + absl::call_once(dfa_longest_once_, [](Prog* prog) { if (!prog->reversed_) prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2); else @@ -1823,12 +1836,11 @@ void Prog::DeleteDFA(DFA* dfa) { // // This is the only external interface (class DFA only exists in this file). // -bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, - Anchor anchor, MatchKind kind, StringPiece* match0, +bool Prog::SearchDFA(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, absl::string_view* match0, bool* failed, SparseSet* matches) { *failed = false; - StringPiece context = const_context; if (context.data() == NULL) context = text; bool caret = anchor_start(); @@ -1889,10 +1901,10 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, if (match0) { if (reversed_) *match0 = - StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep)); + absl::string_view(ep, static_cast<size_t>(text.data() + text.size() - ep)); else *match0 = - StringPiece(text.data(), static_cast<size_t>(ep - text.data())); + absl::string_view(text.data(), static_cast<size_t>(ep - text.data())); } return true; } @@ -1905,7 +1917,7 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { // Pick out start state for unanchored search // at beginning of text. 
RWLocker l(&cache_mutex_); - SearchParams params(StringPiece(), StringPiece(), &l); + SearchParams params(absl::string_view(), absl::string_view(), &l); params.anchored = false; if (!AnalyzeSearch(¶ms) || params.start == NULL || @@ -1915,7 +1927,7 @@ int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { // Add start state to work queue. // Note that any State* that we handle here must point into the cache, // so we can simply depend on pointer-as-a-number hashing and equality. - std::unordered_map<State*, int> m; + absl::flat_hash_map<State*, int> m; std::deque<State*> q; m.emplace(params.start, static_cast<int>(m.size())); q.push_back(params.start); @@ -1989,11 +2001,11 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { // Also note that previously_visited_states[UnseenStatePtr] will, in the STL // tradition, implicitly insert a '0' value at first use. We take advantage // of that property below. - std::unordered_map<State*, int> previously_visited_states; + absl::flat_hash_map<State*, int> previously_visited_states; // Pick out start state for anchored search at beginning of text. RWLocker l(&cache_mutex_); - SearchParams params(StringPiece(), StringPiece(), &l); + SearchParams params(absl::string_view(), absl::string_view(), &l); params.anchored = true; if (!AnalyzeSearch(¶ms)) return false; @@ -2033,7 +2045,7 @@ bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) { // Build minimum prefix. 
State* s = params.start; min->clear(); - MutexLock lock(&mutex_); + absl::MutexLock lock(&mutex_); for (int i = 0; i < maxlen; i++) { if (previously_visited_states[s] > kMaxEltRepetitions) break; diff --git a/re2/filtered_re2.cc b/re2/filtered_re2.cc index 5df9745..49cf686 100644 --- a/re2/filtered_re2.cc +++ b/re2/filtered_re2.cc @@ -8,7 +8,6 @@ #include <string> #include <utility> -#include "util/util.h" #include "util/logging.h" #include "re2/prefilter.h" #include "re2/prefilter_tree.h" @@ -46,7 +45,7 @@ FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { return *this; } -RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, +RE2::ErrorCode FilteredRE2::Add(absl::string_view pattern, const RE2::Options& options, int* id) { RE2* re = new RE2(pattern, options); RE2::ErrorCode code = re->error_code(); @@ -85,14 +84,14 @@ void FilteredRE2::Compile(std::vector<std::string>* atoms) { compiled_ = true; } -int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { +int FilteredRE2::SlowFirstMatch(absl::string_view text) const { for (size_t i = 0; i < re2_vec_.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[i])) return static_cast<int>(i); return -1; } -int FilteredRE2::FirstMatch(const StringPiece& text, +int FilteredRE2::FirstMatch(absl::string_view text, const std::vector<int>& atoms) const { if (!compiled_) { LOG(DFATAL) << "FirstMatch called before Compile."; @@ -106,10 +105,9 @@ int FilteredRE2::FirstMatch(const StringPiece& text, return -1; } -bool FilteredRE2::AllMatches( - const StringPiece& text, - const std::vector<int>& atoms, - std::vector<int>* matching_regexps) const { +bool FilteredRE2::AllMatches(absl::string_view text, + const std::vector<int>& atoms, + std::vector<int>* matching_regexps) const { matching_regexps->clear(); std::vector<int> regexps; prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); @@ -119,9 +117,8 @@ bool FilteredRE2::AllMatches( return !matching_regexps->empty(); } -void FilteredRE2::AllPotentials( - const 
std::vector<int>& atoms, - std::vector<int>* potential_regexps) const { +void FilteredRE2::AllPotentials(const std::vector<int>& atoms, + std::vector<int>* potential_regexps) const { prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps); } diff --git a/re2/filtered_re2.h b/re2/filtered_re2.h index dd618c7..a9abd69 100644 --- a/re2/filtered_re2.h +++ b/re2/filtered_re2.h @@ -25,6 +25,7 @@ #include <string> #include <vector> +#include "absl/strings/string_view.h" #include "re2/re2.h" namespace re2 { @@ -47,7 +48,7 @@ class FilteredRE2 { // Uses RE2 constructor to create a RE2 object (re). Returns // re->error_code(). If error_code is other than NoError, then re is // deleted and not added to re2_vec_. - RE2::ErrorCode Add(const StringPiece& pattern, + RE2::ErrorCode Add(absl::string_view pattern, const RE2::Options& options, int* id); @@ -63,17 +64,17 @@ class FilteredRE2 { // Returns -1 on no match. Can be called prior to Compile. // Does not do any filtering: simply tries to Match the // regexps in a loop. - int SlowFirstMatch(const StringPiece& text) const; + int SlowFirstMatch(absl::string_view text) const; // Returns the index of the first matching regexp. // Returns -1 on no match. Compile has to be called before // calling this. - int FirstMatch(const StringPiece& text, + int FirstMatch(absl::string_view text, const std::vector<int>& atoms) const; // Returns the indices of all matching regexps, after first clearing // matched_regexps. - bool AllMatches(const StringPiece& text, + bool AllMatches(absl::string_view text, const std::vector<int>& atoms, std::vector<int>* matching_regexps) const; diff --git a/re2/fuzzing/compiler-rt/LICENSE b/re2/fuzzing/compiler-rt/LICENSE deleted file mode 100644 index f9dc506..0000000 --- a/re2/fuzzing/compiler-rt/LICENSE +++ /dev/null @@ -1,219 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ---- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
- -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. - diff --git a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h deleted file mode 100644 index 71cb427..0000000 --- a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h +++ /dev/null @@ -1,397 +0,0 @@ -//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// A single header library providing an utility class to break up an array of -// bytes. Whenever run on the same input, provides the same output, as long as -// its methods are called in the same order, with the same arguments. 
-//===----------------------------------------------------------------------===// - -#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ -#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ - -#include <algorithm> -#include <array> -#include <climits> -#include <cstddef> -#include <cstdint> -#include <cstring> -#include <initializer_list> -#include <limits> -#include <string> -#include <type_traits> -#include <utility> -#include <vector> - -// In addition to the comments below, the API is also briefly documented at -// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider -class FuzzedDataProvider { - public: - // |data| is an array of length |size| that the FuzzedDataProvider wraps to - // provide more granular access. |data| must outlive the FuzzedDataProvider. - FuzzedDataProvider(const uint8_t *data, size_t size) - : data_ptr_(data), remaining_bytes_(size) {} - ~FuzzedDataProvider() = default; - - // See the implementation below (after the class definition) for more verbose - // comments for each of the methods. - - // Methods returning std::vector of bytes. These are the most popular choice - // when splitting fuzzing input into pieces, as every piece is put into a - // separate buffer (i.e. ASan would catch any under-/overflow) and the memory - // will be released automatically. - template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes); - template <typename T> - std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0); - template <typename T> std::vector<T> ConsumeRemainingBytes(); - - // Methods returning strings. Use only when you need a std::string or a null - // terminated C-string. Otherwise, prefer the methods returning std::vector. - std::string ConsumeBytesAsString(size_t num_bytes); - std::string ConsumeRandomLengthString(size_t max_length); - std::string ConsumeRandomLengthString(); - std::string ConsumeRemainingBytesAsString(); - - // Methods returning integer values. 
- template <typename T> T ConsumeIntegral(); - template <typename T> T ConsumeIntegralInRange(T min, T max); - - // Methods returning floating point values. - template <typename T> T ConsumeFloatingPoint(); - template <typename T> T ConsumeFloatingPointInRange(T min, T max); - - // 0 <= return value <= 1. - template <typename T> T ConsumeProbability(); - - bool ConsumeBool(); - - // Returns a value chosen from the given enum. - template <typename T> T ConsumeEnum(); - - // Returns a value from the given array. - template <typename T, size_t size> T PickValueInArray(const T (&array)[size]); - template <typename T, size_t size> - T PickValueInArray(const std::array<T, size> &array); - template <typename T> T PickValueInArray(std::initializer_list<const T> list); - - // Writes data to the given destination and returns number of bytes written. - size_t ConsumeData(void *destination, size_t num_bytes); - - // Reports the remaining bytes available for fuzzed input. - size_t remaining_bytes() { return remaining_bytes_; } - - private: - FuzzedDataProvider(const FuzzedDataProvider &) = delete; - FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - - void CopyAndAdvance(void *destination, size_t num_bytes); - - void Advance(size_t num_bytes); - - template <typename T> - std::vector<T> ConsumeBytes(size_t size, size_t num_bytes); - - template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value); - - const uint8_t *data_ptr_; - size_t remaining_bytes_; -}; - -// Returns a std::vector containing |num_bytes| of input data. If fewer than -// |num_bytes| of data remain, returns a shorter std::vector containing all -// of the data that's left. Can be used with any byte sized type, such as -// char, unsigned char, uint8_t, etc. 
-template <typename T> -std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - return ConsumeBytes<T>(num_bytes, num_bytes); -} - -// Similar to |ConsumeBytes|, but also appends the terminator value at the end -// of the resulting vector. Useful, when a mutable null-terminated C-string is -// needed, for example. But that is a rare case. Better avoid it, if possible, -// and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. -template <typename T> -std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes, - T terminator) { - num_bytes = std::min(num_bytes, remaining_bytes_); - std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes); - result.back() = terminator; - return result; -} - -// Returns a std::vector containing all remaining bytes of the input data. -template <typename T> -std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes() { - return ConsumeBytes<T>(remaining_bytes_); -} - -// Returns a std::string containing |num_bytes| of input data. Using this and -// |.c_str()| on the resulting string is the best way to get an immutable -// null-terminated C string. If fewer than |num_bytes| of data remain, returns -// a shorter std::string containing all of the data that's left. -inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) { - static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), - "ConsumeBytesAsString cannot convert the data to a string."); - - num_bytes = std::min(num_bytes, remaining_bytes_); - std::string result( - reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes); - Advance(num_bytes); - return result; -} - -// Returns a std::string of length from 0 to |max_length|. When it runs out of -// input data, returns what remains of the input. 
Designed to be more stable -// with respect to a fuzzer inserting characters than just picking a random -// length and then consuming that many bytes with |ConsumeBytes|. -inline std::string -FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) { - // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" - // followed by anything else to the end of the string. As a result of this - // logic, a fuzzer can insert characters into the string, and the string - // will be lengthened to include those new characters, resulting in a more - // stable fuzzer than picking the length of a string independently from - // picking its contents. - std::string result; - - // Reserve the anticipated capaticity to prevent several reallocations. - result.reserve(std::min(max_length, remaining_bytes_)); - for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { - char next = ConvertUnsignedToSigned<char>(data_ptr_[0]); - Advance(1); - if (next == '\\' && remaining_bytes_ != 0) { - next = ConvertUnsignedToSigned<char>(data_ptr_[0]); - Advance(1); - if (next != '\\') - break; - } - result += next; - } - - result.shrink_to_fit(); - return result; -} - -// Returns a std::string of length from 0 to |remaining_bytes_|. -inline std::string FuzzedDataProvider::ConsumeRandomLengthString() { - return ConsumeRandomLengthString(remaining_bytes_); -} - -// Returns a std::string containing all remaining bytes of the input data. -// Prefer using |ConsumeRemainingBytes| unless you actually need a std::string -// object. -inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() { - return ConsumeBytesAsString(remaining_bytes_); -} - -// Returns a number in the range [Type's min, Type's max]. The value might -// not be uniformly distributed in the given range. If there's no input data -// left, always returns |min|. 
-template <typename T> T FuzzedDataProvider::ConsumeIntegral() { - return ConsumeIntegralInRange(std::numeric_limits<T>::min(), - std::numeric_limits<T>::max()); -} - -// Returns a number in the range [min, max] by consuming bytes from the -// input data. The value might not be uniformly distributed in the given -// range. If there's no input data left, always returns |min|. |min| must -// be less than or equal to |max|. -template <typename T> -T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) { - static_assert(std::is_integral<T>::value, "An integral type is required."); - static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); - - if (min > max) - abort(); - - // Use the biggest type possible to hold the range and the result. - uint64_t range = static_cast<uint64_t>(max) - min; - uint64_t result = 0; - size_t offset = 0; - - while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && - remaining_bytes_ != 0) { - // Pull bytes off the end of the seed data. Experimentally, this seems to - // allow the fuzzer to more easily explore the input space. This makes - // sense, since it works by modifying inputs that caused new code to run, - // and this data is often used to encode length of data read by - // |ConsumeBytes|. Separating out read lengths makes it easier modify the - // contents of the data that is actually read. - --remaining_bytes_; - result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; - offset += CHAR_BIT; - } - - // Avoid division by 0, in case |range + 1| results in overflow. - if (range != std::numeric_limits<decltype(range)>::max()) - result = result % (range + 1); - - return static_cast<T>(min + result); -} - -// Returns a floating point value in the range [Type's lowest, Type's max] by -// consuming bytes from the input data. If there's no input data left, always -// returns approximately 0. 
-template <typename T> T FuzzedDataProvider::ConsumeFloatingPoint() { - return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(), - std::numeric_limits<T>::max()); -} - -// Returns a floating point value in the given range by consuming bytes from -// the input data. If there's no input data left, returns |min|. Note that -// |min| must be less than or equal to |max|. -template <typename T> -T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) { - if (min > max) - abort(); - - T range = .0; - T result = min; - constexpr T zero(.0); - if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) { - // The diff |max - min| would overflow the given floating point type. Use - // the half of the diff as the range and consume a bool to decide whether - // the result is in the first of the second part of the diff. - range = (max / 2.0) - (min / 2.0); - if (ConsumeBool()) { - result += range; - } - } else { - range = max - min; - } - - return result + range * ConsumeProbability<T>(); -} - -// Returns a floating point number in the range [0.0, 1.0]. If there's no -// input data left, always returns 0. -template <typename T> T FuzzedDataProvider::ConsumeProbability() { - static_assert(std::is_floating_point<T>::value, - "A floating point type is required."); - - // Use different integral types for different floating point types in order - // to provide better density of the resulting values. - using IntegralType = - typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, - uint64_t>::type; - - T result = static_cast<T>(ConsumeIntegral<IntegralType>()); - result /= static_cast<T>(std::numeric_limits<IntegralType>::max()); - return result; -} - -// Reads one byte and returns a bool, or false when no data remains. -inline bool FuzzedDataProvider::ConsumeBool() { - return 1 & ConsumeIntegral<uint8_t>(); -} - -// Returns an enum value. The enum must start at 0 and be contiguous. 
It must -// also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: -// enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; -template <typename T> T FuzzedDataProvider::ConsumeEnum() { - static_assert(std::is_enum<T>::value, "|T| must be an enum type."); - return static_cast<T>( - ConsumeIntegralInRange<uint32_t>(0, static_cast<uint32_t>(T::kMaxValue))); -} - -// Returns a copy of the value selected from the given fixed-size |array|. -template <typename T, size_t size> -T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) { - static_assert(size > 0, "The array must be non empty."); - return array[ConsumeIntegralInRange<size_t>(0, size - 1)]; -} - -template <typename T, size_t size> -T FuzzedDataProvider::PickValueInArray(const std::array<T, size> &array) { - static_assert(size > 0, "The array must be non empty."); - return array[ConsumeIntegralInRange<size_t>(0, size - 1)]; -} - -template <typename T> -T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list) { - // TODO(Dor1s): switch to static_assert once C++14 is allowed. - if (!list.size()) - abort(); - - return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1)); -} - -// Writes |num_bytes| of input data to the given destination pointer. If there -// is not enough data left, writes all remaining bytes. Return value is the -// number of bytes written. -// In general, it's better to avoid using this function, but it may be useful -// in cases when it's necessary to fill a certain buffer or object with -// fuzzing data. -inline size_t FuzzedDataProvider::ConsumeData(void *destination, - size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - CopyAndAdvance(destination, num_bytes); - return num_bytes; -} - -// Private methods. 
-inline void FuzzedDataProvider::CopyAndAdvance(void *destination, - size_t num_bytes) { - std::memcpy(destination, data_ptr_, num_bytes); - Advance(num_bytes); -} - -inline void FuzzedDataProvider::Advance(size_t num_bytes) { - if (num_bytes > remaining_bytes_) - abort(); - - data_ptr_ += num_bytes; - remaining_bytes_ -= num_bytes; -} - -template <typename T> -std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) { - static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); - - // The point of using the size-based constructor below is to increase the - // odds of having a vector object with capacity being equal to the length. - // That part is always implementation specific, but at least both libc++ and - // libstdc++ allocate the requested number of bytes in that constructor, - // which seems to be a natural choice for other implementations as well. - // To increase the odds even more, we also call |shrink_to_fit| below. - std::vector<T> result(size); - if (size == 0) { - if (num_bytes != 0) - abort(); - return result; - } - - CopyAndAdvance(result.data(), num_bytes); - - // Even though |shrink_to_fit| is also implementation specific, we expect it - // to provide an additional assurance in case vector's constructor allocated - // a buffer which is larger than the actual amount of data we put inside it. - result.shrink_to_fit(); - return result; -} - -template <typename TS, typename TU> -TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) { - static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); - static_assert(!std::numeric_limits<TU>::is_signed, - "Source type must be unsigned."); - - // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. - if (std::numeric_limits<TS>::is_modulo) - return static_cast<TS>(value); - - // Avoid using implementation-defined unsigned to signed conversions. - // To learn more, see https://stackoverflow.com/questions/13150449. 
- if (value <= std::numeric_limits<TS>::max()) { - return static_cast<TS>(value); - } else { - constexpr auto TS_min = std::numeric_limits<TS>::min(); - return TS_min + static_cast<TS>(value - TS_min); - } -} - -#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc index 3082a76..9a7af08 100644 --- a/re2/fuzzing/re2_fuzzer.cc +++ b/re2/fuzzing/re2_fuzzer.cc @@ -9,12 +9,12 @@ #include <string> #include <vector> +#include "re2/filtered_re2.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "re2/set.h" #include "re2/walker-inl.h" -using re2::StringPiece; - // NOT static, NOT signed. uint8_t dummy = 0; @@ -95,8 +95,8 @@ class SubstringWalker : public re2::Regexp::Walker<int> { SubstringWalker& operator=(const SubstringWalker&) = delete; }; -void TestOneInput(StringPiece pattern, const RE2::Options& options, - StringPiece text) { +void TestOneInput(absl::string_view pattern, const RE2::Options& options, + RE2::Anchor anchor, absl::string_view text) { // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. // Otherwise, we will waste time on inputs that have long runs of various // character classes. The fuzzer has shown itself to be easily capable of @@ -105,7 +105,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, // counted repetition is involved - whereas the marginal benefit is zero. // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become // three-element character classes when case-insensitive and using UTF-8. - // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. + // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain. int char_class = 0; int backslash_p = 0; // very expensive, so handle specially for (size_t i = 0; i < pattern.size(); i++) { @@ -131,6 +131,9 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, if (backslash_p > 1) return; + // Iterate just once when fuzzing. 
Otherwise, we easily get bogged down + // and coverage is unlikely to improve despite significant expense. + RE2::FUZZING_ONLY_set_maximum_global_replace_count(1); // The default is 1000. Even 100 turned out to be too generous // for fuzzing, empirically speaking, so let's try 10 instead. re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10); @@ -173,7 +176,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, if (re.NumberOfCapturingGroups() == 0) { // Avoid early return due to too many arguments. - StringPiece sp = text; + absl::string_view sp = text; RE2::FullMatch(sp, re); RE2::PartialMatch(sp, re); RE2::Consume(&sp, re); @@ -182,7 +185,7 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, } else { // Okay, we have at least one capturing group... // Try conversion for variously typed arguments. - StringPiece sp = text; + absl::string_view sp = text; short s; RE2::FullMatch(sp, re, &s); long l; @@ -206,6 +209,30 @@ void TestOneInput(StringPiece pattern, const RE2::Options& options, dummy += re.NamedCapturingGroups().size(); dummy += re.CapturingGroupNames().size(); dummy += RE2::QuoteMeta(pattern).size(); + dummy += re.Regexp()->ToString().size(); + + RE2::Set set(options, anchor); + int index = set.Add(pattern, /*error=*/NULL); // -1 on error + if (index != -1 && set.Compile()) { + std::vector<int> matches; + set.Match(text, &matches); + } + + re2::FilteredRE2 filter; + index = -1; // not clobbered on error + filter.Add(pattern, options, &index); + if (index != -1) { + std::vector<std::string> atoms; + filter.Compile(&atoms); + // Pretend that all atoms match, which + // triggers the AND-OR tree maximally. + std::vector<int> matched_atoms; + matched_atoms.reserve(atoms.size()); + for (size_t i = 0; i < atoms.size(); ++i) + matched_atoms.push_back(static_cast<int>(i)); + std::vector<int> matches; + filter.AllMatches(text, matched_atoms, &matches); + } } // Entry point for libFuzzer. 
@@ -239,9 +266,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { options.set_word_boundary(fdp.ConsumeBool()); options.set_one_line(fdp.ConsumeBool()); + // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify + // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead. + RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({ + RE2::UNANCHORED, + RE2::ANCHOR_START, + RE2::ANCHOR_BOTH, + }); + std::string pattern = fdp.ConsumeRandomLengthString(999); std::string text = fdp.ConsumeRandomLengthString(999); - TestOneInput(pattern, options, text); + TestOneInput(pattern, options, anchor, text); return 0; } diff --git a/re2/mimics_pcre.cc b/re2/mimics_pcre.cc index b1d6a51..ac0c69d 100644 --- a/re2/mimics_pcre.cc +++ b/re2/mimics_pcre.cc @@ -22,7 +22,6 @@ // // Regexp::MimicsPCRE checks for any of these conditions. -#include "util/util.h" #include "util/logging.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -32,8 +32,8 @@ #include <utility> #include <vector> +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -60,9 +60,8 @@ class NFA { // Submatch[0] is the entire match. When there is a choice in // which text matches each subexpression, the submatch boundaries // are chosen to match what a backtracking implementation would choose. - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch); + bool Search(absl::string_view text, absl::string_view context, bool anchored, + bool longest, absl::string_view* submatch, int nsubmatch); private: struct Thread { @@ -92,7 +91,7 @@ class NFA { // Enqueues only the ByteRange instructions that match byte c. // context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. 
- void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, + void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context, const char* p, Thread* t0); // Run runq on byte c, appending new states to nextq. @@ -102,7 +101,7 @@ class NFA { // p-1 will be used when processing Match instructions. // Frees all the threads on runq. // If there is a shortcut to the end, returns that shortcut. - int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, + int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context, const char* p); // Returns text version of capture information, for debugging. @@ -192,7 +191,7 @@ void NFA::Decref(Thread* t) { // Enqueues only the ByteRange instructions that match byte c. // context is used (with p) for evaluating empty-width specials. // p is the current input position, and t0 is the current thread. -void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, +void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context, const char* p, Thread* t0) { if (id0 == 0) return; @@ -225,7 +224,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, continue; if (q->has_index(id)) { if (ExtraDebug) - fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str()); + absl::FPrintF(stderr, " [%d%s]\n", id, FormatCapture(t0->capture)); continue; } @@ -288,7 +287,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, t = Incref(t0); *tp = t; if (ExtraDebug) - fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str()); + absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture)); if (ip->hint() == 0) break; @@ -300,7 +299,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, t = Incref(t0); *tp = t; if (ExtraDebug) - fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str()); + absl::FPrintF(stderr, " ! 
%d%s\n", id, FormatCapture(t0->capture)); Next: if (ip->last()) @@ -328,7 +327,7 @@ void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, // p-1 will be used when processing Match instructions. // Frees all the threads on runq. // If there is a shortcut to the end, returns that shortcut. -int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, +int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context, const char* p) { nextq->clear(); @@ -435,23 +434,22 @@ std::string NFA::FormatCapture(const char** capture) { if (capture[i] == NULL) s += "(?,?)"; else if (capture[i+1] == NULL) - s += StringPrintf("(%td,?)", - capture[i] - btext_); + s += absl::StrFormat("(%d,?)", + capture[i] - btext_); else - s += StringPrintf("(%td,%td)", - capture[i] - btext_, - capture[i+1] - btext_); + s += absl::StrFormat("(%d,%d)", + capture[i] - btext_, + capture[i+1] - btext_); } return s; } -bool NFA::Search(const StringPiece& text, const StringPiece& const_context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch) { +bool NFA::Search(absl::string_view text, absl::string_view context, + bool anchored, bool longest, absl::string_view* submatch, + int nsubmatch) { if (start_ == 0) return false; - StringPiece context = const_context; if (context.data() == NULL) context = text; @@ -497,8 +495,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, etext_ = text.data() + text.size(); if (ExtraDebug) - fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", - std::string(text).c_str(), std::string(context).c_str(), anchored, longest); + absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", + text, context, anchored, longest); // Set up search. 
Threadq* runq = &q0_; @@ -517,14 +515,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, else if (p < etext_) c = p[0] & 0xFF; - fprintf(stderr, "%c:", c); + absl::FPrintF(stderr, "%c:", c); for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { Thread* t = i->value(); if (t == NULL) continue; - fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str()); + absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture)); } - fprintf(stderr, "\n"); + absl::FPrintF(stderr, "\n"); } // This is a no-op the first time around the loop because runq is empty. @@ -592,7 +590,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, // If all the threads have died, stop early. if (runq->size() == 0) { if (ExtraDebug) - fprintf(stderr, "dead\n"); + absl::FPrintF(stderr, "dead\n"); break; } @@ -616,27 +614,26 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (matched_) { for (int i = 0; i < nsubmatch; i++) - submatch[i] = - StringPiece(match_[2 * i], - static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); + submatch[i] = absl::string_view( + match_[2 * i], + static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); if (ExtraDebug) - fprintf(stderr, "match (%td,%td)\n", - match_[0] - btext_, - match_[1] - btext_); + absl::FPrintF(stderr, "match (%d,%d)\n", + match_[0] - btext_, + match_[1] - btext_); return true; } return false; } -bool -Prog::SearchNFA(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch) { +bool Prog::SearchNFA(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, absl::string_view* match, + int nmatch) { if (ExtraDebug) Dump(); NFA nfa(this); - StringPiece sp; + absl::string_view sp; if (kind == kFullMatch) { anchor = kAnchored; if (nmatch == 0) { diff --git a/re2/onepass.cc b/re2/onepass.cc index 2639746..7931cf9 100644 --- a/re2/onepass.cc +++ 
b/re2/onepass.cc @@ -57,14 +57,14 @@ #include <string> #include <vector> -#include "util/util.h" +#include "absl/container/fixed_array.h" +#include "absl/container/inlined_vector.h" +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "util/utf.h" #include "re2/pod_array.h" #include "re2/prog.h" #include "re2/sparse_set.h" -#include "re2/stringpiece.h" // Silence "zero-sized array in struct/union" warning for OneState::action. #ifdef _MSC_VER @@ -189,7 +189,7 @@ void OnePass_Checks() { "kMaxCap disagrees with kMaxOnePassCapture"); } -static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) { +static bool Satisfy(uint32_t cond, absl::string_view context, const char* p) { uint32_t satisfied = Prog::EmptyFlags(context, p); if (cond & kEmptyAllFlags & ~satisfied) return false; @@ -211,10 +211,9 @@ static inline OneState* IndexToNode(uint8_t* nodes, int statesize, return reinterpret_cast<OneState*>(nodes + statesize*nodeindex); } -bool Prog::SearchOnePass(const StringPiece& text, - const StringPiece& const_context, +bool Prog::SearchOnePass(absl::string_view text, absl::string_view context, Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch) { + absl::string_view* match, int nmatch) { if (anchor != kAnchored && kind != kFullMatch) { LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; return false; @@ -234,7 +233,6 @@ bool Prog::SearchOnePass(const StringPiece& text, for (int i = 0; i < ncap; i++) matchcap[i] = NULL; - StringPiece context = const_context; if (context.data() == NULL) context = text; if (anchor_start() && BeginPtr(context) != BeginPtr(text)) @@ -339,13 +337,12 @@ done: if (!matched) return false; for (int i = 0; i < nmatch; i++) - match[i] = - StringPiece(matchcap[2 * i], - static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i])); + match[i] = absl::string_view( + matchcap[2 * i], + static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i])); return true; } - 
// Analysis to determine whether a given regexp program is one-pass. // If ip is not on workq, adds ip to work queue and returns true. @@ -404,16 +401,17 @@ bool Prog::IsOnePass() { int stacksize = inst_count(kInstCapture) + inst_count(kInstEmptyWidth) + inst_count(kInstNop) + 1; // + 1 for start inst - PODArray<InstCond> stack(stacksize); + absl::FixedArray<InstCond, 64> stack_storage(stacksize); + InstCond* stack = stack_storage.data(); int size = this->size(); - PODArray<int> nodebyid(size); // indexed by ip - memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]); + absl::FixedArray<int, 128> nodebyid_storage(size, -1); // indexed by ip + int* nodebyid = nodebyid_storage.data(); // Originally, nodes was a uint8_t[maxnodes*statesize], but that was // unnecessarily optimistic: why allocate a large amount of memory // upfront for a large program when it is unlikely to be one-pass? - std::vector<uint8_t> nodes; + absl::InlinedVector<uint8_t, 2048> nodes; Instq tovisit(size), workq(size); AddQ(&tovisit, start()); @@ -462,7 +460,7 @@ bool Prog::IsOnePass() { if (nextindex == -1) { if (nalloc >= maxnodes) { if (ExtraDebug) - LOG(ERROR) << StringPrintf( + LOG(ERROR) << absl::StrFormat( "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes); goto fail; } @@ -487,7 +485,7 @@ bool Prog::IsOnePass() { node->action[b] = newact; } else if (act != newact) { if (ExtraDebug) - LOG(ERROR) << StringPrintf( + LOG(ERROR) << absl::StrFormat( "Not OnePass: conflict on byte %#x at state %d", c, *it); goto fail; } @@ -508,7 +506,7 @@ bool Prog::IsOnePass() { node->action[b] = newact; } else if (act != newact) { if (ExtraDebug) - LOG(ERROR) << StringPrintf( + LOG(ERROR) << absl::StrFormat( "Not OnePass: conflict on byte %#x at state %d", c, *it); goto fail; } @@ -549,7 +547,7 @@ bool Prog::IsOnePass() { // If already on work queue, (1) is violated: bail out. 
if (!AddQ(&workq, ip->out())) { if (ExtraDebug) - LOG(ERROR) << StringPrintf( + LOG(ERROR) << absl::StrFormat( "Not OnePass: multiple paths %d -> %d", *it, ip->out()); goto fail; } @@ -560,7 +558,7 @@ bool Prog::IsOnePass() { if (matched) { // (3) is violated if (ExtraDebug) - LOG(ERROR) << StringPrintf( + LOG(ERROR) << absl::StrFormat( "Not OnePass: multiple matches from %d", *it); goto fail; } @@ -597,15 +595,15 @@ bool Prog::IsOnePass() { if (nodeindex == -1) continue; OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); - dump += StringPrintf("node %d id=%d: matchcond=%#x\n", - nodeindex, id, node->matchcond); + dump += absl::StrFormat("node %d id=%d: matchcond=%#x\n", + nodeindex, id, node->matchcond); for (int i = 0; i < bytemap_range_; i++) { if ((node->action[i] & kImpossible) == kImpossible) continue; - dump += StringPrintf(" %d cond %#x -> %d id=%d\n", - i, node->action[i] & 0xFFFF, - node->action[i] >> kIndexShift, - idmap[node->action[i] >> kIndexShift]); + dump += absl::StrFormat(" %d cond %#x -> %d id=%d\n", + i, node->action[i] & 0xFFFF, + node->action[i] >> kIndexShift, + idmap[node->action[i] >> kIndexShift]); } } LOG(ERROR) << "nodes:\n" << dump; diff --git a/re2/parse.cc b/re2/parse.cc index 85f16f0..655cb9a 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -25,13 +25,12 @@ #include <string> #include <vector> -#include "util/util.h" +#include "absl/base/macros.h" +#include "absl/strings/ascii.h" #include "util/logging.h" -#include "util/strutil.h" #include "util/utf.h" #include "re2/pod_array.h" #include "re2/regexp.h" -#include "re2/stringpiece.h" #include "re2/unicode_casefold.h" #include "re2/unicode_groups.h" #include "re2/walker-inl.h" @@ -70,7 +69,7 @@ void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) { class Regexp::ParseState { public: - ParseState(ParseFlags flags, const StringPiece& whole_regexp, + ParseState(ParseFlags flags, absl::string_view whole_regexp, RegexpStatus* status); ~ParseState(); @@ -107,18 +106,18 @@ 
class Regexp::ParseState { // Pushes a repeat operator regexp onto the stack. // A valid argument for the operator must already be on the stack. // s is the name of the operator, for use in error messages. - bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); + bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy); // Pushes a repetition regexp onto the stack. // A valid argument for the operator must already be on the stack. - bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); + bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy); // Checks whether a particular regexp op is a marker. bool IsMarker(RegexpOp op); // Processes a left parenthesis in the input. // Pushes a marker onto the stack. - bool DoLeftParen(const StringPiece& name); + bool DoLeftParen(absl::string_view name); bool DoLeftParenNoCapture(); // Processes a vertical bar in the input. @@ -142,24 +141,23 @@ class Regexp::ParseState { // Parse a character class into *out_re. // Removes parsed text from s. - bool ParseCharClass(StringPiece* s, Regexp** out_re, + bool ParseCharClass(absl::string_view* s, Regexp** out_re, RegexpStatus* status); // Parse a character class character into *rp. // Removes parsed text from s. - bool ParseCCCharacter(StringPiece* s, Rune *rp, - const StringPiece& whole_class, + bool ParseCCCharacter(absl::string_view* s, Rune* rp, + absl::string_view whole_class, RegexpStatus* status); // Parse a character class range into rr. // Removes parsed text from s. - bool ParseCCRange(StringPiece* s, RuneRange* rr, - const StringPiece& whole_class, + bool ParseCCRange(absl::string_view* s, RuneRange* rr, + absl::string_view whole_class, RegexpStatus* status); // Parse a Perl flag set or non-capturing group from s. - bool ParsePerlFlags(StringPiece* s); - + bool ParsePerlFlags(absl::string_view* s); // Finishes the current concatenation, // collapsing it into a single regexp on the stack. 
@@ -177,7 +175,7 @@ class Regexp::ParseState { private: ParseFlags flags_; - StringPiece whole_regexp_; + absl::string_view whole_regexp_; RegexpStatus* status_; Regexp* stacktop_; int ncap_; // number of capturing parens seen @@ -192,7 +190,7 @@ const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); Regexp::ParseState::ParseState(ParseFlags flags, - const StringPiece& whole_regexp, + absl::string_view whole_regexp, RegexpStatus* status) : flags_(flags), whole_regexp_(whole_regexp), status_(status), stacktop_(NULL), ncap_(0) { @@ -269,7 +267,7 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { // Searches the case folding tables and returns the CaseFold* that contains r. // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. // If there isn't one, returns NULL. -const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { +const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) { const CaseFold* ef = f + n; // Binary search for entry containing r. @@ -297,7 +295,7 @@ const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { } // Returns the result of applying the fold f to the rune r. 
-Rune ApplyFold(const CaseFold *f, Rune r) { +Rune ApplyFold(const CaseFold* f, Rune r) { switch (f->delta) { default: return r + f->delta; @@ -305,7 +303,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { case EvenOddSkip: // even <-> odd but only applies to every other if ((r - f->lo) % 2) return r; - FALLTHROUGH_INTENDED; + ABSL_FALLTHROUGH_INTENDED; case EvenOdd: // even <-> odd if (r%2 == 0) return r + 1; @@ -314,7 +312,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { case OddEvenSkip: // odd <-> even but only applies to every other if ((r - f->lo) % 2) return r; - FALLTHROUGH_INTENDED; + ABSL_FALLTHROUGH_INTENDED; case OddEven: // odd <-> even if (r%2 == 1) return r + 1; @@ -472,7 +470,7 @@ bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { // Pushes a repeat operator regexp onto the stack. // A valid argument for the operator must already be on the stack. // The char c is the name of the operator, for use in error messages. -bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, +bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy) { if (stacktop_ == NULL || IsMarker(stacktop_->op())) { status_->set_code(kRegexpRepeatArgument); @@ -565,8 +563,7 @@ int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { // Pushes a repetition regexp onto the stack. // A valid argument for the operator must already be on the stack. -bool Regexp::ParseState::PushRepetition(int min, int max, - const StringPiece& s, +bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s, bool nongreedy) { if ((max != -1 && max < min) || min > maximum_repeat_count || @@ -609,7 +606,7 @@ bool Regexp::ParseState::IsMarker(RegexpOp op) { // Processes a left parenthesis in the input. // Pushes a marker onto the stack. 
-bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { +bool Regexp::ParseState::DoLeftParen(absl::string_view name) { Regexp* re = new Regexp(kLeftParen, flags_); re->cap_ = ++ncap_; if (name.data() != NULL) @@ -774,8 +771,8 @@ Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { // Returns the leading string that re starts with. // The returned Rune* points into a piece of re, // so it must not be used after the caller calls re->Decref(). -Rune* Regexp::LeadingString(Regexp* re, int *nrune, - Regexp::ParseFlags *flags) { +Rune* Regexp::LeadingString(Regexp* re, int* nrune, + Regexp::ParseFlags* flags) { while (re->op() == kRegexpConcat && re->nsub() > 0) re = re->sub()[0]; @@ -806,7 +803,7 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { Regexp* stk[4]; size_t d = 0; while (re->op() == kRegexpConcat) { - if (d < arraysize(stk)) + if (d < ABSL_ARRAYSIZE(stk)) stk[d++] = re; re = re->sub()[0]; } @@ -1325,15 +1322,15 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { // Parses a decimal integer, storing it in *np. // Sets *s to span the remainder of the string. -static bool ParseInteger(StringPiece* s, int* np) { - if (s->empty() || !isdigit((*s)[0] & 0xFF)) +static bool ParseInteger(absl::string_view* s, int* np) { + if (s->empty() || !absl::ascii_isdigit((*s)[0] & 0xFF)) return false; // Disallow leading zeros. - if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) + if (s->size() >= 2 && (*s)[0] == '0' && absl::ascii_isdigit((*s)[1] & 0xFF)) return false; int n = 0; int c; - while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) { + while (!s->empty() && absl::ascii_isdigit(c = (*s)[0] & 0xFF)) { // Avoid overflow. if (n >= 100000000) return false; @@ -1351,10 +1348,10 @@ static bool ParseInteger(StringPiece* s, int* np) { // sets *hi to -1 to signify this. // {,2} is NOT a valid suffix. 
// The Maybe in the name signifies that the regexp parse -// doesn't fail even if ParseRepetition does, so the StringPiece +// doesn't fail even if ParseRepetition does, so the string_view // s must NOT be edited unless MaybeParseRepetition returns true. -static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { - StringPiece s = *sp; +static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) { + absl::string_view s = *sp; if (s.empty() || s[0] != '{') return false; s.remove_prefix(1); // '{' @@ -1385,12 +1382,13 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { return true; } -// Removes the next Rune from the StringPiece and stores it in *r. +// Removes the next Rune from the string_view and stores it in *r. // Returns number of bytes removed from sp. // Behaves as though there is a terminating NUL at the end of sp. // Argument order is backwards from usual Google style // but consistent with chartorune. -static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { +static int StringViewToRune(Rune* r, absl::string_view* sp, + RegexpStatus* status) { // fullrune() takes int, not size_t. However, it just looks // at the leading byte and treats any length >= 4 the same. if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) { @@ -1411,18 +1409,18 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { if (status != NULL) { status->set_code(kRegexpBadUTF8); - status->set_error_arg(StringPiece()); + status->set_error_arg(absl::string_view()); } return -1; } // Returns whether name is valid UTF-8. // If not, sets status to kRegexpBadUTF8. 
-static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { - StringPiece t = s; +static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) { + absl::string_view t = s; Rune r; while (!t.empty()) { - if (StringPieceToRune(&r, &t, status) < 0) + if (StringViewToRune(&r, &t, status) < 0) return false; } return true; @@ -1450,28 +1448,28 @@ static int UnHex(int c) { // Parse an escape sequence (e.g., \n, \{). // Sets *s to span the remainder of the string. // Sets *rp to the named character. -static bool ParseEscape(StringPiece* s, Rune* rp, +static bool ParseEscape(absl::string_view* s, Rune* rp, RegexpStatus* status, int rune_max) { const char* begin = s->data(); if (s->empty() || (*s)[0] != '\\') { // Should not happen - caller always checks. status->set_code(kRegexpInternalError); - status->set_error_arg(StringPiece()); + status->set_error_arg(absl::string_view()); return false; } if (s->size() == 1) { status->set_code(kRegexpTrailingBackslash); - status->set_error_arg(StringPiece()); + status->set_error_arg(absl::string_view()); return false; } Rune c, c1; s->remove_prefix(1); // backslash - if (StringPieceToRune(&c, s, status) < 0) + if (StringViewToRune(&c, s, status) < 0) return false; int code; switch (c) { default: - if (c < Runeself && !isalpha(c) && !isdigit(c)) { + if (c < Runeself && !absl::ascii_isalnum(c)) { // Escaped non-word characters are always themselves. // PCRE is not quite so rigorous: it accepts things like // \q, but we don't. We once rejected \_, but too many @@ -1492,7 +1490,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Single non-zero octal digit is a backreference; not supported. if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7') goto BadEscape; - FALLTHROUGH_INTENDED; + ABSL_FALLTHROUGH_INTENDED; case '0': // consume up to three octal digits; already have one. 
code = c - '0'; @@ -1516,7 +1514,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, case 'x': if (s->empty()) goto BadEscape; - if (StringPieceToRune(&c, s, status) < 0) + if (StringViewToRune(&c, s, status) < 0) return false; if (c == '{') { // Any number of digits in braces. @@ -1525,7 +1523,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Perl accepts any text at all; it ignores all text // after the first non-hex digit. We require only hex digits, // and at least one. - if (StringPieceToRune(&c, s, status) < 0) + if (StringViewToRune(&c, s, status) < 0) return false; int nhex = 0; code = 0; @@ -1536,7 +1534,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, goto BadEscape; if (s->empty()) goto BadEscape; - if (StringPieceToRune(&c, s, status) < 0) + if (StringViewToRune(&c, s, status) < 0) return false; } if (c != '}' || nhex == 0) @@ -1547,7 +1545,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Easy case: two hex digits. if (s->empty()) goto BadEscape; - if (StringPieceToRune(&c1, s, status) < 0) + if (StringViewToRune(&c1, s, status) < 0) return false; if (!IsHex(c) || !IsHex(c1)) goto BadEscape; @@ -1589,13 +1587,11 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // return true; } - LOG(DFATAL) << "Not reached in ParseEscape."; - BadEscape: // Unrecognized escape sequence. status->set_code(kRegexpBadEscape); status->set_error_arg( - StringPiece(begin, static_cast<size_t>(s->data() - begin))); + absl::string_view(begin, static_cast<size_t>(s->data() - begin))); return false; } @@ -1623,21 +1619,21 @@ void CharClassBuilder::AddRangeFlags( } // Look for a group with the given name. -static const UGroup* LookupGroup(const StringPiece& name, - const UGroup *groups, int ngroups) { +static const UGroup* LookupGroup(absl::string_view name, + const UGroup* groups, int ngroups) { // Simple name lookup. 
for (int i = 0; i < ngroups; i++) - if (StringPiece(groups[i].name) == name) + if (absl::string_view(groups[i].name) == name) return &groups[i]; return NULL; } // Look for a POSIX group with the given name (e.g., "[:^alpha:]") -static const UGroup* LookupPosixGroup(const StringPiece& name) { +static const UGroup* LookupPosixGroup(absl::string_view name) { return LookupGroup(name, posix_groups, num_posix_groups); } -static const UGroup* LookupPerlGroup(const StringPiece& name) { +static const UGroup* LookupPerlGroup(absl::string_view name) { return LookupGroup(name, perl_groups, num_perl_groups); } @@ -1648,16 +1644,16 @@ static URange32 any32[] = { { 65536, Runemax } }; static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; // Look for a Unicode group with the given name (e.g., "Han") -static const UGroup* LookupUnicodeGroup(const StringPiece& name) { +static const UGroup* LookupUnicodeGroup(absl::string_view name) { // Special case: "Any" means any. - if (name == StringPiece("Any")) + if (name == absl::string_view("Any")) return &anygroup; return LookupGroup(name, unicode_groups, num_unicode_groups); } #endif // Add a UGroup or its negation to the character class. -static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, +static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign, Regexp::ParseFlags parse_flags) { if (sign == +1) { for (int i = 0; i < g->nr16; i++) { @@ -1707,16 +1703,17 @@ static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, // not the Perl empty-string classes (\b \B \A \Z \z). // On success, sets *s to span the remainder of the string // and returns the corresponding UGroup. -// The StringPiece must *NOT* be edited unless the call succeeds. -const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { +// The string_view must *NOT* be edited unless the call succeeds. 
+const UGroup* MaybeParsePerlCCEscape(absl::string_view* s, + Regexp::ParseFlags parse_flags) { if (!(parse_flags & Regexp::PerlClasses)) return NULL; if (s->size() < 2 || (*s)[0] != '\\') return NULL; - // Could use StringPieceToRune, but there aren't + // Could use StringViewToRune, but there aren't // any non-ASCII Perl group names. - StringPiece name(s->data(), 2); - const UGroup *g = LookupPerlGroup(name); + absl::string_view name(s->data(), 2); + const UGroup* g = LookupPerlGroup(name); if (g == NULL) return NULL; s->remove_prefix(name.size()); @@ -1731,9 +1728,9 @@ enum ParseStatus { // Maybe parses a Unicode character group like \p{Han} or \P{Han} // (the latter is a negated group). -ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, - CharClassBuilder *cc, - RegexpStatus* status) { +ParseStatus ParseUnicodeGroup(absl::string_view* s, + Regexp::ParseFlags parse_flags, + CharClassBuilder* cc, RegexpStatus* status) { // Decide whether to parse. if (!(parse_flags & Regexp::UnicodeGroups)) return kParseNothing; @@ -1747,34 +1744,34 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, int sign = +1; // -1 = negated char class if (c == 'P') sign = -sign; - StringPiece seq = *s; // \p{Han} or \pL - StringPiece name; // Han or L + absl::string_view seq = *s; // \p{Han} or \pL + absl::string_view name; // Han or L s->remove_prefix(2); // '\\', 'p' - if (!StringPieceToRune(&c, s, status)) + if (!StringViewToRune(&c, s, status)) return kParseError; if (c != '{') { // Name is the bit of string we just skipped over for c. const char* p = seq.data() + 2; - name = StringPiece(p, static_cast<size_t>(s->data() - p)); + name = absl::string_view(p, static_cast<size_t>(s->data() - p)); } else { // Name is in braces. 
Look for closing } size_t end = s->find('}', 0); - if (end == StringPiece::npos) { + if (end == absl::string_view::npos) { if (!IsValidUTF8(seq, status)) return kParseError; status->set_code(kRegexpBadCharRange); status->set_error_arg(seq); return kParseError; } - name = StringPiece(s->data(), end); // without '}' + name = absl::string_view(s->data(), end); // without '}' s->remove_prefix(end + 1); // with '}' if (!IsValidUTF8(name, status)) return kParseError; } // Chop seq where s now begins. - seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data())); + seq = absl::string_view(seq.data(), static_cast<size_t>(s->data() - seq.data())); if (!name.empty() && name[0] == '^') { sign = -sign; @@ -1783,7 +1780,7 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, #if !defined(RE2_USE_ICU) // Look up the group in the RE2 Unicode data. - const UGroup *g = LookupUnicodeGroup(name); + const UGroup* g = LookupUnicodeGroup(name); if (g == NULL) { status->set_code(kRegexpBadCharRange); status->set_error_arg(seq); @@ -1821,9 +1818,9 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Parses a character class name like [:alnum:]. // Sets *s to span the remainder of the string. // Adds the ranges corresponding to the class to ranges. -static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, - CharClassBuilder *cc, - RegexpStatus* status) { +static ParseStatus ParseCCName(absl::string_view* s, + Regexp::ParseFlags parse_flags, + CharClassBuilder* cc, RegexpStatus* status) { // Check begins with [: const char* p = s->data(); const char* ep = s->data() + s->size(); @@ -1841,9 +1838,9 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, // Got it. Check that it's valid. 
q += 2; - StringPiece name(p, static_cast<size_t>(q - p)); + absl::string_view name(p, static_cast<size_t>(q - p)); - const UGroup *g = LookupPosixGroup(name); + const UGroup* g = LookupPosixGroup(name); if (g == NULL) { status->set_code(kRegexpBadCharRange); status->set_error_arg(name); @@ -1859,8 +1856,8 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, // There are fewer special characters here than in the rest of the regexp. // Sets *s to span the remainder of the string. // Sets *rp to the character. -bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, - const StringPiece& whole_class, +bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp, + absl::string_view whole_class, RegexpStatus* status) { if (s->empty()) { status->set_code(kRegexpMissingBracket); @@ -1874,7 +1871,7 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, return ParseEscape(s, rp, status, rune_max_); // Otherwise take the next rune. - return StringPieceToRune(rp, s, status) >= 0; + return StringViewToRune(rp, s, status) >= 0; } // Parses a character class character, or, if the character @@ -1882,10 +1879,10 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, // For single characters, rr->lo == rr->hi. // Sets *s to span the remainder of the string. // Sets *rp to the character. -bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, - const StringPiece& whole_class, +bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr, + absl::string_view whole_class, RegexpStatus* status) { - StringPiece os = *s; + absl::string_view os = *s; if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) return false; // [a-] means (a|-), so check for final ]. 
@@ -1895,8 +1892,8 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, return false; if (rr->hi < rr->lo) { status->set_code(kRegexpBadCharRange); - status->set_error_arg( - StringPiece(os.data(), static_cast<size_t>(s->data() - os.data()))); + status->set_error_arg(absl::string_view( + os.data(), static_cast<size_t>(s->data() - os.data()))); return false; } } else { @@ -1908,14 +1905,13 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, // Parses a possibly-negated character class expression like [^abx-z[:digit:]]. // Sets *s to span the remainder of the string. // Sets *out_re to the regexp for the class. -bool Regexp::ParseState::ParseCharClass(StringPiece* s, - Regexp** out_re, +bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re, RegexpStatus* status) { - StringPiece whole_class = *s; + absl::string_view whole_class = *s; if (s->empty() || (*s)[0] != '[') { // Caller checked this. status->set_code(kRegexpInternalError); - status->set_error_arg(StringPiece()); + status->set_error_arg(absl::string_view()); return false; } bool negated = false; @@ -1937,16 +1933,16 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, // Except that Perl allows - anywhere. if ((*s)[0] == '-' && !first && !(flags_&PerlX) && (s->size() == 1 || (*s)[1] != ']')) { - StringPiece t = *s; + absl::string_view t = *s; t.remove_prefix(1); // '-' Rune r; - int n = StringPieceToRune(&r, &t, status); + int n = StringViewToRune(&r, &t, status); if (n < 0) { re->Decref(); return false; } status->set_code(kRegexpBadCharRange); - status->set_error_arg(StringPiece(s->data(), 1+n)); + status->set_error_arg(absl::string_view(s->data(), 1+n)); re->Decref(); return false; } @@ -1981,7 +1977,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, } // Look for Perl character class symbols (extension). 
- const UGroup *g = MaybeParsePerlCCEscape(s, flags_); + const UGroup* g = MaybeParsePerlCCEscape(s, flags_); if (g != NULL) { AddUGroup(re->ccb_, g, g->sign, flags_); continue; @@ -2016,7 +2012,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, } // Returns whether name is a valid capture name. -static bool IsValidCaptureName(const StringPiece& name) { +static bool IsValidCaptureName(absl::string_view name) { if (name.empty()) return false; @@ -2030,17 +2026,17 @@ static bool IsValidCaptureName(const StringPiece& name) { // if they start doing that for capture names, we won't follow suit. static const CharClass* const cc = []() { CharClassBuilder ccb; - for (StringPiece group : + for (absl::string_view group : {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"}) AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups), +1, Regexp::NoParseFlags); return ccb.GetCharClass(); }(); - StringPiece t = name; + absl::string_view t = name; Rune r; while (!t.empty()) { - if (StringPieceToRune(&r, &t, NULL) < 0) + if (StringViewToRune(&r, &t, NULL) < 0) return false; if (cc->Contains(r)) continue; @@ -2054,18 +2050,16 @@ static bool IsValidCaptureName(const StringPiece& name) { // The caller must check that s begins with "(?". // Returns true on success. If the Perl flag is not // well-formed or not supported, sets status_ and returns false. -bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { - StringPiece t = *s; +bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) { + absl::string_view t = *s; // Caller is supposed to check this. if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { - LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; status_->set_code(kRegexpInternalError); + LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; return false; } - t.remove_prefix(2); // "(?" - // Check for named captures, first introduced in Python's regexp library. 
// As usual, there are three slightly different syntaxes: // @@ -2079,22 +2073,23 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { // support all three as well. EcmaScript 4 uses only the Python form. // // In both the open source world (via Code Search) and the - // Google source tree, (?P<expr>name) is the dominant form, - // so that's the one we implement. One is enough. - if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { + // Google source tree, (?P<name>expr) and (?<name>expr) are the + // dominant forms of named captures and both are supported. + if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') || + (t.size() > 3 && t[2] == '<')) { // Pull out name. - size_t end = t.find('>', 2); - if (end == StringPiece::npos) { - if (!IsValidUTF8(*s, status_)) + size_t begin = t[2] == 'P' ? 4 : 3; + size_t end = t.find('>', begin); + if (end == absl::string_view::npos) { + if (!IsValidUTF8(t, status_)) return false; status_->set_code(kRegexpBadNamedCapture); - status_->set_error_arg(*s); + status_->set_error_arg(t); return false; } - // t is "P<name>...", t[end] == '>' - StringPiece capture(t.data()-2, end+3); // "(?P<name>" - StringPiece name(t.data()+2, end-2); // "name" + absl::string_view capture(t.data(), end+1); + absl::string_view name(t.data()+begin, end-begin); if (!IsValidUTF8(name, status_)) return false; if (!IsValidCaptureName(name)) { @@ -2108,11 +2103,12 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { return false; } - s->remove_prefix( - static_cast<size_t>(capture.data() + capture.size() - s->data())); + s->remove_prefix(capture.size()); return true; } + t.remove_prefix(2); // "(?" 
+ bool negated = false; bool sawflags = false; int nflags = flags_; @@ -2120,7 +2116,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { for (bool done = false; !done; ) { if (t.empty()) goto BadPerlOp; - if (StringPieceToRune(&c, &t, status_) < 0) + if (StringViewToRune(&c, &t, status_) < 0) return false; switch (c) { default: @@ -2193,7 +2189,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { BadPerlOp: status_->set_code(kRegexpBadPerlOp); status_->set_error_arg( - StringPiece(s->data(), static_cast<size_t>(t.data() - s->data()))); + absl::string_view(s->data(), static_cast<size_t>(t.data() - s->data()))); return false; } @@ -2201,7 +2197,7 @@ BadPerlOp: // into UTF8 encoding in string. // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is // deprecated and because it rejects code points 0x80-0x9F. -void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) { +void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) { char buf[UTFmax]; utf->clear(); @@ -2216,7 +2212,7 @@ void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) { // returning the corresponding Regexp tree. // The caller must Decref the return value when done with it. // Returns NULL on error. -Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, +Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags, RegexpStatus* status) { // Make status non-NULL (easier on everyone else). RegexpStatus xstatus; @@ -2224,7 +2220,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, status = &xstatus; ParseState ps(global_flags, s, status); - StringPiece t = s; + absl::string_view t = s; // Convert regexp to UTF-8 (easier on the rest of the parser). if (global_flags & Latin1) { @@ -2238,7 +2234,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, // Special parse loop for literal string. 
while (!t.empty()) { Rune r; - if (StringPieceToRune(&r, &t, status) < 0) + if (StringViewToRune(&r, &t, status) < 0) return NULL; if (!ps.PushLiteral(r)) return NULL; @@ -2246,13 +2242,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, return ps.DoFinish(); } - StringPiece lastunary = StringPiece(); + absl::string_view lastunary = absl::string_view(); while (!t.empty()) { - StringPiece isunary = StringPiece(); + absl::string_view isunary = absl::string_view(); switch (t[0]) { default: { Rune r; - if (StringPieceToRune(&r, &t, status) < 0) + if (StringViewToRune(&r, &t, status) < 0) return NULL; if (!ps.PushLiteral(r)) return NULL; @@ -2271,7 +2267,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (!ps.DoLeftParenNoCapture()) return NULL; } else { - if (!ps.DoLeftParen(StringPiece())) + if (!ps.DoLeftParen(absl::string_view())) return NULL; } t.remove_prefix(1); // '(' @@ -2327,7 +2323,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, op = kRegexpQuest; goto Rep; Rep: - StringPiece opstr = t; + absl::string_view opstr = t; bool nongreedy = false; t.remove_prefix(1); // '*' or '+' or '?' if (ps.flags() & PerlX) { @@ -2340,14 +2336,14 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, // a** is a syntax error, not a double-star. // (and a++ means something else entirely, which we don't support!) status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece( + status->set_error_arg(absl::string_view( lastunary.data(), static_cast<size_t>(t.data() - lastunary.data()))); return NULL; } } - opstr = StringPiece(opstr.data(), - static_cast<size_t>(t.data() - opstr.data())); + opstr = absl::string_view(opstr.data(), + static_cast<size_t>(t.data() - opstr.data())); if (!ps.PushRepeatOp(op, opstr, nongreedy)) return NULL; isunary = opstr; @@ -2356,7 +2352,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, case '{': { // Counted repetition. 
int lo, hi; - StringPiece opstr = t; + absl::string_view opstr = t; if (!MaybeParseRepetition(&t, &lo, &hi)) { // Treat like a literal. if (!ps.PushLiteral('{')) @@ -2373,14 +2369,14 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (!lastunary.empty()) { // Not allowed to stack repetition operators. status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece( + status->set_error_arg(absl::string_view( lastunary.data(), static_cast<size_t>(t.data() - lastunary.data()))); return NULL; } } - opstr = StringPiece(opstr.data(), - static_cast<size_t>(t.data() - opstr.data())); + opstr = absl::string_view(opstr.data(), + static_cast<size_t>(t.data() - opstr.data())); if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) return NULL; isunary = opstr; @@ -2430,7 +2426,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, break; } Rune r; - if (StringPieceToRune(&r, &t, status) < 0) + if (StringViewToRune(&r, &t, status) < 0) return NULL; if (!ps.PushLiteral(r)) return NULL; @@ -2456,7 +2452,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, } } - const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); + const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags()); if (g != NULL) { Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); re->ccb_ = new CharClassBuilder; diff --git a/re2/prefilter.cc b/re2/prefilter.cc index a47b312..3c7886f 100644 --- a/re2/prefilter.cc +++ b/re2/prefilter.cc @@ -7,11 +7,11 @@ #include <stddef.h> #include <stdint.h> #include <string> +#include <utility> #include <vector> -#include "util/util.h" +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "util/utf.h" #include "re2/re2.h" #include "re2/unicode_casefold.h" @@ -21,9 +21,6 @@ namespace re2 { static const bool ExtraDebug = false; -typedef std::set<std::string>::iterator SSIter; -typedef std::set<std::string>::const_iterator ConstSSIter; - // Initializes a 
Prefilter, allocating subs_ as necessary. Prefilter::Prefilter(Op op) { op_ = op; @@ -140,7 +137,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { return AndOr(OR, a, b); } -static void SimplifyStringSet(std::set<std::string>* ss) { +void Prefilter::SimplifyStringSet(SSet* ss) { // Now make sure that the strings aren't redundant. For example, if // we know "ab" is a required string, then it doesn't help at all to // know that "abc" is also a required string, so delete "abc". This @@ -149,13 +146,19 @@ static void SimplifyStringSet(std::set<std::string>* ss) { // candidate for match, so further matching "abc" is redundant. // Note that we must ignore "" because find() would find it at the // start of everything and thus we would end up erasing everything. - for (SSIter i = ss->begin(); i != ss->end(); ++i) { - if (i->empty()) - continue; + // + // The SSet sorts strings by length, then lexicographically. Note that + // smaller strings appear first and all strings must be unique. These + // observations let us skip string comparisons when possible. + SSIter i = ss->begin(); + if (i != ss->end() && i->empty()) { + ++i; + } + for (; i != ss->end(); ++i) { SSIter j = i; ++j; while (j != ss->end()) { - if (j->find(*i) != std::string::npos) { + if (j->size() > i->size() && j->find(*i) != std::string::npos) { j = ss->erase(j); continue; } @@ -164,7 +167,7 @@ static void SimplifyStringSet(std::set<std::string>* ss) { } } -Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) { +Prefilter* Prefilter::OrStrings(SSet* ss) { Prefilter* or_prefilter = new Prefilter(NONE); SimplifyStringSet(ss); for (SSIter i = ss->begin(); i != ss->end(); ++i) @@ -226,14 +229,14 @@ class Prefilter::Info { // Caller takes ownership of the Prefilter. 
Prefilter* TakeMatch(); - std::set<std::string>& exact() { return exact_; } + SSet& exact() { return exact_; } bool is_exact() const { return is_exact_; } class Walker; private: - std::set<std::string> exact_; + SSet exact_; // When is_exact_ is true, the strings that match // are placed in exact_. When it is no longer an exact @@ -286,18 +289,7 @@ std::string Prefilter::Info::ToString() { return ""; } -// Add the strings from src to dst. -static void CopyIn(const std::set<std::string>& src, - std::set<std::string>* dst) { - for (ConstSSIter i = src.begin(); i != src.end(); ++i) - dst->insert(*i); -} - -// Add the cross-product of a and b to dst. -// (For each string i in a and j in b, add i+j.) -static void CrossProduct(const std::set<std::string>& a, - const std::set<std::string>& b, - std::set<std::string>* dst) { +void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) { for (ConstSSIter i = a.begin(); i != a.end(); ++i) for (ConstSSIter j = b.begin(); j != b.end(); ++j) dst->insert(*i + *j); @@ -343,8 +335,14 @@ Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { Info *ab = new Info(); if (a->is_exact_ && b->is_exact_) { - CopyIn(a->exact_, &ab->exact_); - CopyIn(b->exact_, &ab->exact_); + // Avoid string copies by moving the larger exact_ set into + // ab directly, then merge in the smaller set. + if (a->exact_.size() < b->exact_.size()) { + using std::swap; + swap(a, b); + } + ab->exact_ = std::move(a->exact_); + ab->exact_.insert(b->exact_.begin(), b->exact_.end()); ab->is_exact_ = true; } else { // Either a or b has is_exact_ = false. 
If the other @@ -532,8 +530,8 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( switch (re->op()) { default: case kRegexpRepeat: - LOG(DFATAL) << "Bad regexp op " << re->op(); info = EmptyString(); + LOG(DFATAL) << "Bad regexp op " << re->op(); break; case kRegexpNoMatch: @@ -665,7 +663,7 @@ std::string Prefilter::DebugString() const { switch (op_) { default: LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; - return StringPrintf("op%d", op_); + return absl::StrFormat("op%d", op_); case NONE: return "*no-matches*"; case ATOM: diff --git a/re2/prefilter.h b/re2/prefilter.h index 4fedeb4..018691d 100644 --- a/re2/prefilter.h +++ b/re2/prefilter.h @@ -13,7 +13,6 @@ #include <string> #include <vector> -#include "util/util.h" #include "util/logging.h" namespace re2 { @@ -60,8 +59,59 @@ class Prefilter { std::string DebugString() const; private: + template <typename H> + friend H AbslHashValue(H h, const Prefilter& a) { + h = H::combine(std::move(h), a.op_); + if (a.op_ == ATOM) { + h = H::combine(std::move(h), a.atom_); + } else if (a.op_ == AND || a.op_ == OR) { + h = H::combine(std::move(h), a.subs_->size()); + for (size_t i = 0; i < a.subs_->size(); ++i) { + h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_); + } + } + return h; + } + + friend bool operator==(const Prefilter& a, const Prefilter& b) { + if (&a == &b) { + return true; + } + if (a.op_ != b.op_) { + return false; + } + if (a.op_ == ATOM) { + if (a.atom_ != b.atom_) { + return false; + } + } else if (a.op_ == AND || a.op_ == OR) { + if (a.subs_->size() != b.subs_->size()) { + return false; + } + for (size_t i = 0; i < a.subs_->size(); ++i) { + if ((*a.subs_)[i]->unique_id_ != (*b.subs_)[i]->unique_id_) { + return false; + } + } + } + return true; + } + + // A comparator used to store exact strings. We compare by length, + // then lexicographically. This ordering makes it easier to reduce the + // set of strings in SimplifyStringSet. 
+ struct LengthThenLex { + bool operator()(const std::string& a, const std::string& b) const { + return (a.size() < b.size()) || (a.size() == b.size() && a < b); + } + }; + class Info; + using SSet = std::set<std::string, LengthThenLex>; + using SSIter = SSet::iterator; + using ConstSSIter = SSet::const_iterator; + // Combines two prefilters together to create an AND. The passed // Prefilters will be part of the returned Prefilter or deleted. static Prefilter* And(Prefilter* a, Prefilter* b); @@ -77,12 +127,21 @@ class Prefilter { static Prefilter* FromString(const std::string& str); - static Prefilter* OrStrings(std::set<std::string>* ss); + static Prefilter* OrStrings(SSet* ss); static Info* BuildInfo(Regexp* re); Prefilter* Simplify(); + // Removes redundant strings from the set. A string is redundant if + // any of the other strings appear as a substring. The empty string + // is a special case, which is ignored. + static void SimplifyStringSet(SSet* ss); + + // Adds the cross-product of a and b to dst. + // (For each string i in a and j in b, add i+j.) + static void CrossProduct(const SSet& a, const SSet& b, SSet* dst); + // Kind of Prefilter. 
Op op_; diff --git a/re2/prefilter_tree.cc b/re2/prefilter_tree.cc index fdf4e08..3afb241 100644 --- a/re2/prefilter_tree.cc +++ b/re2/prefilter_tree.cc @@ -6,16 +6,14 @@ #include <stddef.h> #include <algorithm> -#include <map> +#include <cmath> #include <memory> -#include <set> #include <string> #include <utility> #include <vector> -#include "util/util.h" +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/prefilter.h" #include "re2/re2.h" @@ -36,9 +34,6 @@ PrefilterTree::PrefilterTree(int min_atom_len) PrefilterTree::~PrefilterTree() { for (size_t i = 0; i < prefilter_vec_.size(); i++) delete prefilter_vec_[i]; - - for (size_t i = 0; i < entries_.size(); i++) - delete entries_[i].parents; } void PrefilterTree::Add(Prefilter* prefilter) { @@ -67,65 +62,18 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { compiled_ = true; - // TODO(junyer): Use std::unordered_set<Prefilter*> instead? - NodeMap nodes; + NodeSet nodes; AssignUniqueIds(&nodes, atom_vec); - - // Identify nodes that are too common among prefilters and are - // triggering too many parents. Then get rid of them if possible. - // Note that getting rid of a prefilter node simply means they are - // no longer necessary for their parent to trigger; that is, we do - // not miss out on any regexps triggering by getting rid of a - // prefilter node. - for (size_t i = 0; i < entries_.size(); i++) { - StdIntMap* parents = entries_[i].parents; - if (parents->size() > 8) { - // This one triggers too many things. If all the parents are AND - // nodes and have other things guarding them, then get rid of - // this trigger. TODO(vsri): Adjust the threshold appropriately, - // make it a function of total number of nodes? 
- bool have_other_guard = true; - for (StdIntMap::iterator it = parents->begin(); - it != parents->end(); ++it) { - have_other_guard = have_other_guard && - (entries_[it->first].propagate_up_at_count > 1); - } - - if (have_other_guard) { - for (StdIntMap::iterator it = parents->begin(); - it != parents->end(); ++it) - entries_[it->first].propagate_up_at_count -= 1; - - parents->clear(); // Forget the parents - } - } - } - if (ExtraDebug) PrintDebugInfo(&nodes); } -Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { - std::string node_string = NodeString(node); - NodeMap::iterator iter = nodes->find(node_string); - if (iter == nodes->end()) - return NULL; - return (*iter).second; -} - -std::string PrefilterTree::NodeString(Prefilter* node) const { - // Adding the operation disambiguates AND/OR/atom nodes. - std::string s = StringPrintf("%d", node->op()) + ":"; - if (node->op() == Prefilter::ATOM) { - s += node->atom(); - } else { - for (size_t i = 0; i < node->subs()->size(); i++) { - if (i > 0) - s += ','; - s += StringPrintf("%d", (*node->subs())[i]->unique_id()); - } +Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) { + NodeSet::const_iterator iter = nodes->find(node); + if (iter != nodes->end()) { + return *iter; } - return s; + return NULL; } bool PrefilterTree::KeepNode(Prefilter* node) const { @@ -165,7 +113,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const { } } -void PrefilterTree::AssignUniqueIds(NodeMap* nodes, +void PrefilterTree::AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec) { atom_vec->clear(); @@ -205,9 +153,9 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, node->set_unique_id(-1); Prefilter* canonical = CanonicalNode(nodes, node); if (canonical == NULL) { - // Any further nodes that have the same node string + // Any further nodes that have the same atom/subs // will find this node as the canonical node. 
- nodes->emplace(NodeString(node), node); + nodes->emplace(node); if (node->op() == Prefilter::ATOM) { atom_vec->push_back(node->atom()); atom_index_to_id_.push_back(unique_id); @@ -217,65 +165,42 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, node->set_unique_id(canonical->unique_id()); } } - entries_.resize(nodes->size()); - - // Create parent StdIntMap for the entries. - for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { - Prefilter* prefilter = v[i]; - if (prefilter == NULL) - continue; - - if (CanonicalNode(nodes, prefilter) != prefilter) - continue; - - Entry* entry = &entries_[prefilter->unique_id()]; - entry->parents = new StdIntMap(); - } + entries_.resize(unique_id); // Fill the entries. for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; - if (CanonicalNode(nodes, prefilter) != prefilter) continue; - - Entry* entry = &entries_[prefilter->unique_id()]; - + int id = prefilter->unique_id(); switch (prefilter->op()) { default: - case Prefilter::ALL: LOG(DFATAL) << "Unexpected op: " << prefilter->op(); return; case Prefilter::ATOM: - entry->propagate_up_at_count = 1; + entries_[id].propagate_up_at_count = 1; break; case Prefilter::OR: case Prefilter::AND: { - std::set<int> uniq_child; + // For each child, we append our id to the child's list of + // parent ids... unless we happen to have done so already. + // The number of appends is the number of unique children, + // which allows correct upward propagation from AND nodes. + int up_count = 0; for (size_t j = 0; j < prefilter->subs()->size(); j++) { - Prefilter* child = (*prefilter->subs())[j]; - Prefilter* canonical = CanonicalNode(nodes, child); - if (canonical == NULL) { - LOG(DFATAL) << "Null canonical node"; - return; - } - int child_id = canonical->unique_id(); - uniq_child.insert(child_id); - // To the child, we want to add to parent indices. 
- Entry* child_entry = &entries_[child_id]; - if (child_entry->parents->find(prefilter->unique_id()) == - child_entry->parents->end()) { - (*child_entry->parents)[prefilter->unique_id()] = 1; + int child_id = (*prefilter->subs())[j]->unique_id(); + std::vector<int>& parents = entries_[child_id].parents; + if (parents.empty() || parents.back() != id) { + parents.push_back(id); + up_count++; } } - entry->propagate_up_at_count = prefilter->op() == Prefilter::AND - ? static_cast<int>(uniq_child.size()) - : 1; - + entries_[id].propagate_up_at_count = + prefilter->op() == Prefilter::AND ? up_count : 1; break; } } @@ -290,6 +215,52 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, Entry* entry = &entries_[id]; entry->regexps.push_back(static_cast<int>(i)); } + + // Lastly, using probability-based heuristics, we identify nodes + // that trigger too many parents and then we try to prune edges. + // We use logarithms below to avoid the likelihood of underflow. + double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size()); + // Hoisted this above the loop so that we don't thrash the heap. + std::vector<std::pair<size_t, int>> entries_by_num_edges; + for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { + Prefilter* prefilter = v[i]; + // Pruning applies only to AND nodes because it "just" reduces + // precision; applied to OR nodes, it would break correctness. + if (prefilter == NULL || prefilter->op() != Prefilter::AND) + continue; + if (CanonicalNode(nodes, prefilter) != prefilter) + continue; + int id = prefilter->unique_id(); + + // Sort the current node's children by the numbers of parents. 
+ entries_by_num_edges.clear(); + for (size_t j = 0; j < prefilter->subs()->size(); j++) { + int child_id = (*prefilter->subs())[j]->unique_id(); + const std::vector<int>& parents = entries_[child_id].parents; + entries_by_num_edges.emplace_back(parents.size(), child_id); + } + std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end()); + + // A running estimate of how many regexps will be triggered by + // pruning the remaining children's edges to the current node. + // Our nominal target is one, so the threshold is log(1) == 0; + // pruning occurs iff the child has more than nine edges left. + double log_num_triggered = log_num_regexps; + for (const auto& pair : entries_by_num_edges) { + int child_id = pair.second; + std::vector<int>& parents = entries_[child_id].parents; + if (log_num_triggered > 0.) { + log_num_triggered += std::log(parents.size()); + log_num_triggered -= log_num_regexps; + } else if (parents.size() > 9) { + auto it = std::find(parents.begin(), parents.end(), id); + if (it != parents.end()) { + parents.erase(it); + entries_[id].propagate_up_at_count--; + } + } + } + } } // Functions for triggering during search. @@ -313,7 +284,7 @@ void PrefilterTree::RegexpsGivenStrings( for (size_t j = 0; j < matched_atoms.size(); j++) matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); PropagateMatch(matched_atom_ids, ®exps_map); - for (IntMap::iterator it = regexps_map.begin(); + for (IntMap::const_iterator it = regexps_map.begin(); it != regexps_map.end(); ++it) regexps->push_back(it->index()); @@ -329,17 +300,14 @@ void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids, IntMap work(static_cast<int>(entries_.size())); for (size_t i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); - for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { + for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) { const Entry& entry = entries_[it->index()]; // Record regexps triggered. 
for (size_t i = 0; i < entry.regexps.size(); i++) regexps->set(entry.regexps[i], 1); int c; // Pass trigger up to parents. - for (StdIntMap::iterator it = entry.parents->begin(); - it != entry.parents->end(); - ++it) { - int j = it->first; + for (int j : entry.parents) { const Entry& parent = entries_[j]; // Delay until all the children have succeeded. if (parent.propagate_up_at_count > 1) { @@ -364,23 +332,22 @@ void PrefilterTree::PrintPrefilter(int regexpid) { LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); } -void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { +void PrefilterTree::PrintDebugInfo(NodeSet* nodes) { LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); LOG(ERROR) << "#Unique Nodes: " << entries_.size(); for (size_t i = 0; i < entries_.size(); i++) { - StdIntMap* parents = entries_[i].parents; + const std::vector<int>& parents = entries_[i].parents; const std::vector<int>& regexps = entries_[i].regexps; LOG(ERROR) << "EntryId: " << i - << " N: " << parents->size() << " R: " << regexps.size(); - for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) - LOG(ERROR) << it->first; + << " N: " << parents.size() << " R: " << regexps.size(); + for (int parent : parents) + LOG(ERROR) << parent; } - LOG(ERROR) << "Map:"; - for (NodeMap::const_iterator iter = nodes->begin(); + LOG(ERROR) << "Set:"; + for (NodeSet::const_iterator iter = nodes->begin(); iter != nodes->end(); ++iter) - LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() - << " Str: " << (*iter).first; + LOG(ERROR) << "NodeId: " << (*iter)->unique_id(); } std::string PrefilterTree::DebugNodeString(Prefilter* node) const { @@ -395,7 +362,7 @@ std::string PrefilterTree::DebugNodeString(Prefilter* node) const { for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) node_string += ','; - node_string += StringPrintf("%d", (*node->subs())[i]->unique_id()); + node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id()); node_string += ":"; 
node_string += DebugNodeString((*node->subs())[i]); } diff --git a/re2/prefilter_tree.h b/re2/prefilter_tree.h index 5d73074..71e7a29 100644 --- a/re2/prefilter_tree.h +++ b/re2/prefilter_tree.h @@ -16,13 +16,13 @@ // atoms) that the user of this class should use to do the string // matching. -#include <map> #include <string> #include <vector> -#include "util/util.h" +#include "absl/container/flat_hash_set.h" #include "re2/prefilter.h" #include "re2/sparse_array.h" +#include "util/logging.h" namespace re2 { @@ -58,9 +58,25 @@ class PrefilterTree { void PrintPrefilter(int regexpid); private: - typedef SparseArray<int> IntMap; - typedef std::map<int, int> StdIntMap; - typedef std::map<std::string, Prefilter*> NodeMap; + using IntMap = SparseArray<int>; + + struct PrefilterHash { + size_t operator()(const Prefilter* a) const { + DCHECK(a != NULL); + return absl::Hash<Prefilter>()(*a); + } + }; + + struct PrefilterEqual { + bool operator()(const Prefilter* a, const Prefilter* b) const { + DCHECK(a != NULL); + DCHECK(b != NULL); + return *a == *b; + } + }; + + using NodeSet = + absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>; // Each unique node has a corresponding Entry that helps in // passing the matching trigger information along the tree. @@ -77,7 +93,7 @@ class PrefilterTree { // are two different nodes, but they share the atom 'def'. So when // 'def' matches, it triggers two parents, corresponding to the two // different OR nodes. - StdIntMap* parents; + std::vector<int> parents; // When this node is ready to trigger the parent, what are the // regexps that are triggered. @@ -90,25 +106,22 @@ class PrefilterTree { // This function assigns unique ids to various parts of the // prefilter, by looking at if these nodes are already in the // PrefilterTree. 
- void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); + void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec); // Given the matching atoms, find the regexps to be triggered. void PropagateMatch(const std::vector<int>& atom_ids, IntMap* regexps) const; - // Returns the prefilter node that has the same NodeString as this - // node. For the canonical node, returns node. - Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); - - // A string that uniquely identifies the node. Assumes that the - // children of node has already been assigned unique ids. - std::string NodeString(Prefilter* node) const; + // Returns the prefilter node that has the same atom/subs as this + // node. For the canonical node, returns node. Assumes that the + // children of node have already been assigned unique ids. + Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node); // Recursively constructs a readable prefilter string. std::string DebugNodeString(Prefilter* node) const; // Used for debugging. - void PrintDebugInfo(NodeMap* nodes); + void PrintDebugInfo(NodeSet* nodes); // These are all the nodes formed by Compile. Essentially, there is // one node for each unique atom and each unique AND/OR node. 
diff --git a/re2/prog.cc b/re2/prog.cc index a700d35..6cadcfa 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -19,11 +19,10 @@ #include <memory> #include <utility> -#include "util/util.h" +#include "absl/base/macros.h" +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/bitmap256.h" -#include "re2/stringpiece.h" namespace re2 { @@ -74,34 +73,34 @@ void Prog::Inst::InitFail() { std::string Prog::Inst::Dump() { switch (opcode()) { default: - return StringPrintf("opcode %d", static_cast<int>(opcode())); + return absl::StrFormat("opcode %d", static_cast<int>(opcode())); case kInstAlt: - return StringPrintf("alt -> %d | %d", out(), out1_); + return absl::StrFormat("alt -> %d | %d", out(), out1_); case kInstAltMatch: - return StringPrintf("altmatch -> %d | %d", out(), out1_); + return absl::StrFormat("altmatch -> %d | %d", out(), out1_); case kInstByteRange: - return StringPrintf("byte%s [%02x-%02x] %d -> %d", - foldcase() ? "/i" : "", - lo_, hi_, hint(), out()); + return absl::StrFormat("byte%s [%02x-%02x] %d -> %d", + foldcase() ? "/i" : "", + lo_, hi_, hint(), out()); case kInstCapture: - return StringPrintf("capture %d -> %d", cap_, out()); + return absl::StrFormat("capture %d -> %d", cap_, out()); case kInstEmptyWidth: - return StringPrintf("emptywidth %#x -> %d", - static_cast<int>(empty_), out()); + return absl::StrFormat("emptywidth %#x -> %d", + static_cast<int>(empty_), out()); case kInstMatch: - return StringPrintf("match! %d", match_id()); + return absl::StrFormat("match! %d", match_id()); case kInstNop: - return StringPrintf("nop -> %d", out()); + return absl::StrFormat("nop -> %d", out()); case kInstFail: - return StringPrintf("fail"); + return absl::StrFormat("fail"); } } @@ -143,7 +142,7 @@ static std::string ProgToString(Prog* prog, Workq* q) { for (Workq::iterator i = q->begin(); i != q->end(); ++i) { int id = *i; Prog::Inst* ip = prog->inst(id); - s += StringPrintf("%d. 
%s\n", id, ip->Dump().c_str()); + s += absl::StrFormat("%d. %s\n", id, ip->Dump()); AddToQueue(q, ip->out()); if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) AddToQueue(q, ip->out1()); @@ -156,9 +155,9 @@ static std::string FlattenedProgToString(Prog* prog, int start) { for (int id = start; id < prog->size(); id++) { Prog::Inst* ip = prog->inst(id); if (ip->last()) - s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); + s += absl::StrFormat("%d. %s\n", id, ip->Dump()); else - s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str()); + s += absl::StrFormat("%d+ %s\n", id, ip->Dump()); } return s; } @@ -189,7 +188,7 @@ std::string Prog::DumpByteMap() { while (c < 256-1 && bytemap_[c+1] == b) c++; int hi = c; - map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b); + map += absl::StrFormat("[%02x-%02x] -> %d\n", lo, hi, b); } return map; } @@ -284,7 +283,7 @@ void Prog::Optimize() { } } -uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { +uint32_t Prog::EmptyFlags(absl::string_view text, const char* p) { int flags = 0; // ^ and \A @@ -511,7 +510,7 @@ void Prog::ComputeByteMap() { builder.Build(bytemap_, &bytemap_range_); - if (0) { // For debugging, use trivial bytemap. + if ((0)) { // For debugging, use trivial bytemap. 
LOG(ERROR) << "Using trivial bytemap."; for (int i = 0; i < 256; i++) bytemap_[i] = static_cast<uint8_t>(i); @@ -813,7 +812,7 @@ void Prog::EmitList(int root, SparseArray<int>* rootmap, flat->back().set_opcode(kInstAltMatch); flat->back().set_out(static_cast<int>(flat->size())); flat->back().out1_ = static_cast<uint32_t>(flat->size())+1; - FALLTHROUGH_INTENDED; + ABSL_FALLTHROUGH_INTENDED; case kInstAlt: stk->push_back(ip->out1()); @@ -11,12 +11,12 @@ #include <stdint.h> #include <functional> -#include <mutex> #include <string> #include <vector> #include <type_traits> -#include "util/util.h" +#include "absl/base/call_once.h" +#include "absl/strings/string_view.h" #include "util/logging.h" #include "re2/pod_array.h" #include "re2/re2.h" @@ -249,7 +249,7 @@ class Prog { // Returns the set of kEmpty flags that are in effect at // position p within context. - static uint32_t EmptyFlags(const StringPiece& context, const char* p); + static uint32_t EmptyFlags(absl::string_view context, const char* p); // Returns whether byte c is a word character: ASCII only. // Used by the implementation of \b and \B. @@ -274,15 +274,15 @@ class Prog { // If a particular submatch is not matched during the regexp match, // it is set to NULL. // - // Matching text == StringPiece(NULL, 0) is treated as any other empty + // Matching text == absl::string_view() is treated as any other empty // string, but note that on return, it will not be possible to distinguish // submatches that matched that empty string from submatches that didn't // match anything. Either way, match[i] == NULL. // Search using NFA: can find submatches but kind of slow. 
- bool SearchNFA(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); + bool SearchNFA(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, absl::string_view* match, + int nmatch); // Search using DFA: much faster than NFA but only finds // end of match and can use a lot more memory. @@ -290,8 +290,8 @@ class Prog { // If the DFA runs out of memory, sets *failed to true and returns false. // If matches != NULL and kind == kManyMatch and there is a match, // SearchDFA fills matches with the match IDs of the final matching state. - bool SearchDFA(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, StringPiece* match0, + bool SearchDFA(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, absl::string_view* match0, bool* failed, SparseSet* matches); // The callback issued after building each DFA state with BuildEntireDFA(). @@ -321,16 +321,16 @@ class Prog { // but much faster than NFA (competitive with PCRE) // for those expressions. bool IsOnePass(); - bool SearchOnePass(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); + bool SearchOnePass(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, absl::string_view* match, + int nmatch); // Bit-state backtracking. Fast on small cases but uses memory // proportional to the product of the list count and the text size. 
bool CanBitState() { return list_heads_.data() != NULL; } - bool SearchBitState(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); + bool SearchBitState(absl::string_view text, absl::string_view context, + Anchor anchor, MatchKind kind, absl::string_view* match, + int nmatch); static const int kMaxOnePassCapture = 5; // $0 through $4 @@ -340,10 +340,9 @@ class Prog { // It is also recursive, so can't use in production (will overflow stacks). // The name "Unsafe" here is supposed to be a flag that // you should not be using this function. - bool UnsafeSearchBacktrack(const StringPiece& text, - const StringPiece& context, + bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context, Anchor anchor, MatchKind kind, - StringPiece* match, int nmatch); + absl::string_view* match, int nmatch); // Computes range for any strings matching regexp. The min and max can in // some cases be arbitrarily precise, so the caller gets to specify the @@ -361,7 +360,6 @@ class Prog { // Returns true on success, false on error. bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); - // EXPERIMENTAL! SUBJECT TO CHANGE! // Outputs the program fanout into the given sparse array. void Fanout(SparseArray<int>* fanout); @@ -445,8 +443,8 @@ class Prog { uint8_t bytemap_[256]; // map from input bytes to byte classes - std::once_flag dfa_first_once_; - std::once_flag dfa_longest_once_; + absl::once_flag dfa_first_once_; + absl::once_flag dfa_longest_once_; Prog(const Prog&) = delete; Prog& operator=(const Prog&) = delete; @@ -456,10 +454,10 @@ class Prog { // that don't allow comparisons between different objects - not even if // those objects are views into the same string! Thus, we provide these // conversion functions for convenience. 
-static inline const char* BeginPtr(const StringPiece& s) { +static inline const char* BeginPtr(absl::string_view s) { return s.data(); } -static inline const char* EndPtr(const StringPiece& s) { +static inline const char* EndPtr(absl::string_view s) { return s.data() + s.size(); } @@ -21,12 +21,14 @@ #include <algorithm> #include <atomic> #include <iterator> -#include <mutex> #include <string> #include <utility> #include <vector> -#include "util/util.h" +#include "absl/base/macros.h" +#include "absl/container/fixed_array.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_format.h" #include "util/logging.h" #include "util/strutil.h" #include "util/utf.h" @@ -36,6 +38,13 @@ namespace re2 { +// Controls the maximum count permitted by GlobalReplace(); -1 is unlimited. +static int maximum_global_replace_count = -1; + +void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) { + maximum_global_replace_count = i; +} + // Maximum number of args we can set static const int kMaxArgs = 16; static const int kVecSize = 1+kMaxArgs; @@ -43,11 +52,11 @@ static const int kVecSize = 1+kMaxArgs; const int RE2::Options::kDefaultMaxMem; // initialized in re2.h RE2::Options::Options(RE2::CannedOptions opt) - : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), + : max_mem_(kDefaultMaxMem), + encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), posix_syntax_(opt == RE2::POSIX), longest_match_(opt == RE2::POSIX), log_errors_(opt != RE2::Quiet), - max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false), dot_nl_(false), @@ -58,11 +67,30 @@ RE2::Options::Options(RE2::CannedOptions opt) one_line_(false) { } -// static empty objects for use as const references. -// To avoid global constructors, allocated in RE2::Init(). -static const std::string* empty_string; -static const std::map<std::string, int>* empty_named_groups; -static const std::map<int, std::string>* empty_group_names; +// Empty objects for use as const references. 
+// Statically allocating the storage and then +// lazily constructing the objects (in a once +// in RE2::Init()) avoids global constructors +// and the false positives (thanks, Valgrind) +// about memory leaks at program termination. +struct EmptyStorage { + std::string empty_string; + std::map<std::string, int> empty_named_groups; + std::map<int, std::string> empty_group_names; +}; +alignas(EmptyStorage) static char empty_storage[sizeof(EmptyStorage)]; + +static inline std::string* empty_string() { + return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_string; +} + +static inline std::map<std::string, int>* empty_named_groups() { + return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_named_groups; +} + +static inline std::map<int, std::string>* empty_group_names() { + return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_group_names; +} // Converts from Regexp error code to RE2 error code. // Maybe some day they will diverge. In any event, this @@ -103,7 +131,7 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { return RE2::ErrorInternal; } -static std::string trunc(const StringPiece& pattern) { +static std::string trunc(absl::string_view pattern) { if (pattern.size() < 100) return std::string(pattern); return std::string(pattern.substr(0, 100)) + "..."; @@ -118,11 +146,11 @@ RE2::RE2(const std::string& pattern) { Init(pattern, DefaultOptions); } -RE2::RE2(const StringPiece& pattern) { +RE2::RE2(absl::string_view pattern) { Init(pattern, DefaultOptions); } -RE2::RE2(const StringPiece& pattern, const Options& options) { +RE2::RE2(absl::string_view pattern, const Options& options) { Init(pattern, options); } @@ -170,26 +198,26 @@ int RE2::Options::ParseFlags() const { return flags; } -void RE2::Init(const StringPiece& pattern, const Options& options) { - static std::once_flag empty_once; - std::call_once(empty_once, []() { - empty_string = new std::string; - empty_named_groups = new std::map<std::string, int>; - 
empty_group_names = new std::map<int, std::string>; +void RE2::Init(absl::string_view pattern, const Options& options) { + static absl::once_flag empty_once; + absl::call_once(empty_once, []() { + (void) new (empty_storage) EmptyStorage; }); - pattern_.assign(pattern.data(), pattern.size()); + pattern_ = new std::string(pattern); options_.Copy(options); entire_regexp_ = NULL; - error_ = empty_string; - error_code_ = NoError; - error_arg_.clear(); - prefix_.clear(); - prefix_foldcase_ = false; suffix_regexp_ = NULL; - prog_ = NULL; + error_ = empty_string(); + error_arg_ = empty_string(); + num_captures_ = -1; + error_code_ = NoError; + longest_match_ = options_.longest_match(); is_one_pass_ = false; + prefix_foldcase_ = false; + prefix_.clear(); + prog_ = NULL; rprog_ = NULL; named_groups_ = NULL; @@ -197,25 +225,29 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { RegexpStatus status; entire_regexp_ = Regexp::Parse( - pattern_, + *pattern_, static_cast<Regexp::ParseFlags>(options_.ParseFlags()), &status); if (entire_regexp_ == NULL) { if (options_.log_errors()) { - LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " + LOG(ERROR) << "Error parsing '" << trunc(*pattern_) << "': " << status.Text(); } error_ = new std::string(status.Text()); error_code_ = RegexpErrorToRE2(status.code()); - error_arg_ = std::string(status.error_arg()); + error_arg_ = new std::string(status.error_arg()); return; } + bool foldcase; re2::Regexp* suffix; - if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) + if (entire_regexp_->RequiredPrefix(&prefix_, &foldcase, &suffix)) { + prefix_foldcase_ = foldcase; suffix_regexp_ = suffix; - else + } + else { suffix_regexp_ = entire_regexp_->Incref(); + } // Two thirds of the memory goes to the forward Prog, // one third to the reverse prog, because the forward @@ -223,7 +255,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { prog_ = 
suffix_regexp_->CompileToProg(options_.max_mem()*2/3); if (prog_ == NULL) { if (options_.log_errors()) - LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; + LOG(ERROR) << "Error compiling '" << trunc(*pattern_) << "'"; error_ = new std::string("pattern too large - compile failed"); error_code_ = RE2::ErrorPatternTooLarge; return; @@ -231,7 +263,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { // We used to compute this lazily, but it's used during the // typical control flow for a match call, so we now compute - // it eagerly, which avoids the overhead of std::once_flag. + // it eagerly, which avoids the overhead of absl::once_flag. num_captures_ = suffix_regexp_->NumCaptures(); // Could delay this until the first match call that @@ -244,12 +276,13 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { // Returns rprog_, computing it if needed. re2::Prog* RE2::ReverseProg() const { - std::call_once(rprog_once_, [](const RE2* re) { + absl::call_once(rprog_once_, [](const RE2* re) { re->rprog_ = re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); if (re->rprog_ == NULL) { if (re->options_.log_errors()) - LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; + LOG(ERROR) << "Error reverse compiling '" << trunc(*re->pattern_) + << "'"; // We no longer touch error_ and error_code_ because failing to compile // the reverse Prog is not a showstopper: falling back to NFA execution // is fine. 
More importantly, an RE2 object is supposed to be logically @@ -261,18 +294,21 @@ re2::Prog* RE2::ReverseProg() const { } RE2::~RE2() { + if (group_names_ != empty_group_names()) + delete group_names_; + if (named_groups_ != empty_named_groups()) + delete named_groups_; + delete rprog_; + delete prog_; + if (error_arg_ != empty_string()) + delete error_arg_; + if (error_ != empty_string()) + delete error_; if (suffix_regexp_) suffix_regexp_->Decref(); if (entire_regexp_) entire_regexp_->Decref(); - delete prog_; - delete rprog_; - if (error_ != empty_string) - delete error_; - if (named_groups_ != NULL && named_groups_ != empty_named_groups) - delete named_groups_; - if (group_names_ != NULL && group_names_ != empty_group_names) - delete group_names_; + delete pattern_; } int RE2::ProgramSize() const { @@ -348,39 +384,39 @@ int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { // Returns named_groups_, computing it if needed. const std::map<std::string, int>& RE2::NamedCapturingGroups() const { - std::call_once(named_groups_once_, [](const RE2* re) { + absl::call_once(named_groups_once_, [](const RE2* re) { if (re->suffix_regexp_ != NULL) re->named_groups_ = re->suffix_regexp_->NamedCaptures(); if (re->named_groups_ == NULL) - re->named_groups_ = empty_named_groups; + re->named_groups_ = empty_named_groups(); }, this); return *named_groups_; } // Returns group_names_, computing it if needed. 
const std::map<int, std::string>& RE2::CapturingGroupNames() const { - std::call_once(group_names_once_, [](const RE2* re) { + absl::call_once(group_names_once_, [](const RE2* re) { if (re->suffix_regexp_ != NULL) re->group_names_ = re->suffix_regexp_->CaptureNames(); if (re->group_names_ == NULL) - re->group_names_ = empty_group_names; + re->group_names_ = empty_group_names(); }, this); return *group_names_; } /***** Convenience interfaces *****/ -bool RE2::FullMatchN(const StringPiece& text, const RE2& re, +bool RE2::FullMatchN(absl::string_view text, const RE2& re, const Arg* const args[], int n) { return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); } -bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, +bool RE2::PartialMatchN(absl::string_view text, const RE2& re, const Arg* const args[], int n) { return re.DoMatch(text, UNANCHORED, NULL, args, n); } -bool RE2::ConsumeN(StringPiece* input, const RE2& re, +bool RE2::ConsumeN(absl::string_view* input, const RE2& re, const Arg* const args[], int n) { size_t consumed; if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { @@ -391,7 +427,7 @@ bool RE2::ConsumeN(StringPiece* input, const RE2& re, } } -bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, +bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re, const Arg* const args[], int n) { size_t consumed; if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { @@ -404,12 +440,12 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, bool RE2::Replace(std::string* str, const RE2& re, - const StringPiece& rewrite) { - StringPiece vec[kVecSize]; + absl::string_view rewrite) { + absl::string_view vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) return false; - if (nvec > static_cast<int>(arraysize(vec))) + if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) return false; @@ -426,12 +462,12 @@ bool 
RE2::Replace(std::string* str, int RE2::GlobalReplace(std::string* str, const RE2& re, - const StringPiece& rewrite) { - StringPiece vec[kVecSize]; + absl::string_view rewrite) { + absl::string_view vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) return false; - if (nvec > static_cast<int>(arraysize(vec))) + if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) return false; const char* p = str->data(); @@ -439,13 +475,10 @@ int RE2::GlobalReplace(std::string* str, const char* lastend = NULL; std::string out; int count = 0; -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - // Iterate just once when fuzzing. Otherwise, we easily get bogged down - // and coverage is unlikely to improve despite significant expense. - while (p == str->data()) { -#else while (p <= ep) { -#endif + if (maximum_global_replace_count != -1 && + count >= maximum_global_replace_count) + break; if (!re.Match(*str, static_cast<size_t>(p - str->data()), str->size(), UNANCHORED, vec, nvec)) break; @@ -497,15 +530,15 @@ int RE2::GlobalReplace(std::string* str, return count; } -bool RE2::Extract(const StringPiece& text, +bool RE2::Extract(absl::string_view text, const RE2& re, - const StringPiece& rewrite, + absl::string_view rewrite, std::string* out) { - StringPiece vec[kVecSize]; + absl::string_view vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); if (nvec > 1 + re.NumberOfCapturingGroups()) return false; - if (nvec > static_cast<int>(arraysize(vec))) + if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) return false; if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) return false; @@ -514,7 +547,7 @@ bool RE2::Extract(const StringPiece& text, return re.Rewrite(out, rewrite, vec, nvec); } -std::string RE2::QuoteMeta(const StringPiece& unquoted) { +std::string RE2::QuoteMeta(absl::string_view unquoted) { std::string result; result.reserve(unquoted.size() << 1); @@ -613,11 +646,11 @@ static int ascii_strcasecmp(const char* a, const char* b, 
size_t len) { /***** Actual matching and rewriting code *****/ -bool RE2::Match(const StringPiece& text, +bool RE2::Match(absl::string_view text, size_t startpos, size_t endpos, Anchor re_anchor, - StringPiece* submatch, + absl::string_view* submatch, int nsubmatch) const { if (!ok()) { if (options_.log_errors()) @@ -634,7 +667,7 @@ bool RE2::Match(const StringPiece& text, return false; } - StringPiece subtext = text; + absl::string_view subtext = text; subtext.remove_prefix(startpos); subtext.remove_suffix(text.size() - endpos); @@ -642,8 +675,8 @@ bool RE2::Match(const StringPiece& text, // Don't ask for the location if we won't use it. // SearchDFA can do extra optimizations in that case. - StringPiece match; - StringPiece* matchp = &match; + absl::string_view match; + absl::string_view* matchp = &match; if (nsubmatch == 0) matchp = NULL; @@ -686,9 +719,8 @@ bool RE2::Match(const StringPiece& text, } Prog::Anchor anchor = Prog::kUnanchored; - Prog::MatchKind kind = Prog::kFirstMatch; - if (options_.longest_match()) - kind = Prog::kLongestMatch; + Prog::MatchKind kind = + longest_match_ ? 
Prog::kLongestMatch : Prog::kFirstMatch; bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; bool can_bit_state = prog_->CanBitState(); @@ -720,7 +752,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog->size() << ", " << "list count " << prog->list_count() << ", " << "bytemap range " << prog->bytemap_range(); @@ -740,7 +772,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog_->size() << ", " << "list count " << prog_->list_count() << ", " << "bytemap range " << prog_->bytemap_range(); @@ -766,7 +798,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog->size() << ", " << "list count " << prog->list_count() << ", " << "bytemap range " << prog->bytemap_range(); @@ -809,7 +841,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog_->size() << ", " << "list count " << prog_->list_count() << ", " << "bytemap range " << prog_->bytemap_range(); @@ -827,7 +859,7 @@ bool RE2::Match(const StringPiece& text, if (ncap == 1) submatch[0] = match; } else { - StringPiece subtext1; + absl::string_view subtext1; if (skipped_test) { // DFA ran out of memory or was skipped: // need to search in entire original text. 
@@ -865,17 +897,17 @@ bool RE2::Match(const StringPiece& text, // Adjust overall match for required prefix that we stripped off. if (prefixlen > 0 && nsubmatch > 0) - submatch[0] = StringPiece(submatch[0].data() - prefixlen, - submatch[0].size() + prefixlen); + submatch[0] = absl::string_view(submatch[0].data() - prefixlen, + submatch[0].size() + prefixlen); // Zero submatches that don't exist in the regexp. for (int i = ncap; i < nsubmatch; i++) - submatch[i] = StringPiece(); + submatch[i] = absl::string_view(); return true; } -// Internal matcher - like Match() but takes Args not StringPieces. -bool RE2::DoMatch(const StringPiece& text, +// Internal matcher - like Match() but takes Args not string_views. +bool RE2::DoMatch(absl::string_view text, Anchor re_anchor, size_t* consumed, const Arg* const* args, @@ -898,19 +930,10 @@ bool RE2::DoMatch(const StringPiece& text, else nvec = n+1; - StringPiece* vec; - StringPiece stkvec[kVecSize]; - StringPiece* heapvec = NULL; - - if (nvec <= static_cast<int>(arraysize(stkvec))) { - vec = stkvec; - } else { - vec = new StringPiece[nvec]; - heapvec = vec; - } + absl::FixedArray<absl::string_view, kVecSize> vec_storage(nvec); + absl::string_view* vec = vec_storage.data(); if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { - delete[] heapvec; return false; } @@ -919,27 +942,24 @@ bool RE2::DoMatch(const StringPiece& text, if (n == 0 || args == NULL) { // We are not interested in results - delete[] heapvec; return true; } // If we got here, we must have matched the whole pattern. for (int i = 0; i < n; i++) { - const StringPiece& s = vec[i+1]; + absl::string_view s = vec[i+1]; if (!args[i]->Parse(s.data(), s.size())) { // TODO: Should we indicate what the error was? - delete[] heapvec; return false; } } - delete[] heapvec; return true; } // Checks that the rewrite string is well-formed with respect to this // regular expression. 
-bool RE2::CheckRewriteString(const StringPiece& rewrite, +bool RE2::CheckRewriteString(absl::string_view rewrite, std::string* error) const { int max_token = -1; for (const char *s = rewrite.data(), *end = s + rewrite.size(); @@ -956,7 +976,7 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, if (c == '\\') { continue; } - if (!isdigit(c)) { + if (!absl::ascii_isdigit(c)) { *error = "Rewrite schema error: " "'\\' must be followed by a digit or '\\'."; return false; @@ -968,7 +988,7 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, } if (max_token > NumberOfCapturingGroups()) { - *error = StringPrintf( + *error = absl::StrFormat( "Rewrite schema requests %d matches, but the regexp only has %d " "parenthesized subexpressions.", max_token, NumberOfCapturingGroups()); @@ -979,14 +999,14 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, // Returns the maximum submatch needed for the rewrite to be done by Replace(). // E.g. if rewrite == "foo \\2,\\1", returns 2. -int RE2::MaxSubmatch(const StringPiece& rewrite) { +int RE2::MaxSubmatch(absl::string_view rewrite) { int max = 0; for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { if (*s == '\\') { s++; int c = (s < end) ? *s : -1; - if (isdigit(c)) { + if (absl::ascii_isdigit(c)) { int n = (c - '0'); if (n > max) max = n; @@ -996,11 +1016,11 @@ int RE2::MaxSubmatch(const StringPiece& rewrite) { return max; } -// Append the "rewrite" string, with backslash subsitutions from "vec", +// Append the "rewrite" string, with backslash substitutions from "vec", // to string "out". bool RE2::Rewrite(std::string* out, - const StringPiece& rewrite, - const StringPiece* vec, + absl::string_view rewrite, + const absl::string_view* vec, int veclen) const { for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { @@ -1010,7 +1030,7 @@ bool RE2::Rewrite(std::string* out, } s++; int c = (s < end) ? 
*s : -1; - if (isdigit(c)) { + if (absl::ascii_isdigit(c)) { int n = (c - '0'); if (n >= veclen) { if (options_.log_errors()) { @@ -1019,7 +1039,7 @@ bool RE2::Rewrite(std::string* out, } return false; } - StringPiece snip = vec[n]; + absl::string_view snip = vec[n]; if (!snip.empty()) out->append(snip.data(), snip.size()); } else if (c == '\\') { @@ -1051,9 +1071,9 @@ bool Parse(const char* str, size_t n, std::string* dest) { } template <> -bool Parse(const char* str, size_t n, StringPiece* dest) { +bool Parse(const char* str, size_t n, absl::string_view* dest) { if (dest == NULL) return true; - *dest = StringPiece(str, n); + *dest = absl::string_view(str, n); return true; } @@ -1091,13 +1111,13 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, size_t* np, bool accept_spaces) { size_t n = *np; if (n == 0) return ""; - if (n > 0 && isspace(*str)) { + if (n > 0 && absl::ascii_isspace(*str)) { // We are less forgiving than the strtoxxx() routines and do not // allow leading spaces. We do allow leading spaces for floats. if (!accept_spaces) { return ""; } - while (n > 0 && isspace(*str)) { + while (n > 0 && absl::ascii_isspace(*str)) { n--; str++; } @@ -66,17 +66,17 @@ // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); // // ----------------------------------------------------------------------- -// MATCHING WITH SUBSTRING EXTRACTION: +// SUBMATCH EXTRACTION: // -// You can supply extra pointer arguments to extract matched substrings. +// You can supply extra pointer arguments to extract submatches. // On match failure, none of the pointees will have been modified. -// On match success, the substrings will be converted (as necessary) and +// On match success, the submatches will be converted (as necessary) and // their values will be assigned to their pointees until all conversions // have succeeded or one conversion has failed. 
// On conversion failure, the pointees will be in an indeterminate state // because the caller has no way of knowing which conversion failed. -// However, conversion cannot fail for types like string and StringPiece -// that do not inspect the substring contents. Hence, in the common case +// However, conversion cannot fail for types like string and string_view +// that do not inspect the submatch contents. Hence, in the common case // where all of the pointees are of such types, failure is always due to // match failure and thus none of the pointees will have been modified. // @@ -85,6 +85,11 @@ // std::string s; // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); // +// Example: extracts "ruby" into "s" and no value into "i" +// absl::optional<int> i; +// std::string s; +// CHECK(RE2::FullMatch("ruby", "(\\w+)(?::(\\d+))?", &s, &i)); +// // Example: fails because string cannot be stored in integer // CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); // @@ -100,10 +105,10 @@ // Example: integer overflow causes failure // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); // -// NOTE(rsc): Asking for substrings slows successful matches quite a bit. +// NOTE(rsc): Asking for submatches slows successful matches quite a bit. // This may get a little faster in the future, but right now is slower // than PCRE. On the other hand, failed matches run *very* fast (faster -// than PCRE), as do matches without substring extraction. +// than PCRE), as do matches without submatch extraction. // // ----------------------------------------------------------------------- // PARTIAL MATCHES @@ -140,12 +145,12 @@ // // The "Consume" operation may be useful if you want to repeatedly // match regular expressions at the front of a string and skip over -// them as they match. This requires use of the "StringPiece" type, +// them as they match. This requires use of the string_view type, // which represents a sub-range of a real string. 
// // Example: read lines of the form "var = value" from a string. -// std::string contents = ...; // Fill string somehow -// StringPiece input(contents); // Wrap a StringPiece around it +// std::string contents = ...; // Fill string somehow +// absl::string_view input(contents); // Wrap a string_view around it // // std::string var; // int value; @@ -206,7 +211,6 @@ #include <stdint.h> #include <algorithm> #include <map> -#include <mutex> #include <string> #include <type_traits> #include <vector> @@ -215,6 +219,9 @@ #include <TargetConditionals.h> #endif +#include "absl/base/call_once.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "re2/stringpiece.h" namespace re2 { @@ -273,22 +280,34 @@ class RE2 { // Need to have the const char* and const std::string& forms for implicit // conversions when passing string literals to FullMatch and PartialMatch. - // Otherwise the StringPiece form would be sufficient. -#ifndef SWIG + // Otherwise the absl::string_view form would be sufficient. RE2(const char* pattern); RE2(const std::string& pattern); -#endif - RE2(const StringPiece& pattern); - RE2(const StringPiece& pattern, const Options& options); + RE2(absl::string_view pattern); + RE2(absl::string_view pattern, const Options& options); ~RE2(); + // Not copyable. + // RE2 objects are expensive. You should probably use std::shared_ptr<RE2> + // instead. If you really must copy, RE2(first.pattern(), first.options()) + // effectively does so: it produces a second object that mimics the first. + RE2(const RE2&) = delete; + RE2& operator=(const RE2&) = delete; + // Not movable. + // RE2 objects are thread-safe and logically immutable. You should probably + // use std::unique_ptr<RE2> instead. Otherwise, consider std::deque<RE2> if + // direct emplacement into a container is desired. If you really must move, + // be prepared to submit a design document along with your feature request. 
+ RE2(RE2&&) = delete; + RE2& operator=(RE2&&) = delete; + // Returns whether RE2 was created properly. bool ok() const { return error_code() == NoError; } // The string specification for this RE2. E.g. // RE2 re("ab*c?d+"); // re.pattern(); // "ab*c?d+" - const std::string& pattern() const { return pattern_; } + const std::string& pattern() const { return *pattern_; } // If RE2 could not be created properly, returns an error string. // Else returns the empty string. @@ -300,7 +319,7 @@ class RE2 { // If RE2 could not be created properly, returns the offending // portion of the regexp. - const std::string& error_arg() const { return error_arg_; } + const std::string& error_arg() const { return *error_arg_; } // Returns the program size, a very approximate measure of a regexp's "cost". // Larger numbers are more expensive than smaller numbers. @@ -324,16 +343,15 @@ class RE2 { // the functions whose names are the prefix before the 'N'. It is sometimes // useful to invoke them directly, but the syntax is awkward, so the 'N'-less // versions should be preferred. 
- static bool FullMatchN(const StringPiece& text, const RE2& re, + static bool FullMatchN(absl::string_view text, const RE2& re, const Arg* const args[], int n); - static bool PartialMatchN(const StringPiece& text, const RE2& re, + static bool PartialMatchN(absl::string_view text, const RE2& re, const Arg* const args[], int n); - static bool ConsumeN(StringPiece* input, const RE2& re, + static bool ConsumeN(absl::string_view* input, const RE2& re, const Arg* const args[], int n); - static bool FindAndConsumeN(StringPiece* input, const RE2& re, + static bool FindAndConsumeN(absl::string_view* input, const RE2& re, const Arg* const args[], int n); -#ifndef SWIG private: template <typename F, typename SP> static inline bool Apply(F f, SP sp, const RE2& re) { @@ -363,10 +381,11 @@ class RE2 { // // The provided pointer arguments can be pointers to any scalar numeric // type, or one of: - // std::string (matched piece is copied to string) - // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, size_t)" exists) - // (void*)NULL (the corresponding matched sub-pattern is not copied) + // std::string (matched piece is copied to string) + // absl::string_view (string_view is mutated to point to matched piece) + // absl::optional<T> (T is a supported numeric or string type as above) + // T ("bool T::ParseFrom(const char*, size_t)" must exist) + // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: // a. "text" matches "re" fully - from the beginning to the end of "text". @@ -378,13 +397,16 @@ class RE2 { // ignored. // // CAVEAT: An optional sub-pattern that does not exist in the - // matched string is assigned the empty string. Therefore, the - // following will return false (because the empty string is not a - // valid number): + // matched string is assigned the null string. 
Therefore, the + // following returns false because the null string - absence of + // a string (not even the empty string) - is not a valid number: + // // int number; // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); + // + // Use absl::optional<int> instead to handle this case correctly. template <typename... A> - static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) { + static bool FullMatch(absl::string_view text, const RE2& re, A&&... a) { return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...); } @@ -400,7 +422,7 @@ class RE2 { // number of sub-patterns, the "i"th captured sub-pattern is // ignored. template <typename... A> - static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { + static bool PartialMatch(absl::string_view text, const RE2& re, A&&... a) { return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...); } @@ -418,7 +440,7 @@ class RE2 { // number of sub-patterns, the "i"th captured sub-pattern is // ignored. template <typename... A> - static bool Consume(StringPiece* input, const RE2& re, A&&... a) { + static bool Consume(absl::string_view* input, const RE2& re, A&&... a) { return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...); } @@ -436,10 +458,9 @@ class RE2 { // number of sub-patterns, the "i"th captured sub-pattern is // ignored. template <typename... A> - static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { + static bool FindAndConsume(absl::string_view* input, const RE2& re, A&&... a) { return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...); } -#endif // Replace the first match of "re" in "str" with "rewrite". // Within "rewrite", backslash-escaped digits (\1 to \9) can be @@ -456,7 +477,7 @@ class RE2 { // false otherwise. 
static bool Replace(std::string* str, const RE2& re, - const StringPiece& rewrite); + absl::string_view rewrite); // Like Replace(), except replaces successive non-overlapping occurrences // of the pattern in the string with the rewrite. E.g. @@ -473,7 +494,7 @@ class RE2 { // Returns the number of replacements made. static int GlobalReplace(std::string* str, const RE2& re, - const StringPiece& rewrite); + absl::string_view rewrite); // Like Replace, except that if the pattern matches, "rewrite" // is copied into "out" with substitutions. The non-matching @@ -483,9 +504,9 @@ class RE2 { // successfully; if no match occurs, the string is left unaffected. // // REQUIRES: "text" must not alias any part of "*out". - static bool Extract(const StringPiece& text, + static bool Extract(absl::string_view text, const RE2& re, - const StringPiece& rewrite, + absl::string_view rewrite, std::string* out); // Escapes all potentially meaningful regexp characters in @@ -494,7 +515,7 @@ class RE2 { // 1.5-2.0? // may become: // 1\.5\-2\.0\? - static std::string QuoteMeta(const StringPiece& unquoted); + static std::string QuoteMeta(absl::string_view unquoted); // Computes range for any strings matching regexp. The min and max can in // some cases be arbitrarily precise, so the caller gets to specify the @@ -522,7 +543,7 @@ class RE2 { ANCHOR_BOTH // Anchor at start and end }; - // Return the number of capturing subpatterns, or -1 if the + // Return the number of capturing sub-patterns, or -1 if the // regexp wasn't valid on construction. The overall match ($0) // does not count: if the regexp is "(a)(b)", returns 2. int NumberOfCapturingGroups() const { return num_captures_; } @@ -555,15 +576,15 @@ class RE2 { // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(), // but will be handled correctly. 
// - // Passing text == StringPiece(NULL, 0) will be handled like any other + // Passing text == absl::string_view() will be handled like any other // empty string, but note that on return, it will not be possible to tell // whether submatch i matched the empty string or did not match: // either way, submatch[i].data() == NULL. - bool Match(const StringPiece& text, + bool Match(absl::string_view text, size_t startpos, size_t endpos, Anchor re_anchor, - StringPiece* submatch, + absl::string_view* submatch, int nsubmatch) const; // Check that the given rewrite string is suitable for use with this @@ -574,21 +595,21 @@ class RE2 { // '\' followed by anything other than a digit or '\'. // A true return value guarantees that Replace() and Extract() won't // fail because of a bad rewrite string. - bool CheckRewriteString(const StringPiece& rewrite, + bool CheckRewriteString(absl::string_view rewrite, std::string* error) const; // Returns the maximum submatch needed for the rewrite to be done by // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. - static int MaxSubmatch(const StringPiece& rewrite); + static int MaxSubmatch(absl::string_view rewrite); - // Append the "rewrite" string, with backslash subsitutions from "vec", + // Append the "rewrite" string, with backslash substitutions from "vec", // to string "out". // Returns true on success. This method can fail because of a malformed // rewrite string. CheckRewriteString guarantees that the rewrite will // be sucessful. 
bool Rewrite(std::string* out, - const StringPiece& rewrite, - const StringPiece* vec, + absl::string_view rewrite, + const absl::string_view* vec, int veclen) const; // Constructor options @@ -653,11 +674,11 @@ class RE2 { }; Options() : + max_mem_(kDefaultMaxMem), encoding_(EncodingUTF8), posix_syntax_(false), longest_match_(false), log_errors_(true), - max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false), dot_nl_(false), @@ -670,6 +691,9 @@ class RE2 { /*implicit*/ Options(CannedOptions); + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } + Encoding encoding() const { return encoding_; } void set_encoding(Encoding encoding) { encoding_ = encoding; } @@ -682,9 +706,6 @@ class RE2 { bool log_errors() const { return log_errors_; } void set_log_errors(bool b) { log_errors_ = b; } - int64_t max_mem() const { return max_mem_; } - void set_max_mem(int64_t m) { max_mem_ = m; } - bool literal() const { return literal_; } void set_literal(bool b) { literal_ = b; } @@ -716,11 +737,11 @@ class RE2 { int ParseFlags() const; private: + int64_t max_mem_; Encoding encoding_; bool posix_syntax_; bool longest_match_; bool log_errors_; - int64_t max_mem_; bool literal_; bool never_nl_; bool dot_nl_; @@ -742,10 +763,14 @@ class RE2 { template <typename T> static Arg Octal(T* ptr); + // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited. + // FOR FUZZING ONLY. 
+ static void FUZZING_ONLY_set_maximum_global_replace_count(int i); + private: - void Init(const StringPiece& pattern, const Options& options); + void Init(absl::string_view pattern, const Options& options); - bool DoMatch(const StringPiece& text, + bool DoMatch(absl::string_view text, Anchor re_anchor, size_t* consumed, const Arg* const args[], @@ -753,18 +778,23 @@ class RE2 { re2::Prog* ReverseProg() const; - std::string pattern_; // string regular expression - Options options_; // option flags - re2::Regexp* entire_regexp_; // parsed regular expression - const std::string* error_; // error indicator (or points to empty string) - ErrorCode error_code_; // error code - std::string error_arg_; // fragment of regexp showing error - std::string prefix_; // required prefix (before suffix_regexp_) - bool prefix_foldcase_; // prefix_ is ASCII case-insensitive - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed - re2::Prog* prog_; // compiled program for regexp - int num_captures_; // number of capturing groups - bool is_one_pass_; // can use prog_->SearchOnePass? + // First cache line is relatively cold fields. + const std::string* pattern_; // string regular expression + Options options_; // option flags + re2::Regexp* entire_regexp_; // parsed regular expression + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed + const std::string* error_; // error indicator (or points to empty string) + const std::string* error_arg_; // fragment of regexp showing error (or ditto) + + // Second cache line is relatively hot fields. + // These are ordered oddly to pack everything. + int num_captures_; // number of capturing groups + ErrorCode error_code_ : 29; // error code (29 bits is more than enough) + bool longest_match_ : 1; // cached copy of options_.longest_match() + bool is_one_pass_ : 1; // can use prog_->SearchOnePass? 
+ bool prefix_foldcase_ : 1; // prefix_ is ASCII case-insensitive + std::string prefix_; // required prefix (before suffix_regexp_) + re2::Prog* prog_; // compiled program for regexp // Reverse Prog for DFA execution only mutable re2::Prog* rprog_; @@ -773,12 +803,9 @@ class RE2 { // Map from capture indices to names mutable const std::map<int, std::string>* group_names_; - mutable std::once_flag rprog_once_; - mutable std::once_flag named_groups_once_; - mutable std::once_flag group_names_once_; - - RE2(const RE2&) = delete; - RE2& operator=(const RE2&) = delete; + mutable absl::once_flag rprog_once_; + mutable absl::once_flag named_groups_once_; + mutable absl::once_flag group_names_once_; }; /***** Implementation details *****/ @@ -789,7 +816,7 @@ namespace re2_internal { template <typename T> struct Parse3ary : public std::false_type {}; template <> struct Parse3ary<void> : public std::true_type {}; template <> struct Parse3ary<std::string> : public std::true_type {}; -template <> struct Parse3ary<StringPiece> : public std::true_type {}; +template <> struct Parse3ary<absl::string_view> : public std::true_type {}; template <> struct Parse3ary<char> : public std::true_type {}; template <> struct Parse3ary<signed char> : public std::true_type {}; template <> struct Parse3ary<unsigned char> : public std::true_type {}; @@ -813,6 +840,42 @@ template <> struct Parse4ary<unsigned long long> : public std::true_type {}; template <typename T> bool Parse(const char* str, size_t n, T* dest, int radix); +// Support absl::optional<T> for all T with a stock parser. 
+template <typename T> struct Parse3ary<absl::optional<T>> : public Parse3ary<T> {}; +template <typename T> struct Parse4ary<absl::optional<T>> : public Parse4ary<T> {}; + +template <typename T> +bool Parse(const char* str, size_t n, absl::optional<T>* dest) { + if (str == NULL) { + if (dest != NULL) + dest->reset(); + return true; + } + T tmp; + if (Parse(str, n, &tmp)) { + if (dest != NULL) + dest->emplace(std::move(tmp)); + return true; + } + return false; +} + +template <typename T> +bool Parse(const char* str, size_t n, absl::optional<T>* dest, int radix) { + if (str == NULL) { + if (dest != NULL) + dest->reset(); + return true; + } + T tmp; + if (Parse(str, n, &tmp, radix)) { + if (dest != NULL) + dest->emplace(std::move(tmp)); + return true; + } + return false; +} + } // namespace re2_internal class RE2::Arg { @@ -908,9 +971,8 @@ inline RE2::Arg RE2::Octal(T* ptr) { }); } -#ifndef SWIG // Silence warnings about missing initializers for members of LazyRE2. -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif @@ -940,7 +1002,7 @@ class LazyRE2 { // Named accessor/initializer: RE2* get() const { - std::call_once(once_, &LazyRE2::Init, this); + absl::call_once(once_, &LazyRE2::Init, this); return ptr_; } @@ -950,7 +1012,7 @@ class LazyRE2 { NoArg barrier_against_excess_initializers_; mutable RE2* ptr_; - mutable std::once_flag once_; + mutable absl::once_flag once_; private: static void Init(const LazyRE2* lazy_re2) { @@ -959,7 +1021,6 @@ class LazyRE2 { void operator=(const LazyRE2&); // disallowed }; -#endif namespace hooks { diff --git a/re2/regexp.cc b/re2/regexp.cc index ca1318b..4ea81cf 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -12,16 +12,16 @@ #include <string.h> #include <algorithm> #include <map> -#include <mutex> #include <string> #include <vector> -#include "util/util.h" +#include "absl/base/call_once.h" +#include 
"absl/base/macros.h" +#include "absl/container/flat_hash_map.h" +#include "absl/synchronization/mutex.h" #include "util/logging.h" -#include "util/mutex.h" #include "util/utf.h" #include "re2/pod_array.h" -#include "re2/stringpiece.h" #include "re2/walker-inl.h" namespace re2 { @@ -74,35 +74,45 @@ bool Regexp::QuickDestroy() { return false; } -// Lazily allocated. -static Mutex* ref_mutex; -static std::map<Regexp*, int>* ref_map; +// Similar to EmptyStorage in re2.cc. +struct RefStorage { + absl::Mutex ref_mutex; + absl::flat_hash_map<Regexp*, int> ref_map; +}; +alignas(RefStorage) static char ref_storage[sizeof(RefStorage)]; + +static inline absl::Mutex* ref_mutex() { + return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex; +} + +static inline absl::flat_hash_map<Regexp*, int>* ref_map() { + return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map; +} int Regexp::Ref() { if (ref_ < kMaxRef) return ref_; - MutexLock l(ref_mutex); - return (*ref_map)[this]; + absl::MutexLock l(ref_mutex()); + return (*ref_map())[this]; } // Increments reference count, returns object as convenience. Regexp* Regexp::Incref() { if (ref_ >= kMaxRef-1) { - static std::once_flag ref_once; - std::call_once(ref_once, []() { - ref_mutex = new Mutex; - ref_map = new std::map<Regexp*, int>; + static absl::once_flag ref_once; + absl::call_once(ref_once, []() { + (void) new (ref_storage) RefStorage; }); // Store ref count in overflow map. - MutexLock l(ref_mutex); + absl::MutexLock l(ref_mutex()); if (ref_ == kMaxRef) { // already overflowed - (*ref_map)[this]++; + (*ref_map())[this]++; } else { // overflowing now - (*ref_map)[this] = kMaxRef; + (*ref_map())[this] = kMaxRef; ref_ = kMaxRef; } return this; @@ -116,13 +126,13 @@ Regexp* Regexp::Incref() { void Regexp::Decref() { if (ref_ == kMaxRef) { // Ref count is stored in overflow map. 
- MutexLock l(ref_mutex); - int r = (*ref_map)[this] - 1; + absl::MutexLock l(ref_mutex()); + int r = (*ref_map())[this] - 1; if (r < kMaxRef) { ref_ = static_cast<uint16_t>(r); - ref_map->erase(this); + ref_map()->erase(this); } else { - (*ref_map)[this] = r; + (*ref_map())[this] = r; } return; } @@ -390,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) { a->max() == b->max(); case kRegexpCapture: - return a->cap() == b->cap() && a->name() == b->name(); + if (a->name() == NULL || b->name() == NULL) { + // One pointer is null, so the other pointer should also be null. + return a->cap() == b->cap() && a->name() == b->name(); + } else { + // Neither pointer is null, so compare the pointees for equality. + return a->cap() == b->cap() && *a->name() == *b->name(); + } case kRegexpHaveMatch: return a->match_id() == b->match_id(); @@ -509,7 +525,7 @@ static const char *kErrorStrings[] = { }; std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { - if (code < 0 || code >= arraysize(kErrorStrings)) + if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings)) code = kRegexpInternalError; return kErrorStrings[code]; } diff --git a/re2/regexp.h b/re2/regexp.h index b6446f9..df49894 100644 --- a/re2/regexp.h +++ b/re2/regexp.h @@ -92,10 +92,9 @@ #include <set> #include <string> -#include "util/util.h" +#include "absl/strings/string_view.h" #include "util/logging.h" #include "util/utf.h" -#include "re2/stringpiece.h" namespace re2 { @@ -195,10 +194,10 @@ class RegexpStatus { ~RegexpStatus() { delete tmp_; } void set_code(RegexpStatusCode code) { code_ = code; } - void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } + void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; } void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; } RegexpStatusCode code() const { return code_; } - const StringPiece& error_arg() const { return error_arg_; } + absl::string_view error_arg() const { return error_arg_; } bool ok() const { return 
code() == kRegexpSuccess; } // Copies state from status. @@ -213,9 +212,9 @@ class RegexpStatus { std::string Text() const; private: - RegexpStatusCode code_; // Kind of error - StringPiece error_arg_; // Piece of regexp containing syntax error. - std::string* tmp_; // Temporary storage, possibly where error_arg_ is. + RegexpStatusCode code_; // Kind of error. + absl::string_view error_arg_; // Piece of regexp containing syntax error. + std::string* tmp_; // Temporary storage, possibly for error_arg_. RegexpStatus(const RegexpStatus&) = delete; RegexpStatus& operator=(const RegexpStatus&) = delete; @@ -352,7 +351,7 @@ class Regexp { // Parses string s to produce regular expression, returned. // Caller must release return value with re->Decref(). // On failure, sets *status (if status != NULL) and returns NULL. - static Regexp* Parse(const StringPiece& s, ParseFlags flags, + static Regexp* Parse(absl::string_view s, ParseFlags flags, RegexpStatus* status); // Returns a _new_ simplified version of the current regexp. @@ -369,7 +368,7 @@ class Regexp { // Parses the regexp src and then simplifies it and sets *dst to the // string representation of the simplified form. Returns true on success. // Returns false and sets *status (if status != NULL) on parse error. - static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags, + static bool SimplifyRegexp(absl::string_view src, ParseFlags flags, std::string* dst, RegexpStatus* status); // Returns the number of capturing groups in the regexp. @@ -467,7 +466,7 @@ class Regexp { class ParseState; friend class ParseState; - friend bool ParseCharClass(StringPiece* s, Regexp** out_re, + friend bool ParseCharClass(absl::string_view* s, Regexp** out_re, RegexpStatus* status); // Helper for testing [sic]. 
@@ -9,13 +9,11 @@ #include <memory> #include <utility> -#include "util/util.h" #include "util/logging.h" #include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" -#include "re2/stringpiece.h" namespace re2 { @@ -52,7 +50,7 @@ RE2::Set& RE2::Set::operator=(Set&& other) { return *this; } -int RE2::Set::Add(const StringPiece& pattern, std::string* error) { +int RE2::Set::Add(absl::string_view pattern, std::string* error) { if (compiled_) { LOG(DFATAL) << "RE2::Set::Add() called after compiling"; return -1; @@ -121,16 +119,16 @@ bool RE2::Set::Compile() { return prog_ != nullptr; } -bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { +bool RE2::Set::Match(absl::string_view text, std::vector<int>* v) const { return Match(text, v, NULL); } -bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, +bool RE2::Set::Match(absl::string_view text, std::vector<int>* v, ErrorInfo* error_info) const { if (!compiled_) { - LOG(DFATAL) << "RE2::Set::Match() called before compiling"; if (error_info != NULL) error_info->kind = kNotCompiled; + LOG(DFATAL) << "RE2::Set::Match() called before compiling"; return false; } #ifdef RE2_HAVE_THREAD_LOCAL @@ -161,9 +159,9 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, } if (v != NULL) { if (matches->empty()) { - LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; if (error_info != NULL) error_info->kind = kInconsistent; + LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; return false; } v->assign(matches->begin(), matches->end()); @@ -10,6 +10,7 @@ #include <utility> #include <vector> +#include "absl/strings/string_view.h" #include "re2/re2.h" namespace re2 { @@ -50,7 +51,7 @@ class RE2::Set { // Indices are assigned in sequential order starting from 0. // Errors do not increment the index; if error is not NULL, *error will hold // the error message from the parser. 
- int Add(const StringPiece& pattern, std::string* error); + int Add(absl::string_view pattern, std::string* error); // Compiles the set in preparation for matching. // Returns false if the compiler runs out of memory. @@ -61,12 +62,12 @@ class RE2::Set { // Returns true if text matches at least one of the regexps in the set. // Fills v (if not NULL) with the indices of the matching regexps. // Callers must not expect v to be sorted. - bool Match(const StringPiece& text, std::vector<int>* v) const; + bool Match(absl::string_view text, std::vector<int>* v) const; // As above, but populates error_info (if not NULL) when none of the regexps // in the set matched. This can inform callers when DFA execution fails, for // example, because they might wish to handle that case differently. - bool Match(const StringPiece& text, std::vector<int>* v, + bool Match(absl::string_view text, std::vector<int>* v, ErrorInfo* error_info) const; private: diff --git a/re2/simplify.cc b/re2/simplify.cc index 663d5fc..cea100b 100644 --- a/re2/simplify.cc +++ b/re2/simplify.cc @@ -6,9 +6,9 @@ // to use simple extended regular expression features. // Also sort and simplify character classes. +#include <algorithm> #include <string> -#include "util/util.h" #include "util/logging.h" #include "util/utf.h" #include "re2/pod_array.h" @@ -20,7 +20,7 @@ namespace re2 { // Parses the regexp src and then simplifies it and sets *dst to the // string representation of the simplified form. Returns true on success. // Returns false and sets *error (if error != NULL) on error. 
-bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, +bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags, std::string* dst, RegexpStatus* status) { Regexp* re = Parse(src, flags, status); if (re == NULL) @@ -371,8 +371,8 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { break; default: - LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); nre->Decref(); + LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); return; } @@ -432,8 +432,8 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { } default: - LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); nre->Decref(); + LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); return; } @@ -580,6 +580,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, return re; } +// Returns true if re is an empty-width op. +static bool IsEmptyOp(Regexp* re) { + return (re->op() == kRegexpBeginLine || + re->op() == kRegexpEndLine || + re->op() == kRegexpWordBoundary || + re->op() == kRegexpNoWordBoundary || + re->op() == kRegexpBeginText || + re->op() == kRegexpEndText); +} + // Simplifies the expression re{min,max} in terms of *, +, and ?. // Returns a new regexp. Does not edit re. Does not consume reference to re. // Caller must Decref return value when done with it. @@ -588,6 +598,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, // but in the Regexp* representation, both (x) are marked as $1. Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags f) { + // For an empty-width op OR a concatenation or alternation of empty-width + // ops, cap the repetition count at 1. + if (IsEmptyOp(re) || + ((re->op() == kRegexpConcat || + re->op() == kRegexpAlternate) && + std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) { + min = std::min(min, 1); + max = std::min(max, 1); + } + // x{n,} means at least n matches of x. 
if (max == -1) { // Special case: x{0,} is x* diff --git a/re2/stringpiece.cc b/re2/stringpiece.cc deleted file mode 100644 index ef2e287..0000000 --- a/re2/stringpiece.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2004 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "re2/stringpiece.h" - -#include <ostream> - -#include "util/util.h" - -namespace re2 { - -const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h - -StringPiece::size_type StringPiece::copy(char* buf, size_type n, - size_type pos) const { - size_type ret = std::min(size_ - pos, n); - memcpy(buf, data_ + pos, ret); - return ret; -} - -StringPiece StringPiece::substr(size_type pos, size_type n) const { - if (pos > size_) pos = size_; - if (n > size_ - pos) n = size_ - pos; - return StringPiece(data_ + pos, n); -} - -StringPiece::size_type StringPiece::find(const StringPiece& s, - size_type pos) const { - if (pos > size_) return npos; - const_pointer result = std::search(data_ + pos, data_ + size_, - s.data_, s.data_ + s.size_); - size_type xpos = result - data_; - return xpos + s.size_ <= size_ ? xpos : npos; -} - -StringPiece::size_type StringPiece::find(char c, size_type pos) const { - if (size_ <= 0 || pos >= size_) return npos; - const_pointer result = std::find(data_ + pos, data_ + size_, c); - return result != data_ + size_ ? result - data_ : npos; -} - -StringPiece::size_type StringPiece::rfind(const StringPiece& s, - size_type pos) const { - if (size_ < s.size_) return npos; - if (s.size_ == 0) return std::min(size_, pos); - const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_; - const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_); - return result != last ? 
result - data_ : npos; -} - -StringPiece::size_type StringPiece::rfind(char c, size_type pos) const { - if (size_ <= 0) return npos; - for (size_t i = std::min(pos + 1, size_); i != 0;) { - if (data_[--i] == c) return i; - } - return npos; -} - -std::ostream& operator<<(std::ostream& o, const StringPiece& p) { - o.write(p.data(), p.size()); - return o; -} - -} // namespace re2 diff --git a/re2/stringpiece.h b/re2/stringpiece.h index 1d9c2d3..e9367bf 100644 --- a/re2/stringpiece.h +++ b/re2/stringpiece.h @@ -1,209 +1,17 @@ -// Copyright 2001-2010 The RE2 Authors. All Rights Reserved. +// Copyright 2022 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #ifndef RE2_STRINGPIECE_H_ #define RE2_STRINGPIECE_H_ -// A string-like object that points to a sized piece of memory. -// -// Functions or methods may use const StringPiece& parameters to accept either -// a "const char*" or a "string" value that will be implicitly converted to -// a StringPiece. The implicit conversion means that it is often appropriate -// to include this .h file in other files rather than forward-declaring -// StringPiece as would be appropriate for most other Google classes. -// -// Systematic usage of StringPiece is encouraged as it will reduce unnecessary -// conversions from "const char*" to "string" and back again. -// -// -// Arghh! I wish C++ literals were "string". - -// Doing this simplifies the logic below. 
-#ifndef __has_include -#define __has_include(x) 0 -#endif - -#include <stddef.h> -#include <string.h> -#include <algorithm> -#include <iosfwd> -#include <iterator> -#include <string> -#if __has_include(<string_view>) && __cplusplus >= 201703L -#include <string_view> -#endif +#include "absl/strings/string_view.h" namespace re2 { -class StringPiece { - public: - typedef std::char_traits<char> traits_type; - typedef char value_type; - typedef char* pointer; - typedef const char* const_pointer; - typedef char& reference; - typedef const char& const_reference; - typedef const char* const_iterator; - typedef const_iterator iterator; - typedef std::reverse_iterator<const_iterator> const_reverse_iterator; - typedef const_reverse_iterator reverse_iterator; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - static const size_type npos = static_cast<size_type>(-1); - - // We provide non-explicit singleton constructors so users can pass - // in a "const char*" or a "string" wherever a "StringPiece" is - // expected. - StringPiece() - : data_(NULL), size_(0) {} -#if __has_include(<string_view>) && __cplusplus >= 201703L - StringPiece(const std::string_view& str) - : data_(str.data()), size_(str.size()) {} -#endif - StringPiece(const std::string& str) - : data_(str.data()), size_(str.size()) {} - StringPiece(const char* str) - : data_(str), size_(str == NULL ? 
0 : strlen(str)) {} - StringPiece(const char* str, size_type len) - : data_(str), size_(len) {} - - const_iterator begin() const { return data_; } - const_iterator end() const { return data_ + size_; } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(data_ + size_); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(data_); - } - - size_type size() const { return size_; } - size_type length() const { return size_; } - bool empty() const { return size_ == 0; } - - const_reference operator[](size_type i) const { return data_[i]; } - const_pointer data() const { return data_; } - - void remove_prefix(size_type n) { - data_ += n; - size_ -= n; - } - - void remove_suffix(size_type n) { - size_ -= n; - } - - void set(const char* str) { - data_ = str; - size_ = str == NULL ? 0 : strlen(str); - } - - void set(const char* str, size_type len) { - data_ = str; - size_ = len; - } - - // Converts to `std::basic_string`. - template <typename A> - explicit operator std::basic_string<char, traits_type, A>() const { - if (!data_) return {}; - return std::basic_string<char, traits_type, A>(data_, size_); - } - - std::string as_string() const { - return std::string(data_, size_); - } - - // We also define ToString() here, since many other string-like - // interfaces name the routine that converts to a C++ string - // "ToString", and it's confusing to have the method that does that - // for a StringPiece be called "as_string()". We also leave the - // "as_string()" method defined here for existing code. 
- std::string ToString() const { - return std::string(data_, size_); - } - - void CopyToString(std::string* target) const { - target->assign(data_, size_); - } - - void AppendToString(std::string* target) const { - target->append(data_, size_); - } - - size_type copy(char* buf, size_type n, size_type pos = 0) const; - StringPiece substr(size_type pos = 0, size_type n = npos) const; - - int compare(const StringPiece& x) const { - size_type min_size = std::min(size(), x.size()); - if (min_size > 0) { - int r = memcmp(data(), x.data(), min_size); - if (r < 0) return -1; - if (r > 0) return 1; - } - if (size() < x.size()) return -1; - if (size() > x.size()) return 1; - return 0; - } - - // Does "this" start with "x"? - bool starts_with(const StringPiece& x) const { - return x.empty() || - (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0); - } - - // Does "this" end with "x"? - bool ends_with(const StringPiece& x) const { - return x.empty() || - (size() >= x.size() && - memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0); - } - - bool contains(const StringPiece& s) const { - return find(s) != npos; - } - - size_type find(const StringPiece& s, size_type pos = 0) const; - size_type find(char c, size_type pos = 0) const; - size_type rfind(const StringPiece& s, size_type pos = npos) const; - size_type rfind(char c, size_type pos = npos) const; - - private: - const_pointer data_; - size_type size_; -}; - -inline bool operator==(const StringPiece& x, const StringPiece& y) { - StringPiece::size_type len = x.size(); - if (len != y.size()) return false; - return x.data() == y.data() || len == 0 || - memcmp(x.data(), y.data(), len) == 0; -} - -inline bool operator!=(const StringPiece& x, const StringPiece& y) { - return !(x == y); -} - -inline bool operator<(const StringPiece& x, const StringPiece& y) { - StringPiece::size_type min_size = std::min(x.size(), y.size()); - int r = min_size == 0 ? 
0 : memcmp(x.data(), y.data(), min_size); - return (r < 0) || (r == 0 && x.size() < y.size()); -} - -inline bool operator>(const StringPiece& x, const StringPiece& y) { - return y < x; -} - -inline bool operator<=(const StringPiece& x, const StringPiece& y) { - return !(x > y); -} - -inline bool operator>=(const StringPiece& x, const StringPiece& y) { - return !(x < y); -} - -// Allow StringPiece to be logged. -std::ostream& operator<<(std::ostream& o, const StringPiece& p); +// Until RE2 requires C++17 and uses std::string_view, allow users to +// continue to #include "re2/stringpiece.h" and use re2::StringPiece. +using StringPiece = absl::string_view; } // namespace re2 diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc index 920a453..90071bb 100644 --- a/re2/testing/backtrack.cc +++ b/re2/testing/backtrack.cc @@ -27,7 +27,7 @@ #include <stdint.h> #include <string.h> -#include "util/util.h" +#include "absl/base/macros.h" #include "util/logging.h" #include "re2/pod_array.h" #include "re2/prog.h" @@ -55,9 +55,8 @@ class Backtracker { public: explicit Backtracker(Prog* prog); - bool Search(const StringPiece& text, const StringPiece& context, - bool anchored, bool longest, - StringPiece* submatch, int nsubmatch); + bool Search(absl::string_view text, absl::string_view context, bool anchored, + bool longest, absl::string_view* submatch, int nsubmatch); private: // Explores from instruction id at string position p looking for a match. 
@@ -69,14 +68,14 @@ class Backtracker { bool Try(int id, const char* p); // Search parameters - Prog* prog_; // program being run - StringPiece text_; // text being searched - StringPiece context_; // greater context of text being searched - bool anchored_; // whether search is anchored at text.begin() - bool longest_; // whether search wants leftmost-longest match - bool endmatch_; // whether search must end at text.end() - StringPiece *submatch_; // submatches to fill in - int nsubmatch_; // # of submatches to fill in + Prog* prog_; // program being run + absl::string_view text_; // text being searched + absl::string_view context_; // greater context of text being searched + bool anchored_; // whether search is anchored at text.begin() + bool longest_; // whether search wants leftmost-longest match + bool endmatch_; // whether search must end at text.end() + absl::string_view* submatch_; // submatches to fill in + int nsubmatch_; // # of submatches to fill in // Search state const char* cap_[64]; // capture registers @@ -96,9 +95,9 @@ Backtracker::Backtracker(Prog* prog) } // Runs a backtracking search. -bool Backtracker::Search(const StringPiece& text, const StringPiece& context, +bool Backtracker::Search(absl::string_view text, absl::string_view context, bool anchored, bool longest, - StringPiece* submatch, int nsubmatch) { + absl::string_view* submatch, int nsubmatch) { text_ = text; context_ = context; if (context_.data() == NULL) @@ -112,17 +111,17 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, endmatch_ = prog_->anchor_end(); submatch_ = submatch; nsubmatch_ = nsubmatch; - CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_))); + CHECK_LT(2*nsubmatch_, static_cast<int>(ABSL_ARRAYSIZE(cap_))); memset(cap_, 0, sizeof cap_); // We use submatch_[0] for our own bookkeeping, // so it had better exist. 
- StringPiece sp0; + absl::string_view sp0; if (nsubmatch < 1) { submatch_ = &sp0; nsubmatch_ = 1; } - submatch_[0] = StringPiece(); + submatch_[0] = absl::string_view(); // Allocate new visited_ bitmap -- size is proportional // to text, so have to reallocate on each call to Search. @@ -203,7 +202,7 @@ bool Backtracker::Try(int id, const char* p) { case kInstCapture: if (0 <= ip->cap() && - ip->cap() < static_cast<int>(arraysize(cap_))) { + ip->cap() < static_cast<int>(ABSL_ARRAYSIZE(cap_))) { // Capture p to register, but save old value. const char* q = cap_[ip->cap()]; cap_[ip->cap()] = p; @@ -232,7 +231,7 @@ bool Backtracker::Try(int id, const char* p) { (longest_ && p > submatch_[0].data() + submatch_[0].size())) { // First match so far - or better match. for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece( + submatch_[i] = absl::string_view( cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i])); } return true; @@ -243,16 +242,14 @@ bool Backtracker::Try(int id, const char* p) { } // Runs a backtracking search. -bool Prog::UnsafeSearchBacktrack(const StringPiece& text, - const StringPiece& context, - Anchor anchor, - MatchKind kind, - StringPiece* match, +bool Prog::UnsafeSearchBacktrack(absl::string_view text, + absl::string_view context, Anchor anchor, + MatchKind kind, absl::string_view* match, int nmatch) { // If full match, we ask for an anchored longest match // and then check that match[0] == text. // So make sure match[0] exists. 
- StringPiece sp0; + absl::string_view sp0; if (kind == kFullMatch) { anchor = kAnchored; if (nmatch < 1) { diff --git a/re2/testing/charclass_test.cc b/re2/testing/charclass_test.cc index 9c2a32f..ad95d6c 100644 --- a/re2/testing/charclass_test.cc +++ b/re2/testing/charclass_test.cc @@ -6,7 +6,9 @@ #include <stdio.h> -#include "util/test.h" +#include "absl/base/macros.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "util/utf.h" #include "re2/regexp.h" @@ -88,25 +90,25 @@ static CCTest tests[] = { template <typename CharClass> static void Broke(const char *desc, const CCTest* t, CharClass* cc) { if (t == NULL) { - printf("\t%s:", desc); + absl::PrintF("\t%s:", desc); } else { - printf("\n"); - printf("CharClass added: [%s]", desc); + absl::PrintF("\n"); + absl::PrintF("CharClass added: [%s]", desc); for (int k = 0; t->add[k].lo >= 0; k++) - printf(" %d-%d", t->add[k].lo, t->add[k].hi); - printf("\n"); + absl::PrintF(" %d-%d", t->add[k].lo, t->add[k].hi); + absl::PrintF("\n"); if (t->remove >= 0) - printf("Removed > %d\n", t->remove); - printf("\twant:"); + absl::PrintF("Removed > %d\n", t->remove); + absl::PrintF("\twant:"); for (int k = 0; t->final[k].lo >= 0; k++) - printf(" %d-%d", t->final[k].lo, t->final[k].hi); - printf("\n"); - printf("\thave:"); + absl::PrintF(" %d-%d", t->final[k].lo, t->final[k].hi); + absl::PrintF("\n"); + absl::PrintF("\thave:"); } for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it) - printf(" %d-%d", it->lo, it->hi); - printf("\n"); + absl::PrintF(" %d-%d", it->lo, it->hi); + absl::PrintF("\n"); } bool ShouldContain(CCTest *t, int x) { @@ -155,7 +157,7 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { } if (cc->size() != size) { Broke(desc, t, cc); - printf("wrong size: want %d have %d\n", size, cc->size()); + absl::PrintF("wrong size: want %d have %d\n", size, cc->size()); return false; } @@ -164,8 +166,8 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { j 
= Runemax; if (ShouldContain(t, j) != cc->Contains(j)) { Broke(desc, t, cc); - printf("want contains(%d)=%d, got %d\n", - j, ShouldContain(t, j), cc->Contains(j)); + absl::PrintF("want contains(%d)=%d, got %d\n", + j, ShouldContain(t, j), cc->Contains(j)); return false; } } @@ -177,16 +179,16 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { if (ShouldContain(t, j) == ncc->Contains(j)) { Broke(desc, t, cc); Broke("ncc", NULL, ncc); - printf("want ncc contains(%d)!=%d, got %d\n", - j, ShouldContain(t, j), ncc->Contains(j)); + absl::PrintF("want ncc contains(%d)!=%d, got %d\n", + j, ShouldContain(t, j), ncc->Contains(j)); Delete(ncc); return false; } if (ncc->size() != Runemax+1 - cc->size()) { Broke(desc, t, cc); Broke("ncc", NULL, ncc); - printf("ncc size should be %d is %d\n", - Runemax+1 - cc->size(), ncc->size()); + absl::PrintF("ncc size should be %d is %d\n", + Runemax+1 - cc->size(), ncc->size()); Delete(ncc); return false; } @@ -197,7 +199,7 @@ bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { TEST(TestCharClassBuilder, Adds) { int nfail = 0; - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { CharClassBuilder ccb; CCTest* t = &tests[i]; for (int j = 0; t->add[j].lo >= 0; j++) diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index 4718830..f6899d3 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -6,7 +6,8 @@ #include <string> -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/regexp.h" #include "re2/prog.h" @@ -127,7 +128,7 @@ static Test tests[] = { TEST(TestRegexpCompileToProg, Simple) { int failed = 0; - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { const re2::Test& t = tests[i]; Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL); if (re == NULL) { @@ -156,7 +157,7 @@ 
TEST(TestRegexpCompileToProg, Simple) { EXPECT_EQ(failed, 0); } -static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, +static void DumpByteMap(absl::string_view pattern, Regexp::ParseFlags flags, std::string* bytemap) { Regexp* re = Regexp::Parse(pattern, flags, NULL); EXPECT_TRUE(re != NULL); @@ -257,7 +258,7 @@ TEST(TestCompile, InsufficientMemory) { re->Decref(); } -static void Dump(StringPiece pattern, Regexp::ParseFlags flags, +static void Dump(absl::string_view pattern, Regexp::ParseFlags flags, std::string* forward, std::string* reverse) { Regexp* re = Regexp::Parse(pattern, flags, NULL); EXPECT_TRUE(re != NULL); diff --git a/re2/testing/dfa_test.cc b/re2/testing/dfa_test.cc index 842daaf..b0759f7 100644 --- a/re2/testing/dfa_test.cc +++ b/re2/testing/dfa_test.cc @@ -7,11 +7,12 @@ #include <thread> #include <vector> -#include "util/test.h" -#include "util/flags.h" +#include "absl/base/macros.h" +#include "absl/flags/flag.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "util/malloc_counter.h" -#include "util/strutil.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" @@ -20,9 +21,9 @@ static const bool UsingMallocCounter = false; -DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)"); -DEFINE_FLAG(int, repeat, 2, "Repetition count."); -DEFINE_FLAG(int, threads, 4, "number of threads"); +ABSL_FLAG(int, size, 8, "log2(number of DFA nodes)"); +ABSL_FLAG(int, repeat, 2, "Repetition count."); +ABSL_FLAG(int, threads, 4, "number of threads"); namespace re2 { @@ -50,7 +51,7 @@ static void DoBuild(Prog* prog) { TEST(Multithreaded, BuildEntireDFA) { // Create regexp with 2^FLAGS_size states in DFA. 
std::string s = "a"; - for (int i = 0; i < GetFlag(FLAGS_size); i++) + for (int i = 0; i < absl::GetFlag(FLAGS_size); i++) s += "[ab]"; s += "b"; Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); @@ -68,14 +69,14 @@ TEST(Multithreaded, BuildEntireDFA) { } // Build the DFA simultaneously in a bunch of threads. - for (int i = 0; i < GetFlag(FLAGS_repeat); i++) { + for (int i = 0; i < absl::GetFlag(FLAGS_repeat); i++) { Prog* prog = re->CompileToProg(0); ASSERT_TRUE(prog != NULL); std::vector<std::thread> threads; - for (int j = 0; j < GetFlag(FLAGS_threads); j++) + for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++) threads.emplace_back(DoBuild, prog); - for (int j = 0; j < GetFlag(FLAGS_threads); j++) + for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++) threads[j].join(); // One more compile, to make sure everything is okay. @@ -154,7 +155,7 @@ TEST(SingleThreaded, SearchDFA) { // Empirically, n = 18 is a good compromise between the two. const int n = 18; - Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), + Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n), Regexp::LikePerl, NULL); ASSERT_TRUE(re != NULL); @@ -172,12 +173,14 @@ TEST(SingleThreaded, SearchDFA) { for (int i = 0; i < 10; i++) { bool matched = false; bool failed = false; - matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored, - Prog::kFirstMatch, NULL, &failed, NULL); + matched = + prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored, + Prog::kFirstMatch, NULL, &failed, NULL); ASSERT_FALSE(failed); ASSERT_TRUE(matched); - matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored, - Prog::kFirstMatch, NULL, &failed, NULL); + matched = + prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored, + Prog::kFirstMatch, NULL, &failed, NULL); ASSERT_FALSE(failed); ASSERT_FALSE(matched); } @@ -201,17 +204,19 @@ TEST(SingleThreaded, SearchDFA) { // Helper function: searches for match, which should match, // and no_match, which should not. 
-static void DoSearch(Prog* prog, const StringPiece& match, - const StringPiece& no_match) { +static void DoSearch(Prog* prog, absl::string_view match, + absl::string_view no_match) { for (int i = 0; i < 2; i++) { bool matched = false; bool failed = false; - matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored, - Prog::kFirstMatch, NULL, &failed, NULL); + matched = + prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored, + Prog::kFirstMatch, NULL, &failed, NULL); ASSERT_FALSE(failed); ASSERT_TRUE(matched); - matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored, - Prog::kFirstMatch, NULL, &failed, NULL); + matched = + prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored, + Prog::kFirstMatch, NULL, &failed, NULL); ASSERT_FALSE(failed); ASSERT_FALSE(matched); } @@ -224,7 +229,7 @@ TEST(Multithreaded, SearchDFA) { // Same as single-threaded test above. const int n = 18; - Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), + Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n), Regexp::LikePerl, NULL); ASSERT_TRUE(re != NULL); std::string no_match = DeBruijnString(n); @@ -243,14 +248,14 @@ TEST(Multithreaded, SearchDFA) { // Run the search simultaneously in a bunch of threads. // Reuse same flags for Multithreaded.BuildDFA above. 
- for (int i = 0; i < GetFlag(FLAGS_repeat); i++) { + for (int i = 0; i < absl::GetFlag(FLAGS_repeat); i++) { Prog* prog = re->CompileToProg(1<<n); ASSERT_TRUE(prog != NULL); std::vector<std::thread> threads; - for (int j = 0; j < GetFlag(FLAGS_threads); j++) + for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++) threads.emplace_back(DoSearch, prog, match, no_match); - for (int j = 0; j < GetFlag(FLAGS_threads); j++) + for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++) threads[j].join(); delete prog; @@ -281,15 +286,16 @@ ReverseTest reverse_tests[] = { TEST(DFA, ReverseMatch) { int nfail = 0; - for (size_t i = 0; i < arraysize(reverse_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(reverse_tests); i++) { const ReverseTest& t = reverse_tests[i]; Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); ASSERT_TRUE(re != NULL); Prog* prog = re->CompileToReverseProg(0); ASSERT_TRUE(prog != NULL); bool failed = false; - bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored, - Prog::kFirstMatch, NULL, &failed, NULL); + bool matched = + prog->SearchDFA(t.text, absl::string_view(), Prog::kUnanchored, + Prog::kFirstMatch, NULL, &failed, NULL); if (matched != t.match) { LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match; nfail++; @@ -336,7 +342,7 @@ CallbackTest callback_tests[] = { TEST(DFA, Callback) { int nfail = 0; - for (size_t i = 0; i < arraysize(callback_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(callback_tests); i++) { const CallbackTest& t = callback_tests[i]; Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); ASSERT_TRUE(re != NULL); @@ -349,7 +355,7 @@ TEST(DFA, Callback) { dump += " "; dump += match ? "[[" : "["; for (int b = 0; b < prog->bytemap_range() + 1; b++) - dump += StringPrintf("%d,", next[b]); + dump += absl::StrFormat("%d,", next[b]); dump.pop_back(); dump += match ? 
"]]" : "]"; }); diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc index cad0910..5cddd23 100644 --- a/re2/testing/dump.cc +++ b/re2/testing/dump.cc @@ -18,11 +18,11 @@ #include <string> -#include "util/test.h" +#include "absl/base/macros.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "util/logging.h" -#include "util/strutil.h" #include "util/utf.h" -#include "re2/stringpiece.h" #include "re2/regexp.h" namespace re2 { @@ -55,8 +55,8 @@ static const char* kOpcodeNames[] = { // Create string representation of regexp with explicit structure. // Nothing pretty, just for testing. static void DumpRegexpAppending(Regexp* re, std::string* s) { - if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) { - *s += StringPrintf("op%d", re->op()); + if (re->op() < 0 || re->op() >= ABSL_ARRAYSIZE(kOpcodeNames)) { + *s += absl::StrFormat("op%d", re->op()); } else { switch (re->op()) { default: @@ -129,7 +129,7 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) { DumpRegexpAppending(re->sub()[0], s); break; case kRegexpRepeat: - s->append(StringPrintf("%d,%d ", re->min(), re->max())); + s->append(absl::StrFormat("%d,%d ", re->min(), re->max())); DumpRegexpAppending(re->sub()[0], s); break; case kRegexpCharClass: { @@ -139,9 +139,9 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) { RuneRange rr = *it; s->append(sep); if (rr.lo == rr.hi) - s->append(StringPrintf("%#x", rr.lo)); + s->append(absl::StrFormat("%#x", rr.lo)); else - s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi)); + s->append(absl::StrFormat("%#x-%#x", rr.lo, rr.hi)); sep = " "; } break; diff --git a/re2/testing/exhaustive1_test.cc b/re2/testing/exhaustive1_test.cc index eef2dae..9337989 100644 --- a/re2/testing/exhaustive1_test.cc +++ b/re2/testing/exhaustive1_test.cc @@ -7,7 +7,7 @@ #include <string> #include <vector> -#include "util/test.h" +#include "gtest/gtest.h" #include "re2/testing/exhaustive_tester.h" namespace re2 { diff --git 
a/re2/testing/exhaustive2_test.cc b/re2/testing/exhaustive2_test.cc index ae89ece..14f629d 100644 --- a/re2/testing/exhaustive2_test.cc +++ b/re2/testing/exhaustive2_test.cc @@ -9,7 +9,7 @@ #include <string> #include <vector> -#include "util/test.h" +#include "gtest/gtest.h" #include "re2/testing/exhaustive_tester.h" namespace re2 { diff --git a/re2/testing/exhaustive3_test.cc b/re2/testing/exhaustive3_test.cc index 1fe46b6..de703c0 100644 --- a/re2/testing/exhaustive3_test.cc +++ b/re2/testing/exhaustive3_test.cc @@ -9,7 +9,7 @@ #include <string> #include <vector> -#include "util/test.h" +#include "gtest/gtest.h" #include "util/utf.h" #include "re2/testing/exhaustive_tester.h" diff --git a/re2/testing/exhaustive_test.cc b/re2/testing/exhaustive_test.cc index 514fd90..5e586f1 100644 --- a/re2/testing/exhaustive_test.cc +++ b/re2/testing/exhaustive_test.cc @@ -4,7 +4,7 @@ // Exhaustive testing of regular expression matching. -#include "util/test.h" +#include "gtest/gtest.h" #include "re2/testing/exhaustive_tester.h" namespace re2 { diff --git a/re2/testing/exhaustive_tester.cc b/re2/testing/exhaustive_tester.cc index b0409c3..a57f700 100644 --- a/re2/testing/exhaustive_tester.cc +++ b/re2/testing/exhaustive_tester.cc @@ -13,10 +13,11 @@ #include <stdio.h> -#include "util/test.h" -#include "util/flags.h" +#include "absl/base/macros.h" +#include "absl/flags/flag.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/testing/exhaustive_tester.h" #include "re2/testing/tester.h" @@ -25,15 +26,15 @@ #define LOGGING 0 #endif -DEFINE_FLAG(bool, show_regexps, false, "show regexps during testing"); +ABSL_FLAG(bool, show_regexps, false, "show regexps during testing"); -DEFINE_FLAG(int, max_bad_regexp_inputs, 1, - "Stop testing a regular expression after finding this many " - "strings that break it."); +ABSL_FLAG(int, max_bad_regexp_inputs, 1, + "Stop testing a regular expression after finding this 
many " + "strings that break it."); namespace re2 { -static char* escape(const StringPiece& sp) { +static char* escape(absl::string_view sp) { static char buf[512]; char* p = buf; *p++ = '\"'; @@ -55,20 +56,21 @@ static char* escape(const StringPiece& sp) { return buf; } -static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { +static void PrintResult(const RE2& re, absl::string_view input, + RE2::Anchor anchor, absl::string_view* m, int n) { if (!re.Match(input, 0, input.size(), anchor, m, n)) { - printf("-"); + absl::PrintF("-"); return; } for (int i = 0; i < n; i++) { if (i > 0) - printf(" "); + absl::PrintF(" "); if (m[i].data() == NULL) - printf("-"); + absl::PrintF("-"); else - printf("%td-%td", - BeginPtr(m[i]) - BeginPtr(input), - EndPtr(m[i]) - BeginPtr(input)); + absl::PrintF("%d-%d", + BeginPtr(m[i]) - BeginPtr(input), + EndPtr(m[i]) - BeginPtr(input)); } } @@ -79,11 +81,13 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { regexps_++; std::string regexp = const_regexp; if (!topwrapper_.empty()) { - regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); + auto fmt = absl::ParsedFormat<'s'>::New(topwrapper_); + CHECK(fmt != nullptr); + regexp = absl::StrFormat(*fmt, regexp); } - if (GetFlag(FLAGS_show_regexps)) { - printf("\r%s", regexp.c_str()); + if (absl::GetFlag(FLAGS_show_regexps)) { + absl::PrintF("\r%s", regexp); fflush(stdout); } @@ -93,32 +97,32 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { if (randomstrings_) LOG(ERROR) << "Cannot log with random strings."; if (regexps_ == 1) { // first - printf("strings\n"); + absl::PrintF("strings\n"); strgen_.Reset(); while (strgen_.HasNext()) - printf("%s\n", escape(strgen_.Next())); - printf("regexps\n"); + absl::PrintF("%s\n", escape(strgen_.Next())); + absl::PrintF("regexps\n"); } - printf("%s\n", escape(regexp)); + absl::PrintF("%s\n", escape(regexp)); RE2 re(regexp); RE2::Options longest; 
longest.set_longest_match(true); RE2 relongest(regexp, longest); int ngroup = re.NumberOfCapturingGroups()+1; - StringPiece* group = new StringPiece[ngroup]; + absl::string_view* group = new absl::string_view[ngroup]; strgen_.Reset(); while (strgen_.HasNext()) { - StringPiece input = strgen_.Next(); + absl::string_view input = strgen_.Next(); PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); - printf(";"); + absl::PrintF(";"); PrintResult(re, input, RE2::UNANCHORED, group, ngroup); - printf(";"); + absl::PrintF(";"); PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); - printf(";"); + absl::PrintF(";"); PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); - printf("\n"); + absl::PrintF("\n"); } delete[] group; return; @@ -137,7 +141,7 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { tests_++; if (!tester.TestInput(strgen_.Next())) { failures_++; - if (++bad_inputs >= GetFlag(FLAGS_max_bad_regexp_inputs)) + if (++bad_inputs >= absl::GetFlag(FLAGS_max_bad_regexp_inputs)) break; } } @@ -164,8 +168,8 @@ void ExhaustiveTest(int maxatoms, int maxops, topwrapper); t.Generate(); if (!LOGGING) { - printf("%d regexps, %d tests, %d failures [%d/%d str]\n", - t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size()); } EXPECT_EQ(0, t.failures()); } @@ -177,7 +181,7 @@ void EgrepTest(int maxatoms, int maxops, const std::string& alphabet, const std::string& wrapper) { const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; - for (size_t i = 0; i < arraysize(tops); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tops); i++) { ExhaustiveTest(maxatoms, maxops, Split("", alphabet), RegexpGenerator::EgrepOps(), diff --git a/re2/testing/exhaustive_tester.h b/re2/testing/exhaustive_tester.h index 3a14282..906be0c 100644 --- a/re2/testing/exhaustive_tester.h +++ 
b/re2/testing/exhaustive_tester.h @@ -9,7 +9,6 @@ #include <string> #include <vector> -#include "util/util.h" #include "re2/testing/regexp_generator.h" #include "re2/testing/string_generator.h" diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc index c788fda..a8d2dfc 100644 --- a/re2/testing/filtered_re2_test.cc +++ b/re2/testing/filtered_re2_test.cc @@ -9,7 +9,8 @@ #include <vector> #include <utility> -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/filtered_re2.h" #include "re2/re2.h" @@ -106,12 +107,13 @@ AtomTest atom_tests[] = { // substring in an OR are removed; that is, only the shortest // substring is kept. "SubstrAtomRemovesSuperStrInOr", { - "(abc123|abc|ghi789|abc1234).*[x-z]+", + "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+", "abcd..yyy..yyyzzz", "mnmnpp[a-z]+PPP" }, { "abc", "ghi789", + "xyz", "abcd", "yyy", "yyyzzz", @@ -184,14 +186,14 @@ bool CheckExpectedAtoms(const char* atoms[], TEST(FilteredRE2Test, AtomTests) { int nfail = 0; - for (size_t i = 0; i < arraysize(atom_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(atom_tests); i++) { FilterTestVars v; AtomTest* t = &atom_tests[i]; size_t nregexp, natom; - for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++) if (t->regexps[nregexp] == NULL) break; - for (natom = 0; natom < arraysize(t->atoms); natom++) + for (natom = 0; natom < ABSL_ARRAYSIZE(t->atoms); natom++) if (t->atoms[natom] == NULL) break; AddRegexpsAndCompile(t->regexps, nregexp, &v); @@ -223,7 +225,7 @@ TEST(FilteredRE2Test, MatchEmptyPattern) { // the index we use for the test is for the correct test. 
EXPECT_EQ("CheckEmptyPattern", std::string(t->testname)); size_t nregexp; - for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++) if (t->regexps[nregexp] == NULL) break; AddRegexpsAndCompile(t->regexps, nregexp, &v); @@ -240,7 +242,7 @@ TEST(FilteredRE2Test, MatchTests) { // for this test. EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname)); size_t nregexp; - for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++) if (t->regexps[nregexp] == NULL) break; AddRegexpsAndCompile(t->regexps, nregexp, &v); @@ -287,8 +289,8 @@ TEST(FilteredRE2Test, EmptyStringInStringSetBug) { FilterTestVars v(0); // override the minimum atom length const char* regexps[] = {"-R.+(|ADD=;AA){12}}"}; const char* atoms[] = {"", "-r", "add=;aa", "}"}; - AddRegexpsAndCompile(regexps, arraysize(regexps), &v); - EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms), + AddRegexpsAndCompile(regexps, ABSL_ARRAYSIZE(regexps), &v); + EXPECT_TRUE(CheckExpectedAtoms(atoms, ABSL_ARRAYSIZE(atoms), "EmptyStringInStringSetBug", &v)); } diff --git a/re2/testing/mimics_pcre_test.cc b/re2/testing/mimics_pcre_test.cc index 01ab41e..829659d 100644 --- a/re2/testing/mimics_pcre_test.cc +++ b/re2/testing/mimics_pcre_test.cc @@ -2,7 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -58,7 +59,7 @@ static PCRETest tests[] = { }; TEST(MimicsPCRE, SimpleTests) { - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { const PCRETest& t = tests[i]; for (size_t j = 0; j < 2; j++) { Regexp::ParseFlags flags = Regexp::LikePerl; diff --git a/re2/testing/null_walker.cc b/re2/testing/null_walker.cc index 2bdea02..745364b 100644 --- a/re2/testing/null_walker.cc +++ b/re2/testing/null_walker.cc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "util/test.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/regexp.h" #include "re2/walker-inl.h" diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index e571127..0ee5561 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -6,7 +6,8 @@ #include <string> -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/regexp.h" @@ -165,6 +166,8 @@ static Test tests[] = { // Test named captures { "(?P<name>a)", "cap{name:lit{a}}" }, { "(?P<ä¸æ–‡>a)", "cap{ä¸æ–‡:lit{a}}" }, + { "(?<name>a)", "cap{name:lit{a}}" }, + { "(?<ä¸æ–‡>a)", "cap{ä¸æ–‡:lit{a}}" }, // Case-folded literals { "[Aa]", "litfold{a}" }, @@ -262,7 +265,7 @@ void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, // Test that regexps parse to expected structures. TEST(TestParse, SimpleRegexps) { - TestParse(tests, arraysize(tests), kTestFlags, "simple"); + TestParse(tests, ABSL_ARRAYSIZE(tests), kTestFlags, "simple"); } Test foldcase_tests[] = { @@ -279,7 +282,7 @@ Test foldcase_tests[] = { // Test that parsing with FoldCase works. 
TEST(TestParse, FoldCase) { - TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase"); + TestParse(foldcase_tests, ABSL_ARRAYSIZE(foldcase_tests), Regexp::FoldCase, "foldcase"); } Test literal_tests[] = { @@ -288,7 +291,7 @@ Test literal_tests[] = { // Test that parsing with Literal works. TEST(TestParse, Literal) { - TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal"); + TestParse(literal_tests, ABSL_ARRAYSIZE(literal_tests), Regexp::Literal, "literal"); } Test matchnl_tests[] = { @@ -301,7 +304,7 @@ Test matchnl_tests[] = { // Test that parsing with MatchNL works. // (Also tested above during simple cases.) TEST(TestParse, MatchNL) { - TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL"); + TestParse(matchnl_tests, ABSL_ARRAYSIZE(matchnl_tests), Regexp::MatchNL, "with MatchNL"); } Test nomatchnl_tests[] = { @@ -313,7 +316,7 @@ Test nomatchnl_tests[] = { // Test that parsing without MatchNL works. TEST(TestParse, NoMatchNL) { - TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); + TestParse(nomatchnl_tests, ABSL_ARRAYSIZE(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); } Test prefix_tests[] = { @@ -357,7 +360,7 @@ Test prefix_tests[] = { // Test that prefix factoring works. TEST(TestParse, Prefix) { - TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); + TestParse(prefix_tests, ABSL_ARRAYSIZE(prefix_tests), Regexp::PerlX, "prefix"); } Test nested_tests[] = { @@ -373,7 +376,7 @@ Test nested_tests[] = { // Test that nested repetition works. 
TEST(TestParse, Nested) { - TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested"); + TestParse(nested_tests, ABSL_ARRAYSIZE(nested_tests), Regexp::PerlX, "nested"); } // Invalid regular expressions @@ -395,6 +398,11 @@ const char* badtests[] = { "(?P<name", "(?P<x y>a)", "(?P<>a)", + "(?<name>a", + "(?<name>", + "(?<name", + "(?<x y>a)", + "(?<>a)", "[a-Z]", "(?i)[a-Z]", "a{100000}", @@ -415,6 +423,7 @@ const char* only_perl[] = { "\\Q\\\\\\\\\\E", "(?:a)", "(?P<name>a)", + "(?<name>a)", }; // Valid in POSIX, bad in Perl. @@ -428,20 +437,20 @@ const char* only_posix[] = { // Test that parser rejects bad regexps. TEST(TestParse, InvalidRegexps) { - for (size_t i = 0; i < arraysize(badtests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(badtests); i++) { ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) << " " << badtests[i]; ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) << " " << badtests[i]; } - for (size_t i = 0; i < arraysize(only_posix); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(only_posix); i++) { ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) << " " << only_posix[i]; Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); ASSERT_TRUE(re != NULL) << " " << only_posix[i]; re->Decref(); } - for (size_t i = 0; i < arraysize(only_perl); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(only_perl); i++) { ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) << " " << only_perl[i]; Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); @@ -452,7 +461,7 @@ TEST(TestParse, InvalidRegexps) { // Test that ToString produces original regexp or equivalent one. 
TEST(TestToString, EquivalentParse) { - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { RegexpStatus status; Regexp::ParseFlags f = kTestFlags; if (tests[i].flags != 0) { @@ -504,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) { EXPECT_TRUE(re == NULL); EXPECT_EQ(status.code(), kRegexpBadNamedCapture); EXPECT_EQ(status.error_arg(), "(?P<space bar>"); + + re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?<name"); + + re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?<space bar>"); } } // namespace re2 diff --git a/re2/testing/possible_match_test.cc b/re2/testing/possible_match_test.cc index 0ec90ae..fe199c6 100644 --- a/re2/testing/possible_match_test.cc +++ b/re2/testing/possible_match_test.cc @@ -6,9 +6,10 @@ #include <string> #include <vector> -#include "util/test.h" +#include "absl/base/macros.h" +#include "absl/strings/escaping.h" +#include "gtest/gtest.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" @@ -107,12 +108,12 @@ static PrefixTest tests[] = { }; TEST(PossibleMatchRange, HandWritten) { - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { for (size_t j = 0; j < 2; j++) { const PrefixTest& t = tests[i]; std::string min, max; if (j == 0) { - LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); + LOG(INFO) << "Checking regexp=" << absl::CEscape(t.regexp); Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); ASSERT_TRUE(re != NULL); Prog* prog = re->CompileToProg(0); @@ -142,26 +143,26 @@ TEST(PossibleMatchRange, Failures) { // are no valid UTF-8 strings beginning with byte 0xFF. 
EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); EXPECT_FALSE(RE2(".+hello", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); EXPECT_FALSE(RE2(".*hello", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); EXPECT_FALSE(RE2(".*", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); EXPECT_FALSE(RE2("\\C*"). PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); // Fails because it's a malformed regexp. EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) - << "min=" << CEscape(min) << ", max=" << CEscape(max); + << "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max); } // Exhaustive test: generate all regexps within parameters, @@ -201,7 +202,7 @@ class PossibleMatchTester : public RegexpGenerator { void PossibleMatchTester::HandleRegexp(const std::string& regexp) { regexps_++; - VLOG(3) << CEscape(regexp); + VLOG(3) << absl::CEscape(regexp); RE2 re(regexp, RE2::Latin1); ASSERT_EQ(re.error(), ""); @@ -213,12 +214,12 @@ void PossibleMatchTester::HandleRegexp(const std::string& regexp) { // complicated expressions. 
if(strstr(regexp.c_str(), "\\C*")) return; - LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); + LOG(QFATAL) << "PossibleMatchRange failed on: " << absl::CEscape(regexp); } strgen_.Reset(); while (strgen_.HasNext()) { - const StringPiece& s = strgen_.Next(); + absl::string_view s = strgen_.Next(); tests_++; if (!RE2::FullMatch(s, re)) continue; diff --git a/re2/testing/random_test.cc b/re2/testing/random_test.cc index 44712eb..d076b39 100644 --- a/re2/testing/random_test.cc +++ b/re2/testing/random_test.cc @@ -8,14 +8,15 @@ #include <string> #include <vector> -#include "util/test.h" -#include "util/flags.h" +#include "absl/flags/flag.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "re2/testing/exhaustive_tester.h" -DEFINE_FLAG(int, regexpseed, 404, "Random regexp seed."); -DEFINE_FLAG(int, regexpcount, 100, "How many random regexps to generate."); -DEFINE_FLAG(int, stringseed, 200, "Random string seed."); -DEFINE_FLAG(int, stringcount, 100, "How many random strings to generate."); +ABSL_FLAG(int, regexpseed, 404, "Random regexp seed."); +ABSL_FLAG(int, regexpcount, 100, "How many random regexps to generate."); +ABSL_FLAG(int, stringseed, 200, "Random string seed."); +ABSL_FLAG(int, stringcount, 100, "How many random strings to generate."); namespace re2 { @@ -38,12 +39,12 @@ static void RandomTest(int maxatoms, int maxops, ExhaustiveTester t(maxatoms, maxops, alphabet, ops, maxstrlen, stralphabet, wrapper, ""); - t.RandomStrings(GetFlag(FLAGS_stringseed), - GetFlag(FLAGS_stringcount)); - t.GenerateRandom(GetFlag(FLAGS_regexpseed), - GetFlag(FLAGS_regexpcount)); - printf("%d regexps, %d tests, %d failures [%d/%d str]\n", - t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + t.RandomStrings(absl::GetFlag(FLAGS_stringseed), + absl::GetFlag(FLAGS_stringcount)); + t.GenerateRandom(absl::GetFlag(FLAGS_regexpseed), + absl::GetFlag(FLAGS_regexpcount)); + absl::PrintF("%d regexps, %d tests, %d failures 
[%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size()); EXPECT_EQ(0, t.failures()); } diff --git a/re2/testing/re2_arg_test.cc b/re2/testing/re2_arg_test.cc index f62e17c..4b00be3 100644 --- a/re2/testing/re2_arg_test.cc +++ b/re2/testing/re2_arg_test.cc @@ -10,7 +10,8 @@ #include <stdint.h> #include <string.h> -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/re2.h" @@ -87,7 +88,7 @@ const SuccessTable kSuccessTable[] = { { "18446744073709551616", 0, { false, false, false, false, false, false }}, }; -const int kNumStrings = arraysize(kSuccessTable); +const int kNumStrings = ABSL_ARRAYSIZE(kSuccessTable); // It's ugly to use a macro, but we apparently can't use the EXPECT_EQ // macro outside of a TEST block and this seems to be the only way to @@ -157,4 +158,26 @@ TEST(RE2ArgTest, ParseFromTest) { #endif } +TEST(RE2ArgTest, OptionalDoubleTest) { + absl::optional<double> opt; + RE2::Arg arg(&opt); + EXPECT_TRUE(arg.Parse(NULL, 0)); + EXPECT_FALSE(opt.has_value()); + EXPECT_FALSE(arg.Parse("", 0)); + EXPECT_TRUE(arg.Parse("28.30", 5)); + EXPECT_TRUE(opt.has_value()); + EXPECT_EQ(*opt, 28.30); +} + +TEST(RE2ArgTest, OptionalIntWithCRadixTest) { + absl::optional<int> opt; + RE2::Arg arg = RE2::CRadix(&opt); + EXPECT_TRUE(arg.Parse(NULL, 0)); + EXPECT_FALSE(opt.has_value()); + EXPECT_FALSE(arg.Parse("", 0)); + EXPECT_TRUE(arg.Parse("0xb0e", 5)); + EXPECT_TRUE(opt.has_value()); + EXPECT_EQ(*opt, 2830); +} + } // namespace re2 diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index b1f7d73..151525f 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -18,9 +18,10 @@ #include <unistd.h> /* for sysconf */ #endif -#include "util/test.h" +#include "absl/base/macros.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/re2.h" #include "re2/regexp.h" @@ -238,7 +239,7 
@@ TEST(RE2, Consume) { std::string word; std::string s(" aaa b!@#$@#$cccc"); - StringPiece input(s); + absl::string_view input(s); ASSERT_TRUE(RE2::Consume(&input, r, &word)); ASSERT_EQ(word, "aaa") << " input: " << input; @@ -249,7 +250,7 @@ TEST(RE2, Consume) { TEST(RE2, ConsumeN) { const std::string s(" one two three 4"); - StringPiece input(s); + absl::string_view input(s); RE2::Arg argv[2]; const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; @@ -276,7 +277,7 @@ TEST(RE2, FindAndConsume) { std::string word; std::string s(" aaa b!@#$@#$cccc"); - StringPiece input(s); + absl::string_view input(s); ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); ASSERT_EQ(word, "aaa"); @@ -296,7 +297,7 @@ TEST(RE2, FindAndConsume) { TEST(RE2, FindAndConsumeN) { const std::string s(" one two three 4"); - StringPiece input(s); + absl::string_view input(s); RE2::Arg argv[2]; const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; @@ -345,17 +346,17 @@ TEST(RE2, MatchNumberPeculiarity) { TEST(RE2, Match) { RE2 re("((\\w+):([0-9]+))"); // extracts host and port - StringPiece group[4]; + absl::string_view group[4]; // No match. - StringPiece s = "zyzzyva"; + absl::string_view s = "zyzzyva"; ASSERT_FALSE( - re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); + re.Match(s, 0, s.size(), RE2::UNANCHORED, group, ABSL_ARRAYSIZE(group))); // Matches and extracts. 
s = "a chrisr:9000 here"; ASSERT_TRUE( - re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); + re.Match(s, 0, s.size(), RE2::UNANCHORED, group, ABSL_ARRAYSIZE(group))); ASSERT_EQ(group[0], "chrisr:9000"); ASSERT_EQ(group[1], "chrisr:9000"); ASSERT_EQ(group[2], "chrisr"); @@ -528,7 +529,7 @@ TEST(EmptyCharset, Fuzz) { "[^\\D\\d]", "[^\\D[:digit:]]" }; - for (size_t i = 0; i < arraysize(empties); i++) + for (size_t i = 0; i < ABSL_ARRAYSIZE(empties); i++) ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); } @@ -542,8 +543,8 @@ TEST(EmptyCharset, BitstateAssumptions) { "((((()))))" "([^\\S\\s]|[^\\S\\s])?", "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" }; - StringPiece group[6]; - for (size_t i = 0; i < arraysize(nop_empties); i++) + absl::string_view group[6]; + for (size_t i = 0; i < ABSL_ARRAYSIZE(nop_empties); i++) ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6)); } @@ -672,15 +673,15 @@ TEST(RE2, FullMatchIntegerArg) { TEST(RE2, FullMatchStringArg) { std::string s; - // String-arg + // string-arg ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &s)); ASSERT_EQ(s, std::string("ell")); } -TEST(RE2, FullMatchStringPieceArg) { +TEST(RE2, FullMatchStringViewArg) { int i; - // StringPiece-arg - StringPiece sp; + absl::string_view sp; + // string_view-arg ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i)); ASSERT_EQ(sp.size(), 4); ASSERT_TRUE(memcmp(sp.data(), "ruby", 4) == 0); @@ -742,7 +743,7 @@ TEST(RE2, FullMatchTypedNullArg) { // Ignore non-void* NULL arg ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL)); ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL)); - ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL)); + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (absl::string_view*)NULL)); ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL)); ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL)); 
ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL)); @@ -777,7 +778,8 @@ TEST(RE2, NULTerminated) { v[pagesize - 1] = '1'; x = 0; - ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); + ASSERT_TRUE( + RE2::FullMatch(absl::string_view(v + pagesize - 1, 1), "(.*)", &x)); ASSERT_EQ(x, 1); #endif } @@ -914,10 +916,10 @@ TEST(RE2, FloatingPointFullMatchTypes) { // implementation of strtof(3). And apparently MSVC too. Sigh. #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); - ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); + ASSERT_EQ(v, 0.1f) << absl::StrFormat("%.8g != %.8g", v, 0.1f); ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); ASSERT_EQ(v, 6700000000081920.1f) - << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); + << absl::StrFormat("%.8g != %.8g", v, 6700000000081920.1f); #endif } { @@ -929,10 +931,10 @@ TEST(RE2, FloatingPointFullMatchTypes) { ASSERT_EQ(v, double(1e23)); ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); - ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); + ASSERT_EQ(v, 0.1) << absl::StrFormat("%.17g != %.17g", v, 0.1); ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); ASSERT_EQ(v, 1.0000000596046448) - << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); + << absl::StrFormat("%.17g != %.17g", v, 1.0000000596046448); } } @@ -1242,21 +1244,21 @@ TEST(RE2, DeepRecursion) { // not implementing case-folding. 
TEST(CaseInsensitive, MatchAndConsume) { std::string text = "A fish named *Wanda*"; - StringPiece sp(text); - StringPiece result; + absl::string_view sp(text); + absl::string_view result; EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result)); EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); } -// RE2 should permit implicit conversions from string, StringPiece, const char*, +// RE2 should permit implicit conversions from string, string_view, const char*, // and C string literals. TEST(RE2, ImplicitConversions) { std::string re_string("."); - StringPiece re_stringpiece("."); - const char* re_cstring = "."; + absl::string_view re_string_view("."); + const char* re_c_string = "."; EXPECT_TRUE(RE2::PartialMatch("e", re_string)); - EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece)); - EXPECT_TRUE(RE2::PartialMatch("e", re_cstring)); + EXPECT_TRUE(RE2::PartialMatch("e", re_string_view)); + EXPECT_TRUE(RE2::PartialMatch("e", re_c_string)); EXPECT_TRUE(RE2::PartialMatch("e", ".")); } @@ -1309,7 +1311,7 @@ static struct ErrorTest { { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" }, }; TEST(RE2, ErrorCodeAndArg) { - for (size_t i = 0; i < arraysize(error_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(error_tests); i++) { RE2 re(error_tests[i].regexp, RE2::Quiet); EXPECT_FALSE(re.ok()); EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); @@ -1332,13 +1334,13 @@ static struct NeverTest { TEST(RE2, NeverNewline) { RE2::Options opt; opt.set_never_nl(true); - for (size_t i = 0; i < arraysize(never_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(never_tests); i++) { const NeverTest& t = never_tests[i]; RE2 re(t.regexp, opt); if (t.match == NULL) { EXPECT_FALSE(re.PartialMatch(t.text, re)); } else { - StringPiece m; + absl::string_view m; EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); EXPECT_EQ(m, t.match); } @@ -1371,7 +1373,7 @@ TEST(RE2, BitstateCaptureBug) { RE2::Options opt; opt.set_max_mem(20000); RE2 
re("(_________$)", opt); - StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; + absl::string_view s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); } @@ -1450,10 +1452,10 @@ TEST(RE2, NullVsEmptyString) { RE2 re(".*"); EXPECT_TRUE(re.ok()); - StringPiece null; + absl::string_view null; EXPECT_TRUE(RE2::FullMatch(null, re)); - StringPiece empty(""); + absl::string_view empty(""); EXPECT_TRUE(RE2::FullMatch(empty, re)); } @@ -1465,25 +1467,25 @@ TEST(RE2, NullVsEmptyStringSubmatches) { EXPECT_TRUE(re.ok()); // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. - StringPiece matches[4]; + absl::string_view matches[4]; - for (size_t i = 0; i < arraysize(matches); i++) + for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) matches[i] = "bar"; - StringPiece null; + absl::string_view null; EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, - matches, arraysize(matches))); - for (size_t i = 0; i < arraysize(matches); i++) { + matches, ABSL_ARRAYSIZE(matches))); + for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) { EXPECT_TRUE(matches[i].data() == NULL); // always null EXPECT_TRUE(matches[i].empty()); } - for (size_t i = 0; i < arraysize(matches); i++) + for (size_t i = 0; i < ABSL_ARRAYSIZE(matches); i++) matches[i] = "bar"; - StringPiece empty(""); + absl::string_view empty(""); EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, - matches, arraysize(matches))); + matches, ABSL_ARRAYSIZE(matches))); EXPECT_TRUE(matches[0].data() != NULL); // empty, not null EXPECT_TRUE(matches[0].empty()); EXPECT_TRUE(matches[1].data() != NULL); // empty, not null @@ -1497,7 +1499,7 @@ TEST(RE2, NullVsEmptyStringSubmatches) { // Issue 1816809 TEST(RE2, Bug1816809) { RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))"); - StringPiece piece("llx-3;llx4"); + absl::string_view piece("llx-3;llx4"); std::string x; EXPECT_TRUE(RE2::Consume(&piece, re, &x)); } @@ -1615,7 +1617,7 @@ 
TEST(RE2, Bug26356109) { ASSERT_TRUE(re.ok()); std::string s = "abc"; - StringPiece m; + absl::string_view m; ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'"; @@ -1645,7 +1647,7 @@ TEST(RE2, Issue310) { // (?:|a)* matched more text than (?:|a)+ did. std::string s = "aaa"; - StringPiece m; + absl::string_view m; RE2 star("(?:|a)*"); ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc index 3eeb098..5352b31 100644 --- a/re2/testing/regexp_benchmark.cc +++ b/re2/testing/regexp_benchmark.cc @@ -9,19 +9,18 @@ #include <stdlib.h> #include <string> #include <thread> -#include <unordered_map> #include <utility> -#include "util/benchmark.h" -#include "util/test.h" -#include "util/flags.h" +#include "absl/container/flat_hash_map.h" +#include "absl/flags/flag.h" +#include "absl/strings/str_format.h" +#include "absl/synchronization/mutex.h" +#include "benchmark/benchmark.h" #include "util/logging.h" #include "util/malloc_counter.h" -#include "util/strutil.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" -#include "util/mutex.h" #include "util/pcre.h" namespace re2 { @@ -41,7 +40,7 @@ void Test() { CHECK(prog->IsOnePass()); CHECK(prog->CanBitState()); const char* text = "650-253-0001"; - StringPiece sp[4]; + absl::string_view sp[4]; CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); CHECK_EQ(sp[0], "650-253-0001"); CHECK_EQ(sp[1], "650"); @@ -61,22 +60,22 @@ void MemoryUsage() { CHECK(re); // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly, // because LOG(INFO) might do a big allocation before they get evaluated. 
- fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "Regexp: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); mc.Reset(); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->IsOnePass()); CHECK(prog->CanBitState()); - fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "Prog: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); mc.Reset(); - StringPiece sp[4]; + absl::string_view sp[4]; CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); - fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "Search: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); delete prog; re->Decref(); } @@ -85,22 +84,22 @@ void MemoryUsage() { MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); PCRE re(regexp, PCRE::UTF8); - fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "RE: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); PCRE::FullMatch(text, re); - fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "RE: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); } { MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); PCRE* re = new PCRE(regexp, PCRE::UTF8); - fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "PCRE*: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); PCRE::FullMatch(text, *re); - fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "PCRE*: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); delete re; } @@ -108,15 +107,15 @@ void MemoryUsage() { MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); 
RE2 re(regexp); - fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "RE2: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); RE2::FullMatch(text, re); - fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", - mc.HeapGrowth(), mc.PeakHeapGrowth()); + absl::FPrintF(stderr, "RE2: %7d bytes (peak=%d)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); } - fprintf(stderr, "sizeof: PCRE=%zd RE2=%zd Prog=%zd Inst=%zd\n", - sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst)); + absl::FPrintF(stderr, "sizeof: PCRE=%d RE2=%d Prog=%d Inst=%d\n", + sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst)); } int NumCPUs() { @@ -128,7 +127,7 @@ int NumCPUs() { // and not interesting. typedef void SearchImpl(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match); SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE, @@ -136,7 +135,7 @@ SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE, SearchCachedBitState, SearchCachedPCRE, SearchCachedRE2; typedef void ParseImpl(benchmark::State& state, const char* regexp, - const StringPiece& text); + absl::string_view text); ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, Parse1PCRE, Parse1RE2, Parse1Backtrack, Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState, @@ -318,8 +317,8 @@ void FindAndConsume(benchmark::State& state) { s.append("Hello World"); RE2 re("((Hello World))"); for (auto _ : state) { - StringPiece t = s; - StringPiece u; + absl::string_view t = s; + absl::string_view u; CHECK(RE2::FindAndConsume(&t, re, &u)); CHECK_EQ(u, "Hello World"); } @@ -442,7 +441,7 @@ BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCP // Benchmark: use regexp to find phone number. 
void SearchDigits(benchmark::State& state, SearchImpl* search) { - StringPiece s("650-253-0001"); + absl::string_view s("650-253-0001"); search(state, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true); state.SetItemsProcessed(state.iterations()); } @@ -467,7 +466,7 @@ BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs()); void Parse3Digits(benchmark::State& state, void (*parse3)(benchmark::State&, const char*, - const StringPiece&)) { + absl::string_view)) { parse3(state, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001"); state.SetItemsProcessed(state.iterations()); } @@ -506,7 +505,7 @@ BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs()); void Parse3DigitDs(benchmark::State& state, void (*parse3)(benchmark::State&, const char*, - const StringPiece&)) { + absl::string_view)) { parse3(state, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001"); state.SetItemsProcessed(state.iterations()); } @@ -547,7 +546,7 @@ BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs()); void Parse1Split(benchmark::State& state, void (*parse1)(benchmark::State&, const char*, - const StringPiece&)) { + absl::string_view)) { parse1(state, "[0-9]+-(.*)", "650-253-0001"); state.SetItemsProcessed(state.iterations()); } @@ -584,7 +583,7 @@ BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs()); void Parse1SplitHard(benchmark::State& state, void (*run)(benchmark::State&, const char*, - const StringPiece&)) { + absl::string_view)) { run(state, "[0-9]+.(.*)", "650-253-0001"); state.SetItemsProcessed(state.iterations()); } @@ -619,7 +618,7 @@ BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs()); void Parse1SplitBig1(benchmark::State& state, void (*run)(benchmark::State&, const char*, - const StringPiece&)) { + absl::string_view)) { std::string s; s.append(100000, 'x'); s.append("650-253-0001"); @@ -639,7 +638,7 @@ BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs()); void Parse1SplitBig2(benchmark::State& state, void 
(*run)(benchmark::State&, const char*, - const StringPiece&)) { + absl::string_view)) { std::string s; s.append("650-253-"); s.append(100000, '0'); @@ -756,20 +755,20 @@ void RunBuild(benchmark::State& state, const std::string& regexp, } // namespace re2 -DEFINE_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)", - "regexp for compile benchmarks"); +ABSL_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)", + "regexp for compile benchmarks"); namespace re2 { -void BM_PCRE_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompilePCRE); } -void BM_Regexp_Parse(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), ParseRegexp); } -void BM_Regexp_Simplify(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyRegexp); } -void BM_CompileToProg(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileToProg); } -void BM_CompileByteMap(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileByteMap); } -void BM_Regexp_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRegexp); } -void BM_Regexp_SimplifyCompile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp); } -void BM_Regexp_NullWalk(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), NullWalkRegexp); } -void BM_RE2_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRE2); } +void BM_PCRE_Compile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompilePCRE); } +void BM_Regexp_Parse(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), ParseRegexp); } +void BM_Regexp_Simplify(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), SimplifyRegexp); } +void BM_CompileToProg(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), 
CompileToProg); } +void BM_CompileByteMap(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileByteMap); } +void BM_Regexp_Compile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileRegexp); } +void BM_Regexp_SimplifyCompile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp); } +void BM_Regexp_NullWalk(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), NullWalkRegexp); } +void BM_RE2_Compile(benchmark::State& state) { RunBuild(state, absl::GetFlag(FLAGS_compile_regexp), CompileRE2); } #ifdef USEPCRE BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs()); @@ -859,7 +858,7 @@ DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA) // Anchored says whether to run an anchored search. void SearchDFA(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); @@ -867,8 +866,8 @@ void SearchDFA(benchmark::State& state, const char* regexp, Prog* prog = re->CompileToProg(0); CHECK(prog); bool failed = false; - CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, - NULL, &failed, NULL), + CHECK_EQ(prog->SearchDFA(text, absl::string_view(), anchor, + Prog::kFirstMatch, NULL, &failed, NULL), expect_match); CHECK(!failed); delete prog; @@ -877,15 +876,15 @@ void SearchDFA(benchmark::State& state, const char* regexp, } void SearchNFA(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); - CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, - NULL, 0), + 
CHECK_EQ(prog->SearchNFA(text, absl::string_view(), anchor, + Prog::kFirstMatch, NULL, 0), expect_match); delete prog; re->Decref(); @@ -893,7 +892,7 @@ void SearchNFA(benchmark::State& state, const char* regexp, } void SearchOnePass(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); @@ -909,7 +908,7 @@ void SearchOnePass(benchmark::State& state, const char* regexp, } void SearchBitState(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); @@ -925,7 +924,7 @@ void SearchBitState(benchmark::State& state, const char* regexp, } void SearchPCRE(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); @@ -938,7 +937,7 @@ void SearchPCRE(benchmark::State& state, const char* regexp, } void SearchRE2(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { for (auto _ : state) { RE2 re(regexp); @@ -955,9 +954,9 @@ void SearchRE2(benchmark::State& state, const char* regexp, // search time without the per-regexp overhead. 
Prog* GetCachedProg(const char* regexp) { - static auto& mutex = *new Mutex; - MutexLock lock(&mutex); - static auto& cache = *new std::unordered_map<std::string, Prog*>; + static auto& mutex = *new absl::Mutex; + absl::MutexLock lock(&mutex); + static auto& cache = *new absl::flat_hash_map<std::string, Prog*>; Prog* prog = cache[regexp]; if (prog == NULL) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); @@ -973,9 +972,9 @@ Prog* GetCachedProg(const char* regexp) { } PCRE* GetCachedPCRE(const char* regexp) { - static auto& mutex = *new Mutex; - MutexLock lock(&mutex); - static auto& cache = *new std::unordered_map<std::string, PCRE*>; + static auto& mutex = *new absl::Mutex; + absl::MutexLock lock(&mutex); + static auto& cache = *new absl::flat_hash_map<std::string, PCRE*>; PCRE* re = cache[regexp]; if (re == NULL) { re = new PCRE(regexp, PCRE::UTF8); @@ -986,9 +985,9 @@ PCRE* GetCachedPCRE(const char* regexp) { } RE2* GetCachedRE2(const char* regexp) { - static auto& mutex = *new Mutex; - MutexLock lock(&mutex); - static auto& cache = *new std::unordered_map<std::string, RE2*>; + static auto& mutex = *new absl::Mutex; + absl::MutexLock lock(&mutex); + static auto& cache = *new absl::flat_hash_map<std::string, RE2*>; RE2* re = cache[regexp]; if (re == NULL) { re = new RE2(regexp); @@ -999,31 +998,31 @@ RE2* GetCachedRE2(const char* regexp) { } void SearchCachedDFA(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { Prog* prog = GetCachedProg(regexp); for (auto _ : state) { bool failed = false; - CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, - NULL, &failed, NULL), + CHECK_EQ(prog->SearchDFA(text, absl::string_view(), anchor, + Prog::kFirstMatch, NULL, &failed, NULL), expect_match); CHECK(!failed); } } void SearchCachedNFA(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + 
absl::string_view text, Prog::Anchor anchor, bool expect_match) { Prog* prog = GetCachedProg(regexp); for (auto _ : state) { - CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, - NULL, 0), + CHECK_EQ(prog->SearchNFA(text, absl::string_view(), anchor, + Prog::kFirstMatch, NULL, 0), expect_match); } } void SearchCachedOnePass(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); @@ -1034,7 +1033,7 @@ void SearchCachedOnePass(benchmark::State& state, const char* regexp, } void SearchCachedBitState(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); @@ -1045,7 +1044,7 @@ void SearchCachedBitState(benchmark::State& state, const char* regexp, } void SearchCachedPCRE(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { @@ -1057,7 +1056,7 @@ void SearchCachedPCRE(benchmark::State& state, const char* regexp, } void SearchCachedRE2(benchmark::State& state, const char* regexp, - const StringPiece& text, Prog::Anchor anchor, + absl::string_view text, Prog::Anchor anchor, bool expect_match) { RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { @@ -1072,14 +1071,14 @@ void SearchCachedRE2(benchmark::State& state, const char* regexp, // extracting three submatches. Expects match always. 
void Parse3NFA(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); - StringPiece sp[4]; // 4 because sp[0] is whole match. - CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + absl::string_view sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored, Prog::kFullMatch, sp, 4)); delete prog; re->Decref(); @@ -1087,14 +1086,14 @@ void Parse3NFA(benchmark::State& state, const char* regexp, } void Parse3OnePass(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->IsOnePass()); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); delete prog; re->Decref(); @@ -1102,14 +1101,14 @@ void Parse3OnePass(benchmark::State& state, const char* regexp, } void Parse3BitState(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->CanBitState()); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. 
CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); delete prog; re->Decref(); @@ -1117,13 +1116,13 @@ void Parse3BitState(benchmark::State& state, const char* regexp, } void Parse3Backtrack(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); delete prog; re->Decref(); @@ -1131,77 +1130,77 @@ void Parse3Backtrack(benchmark::State& state, const char* regexp, } void Parse3PCRE(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); CHECK_EQ(re.error(), ""); - StringPiece sp1, sp2, sp3; + absl::string_view sp1, sp2, sp3; CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); } } void Parse3RE2(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { RE2 re(regexp); CHECK_EQ(re.error(), ""); - StringPiece sp1, sp2, sp3; + absl::string_view sp1, sp2, sp3; CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); } } void Parse3CachedNFA(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. 
for (auto _ : state) { - CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored, Prog::kFullMatch, sp, 4)); } } void Parse3CachedOnePass(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } } void Parse3CachedBitState(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } } void Parse3CachedBacktrack(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); - StringPiece sp[4]; // 4 because sp[0] is whole match. + absl::string_view sp[4]; // 4 because sp[0] is whole match. 
for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); } } void Parse3CachedPCRE(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { PCRE& re = *GetCachedPCRE(regexp); - StringPiece sp1, sp2, sp3; + absl::string_view sp1, sp2, sp3; for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); } } void Parse3CachedRE2(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { RE2& re = *GetCachedRE2(regexp); - StringPiece sp1, sp2, sp3; + absl::string_view sp1, sp2, sp3; for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); } @@ -1211,14 +1210,14 @@ void Parse3CachedRE2(benchmark::State& state, const char* regexp, // extracting three submatches. Expects match always. void Parse1NFA(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); - StringPiece sp[2]; // 2 because sp[0] is whole match. - CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + absl::string_view sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored, Prog::kFullMatch, sp, 2)); delete prog; re->Decref(); @@ -1226,14 +1225,14 @@ void Parse1NFA(benchmark::State& state, const char* regexp, } void Parse1OnePass(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->IsOnePass()); - StringPiece sp[2]; // 2 because sp[0] is whole match. + absl::string_view sp[2]; // 2 because sp[0] is whole match. 
CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); delete prog; re->Decref(); @@ -1241,14 +1240,14 @@ void Parse1OnePass(benchmark::State& state, const char* regexp, } void Parse1BitState(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->CanBitState()); - StringPiece sp[2]; // 2 because sp[0] is whole match. + absl::string_view sp[2]; // 2 because sp[0] is whole match. CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); delete prog; re->Decref(); @@ -1256,114 +1255,114 @@ void Parse1BitState(benchmark::State& state, const char* regexp, } void Parse1PCRE(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); CHECK_EQ(re.error(), ""); - StringPiece sp1; + absl::string_view sp1; CHECK(PCRE::FullMatch(text, re, &sp1)); } } void Parse1RE2(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { for (auto _ : state) { RE2 re(regexp); CHECK_EQ(re.error(), ""); - StringPiece sp1; + absl::string_view sp1; CHECK(RE2::FullMatch(text, re, &sp1)); } } void Parse1CachedNFA(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); - StringPiece sp[2]; // 2 because sp[0] is whole match. + absl::string_view sp[2]; // 2 because sp[0] is whole match. 
for (auto _ : state) { - CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, + CHECK(prog->SearchNFA(text, absl::string_view(), Prog::kAnchored, Prog::kFullMatch, sp, 2)); } } void Parse1CachedOnePass(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); - StringPiece sp[2]; // 2 because sp[0] is whole match. + absl::string_view sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } } void Parse1CachedBitState(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); - StringPiece sp[2]; // 2 because sp[0] is whole match. + absl::string_view sp[2]; // 2 because sp[0] is whole match. for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } } void Parse1CachedBacktrack(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { Prog* prog = GetCachedProg(regexp); - StringPiece sp[2]; // 2 because sp[0] is whole match. + absl::string_view sp[2]; // 2 because sp[0] is whole match. 
for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); } } void Parse1CachedPCRE(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { PCRE& re = *GetCachedPCRE(regexp); - StringPiece sp1; + absl::string_view sp1; for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1)); } } void Parse1CachedRE2(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { RE2& re = *GetCachedRE2(regexp); - StringPiece sp1; + absl::string_view sp1; for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1)); } } void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { - StringPiece sp1, sp2; + absl::string_view sp1, sp2; CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); } } void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { - StringPiece sp1, sp2; + absl::string_view sp1, sp2; CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); } } void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { PCRE& re = *GetCachedPCRE(regexp); for (auto _ : state) { - StringPiece sp1; + absl::string_view sp1; CHECK(PCRE::PartialMatch(text, re, &sp1)); } } void SearchParse1CachedRE2(benchmark::State& state, const char* regexp, - const StringPiece& text) { + absl::string_view text) { RE2& re = *GetCachedRE2(regexp); for (auto _ : state) { - StringPiece sp1; + absl::string_view sp1; CHECK(RE2::PartialMatch(text, re, &sp1)); } } @@ -1409,7 +1408,7 @@ static std::string http_text = "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1"; void HTTPPartialMatchPCRE(benchmark::State& state) { - StringPiece a; + absl::string_view a; PCRE 
re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); for (auto _ : state) { PCRE::PartialMatch(http_text, re, &a); @@ -1417,7 +1416,7 @@ void HTTPPartialMatchPCRE(benchmark::State& state) { } void HTTPPartialMatchRE2(benchmark::State& state) { - StringPiece a; + absl::string_view a; RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); for (auto _ : state) { RE2::PartialMatch(http_text, re, &a); @@ -1433,7 +1432,7 @@ static std::string smallhttp_text = "GET /abc HTTP/1.1"; void SmallHTTPPartialMatchPCRE(benchmark::State& state) { - StringPiece a; + absl::string_view a; PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); for (auto _ : state) { PCRE::PartialMatch(smallhttp_text, re, &a); @@ -1441,7 +1440,7 @@ void SmallHTTPPartialMatchPCRE(benchmark::State& state) { } void SmallHTTPPartialMatchRE2(benchmark::State& state) { - StringPiece a; + absl::string_view a; RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); for (auto _ : state) { RE2::PartialMatch(smallhttp_text, re, &a); @@ -1454,7 +1453,7 @@ BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); void DotMatchPCRE(benchmark::State& state) { - StringPiece a; + absl::string_view a; PCRE re("(?-s)^(.+)"); for (auto _ : state) { PCRE::PartialMatch(http_text, re, &a); @@ -1462,7 +1461,7 @@ void DotMatchPCRE(benchmark::State& state) { } void DotMatchRE2(benchmark::State& state) { - StringPiece a; + absl::string_view a; RE2 re("(?-s)^(.+)"); for (auto _ : state) { RE2::PartialMatch(http_text, re, &a); @@ -1475,7 +1474,7 @@ BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs()); BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs()); void ASCIIMatchPCRE(benchmark::State& state) { - StringPiece a; + absl::string_view a; PCRE re("(?-s)^([ -~]+)"); for (auto _ : state) { PCRE::PartialMatch(http_text, re, &a); @@ -1483,7 +1482,7 @@ void ASCIIMatchPCRE(benchmark::State& state) { } void ASCIIMatchRE2(benchmark::State& state) { - StringPiece a; + absl::string_view a; RE2 re("(?-s)^([ -~]+)"); for 
(auto _ : state) { RE2::PartialMatch(http_text, re, &a); diff --git a/re2/testing/regexp_generator.cc b/re2/testing/regexp_generator.cc index 3eeda25..b1761ed 100644 --- a/re2/testing/regexp_generator.cc +++ b/re2/testing/regexp_generator.cc @@ -29,9 +29,11 @@ #include <string> #include <vector> -#include "util/test.h" +#include "absl/base/macros.h" +#include "absl/strings/escaping.h" +#include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "util/logging.h" -#include "util/strutil.h" #include "util/utf.h" #include "re2/testing/regexp_generator.h" @@ -47,7 +49,7 @@ const std::vector<std::string>& RegexpGenerator::EgrepOps() { "%s?", "%s\\C*", }; - static std::vector<std::string> v(ops, ops + arraysize(ops)); + static std::vector<std::string> v(ops, ops + ABSL_ARRAYSIZE(ops)); return v; } @@ -199,19 +201,21 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) { regexps.push(post[i]); break; case 1: { + auto fmt = absl::ParsedFormat<'s'>::New(post[i]); + CHECK(fmt != nullptr); std::string a = regexps.top(); regexps.pop(); - regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); + regexps.push("(?:" + absl::StrFormat(*fmt, a) + ")"); break; } case 2: { + auto fmt = absl::ParsedFormat<'s', 's'>::New(post[i]); + CHECK(fmt != nullptr); std::string b = regexps.top(); regexps.pop(); std::string a = regexps.top(); regexps.pop(); - regexps.push("(?:" + - StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + - ")"); + regexps.push("(?:" + absl::StrFormat(*fmt, a, b) + ")"); break; } } @@ -219,13 +223,13 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) { if (regexps.size() != 1) { // Internal error - should never happen. 
- printf("Bad regexp program:\n"); + absl::PrintF("Bad regexp program:\n"); for (size_t i = 0; i < post.size(); i++) { - printf(" %s\n", CEscape(post[i]).c_str()); + absl::PrintF(" %s\n", absl::CEscape(post[i])); } - printf("Stack after running program:\n"); + absl::PrintF("Stack after running program:\n"); while (!regexps.empty()) { - printf(" %s\n", CEscape(regexps.top()).c_str()); + absl::PrintF(" %s\n", absl::CEscape(regexps.top())); regexps.pop(); } LOG(FATAL) << "Bad regexp program."; @@ -238,7 +242,7 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) { } // Split s into an vector of strings, one for each UTF-8 character. -std::vector<std::string> Explode(const StringPiece& s) { +std::vector<std::string> Explode(absl::string_view s) { std::vector<std::string> v; for (const char *q = s.data(); q < s.data() + s.size(); ) { @@ -253,7 +257,7 @@ std::vector<std::string> Explode(const StringPiece& s) { // Split string everywhere a substring is found, returning // vector of pieces. 
-std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) { +std::vector<std::string> Split(absl::string_view sep, absl::string_view s) { std::vector<std::string> v; if (sep.empty()) @@ -261,7 +265,7 @@ std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) { const char *p = s.data(); for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) { - if (StringPiece(q, sep.size()) == sep) { + if (absl::string_view(q, sep.size()) == sep) { v.push_back(std::string(p, q - p)); p = q + sep.size(); q = p - 1; // -1 for ++ in loop diff --git a/re2/testing/regexp_generator.h b/re2/testing/regexp_generator.h index 7d72aff..e1be1a9 100644 --- a/re2/testing/regexp_generator.h +++ b/re2/testing/regexp_generator.h @@ -13,8 +13,7 @@ #include <string> #include <vector> -#include "util/util.h" -#include "re2/stringpiece.h" +#include "absl/strings/string_view.h" namespace re2 { @@ -66,11 +65,11 @@ class RegexpGenerator { // Helpers for preparing arguments to RegexpGenerator constructor. // Returns one string for each character in s. -std::vector<std::string> Explode(const StringPiece& s); +std::vector<std::string> Explode(absl::string_view s); // Splits string everywhere sep is found, returning // vector of pieces. 
-std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s); +std::vector<std::string> Split(absl::string_view sep, absl::string_view s); } // namespace re2 diff --git a/re2/testing/regexp_test.cc b/re2/testing/regexp_test.cc index f7e7e92..ef8f59d 100644 --- a/re2/testing/regexp_test.cc +++ b/re2/testing/regexp_test.cc @@ -9,7 +9,7 @@ #include <string> #include <vector> -#include "util/test.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/regexp.h" diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc index 60a11f8..231fd34 100644 --- a/re2/testing/required_prefix_test.cc +++ b/re2/testing/required_prefix_test.cc @@ -4,7 +4,8 @@ #include <string> -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -44,7 +45,7 @@ static PrefixTest tests[] = { }; TEST(RequiredPrefix, SimpleTests) { - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { const PrefixTest& t = tests[i]; for (size_t j = 0; j < 2; j++) { Regexp::ParseFlags flags = Regexp::LikePerl; @@ -106,7 +107,7 @@ static PrefixTest for_accel_tests[] = { }; TEST(RequiredPrefixForAccel, SimpleTests) { - for (size_t i = 0; i < arraysize(for_accel_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(for_accel_tests); i++) { const PrefixTest& t = for_accel_tests[i]; for (size_t j = 0; j < 2; j++) { Regexp::ParseFlags flags = Regexp::LikePerl; @@ -171,7 +172,7 @@ static const char* prefix_accel_tests[] = { }; TEST(PrefixAccel, SimpleTests) { - for (size_t i = 0; i < arraysize(prefix_accel_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(prefix_accel_tests); i++) { const char* pattern = prefix_accel_tests[i]; Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL); ASSERT_TRUE(re != NULL); diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc index 5d86dbf..166652a 100644 --- 
a/re2/testing/search_test.cc +++ b/re2/testing/search_test.cc @@ -2,7 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "re2/prog.h" #include "re2/regexp.h" #include "re2/testing/tester.h" @@ -314,7 +315,7 @@ RegexpTest simple_tests[] = { TEST(Regexp, SearchTests) { int failures = 0; - for (size_t i = 0; i < arraysize(simple_tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(simple_tests); i++) { const RegexpTest& t = simple_tests[i]; if (!TestRegexpOnText(t.regexp, t.text)) failures++; diff --git a/re2/testing/set_test.cc b/re2/testing/set_test.cc index 5a760c4..fdbc0b2 100644 --- a/re2/testing/set_test.cc +++ b/re2/testing/set_test.cc @@ -7,7 +7,7 @@ #include <vector> #include <utility> -#include "util/test.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/re2.h" #include "re2/set.h" diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc index 9dcd4ac..5b683f5 100644 --- a/re2/testing/simplify_test.cc +++ b/re2/testing/simplify_test.cc @@ -7,7 +7,8 @@ #include <string.h> #include <string> -#include "util/test.h" +#include "absl/base/macros.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/regexp.h" @@ -139,6 +140,22 @@ static Test tests[] = { { "(){1,}", "()+" }, { "(){0,2}", "(?:()()?)?" }, + // For an empty-width op OR a concatenation or alternation of empty-width + // ops, test that the repetition count is capped at 1. + { "(?:^){0,}", "^*" }, // x{0,} -> x* + { "(?:$){28,}", "$+" }, // x{N,} -> x{1,} -> x+ + { "(?-m:^){0,30}", "(?-m:^)?" }, // x{0,N} -> x{0,1} -> x? + { "(?-m:$){28,30}", "(?-m:$)" }, // x{N,M} -> x{1,1} -> x + { "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" }, + { "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" }, + // NonGreedy should also be handled. + { "(?:^){0,}?", "^*?" }, + { "(?:$){28,}?", "$+?" }, + { "(?-m:^){0,30}?", "(?-m:^)??" 
}, + { "(?-m:$){28,30}?", "(?-m:$)" }, + { "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" }, + { "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" }, + // Test that coalescing occurs and that the resulting repeats are simplified. // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: { "a*a*", "a*" }, @@ -245,7 +262,7 @@ static Test tests[] = { }; TEST(TestSimplify, SimpleRegexps) { - for (size_t i = 0; i < arraysize(tests); i++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) { RegexpStatus status; VLOG(1) << "Testing " << tests[i].regexp; Regexp* re = Regexp::Parse(tests[i].regexp, diff --git a/re2/testing/string_generator.cc b/re2/testing/string_generator.cc index 44837fe..1891b14 100644 --- a/re2/testing/string_generator.cc +++ b/re2/testing/string_generator.cc @@ -11,7 +11,7 @@ #include <string> #include <vector> -#include "util/test.h" +#include "gtest/gtest.h" #include "util/logging.h" #include "re2/testing/string_generator.h" @@ -81,11 +81,11 @@ bool StringGenerator::RandomDigits() { // currently described by digits_. Calls IncrementDigits // after computing the string, so that it knows the answer // for subsequent HasNext() calls. 
-const StringPiece& StringGenerator::Next() { +absl::string_view StringGenerator::Next() { CHECK(hasnext_); if (generate_null_) { generate_null_ = false; - sp_ = StringPiece(); + sp_ = absl::string_view(); return sp_; } s_.clear(); diff --git a/re2/testing/string_generator.h b/re2/testing/string_generator.h index 73fbb51..0d6f5fc 100644 --- a/re2/testing/string_generator.h +++ b/re2/testing/string_generator.h @@ -14,8 +14,7 @@ #include <string> #include <vector> -#include "util/util.h" -#include "re2/stringpiece.h" +#include "absl/strings/string_view.h" namespace re2 { @@ -24,7 +23,7 @@ class StringGenerator { StringGenerator(int maxlen, const std::vector<std::string>& alphabet); ~StringGenerator() {} - const StringPiece& Next(); + absl::string_view Next(); bool HasNext() { return hasnext_; } // Resets generator to start sequence over. @@ -45,11 +44,11 @@ class StringGenerator { std::vector<std::string> alphabet_; // Alphabet, one string per letter. // Iteration state. - StringPiece sp_; // Last StringPiece returned by Next(). - std::string s_; // String data in last StringPiece returned by Next(). + absl::string_view sp_; // Last string_view returned by Next(). + std::string s_; // String data in last string_view returned by Next(). bool hasnext_; // Whether Next() can be called again. std::vector<int> digits_; // Alphabet indices for next string. - bool generate_null_; // Whether to generate a NULL StringPiece next. + bool generate_null_; // Whether to generate a NULL string_view next. bool random_; // Whether generated strings are random. int nrandom_; // Number of random strings left to generate. std::minstd_rand0 rng_; // Random number generator. 
diff --git a/re2/testing/string_generator_test.cc b/re2/testing/string_generator_test.cc index d0f84f4..b1273d9 100644 --- a/re2/testing/string_generator_test.cc +++ b/re2/testing/string_generator_test.cc @@ -7,7 +7,7 @@ #include <stdint.h> #include <string> -#include "util/test.h" +#include "gtest/gtest.h" #include "util/utf.h" #include "re2/testing/string_generator.h" #include "re2/testing/regexp_generator.h" @@ -41,7 +41,7 @@ static void RunTest(int len, const std::string& alphabet, bool donull) { if (donull) { g.GenerateNULL(); EXPECT_TRUE(g.HasNext()); - StringPiece sp = g.Next(); + absl::string_view sp = g.Next(); EXPECT_EQ(sp.data(), static_cast<const char*>(NULL)); EXPECT_EQ(sp.size(), 0); } diff --git a/re2/testing/tester.cc b/re2/testing/tester.cc index b0c22f2..a094cb4 100644 --- a/re2/testing/tester.cc +++ b/re2/testing/tester.cc @@ -9,24 +9,25 @@ #include <string.h> #include <string> -#include "util/util.h" -#include "util/flags.h" +#include "absl/base/macros.h" +#include "absl/flags/flag.h" +#include "absl/strings/escaping.h" +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "re2/testing/tester.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" -DEFINE_FLAG(bool, dump_prog, false, "dump regexp program"); -DEFINE_FLAG(bool, log_okay, false, "log successful runs"); -DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program"); +ABSL_FLAG(bool, dump_prog, false, "dump regexp program"); +ABSL_FLAG(bool, log_okay, false, "log successful runs"); +ABSL_FLAG(bool, dump_rprog, false, "dump reversed regexp program"); -DEFINE_FLAG(int, max_regexp_failures, 100, - "maximum number of regexp test failures (-1 = unlimited)"); +ABSL_FLAG(int, max_regexp_failures, 100, + "maximum number of regexp test failures (-1 = unlimited)"); -DEFINE_FLAG(std::string, regexp_engines, "", - "pattern to select regexp engines to test"); +ABSL_FLAG(std::string, regexp_engines, "", + "pattern to select regexp 
engines to test"); namespace re2 { @@ -50,7 +51,7 @@ const char* engine_names[kEngineMax] = { // Returns the name of the engine. static const char* EngineName(Engine e) { CHECK_GE(e, 0); - CHECK_LT(e, arraysize(engine_names)); + CHECK_LT(e, ABSL_ARRAYSIZE(engine_names)); CHECK(engine_names[e] != NULL); return engine_names[e]; } @@ -63,11 +64,11 @@ static uint32_t Engines() { if (did_parse) return cached_engines; - if (GetFlag(FLAGS_regexp_engines).empty()) { + if (absl::GetFlag(FLAGS_regexp_engines).empty()) { cached_engines = ~0; } else { for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) - if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos) + if (absl::GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos) cached_engines |= 1<<i; } @@ -97,7 +98,7 @@ struct TestInstance::Result { void ClearSubmatch() { for (int i = 0; i < kMaxSubmatch; i++) - submatch[i] = StringPiece(); + submatch[i] = absl::string_view(); } bool skipped; // test skipped: wasn't applicable @@ -105,24 +106,24 @@ struct TestInstance::Result { bool untrusted; // don't really trust the answer bool have_submatch; // computed all submatch info bool have_submatch0; // computed just submatch[0] - StringPiece submatch[kMaxSubmatch]; + absl::string_view submatch[kMaxSubmatch]; }; typedef TestInstance::Result Result; // Formats a single capture range s in text in the form (a,b) // where a and b are the starting and ending offsets of s in text. -static std::string FormatCapture(const StringPiece& text, - const StringPiece& s) { +static std::string FormatCapture(absl::string_view text, + absl::string_view s) { if (s.data() == NULL) return "(?,?)"; - return StringPrintf("(%td,%td)", - BeginPtr(s) - BeginPtr(text), - EndPtr(s) - BeginPtr(text)); + return absl::StrFormat("(%d,%d)", + BeginPtr(s) - BeginPtr(text), + EndPtr(s) - BeginPtr(text)); } // Returns whether text contains non-ASCII (>= 0x80) bytes. 
-static bool NonASCII(const StringPiece& text) { +static bool NonASCII(absl::string_view text) { for (size_t i = 0; i < text.size(); i++) if ((uint8_t)text[i] >= 0x80) return true; @@ -174,15 +175,15 @@ static ParseMode parse_modes[] = { }; static std::string FormatMode(Regexp::ParseFlags flags) { - for (size_t i = 0; i < arraysize(parse_modes); i++) + for (size_t i = 0; i < ABSL_ARRAYSIZE(parse_modes); i++) if (parse_modes[i].parse_flags == flags) return parse_modes[i].desc; - return StringPrintf("%#x", static_cast<uint32_t>(flags)); + return absl::StrFormat("%#x", static_cast<uint32_t>(flags)); } // Constructs and saves all the matching engines that // will be required for the given tests. -TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, +TestInstance::TestInstance(absl::string_view regexp_str, Prog::MatchKind kind, Regexp::ParseFlags flags) : regexp_str_(regexp_str), kind_(kind), @@ -195,14 +196,14 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, re_(NULL), re2_(NULL) { - VLOG(1) << CEscape(regexp_str); + VLOG(1) << absl::CEscape(regexp_str); // Compile regexp to prog. // Always required - needed for backtracking (reference implementation). 
RegexpStatus status; regexp_ = Regexp::Parse(regexp_str, flags, &status); if (regexp_ == NULL) { - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) + LOG(INFO) << "Cannot parse: " << absl::CEscape(regexp_str_) << " mode: " << FormatMode(flags); error_ = true; return; @@ -210,14 +211,14 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, num_captures_ = regexp_->NumCaptures(); prog_ = regexp_->CompileToProg(0); if (prog_ == NULL) { - LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); + LOG(INFO) << "Cannot compile: " << absl::CEscape(regexp_str_); error_ = true; return; } - if (GetFlag(FLAGS_dump_prog)) { + if (absl::GetFlag(FLAGS_dump_prog)) { LOG(INFO) << "Prog for " << " regexp " - << CEscape(regexp_str_) + << absl::CEscape(regexp_str_) << " (" << FormatKind(kind_) << ", " << FormatMode(flags_) << ")\n" @@ -228,11 +229,11 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) { rprog_ = regexp_->CompileToReverseProg(0); if (rprog_ == NULL) { - LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); + LOG(INFO) << "Cannot reverse compile: " << absl::CEscape(regexp_str_); error_ = true; return; } - if (GetFlag(FLAGS_dump_rprog)) + if (absl::GetFlag(FLAGS_dump_rprog)) LOG(INFO) << rprog_->Dump(); } @@ -256,7 +257,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, options.set_longest_match(true); re2_ = new RE2(re, options); if (!re2_->error().empty()) { - LOG(INFO) << "Cannot RE2: " << CEscape(re); + LOG(INFO) << "Cannot RE2: " << absl::CEscape(re); error_ = true; return; } @@ -282,7 +283,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, // add one more layer of parens. 
re_ = new PCRE("("+re+")", o); if (!re_->error().empty()) { - LOG(INFO) << "Cannot PCRE: " << CEscape(re); + LOG(INFO) << "Cannot PCRE: " << absl::CEscape(re); error_ = true; return; } @@ -301,11 +302,9 @@ TestInstance::~TestInstance() { // Runs a single search using the named engine type. // This interface hides all the irregularities of the various // engine interfaces from the rest of this file. -void TestInstance::RunSearch(Engine type, - const StringPiece& orig_text, - const StringPiece& orig_context, - Prog::Anchor anchor, - Result* result) { +void TestInstance::RunSearch(Engine type, absl::string_view orig_text, + absl::string_view orig_context, + Prog::Anchor anchor, Result* result) { if (regexp_ == NULL) { result->skipped = true; return; @@ -314,8 +313,8 @@ void TestInstance::RunSearch(Engine type, if (nsubmatch > kMaxSubmatch) nsubmatch = kMaxSubmatch; - StringPiece text = orig_text; - StringPiece context = orig_context; + absl::string_view text = orig_text; + absl::string_view context = orig_context; switch (type) { default: @@ -368,8 +367,8 @@ void TestInstance::RunSearch(Engine type, result->submatch, &result->skipped, NULL)) { LOG(ERROR) << "Reverse DFA inconsistency: " - << CEscape(regexp_str_) - << " on " << CEscape(text); + << absl::CEscape(regexp_str_) + << " on " << absl::CEscape(text); result->matched = false; } } @@ -438,19 +437,19 @@ void TestInstance::RunSearch(Engine type, // whitespace, not just vertical tab. Regexp::MimicsPCRE() is // unable to handle all cases of this, unfortunately, so just // catch them here. 
:( - if (regexp_str_.find("\\v") != StringPiece::npos && - (text.find('\n') != StringPiece::npos || - text.find('\f') != StringPiece::npos || - text.find('\r') != StringPiece::npos)) { + if (regexp_str_.find("\\v") != absl::string_view::npos && + (text.find('\n') != absl::string_view::npos || + text.find('\f') != absl::string_view::npos || + text.find('\r') != absl::string_view::npos)) { result->skipped = true; break; } // PCRE 8.34 or so started allowing vertical tab to match \s, // following a change made in Perl 5.18. RE2 does not. - if ((regexp_str_.find("\\s") != StringPiece::npos || - regexp_str_.find("\\S") != StringPiece::npos) && - text.find('\v') != StringPiece::npos) { + if ((regexp_str_.find("\\s") != absl::string_view::npos || + regexp_str_.find("\\S") != absl::string_view::npos) && + text.find('\v') != absl::string_view::npos) { result->skipped = true; break; } @@ -513,7 +512,7 @@ static bool ResultOkay(const Result& r, const Result& correct) { } // Runs a single test. -bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, +bool TestInstance::RunCase(absl::string_view text, absl::string_view context, Prog::Anchor anchor) { // Backtracking is the gold standard. Result correct; @@ -521,12 +520,12 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, if (correct.skipped) { if (regexp_ == NULL) return true; - LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) + LOG(ERROR) << "Skipped backtracking! 
" << absl::CEscape(regexp_str_) << " " << FormatMode(flags_); return false; } - VLOG(1) << "Try: regexp " << CEscape(regexp_str_) - << " text " << CEscape(text) + VLOG(1) << "Try: regexp " << absl::CEscape(regexp_str_) + << " text " << absl::CEscape(text) << " (" << FormatKind(kind_) << ", " << FormatAnchor(anchor) << ", " << FormatMode(flags_) @@ -541,7 +540,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, Result r; RunSearch(i, text, context, anchor, &r); if (ResultOkay(r, correct)) { - if (GetFlag(FLAGS_log_okay)) + if (absl::GetFlag(FLAGS_log_okay)) LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); continue; } @@ -571,14 +570,14 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, if (r.submatch[i].data() != correct.submatch[i].data() || r.submatch[i].size() != correct.submatch[i].size()) { LOG(INFO) << - StringPrintf(" $%d: should be %s is %s", - i, - FormatCapture(text, correct.submatch[i]).c_str(), - FormatCapture(text, r.submatch[i]).c_str()); + absl::StrFormat(" $%d: should be %s is %s", + i, + FormatCapture(text, correct.submatch[i]), + FormatCapture(text, r.submatch[i])); } else { LOG(INFO) << - StringPrintf(" $%d: %s ok", i, - FormatCapture(text, r.submatch[i]).c_str()); + absl::StrFormat(" $%d: %s ok", i, + FormatCapture(text, r.submatch[i])); } } } @@ -586,7 +585,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, if (!all_okay) { // This will be initialised once (after flags have been initialised) // and that is desirable because we want to enforce a global limit. 
- static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures); + static int max_regexp_failures = absl::GetFlag(FLAGS_max_regexp_failures); if (max_regexp_failures > 0 && --max_regexp_failures == 0) LOG(QFATAL) << "Too many regexp failures."; } @@ -595,22 +594,22 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, } void TestInstance::LogMatch(const char* prefix, Engine e, - const StringPiece& text, const StringPiece& context, + absl::string_view text, absl::string_view context, Prog::Anchor anchor) { LOG(INFO) << prefix << EngineName(e) << " regexp " - << CEscape(regexp_str_) + << absl::CEscape(regexp_str_) << " " - << CEscape(regexp_->ToString()) + << absl::CEscape(regexp_->ToString()) << " text " - << CEscape(text) + << absl::CEscape(text) << " (" << BeginPtr(text) - BeginPtr(context) << "," << EndPtr(text) - BeginPtr(context) << ") of context " - << CEscape(context) + << absl::CEscape(context) << " (" << FormatKind(kind_) << ", " << FormatAnchor(anchor) << ", " << FormatMode(flags_) @@ -624,10 +623,10 @@ static Prog::MatchKind kinds[] = { }; // Test all possible match kinds and parse modes. 
-Tester::Tester(const StringPiece& regexp) { +Tester::Tester(absl::string_view regexp) { error_ = false; - for (size_t i = 0; i < arraysize(kinds); i++) { - for (size_t j = 0; j < arraysize(parse_modes); j++) { + for (size_t i = 0; i < ABSL_ARRAYSIZE(kinds); i++) { + for (size_t j = 0; j < ABSL_ARRAYSIZE(parse_modes); j++) { TestInstance* t = new TestInstance(regexp, kinds[i], parse_modes[j].parse_flags); error_ |= t->error(); @@ -641,8 +640,8 @@ Tester::~Tester() { delete v_[i]; } -bool Tester::TestCase(const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor) { +bool Tester::TestCase(absl::string_view text, absl::string_view context, + Prog::Anchor anchor) { bool okay = true; for (size_t i = 0; i < v_.size(); i++) okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); @@ -654,10 +653,10 @@ static Prog::Anchor anchors[] = { Prog::kUnanchored }; -bool Tester::TestInput(const StringPiece& text) { +bool Tester::TestInput(absl::string_view text) { bool okay = TestInputInContext(text, text); if (!text.empty()) { - StringPiece sp; + absl::string_view sp; sp = text; sp.remove_prefix(1); okay &= TestInputInContext(sp, text); @@ -668,16 +667,16 @@ bool Tester::TestInput(const StringPiece& text) { return okay; } -bool Tester::TestInputInContext(const StringPiece& text, - const StringPiece& context) { +bool Tester::TestInputInContext(absl::string_view text, + absl::string_view context) { bool okay = true; - for (size_t i = 0; i < arraysize(anchors); i++) + for (size_t i = 0; i < ABSL_ARRAYSIZE(anchors); i++) okay &= TestCase(text, context, anchors[i]); return okay; } -bool TestRegexpOnText(const StringPiece& regexp, - const StringPiece& text) { +bool TestRegexpOnText(absl::string_view regexp, + absl::string_view text) { Tester t(regexp); return t.TestInput(text); } diff --git a/re2/testing/tester.h b/re2/testing/tester.h index 47d0c43..59be5ea 100644 --- a/re2/testing/tester.h +++ b/re2/testing/tester.h @@ -10,7 +10,7 @@ #include <vector> 
-#include "re2/stringpiece.h" +#include "absl/strings/string_view.h" #include "re2/prog.h" #include "re2/regexp.h" #include "re2/re2.h" @@ -51,7 +51,7 @@ class TestInstance { public: struct Result; - TestInstance(const StringPiece& regexp, Prog::MatchKind kind, + TestInstance(absl::string_view regexp, Prog::MatchKind kind, Regexp::ParseFlags flags); ~TestInstance(); Regexp::ParseFlags flags() { return flags_; } @@ -59,20 +59,18 @@ class TestInstance { // Runs a single test case: search in text, which is in context, // using the given anchoring. - bool RunCase(const StringPiece& text, const StringPiece& context, + bool RunCase(absl::string_view text, absl::string_view context, Prog::Anchor anchor); private: // Runs a single search using the named engine type. - void RunSearch(Engine type, - const StringPiece& text, const StringPiece& context, - Prog::Anchor anchor, - Result *result); + void RunSearch(Engine type, absl::string_view text, absl::string_view context, + Prog::Anchor anchor, Result* result); - void LogMatch(const char* prefix, Engine e, const StringPiece& text, - const StringPiece& context, Prog::Anchor anchor); + void LogMatch(const char* prefix, Engine e, absl::string_view text, + absl::string_view context, Prog::Anchor anchor); - const StringPiece regexp_str_; // regexp being tested + absl::string_view regexp_str_; // regexp being tested Prog::MatchKind kind_; // kind of match Regexp::ParseFlags flags_; // flags for parsing regexp_str_ bool error_; // error during constructor? @@ -91,21 +89,21 @@ class TestInstance { // A group of TestInstances for all possible configurations. class Tester { public: - explicit Tester(const StringPiece& regexp); + explicit Tester(absl::string_view regexp); ~Tester(); bool error() { return error_; } // Runs a single test case: search in text, which is in context, // using the given anchoring. 
- bool TestCase(const StringPiece& text, const StringPiece& context, + bool TestCase(absl::string_view text, absl::string_view context, Prog::Anchor anchor); // Run TestCase(text, text, anchor) for all anchoring modes. - bool TestInput(const StringPiece& text); + bool TestInput(absl::string_view text); // Run TestCase(text, context, anchor) for all anchoring modes. - bool TestInputInContext(const StringPiece& text, const StringPiece& context); + bool TestInputInContext(absl::string_view text, absl::string_view context); private: bool error_; @@ -116,7 +114,7 @@ class Tester { }; // Run all possible tests using regexp and text. -bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text); +bool TestRegexpOnText(absl::string_view regexp, absl::string_view text); } // namespace re2 diff --git a/re2/tostring.cc b/re2/tostring.cc index 9c1c038..33179fd 100644 --- a/re2/tostring.cc +++ b/re2/tostring.cc @@ -8,9 +8,8 @@ #include <string.h> #include <string> -#include "util/util.h" +#include "absl/strings/str_format.h" #include "util/logging.h" -#include "util/strutil.h" #include "util/utf.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -216,11 +215,11 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, case kRegexpRepeat: if (re->max() == -1) - t_->append(StringPrintf("{%d,}", re->min())); + t_->append(absl::StrFormat("{%d,}", re->min())); else if (re->min() == re->max()) - t_->append(StringPrintf("{%d}", re->min())); + t_->append(absl::StrFormat("{%d}", re->min())); else - t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); + t_->append(absl::StrFormat("{%d,%d}", re->min(), re->max())); if (re->parse_flags() & Regexp::NonGreedy) t_->append("?"); if (prec < PrecUnary) @@ -291,7 +290,7 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, // There's no syntax accepted by the parser to generate // this node (it is generated by RE2::Set) so make something // up that is readable but won't compile. 
- t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id())); + t_->append(absl::StrFormat("(?HaveMatch:%d)", re->match_id())); break; } @@ -332,10 +331,10 @@ static void AppendCCChar(std::string* t, Rune r) { } if (r < 0x100) { - *t += StringPrintf("\\x%02x", static_cast<int>(r)); + *t += absl::StrFormat("\\x%02x", static_cast<int>(r)); return; } - *t += StringPrintf("\\x{%x}", static_cast<int>(r)); + *t += absl::StrFormat("\\x{%x}", static_cast<int>(r)); } static void AppendCCRange(std::string* t, Rune lo, Rune hi) { diff --git a/re2/unicode.py b/re2/unicode.py index 727bea5..9173407 100644 --- a/re2/unicode.py +++ b/re2/unicode.py @@ -10,10 +10,10 @@ from __future__ import print_function import os import re -from six.moves import urllib +import urllib.request # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "https://www.unicode.org/Public/14.0.0/ucd" +_UNICODE_DIR = "https://www.unicode.org/Public/15.1.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/re2/unicode_casefold.cc b/re2/unicode_casefold.cc index d9de282..297d0c8 100644 --- a/re2/unicode_casefold.cc +++ b/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1424 groups, 2878 pairs, 367 ranges +// 1427 groups, 2884 pairs, 372 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -141,11 +141,13 @@ const CaseFold unicode_casefold[] = { { 904, 906, 37 }, { 908, 908, 64 }, { 910, 911, 63 }, + { 912, 912, 7235 }, { 913, 929, 32 }, { 931, 931, 31 }, { 932, 939, 32 }, { 940, 940, -38 }, { 941, 943, -37 }, + { 944, 944, 7219 }, { 945, 945, -32 }, { 946, 946, 30 }, { 947, 948, -32 }, @@ -278,9 +280,11 @@ const CaseFold unicode_casefold[] = { { 8136, 8139, -86 }, { 8140, 8140, -9 }, { 8144, 8145, 8 }, + { 8147, 8147, -7235 }, { 8152, 8153, -8 }, { 8154, 8155, -100 }, { 8160, 8161, 8 }, + { 8163, 8163, -7219 }, { 8165, 8165, 7 }, { 8168, 8169, -8 }, { 8170, 8171, -112 }, @@ -354,6 +358,7 @@ const CaseFold unicode_casefold[] = { { 42997, 
42998, OddEven }, { 43859, 43859, -928 }, { 43888, 43967, -38864 }, + { 64261, 64262, OddEven }, { 65313, 65338, 32 }, { 65345, 65370, -32 }, { 66560, 66599, 40 }, @@ -377,9 +382,9 @@ const CaseFold unicode_casefold[] = { { 125184, 125217, 34 }, { 125218, 125251, -34 }, }; -const int num_unicode_casefold = 367; +const int num_unicode_casefold = 372; -// 1424 groups, 1454 pairs, 205 ranges +// 1427 groups, 1457 pairs, 208 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -515,8 +520,10 @@ const CaseFold unicode_tolower[] = { { 8126, 8126, -7173 }, { 8136, 8139, -86 }, { 8140, 8140, -9 }, + { 8147, 8147, -7235 }, { 8152, 8153, -8 }, { 8154, 8155, -100 }, + { 8163, 8163, -7219 }, { 8168, 8169, -8 }, { 8170, 8171, -112 }, { 8172, 8172, -7 }, @@ -575,6 +582,7 @@ const CaseFold unicode_tolower[] = { { 42966, 42968, EvenOddSkip }, { 42997, 42997, OddEven }, { 43888, 43967, -38864 }, + { 64261, 64261, OddEven }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, { 66736, 66771, 40 }, @@ -587,7 +595,7 @@ const CaseFold unicode_tolower[] = { { 93760, 93791, 32 }, { 125184, 125217, 34 }, }; -const int num_unicode_tolower = 205; +const int num_unicode_tolower = 208; diff --git a/re2/unicode_casefold.h b/re2/unicode_casefold.h index 8bdbb42..4acad68 100644 --- a/re2/unicode_casefold.h +++ b/re2/unicode_casefold.h @@ -41,7 +41,6 @@ #include <stdint.h> -#include "util/util.h" #include "util/utf.h" namespace re2 { diff --git a/re2/unicode_groups.cc b/re2/unicode_groups.cc index 2a8d7da..b2a7ba6 100644 --- a/re2/unicode_groups.cc +++ b/re2/unicode_groups.cc @@ -29,7 +29,7 @@ static const URange16 C_range16[] = { static const URange32 C_range32[] = { { 69821, 69821 }, { 69837, 69837 }, - { 78896, 78904 }, + { 78896, 78911 }, { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, @@ -60,7 +60,7 @@ static const URange16 Cf_range16[] = { static const URange32 Cf_range32[] = { { 69821, 69821 }, { 69837, 69837 }, - { 78896, 78904 }, + { 78896, 78911 }, { 
113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, @@ -548,6 +548,7 @@ static const URange32 L_range32[] = { { 70108, 70108 }, { 70144, 70161 }, { 70163, 70187 }, + { 70207, 70208 }, { 70272, 70278 }, { 70280, 70280 }, { 70282, 70285 }, @@ -610,11 +611,15 @@ static const URange32 L_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73474, 73474 }, + { 73476, 73488 }, + { 73490, 73523 }, { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77712, 77808 }, - { 77824, 78894 }, + { 77824, 78895 }, + { 78913, 78918 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, @@ -637,7 +642,9 @@ static const URange32 L_range32[] = { { 110581, 110587 }, { 110589, 110590 }, { 110592, 110882 }, + { 110898, 110898 }, { 110928, 110930 }, + { 110933, 110933 }, { 110948, 110951 }, { 110960, 111355 }, { 113664, 113770 }, @@ -675,11 +682,14 @@ static const URange32 L_range32[] = { { 120746, 120770 }, { 120772, 120779 }, { 122624, 122654 }, + { 122661, 122666 }, + { 122928, 122989 }, { 123136, 123180 }, { 123191, 123197 }, { 123214, 123214 }, { 123536, 123565 }, { 123584, 123627 }, + { 124112, 124139 }, { 124896, 124902 }, { 124904, 124907 }, { 124909, 124910 }, @@ -721,12 +731,14 @@ static const URange32 L_range32[] = { { 126629, 126633 }, { 126635, 126651 }, { 131072, 173791 }, - { 173824, 177976 }, + { 173824, 177977 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, + { 191472, 192093 }, { 194560, 195101 }, { 196608, 201546 }, + { 201552, 205743 }, }; static const URange16 Ll_range16[] = { { 97, 122 }, @@ -1387,6 +1399,7 @@ static const URange32 Ll_range32[] = { { 120779, 120779 }, { 122624, 122633 }, { 122635, 122654 }, + { 122661, 122666 }, { 125218, 125251 }, }; static const URange16 Lm_range16[] = { @@ -1459,7 +1472,9 @@ static const URange32 Lm_range32[] = { { 110576, 110579 }, { 110581, 110587 }, { 110589, 110590 }, + { 122928, 122989 }, { 123191, 123197 }, + { 124139, 124139 }, { 125259, 125259 }, }; static const URange16 
Lo_range16[] = { @@ -1829,6 +1844,7 @@ static const URange32 Lo_range32[] = { { 70108, 70108 }, { 70144, 70161 }, { 70163, 70187 }, + { 70207, 70208 }, { 70272, 70278 }, { 70280, 70280 }, { 70282, 70285 }, @@ -1890,11 +1906,15 @@ static const URange32 Lo_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73474, 73474 }, + { 73476, 73488 }, + { 73490, 73523 }, { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77712, 77808 }, - { 77824, 78894 }, + { 77824, 78895 }, + { 78913, 78918 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, @@ -1909,7 +1929,9 @@ static const URange32 Lo_range32[] = { { 100352, 101589 }, { 101632, 101640 }, { 110592, 110882 }, + { 110898, 110898 }, { 110928, 110930 }, + { 110933, 110933 }, { 110948, 110951 }, { 110960, 111355 }, { 113664, 113770 }, @@ -1921,6 +1943,7 @@ static const URange32 Lo_range32[] = { { 123214, 123214 }, { 123536, 123565 }, { 123584, 123627 }, + { 124112, 124138 }, { 124896, 124902 }, { 124904, 124907 }, { 124909, 124910 }, @@ -1960,12 +1983,14 @@ static const URange32 Lo_range32[] = { { 126629, 126633 }, { 126635, 126651 }, { 131072, 173791 }, - { 173824, 177976 }, + { 173824, 177977 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, + { 191472, 192093 }, { 194560, 195101 }, { 196608, 201546 }, + { 201552, 205743 }, }; static const URange16 Lt_range16[] = { { 453, 453 }, @@ -2710,6 +2735,7 @@ static const URange16 M_range16[] = { { 3274, 3277 }, { 3285, 3286 }, { 3298, 3299 }, + { 3315, 3315 }, { 3328, 3331 }, { 3387, 3388 }, { 3390, 3396 }, @@ -2728,7 +2754,7 @@ static const URange16 M_range16[] = { { 3655, 3662 }, { 3761, 3761 }, { 3764, 3772 }, - { 3784, 3789 }, + { 3784, 3790 }, { 3864, 3865 }, { 3893, 3893 }, { 3895, 3895 }, @@ -2832,6 +2858,7 @@ static const URange32 M_range32[] = { { 68325, 68326 }, { 68900, 68903 }, { 69291, 69292 }, + { 69373, 69375 }, { 69446, 69456 }, { 69506, 69509 }, { 69632, 69634 }, @@ -2851,6 +2878,7 @@ static const URange32 M_range32[] = { { 
70094, 70095 }, { 70188, 70199 }, { 70206, 70206 }, + { 70209, 70209 }, { 70367, 70378 }, { 70400, 70403 }, { 70459, 70460 }, @@ -2898,6 +2926,12 @@ static const URange32 M_range32[] = { { 73104, 73105 }, { 73107, 73111 }, { 73459, 73462 }, + { 73472, 73473 }, + { 73475, 73475 }, + { 73524, 73530 }, + { 73534, 73538 }, + { 78912, 78912 }, + { 78919, 78933 }, { 92912, 92916 }, { 92976, 92982 }, { 94031, 94031 }, @@ -2925,9 +2959,11 @@ static const URange32 M_range32[] = { { 122907, 122913 }, { 122915, 122916 }, { 122918, 122922 }, + { 123023, 123023 }, { 123184, 123190 }, { 123566, 123566 }, { 123628, 123631 }, + { 124140, 124143 }, { 125136, 125142 }, { 125252, 125258 }, { 917760, 917999 }, @@ -2968,6 +3004,7 @@ static const URange16 Mc_range16[] = { { 3271, 3272 }, { 3274, 3275 }, { 3285, 3286 }, + { 3315, 3315 }, { 3330, 3331 }, { 3390, 3392 }, { 3398, 3400 }, @@ -3108,6 +3145,10 @@ static const URange32 Mc_range32[] = { { 73107, 73108 }, { 73110, 73110 }, { 73461, 73462 }, + { 73475, 73475 }, + { 73524, 73525 }, + { 73534, 73535 }, + { 73537, 73537 }, { 94033, 94087 }, { 94192, 94193 }, { 119141, 119142 }, @@ -3213,7 +3254,7 @@ static const URange16 Mn_range16[] = { { 3655, 3662 }, { 3761, 3761 }, { 3764, 3772 }, - { 3784, 3789 }, + { 3784, 3790 }, { 3864, 3865 }, { 3893, 3893 }, { 3895, 3895 }, @@ -3346,6 +3387,7 @@ static const URange32 Mn_range32[] = { { 68325, 68326 }, { 68900, 68903 }, { 69291, 69292 }, + { 69373, 69375 }, { 69446, 69456 }, { 69506, 69509 }, { 69633, 69633 }, @@ -3368,6 +3410,7 @@ static const URange32 Mn_range32[] = { { 70196, 70196 }, { 70198, 70199 }, { 70206, 70206 }, + { 70209, 70209 }, { 70367, 70367 }, { 70371, 70378 }, { 70400, 70401 }, @@ -3429,6 +3472,12 @@ static const URange32 Mn_range32[] = { { 73109, 73109 }, { 73111, 73111 }, { 73459, 73460 }, + { 73472, 73473 }, + { 73526, 73530 }, + { 73536, 73536 }, + { 73538, 73538 }, + { 78912, 78912 }, + { 78919, 78933 }, { 92912, 92916 }, { 92976, 92982 }, { 94031, 94031 }, @@ -3453,9 
+3502,11 @@ static const URange32 Mn_range32[] = { { 122907, 122913 }, { 122915, 122916 }, { 122918, 122922 }, + { 123023, 123023 }, { 123184, 123190 }, { 123566, 123566 }, { 123628, 123631 }, + { 124140, 124143 }, { 125136, 125142 }, { 125252, 125258 }, { 917760, 917999 }, @@ -3576,6 +3627,7 @@ static const URange32 N_range32[] = { { 72784, 72812 }, { 73040, 73049 }, { 73120, 73129 }, + { 73552, 73561 }, { 73664, 73684 }, { 74752, 74862 }, { 92768, 92777 }, @@ -3583,11 +3635,13 @@ static const URange32 N_range32[] = { { 93008, 93017 }, { 93019, 93025 }, { 93824, 93846 }, + { 119488, 119507 }, { 119520, 119539 }, { 119648, 119672 }, { 120782, 120831 }, { 123200, 123209 }, { 123632, 123641 }, + { 124144, 124153 }, { 125127, 125135 }, { 125264, 125273 }, { 126065, 126123 }, @@ -3655,12 +3709,14 @@ static const URange32 Nd_range32[] = { { 72784, 72793 }, { 73040, 73049 }, { 73120, 73129 }, + { 73552, 73561 }, { 92768, 92777 }, { 92864, 92873 }, { 93008, 93017 }, { 120782, 120831 }, { 123200, 123209 }, { 123632, 123641 }, + { 124144, 124153 }, { 125264, 125273 }, { 130032, 130041 }, }; @@ -3745,6 +3801,7 @@ static const URange32 No_range32[] = { { 73664, 73684 }, { 93019, 93025 }, { 93824, 93846 }, + { 119488, 119507 }, { 119520, 119539 }, { 119648, 119672 }, { 125127, 125135 }, @@ -3932,9 +3989,11 @@ static const URange32 P_range32[] = { { 72255, 72262 }, { 72346, 72348 }, { 72350, 72354 }, + { 72448, 72457 }, { 72769, 72773 }, { 72816, 72817 }, { 73463, 73464 }, + { 73539, 73551 }, { 73727, 73727 }, { 74864, 74868 }, { 77809, 77810 }, @@ -4255,9 +4314,11 @@ static const URange32 Po_range32[] = { { 72255, 72262 }, { 72346, 72348 }, { 72350, 72354 }, + { 72448, 72457 }, { 72769, 72773 }, { 72816, 72817 }, { 73463, 73464 }, + { 73539, 73551 }, { 73727, 73727 }, { 74864, 74868 }, { 77809, 77810 }, @@ -4460,7 +4521,7 @@ static const URange16 S_range16[] = { { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, - { 12272, 12283 }, + { 12272, 12287 }, { 12292, 12292 }, { 
12306, 12307 }, { 12320, 12320 }, @@ -4470,6 +4531,7 @@ static const URange16 S_range16[] = { { 12688, 12689 }, { 12694, 12703 }, { 12736, 12771 }, + { 12783, 12783 }, { 12800, 12830 }, { 12842, 12871 }, { 12880, 12880 }, @@ -4564,10 +4626,10 @@ static const URange32 S_range32[] = { { 127568, 127569 }, { 127584, 127589 }, { 127744, 128727 }, - { 128733, 128748 }, + { 128732, 128748 }, { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, + { 128768, 128886 }, + { 128891, 128985 }, { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, @@ -4578,15 +4640,13 @@ static const URange32 S_range32[] = { { 129200, 129201 }, { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129652 }, - { 129656, 129660 }, - { 129664, 129670 }, - { 129680, 129708 }, - { 129712, 129722 }, - { 129728, 129733 }, - { 129744, 129753 }, - { 129760, 129767 }, - { 129776, 129782 }, + { 129648, 129660 }, + { 129664, 129672 }, + { 129680, 129725 }, + { 129727, 129733 }, + { 129742, 129755 }, + { 129760, 129768 }, + { 129776, 129784 }, { 129792, 129938 }, { 129940, 129994 }, }; @@ -4805,7 +4865,7 @@ static const URange16 So_range16[] = { { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, - { 12272, 12283 }, + { 12272, 12287 }, { 12292, 12292 }, { 12306, 12307 }, { 12320, 12320 }, @@ -4814,6 +4874,7 @@ static const URange16 So_range16[] = { { 12688, 12689 }, { 12694, 12703 }, { 12736, 12771 }, + { 12783, 12783 }, { 12800, 12830 }, { 12842, 12871 }, { 12880, 12880 }, @@ -4882,10 +4943,10 @@ static const URange32 So_range32[] = { { 127584, 127589 }, { 127744, 127994 }, { 128000, 128727 }, - { 128733, 128748 }, + { 128732, 128748 }, { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, + { 128768, 128886 }, + { 128891, 128985 }, { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, @@ -4896,15 +4957,13 @@ static const URange32 So_range32[] = { { 129200, 129201 }, { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129652 }, - { 129656, 129660 }, - { 129664, 129670 }, - { 
129680, 129708 }, - { 129712, 129722 }, - { 129728, 129733 }, - { 129744, 129753 }, - { 129760, 129767 }, - { 129776, 129782 }, + { 129648, 129660 }, + { 129664, 129672 }, + { 129680, 129725 }, + { 129727, 129733 }, + { 129742, 129755 }, + { 129760, 129768 }, + { 129776, 129784 }, { 129792, 129938 }, { 129940, 129994 }, }; @@ -4972,6 +5031,7 @@ static const URange16 Arabic_range16[] = { }; static const URange32 Arabic_range32[] = { { 69216, 69246 }, + { 69373, 69375 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -5164,8 +5224,7 @@ static const URange16 Common_range16[] = { { 11126, 11157 }, { 11159, 11263 }, { 11776, 11869 }, - { 12272, 12283 }, - { 12288, 12292 }, + { 12272, 12292 }, { 12294, 12294 }, { 12296, 12320 }, { 12336, 12343 }, @@ -5175,6 +5234,7 @@ static const URange16 Common_range16[] = { { 12539, 12540 }, { 12688, 12703 }, { 12736, 12771 }, + { 12783, 12783 }, { 12832, 12895 }, { 12927, 13007 }, { 13055, 13055 }, @@ -5218,6 +5278,7 @@ static const URange32 Common_range32[] = { { 119171, 119172 }, { 119180, 119209 }, { 119214, 119274 }, + { 119488, 119507 }, { 119520, 119539 }, { 119552, 119638 }, { 119648, 119672 }, @@ -5258,10 +5319,10 @@ static const URange32 Common_range32[] = { { 127568, 127569 }, { 127584, 127589 }, { 127744, 128727 }, - { 128733, 128748 }, + { 128732, 128748 }, { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, + { 128768, 128886 }, + { 128891, 128985 }, { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, @@ -5272,15 +5333,13 @@ static const URange32 Common_range32[] = { { 129200, 129201 }, { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129652 }, - { 129656, 129660 }, - { 129664, 129670 }, - { 129680, 129708 }, - { 129712, 129722 }, - { 129728, 129733 }, - { 129744, 129753 }, - { 129760, 129767 }, - { 129776, 129782 }, + { 129648, 129660 }, + { 129664, 129672 }, + { 129680, 129725 }, + { 129727, 129733 }, + { 129742, 129755 }, + { 129760, 129768 }, + { 129776, 129784 }, { 129792, 
129938 }, { 129940, 129994 }, { 130032, 130041 }, @@ -5319,6 +5378,10 @@ static const URange16 Cyrillic_range16[] = { { 42560, 42655 }, { 65070, 65071 }, }; +static const URange32 Cyrillic_range32[] = { + { 122928, 122989 }, + { 123023, 123023 }, +}; static const URange32 Deseret_range32[] = { { 66560, 66639 }, }; @@ -5328,6 +5391,9 @@ static const URange16 Devanagari_range16[] = { { 2406, 2431 }, { 43232, 43263 }, }; +static const URange32 Devanagari_range32[] = { + { 72448, 72457 }, +}; static const URange32 Dives_Akuru_range32[] = { { 71936, 71942 }, { 71945, 71945 }, @@ -5349,8 +5415,7 @@ static const URange32 Duployan_range32[] = { { 113820, 113823 }, }; static const URange32 Egyptian_Hieroglyphs_range32[] = { - { 77824, 78894 }, - { 78896, 78904 }, + { 77824, 78933 }, }; static const URange32 Elbasan_range32[] = { { 66816, 66855 }, @@ -5539,12 +5604,14 @@ static const URange32 Han_range32[] = { { 94178, 94179 }, { 94192, 94193 }, { 131072, 173791 }, - { 173824, 177976 }, + { 173824, 177977 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, + { 191472, 192093 }, { 194560, 195101 }, { 196608, 201546 }, + { 201552, 205743 }, }; static const URange16 Hangul_range16[] = { { 4352, 4607 }, @@ -5591,6 +5658,7 @@ static const URange16 Hiragana_range16[] = { }; static const URange32 Hiragana_range32[] = { { 110593, 110879 }, + { 110898, 110898 }, { 110928, 110930 }, { 127488, 127488 }, }; @@ -5661,7 +5729,7 @@ static const URange16 Kannada_range16[] = { { 3293, 3294 }, { 3296, 3299 }, { 3302, 3311 }, - { 3313, 3314 }, + { 3313, 3315 }, }; static const URange16 Katakana_range16[] = { { 12449, 12538 }, @@ -5678,8 +5746,14 @@ static const URange32 Katakana_range32[] = { { 110589, 110590 }, { 110592, 110592 }, { 110880, 110882 }, + { 110933, 110933 }, { 110948, 110951 }, }; +static const URange32 Kawi_range32[] = { + { 73472, 73488 }, + { 73490, 73530 }, + { 73534, 73561 }, +}; static const URange16 Kayah_Li_range16[] = { { 43264, 43309 }, { 43311, 43311 }, @@ 
-5706,7 +5780,7 @@ static const URange16 Khmer_range16[] = { }; static const URange32 Khojki_range32[] = { { 70144, 70161 }, - { 70163, 70206 }, + { 70163, 70209 }, }; static const URange32 Khudawadi_range32[] = { { 70320, 70378 }, @@ -5721,7 +5795,7 @@ static const URange16 Lao_range16[] = { { 3751, 3773 }, { 3776, 3780 }, { 3782, 3782 }, - { 3784, 3789 }, + { 3784, 3790 }, { 3792, 3801 }, { 3804, 3807 }, }; @@ -5766,6 +5840,7 @@ static const URange32 Latin_range32[] = { { 67463, 67504 }, { 67506, 67514 }, { 122624, 122654 }, + { 122661, 122666 }, }; static const URange16 Lepcha_range16[] = { { 7168, 7223 }, @@ -5903,6 +5978,9 @@ static const URange32 Nabataean_range32[] = { { 67712, 67742 }, { 67751, 67759 }, }; +static const URange32 Nag_Mundari_range32[] = { + { 124112, 124153 }, +}; static const URange32 Nandinagari_range32[] = { { 72096, 72103 }, { 72106, 72151 }, @@ -6229,12 +6307,12 @@ static const URange16 Yi_range16[] = { static const URange32 Zanabazar_Square_range32[] = { { 72192, 72263 }, }; -// 4038 16-bit ranges, 1712 32-bit ranges +// 4042 16-bit ranges, 1778 32-bit ranges const UGroup unicode_groups[] = { { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, - { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, + { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 36 }, { "Armenian", +1, Armenian_range16, 4, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, @@ -6259,19 +6337,19 @@ const UGroup unicode_groups[] = { { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 91, Common_range32, 83 }, + { "Common", +1, Common_range16, 91, Common_range32, 82 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, 
{ "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, { "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 }, - { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, + { "Cyrillic", +1, Cyrillic_range16, 8, Cyrillic_range32, 2 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, - { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, + { "Devanagari", +1, Devanagari_range16, 4, Devanagari_range32, 1 }, { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 }, { "Dogra", +1, 0, 0, Dogra_range32, 1 }, { "Duployan", +1, 0, 0, Duployan_range32, 5 }, - { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, + { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 }, { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Elymaic", +1, 0, 0, Elymaic_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 }, @@ -6283,13 +6361,13 @@ const UGroup unicode_groups[] = { { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 9 }, + { "Han", +1, Han_range16, 11, Han_range32, 11 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, { "Hatran", +1, 0, 0, Hatran_range32, 3 }, { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, - { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 }, + { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 4 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, { "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, @@ -6297,29 +6375,30 @@ const UGroup unicode_groups[] = { { "Javanese", +1, Javanese_range16, 3, 0, 0 }, { "Kaithi", +1, 0, 0, Kaithi_range32, 2 }, { "Kannada", +1, Kannada_range16, 13, 0, 0 }, - { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 6 }, + { "Katakana", +1, Katakana_range16, 7, 
Katakana_range32, 7 }, + { "Kawi", +1, 0, 0, Kawi_range32, 3 }, { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, - { "L", +1, L_range16, 380, L_range32, 268 }, + { "L", +1, L_range16, 380, L_range32, 280 }, { "Lao", +1, Lao_range16, 11, 0, 0 }, - { "Latin", +1, Latin_range16, 34, Latin_range32, 4 }, + { "Latin", +1, Latin_range16, 34, Latin_range32, 5 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 }, - { "Ll", +1, Ll_range16, 617, Ll_range32, 40 }, - { "Lm", +1, Lm_range16, 57, Lm_range32, 12 }, - { "Lo", +1, Lo_range16, 290, Lo_range32, 211 }, + { "Ll", +1, Ll_range16, 617, Ll_range32, 41 }, + { "Lm", +1, Lm_range16, 57, Lm_range32, 14 }, + { "Lo", +1, Lo_range16, 290, Lo_range32, 221 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, { "Lu", +1, Lu_range16, 605, Lu_range32, 41 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 189, M_range32, 110 }, + { "M", +1, M_range16, 190, M_range32, 120 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, { "Makasar", +1, 0, 0, Makasar_range32, 1 }, { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, @@ -6327,7 +6406,7 @@ const UGroup unicode_groups[] = { { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, { "Marchen", +1, 0, 0, Marchen_range32, 3 }, { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, - { "Mc", +1, Mc_range16, 111, Mc_range32, 66 }, + { "Mc", +1, Mc_range16, 112, Mc_range32, 70 }, { "Me", +1, Me_range16, 5, 0, 0 }, { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, @@ -6335,21 
+6414,22 @@ const UGroup unicode_groups[] = { { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 212, Mn_range32, 124 }, + { "Mn", +1, Mn_range16, 212, Mn_range32, 134 }, { "Modi", +1, 0, 0, Modi_range32, 2 }, { "Mongolian", +1, Mongolian_range16, 5, Mongolian_range32, 1 }, { "Mro", +1, 0, 0, Mro_range32, 3 }, { "Multani", +1, 0, 0, Multani_range32, 5 }, { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, - { "N", +1, N_range16, 67, N_range32, 67 }, + { "N", +1, N_range16, 67, N_range32, 70 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, + { "Nag_Mundari", +1, 0, 0, Nag_Mundari_range32, 1 }, { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, - { "Nd", +1, Nd_range16, 37, Nd_range32, 25 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 27 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, { "Newa", +1, 0, 0, Newa_range32, 2 }, { "Nko", +1, Nko_range16, 2, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 29, No_range32, 42 }, + { "No", +1, No_range16, 29, No_range32, 43 }, { "Nushu", +1, 0, 0, Nushu_range32, 2 }, { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, @@ -6366,7 +6446,7 @@ const UGroup unicode_groups[] = { { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 133, P_range32, 56 }, + { "P", +1, P_range16, 133, P_range32, 58 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, @@ -6377,12 +6457,12 @@ const UGroup unicode_groups[] = { { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 130, Po_range32, 55 }, + { 
"Po", +1, Po_range16, 130, Po_range32, 57 }, { "Ps", +1, Ps_range16, 79, 0, 0 }, { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 151, S_range32, 83 }, + { "S", +1, S_range16, 152, S_range32, 81 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, @@ -6393,7 +6473,7 @@ const UGroup unicode_groups[] = { { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, { "Sk", +1, Sk_range16, 30, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 114, So_range32, 72 }, + { "So", +1, So_range16, 115, So_range32, 70 }, { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Soyombo", +1, 0, 0, Soyombo_range32, 1 }, @@ -6429,7 +6509,7 @@ const UGroup unicode_groups[] = { { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 197; +const int num_unicode_groups = 199; } // namespace re2 diff --git a/re2/unicode_groups.h b/re2/unicode_groups.h index 75f55da..6dc6532 100644 --- a/re2/unicode_groups.h +++ b/re2/unicode_groups.h @@ -20,7 +20,6 @@ #include <stdint.h> -#include "util/util.h" #include "util/utf.h" namespace re2 { diff --git a/re2/walker-inl.h b/re2/walker-inl.h index 4d064a0..45763a7 100644 --- a/re2/walker-inl.h +++ b/re2/walker-inl.h @@ -15,6 +15,7 @@ #include <stack> +#include "absl/base/macros.h" #include "util/logging.h" #include "re2/regexp.h" @@ -190,7 +191,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, s->child_args = &s->child_arg; else if (re->nsub_ > 1) s->child_args = new T[re->nsub_]; - FALLTHROUGH_INTENDED; + ABSL_FALLTHROUGH_INTENDED; } default: { if (re->nsub_ > 0) { diff --git a/re2Config.cmake.in b/re2Config.cmake.in index 7698107..6a177c6 100644 --- a/re2Config.cmake.in 
+++ b/re2Config.cmake.in @@ -13,6 +13,12 @@ if(UNIX) find_dependency(Threads REQUIRED) endif() +find_dependency(absl REQUIRED) + +if(@RE2_USE_ICU@) + find_dependency(ICU REQUIRED COMPONENTS uc) +endif() + check_required_components(re2) if(TARGET re2::re2) diff --git a/util/benchmark.cc b/util/benchmark.cc deleted file mode 100644 index e39c334..0000000 --- a/util/benchmark.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <algorithm> -#include <chrono> - -#include "util/benchmark.h" -#include "util/flags.h" -#include "re2/re2.h" - -#ifdef _WIN32 -#define snprintf _snprintf -#endif - -using ::testing::Benchmark; - -static Benchmark* benchmarks[10000]; -static int nbenchmarks; - -void Benchmark::Register() { - lo_ = std::max(1, lo_); - hi_ = std::max(lo_, hi_); - benchmarks[nbenchmarks++] = this; -} - -static int64_t nsec() { - return std::chrono::duration_cast<std::chrono::nanoseconds>( - std::chrono::steady_clock::now().time_since_epoch()) - .count(); -} - -static int64_t t0; -static int64_t ns; -static int64_t bytes; -static int64_t items; - -void StartBenchmarkTiming() { - if (t0 == 0) { - t0 = nsec(); - } -} - -void StopBenchmarkTiming() { - if (t0 != 0) { - ns += nsec() - t0; - t0 = 0; - } -} - -void SetBenchmarkBytesProcessed(int64_t b) { bytes = b; } - -void SetBenchmarkItemsProcessed(int64_t i) { items = i; } - -static void RunFunc(Benchmark* b, int iters, int arg) { - t0 = nsec(); - ns = 0; - bytes = 0; - items = 0; - b->func()(iters, arg); - StopBenchmarkTiming(); -} - -static int round(int n) { - int base = 1; - while (base * 10 < n) base *= 10; - if (n < 2 * base) return 2 * base; - if (n < 5 * base) return 5 * base; - return 10 * base; -} - -static void RunBench(Benchmark* b, int arg) { - int iters, last; - - // Run once just in case 
it's expensive. - iters = 1; - RunFunc(b, iters, arg); - while (ns < (int)1e9 && iters < (int)1e9) { - last = iters; - if (ns / iters == 0) { - iters = (int)1e9; - } else { - iters = (int)1e9 / static_cast<int>(ns / iters); - } - iters = std::max(last + 1, std::min(iters + iters / 2, 100 * last)); - iters = round(iters); - RunFunc(b, iters, arg); - } - - char mb[100]; - char suf[100]; - mb[0] = '\0'; - suf[0] = '\0'; - if (ns > 0 && bytes > 0) - snprintf(mb, sizeof mb, "\t%7.2f MB/s", - ((double)bytes / 1e6) / ((double)ns / 1e9)); - if (b->has_arg()) { - if (arg >= (1 << 20)) { - snprintf(suf, sizeof suf, "/%dM", arg / (1 << 20)); - } else if (arg >= (1 << 10)) { - snprintf(suf, sizeof suf, "/%dK", arg / (1 << 10)); - } else { - snprintf(suf, sizeof suf, "/%d", arg); - } - } - printf("%s%s\t%8d\t%10lld ns/op%s\n", b->name(), suf, iters, - (long long)ns / iters, mb); - fflush(stdout); -} - -static bool WantBench(const char* name, int argc, const char** argv) { - if (argc == 1) return true; - for (int i = 1; i < argc; i++) { - if (RE2::PartialMatch(name, argv[i])) - return true; - } - return false; -} - -int main(int argc, const char** argv) { - for (int i = 0; i < nbenchmarks; i++) { - Benchmark* b = benchmarks[i]; - if (!WantBench(b->name(), argc, argv)) - continue; - for (int arg = b->lo(); arg <= b->hi(); arg <<= 1) - RunBench(b, arg); - } -} diff --git a/util/benchmark.h b/util/benchmark.h deleted file mode 100644 index d97b49e..0000000 --- a/util/benchmark.h +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_BENCHMARK_H_ -#define UTIL_BENCHMARK_H_ - -#include <stdint.h> -#include <functional> - -#include "util/logging.h" -#include "util/util.h" - -// Globals for the old benchmark API. 
-void StartBenchmarkTiming(); -void StopBenchmarkTiming(); -void SetBenchmarkBytesProcessed(int64_t b); -void SetBenchmarkItemsProcessed(int64_t i); - -namespace benchmark { - -// The new benchmark API implemented as a layer over the old benchmark API. -// (Please refer to https://github.com/google/benchmark for documentation.) -class State { - private: - class Iterator { - public: - // Benchmark code looks like this: - // - // for (auto _ : state) { - // // ... - // } - // - // We try to avoid compiler warnings about such variables being unused. - struct ATTRIBUTE_UNUSED Value {}; - - explicit Iterator(int64_t iters) : iters_(iters) {} - - bool operator!=(const Iterator& that) const { - if (iters_ != that.iters_) { - return true; - } else { - // We are about to stop the loop, so stop timing. - StopBenchmarkTiming(); - return false; - } - } - - Value operator*() const { - return Value(); - } - - Iterator& operator++() { - --iters_; - return *this; - } - - private: - int64_t iters_; - }; - - public: - explicit State(int64_t iters) - : iters_(iters), arg_(0), has_arg_(false) {} - - State(int64_t iters, int64_t arg) - : iters_(iters), arg_(arg), has_arg_(true) {} - - Iterator begin() { - // We are about to start the loop, so start timing. - StartBenchmarkTiming(); - return Iterator(iters_); - } - - Iterator end() { - return Iterator(0); - } - - void SetBytesProcessed(int64_t b) { SetBenchmarkBytesProcessed(b); } - void SetItemsProcessed(int64_t i) { SetBenchmarkItemsProcessed(i); } - int64_t iterations() const { return iters_; } - // Pretend to support multiple arguments. 
- int64_t range(int pos) const { CHECK(has_arg_); return arg_; } - - private: - int64_t iters_; - int64_t arg_; - bool has_arg_; - - State(const State&) = delete; - State& operator=(const State&) = delete; -}; - -} // namespace benchmark - -namespace testing { - -class Benchmark { - public: - Benchmark(const char* name, void (*func)(benchmark::State&)) - : name_(name), - func_([func](int iters, int arg) { - benchmark::State state(iters); - func(state); - }), - lo_(0), - hi_(0), - has_arg_(false) { - Register(); - } - - Benchmark(const char* name, void (*func)(benchmark::State&), int lo, int hi) - : name_(name), - func_([func](int iters, int arg) { - benchmark::State state(iters, arg); - func(state); - }), - lo_(lo), - hi_(hi), - has_arg_(true) { - Register(); - } - - // Pretend to support multiple threads. - Benchmark* ThreadRange(int lo, int hi) { return this; } - - const char* name() const { return name_; } - const std::function<void(int, int)>& func() const { return func_; } - int lo() const { return lo_; } - int hi() const { return hi_; } - bool has_arg() const { return has_arg_; } - - private: - void Register(); - - const char* name_; - std::function<void(int, int)> func_; - int lo_; - int hi_; - bool has_arg_; - - Benchmark(const Benchmark&) = delete; - Benchmark& operator=(const Benchmark&) = delete; -}; - -} // namespace testing - -#define BENCHMARK(f) \ - ::testing::Benchmark* _benchmark_##f = \ - (new ::testing::Benchmark(#f, f)) - -#define BENCHMARK_RANGE(f, lo, hi) \ - ::testing::Benchmark* _benchmark_##f = \ - (new ::testing::Benchmark(#f, f, lo, hi)) - -#endif // UTIL_BENCHMARK_H_ diff --git a/util/flags.h b/util/flags.h deleted file mode 100644 index 3386b72..0000000 --- a/util/flags.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#ifndef UTIL_FLAGS_H_ -#define UTIL_FLAGS_H_ - -// Simplified version of Google's command line flags. -// Does not support parsing the command line. -// If you want to do that, see -// https://gflags.github.io/gflags/ - -#define DEFINE_FLAG(type, name, deflt, desc) \ - namespace re2 { type FLAGS_##name = deflt; } - -#define DECLARE_FLAG(type, name) \ - namespace re2 { extern type FLAGS_##name; } - -namespace re2 { -template <typename T> -T GetFlag(const T& flag) { - return flag; -} -} // namespace re2 - -#endif // UTIL_FLAGS_H_ diff --git a/util/fuzz.cc b/util/fuzz.cc deleted file mode 100644 index 9cac118..0000000 --- a/util/fuzz.cc +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2016 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> - -// Entry point for libFuzzer. -extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size); - -int main(int argc, char** argv) { - uint8_t data[32]; - for (int i = 0; i < 32; i++) { - for (int j = 0; j < 32; j++) { - data[j] = random() & 0xFF; - } - LLVMFuzzerTestOneInput(data, 32); - } - return 0; -} diff --git a/util/logging.h b/util/logging.h index 5b2217f..946962b 100644 --- a/util/logging.h +++ b/util/logging.h @@ -13,7 +13,7 @@ #include <ostream> #include <sstream> -#include "util/util.h" +#include "absl/base/attributes.h" // Debug-only checking. #define DCHECK(condition) assert(condition) @@ -93,7 +93,7 @@ class LogMessageFatal : public LogMessage { public: LogMessageFatal(const char* file, int line) : LogMessage(file, line) {} - ATTRIBUTE_NORETURN ~LogMessageFatal() { + ABSL_ATTRIBUTE_NORETURN ~LogMessageFatal() { Flush(); abort(); } diff --git a/util/mix.h b/util/mix.h deleted file mode 100644 index d85c172..0000000 --- a/util/mix.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2016 The RE2 Authors. All Rights Reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_MIX_H_ -#define UTIL_MIX_H_ - -#include <stddef.h> -#include <limits> - -namespace re2 { - -// Silence "truncation of constant value" warning for kMul in 32-bit mode. -// Since this is a header file, push and then pop to limit the scope. -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable: 4309) -#endif - -class HashMix { - public: - HashMix() : hash_(1) {} - explicit HashMix(size_t val) : hash_(val + 83) {} - void Mix(size_t val) { - static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL); - hash_ *= kMul; - hash_ = ((hash_ << 19) | - (hash_ >> (std::numeric_limits<size_t>::digits - 19))) + val; - } - size_t get() const { return hash_; } - private: - size_t hash_; -}; - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -} // namespace re2 - -#endif // UTIL_MIX_H_ diff --git a/util/mutex.h b/util/mutex.h deleted file mode 100644 index 158046b..0000000 --- a/util/mutex.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2007 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_MUTEX_H_ -#define UTIL_MUTEX_H_ - -/* - * A simple mutex wrapper, supporting locks and read-write locks. - * You should assume the locks are *not* re-entrant. - */ - -#ifdef _WIN32 -// Requires Windows Vista or Windows Server 2008 at minimum. 
-#include <windows.h> -#if defined(WINVER) && WINVER >= 0x0600 -#define MUTEX_IS_WIN32_SRWLOCK -#endif -#else -#ifndef _POSIX_C_SOURCE -#define _POSIX_C_SOURCE 200809L -#endif -#include <unistd.h> -#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0 -#define MUTEX_IS_PTHREAD_RWLOCK -#endif -#endif - -#if defined(MUTEX_IS_WIN32_SRWLOCK) -typedef SRWLOCK MutexType; -#elif defined(MUTEX_IS_PTHREAD_RWLOCK) -#include <pthread.h> -#include <stdlib.h> -typedef pthread_rwlock_t MutexType; -#else -#include <mutex> -typedef std::mutex MutexType; -#endif - -namespace re2 { - -class Mutex { - public: - inline Mutex(); - inline ~Mutex(); - inline void Lock(); // Block if needed until free then acquire exclusively - inline void Unlock(); // Release a lock acquired via Lock() - // Note that on systems that don't support read-write locks, these may - // be implemented as synonyms to Lock() and Unlock(). So you can use - // these for efficiency, but don't use them anyplace where being able - // to do shared reads is necessary to avoid deadlock. - inline void ReaderLock(); // Block until free or shared then acquire a share - inline void ReaderUnlock(); // Release a read share of this Mutex - inline void WriterLock() { Lock(); } // Acquire an exclusive lock - inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock() - - private: - MutexType mutex_; - - // Catch the error of writing Mutex when intending MutexLock. 
- Mutex(Mutex *ignored); - - Mutex(const Mutex&) = delete; - Mutex& operator=(const Mutex&) = delete; -}; - -#if defined(MUTEX_IS_WIN32_SRWLOCK) - -Mutex::Mutex() : mutex_(SRWLOCK_INIT) { } -Mutex::~Mutex() { } -void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } -void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } -void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } -void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } - -#elif defined(MUTEX_IS_PTHREAD_RWLOCK) - -#define SAFE_PTHREAD(fncall) \ - do { \ - if ((fncall) != 0) abort(); \ - } while (0) - -Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); } -Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); } -void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); } -void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } -void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); } -void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } - -#undef SAFE_PTHREAD - -#else - -Mutex::Mutex() { } -Mutex::~Mutex() { } -void Mutex::Lock() { mutex_.lock(); } -void Mutex::Unlock() { mutex_.unlock(); } -void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex. -void Mutex::ReaderUnlock() { Unlock(); } - -#endif - -// -------------------------------------------------------------------------- -// Some helper classes - -// MutexLock(mu) acquires mu when constructed and releases it when destroyed. 
-class MutexLock { - public: - explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); } - ~MutexLock() { mu_->Unlock(); } - private: - Mutex * const mu_; - - MutexLock(const MutexLock&) = delete; - MutexLock& operator=(const MutexLock&) = delete; -}; - -// ReaderMutexLock and WriterMutexLock do the same, for rwlocks -class ReaderMutexLock { - public: - explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); } - ~ReaderMutexLock() { mu_->ReaderUnlock(); } - private: - Mutex * const mu_; - - ReaderMutexLock(const ReaderMutexLock&) = delete; - ReaderMutexLock& operator=(const ReaderMutexLock&) = delete; -}; - -class WriterMutexLock { - public: - explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); } - ~WriterMutexLock() { mu_->WriterUnlock(); } - private: - Mutex * const mu_; - - WriterMutexLock(const WriterMutexLock&) = delete; - WriterMutexLock& operator=(const WriterMutexLock&) = delete; -}; - -// Catch bug where variable name is omitted, e.g. MutexLock (&mu); -#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name") -#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name") -#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name") - -} // namespace re2 - -#endif // UTIL_MUTEX_H_ diff --git a/util/pcre.cc b/util/pcre.cc index b689851..f54cb28 100644 --- a/util/pcre.cc +++ b/util/pcre.cc @@ -15,14 +15,13 @@ #include <string> #include <utility> -#include "util/util.h" -#include "util/flags.h" +#include "absl/flags/flag.h" +#include "absl/strings/str_format.h" #include "util/logging.h" #include "util/pcre.h" -#include "util/strutil.h" // Silence warnings about the wacky formatting in the operator() functions. 
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic ignored "-Wmisleading-indentation" #endif @@ -33,10 +32,10 @@ // not exceed main thread stacks. Note that other threads // often have smaller stacks, and therefore tightening // regexp_stack_limit may frequently be necessary. -DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, - "default PCRE stack limit (bytes)"); -DEFINE_FLAG(int, regexp_match_limit, 1000000, - "default PCRE match limit (function calls)"); +ABSL_FLAG(int, regexp_stack_limit, 256 << 10, + "default PCRE stack limit (bytes)"); +ABSL_FLAG(int, regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); #ifndef USEPCRE @@ -191,24 +190,11 @@ pcre* PCRE::Compile(Anchor anchor) { /***** Convenience interfaces *****/ -bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, - const PCRE& re, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { +bool PCRE::FullMatchFunctor::operator()( + absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6, + const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11, + const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const { const Arg* args[kMaxArgs]; int n = 0; if (&a0 == &no_more_args) goto done; args[n++] = &a0; @@ -234,24 +220,11 @@ done: return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); } -bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, - const PCRE& re, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& 
a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { +bool PCRE::PartialMatchFunctor::operator()( + absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6, + const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11, + const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const { const Arg* args[kMaxArgs]; int n = 0; if (&a0 == &no_more_args) goto done; args[n++] = &a0; @@ -277,24 +250,11 @@ done: return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); } -bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, - const PCRE& pattern, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { +bool PCRE::ConsumeFunctor::operator()( + absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6, + const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11, + const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const { const Arg* args[kMaxArgs]; int n = 0; if (&a0 == &no_more_args) goto done; args[n++] = &a0; @@ -326,24 +286,11 @@ done: } } -bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, - const PCRE& pattern, - const Arg& a0, - const Arg& a1, - const Arg& a2, - const Arg& a3, - const Arg& a4, - const Arg& a5, - const Arg& a6, - const Arg& a7, - const Arg& a8, - const Arg& a9, - const Arg& a10, - const Arg& a11, - const Arg& a12, - const Arg& a13, - const Arg& a14, - const Arg& a15) const { +bool PCRE::FindAndConsumeFunctor::operator()( + absl::string_view* input, const PCRE& pattern, const Arg& a0, const 
Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6, + const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11, + const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const { const Arg* args[kMaxArgs]; int n = 0; if (&a0 == &no_more_args) goto done; args[n++] = &a0; @@ -375,9 +322,8 @@ done: } } -bool PCRE::Replace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite) { +bool PCRE::Replace(std::string* str, const PCRE& pattern, + absl::string_view rewrite) { int vec[kVecSize] = {}; int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) @@ -393,9 +339,8 @@ bool PCRE::Replace(std::string *str, return true; } -int PCRE::GlobalReplace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite) { +int PCRE::GlobalReplace(std::string* str, const PCRE& pattern, + absl::string_view rewrite) { int count = 0; int vec[kVecSize] = {}; std::string out; @@ -451,10 +396,8 @@ int PCRE::GlobalReplace(std::string *str, return count; } -bool PCRE::Extract(const StringPiece &text, - const PCRE& pattern, - const StringPiece &rewrite, - std::string *out) { +bool PCRE::Extract(absl::string_view text, const PCRE& pattern, + absl::string_view rewrite, std::string* out) { int vec[kVecSize] = {}; int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) @@ -463,7 +406,7 @@ bool PCRE::Extract(const StringPiece &text, return pattern.Rewrite(out, rewrite, text, vec, matches); } -std::string PCRE::QuoteMeta(const StringPiece& unquoted) { +std::string PCRE::QuoteMeta(absl::string_view unquoted) { std::string result; result.reserve(unquoted.size() << 1); @@ -508,12 +451,8 @@ void PCRE::ClearHitLimit() { hit_limit_ = 0; } -int PCRE::TryMatch(const StringPiece& text, - size_t startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const { +int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor, + bool 
empty_ok, int* vec, int vecsize) const { pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; if (re == NULL) { PCREPORT(ERROR) << "Matching against invalid re: " << *error_; @@ -522,12 +461,12 @@ int PCRE::TryMatch(const StringPiece& text, int match_limit = match_limit_; if (match_limit <= 0) { - match_limit = GetFlag(FLAGS_regexp_match_limit); + match_limit = absl::GetFlag(FLAGS_regexp_match_limit); } int stack_limit = stack_limit_; if (stack_limit <= 0) { - stack_limit = GetFlag(FLAGS_regexp_stack_limit); + stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit); } pcre_extra extra = { 0 }; @@ -604,12 +543,8 @@ int PCRE::TryMatch(const StringPiece& text, return rc; } -bool PCRE::DoMatchImpl(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const* args, - int n, - int* vec, +bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed, + const Arg* const* args, int n, int* vec, int vecsize) const { assert((1 + n) * 3 <= vecsize); // results + PCRE workspace if (NumberOfCapturingGroups() < n) { @@ -654,11 +589,8 @@ bool PCRE::DoMatchImpl(const StringPiece& text, return true; } -bool PCRE::DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n) const { +bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed, + const Arg* const args[], int n) const { assert(n >= 0); const int vecsize = (1 + n) * 3; // results + PCRE workspace // (as for kVecSize) @@ -668,8 +600,8 @@ bool PCRE::DoMatch(const StringPiece& text, return b; } -bool PCRE::Rewrite(std::string *out, const StringPiece &rewrite, - const StringPiece &text, int *vec, int veclen) const { +bool PCRE::Rewrite(std::string* out, absl::string_view rewrite, + absl::string_view text, int* vec, int veclen) const { int number_of_capturing_groups = NumberOfCapturingGroups(); for (const char *s = rewrite.data(), *end = s + rewrite.size(); s < end; s++) { @@ -704,7 +636,7 @@ bool PCRE::Rewrite(std::string 
*out, const StringPiece &rewrite, return true; } -bool PCRE::CheckRewriteString(const StringPiece& rewrite, +bool PCRE::CheckRewriteString(absl::string_view rewrite, std::string* error) const { int max_token = -1; for (const char *s = rewrite.data(), *end = s + rewrite.size(); @@ -733,7 +665,7 @@ bool PCRE::CheckRewriteString(const StringPiece& rewrite, } if (max_token > NumberOfCapturingGroups()) { - *error = StringPrintf( + *error = absl::StrFormat( "Rewrite schema requests %d matches, but the regexp only has %d " "parenthesized subexpressions.", max_token, NumberOfCapturingGroups()); @@ -742,7 +674,6 @@ bool PCRE::CheckRewriteString(const StringPiece& rewrite, return true; } - // Return the number of capturing subpatterns, or -1 if the // regexp wasn't valid on construction. int PCRE::NumberOfCapturingGroups() const { @@ -774,9 +705,9 @@ bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { return true; } -bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { +bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) { if (dest == NULL) return true; - *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); + *(reinterpret_cast<absl::string_view*>(dest)) = absl::string_view(str, n); return true; } diff --git a/util/pcre.h b/util/pcre.h index 896b0bd..846f300 100644 --- a/util/pcre.h +++ b/util/pcre.h @@ -120,12 +120,12 @@ // // The "Consume" operation may be useful if you want to repeatedly // match regular expressions at the front of a string and skip over -// them as they match. This requires use of the "StringPiece" type, +// them as they match. This requires use of the string_view type, // which represents a sub-range of a real string. // // Example: read lines of the form "var = value" from a string. 
-// std::string contents = ...; // Fill string somehow -// StringPiece input(contents); // Wrap a StringPiece around it +// std::string contents = ...; // Fill string somehow +// absl::string_view input(contents); // Wrap a string_view around it // // std::string var; // int value; @@ -161,8 +161,7 @@ // Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); // will leave 64 in a, b, c, and d. -#include "util/util.h" -#include "re2/stringpiece.h" +#include "absl/strings/string_view.h" #ifdef USEPCRE #include <pcre.h> @@ -176,6 +175,16 @@ const bool UsingPCRE = false; } // namespace re2 #endif +// To produce a DLL, CMake can automatically export code symbols, +// but not data symbols, so we have to annotate those manually... +#if defined(RE2_BUILD_TESTING_DLL) +#define RE2_TESTING_DLL __declspec(dllexport) +#elif defined(RE2_CONSUME_TESTING_DLL) +#define RE2_TESTING_DLL __declspec(dllimport) +#else +#define RE2_TESTING_DLL +#endif + namespace re2 { class PCRE_Options; @@ -191,7 +200,7 @@ class PCRE { // Marks end of arg list. // ONLY USE IN OPTIONAL ARG DEFAULTS. // DO NOT PASS EXPLICITLY. - static Arg no_more_args; + RE2_TESTING_DLL static Arg no_more_args; // Options are same value as those in pcre. 
We provide them here // to avoid users needing to include pcre.h and also to isolate @@ -246,10 +255,10 @@ class PCRE { // // The provided pointer arguments can be pointers to any scalar numeric // type, or one of: - // std::string (matched piece is copied to string) - // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, size_t)" exists) - // (void*)NULL (the corresponding matched sub-pattern is not copied) + // std::string (matched piece is copied to string) + // absl::string_view (string_view is mutated to point to matched piece) + // T ("bool T::ParseFrom(const char*, size_t)" must exist) + // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: // a. "text" matches "pattern" exactly @@ -267,7 +276,7 @@ class PCRE { // int number; // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); struct FullMatchFunctor { - bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + bool operator ()(absl::string_view text, const PCRE& re, // 3..16 args const Arg& ptr1 = no_more_args, const Arg& ptr2 = no_more_args, const Arg& ptr3 = no_more_args, @@ -286,12 +295,12 @@ class PCRE { const Arg& ptr16 = no_more_args) const; }; - static const FullMatchFunctor FullMatch; + RE2_TESTING_DLL static const FullMatchFunctor FullMatch; // Exactly like FullMatch(), except that "pattern" is allowed to match // a substring of "text". 
struct PartialMatchFunctor { - bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + bool operator ()(absl::string_view text, const PCRE& re, // 3..16 args const Arg& ptr1 = no_more_args, const Arg& ptr2 = no_more_args, const Arg& ptr3 = no_more_args, @@ -310,13 +319,13 @@ class PCRE { const Arg& ptr16 = no_more_args) const; }; - static const PartialMatchFunctor PartialMatch; + RE2_TESTING_DLL static const PartialMatchFunctor PartialMatch; // Like FullMatch() and PartialMatch(), except that pattern has to // match a prefix of "text", and "input" is advanced past the matched // text. Note: "input" is modified iff this routine returns true. struct ConsumeFunctor { - bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args + bool operator ()(absl::string_view* input, const PCRE& pattern, // 3..16 args const Arg& ptr1 = no_more_args, const Arg& ptr2 = no_more_args, const Arg& ptr3 = no_more_args, @@ -335,14 +344,14 @@ class PCRE { const Arg& ptr16 = no_more_args) const; }; - static const ConsumeFunctor Consume; + RE2_TESTING_DLL static const ConsumeFunctor Consume; // Like Consume(..), but does not anchor the match at the beginning of the // string. That is, "pattern" need not start its match at the beginning of // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next // word in "s" and stores it in "word". struct FindAndConsumeFunctor { - bool operator ()(StringPiece* input, const PCRE& pattern, + bool operator ()(absl::string_view* input, const PCRE& pattern, // 3..16 args const Arg& ptr1 = no_more_args, const Arg& ptr2 = no_more_args, const Arg& ptr3 = no_more_args, @@ -361,7 +370,7 @@ class PCRE { const Arg& ptr16 = no_more_args) const; }; - static const FindAndConsumeFunctor FindAndConsume; + RE2_TESTING_DLL static const FindAndConsumeFunctor FindAndConsume; // Replace the first match of "pattern" in "str" with "rewrite". 
// Within "rewrite", backslash-escaped digits (\1 to \9) can be @@ -376,9 +385,8 @@ class PCRE { // // Returns true if the pattern matches and a replacement occurs, // false otherwise. - static bool Replace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite); + static bool Replace(std::string* str, const PCRE& pattern, + absl::string_view rewrite); // Like Replace(), except replaces all occurrences of the pattern in // the string with the rewrite. Replacements are not subject to @@ -390,9 +398,8 @@ class PCRE { // will leave "s" containing "yada dada doo" // // Returns the number of replacements made. - static int GlobalReplace(std::string *str, - const PCRE& pattern, - const StringPiece& rewrite); + static int GlobalReplace(std::string* str, const PCRE& pattern, + absl::string_view rewrite); // Like Replace, except that if the pattern matches, "rewrite" // is copied into "out" with substitutions. The non-matching @@ -400,10 +407,8 @@ class PCRE { // // Returns true iff a match occurred and the extraction happened // successfully; if no match occurs, the string is left unaffected. - static bool Extract(const StringPiece &text, - const PCRE& pattern, - const StringPiece &rewrite, - std::string *out); + static bool Extract(absl::string_view text, const PCRE& pattern, + absl::string_view rewrite, std::string* out); // Check that the given @p rewrite string is suitable for use with // this PCRE. It checks that: @@ -418,8 +423,7 @@ class PCRE { // @param error An error message is recorded here, iff we return false. // Otherwise, it is unchanged. // @return true, iff @p rewrite is suitable for use with the PCRE. - bool CheckRewriteString(const StringPiece& rewrite, - std::string* error) const; + bool CheckRewriteString(absl::string_view rewrite, std::string* error) const; // Returns a copy of 'unquoted' with all potentially meaningful // regexp characters backslash-escaped. The returned string, used @@ -428,7 +432,7 @@ class PCRE { // 1.5-2.0? 
// becomes: // 1\.5\-2\.0\? - static std::string QuoteMeta(const StringPiece& unquoted); + static std::string QuoteMeta(absl::string_view unquoted); /***** Generic matching interface (not so nice to use) *****/ @@ -441,9 +445,7 @@ class PCRE { // General matching routine. Stores the length of the match in // "*consumed" if successful. - bool DoMatch(const StringPiece& text, - Anchor anchor, - size_t* consumed, + bool DoMatch(absl::string_view text, Anchor anchor, size_t* consumed, const Arg* const* args, int n) const; // Return the number of capturing subpatterns, or -1 if the @@ -465,29 +467,17 @@ class PCRE { // against "foo", "bar", and "baz" respectively. // When matching PCRE("(foo)|hello") against "hello", it will return 1. // But the values for all subpattern are filled in into "vec". - int TryMatch(const StringPiece& text, - size_t startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const; - - // Append the "rewrite" string, with backslash subsitutions from "text" + int TryMatch(absl::string_view text, size_t startpos, Anchor anchor, + bool empty_ok, int* vec, int vecsize) const; + + // Append the "rewrite" string, with backslash substitutions from "text" // and "vec", to string "out". 
- bool Rewrite(std::string *out, - const StringPiece &rewrite, - const StringPiece &text, - int *vec, - int veclen) const; + bool Rewrite(std::string* out, absl::string_view rewrite, + absl::string_view text, int* vec, int veclen) const; // internal implementation for DoMatch - bool DoMatchImpl(const StringPiece& text, - Anchor anchor, - size_t* consumed, - const Arg* const args[], - int n, - int* vec, - int vecsize) const; + bool DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed, + const Arg* const args[], int n, int* vec, int vecsize) const; // Compile the regexp for the specified anchoring mode pcre* Compile(Anchor anchor); @@ -500,7 +490,7 @@ class PCRE { bool report_errors_; // Silences error logging if false int match_limit_; // Limit on execution resources int stack_limit_; // Limit on stack resources (bytes) - mutable int32_t hit_limit_; // Hit limit during execution (bool) + mutable int hit_limit_; // Hit limit during execution (bool) PCRE(const PCRE&) = delete; PCRE& operator=(const PCRE&) = delete; @@ -586,7 +576,7 @@ class PCRE::Arg { MAKE_PARSER(float, parse_float); MAKE_PARSER(double, parse_double); MAKE_PARSER(std::string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); + MAKE_PARSER(absl::string_view, parse_string_view); MAKE_PARSER(short, parse_short); MAKE_PARSER(unsigned short, parse_ushort); @@ -613,14 +603,14 @@ class PCRE::Arg { void* arg_; Parser parser_; - static bool parse_null (const char* str, size_t n, void* dest); - static bool parse_char (const char* str, size_t n, void* dest); - static bool parse_schar (const char* str, size_t n, void* dest); - static bool parse_uchar (const char* str, size_t n, void* dest); - static bool parse_float (const char* str, size_t n, void* dest); - static bool parse_double (const char* str, size_t n, void* dest); - static bool parse_string (const char* str, size_t n, void* dest); - static bool parse_stringpiece (const char* str, size_t n, void* dest); + static bool parse_null 
(const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_string_view (const char* str, size_t n, void* dest); #define DECLARE_INTEGER_PARSER(name) \ private: \ diff --git a/util/rune.cc b/util/rune.cc index 4f625ea..a40e756 100644 --- a/util/rune.cc +++ b/util/rune.cc @@ -51,7 +51,7 @@ int chartorune(Rune *rune, const char *str) { int c, c1, c2, c3; - long l; + Rune l; /* * one character sequence @@ -127,7 +127,7 @@ int runetochar(char *str, const Rune *rune) { /* Runes are signed, so convert to unsigned for range check. */ - unsigned long c; + unsigned int c; /* * one character sequence @@ -212,7 +212,7 @@ int utflen(const char *s) { int c; - long n; + int n; Rune rune; n = 0; @@ -232,7 +232,7 @@ utflen(const char *s) char* utfrune(const char *s, Rune c) { - long c1; + int c1; Rune r; int n; diff --git a/util/strutil.cc b/util/strutil.cc index fb7e6b1..da06f85 100644 --- a/util/strutil.cc +++ b/util/strutil.cc @@ -2,79 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include <stdarg.h> -#include <stdio.h> - #include "util/strutil.h" -#ifdef _WIN32 -#define snprintf _snprintf -#define vsnprintf _vsnprintf -#endif - namespace re2 { -// ---------------------------------------------------------------------- -// CEscapeString() -// Copies 'src' to 'dest', escaping dangerous characters using -// C-style escape sequences. 'src' and 'dest' should not overlap. -// Returns the number of bytes written to 'dest' (not including the \0) -// or (size_t)-1 if there was insufficient space. 
-// ---------------------------------------------------------------------- -static size_t CEscapeString(const char* src, size_t src_len, - char* dest, size_t dest_len) { - const char* src_end = src + src_len; - size_t used = 0; - - for (; src < src_end; src++) { - if (dest_len - used < 2) // space for two-character escape - return (size_t)-1; - - unsigned char c = *src; - switch (c) { - case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; - case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; - case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; - case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; - case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; - case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; - default: - // Note that if we emit \xNN and the src character after that is a hex - // digit then that digit must be escaped too to prevent it being - // interpreted as part of the character code by C. - if (c < ' ' || c > '~') { - if (dest_len - used < 5) // space for four-character escape + \0 - return (size_t)-1; - snprintf(dest + used, 5, "\\%03o", c); - used += 4; - } else { - dest[used++] = c; break; - } - } - } - - if (dest_len - used < 1) // make sure that there is room for \0 - return (size_t)-1; - - dest[used] = '\0'; // doesn't count towards return value though - return used; -} - -// ---------------------------------------------------------------------- -// CEscape() -// Copies 'src' to result, escaping dangerous characters using -// C-style escape sequences. 'src' and 'dest' should not overlap. 
-// ---------------------------------------------------------------------- -std::string CEscape(const StringPiece& src) { - const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion - char* dest = new char[dest_len]; - const size_t used = CEscapeString(src.data(), src.size(), - dest, dest_len); - std::string s = std::string(dest, used); - delete[] dest; - return s; -} - void PrefixSuccessor(std::string* prefix) { // We can increment the last character in the string and be done // unless that character is 255, in which case we have to erase the @@ -92,58 +23,4 @@ void PrefixSuccessor(std::string* prefix) { } } -static void StringAppendV(std::string* dst, const char* format, va_list ap) { - // First try with a small fixed size buffer - char space[1024]; - - // It's possible for methods that use a va_list to invalidate - // the data in it upon use. The fix is to make a copy - // of the structure before using it and use that copy instead. - va_list backup_ap; - va_copy(backup_ap, ap); - int result = vsnprintf(space, sizeof(space), format, backup_ap); - va_end(backup_ap); - - if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) { - // It fit - dst->append(space, result); - return; - } - - // Repeatedly increase buffer size until it fits - int length = sizeof(space); - while (true) { - if (result < 0) { - // Older behavior: just try doubling the buffer size - length *= 2; - } else { - // We need exactly "result+1" characters - length = result+1; - } - char* buf = new char[length]; - - // Restore the va_list before we use it again - va_copy(backup_ap, ap); - result = vsnprintf(buf, length, format, backup_ap); - va_end(backup_ap); - - if ((result >= 0) && (result < length)) { - // It fit - dst->append(buf, result); - delete[] buf; - return; - } - delete[] buf; - } -} - -std::string StringPrintf(const char* format, ...) 
{ - va_list ap; - va_start(ap, format); - std::string result; - StringAppendV(&result, format, ap); - va_end(ap); - return result; -} - } // namespace re2 diff --git a/util/strutil.h b/util/strutil.h index a69908a..f5d87a5 100644 --- a/util/strutil.h +++ b/util/strutil.h @@ -7,14 +7,9 @@ #include <string> -#include "re2/stringpiece.h" -#include "util/util.h" - namespace re2 { -std::string CEscape(const StringPiece& src); void PrefixSuccessor(std::string* prefix); -std::string StringPrintf(const char* format, ...); } // namespace re2 diff --git a/util/test.cc b/util/test.cc deleted file mode 100644 index 028616b..0000000 --- a/util/test.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include <stdio.h> -#include <string> - -#include "util/test.h" - -namespace testing { -std::string TempDir() { return "/tmp/"; } -} // namespace testing - -struct Test { - void (*fn)(void); - const char *name; -}; - -static Test tests[10000]; -static int ntests; - -void RegisterTest(void (*fn)(void), const char *name) { - tests[ntests].fn = fn; - tests[ntests++].name = name; -} - -int main(int argc, char** argv) { - for (int i = 0; i < ntests; i++) { - printf("%s\n", tests[i].name); - tests[i].fn(); - } - printf("PASS\n"); - return 0; -} diff --git a/util/test.h b/util/test.h deleted file mode 100644 index 54e6f8f..0000000 --- a/util/test.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_TEST_H_ -#define UTIL_TEST_H_ - -#include "util/util.h" -#include "util/logging.h" - -namespace testing { -std::string TempDir(); -} // namespace testing - -#define TEST(x, y) \ - void x##y(void); \ - TestRegisterer r##x##y(x##y, # x "." 
# y); \ - void x##y(void) - -void RegisterTest(void (*)(void), const char*); - -class TestRegisterer { - public: - TestRegisterer(void (*fn)(void), const char *s) { - RegisterTest(fn, s); - } -}; - -// fatal assertions -#define ASSERT_TRUE CHECK -#define ASSERT_FALSE(x) CHECK(!(x)) -#define ASSERT_EQ CHECK_EQ -#define ASSERT_NE CHECK_NE -#define ASSERT_LT CHECK_LT -#define ASSERT_LE CHECK_LE -#define ASSERT_GT CHECK_GT -#define ASSERT_GE CHECK_GE - -// nonfatal assertions -// TODO(rsc): Do a better job? -#define EXPECT_TRUE CHECK -#define EXPECT_FALSE(x) CHECK(!(x)) -#define EXPECT_EQ CHECK_EQ -#define EXPECT_NE CHECK_NE -#define EXPECT_LT CHECK_LT -#define EXPECT_LE CHECK_LE -#define EXPECT_GT CHECK_GT -#define EXPECT_GE CHECK_GE - -#endif // UTIL_TEST_H_ diff --git a/util/util.h b/util/util.h deleted file mode 100644 index 56e46c1..0000000 --- a/util/util.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef UTIL_UTIL_H_ -#define UTIL_UTIL_H_ - -#define arraysize(array) (sizeof(array)/sizeof((array)[0])) - -#ifndef ATTRIBUTE_NORETURN -#if defined(__GNUC__) -#define ATTRIBUTE_NORETURN __attribute__((noreturn)) -#elif defined(_MSC_VER) -#define ATTRIBUTE_NORETURN __declspec(noreturn) -#else -#define ATTRIBUTE_NORETURN -#endif -#endif - -#ifndef ATTRIBUTE_UNUSED -#if defined(__GNUC__) -#define ATTRIBUTE_UNUSED __attribute__((unused)) -#else -#define ATTRIBUTE_UNUSED -#endif -#endif - -#ifndef FALLTHROUGH_INTENDED -#if defined(__clang__) -#define FALLTHROUGH_INTENDED [[clang::fallthrough]] -#elif defined(__GNUC__) && __GNUC__ >= 7 -#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] -#else -#define FALLTHROUGH_INTENDED do {} while (0) -#endif -#endif - -#ifndef NO_THREAD_SAFETY_ANALYSIS -#define NO_THREAD_SAFETY_ANALYSIS -#endif - -#endif // UTIL_UTIL_H_ |