diff options
-rw-r--r-- | .github/workflows/ci.yml | 26 | ||||
-rw-r--r-- | .github/workflows/pr.yml | 26 | ||||
-rw-r--r-- | CMakeLists.txt | 18 | ||||
-rw-r--r-- | CONTRIBUTING.md | 2 | ||||
-rw-r--r-- | Makefile | 16 | ||||
-rw-r--r-- | re2.pc | 2 | ||||
-rw-r--r-- | re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h | 564 | ||||
-rwxr-xr-x | re2/make_unicode_casefold.py | 2 | ||||
-rwxr-xr-x | re2/make_unicode_groups.py | 2 | ||||
-rw-r--r-- | re2/prog.cc | 7 | ||||
-rw-r--r-- | re2/re2.h | 2 | ||||
-rw-r--r-- | re2/regexp.cc | 3 | ||||
-rw-r--r-- | re2Config.cmake.in | 22 |
13 files changed, 427 insertions, 265 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1dfa9e7..44a773b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,15 +3,21 @@ on: push: branches: [main] jobs: - build: - runs-on: ${{ matrix.os }} + build-appleclang: + runs-on: macos-latest strategy: fail-fast: false matrix: - os: [macos-latest, ubuntu-latest] + ver: [11, 14, 17, 20] env: CC: clang CXX: clang++ + # Unlike GCC and upstream Clang, AppleClang still defaults to `-std=c++98` + # for some reason. Also, the macOS image on GitHub Actions provides wildly + # numbered Xcode versions. Thus, rather than varying the compiler version, + # we set the `-std` flag explicitly in order to vary the language version. + # (The other two flags are the default provided for CXXFLAGS in Makefile.) + CXXFLAGS: -O3 -g -std=c++${{ matrix.ver }} steps: - uses: actions/checkout@v2 - run: make && make test @@ -21,27 +27,27 @@ jobs: strategy: fail-fast: false matrix: - tag: [9, 10, 11, 12, 13] + ver: [9, 10, 11, 12, 13] env: - CC: clang-${{ matrix.tag }} - CXX: clang++-${{ matrix.tag }} + CC: clang-${{ matrix.ver }} + CXX: clang++-${{ matrix.ver }} steps: - uses: actions/checkout@v2 - - name: Install Clang ${{ matrix.tag }} + - name: Install Clang ${{ matrix.ver }} run: | wget https://apt.llvm.org/llvm.sh chmod +x ./llvm.sh - sudo ./llvm.sh ${{ matrix.tag }} + sudo ./llvm.sh ${{ matrix.ver }} shell: bash - run: make && make test shell: bash build-gcc: runs-on: ubuntu-latest - container: gcc:${{ matrix.tag }} + container: gcc:${{ matrix.ver }} strategy: fail-fast: false matrix: - tag: [4, 5, 6, 7, 8, 9, 10, 11] + ver: [6, 7, 8, 9, 10, 11] env: CC: gcc CXX: g++ diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 0000000..8ede73b --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,26 @@ +name: PR +on: + pull_request_target: + branches: [main] + types: [opened] +jobs: + close: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/github-script@v5 + with: + script: | + const fs = require('fs'); + console.log(await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: fs.readFileSync('CONTRIBUTING.md', { encoding: 'utf8', }), + })); + console.log(await github.rest.pulls.update({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number, + state: 'closed', + })); diff --git a/CMakeLists.txt b/CMakeLists.txt index fcd3870..62d4995 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ cmake_minimum_required(VERSION 3.5.1) project(RE2 CXX) +include(CMakePackageConfigHelpers) include(CTest) include(GNUInstallDirs) @@ -154,10 +155,23 @@ set(RE2_HEADERS install(FILES ${RE2_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2) -install(TARGETS re2 EXPORT re2Config +install(TARGETS re2 EXPORT re2Targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -install(EXPORT re2Config +install(EXPORT re2Targets DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 NAMESPACE re2::) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/re2Config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 + ) +write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake + VERSION ${SONAME}.0.0 + COMPATIBILITY SameMajorVersion + ) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3af2b0a..882b0e2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,2 +1,2 @@ RE2 uses Gerrit instead of GitHub pull requests. -See the [Contributing](https://github.com/google/re2/wiki/Contribute) wiki page. +See the [Contribute](https://github.com/google/re2/wiki/Contribute) wiki page. @@ -17,7 +17,7 @@ CXX?=g++ CXXFLAGS?=-O3 -g LDFLAGS?= # required -RE2_CXXFLAGS?=-std=c++11 -pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE) +RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCICU) $(CCPCRE) RE2_LDFLAGS?=-pthread $(LDICU) $(LDPCRE) AR?=ar ARFLAGS?=rsc @@ -327,7 +327,7 @@ testinstall: static-testinstall shared-testinstall @echo .PHONY: static-testinstall -static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) +static-testinstall: CXXFLAGS:=-pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) static-testinstall: @mkdir -p obj @@ -337,21 +337,21 @@ ifeq ($(shell uname),Darwin) else ifeq ($(shell uname),SunOS) @echo Skipping test for libre2.a on SunOS. else - (cd obj && $(CXX) testinstall.cc -o testinstall $(CXXFLAGS) $(LDFLAGS)) - obj/testinstall + (cd obj && $(CXX) testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS)) + obj/static-testinstall endif .PHONY: shared-testinstall -shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) +shared-testinstall: CXXFLAGS:=-pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) shared-testinstall: @mkdir -p obj @cp testinstall.cc obj - (cd obj && $(CXX) testinstall.cc -o testinstall $(CXXFLAGS) $(LDFLAGS)) + (cd obj && $(CXX) testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS)) ifeq ($(shell uname),Darwin) - DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/testinstall + DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/shared-testinstall else - LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall + LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/shared-testinstall endif .PHONY: benchlog @@ -4,5 +4,5 @@ libdir=@libdir@ Name: re2 Description: RE2 is a fast, safe, thread-friendly regular expression engine. Version: 0.0.0 -Cflags: -std=c++11 -pthread -I${includedir} +Cflags: -pthread -I${includedir} Libs: -pthread -L${libdir} -lre2 diff --git a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h index 3e069eb..71cb427 100644 --- a/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h +++ b/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -14,11 +14,13 @@ #define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ #include <algorithm> +#include <array> #include <climits> #include <cstddef> #include <cstdint> #include <cstring> #include <initializer_list> +#include <limits> #include <string> #include <type_traits> #include <utility> @@ -34,272 +36,362 @@ class FuzzedDataProvider { : data_ptr_(data), remaining_bytes_(size) {} ~FuzzedDataProvider() = default; - // Returns a std::vector containing |num_bytes| of input data. If fewer than - // |num_bytes| of data remain, returns a shorter std::vector containing all - // of the data that's left. Can be used with any byte sized type, such as - // char, unsigned char, uint8_t, etc. - template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - return ConsumeBytes<T>(num_bytes, num_bytes); - } + // See the implementation below (after the class definition) for more verbose + // comments for each of the methods. - // Similar to |ConsumeBytes|, but also appends the terminator value at the end - // of the resulting vector. Useful, when a mutable null-terminated C-string is - // needed, for example. But that is a rare case. Better avoid it, if possible, - // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. + // Methods returning std::vector of bytes. These are the most popular choice + // when splitting fuzzing input into pieces, as every piece is put into a + // separate buffer (i.e. ASan would catch any under-/overflow) and the memory + // will be released automatically. + template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes); template <typename T> - std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, - T terminator = 0) { - num_bytes = std::min(num_bytes, remaining_bytes_); - std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes); - result.back() = terminator; - return result; - } + std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0); + template <typename T> std::vector<T> ConsumeRemainingBytes(); - // Returns a std::string containing |num_bytes| of input data. Using this and - // |.c_str()| on the resulting string is the best way to get an immutable - // null-terminated C string. If fewer than |num_bytes| of data remain, returns - // a shorter std::string containing all of the data that's left. - std::string ConsumeBytesAsString(size_t num_bytes) { - static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), - "ConsumeBytesAsString cannot convert the data to a string."); - - num_bytes = std::min(num_bytes, remaining_bytes_); - std::string result( - reinterpret_cast<const std::string::value_type *>(data_ptr_), - num_bytes); - Advance(num_bytes); - return result; - } + // Methods returning strings. Use only when you need a std::string or a null + // terminated C-string. Otherwise, prefer the methods returning std::vector. + std::string ConsumeBytesAsString(size_t num_bytes); + std::string ConsumeRandomLengthString(size_t max_length); + std::string ConsumeRandomLengthString(); + std::string ConsumeRemainingBytesAsString(); - // Returns a number in the range [min, max] by consuming bytes from the - // input data. The value might not be uniformly distributed in the given - // range. If there's no input data left, always returns |min|. |min| must - // be less than or equal to |max|. - template <typename T> T ConsumeIntegralInRange(T min, T max) { - static_assert(std::is_integral<T>::value, "An integral type is required."); - static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + // Methods returning integer values. + template <typename T> T ConsumeIntegral(); + template <typename T> T ConsumeIntegralInRange(T min, T max); - if (min > max) - abort(); + // Methods returning floating point values. + template <typename T> T ConsumeFloatingPoint(); + template <typename T> T ConsumeFloatingPointInRange(T min, T max); - // Use the biggest type possible to hold the range and the result. - uint64_t range = static_cast<uint64_t>(max) - min; - uint64_t result = 0; - size_t offset = 0; - - while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && - remaining_bytes_ != 0) { - // Pull bytes off the end of the seed data. Experimentally, this seems to - // allow the fuzzer to more easily explore the input space. This makes - // sense, since it works by modifying inputs that caused new code to run, - // and this data is often used to encode length of data read by - // |ConsumeBytes|. Separating out read lengths makes it easier modify the - // contents of the data that is actually read. - --remaining_bytes_; - result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; - offset += CHAR_BIT; - } + // 0 <= return value <= 1. + template <typename T> T ConsumeProbability(); - // Avoid division by 0, in case |range + 1| results in overflow. - if (range != std::numeric_limits<decltype(range)>::max()) - result = result % (range + 1); + bool ConsumeBool(); - return static_cast<T>(min + result); - } + // Returns a value chosen from the given enum. + template <typename T> T ConsumeEnum(); - // Returns a std::string of length from 0 to |max_length|. When it runs out of - // input data, returns what remains of the input. Designed to be more stable - // with respect to a fuzzer inserting characters than just picking a random - // length and then consuming that many bytes with |ConsumeBytes|. - std::string ConsumeRandomLengthString(size_t max_length) { - // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" - // followed by anything else to the end of the string. As a result of this - // logic, a fuzzer can insert characters into the string, and the string - // will be lengthened to include those new characters, resulting in a more - // stable fuzzer than picking the length of a string independently from - // picking its contents. - std::string result; - - // Reserve the anticipated capaticity to prevent several reallocations. - result.reserve(std::min(max_length, remaining_bytes_)); - for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { - char next = ConvertUnsignedToSigned<char>(data_ptr_[0]); - Advance(1); - if (next == '\\' && remaining_bytes_ != 0) { - next = ConvertUnsignedToSigned<char>(data_ptr_[0]); - Advance(1); - if (next != '\\') - break; - } - result += next; - } - - result.shrink_to_fit(); - return result; - } + // Returns a value from the given array. + template <typename T, size_t size> T PickValueInArray(const T (&array)[size]); + template <typename T, size_t size> + T PickValueInArray(const std::array<T, size> &array); + template <typename T> T PickValueInArray(std::initializer_list<const T> list); - // Returns a std::vector containing all remaining bytes of the input data. - template <typename T> std::vector<T> ConsumeRemainingBytes() { - return ConsumeBytes<T>(remaining_bytes_); - } + // Writes data to the given destination and returns number of bytes written. + size_t ConsumeData(void *destination, size_t num_bytes); - // Returns a std::string containing all remaining bytes of the input data. - // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string - // object. - std::string ConsumeRemainingBytesAsString() { - return ConsumeBytesAsString(remaining_bytes_); - } + // Reports the remaining bytes available for fuzzed input. + size_t remaining_bytes() { return remaining_bytes_; } - // Returns a number in the range [Type's min, Type's max]. The value might - // not be uniformly distributed in the given range. If there's no input data - // left, always returns |min|. - template <typename T> T ConsumeIntegral() { - return ConsumeIntegralInRange(std::numeric_limits<T>::min(), - std::numeric_limits<T>::max()); - } + private: + FuzzedDataProvider(const FuzzedDataProvider &) = delete; + FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - // Reads one byte and returns a bool, or false when no data remains. - bool ConsumeBool() { return 1 & ConsumeIntegral<uint8_t>(); } + void CopyAndAdvance(void *destination, size_t num_bytes); - // Returns a copy of the value selected from the given fixed-size |array|. - template <typename T, size_t size> - T PickValueInArray(const T (&array)[size]) { - static_assert(size > 0, "The array must be non empty."); - return array[ConsumeIntegralInRange<size_t>(0, size - 1)]; - } + void Advance(size_t num_bytes); template <typename T> - T PickValueInArray(std::initializer_list<const T> list) { - // TODO(Dor1s): switch to static_assert once C++14 is allowed. - if (!list.size()) - abort(); - - return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1)); - } - - // Returns an enum value. The enum must start at 0 and be contiguous. It must - // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: - // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; - template <typename T> T ConsumeEnum() { - static_assert(std::is_enum<T>::value, "|T| must be an enum type."); - return static_cast<T>(ConsumeIntegralInRange<uint32_t>( - 0, static_cast<uint32_t>(T::kMaxValue))); - } + std::vector<T> ConsumeBytes(size_t size, size_t num_bytes); - // Returns a floating point number in the range [0.0, 1.0]. If there's no - // input data left, always returns 0. - template <typename T> T ConsumeProbability() { - static_assert(std::is_floating_point<T>::value, - "A floating point type is required."); + template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value); - // Use different integral types for different floating point types in order - // to provide better density of the resulting values. - using IntegralType = - typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, - uint64_t>::type; + const uint8_t *data_ptr_; + size_t remaining_bytes_; +}; - T result = static_cast<T>(ConsumeIntegral<IntegralType>()); - result /= static_cast<T>(std::numeric_limits<IntegralType>::max()); - return result; +// Returns a std::vector containing |num_bytes| of input data. If fewer than +// |num_bytes| of data remain, returns a shorter std::vector containing all +// of the data that's left. Can be used with any byte sized type, such as +// char, unsigned char, uint8_t, etc. +template <typename T> +std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + return ConsumeBytes<T>(num_bytes, num_bytes); +} + +// Similar to |ConsumeBytes|, but also appends the terminator value at the end +// of the resulting vector. Useful, when a mutable null-terminated C-string is +// needed, for example. But that is a rare case. Better avoid it, if possible, +// and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. +template <typename T> +std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes, + T terminator) { + num_bytes = std::min(num_bytes, remaining_bytes_); + std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes); + result.back() = terminator; + return result; +} + +// Returns a std::vector containing all remaining bytes of the input data. +template <typename T> +std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes() { + return ConsumeBytes<T>(remaining_bytes_); +} + +// Returns a std::string containing |num_bytes| of input data. Using this and +// |.c_str()| on the resulting string is the best way to get an immutable +// null-terminated C string. If fewer than |num_bytes| of data remain, returns +// a shorter std::string containing all of the data that's left. +inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) { + static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), + "ConsumeBytesAsString cannot convert the data to a string."); + + num_bytes = std::min(num_bytes, remaining_bytes_); + std::string result( + reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes); + Advance(num_bytes); + return result; +} + +// Returns a std::string of length from 0 to |max_length|. When it runs out of +// input data, returns what remains of the input. Designed to be more stable +// with respect to a fuzzer inserting characters than just picking a random +// length and then consuming that many bytes with |ConsumeBytes|. +inline std::string +FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) { + // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" + // followed by anything else to the end of the string. As a result of this + // logic, a fuzzer can insert characters into the string, and the string + // will be lengthened to include those new characters, resulting in a more + // stable fuzzer than picking the length of a string independently from + // picking its contents. + std::string result; + + // Reserve the anticipated capaticity to prevent several reallocations. + result.reserve(std::min(max_length, remaining_bytes_)); + for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { + char next = ConvertUnsignedToSigned<char>(data_ptr_[0]); + Advance(1); + if (next == '\\' && remaining_bytes_ != 0) { + next = ConvertUnsignedToSigned<char>(data_ptr_[0]); + Advance(1); + if (next != '\\') + break; + } + result += next; } - // Returns a floating point value in the range [Type's lowest, Type's max] by - // consuming bytes from the input data. If there's no input data left, always - // returns approximately 0. - template <typename T> T ConsumeFloatingPoint() { - return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(), - std::numeric_limits<T>::max()); + result.shrink_to_fit(); + return result; +} + +// Returns a std::string of length from 0 to |remaining_bytes_|. +inline std::string FuzzedDataProvider::ConsumeRandomLengthString() { + return ConsumeRandomLengthString(remaining_bytes_); +} + +// Returns a std::string containing all remaining bytes of the input data. +// Prefer using |ConsumeRemainingBytes| unless you actually need a std::string +// object. +inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() { + return ConsumeBytesAsString(remaining_bytes_); +} + +// Returns a number in the range [Type's min, Type's max]. The value might +// not be uniformly distributed in the given range. If there's no input data +// left, always returns |min|. +template <typename T> T FuzzedDataProvider::ConsumeIntegral() { + return ConsumeIntegralInRange(std::numeric_limits<T>::min(), + std::numeric_limits<T>::max()); +} + +// Returns a number in the range [min, max] by consuming bytes from the +// input data. The value might not be uniformly distributed in the given +// range. If there's no input data left, always returns |min|. |min| must +// be less than or equal to |max|. +template <typename T> +T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) { + static_assert(std::is_integral<T>::value, "An integral type is required."); + static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + + if (min > max) + abort(); + + // Use the biggest type possible to hold the range and the result. + uint64_t range = static_cast<uint64_t>(max) - min; + uint64_t result = 0; + size_t offset = 0; + + while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && + remaining_bytes_ != 0) { + // Pull bytes off the end of the seed data. Experimentally, this seems to + // allow the fuzzer to more easily explore the input space. This makes + // sense, since it works by modifying inputs that caused new code to run, + // and this data is often used to encode length of data read by + // |ConsumeBytes|. Separating out read lengths makes it easier modify the + // contents of the data that is actually read. + --remaining_bytes_; + result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; + offset += CHAR_BIT; } - // Returns a floating point value in the given range by consuming bytes from - // the input data. If there's no input data left, returns |min|. Note that - // |min| must be less than or equal to |max|. - template <typename T> T ConsumeFloatingPointInRange(T min, T max) { - if (min > max) - abort(); - - T range = .0; - T result = min; - constexpr T zero(.0); - if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) { - // The diff |max - min| would overflow the given floating point type. Use - // the half of the diff as the range and consume a bool to decide whether - // the result is in the first of the second part of the diff. - range = (max / 2.0) - (min / 2.0); - if (ConsumeBool()) { - result += range; - } - } else { - range = max - min; + // Avoid division by 0, in case |range + 1| results in overflow. + if (range != std::numeric_limits<decltype(range)>::max()) + result = result % (range + 1); + + return static_cast<T>(min + result); +} + +// Returns a floating point value in the range [Type's lowest, Type's max] by +// consuming bytes from the input data. If there's no input data left, always +// returns approximately 0. +template <typename T> T FuzzedDataProvider::ConsumeFloatingPoint() { + return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(), + std::numeric_limits<T>::max()); +} + +// Returns a floating point value in the given range by consuming bytes from +// the input data. If there's no input data left, returns |min|. Note that +// |min| must be less than or equal to |max|. +template <typename T> +T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) { + if (min > max) + abort(); + + T range = .0; + T result = min; + constexpr T zero(.0); + if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) { + // The diff |max - min| would overflow the given floating point type. Use + // the half of the diff as the range and consume a bool to decide whether + // the result is in the first of the second part of the diff. + range = (max / 2.0) - (min / 2.0); + if (ConsumeBool()) { + result += range; } - - return result + range * ConsumeProbability<T>(); + } else { + range = max - min; } - // Reports the remaining bytes available for fuzzed input. - size_t remaining_bytes() { return remaining_bytes_; } - - private: - FuzzedDataProvider(const FuzzedDataProvider &) = delete; - FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - - void Advance(size_t num_bytes) { - if (num_bytes > remaining_bytes_) + return result + range * ConsumeProbability<T>(); +} + +// Returns a floating point number in the range [0.0, 1.0]. If there's no +// input data left, always returns 0. +template <typename T> T FuzzedDataProvider::ConsumeProbability() { + static_assert(std::is_floating_point<T>::value, + "A floating point type is required."); + + // Use different integral types for different floating point types in order + // to provide better density of the resulting values. + using IntegralType = + typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, + uint64_t>::type; + + T result = static_cast<T>(ConsumeIntegral<IntegralType>()); + result /= static_cast<T>(std::numeric_limits<IntegralType>::max()); + return result; +} + +// Reads one byte and returns a bool, or false when no data remains. +inline bool FuzzedDataProvider::ConsumeBool() { + return 1 & ConsumeIntegral<uint8_t>(); +} + +// Returns an enum value. The enum must start at 0 and be contiguous. It must +// also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: +// enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; +template <typename T> T FuzzedDataProvider::ConsumeEnum() { + static_assert(std::is_enum<T>::value, "|T| must be an enum type."); + return static_cast<T>( + ConsumeIntegralInRange<uint32_t>(0, static_cast<uint32_t>(T::kMaxValue))); +} + +// Returns a copy of the value selected from the given fixed-size |array|. +template <typename T, size_t size> +T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange<size_t>(0, size - 1)]; +} + +template <typename T, size_t size> +T FuzzedDataProvider::PickValueInArray(const std::array<T, size> &array) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange<size_t>(0, size - 1)]; +} + +template <typename T> +T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list) { + // TODO(Dor1s): switch to static_assert once C++14 is allowed. + if (!list.size()) + abort(); + + return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1)); +} + +// Writes |num_bytes| of input data to the given destination pointer. If there +// is not enough data left, writes all remaining bytes. Return value is the +// number of bytes written. +// In general, it's better to avoid using this function, but it may be useful +// in cases when it's necessary to fill a certain buffer or object with +// fuzzing data. +inline size_t FuzzedDataProvider::ConsumeData(void *destination, + size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + CopyAndAdvance(destination, num_bytes); + return num_bytes; +} + +// Private methods. +inline void FuzzedDataProvider::CopyAndAdvance(void *destination, + size_t num_bytes) { + std::memcpy(destination, data_ptr_, num_bytes); + Advance(num_bytes); +} + +inline void FuzzedDataProvider::Advance(size_t num_bytes) { + if (num_bytes > remaining_bytes_) + abort(); + + data_ptr_ += num_bytes; + remaining_bytes_ -= num_bytes; +} + +template <typename T> +std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) { + static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); + + // The point of using the size-based constructor below is to increase the + // odds of having a vector object with capacity being equal to the length. + // That part is always implementation specific, but at least both libc++ and + // libstdc++ allocate the requested number of bytes in that constructor, + // which seems to be a natural choice for other implementations as well. + // To increase the odds even more, we also call |shrink_to_fit| below. + std::vector<T> result(size); + if (size == 0) { + if (num_bytes != 0) abort(); - - data_ptr_ += num_bytes; - remaining_bytes_ -= num_bytes; - } - - template <typename T> - std::vector<T> ConsumeBytes(size_t size, size_t num_bytes_to_consume) { - static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); - - // The point of using the size-based constructor below is to increase the - // odds of having a vector object with capacity being equal to the length. - // That part is always implementation specific, but at least both libc++ and - // libstdc++ allocate the requested number of bytes in that constructor, - // which seems to be a natural choice for other implementations as well. - // To increase the odds even more, we also call |shrink_to_fit| below. - std::vector<T> result(size); - if (size == 0) { - if (num_bytes_to_consume != 0) - abort(); - return result; - } - - std::memcpy(result.data(), data_ptr_, num_bytes_to_consume); - Advance(num_bytes_to_consume); - - // Even though |shrink_to_fit| is also implementation specific, we expect it - // to provide an additional assurance in case vector's constructor allocated - // a buffer which is larger than the actual amount of data we put inside it. - result.shrink_to_fit(); return result; } - template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value) { - static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); - static_assert(!std::numeric_limits<TU>::is_signed, - "Source type must be unsigned."); - - // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. - if (std::numeric_limits<TS>::is_modulo) - return static_cast<TS>(value); - - // Avoid using implementation-defined unsigned to signer conversions. - // To learn more, see https://stackoverflow.com/questions/13150449. - if (value <= std::numeric_limits<TS>::max()) { - return static_cast<TS>(value); - } else { - constexpr auto TS_min = std::numeric_limits<TS>::min(); - return TS_min + static_cast<char>(value - TS_min); - } + CopyAndAdvance(result.data(), num_bytes); + + // Even though |shrink_to_fit| is also implementation specific, we expect it + // to provide an additional assurance in case vector's constructor allocated + // a buffer which is larger than the actual amount of data we put inside it. + result.shrink_to_fit(); + return result; +} + +template <typename TS, typename TU> +TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) { + static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); + static_assert(!std::numeric_limits<TU>::is_signed, + "Source type must be unsigned."); + + // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. + if (std::numeric_limits<TS>::is_modulo) + return static_cast<TS>(value); + + // Avoid using implementation-defined unsigned to signed conversions. + // To learn more, see https://stackoverflow.com/questions/13150449. + if (value <= std::numeric_limits<TS>::max()) { + return static_cast<TS>(value); + } else { + constexpr auto TS_min = std::numeric_limits<TS>::min(); + return TS_min + static_cast<TS>(value - TS_min); } - - const uint8_t *data_ptr_; - size_t remaining_bytes_; -}; +} #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/re2/make_unicode_casefold.py b/re2/make_unicode_casefold.py index 0cf0d35..803adbd 100755 --- a/re2/make_unicode_casefold.py +++ b/re2/make_unicode_casefold.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # coding=utf-8 # # Copyright 2008 The RE2 Authors. All Rights Reserved. diff --git a/re2/make_unicode_groups.py b/re2/make_unicode_groups.py index 46aef40..cbe822a 100755 --- a/re2/make_unicode_groups.py +++ b/re2/make_unicode_groups.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Copyright 2008 The RE2 Authors. All Rights Reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. diff --git a/re2/prog.cc b/re2/prog.cc index 55dc105..a700d35 100644 --- a/re2/prog.cc +++ b/re2/prog.cc @@ -611,10 +611,13 @@ void Prog::Flatten() { inst_count_[ip->opcode()]++; } - int total = 0; +#if !defined(NDEBUG) + // Address a `-Wunused-but-set-variable' warning from Clang 13.x. + size_t total = 0; for (int i = 0; i < kNumInst; i++) total += inst_count_[i]; - DCHECK_EQ(total, static_cast<int>(flat.size())); + CHECK_EQ(total, flat.size()); +#endif // Remap start_unanchored and start. if (start_unanchored() == 0) { @@ -971,7 +971,7 @@ namespace hooks { // As per https://github.com/google/re2/issues/325, thread_local support in // MinGW seems to be buggy. (FWIW, Abseil folks also avoid it.) #define RE2_HAVE_THREAD_LOCAL -#if (defined(__APPLE__) && !TARGET_OS_OSX) || defined(__MINGW32__) +#if (defined(__APPLE__) && !(defined(TARGET_OS_OSX) && TARGET_OS_OSX)) || defined(__MINGW32__) #undef RE2_HAVE_THREAD_LOCAL #endif diff --git a/re2/regexp.cc b/re2/regexp.cc index 2e1bfac..ca1318b 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -585,8 +585,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> { // Record first occurrence of each name. // (The rule is that if you have the same name // multiple times, only the leftmost one counts.) - if (map_->find(*re->name()) == map_->end()) - (*map_)[*re->name()] = re->cap(); + map_->insert({*re->name(), re->cap()}); } return ignored; } diff --git a/re2Config.cmake.in b/re2Config.cmake.in new file mode 100644 index 0000000..7698107 --- /dev/null +++ b/re2Config.cmake.in @@ -0,0 +1,22 @@ +# Copyright 2022 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +set_and_check(re2_INCLUDE_DIR ${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@) + +if(UNIX) + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_dependency(Threads REQUIRED) +endif() + +check_required_components(re2) + +if(TARGET re2::re2) + return() +endif() + +include(${CMAKE_CURRENT_LIST_DIR}/re2Targets.cmake) |