diff options
Diffstat (limited to 'boost/beast/websocket/detail/utf8_checker.ipp')
-rw-r--r-- | boost/beast/websocket/detail/utf8_checker.ipp | 331 |
1 files changed, 331 insertions, 0 deletions
diff --git a/boost/beast/websocket/detail/utf8_checker.ipp b/boost/beast/websocket/detail/utf8_checker.ipp new file mode 100644 index 0000000000..64a293456b --- /dev/null +++ b/boost/beast/websocket/detail/utf8_checker.ipp @@ -0,0 +1,331 @@ +// +// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/boostorg/beast +// + +#ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP +#define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP + +#include <boost/beast/websocket/detail/utf8_checker.hpp> + +#include <boost/assert.hpp> + +namespace boost { +namespace beast { +namespace websocket { +namespace detail { + +void +utf8_checker:: +reset() +{ + need_ = 0; + p_ = cp_; +} + +bool +utf8_checker:: +finish() +{ + auto const success = need_ == 0; + reset(); + return success; +} + +bool +utf8_checker:: +write(std::uint8_t const* in, std::size_t size) +{ + auto const valid = + [](std::uint8_t const*& p) + { + if(p[0] < 128) + { + ++p; + return true; + } + if((p[0] & 0xe0) == 0xc0) + { + if( (p[1] & 0xc0) != 0x80 || + (p[0] & 0x1e) == 0) // overlong + return false; + p += 2; + return true; + } + if((p[0] & 0xf0) == 0xe0) + { + if( (p[1] & 0xc0) != 0x80 + || (p[2] & 0xc0) != 0x80 + || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong + || (p[0] == 0xed && (p[1] & 0x20) == 0x20) // surrogate + //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF + ) + return false; + p += 3; + return true; + } + if((p[0] & 0xf8) == 0xf0) + { + if( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters + || (p[1] & 0xc0) != 0x80 + || (p[2] & 0xc0) != 0x80 + || (p[3] & 0xc0) != 0x80 + || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong + || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF + ) + return false; + p += 4; + return true; + } + return false; + }; + auto const fail_fast = + [&]() + { + if(cp_[0] < 128) + { + return false; + } + + const auto& p = cp_; // alias, only to keep this code similar to valid() above + const auto known_only = p_ - cp_; + if (known_only == 1) + { + if((p[0] & 0xe0) == 0xc0) + { + return ((p[0] & 0x1e) == 0); // overlong + } + if((p[0] & 0xf0) == 0xe0) + { + return false; + } + if((p[0] & 0xf8) == 0xf0) + { + return ((p[0] & 0x07) >= 0x05); // invalid F5...FF characters + } + } + else if (known_only == 2) + { + if((p[0] & 0xe0) == 0xc0) + { + return ((p[1] & 0xc0) != 0x80 || + (p[0] & 0x1e) == 0); // overlong + } + if((p[0] & 0xf0) == 0xe0) + { + return ( (p[1] & 0xc0) != 0x80 + || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong + || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate + } + if((p[0] & 0xf8) == 0xf0) + { + return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters + || (p[1] & 0xc0) != 0x80 + || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong + || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF + } + } + else if (known_only == 3) + { + if((p[0] & 0xe0) == 0xc0) + { + return ( (p[1] & 0xc0) != 0x80 + || (p[0] & 0x1e) == 0); // overlong + } + if((p[0] & 0xf0) == 0xe0) + { + return ( (p[1] & 0xc0) != 0x80 + || (p[2] & 0xc0) != 0x80 + || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong + || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate + //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF + } + if((p[0] & 0xf8) == 0xf0) + { + return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters + || (p[1] & 0xc0) != 0x80 + || (p[2] & 0xc0) != 0x80 + || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong + || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF + } + } + return true; + }; + auto const needed = + [](std::uint8_t const v) + { + if(v < 128) + return 1; + if(v < 192) + return 0; + if(v < 224) + return 2; + if(v < 240) + return 3; + if(v < 248) + return 4; + return 0; + }; + + auto const end = in + size; + + // Finish up any incomplete code point + if(need_ > 0) + { + // Calculate what we have + auto n = (std::min)(size, need_); + size -= n; + need_ -= n; + + // Add characters to the code point + while(n--) + *p_++ = *in++; + BOOST_ASSERT(p_ <= cp_ + 4); + + // Still incomplete? + if(need_ > 0) + { + // Incomplete code point + BOOST_ASSERT(in == end); + + // Do partial validation on the incomplete + // code point, this is called "Fail fast" + // in Autobahn|Testsuite parlance. + return ! fail_fast(); + } + + // Complete code point, validate it + std::uint8_t const* p = &cp_[0]; + if(! valid(p)) + return false; + p_ = cp_; + } + + if(size <= sizeof(std::size_t)) + goto slow; + + // Align `in` to sizeof(std::size_t) boundary + { + auto const in0 = in; + auto last = reinterpret_cast<std::uint8_t const*>( + ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) / + sizeof(std::size_t)) * sizeof(std::size_t)); + + // Check one character at a time for low-ASCII + while(in < last) + { + if(*in & 0x80) + { + // Not low-ASCII so switch to slow loop + size = size - (in - in0); + goto slow; + } + ++in; + } + size = size - (in - in0); + } + + // Fast loop: Process 4 or 8 low-ASCII characters at a time + { + auto const in0 = in; + auto last = in + size - 7; + auto constexpr mask = static_cast< + std::size_t>(0x8080808080808080 & ~std::size_t{0}); + while(in < last) + { +#if 0 + std::size_t temp; + std::memcpy(&temp, in, sizeof(temp)); + if((temp & mask) != 0) +#else + // Technically UB but works on all known platforms + if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0) +#endif + { + size = size - (in - in0); + goto slow; + } + in += sizeof(std::size_t); + } + // There's at least one more full code point left + last += 4; + while(in < last) + if(! valid(in)) + return false; + goto tail; + } + +slow: + // Slow loop: Full validation on one code point at a time + { + auto last = in + size - 3; + while(in < last) + if(! valid(in)) + return false; + } + +tail: + // Handle the remaining bytes. The last + // characters could split a code point so + // we save the partial code point for later. + // + // On entry to the loop, `in` points to the + // beginning of a code point. + // + for(;;) + { + // Number of chars left + auto n = end - in; + if(! n) + break; + + // Chars we need to finish this code point + auto const need = needed(*in); + if(need == 0) + return false; + if(need <= n) + { + // Check a whole code point + if(! valid(in)) + return false; + } + else + { + // Calculate how many chars we need + // to finish this partial code point + need_ = need - n; + + // Save the partial code point + while(n--) + *p_++ = *in++; + BOOST_ASSERT(in == end); + BOOST_ASSERT(p_ <= cp_ + 4); + + // Do partial validation on the incomplete + // code point, this is called "Fail fast" + // in Autobahn|Testsuite parlance. + return ! fail_fast(); + } + } + return true; +} + +bool +check_utf8(char const* p, std::size_t n) +{ + utf8_checker c; + if(! c.write(reinterpret_cast<const uint8_t*>(p), n)) + return false; + return c.finish(); +} + +} // detail +} // websocket +} // beast +} // boost + +#endif // BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP |