summary refs log tree commit diff
path: root/boost/beast/websocket/detail/utf8_checker.ipp
diff options
context:
space:
mode:
Diffstat (limited to 'boost/beast/websocket/detail/utf8_checker.ipp')
-rw-r--r-- boost/beast/websocket/detail/utf8_checker.ipp 331
1 files changed, 331 insertions, 0 deletions
diff --git a/boost/beast/websocket/detail/utf8_checker.ipp b/boost/beast/websocket/detail/utf8_checker.ipp
new file mode 100644
index 0000000000..64a293456b
--- /dev/null
+++ b/boost/beast/websocket/detail/utf8_checker.ipp
@@ -0,0 +1,331 @@
+//
+// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/boostorg/beast
+//
+
+#ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
+#define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
+
+#include <boost/beast/websocket/detail/utf8_checker.hpp>
+
+#include <boost/assert.hpp>
+
+namespace boost {
+namespace beast {
+namespace websocket {
+namespace detail {
+
+// Discard any partially buffered code point so the next
+// write() starts at a code point boundary.
+void
+utf8_checker::
+reset()
+{
+    // Rewind the cursor to the start of the code point buffer
+    p_ = cp_;
+    // No bytes are outstanding
+    need_ = 0;
+}
+
+// Signal the end of the input. Returns true if no code point
+// was left incomplete; the checker is reset in either case.
+bool
+utf8_checker::
+finish()
+{
+    // Sample the state first, reset() clears need_
+    bool const pending = (need_ != 0);
+    reset();
+    return ! pending;
+}
+
+bool /* Validate the next `size` bytes at `in` as a continuation of a UTF-8 stream.
+ *
+ * A code point that is split across calls is buffered in cp_ (p_ is the
+ * write cursor into cp_, need_ is the number of bytes still missing), so
+ * the input may be supplied in arbitrary pieces.
+ *
+ * Returns false as soon as the input is known to be invalid; this includes
+ * an incomplete trailing code point that can no longer become valid.
+ * Returns true otherwise. A true result may leave a partial code point
+ * pending in cp_ with need_ > 0.
+ */
+utf8_checker::
+write(std::uint8_t const* in, std::size_t size)
+{
+ auto const valid = // Checks one complete code point at p; on success advances p past it.
+ [](std::uint8_t const*& p) // No bounds checks here: call sites below only call it when the whole code point is readable.
+ {
+ if(p[0] < 128) // 0xxxxxxx: single byte
+ {
+ ++p;
+ return true;
+ }
+ if((p[0] & 0xe0) == 0xc0) // 110xxxxx: two-byte sequence
+ {
+ if( (p[1] & 0xc0) != 0x80 || // second byte must be 10xxxxxx
+ (p[0] & 0x1e) == 0) // overlong (lead byte 0xC0 or 0xC1)
+ return false;
+ p += 2;
+ return true;
+ }
+ if((p[0] & 0xf0) == 0xe0) // 1110xxxx: three-byte sequence
+ {
+ if( (p[1] & 0xc0) != 0x80 // trailing bytes must be 10xxxxxx
+ || (p[2] & 0xc0) != 0x80
+ || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong (E0 followed by 80..9F)
+ || (p[0] == 0xed && (p[1] & 0x20) == 0x20) // surrogate (ED followed by A0..BF)
+ //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
+ )
+ return false;
+ p += 3;
+ return true;
+ }
+ if((p[0] & 0xf8) == 0xf0) // 11110xxx: four-byte sequence
+ {
+ if( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
+ || (p[1] & 0xc0) != 0x80 // trailing bytes must be 10xxxxxx
+ || (p[2] & 0xc0) != 0x80
+ || (p[3] & 0xc0) != 0x80
+ || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong (F0 followed by 80..8F)
+ || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
+ )
+ return false;
+ p += 4;
+ return true;
+ }
+ return false; // lead byte is 0x80..0xBF or 0xF8..0xFF
+ };
+ auto const fail_fast = // Returns true if the partial code point buffered in cp_ must be rejected already.
+ [&]() // Applies the same rules as valid(), restricted to the bytes received so far.
+ {
+ if(cp_[0] < 128) // single-byte lead: nothing to reject
+ {
+ return false;
+ }
+
+ const auto& p = cp_; // alias, only to keep this code similar to valid() above
+ const auto known_only = p_ - cp_; // number of bytes buffered so far
+ if (known_only == 1) // only the lead byte is known
+ {
+ if((p[0] & 0xe0) == 0xc0)
+ {
+ return ((p[0] & 0x1e) == 0); // overlong
+ }
+ if((p[0] & 0xf0) == 0xe0)
+ {
+ return false; // a three-byte lead is never rejected on its own
+ }
+ if((p[0] & 0xf8) == 0xf0)
+ {
+ return ((p[0] & 0x07) >= 0x05); // invalid F5...FF characters
+ }
+ }
+ else if (known_only == 2) // lead byte plus one trailing byte
+ {
+ if((p[0] & 0xe0) == 0xc0)
+ {
+ return ((p[1] & 0xc0) != 0x80 ||
+ (p[0] & 0x1e) == 0); // overlong
+ }
+ if((p[0] & 0xf0) == 0xe0)
+ {
+ return ( (p[1] & 0xc0) != 0x80
+ || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
+ || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
+ }
+ if((p[0] & 0xf8) == 0xf0)
+ {
+ return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
+ || (p[1] & 0xc0) != 0x80
+ || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
+ || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
+ }
+ }
+ else if (known_only == 3) // lead byte plus two trailing bytes
+ {
+ if((p[0] & 0xe0) == 0xc0)
+ {
+ return ( (p[1] & 0xc0) != 0x80
+ || (p[0] & 0x1e) == 0); // overlong
+ }
+ if((p[0] & 0xf0) == 0xe0)
+ {
+ return ( (p[1] & 0xc0) != 0x80
+ || (p[2] & 0xc0) != 0x80
+ || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
+ || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
+ //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
+ }
+ if((p[0] & 0xf8) == 0xf0)
+ {
+ return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
+ || (p[1] & 0xc0) != 0x80
+ || (p[2] & 0xc0) != 0x80
+ || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
+ || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
+ }
+ }
+ return true; // any combination not matched above is treated as a failure
+ };
+ auto const needed = // Total encoded length implied by lead byte v, or 0 if v cannot start a code point.
+ [](std::uint8_t const v)
+ {
+ if(v < 128)
+ return 1;
+ if(v < 192) // 0x80..0xBF is a trailing byte, not a lead byte
+ return 0;
+ if(v < 224)
+ return 2;
+ if(v < 240)
+ return 3;
+ if(v < 248)
+ return 4;
+ return 0; // 0xF8..0xFF
+ };
+
+ auto const end = in + size; // one past the last input byte; `in` and `size` are adjusted below
+
+ // Finish up any incomplete code point left over from a previous call
+ if(need_ > 0)
+ {
+ // Calculate what we have
+ auto n = (std::min)(size, need_); // parentheses keep a function-like `min` macro from expanding here
+ size -= n;
+ need_ -= n;
+
+ // Add characters to the code point
+ while(n--)
+ *p_++ = *in++;
+ BOOST_ASSERT(p_ <= cp_ + 4);
+
+ // Still incomplete?
+ if(need_ > 0)
+ {
+ // Incomplete code point
+ BOOST_ASSERT(in == end);
+
+ // Do partial validation on the incomplete
+ // code point, this is called "Fail fast"
+ // in Autobahn|Testsuite parlance.
+ return ! fail_fast();
+ }
+
+ // Complete code point, validate it
+ std::uint8_t const* p = &cp_[0];
+ if(! valid(p))
+ return false;
+ p_ = cp_; // buffer consumed, rewind the cursor for the next partial code point
+ }
+
+ if(size <= sizeof(std::size_t)) // too little input left for the word-at-a-time path
+ goto slow;
+
+ // Align `in` to sizeof(std::size_t) boundary
+ {
+ auto const in0 = in;
+ auto last = reinterpret_cast<std::uint8_t const*>( // address of `in` rounded up to a multiple of sizeof(std::size_t)
+ ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
+ sizeof(std::size_t)) * sizeof(std::size_t));
+
+ // Check one character at a time for low-ASCII
+ while(in < last)
+ {
+ if(*in & 0x80)
+ {
+ // Not low-ASCII so switch to slow loop
+ size = size - (in - in0); // keep `size` equal to the bytes remaining from `in`
+ goto slow;
+ }
+ ++in;
+ }
+ size = size - (in - in0);
+ }
+
+ // Fast loop: Process 4 or 8 low-ASCII characters at a time
+ {
+ auto const in0 = in;
+ auto last = in + size - 7; // in < last means at least 8 bytes remain, so a word load stays in bounds
+ auto constexpr mask = static_cast< // high bit of every byte, truncated to the width of std::size_t
+ std::size_t>(0x8080808080808080 & ~std::size_t{0});
+ while(in < last)
+ {
+#if 0
+ std::size_t temp;
+ std::memcpy(&temp, in, sizeof(temp));
+ if((temp & mask) != 0)
+#else
+ // Technically UB but works on all known platforms
+ if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
+#endif
+ {
+ size = size - (in - in0); // a byte in this word is not low-ASCII, re-examine it in the slow loop
+ goto slow;
+ }
+ in += sizeof(std::size_t);
+ }
+ // There's at least one more full code point left
+ last += 4; // last is now end - 3, so valid() can read up to 4 bytes from `in`
+ while(in < last)
+ if(! valid(in))
+ return false;
+ goto tail;
+ }
+
+slow:
+ // Slow loop: Full validation on one code point at a time
+ {
+ auto last = in + size - 3; // stop while fewer than 4 bytes remain; the tail loop handles those
+ while(in < last)
+ if(! valid(in))
+ return false;
+ }
+
+tail:
+ // Handle the remaining bytes. The last
+ // characters could split a code point so
+ // we save the partial code point for later.
+ //
+ // On entry to the loop, `in` points to the
+ // beginning of a code point.
+ //
+ for(;;)
+ {
+ // Number of chars left
+ auto n = end - in;
+ if(! n)
+ break;
+
+ // Chars we need to finish this code point
+ auto const need = needed(*in);
+ if(need == 0) // byte cannot start a code point
+ return false;
+ if(need <= n)
+ {
+ // Check a whole code point
+ if(! valid(in))
+ return false;
+ }
+ else
+ {
+ // Calculate how many chars we need
+ // to finish this partial code point
+ need_ = need - n;
+
+ // Save the partial code point
+ while(n--)
+ *p_++ = *in++;
+ BOOST_ASSERT(in == end);
+ BOOST_ASSERT(p_ <= cp_ + 4);
+
+ // Do partial validation on the incomplete
+ // code point, this is called "Fail fast"
+ // in Autobahn|Testsuite parlance.
+ return ! fail_fast();
+ }
+ }
+ return true;
+}
+
+// One-shot check of a complete buffer.
+//
+// Returns true only if the `n` bytes starting at `p` pass
+// utf8_checker::write() and utf8_checker::finish() also succeeds,
+// i.e. the data is valid and does not end inside a code point.
+bool
+check_utf8(char const* p, std::size_t n)
+{
+    utf8_checker c;
+    // finish() is only consulted when write() accepted the data.
+    // The type is spelled std::uint8_t like the rest of this file;
+    // <cstdint> does not guarantee the unqualified name.
+    return c.write(reinterpret_cast<std::uint8_t const*>(p), n)
+        && c.finish();
+}
+
+} // detail
+} // websocket
+} // beast
+} // boost
+
+#endif // BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP