//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
#define BOOST_LOCALE_UTF_HPP_INCLUDED

#include <stdint.h> // uint16_t, uint32_t

namespace boost {
namespace locale {
///
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// All functions defined in this namespace do not require linking with Boost.Locale library
///
namespace utf {
    /// \cond INTERNAL
    #ifdef __GNUC__
    #   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
    #   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
    #else
    #   define BOOST_LOCALE_LIKELY(x)   (x)
    #   define BOOST_LOCALE_UNLIKELY(x) (x)
    #endif
    /// \endcond

    ///
    /// \brief The integral type that can hold a Unicode code point
    ///
    typedef uint32_t code_point;

    ///
    /// \brief Special constant that defines illegal code point
    ///
    static const code_point illegal = 0xFFFFFFFFu;

    ///
    /// \brief Special constant that defines incomplete code point
    ///
    static const code_point incomplete = 0xFFFFFFFEu;

    ///
    /// \brief the function checks if \a v is a valid code point
    ///
    /// A value is rejected when it is above 0x10FFFF or lies in the
    /// surrogate range [0xD800,0xDFFF]. Note that \ref illegal and
    /// \ref incomplete are both above 0x10FFFF and are therefore rejected.
    ///
    inline bool is_valid_codepoint(code_point v)
    {
        if(v>0x10FFFF)
            return false;
        if(0xD800 <=v && v<= 0xDFFF) // surrogates
            return false;
        return true;
    }

    #ifdef BOOST_LOCALE_DOXYGEN
    ///
    /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
    ///
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits {
        ///
        /// The type of the character
        ///
        typedef CharType char_type;
        ///
        /// Read one code point from the range [p,e) and return it.
        ///
        /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
        /// - If illegal sequence detected returns \ref illegal
        ///
        /// Requirements
        ///
        /// - Iterator is valid input iterator
        ///
        /// Postconditions
        ///
        /// - p points one past the last consumed code unit
        ///
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e);

        ///
        /// Maximal width of valid sequence in the code units:
        ///
        /// - UTF-8  - 4
        /// - UTF-16 - 2
        /// - UTF-32 - 1
        ///
        static const int max_width;
        ///
        /// The width of specific code point in the code units.
        ///
        /// Requirement: value is a valid Unicode code point
        /// Returns value in range [1..max_width]
        ///
        static int width(code_point value);

        ///
        /// Get the size of the trail part of variable length encoded sequence.
        ///
        /// Returns -1 if C is not valid lead character
        ///
        static int trail_length(char_type c);
        ///
        /// Returns true if c is trail code unit, always false for UTF-32
        ///
        static bool is_trail(char_type c);
        ///
        /// Returns true if c is lead code unit, always true for UTF-32
        ///
        static bool is_lead(char_type c);

        ///
        /// Convert valid Unicode code point \a value to the UTF sequence.
        ///
        /// Requirements:
        ///
        /// - \a value is valid code point
        /// - \a out is an output iterator should be able to accept at least width(value) units
        ///
        /// Returns the iterator past the last written code unit.
        ///
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out);
        ///
        /// Decodes valid UTF sequence that is pointed by p into code point.
        ///
        /// If the sequence is invalid or points to end the behavior is undefined
        ///
        template<typename Iterator>
        static code_point decode_valid(Iterator &p);
    };

    #else

    // Primary template: the encoding is selected by the size of the code unit.
    template<typename CharType,int size=sizeof(CharType)>
    struct utf_traits;

    // UTF-8: one-byte code units, one to four units per code point.
    template<typename CharType>
    struct utf_traits<CharType,1> {

        typedef CharType char_type;

        // Number of trail bytes announced by lead byte \a ci, or -1 when
        // \a ci cannot start a sequence: 0x80..0xBF are trail bytes,
        // 0xC0/0xC1 could only start overlong two-byte forms, and anything
        // above 0xF4 would encode a value beyond 0x10FFFF.
        static int trail_length(char_type ci)
        {
            unsigned char c = ci;
            if(c < 128)
                return 0;
            if(BOOST_LOCALE_UNLIKELY(c < 194))
                return -1;
            if(c < 224)
                return 1;
            if(c < 240)
                return 2;
            if(BOOST_LOCALE_LIKELY(c <=244))
                return 3;
            return -1;
        }

        static const int max_width = 4;

        // Number of bytes needed to encode \a value (1..4); the caller is
        // expected to pass a valid code point.
        static int width(code_point value)
        {
            if(value <=0x7F) {
                return 1;
            }
            else if(value <=0x7FF) {
                return 2;
            }
            else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
                return 3;
            }
            else {
                return 4;
            }
        }

        // True for continuation bytes, i.e. bytes of the form 10xxxxxx.
        static bool is_trail(char_type ci)
        {
            unsigned char c=ci;
            return (c & 0xC0)==0x80;
        }

        // True for every byte that is not a continuation byte.
        static bool is_lead(char_type ci)
        {
            return !is_trail(ci);
        }

        // Checked decode of one code point from [p,e).
        //
        // Returns \ref incomplete when the range ends before the sequence
        // does, \ref illegal for a bad lead byte, a non-trail byte where a
        // trail byte is required, an out-of-range or surrogate value, or a
        // non-shortest (overlong) encoding. \a p is advanced past every
        // byte that was read, including an offending one.
        template<typename Iterator>
        static code_point decode(Iterator &p,Iterator e)
        {
            if(BOOST_LOCALE_UNLIKELY(p==e))
                return incomplete;

            unsigned char lead = *p++;

            // First byte is fully validated here
            int trail_size = trail_length(lead);

            if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
                return illegal;

            //
            // Ok as only ASCII may be of size = 0
            // also optimize for ASCII text
            //
            if(trail_size == 0)
                return lead;

            // Keep only the payload bits of the lead byte:
            // 5 bits for 2-byte, 4 for 3-byte, 3 for 4-byte sequences.
            code_point c = lead & ((1<<(6-trail_size))-1);

            // Read the rest; each case intentionally continues into the
            // next one so that exactly trail_size bytes are consumed.
            unsigned char tmp;
            switch(trail_size) {
            case 3:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 2:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
                // fall through
            case 1:
                if(BOOST_LOCALE_UNLIKELY(p==e))
                    return incomplete;
                tmp = *p++;
                if (!is_trail(tmp))
                    return illegal;
                c = (c << 6) | ( tmp & 0x3F);
            }

            // Check code point validity: no surrogates and
            // valid range
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return illegal;

            // make sure it is the most compact representation
            if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
                return illegal;

            return c;
        }

        // Unchecked decode: the caller guarantees that \a p points at a
        // complete, well-formed sequence. No validation is performed.
        template<typename Iterator>
        static code_point decode_valid(Iterator &p)
        {
            unsigned char lead = *p++;
            if(lead < 192)
                return lead;

            int trail_size;

            if(lead < 224)
                trail_size = 1;
            else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
                trail_size = 2;
            else
                trail_size = 3;

            code_point c = lead & ((1<<(6-trail_size))-1);

            // Cases intentionally continue into the next one.
            switch(trail_size) {
            case 3:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 2:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
                // fall through
            case 1:
                c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
            }

            return c;
        }

        // Writes the UTF-8 form of \a value to \a out and returns the
        // iterator past the last written byte. \a value is not validated.
        template<typename Iterator>
        static Iterator encode(code_point value,Iterator out)
        {
            if(value <= 0x7F) {
                *out++ = static_cast<char_type>(value);
            }
            else if(value <= 0x7FF) {
                *out++ = static_cast<char_type>((value >> 6) | 0xC0);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
                *out++ = static_cast<char_type>((value >> 12) | 0xE0);
                *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            else {
                *out++ = static_cast<char_type>((value >> 18) | 0xF0);
                *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
                *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
            }
            return out;
        }
    }; // utf8

    // UTF-16: two-byte code units, one or two units per code point.
    template<typename CharType>
    struct utf_traits<CharType,2> {
        typedef CharType char_type;

        // See RFC 2781
        static bool is_first_surrogate(uint16_t x)
        {
            return 0xD800 <=x && x<= 0xDBFF;
        }
        static bool is_second_surrogate(uint16_t x)
        {
            return 0xDC00 <=x && x<= 0xDFFF;
        }
        // Joins the 10 payload bits of each surrogate and re-adds the
        // 0x10000 offset that the surrogate encoding removes.
        static code_point combine_surrogate(uint16_t w1,uint16_t w2)
        {
            return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
        }
        // 1 for a first surrogate, -1 for a second surrogate (it cannot
        // start a sequence), 0 for any other unit.
        static int trail_length(char_type c)
        {
            if(is_first_surrogate(c))
                return 1;
            if(is_second_surrogate(c))
                return -1;
            return 0;
        }
        ///
        /// Returns true if c is trail code unit, i.e. a second surrogate
        ///
        static bool is_trail(char_type c)
        {
            return is_second_surrogate(c);
        }
        ///
        /// Returns true if c is lead code unit, i.e. anything but a second surrogate
        ///
        static bool is_lead(char_type c)
        {
            return !is_second_surrogate(c);
        }

        // Checked decode of one code point from [current,last).
        //
        // Returns \ref incomplete when the range is empty or ends after a
        // first surrogate, \ref illegal for a lone second surrogate or a
        // first surrogate that is not followed by a second one. \a current
        // is advanced past every unit that was read.
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return incomplete;
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            if(w1 > 0xDBFF)
                return illegal;
            if(current==last)
                return incomplete;
            uint16_t w2=*current++;
            if(w2 < 0xDC00 || 0xDFFF < w2)
                return illegal;
            return combine_surrogate(w1,w2);
        }

        // Unchecked decode: the caller guarantees a well-formed sequence.
        template<typename It>
        static code_point decode_valid(It &current)
        {
            uint16_t w1=*current++;
            if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                return w1;
            }
            uint16_t w2=*current++;
            return combine_surrogate(w1,w2);
        }

        static const int max_width = 2;

        // 2 for code points at or above 0x10000, otherwise 1.
        static int width(code_point u)
        {
            return u>=0x10000 ? 2 : 1;
        }

        // Writes the UTF-16 form of \a u to \a out and returns the
        // iterator past the last written unit. \a u is not validated.
        template<typename It>
        static It encode(code_point u,It out)
        {
            if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
                *out++ = static_cast<char_type>(u);
            }
            else {
                u -= 0x10000;
                *out++ = static_cast<char_type>(0xD800 | (u>>10));
                *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
            }
            return out;
        }
    }; // utf16;

    // UTF-32: four-byte code units, always one unit per code point.
    template<typename CharType>
    struct utf_traits<CharType,4> {
        typedef CharType char_type;

        // 0 when \a c is a valid code point, -1 otherwise.
        static int trail_length(char_type c)
        {
            if(is_valid_codepoint(c))
                return 0;
            return -1;
        }
        // There are no trail units in UTF-32.
        static bool is_trail(char_type /*c*/)
        {
            return false;
        }
        // Every unit is a lead unit in UTF-32.
        static bool is_lead(char_type /*c*/)
        {
            return true;
        }

        // Unchecked decode: returns the unit as is and advances \a current.
        template<typename It>
        static code_point decode_valid(It &current)
        {
            return *current++;
        }

        // Checked decode: \ref incomplete for an empty range, \ref illegal
        // when the unit is not a valid code point. \a current is advanced
        // past the unit that was read.
        template<typename It>
        static code_point decode(It &current,It last)
        {
            if(BOOST_LOCALE_UNLIKELY(current == last))
                return boost::locale::utf::incomplete;
            code_point c=*current++;
            if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
                return boost::locale::utf::illegal;
            return c;
        }

        static const int max_width = 1;

        static int width(code_point /*u*/)
        {
            return 1;
        }

        // Writes \a u as a single unit; \a u is not validated.
        template<typename It>
        static It encode(code_point u,It out)
        {
            *out++ = static_cast<char_type>(u);
            return out;
        }

    }; // utf32

    #endif

} // utf
} // locale
} // boost

#endif

// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4