// // Copyright (c) 2015 Artyom Beilis (Tonkikh) // // Distributed under the Boost Software License, Version 1.0. (See // accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP #define BOOST_LOCALE_GENERIC_CODECVT_HPP #include #include #include #include namespace boost { namespace locale { #ifndef BOOST_LOCALE_DOXYGEN // // Make sure that mbstate can keep 16 bit of UTF-16 sequence // BOOST_STATIC_ASSERT(sizeof(std::mbstate_t)>=2); #endif #if defined(_MSC_VER) && _MSC_VER < 1700 // up to MSVC 11 (2012) do_length is non-standard it counts wide characters instead of narrow and does not change mbstate #define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST #endif /// /// \brief A base class that used to define constants for generic_codecvt /// class generic_codecvt_base { public: /// /// Initail state for converting to or from unicode code points, used by initial_state in derived classes /// enum initial_convertion_state { to_unicode_state, ///< The state would be used by to_unicode functions from_unicode_state ///< The state would be used by from_unicode functions }; }; /// /// \brief Geneneric generic codecvt facet, various stateless encodings to UTF-16 and UTF-32 using wchar_t, char32_t and char16_t /// /// Implementations should dervide from this class defining itself as CodecvtImpl and provide following members /// /// - `state_type` - a type of special object that allows to store intermediate cached data, for example `iconv_t` descriptor /// - `state_type initial_state(generic_codecvt_base::initial_convertion_state direction) const` - member function that creates initial state /// - `int max_encoding_length() const` - a maximal length that one Unicode code point is represented, for UTF-8 for example it is 4 from ISO-8859-1 it is 1 /// - `utf::code_point to_unicode(state_type &state,char const *&begin,char const *end)` - extract first code point from the text in range [begin,end), in case of success begin would point to the next character sequence to be encoded to next code point, in case of incomplete sequence - utf::incomplete shell be returned, and in case of invalid input sequence utf::illegal shell be returned and begin would remain unmodified /// - `utf::code_point from_unicode(state_type &state,utf::code_point u,char *begin,char const *end)` - convert a unicode code point `u` into a character seqnece at [begin,end). Return the length of the sequence in case of success, utf::incomplete in case of not enough room to encode the code point of utf::illegal in case conversion can not be performed /// /// /// For example implementaion of codecvt for latin1/ISO-8859-1 character set /// /// \code /// /// template /// class latin1_codecvt :boost::locale::generic_codecvt > /// { /// public: /// /// /* Standard codecvt constructor */ /// latin1_codecvt(size_t refs = 0) : boost::locale::generic_codecvt >(refs) /// { /// } /// /// /* State is unused but required by generic_codecvt */ /// struct state_type {}; /// /// state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const /// { /// return state_type(); /// } /// /// int max_encoding_length() const /// { /// return 1; /// } /// /// boost::locale::utf::code_point to_unicode(state_type &,char const *&begin,char const *end) const /// { /// if(begin == end) /// return boost::locale::utf::incomplete; /// return *begin++; /// } /// /// boost::locale::utf::code_point from_unicode(state_type &,boost::locale::utf::code_point u,char *begin,char const *end) const /// { /// if(u >= 256) /// return boost::locale::utf::illegal; /// if(begin == end) /// return boost::locale::utf::incomplete; /// *begin = u; /// return 1; /// } /// }; /// /// \endcode /// /// When external tools used for encoding conversion, the `state_type` is useful to save objects used for conversions. For example, /// icu::UConverter can be saved in such a state for an efficient use: /// /// \code /// template /// class icu_codecvt :boost::locale::generic_codecvt > /// { /// public: /// /// /* Standard codecvt constructor */ /// icu_codecvt(std::string const &name,refs = 0) : /// boost::locale::generic_codecvt >(refs) /// { ... } /// /// /* State is unused but required by generic_codecvt */ /// struct std::unique_ptr state_type; /// /// state_type &&initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const /// { /// UErrorCode err = U_ZERO_ERROR; /// state_type ptr(ucnv_safeClone(converter_,0,0,&err,ucnv_close); /// return std::move(ptr); /// } /// /// boost::locale::utf::code_point to_unicode(state_type &ptr,char const *&begin,char const *end) const /// { /// UErrorCode err = U_ZERO_ERROR; /// boost::locale::utf::code_point cp = ucnv_getNextUChar(ptr.get(),&begin,end,&err); /// ... /// } /// ... /// }; /// \endcode /// /// template class generic_codecvt; /// /// \brief UTF-16 to/from UTF-8 codecvt facet to use with char16_t or wchar_t on Windows /// /// Note in order to fit the requirements of usability by std::wfstream it uses mbstate_t /// to handle intermediate states in handling of variable length UTF-16 sequences /// /// Its member functions implement standard virtual functions of basic codecvt /// template class generic_codecvt : public std::codecvt, public generic_codecvt_base { public: typedef CharType uchar; generic_codecvt(size_t refs = 0) : std::codecvt(refs) { } CodecvtImpl const &implementation() const { return *static_cast(this); } protected: virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const { boost::uint16_t &state = *reinterpret_cast(&s); #ifdef DEBUG_CODECVT std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl; #endif if(state != 0) return std::codecvt_base::error; next=from; return std::codecvt_base::ok; } virtual int do_encoding() const throw() { return 0; } virtual int do_max_length() const throw() { return implementation().max_encoding_length(); } virtual bool do_always_noconv() const throw() { return false; } virtual int do_length( std::mbstate_t #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST const #endif &std_state, char const *from, char const *from_end, size_t max) const { #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST char const *save_from = from; boost::uint16_t &state = *reinterpret_cast(&std_state); #else size_t save_max = max; boost::uint16_t state = *reinterpret_cast(&std_state); #endif typedef typename CodecvtImpl::state_type state_type; state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state); while(max > 0 && from < from_end){ char const *prev_from = from; boost::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end); if(ch==boost::locale::utf::incomplete || ch==boost::locale::utf::illegal) { from = prev_from; break; } max --; if(ch > 0xFFFF) { if(state == 0) { from = prev_from; state = 1; } else { state = 0; } } } #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST return from - save_from; #else return save_max - max; #endif } virtual std::codecvt_base::result do_in( std::mbstate_t &std_state, char const *from, char const *from_end, char const *&from_next, uchar *to, uchar *to_end, uchar *&to_next) const { std::codecvt_base::result r=std::codecvt_base::ok; // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT()) // according to standard. We use it to keep a flag 0/1 for surrogate pair writing // // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd // and first pair is written, but no input consumed boost::uint16_t &state = *reinterpret_cast(&std_state); typedef typename CodecvtImpl::state_type state_type; state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state); while(to < to_end && from < from_end) { #ifdef DEBUG_CODECVT std::cout << "Entering IN--------------" << std::endl; std::cout << "State " << std::hex << state <> 10; boost::uint16_t vl = ch & 0x3FF; boost::uint16_t w1 = vh + 0xD800; boost::uint16_t w2 = vl + 0xDC00; if(state == 0) { from = from_saved; *to++ = w1; state = 1; } else { *to++ = w2; state = 0; } } } from_next=from; to_next=to; if(r == std::codecvt_base::ok && (from!=from_end || state!=0)) r = std::codecvt_base::partial; #ifdef DEBUG_CODECVT std::cout << "Returning "; switch(r) { case std::codecvt_base::ok: std::cout << "ok" << std::endl; break; case std::codecvt_base::partial: std::cout << "partial" << std::endl; break; case std::codecvt_base::error: std::cout << "error" << std::endl; break; default: std::cout << "other" << std::endl; break; } std::cout << "State " << std::hex << state <=2 in order // to be able to store first observerd surrogate pair // // State: state!=0 - a first surrogate pair was observerd (state = first pair), // we expect the second one to come and then zero the state /// boost::uint16_t &state = *reinterpret_cast(&std_state); typedef typename CodecvtImpl::state_type state_type; state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state); while(to < to_end && from < from_end) { #ifdef DEBUG_CODECVT std::cout << "Entering OUT --------------" << std::endl; std::cout << "State " << std::hex << state < class generic_codecvt : public std::codecvt, public generic_codecvt_base { public: typedef CharType uchar; generic_codecvt(size_t refs = 0) : std::codecvt(refs) { } CodecvtImpl const &implementation() const { return *static_cast(this); } protected: virtual std::codecvt_base::result do_unshift(std::mbstate_t &/*s*/,char *from,char * /*to*/,char *&next) const { next=from; return std::codecvt_base::ok; } virtual int do_encoding() const throw() { return 0; } virtual int do_max_length() const throw() { return implementation().max_encoding_length(); } virtual bool do_always_noconv() const throw() { return false; } virtual int do_length( std::mbstate_t #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST const #endif &/*state*/, char const *from, char const *from_end, size_t max) const { #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST char const *start_from = from; #else size_t save_max = max; #endif typedef typename CodecvtImpl::state_type state_type; state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state); while(max > 0 && from < from_end){ char const *save_from = from; boost::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end); if(ch==boost::locale::utf::incomplete || ch==boost::locale::utf::illegal) { from = save_from; break; } max--; } #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST return from - start_from; #else return save_max - max; #endif } virtual std::codecvt_base::result do_in( std::mbstate_t &/*state*/, char const *from, char const *from_end, char const *&from_next, uchar *to, uchar *to_end, uchar *&to_next) const { std::codecvt_base::result r=std::codecvt_base::ok; // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT()) // according to standard. We use it to keep a flag 0/1 for surrogate pair writing // // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd // and first pair is written, but no input consumed typedef typename CodecvtImpl::state_type state_type; state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state); while(to < to_end && from < from_end) { #ifdef DEBUG_CODECVT std::cout << "Entering IN--------------" << std::endl; std::cout << "State " << std::hex << state < class generic_codecvt : public std::codecvt, public generic_codecvt_base { public: typedef CharType uchar; CodecvtImpl const &implementation() const { return *static_cast(this); } generic_codecvt(size_t refs = 0) : std::codecvt(refs) { } }; } // locale } // namespace boost #endif /// // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4