summaryrefslogtreecommitdiff
path: root/boost/locale/util.hpp
blob: 4d8206b52823cca3089465a56b3c20621eef4226 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef BOOST_LOCALE_UTIL_HPP
#define BOOST_LOCALE_UTIL_HPP
#include <locale>
#include <typeinfo>
#include <boost/cstdint.hpp>
#include <boost/locale/utf.hpp>
#include <boost/locale/generator.hpp>
#include <boost/assert.hpp>

#include <vector>
namespace boost {
namespace locale {
///
/// \brief This namespace provides various utility functions useful for Boost.Locale backend
/// implementations
///
namespace util {
    
    ///
    /// \brief Return the default system locale name in POSIX format.
    ///
    /// This function tries to detect the locale using the LC_CTYPE, LC_ALL and LANG environment
    /// variables, in this order. If all of them are unset, on POSIX platforms it returns "C".
    ///
    /// On Windows, in addition to checking the above environment variables, this function
    /// tries to create the locale name from the ISO-639 language and ISO-3166 country codes defined
    /// for the user default locale.
    /// If \a use_utf8_on_windows is true, the encoding is set to UTF-8. Otherwise, if the system
    /// locale supports an ANSI code page, that ANSI encoding (like windows-1252) is used; if no
    /// ANSI code page is available, it falls back to UTF-8.
    ///
    /// \param use_utf8_on_windows on Windows, select UTF-8 as the encoding instead of the ANSI code page
    /// \return the detected locale name
    ///
    BOOST_LOCALE_DECL
    std::string get_system_locale(bool use_utf8_on_windows = false);

    ///
    /// \brief Installs the information facet into locale \a in, based on the locale name \a name
    ///
    /// This function installs the boost::locale::info facet into the locale \a in and returns
    /// the newly created locale.
    ///
    /// Note: all information is based only on parsing of the string \a name.
    ///
    /// The name has the following format: language[_COUNTRY][.encoding][\@variant]
    /// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166
    /// country identifier like "US" or "RU", encoding is a character set name
    /// like UTF-8 or ISO-8859-1, and variant is a backend-specific variant like \c euro or
    /// calendar=hebrew.
    ///
    /// If some parameters are missing they are specified as blanks; the default encoding
    /// is assumed to be US-ASCII and a missing language is assumed to be "C".
    ///
    /// \param in   locale the new locale is based on
    /// \param name locale name to parse, in the format described above
    /// \return a new locale that carries the info facet
    ///
    BOOST_LOCALE_DECL
    std::locale create_info(std::locale const &in,std::string const &name); 


    ///
    /// \brief This class represents a simple stateless converter to and from UCS-4 that
    ///  handles a single code point at a time
    ///
    /// This class is used to create a std::codecvt facet that converts between the UTF-16/UTF-32
    /// encodings and the encoding supported by this converter.
    ///
    /// Please note that this converter should be fully stateless. Fully stateless means it should
    /// never assume that it is called in any specific order on the text. Even if the
    /// encoding itself seems to be stateless, like windows-1255 or Shift-JIS, some
    /// encoders (most notably iconv) can actually compose several code points into one or
    /// decompose them when composite characters are found. So be very careful when implementing
    /// these converters for a particular character set.
    ///
    class base_converter {
    public:

        ///
        /// Value returned when an illegal input sequence or code-point is encountered,
        /// for example a UTF-32 code-point that lies in the range reserved for UTF-16
        /// surrogates, or an invalid UTF-8 sequence.
        ///
        static const uint32_t illegal = utf::illegal;

        ///
        /// Value returned when either the input sequence is incomplete or the output
        /// buffer is too small for the complete output to be written.
        ///
        static const uint32_t incomplete = utf::incomplete;

        virtual ~base_converter()
        {
        }

        ///
        /// Return the maximal number of bytes a single Unicode code-point can be converted to,
        /// for example 4 for UTF-8, 2 for Shift-JIS and 1 for ISO-8859-1.
        ///
        /// The default implementation returns 1.
        ///
        virtual int max_len() const
        {
            return 1;
        }

        ///
        /// Returns true if calling from_unicode, to_unicode, and max_len is thread safe.
        ///
        /// Rule of thumb: if the implementation relies only on tables that never change, or is
        /// purely algorithmic like UTF-8, so that independent to_unicode and from_unicode calls
        /// share no mutable state, it may return true. Otherwise, for example when an iconv_t
        /// descriptor or a UConverter is used as the conversion object, return false and
        /// this object will be cloned for each use.
        ///
        /// The default implementation returns false.
        ///
        virtual bool is_thread_safe() const
        {
            return false;
        }

        ///
        /// Create a polymorphic copy of this object; usually called only when is_thread_safe()
        /// returns false.
        ///
        virtual base_converter *clone() const
        {
            // This implementation always produces a plain base_converter, so it is only
            // valid when the dynamic type is base_converter itself.
            BOOST_ASSERT(typeid(base_converter)==typeid(*this));
            return new base_converter();
        }

        ///
        /// Convert a single character starting at \a begin and ending at most at \a end to a
        /// Unicode code-point.
        ///
        /// - Valid input: if there is a valid sequence [\a begin,\a code_point_end) with
        ///   \a begin < \a code_point_end && \a code_point_end <= \a end, its Unicode code point
        ///   is returned and \a begin is set to \a code_point_end.
        ///
        /// - Incomplete input: if [\a begin,\a end) may be the start of a valid sequence
        ///   [\a begin,\a code_point_end) with \a code_point_end > \a end, \a incomplete is
        ///   returned and \a begin stays unchanged. For UTF-8, *begin = 0xc2 with
        ///   \a begin + 1 = \a end is such a situation.
        ///
        /// - Invalid input: if there is a sequence [\a begin,\a code_point_end) with
        ///   \a code_point_end <= \a end that is illegal for this encoding, \a illegal is
        ///   returned and \a begin stays unchanged. For UTF-8, *begin = 0xFF with
        ///   \a begin < \a end is such a situation.
        ///
        /// The default implementation accepts only bytes in the range 0x00-0x7F.
        ///
        virtual uint32_t to_unicode(char const *&begin,char const *end)
        {
            if(begin == end)
                return incomplete;
            unsigned char const byte = static_cast<unsigned char>(*begin);
            if(byte > 0x7F)
                return illegal;
            // Consume the byte only once it is known to be valid.
            ++begin;
            return byte;
        }

        ///
        /// Convert a single code-point \a u into this encoding and store it in the
        /// [\a begin,\a end) range.
        ///
        /// If \a u is an invalid Unicode code-point, or it can not be mapped to the represented
        /// character set, \a illegal should be returned.
        ///
        /// If \a u can be converted to a sequence of bytes c1, ... , cN (1 <= N <= max_len()) then
        ///
        /// -# If end - begin >= N, c1, ... cN are written starting at \a begin and N is returned
        /// -# If end - begin < N, \a incomplete is returned; it is unspecified what is
        ///    stored in the bytes of the range [\a begin,\a end)
        ///
        /// The default implementation checks for an empty output range before it checks \a u,
        /// and maps only code-points below 0x80.
        ///
        virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
        {
            if(end == begin)
                return incomplete;
            if(u > 0x7F)
                return illegal;
            begin[0] = static_cast<char>(u);
            return 1;
        }
    };

    ///
    /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
    /// Unicode code points.
    ///
    /// \return the newly created converter; ownership is held by the returned std::auto_ptr
    ///
    // NOTE(review): std::auto_ptr is deprecated since C++11 and removed in C++17; this
    // declaration needs a standard library that still provides it.
    BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter();
    ///
    /// This function creates a \a base_converter that can be used for conversion between single-byte
    /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
    ///
    /// If \a encoding is not supported, an empty pointer is returned. You should check that
    /// std::auto_ptr<base_converter>::get() != 0
    ///
    /// \param encoding name of the single-byte character encoding
    /// \return the newly created converter, or an empty pointer if \a encoding is not supported
    ///
    BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding);


    ///
    /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in
    /// and uses the new facet.
    ///
    /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the
    /// \a cvt converter. If \a cvt is a null pointer, a conversion that always fails is installed:
    /// it fails on the very first input or output.
    ///
    /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings; it knows how to split
    /// Unicode code-points above 0xFFFF into surrogate pairs and join them back correctly.
    /// \a cvt should be unaware of the wide encoding type.
    ///
    /// \param in   locale the new locale is based on
    /// \param cvt  converter used by the facet; passed by value as std::auto_ptr, may be null
    /// \param type character facet type to install the codecvt facet for
    ///             (NOTE(review): exact semantics are defined by character_facet_type, which is
    ///             declared outside this file - confirm there)
    /// \return a new locale based on \a in that uses the new codecvt facet
    ///
    BOOST_LOCALE_DECL
    std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type);

} // util
} // locale 
} // boost

#endif
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4