|
||||
File indexing completed on 2025-01-18 09:39:17
0001 // 0002 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 0003 // Copyright (c) 2022-2023 Alexander Grund 0004 // 0005 // Distributed under the Boost Software License, Version 1.0. 0006 // https://www.boost.org/LICENSE_1_0.txt 0007 0008 #ifndef BOOST_LOCALE_UTIL_HPP 0009 #define BOOST_LOCALE_UTIL_HPP 0010 0011 #include <boost/locale/generator.hpp> 0012 #include <boost/locale/utf.hpp> 0013 #include <boost/assert.hpp> 0014 #include <cstdint> 0015 #include <locale> 0016 #include <memory> 0017 #include <typeinfo> 0018 0019 namespace boost { namespace locale { 0020 /// \brief This namespace provides various utility function useful for Boost.Locale's backends 0021 /// implementations 0022 namespace util { 0023 0024 /// \brief Return default system locale name in POSIX format. 0025 /// 0026 /// This function tries to detect the locale using LC_ALL, LC_CTYPE and LANG environment 0027 /// variables in this order and if all of them are unset, on POSIX platforms it returns "C". 0028 /// On Windows additionally to the above environment variables, this function 0029 /// tries to create the locale name from ISO-639 and ISO-3166 country codes defined 0030 /// for the users default locale. 0031 /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8, 0032 /// otherwise, if the system locale supports ANSI codepages it defines the ANSI encoding, e.g. windows-1252, 0033 /// otherwise (if ANSI codepage is not available) it uses UTF-8 encoding. 0034 BOOST_LOCALE_DECL 0035 std::string get_system_locale(bool use_utf8_on_windows = false); 0036 0037 /// \brief Installs information facet to locale \a in based on locale name \a name 0038 /// 0039 /// This function installs boost::locale::info facet into the locale \a in and returns 0040 /// newly created locale. 0041 /// 0042 /// Note: all information is based only on parsing of string \a name; 0043 /// 0044 /// The name has following format: language[_COUNTRY][.encoding][\@variant] 0045 /// Where language is ISO-639 language code like "en" or "ru", COUNTRY is ISO-3166 0046 /// country identifier like "US" or "RU". the Encoding is a character set name 0047 /// like UTF-8 or ISO-8859-1. Variant is backend specific variant like \c euro or 0048 /// calendar=hebrew. 0049 /// 0050 /// If some parameters are missing they are specified as blanks, default encoding 0051 /// is assumed to be US-ASCII and missing language is assumed to be "C" 0052 BOOST_LOCALE_DECL 0053 std::locale create_info(const std::locale& in, const std::string& name); 0054 0055 /// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for 0056 /// each single code point 0057 /// 0058 /// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding 0059 /// to encoding supported by this converter 0060 /// 0061 /// Please note, this converter should be fully stateless. Fully stateless means it should 0062 /// never assume that it is called in any specific order on the text. Even if the 0063 /// encoding itself seems to be stateless like windows-1255 or shift-jis, some 0064 /// encoders (most notably iconv) can actually compose several code-point into one or 0065 /// decompose them in case composite characters are found. So be very careful when implementing 0066 /// these converters for certain character set. 0067 class BOOST_LOCALE_DECL base_converter { 0068 public: 0069 /// This value should be returned when an illegal input sequence or code-point is observed: 0070 /// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates 0071 /// or an invalid UTF-8 sequence is found 0072 static constexpr utf::code_point illegal = utf::illegal; 0073 0074 /// This value is returned in following cases: An incomplete input sequence was found or 0075 /// insufficient output buffer was provided so complete output could not be written. 0076 static constexpr utf::code_point incomplete = utf::incomplete; 0077 0078 virtual ~base_converter(); 0079 0080 /// Return the maximal length that one Unicode code-point can be converted to, for example 0081 /// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1 0082 virtual int max_len() const { return 1; } 0083 0084 /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe. 0085 /// 0086 /// Rule of thumb: if this class' implementation uses simple tables that are unchanged 0087 /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for 0088 /// independent to_unicode, from_unicode calls, you may set it to true, otherwise, 0089 /// for example if you use iconv_t descriptor or UConverter as conversion object return false, 0090 /// and this object will be cloned for each use. 0091 virtual bool is_thread_safe() const { return false; } 0092 0093 /// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false 0094 virtual base_converter* clone() const 0095 { 0096 BOOST_ASSERT(typeid(*this) == typeid(base_converter)); 0097 return new base_converter(); 0098 } 0099 0100 /// Convert a single character starting at begin and ending at most at end to Unicode code-point. 0101 /// 0102 /// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a 0103 /// code_point_end <= \a end it is converted to its Unicode code point equivalent, \a begin is set to \a 0104 /// code_point_end 0105 /// 0106 /// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a 0107 /// code_point_end > \a end and [\a begin, \a code_point_end) would be valid input sequence, then \a 0108 /// incomplete is returned begin stays unchanged, for example for UTF-8 conversion a *begin = 0xc2, \a begin 0109 /// +1 = \a end is such situation. 0110 /// 0111 /// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a 0112 /// code_point_end <= \a end that is illegal for this encoding, \a illegal is returned and begin stays 0113 /// unchanged. For example if *begin = 0xFF and begin < end for UTF-8, then \a illegal is returned. 0114 virtual utf::code_point to_unicode(const char*& begin, const char* end) 0115 { 0116 if(begin == end) 0117 return incomplete; // LCOV_EXCL_LINE 0118 unsigned char cp = *begin; 0119 if(cp <= 0x7F) { 0120 begin++; 0121 return cp; 0122 } 0123 return illegal; 0124 } 0125 0126 /// Convert a single code-point \a u into encoding and store it in [begin,end) range. 0127 /// 0128 /// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set, 0129 /// \a illegal should be returned 0130 /// 0131 /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then 0132 /// 0133 /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned 0134 /// -# If end - begin < N, incomplete is returned, it is unspecified what would be 0135 /// stored in bytes in range [begin,end) 0136 virtual utf::len_or_error from_unicode(utf::code_point u, char* begin, const char* end) 0137 { 0138 if(begin == end) 0139 return incomplete; // LCOV_EXCL_LINE 0140 if(u >= 0x80) 0141 return illegal; 0142 *begin = static_cast<char>(u); 0143 return 1; 0144 } 0145 }; 0146 0147 /// This function creates a \a base_converter that can be used for conversion between UTF-8 and 0148 /// Unicode code points 0149 BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter(); 0150 0151 BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'") 0152 inline std::unique_ptr<base_converter> create_utf8_converter_unique_ptr() 0153 { 0154 return create_utf8_converter(); 0155 } 0156 0157 /// This function creates a \a base_converter that can be used for conversion between single byte 0158 /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points, 0159 /// 0160 /// If \a encoding is not supported, empty pointer is returned. 0161 /// So you should check whether the returned pointer is valid/non-NULL 0162 BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter(const std::string& encoding); 0163 0164 BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'") 0165 inline std::unique_ptr<base_converter> create_simple_converter_unique_ptr(const std::string& encoding) 0166 { 0167 return create_simple_converter(encoding); 0168 } 0169 0170 /// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new 0171 /// facet. 0172 /// 0173 /// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter. 0174 /// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or 0175 /// output. 0176 /// 0177 /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join 0178 /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware 0179 /// of wide encoding type 0180 BOOST_LOCALE_DECL 0181 std::locale create_codecvt(const std::locale& in, std::unique_ptr<base_converter> cvt, char_facet_t type); 0182 0183 BOOST_DEPRECATED("This function is deprecated, use 'create_codecvt()'") 0184 inline std::locale create_codecvt_from_pointer(const std::locale& in, base_converter* cvt, char_facet_t type) 0185 { 0186 return create_codecvt(in, std::unique_ptr<base_converter>(cvt), type); 0187 } 0188 0189 BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'") 0190 BOOST_LOCALE_DECL base_converter* create_utf8_converter_new_ptr(); 0191 0192 BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'") 0193 BOOST_LOCALE_DECL base_converter* create_simple_converter_new_ptr(const std::string& encoding); 0194 0195 /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return 0196 /// new locale that is based on \a in and uses new facet. 0197 BOOST_LOCALE_DECL 0198 std::locale create_utf8_codecvt(const std::locale& in, char_facet_t type); 0199 0200 /// This function installs codecvt that can be used for conversion between single byte 0201 /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points, 0202 /// 0203 /// \throws boost::locale::conv::invalid_charset_error: Character set is not supported or isn't a single 0204 /// byte character set 0205 BOOST_LOCALE_DECL 0206 std::locale create_simple_codecvt(const std::locale& in, const std::string& encoding, char_facet_t type); 0207 } // namespace util 0208 }} // namespace boost::locale 0209 0210 #endif
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |