boost/locale/util.hpp

0001 //
0002 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
0003 // Copyright (c) 2022-2023 Alexander Grund
0004 //
0005 // Distributed under the Boost Software License, Version 1.0.
0006 // https://www.boost.org/LICENSE_1_0.txt
0007
0008 #ifndef BOOST_LOCALE_UTIL_HPP
0009 #define BOOST_LOCALE_UTIL_HPP
0010
0011 #include <boost/locale/generator.hpp>
0012 #include <boost/locale/utf.hpp>
0013 #include <boost/assert.hpp>
0014 #include <cstdint>
0015 #include <locale>
0016 #include <memory>
0017 #include <typeinfo>
0018
0019 namespace boost { namespace locale {
0020     /// \brief This namespace provides various utility functions useful for Boost.Locale's backends
0021     /// implementations
0022     namespace util {
0023
0024         /// \brief Return the default system locale name in POSIX format.
0025         ///
0026         /// This function tries to detect the locale using the LC_ALL, LC_CTYPE and LANG environment
0027         /// variables, in this order. If all of them are unset, it returns "C" on POSIX platforms.
0028         /// On Windows, in addition to the above environment variables, this function
0029         /// tries to create the locale name from the ISO-639 language and ISO-3166 country codes defined
0030         /// for the user's default locale.
0031         /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8,
0032         /// otherwise, if the system locale supports ANSI codepages, it uses the ANSI encoding, e.g. windows-1252,
0033         /// otherwise (if no ANSI codepage is available) it uses the UTF-8 encoding.
0034         BOOST_LOCALE_DECL
0035         std::string get_system_locale(bool use_utf8_on_windows = false);
0036
0037         /// \brief Installs the information facet into locale \a in based on the locale name \a name
0038         ///
0039         /// This function installs the boost::locale::info facet into the locale \a in and returns
0040         /// the newly created locale.
0041         ///
0042         /// Note: all information is based only on parsing of the string \a name.
0043         ///
0044         /// The name has the following format: language[_COUNTRY][.encoding][\@variant]
0045         /// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166
0046         /// country identifier like "US" or "RU", encoding is a character set name
0047         /// like UTF-8 or ISO-8859-1, and variant is a backend specific variant like \c euro or
0048         /// calendar=hebrew.
0049         ///
0050         /// If some parameters are missing they are specified as blanks; the default encoding
0051         /// is assumed to be US-ASCII and a missing language is assumed to be "C".
0052         BOOST_LOCALE_DECL
0053         std::locale create_info(const std::locale& in, const std::string& name);
0054
0055         /// \brief A simple stateless converter that maps one code point at a time from and
0056         ///  to UCS-4
0057         ///
0058         /// std::codecvt facets that convert utf-16/utf-32 text to the encoding supported by a
0059         /// converter are created on top of this class.
0060         ///
0061         /// A converter should be fully stateless: it should never assume that it is called in any
0062         /// specific order on the text. Take great care when implementing one for a given character
0063         /// set. Even when the encoding itself seems to be stateless, like windows-1255 or
0064         /// shift-jis, some encoders (most notably iconv) can actually compose several code points
0065         /// into one, or decompose them, when composite characters are found.
0066         class BOOST_LOCALE_DECL base_converter {
0067         public:
0068             /// Result reported for an illegal input sequence or code point, for example a UTF-32
0069             /// code point inside the range reserved for UTF-16 surrogates, or an invalid UTF-8
0070             /// sequence
0071             static constexpr utf::code_point illegal = utf::illegal;
0072
0073             /// Result reported when the input sequence is incomplete, or when the output buffer is
0074             /// too small for the complete output to be written.
0075             static constexpr utf::code_point incomplete = utf::incomplete;
0076
0077             virtual ~base_converter();
0078
0079             /// Maximal number of bytes a single Unicode code point can be converted to, for example
0080             /// 4 for UTF-8, 2 for Shift-JIS and 1 for ISO-8859-1
0081             virtual int max_len() const { return 1; }
0082
0083             /// Tells whether calling from_unicode, to_unicode and max_len is thread safe.
0084             ///
0085             /// Rule of thumb: return true when the implementation only uses simple tables that never
0086             /// change, or is purely algorithmic like UTF-8, so that independent to_unicode and
0087             /// from_unicode calls share no mutable bit. Return false otherwise, for example when an
0088             /// iconv_t descriptor or a UConverter serves as the conversion object; the object will
0089             /// then be cloned for each use.
0090             virtual bool is_thread_safe() const { return false; }
0091
0092             /// Polymorphic copy of this object; usually only called when is_thread_safe() returns false
0093             virtual base_converter* clone() const
0094             {
0095                 // Trips if a derived class ends up in this base implementation.
0096                 BOOST_ASSERT(typeid(base_converter) == typeid(*this));
0097                 return new base_converter();
0098             }
0099
0100             /// Decode the single character that starts at \a begin and ends no later than \a end.
0101             /// The implementation in this base class accepts 7-bit ASCII only.
0102             ///
0103             /// - If a valid input sequence [\a begin, \a code_point_end) is found, with
0104             ///   \a begin < \a code_point_end and \a code_point_end <= \a end, its Unicode code point
0105             ///   is returned and \a begin is set to \a code_point_end.
0106             /// - If the input [\a begin, \a end) is incomplete, i.e. there may be a \a code_point_end
0107             ///   beyond \a end for which [\a begin, \a code_point_end) would be a valid input sequence,
0108             ///   \a incomplete is returned and \a begin stays unchanged. For UTF-8, *begin = 0xc2 with
0109             ///   \a begin + 1 = \a end is such a situation.
0110             /// - If the input is invalid, i.e. there is a sequence [\a begin, \a code_point_end) with
0111             ///   \a code_point_end <= \a end that is illegal for this encoding, \a illegal is returned
0112             ///   and \a begin stays unchanged. For UTF-8 that is the case when *begin = 0xFF and
0113             ///   begin < end.
0114             virtual utf::code_point to_unicode(const char*& begin, const char* end)
0115             {
0116                 if(begin == end)
0117                     return incomplete; // LCOV_EXCL_LINE
0118                 const unsigned char ch = static_cast<unsigned char>(*begin);
0119                 if(ch > 0x7F)
0120                     return illegal;
0121                 // Only a successfully decoded byte is consumed.
0122                 ++begin;
0123                 return ch;
0124             }
0125
0126             /// Encode the single code point \a u into the range [begin,end).
0127             ///
0128             /// \a illegal should be returned if \a u is not a valid Unicode code point, or if it
0129             /// cannot be mapped correctly to the represented character set.
0130             ///
0131             /// Otherwise \a u converts to a sequence of bytes c1, ... , cN (1 <= N <= max_len()) and
0132             /// -# if end - begin >= N, c1, ... cN are written starting at begin and N is returned
0133             /// -# if end - begin < N, incomplete is returned; what is then stored in the bytes of the
0134             ///    range [begin,end) is unspecified
0135             virtual utf::len_or_error from_unicode(utf::code_point u, char* begin, const char* end)
0136             {
0137                 if(begin == end)
0138                     return incomplete; // LCOV_EXCL_LINE
0139                 if(u < 0x80) {
0140                     *begin = static_cast<char>(u);
0141                     return 1;
0142                 }
0143                 return illegal;
0144             }
0145         };
0146
0147         /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
0148         /// Unicode code points. The converter is handed back as a std::unique_ptr.
0149         BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter();
0150
0151         BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
             /// Deprecated alias of create_utf8_converter(): forwards the call and returns its result unchanged.
0152         inline std::unique_ptr<base_converter> create_utf8_converter_unique_ptr()
0153         {
0154             return create_utf8_converter();
0155         }
0156
0157         /// This function creates a \a base_converter that can be used for conversion between single byte
0158         /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
0159         ///
0160         /// If \a encoding is not supported, an empty pointer is returned,
0161         /// so you should check whether the returned pointer is valid/non-NULL.
0162         BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter(const std::string& encoding);
0163
0164         BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
             /// Deprecated alias of create_simple_converter(): forwards \a encoding and returns the result unchanged.
0165         inline std::unique_ptr<base_converter> create_simple_converter_unique_ptr(const std::string& encoding)
0166         {
0167             return create_simple_converter(encoding);
0168         }
0169
0170         /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in and uses the
0171         /// new facet.
0172         ///
0173         /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the \a cvt converter.
0174         /// If \a cvt is a null pointer, an always-failing conversion is used that fails on every first input or
0175         /// output.
0176         ///
0177         /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows how to break and join
0178         /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
0179         /// of the wide encoding type.
0180         BOOST_LOCALE_DECL
0181         std::locale create_codecvt(const std::locale& in, std::unique_ptr<base_converter> cvt, char_facet_t type);
0182
0183         BOOST_DEPRECATED("This function is deprecated, use 'create_codecvt()'")
             /// Deprecated raw-pointer form of create_codecvt(): \a cvt is wrapped in a std::unique_ptr, which
             /// takes ownership of it, and the call is forwarded together with \a in and \a type.
0184         inline std::locale create_codecvt_from_pointer(const std::locale& in, base_converter* cvt, char_facet_t type)
0185         {
0186             return create_codecvt(in, std::unique_ptr<base_converter>(cvt), type);
0187         }
0188
0189         BOOST_DEPRECATED("This function is deprecated, use 'create_utf8_converter()'")
             /// Deprecated variant that returns a raw \a base_converter pointer instead of a std::unique_ptr.
             /// NOTE(review): the definition is not visible here; presumably the caller owns the result - confirm.
0190         BOOST_LOCALE_DECL base_converter* create_utf8_converter_new_ptr();
0191
0192         BOOST_DEPRECATED("This function is deprecated, use 'create_simple_converter()'")
             /// Deprecated variant that returns a raw \a base_converter pointer instead of a std::unique_ptr.
             /// NOTE(review): the definition is not visible here; presumably the caller owns the result - confirm.
0193         BOOST_LOCALE_DECL base_converter* create_simple_converter_new_ptr(const std::string& encoding);
0194
0195         /// Install a utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
0196         /// a new locale that is based on \a in and uses the new facet.
0197         BOOST_LOCALE_DECL
0198         std::locale create_utf8_codecvt(const std::locale& in, char_facet_t type);
0199
0200         /// This function installs a codecvt that can be used for conversion between single byte
0201         /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points.
0202         ///
0203         /// \throws boost::locale::conv::invalid_charset_error: Character set is not supported or isn't a single
0204         /// byte character set
0205         BOOST_LOCALE_DECL
0206         std::locale create_simple_codecvt(const std::locale& in, const std::string& encoding, char_facet_t type);
0207     } // namespace util
0208 }}    // namespace boost::locale
0209
0210 #endif