boost/locale/generic_codecvt.hpp

0001 //
0002 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
0003 // Copyright (c) 2021-2023 Alexander Grund
0004 //
0005 // Distributed under the Boost Software License, Version 1.0.
0006 // https://www.boost.org/LICENSE_1_0.txt
0007
0008 #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
0009 #define BOOST_LOCALE_GENERIC_CODECVT_HPP
0010
0011 #include <boost/locale/utf.hpp>
0012 #include <cstdint>
0013 #include <locale>
0014
0015 namespace boost { namespace locale {
0016
0017     static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is to small to store an UTF-16 codepoint");
0018     namespace detail {
0019         // Avoid including cstring for std::memcpy
0020         inline void copy_uint16_t(void* dst, const void* src)
0021         {
0022             unsigned char* cdst = static_cast<unsigned char*>(dst);
0023             const unsigned char* csrc = static_cast<const unsigned char*>(src);
0024             cdst[0] = csrc[0];
0025             cdst[1] = csrc[1];
0026         }
0027         inline uint16_t read_state(const std::mbstate_t& src)
0028         {
0029             uint16_t dst;
0030             copy_uint16_t(&dst, &src);
0031             return dst;
0032         }
0033         inline void write_state(std::mbstate_t& dst, const uint16_t src)
0034         {
0035             copy_uint16_t(&dst, &src);
0036         }
0037     } // namespace detail
0038
0039     /// \brief A base class that used to define constants for generic_codecvt
0040     class generic_codecvt_base {
0041     public:
0042         /// Initial state for converting to or from Unicode code points, used by initial_state in derived classes
0043         enum initial_convertion_state {
0044             to_unicode_state,  ///< The state would be used by to_unicode functions
0045             from_unicode_state ///< The state would be used by from_unicode functions
0046         };
0047     };
0048
0049     /// \brief Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t, char32_t
0050     /// and char16_t
0051     ///
0052     /// Implementations should derive from this class defining itself as CodecvtImpl and provide following members
0053     ///
0054     /// - `state_type` - a type of special object that allows to store intermediate cached data, for example `iconv_t`
0055     /// descriptor
0056     /// - `state_type initial_state(generic_codecvt_base::initial_convertion_state direction) const` - member function
0057     /// that creates initial state
0058     /// - `int max_encoding_length() const` - a maximal length that one Unicode code point is represented, for UTF-8 for
0059     /// example it is 4 from ISO-8859-1 it is 1
    /// - `utf::code_point to_unicode(state_type& state, const char*& begin, const char* end)` - extract the first code
    /// point from the text in range [begin,end). In case of success begin points to the start of the next character
    /// sequence to be decoded, in case of an incomplete sequence utf::incomplete shall be returned, and in case
    /// of an invalid input sequence utf::illegal shall be returned and begin shall remain unmodified
0064     /// - `utf::len_or_error from_unicode(state_type &state, utf::code_point u, char* begin, const char* end)` - convert
0065     /// a Unicode code point `u` into a character sequence at [begin,end). Return the length of the sequence in case of
0066     /// success, utf::incomplete in case of not enough room to encode the code point, or utf::illegal in case conversion
0067     /// can not be performed
0068     ///
0069     ///
0070     /// For example implementation of codecvt for latin1/ISO-8859-1 character set
0071     ///
0072     /// \code
0073     ///
0074     /// template<typename CharType>
    /// class latin1_codecvt: public boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >
0076     /// {
0077     /// public:
0078     ///
0079     ///     /* Standard codecvt constructor */
0080     ///     latin1_codecvt(size_t refs = 0): boost::locale::generic_codecvt<CharType,latin1_codecvt<CharType> >(refs)
0081     ///     {
0082     ///     }
0083     ///
0084     ///     /* State is unused but required by generic_codecvt */
0085     ///     struct state_type {};
0086     ///
0087     ///     state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
0088     ///     {
0089     ///         return state_type();
0090     ///     }
0091     ///
0092     ///     int max_encoding_length() const
0093     ///     {
0094     ///         return 1;
0095     ///     }
0096     ///
0097     ///     boost::locale::utf::code_point to_unicode(state_type&, const char*& begin, const char* end) const
0098     ///     {
0099     ///        if(begin == end)
0100     ///           return boost::locale::utf::incomplete;
    ///        return static_cast<unsigned char>(*begin++);
0102     ///     }
0103     ///
0104     ///     boost::locale::utf::len_or_error from_unicode(state_type&, boost::locale::utf::code_point u,
0105     ///                                                   char* begin, const char* end) const
0106     ///     {
0107     ///        if(u >= 256)
0108     ///           return boost::locale::utf::illegal;
0109     ///        if(begin == end)
0110     ///           return boost::locale::utf::incomplete;
0111     ///        *begin = u;
0112     ///        return 1;
0113     ///     }
0114     /// };
0115     ///
0116     /// \endcode
0117     ///
    /// When external tools are used for encoding conversion, the `state_type` is useful to store the objects used for
    /// the conversion. For example, an icu::UConverter can be kept in such a state for efficient use:
0120     ///
0121     /// \code
0122     /// template<typename CharType>
    /// class icu_codecvt: public boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>
0124     /// {
0125     /// public:
0126     ///
0127     ///     /* Standard codecvt constructor */
    ///     icu_codecvt(std::string const &name, size_t refs = 0):
0129     ///         boost::locale::generic_codecvt<CharType,icu_codecvt<CharType>>(refs)
0130     ///     { ... }
0131     ///
0132     ///     using state_type = std::unique_ptr<UConverter,void (*)(UConverter*)>;
0133     ///
0134     ///     state_type initial_state(generic_codecvt_base::initial_convertion_state /*unused*/) const
0135     ///     {
0136     ///         UErrorCode err = U_ZERO_ERROR;
0137     ///         return state_type(ucnv_safeClone(converter_,0,0,&err),ucnv_close);
0138     ///     }
0139     ///
0140     ///     boost::locale::utf::code_point to_unicode(state_type &ptr,char const *&begin,char const *end) const
0141     ///     {
0142     ///         UErrorCode err = U_ZERO_ERROR;
0143     ///         boost::locale::utf::code_point cp = ucnv_getNextUChar(ptr.get(),&begin,end,&err);
0144     ///         ...
0145     ///     }
0146     ///     ...
0147     /// };
0148     /// \endcode
0149     ///
    /// \tparam CharType the wide character type of the facet
    /// \tparam CodecvtImpl the derived implementation class; the specializations obtain it by casting `this`
    /// (see `implementation()`)
    /// \tparam CharSize size of CharType, defaults to `sizeof(CharType)`; it selects the specialization to use.
    /// Only a declaration of the primary template is given here, the specializations below provide the definitions
    template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
    class generic_codecvt;
0152
0153     /// \brief UTF-16 to/from narrow char codecvt facet to use with char16_t or wchar_t on Windows
0154     ///
0155     /// Note in order to fit the requirements of usability by std::wfstream it uses mbstate_t
0156     /// to handle intermediate states in handling of variable length UTF-16 sequences
0157     ///
0158     /// Its member functions implement standard virtual functions of basic codecvt
0159     template<typename CharType, typename CodecvtImpl>
0160     class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
0161                                                       public generic_codecvt_base {
0162     public:
0163         typedef CharType uchar;
0164
0165         generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
0166         const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
0167
0168     protected:
0169         std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
0170         {
0171             if(*reinterpret_cast<char*>(&s) != 0)
0172                 return std::codecvt_base::error;
0173             next = from;
0174             return std::codecvt_base::ok;
0175         }
0176         int do_encoding() const noexcept override { return 0; }
0177         int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
0178         bool do_always_noconv() const noexcept override { return false; }
0179
0180         int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
0181         {
0182             bool state = *reinterpret_cast<char*>(&std_state) != 0;
0183             const char* save_from = from;
0184
0185             auto cvt_state = implementation().initial_state(to_unicode_state);
0186             while(max > 0 && from < from_end) {
0187                 const char* prev_from = from;
0188                 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0189                 if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
0190                     from = prev_from;
0191                     break;
0192                 }
0193                 max--;
0194                 if(ch > 0xFFFF) {
0195                     if(!state)
0196                         from = prev_from;
0197                     state = !state;
0198                 }
0199             }
0200             *reinterpret_cast<char*>(&std_state) = state;
0201             return static_cast<int>(from - save_from);
0202         }
0203
0204         std::codecvt_base::result do_in(std::mbstate_t& std_state,
0205                                         const char* from,
0206                                         const char* from_end,
0207                                         const char*& from_next,
0208                                         uchar* to,
0209                                         uchar* to_end,
0210                                         uchar*& to_next) const override
0211         {
0212             std::codecvt_base::result r = std::codecvt_base::ok;
0213
0214             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
0215             // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
0216             //
0217             // if 0/false no codepoint above >0xFFFF observed, else a codepoint above 0xFFFF was observed
0218             // and first pair is written, but no input consumed
0219             bool state = *reinterpret_cast<char*>(&std_state) != 0;
0220             auto cvt_state = implementation().initial_state(to_unicode_state);
0221             while(to < to_end && from < from_end) {
0222                 const char* from_saved = from;
0223
0224                 utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0225
0226                 if(ch == boost::locale::utf::illegal) {
0227                     from = from_saved;
0228                     r = std::codecvt_base::error;
0229                     break;
0230                 }
0231                 if(ch == boost::locale::utf::incomplete) {
0232                     from = from_saved;
0233                     r = std::codecvt_base::partial;
0234                     break;
0235                 }
0236                 // Normal codepoints go directly to stream
0237                 if(ch <= 0xFFFF)
0238                     *to++ = static_cast<uchar>(ch);
0239                 else {
0240                     // For other codepoints we do the following
0241                     //
0242                     // 1. We can't consume our input as we may find ourselves
0243                     //    in state where all input consumed but not all output written,i.e. only
0244                     //    1st pair is written
0245                     // 2. We only write first pair and mark this in the state, we also revert back
0246                     //    the from pointer in order to make sure this codepoint would be read
0247                     //    once again and then we would consume our input together with writing
0248                     //    second surrogate pair
0249                     ch -= 0x10000;
0250                     std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
0251                     std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
0252                     if(!state) {
0253                         from = from_saved;
0254                         *to++ = w1;
0255                     } else
0256                         *to++ = w2;
0257                     state = !state;
0258                 }
0259             }
0260             from_next = from;
0261             to_next = to;
0262             if(r == std::codecvt_base::ok && (from != from_end || state))
0263                 r = std::codecvt_base::partial;
0264             *reinterpret_cast<char*>(&std_state) = state;
0265             return r;
0266         }
0267
0268         std::codecvt_base::result do_out(std::mbstate_t& std_state,
0269                                          const uchar* from,
0270                                          const uchar* from_end,
0271                                          const uchar*& from_next,
0272                                          char* to,
0273                                          char* to_end,
0274                                          char*& to_next) const override
0275         {
0276             std::codecvt_base::result r = std::codecvt_base::ok;
0277             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
0278             // according to standard. We assume that sizeof(mbstate_t) >=2 in order
0279             // to be able to store first observed surrogate pair
0280             //
0281             // State: state!=0 - a first surrogate pair was observed (state = first pair),
0282             // we expect the second one to come and then zero the state
0283             std::uint16_t state = detail::read_state(std_state);
0284             auto cvt_state = implementation().initial_state(from_unicode_state);
0285             while(to < to_end && from < from_end) {
0286                 utf::code_point ch = 0;
0287                 if(state != 0) {
0288                     // if the state indicates that 1st surrogate pair was written
0289                     // we should make sure that the second one that comes is actually
0290                     // second surrogate
0291                     std::uint16_t w1 = state;
0292                     std::uint16_t w2 = *from;
0293                     // we don't forward from as writing may fail to incomplete or
0294                     // partial conversion
0295                     if(0xDC00 <= w2 && w2 <= 0xDFFF) {
0296                         std::uint16_t vh = w1 - 0xD800;
0297                         std::uint16_t vl = w2 - 0xDC00;
0298                         ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
0299                     } else {
0300                         // Invalid surrogate
0301                         r = std::codecvt_base::error;
0302                         break;
0303                     }
0304                 } else {
0305                     ch = *from;
0306                     if(0xD800 <= ch && ch <= 0xDBFF) {
0307                         // if this is a first surrogate pair we put
0308                         // it into the state and consume it, note we don't
0309                         // go forward as it should be illegal so we increase
0310                         // the from pointer manually
0311                         state = static_cast<uint16_t>(ch);
0312                         from++;
0313                         continue;
0314                     } else if(0xDC00 <= ch && ch <= 0xDFFF) {
0315                         // if we observe second surrogate pair and
0316                         // first only may be expected we should break from the loop with error
0317                         // as it is illegal input
0318                         r = std::codecvt_base::error;
0319                         break;
0320                     }
0321                 }
0322                 if(!boost::locale::utf::is_valid_codepoint(ch)) {
0323                     r = std::codecvt_base::error;
0324                     break;
0325                 }
0326                 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
0327                 if(len == boost::locale::utf::incomplete) {
0328                     r = std::codecvt_base::partial;
0329                     break;
0330                 } else if(len == boost::locale::utf::illegal) {
0331                     r = std::codecvt_base::error;
0332                     break;
0333                 } else
0334                     to += len;
0335                 state = 0;
0336                 from++;
0337             }
0338             from_next = from;
0339             to_next = to;
0340             if(r == std::codecvt_base::ok && (from != from_end || state != 0))
0341                 r = std::codecvt_base::partial;
0342             detail::write_state(std_state, state);
0343             return r;
0344         }
0345     };
0346
0347     /// \brief UTF-32 to/from narrow char codecvt facet to use with char32_t or wchar_t on POSIX platforms
0348     ///
0349     /// Its member functions implement standard virtual functions of basic codecvt.
0350     /// mbstate_t is not used for UTF-32 handling due to fixed length encoding
0351     template<typename CharType, typename CodecvtImpl>
0352     class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
0353                                                       public generic_codecvt_base {
0354     public:
0355         typedef CharType uchar;
0356
0357         generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
0358
0359         const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
0360
0361     protected:
0362         std::codecvt_base::result
0363         do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
0364         {
0365             next = from;
0366             return std::codecvt_base::ok;
0367         }
0368         int do_encoding() const noexcept override { return 0; }
0369         int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
0370         bool do_always_noconv() const noexcept override { return false; }
0371
0372         int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
0373         {
0374             const char* start_from = from;
0375             auto cvt_state = implementation().initial_state(to_unicode_state);
0376             while(max > 0 && from < from_end) {
0377                 const char* save_from = from;
0378                 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0379                 if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
0380                     from = save_from;
0381                     break;
0382                 }
0383                 max--;
0384             }
0385
0386             return static_cast<int>(from - start_from);
0387         }
0388
0389         std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
0390                                         const char* from,
0391                                         const char* from_end,
0392                                         const char*& from_next,
0393                                         uchar* to,
0394                                         uchar* to_end,
0395                                         uchar*& to_next) const override
0396         {
0397             std::codecvt_base::result r = std::codecvt_base::ok;
0398
0399             auto cvt_state = implementation().initial_state(to_unicode_state);
0400             while(to < to_end && from < from_end) {
0401                 const char* from_saved = from;
0402
0403                 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0404
0405                 if(ch == boost::locale::utf::illegal) {
0406                     r = std::codecvt_base::error;
0407                     from = from_saved;
0408                     break;
0409                 }
0410                 if(ch == boost::locale::utf::incomplete) {
0411                     r = std::codecvt_base::partial;
0412                     from = from_saved;
0413                     break;
0414                 }
0415                 *to++ = ch;
0416             }
0417             from_next = from;
0418             to_next = to;
0419             if(r == std::codecvt_base::ok && from != from_end)
0420                 r = std::codecvt_base::partial;
0421             return r;
0422         }
0423
0424         std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
0425                                          const uchar* from,
0426                                          const uchar* from_end,
0427                                          const uchar*& from_next,
0428                                          char* to,
0429                                          char* to_end,
0430                                          char*& to_next) const override
0431         {
0432             std::codecvt_base::result r = std::codecvt_base::ok;
0433             auto cvt_state = implementation().initial_state(from_unicode_state);
0434             while(to < to_end && from < from_end) {
0435                 const std::uint32_t ch = *from;
0436                 if(!boost::locale::utf::is_valid_codepoint(ch)) {
0437                     r = std::codecvt_base::error;
0438                     break;
0439                 }
0440                 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end);
0441                 if(len == boost::locale::utf::incomplete) {
0442                     r = std::codecvt_base::partial;
0443                     break;
0444                 } else if(len == boost::locale::utf::illegal) {
0445                     r = std::codecvt_base::error;
0446                     break;
0447                 }
0448                 to += len;
0449                 from++;
0450             }
0451             from_next = from;
0452             to_next = to;
0453             if(r == std::codecvt_base::ok && from != from_end)
0454                 r = std::codecvt_base::partial;
0455             return r;
0456         }
0457     };
0458
0459     template<typename CodecvtImpl>
0460     class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
0461                                                   public generic_codecvt_base {
0462     public:
0463         typedef char uchar;
0464
0465         const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
0466
0467         generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
0468     };
0469
0470 }} // namespace boost::locale
0471
0472 #endif