|
||||
File indexing completed on 2025-01-18 09:30:42
0001 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 0002 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 0003 // Distributed under the Boost Software License, Version 1.0. (See accompany- 0004 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 0005 0006 #ifndef BOOST_UTF8_CODECVT_FACET_HPP 0007 #define BOOST_UTF8_CODECVT_FACET_HPP 0008 0009 // MS compatible compilers support #pragma once 0010 #if defined(_MSC_VER) && (_MSC_VER >= 1020) 0011 # pragma once 0012 #endif 0013 0014 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 0015 // utf8_codecvt_facet.hpp 0016 0017 // This header defines class utf8_codecvt_facet, derived from 0018 // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in 0019 // files into wchar_t strings in the application. 0020 // 0021 // The header is NOT STANDALONE, and is not to be included by the USER. 0022 // There are at least two libraries which want to use this functionality, and 0023 // we want to avoid code duplication. It would be possible to create utf8 0024 // library, but: 0025 // - this requires review process first 0026 // - in the case, when linking the a library which uses utf8 0027 // (say 'program_options'), user should also link to the utf8 library. 0028 // This seems inconvenient, and asking a user to link to an unrevieved 0029 // library is strange. 0030 // Until the above points are fixed, a library which wants to use utf8 must: 0031 // - include this header in one of it's headers or sources 0032 // - include the corresponding boost/detail/utf8_codecvt_facet.ipp file in one 0033 // of its sources 0034 // - before including either file, the library must define 0035 // - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used 0036 // - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace 0037 // declaration. 0038 // - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable' 0039 // symbols. 0040 // 0041 // For example, program_options library might contain: 0042 // #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character> 0043 // namespace boost { namespace program_options { 0044 // #define BOOST_UTF8_END_NAMESPACE }} 0045 // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL 0046 // #include <boost/detail/utf8_codecvt_facet.ipp> 0047 // 0048 // Essentially, each library will have its own copy of utf8 code, in 0049 // different namespaces. 0050 0051 // Note:(Robert Ramey). I have made the following alterations in the original 0052 // code. 0053 // a) Rendered utf8_codecvt<wchar_t, char> with using templates 0054 // b) Move longer functions outside class definition to prevent inlining 0055 // and make code smaller 0056 // c) added on a derived class to permit translation to/from current 0057 // locale to utf8 0058 0059 // See http://www.boost.org for updates, documentation, and revision history. 0060 0061 // archives stored as text - note these ar templated on the basic 0062 // stream templates to accommodate wide (and other?) kind of characters 0063 // 0064 // note the fact that on libraries without wide characters, ostream is 0065 // is not a specialization of basic_ostream which in fact is not defined 0066 // in such cases. So we can't use basic_ostream<OStream::char_type> but rather 0067 // use two template parameters 0068 // 0069 // utf8_codecvt_facet 0070 // This is an implementation of a std::codecvt facet for translating 0071 // from UTF-8 externally to UCS-4. Note that this is not tied to 0072 // any specific types in order to allow customization on platforms 0073 // where wchar_t is not big enough. 0074 // 0075 // NOTES: The current implementation jumps through some unpleasant hoops in 0076 // order to deal with signed character types. As a std::codecvt_base::result, 0077 // it is necessary for the ExternType to be convertible to unsigned char. 0078 // I chose not to tie the extern_type explicitly to char. But if any combination 0079 // of types other than <wchar_t,char_t> is used, then std::codecvt must be 0080 // specialized on those types for this to work. 0081 0082 #include <locale> 0083 #include <cwchar> // for mbstate_t 0084 #include <cstddef> // for std::size_t 0085 0086 #include <boost/config.hpp> 0087 #include <boost/detail/workaround.hpp> 0088 0089 #if defined(BOOST_NO_STDC_NAMESPACE) 0090 namespace std { 0091 using ::mbstate_t; 0092 using ::size_t; 0093 } 0094 #endif 0095 0096 // maximum lenght of a multibyte string 0097 #define MB_LENGTH_MAX 8 0098 0099 BOOST_UTF8_BEGIN_NAMESPACE 0100 0101 //----------------------------------------------------------------------------// 0102 // // 0103 // utf8_codecvt_facet // 0104 // // 0105 // See utf8_codecvt_facet.ipp for the implementation. // 0106 //----------------------------------------------------------------------------// 0107 0108 #ifndef BOOST_UTF8_DECL 0109 #define BOOST_UTF8_DECL 0110 #endif 0111 0112 struct BOOST_SYMBOL_VISIBLE utf8_codecvt_facet : 0113 public std::codecvt<wchar_t, char, std::mbstate_t> 0114 { 0115 public: 0116 BOOST_UTF8_DECL explicit utf8_codecvt_facet(std::size_t no_locale_manage = 0); 0117 BOOST_UTF8_DECL virtual ~utf8_codecvt_facet(); 0118 0119 protected: 0120 BOOST_UTF8_DECL virtual std::codecvt_base::result do_in( 0121 std::mbstate_t& state, 0122 const char * from, 0123 const char * from_end, 0124 const char * & from_next, 0125 wchar_t * to, 0126 wchar_t * to_end, 0127 wchar_t * & to_next 0128 ) const; 0129 0130 BOOST_UTF8_DECL virtual std::codecvt_base::result do_out( 0131 std::mbstate_t & state, 0132 const wchar_t * from, 0133 const wchar_t * from_end, 0134 const wchar_t * & from_next, 0135 char * to, 0136 char * to_end, 0137 char * & to_next 0138 ) const; 0139 0140 bool invalid_continuing_octet(unsigned char octet_1) const { 0141 return (octet_1 < 0x80|| 0xbf< octet_1); 0142 } 0143 0144 bool invalid_leading_octet(unsigned char octet_1) const { 0145 return (0x7f < octet_1 && octet_1 < 0xc0) || 0146 (octet_1 > 0xfd); 0147 } 0148 0149 // continuing octets = octets except for the leading octet 0150 static unsigned int get_cont_octet_count(unsigned char lead_octet) { 0151 return get_octet_count(lead_octet) - 1; 0152 } 0153 0154 BOOST_UTF8_DECL static unsigned int get_octet_count(unsigned char lead_octet); 0155 0156 // How many "continuing octets" will be needed for this word 0157 // == total octets - 1. 0158 BOOST_UTF8_DECL static int get_cont_octet_out_count(wchar_t word); 0159 0160 virtual bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW { 0161 return false; 0162 } 0163 0164 // UTF-8 isn't really stateful since we rewind on partial conversions 0165 virtual std::codecvt_base::result do_unshift( 0166 std::mbstate_t &, 0167 char * from, 0168 char * /*to*/, 0169 char * & next 0170 ) const { 0171 next = from; 0172 return ok; 0173 } 0174 0175 virtual int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW { 0176 const int variable_byte_external_encoding=0; 0177 return variable_byte_external_encoding; 0178 } 0179 0180 // How many char objects can I process to get <= max_limit 0181 // wchar_t objects? 0182 BOOST_UTF8_DECL virtual int do_length( 0183 std::mbstate_t &, 0184 const char * from, 0185 const char * from_end, 0186 std::size_t max_limit 0187 ) const 0188 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 0189 throw() 0190 #endif 0191 ; 0192 0193 // Nonstandard override 0194 virtual int do_length( 0195 const std::mbstate_t & s, 0196 const char * from, 0197 const char * from_end, 0198 std::size_t max_limit 0199 ) const 0200 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 0201 throw() 0202 #endif 0203 { 0204 return do_length( 0205 const_cast<std::mbstate_t &>(s), 0206 from, 0207 from_end, 0208 max_limit 0209 ); 0210 } 0211 0212 // Largest possible value do_length(state,from,from_end,1) could return. 0213 virtual int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW { 0214 return 6; // largest UTF-8 encoding of a UCS-4 character 0215 } 0216 }; 0217 0218 BOOST_UTF8_END_NAMESPACE 0219 0220 #endif // BOOST_UTF8_CODECVT_FACET_HPP
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |