Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2024-11-16 09:32:59

0001 /*
0002  *
0003  * Copyright (c) 2004
0004  * John Maddock
0005  *
0006  * Use, modification and distribution are subject to the 
0007  * Boost Software License, Version 1.0. (See accompanying file 
0008  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0009  *
0010  */
0011  
0012  /*
0013   *   LOCATION:    see http://www.boost.org for most recent version.
0014   *   FILE         unicode_iterator.hpp
0015   *   VERSION      see <boost/version.hpp>
0016   *   DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
0017   */
0018 
0019 /****************************************************************************
0020 
0021 Contents:
0022 ~~~~~~~~~
0023 
0024 1) Read Only, Input Adapters:
0025 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0026 
0027 template <class BaseIterator, class U8Type = std::uint8_t>
0028 class u32_to_u8_iterator;
0029 
0030 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
0031 
0032 template <class BaseIterator, class U32Type = std::uint32_t>
0033 class u8_to_u32_iterator;
0034 
0035 Adapts sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
0036 
0037 template <class BaseIterator, class U16Type = std::uint16_t>
0038 class u32_to_u16_iterator;
0039 
0040 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
0041 
0042 template <class BaseIterator, class U32Type = std::uint32_t>
0043 class u16_to_u32_iterator;
0044 
0045 Adapts sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
0046 
0047 2) Single pass output iterator adapters:
0048 
0049 template <class BaseIterator>
0050 class utf8_output_iterator;
0051 
0052 Accepts UTF-32 code points and forwards them on as UTF-8 code units.
0053 
0054 template <class BaseIterator>
0055 class utf16_output_iterator;
0056 
0057 Accepts UTF-32 code points and forwards them on as UTF-16 code units.
0058 
0059 ****************************************************************************/
0060 
0061 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
0062 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
0063 #include <cstdint>
0064 #include <boost/regex/config.hpp>
0065 #include <stdexcept>
0066 #include <sstream>
0067 #include <ios>
0068 #include <limits.h> // CHAR_BIT
0069 
0070 #ifndef BOOST_REGEX_STANDALONE
0071 #include <boost/throw_exception.hpp>
0072 #endif
0073 
0074 namespace boost{
0075 
0076 namespace detail{
0077 
// Added to (code_point >> 10) to obtain the first (high) surrogate of a pair.
// For code points in 0x10000..0x10FFFF the shifted value lies in 0x40..0x43F,
// so the sum lands in 0xD800..0xDBFF (0xD7C0 == 0xD800 - 0x40).
static const std::uint16_t high_surrogate_base = 0xD7C0u;
// Added to the low ten bits of a code point to obtain the second (low)
// surrogate of a pair.
static const std::uint16_t low_surrogate_base = 0xDC00u;
// Selects the low ten bits of a value (the payload carried by a low surrogate).
static const std::uint32_t ten_bit_mask = 0x3FFu;
0081 
// Returns true when v is a UTF-16 high (leading) surrogate,
// that is when it lies in the range 0xD800..0xDBFF.
inline bool is_high_surrogate(std::uint16_t v)
{
   return (v >= 0xD800u) && (v <= 0xDBFFu);
}
// Returns true when v is a UTF-16 low (trailing) surrogate,
// that is when it lies in the range 0xDC00..0xDFFF.
inline bool is_low_surrogate(std::uint16_t v)
{
   return (v >= 0xDC00u) && (v <= 0xDFFFu);
}
// Returns true when v is any surrogate (high or low): everything above the
// low eleven bits, as selected by the 32-bit mask, must equal 0xD800.
template <class T>
inline bool is_surrogate(T v)
{
   const auto upper_bits = v & 0xFFFFF800u;
   return upper_bits == 0xd800;
}
0095 
// Returns the length, in bytes, of the UTF-8 sequence introduced by c.
// The length is the number of consecutive one bits at the top of c:
// no leading one bit means a single byte sequence, and the result is
// never reported as more than 4.
inline unsigned utf8_byte_count(std::uint8_t c)
{
   unsigned leading_ones = 0;
   for(unsigned bit = 0x80u; (bit != 0) && (c & bit); bit >>= 1)
      ++leading_ones;
   if(leading_ones == 0)
      return 1;
   return (leading_ones > 4) ? 4 : leading_ones;
}
0109 
0110 inline unsigned utf8_trailing_byte_count(std::uint8_t c)
0111 {
0112    return utf8_byte_count(c) - 1;
0113 }
0114 
0115 #ifdef BOOST_REGEX_MSVC
0116 #pragma warning(push)
0117 #pragma warning(disable:4100)
0118 #endif
0119 #ifndef BOOST_NO_EXCEPTIONS
0120 BOOST_REGEX_NORETURN
0121 #endif
0122 inline void invalid_utf32_code_point(std::uint32_t val)
0123 {
0124    std::stringstream ss;
0125    ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
0126    std::out_of_range e(ss.str());
0127 #ifndef BOOST_REGEX_STANDALONE
0128    boost::throw_exception(e);
0129 #else
0130    throw e;
0131 #endif
0132 }
0133 #ifdef BOOST_REGEX_MSVC
0134 #pragma warning(pop)
0135 #endif
0136 
0137 
0138 } // namespace detail
0139 
0140 template <class BaseIterator, class U16Type = std::uint16_t>
0141 class u32_to_u16_iterator
0142 {
0143    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0144 
0145    static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
0146    static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
0147 
0148 public:
0149    typedef std::ptrdiff_t     difference_type;
0150    typedef U16Type            value_type;
0151    typedef value_type const*  pointer;
0152    typedef value_type const   reference;
0153    typedef std::bidirectional_iterator_tag iterator_category;
0154 
0155    reference operator*()const
0156    {
0157       if(m_current == 2)
0158          extract_current();
0159       return m_values[m_current];
0160    }
0161    bool operator==(const u32_to_u16_iterator& that)const
0162    {
0163       if(m_position == that.m_position)
0164       {
0165          // Both m_currents must be equal, or both even
0166          // this is the same as saying their sum must be even:
0167          return (m_current + that.m_current) & 1u ? false : true;
0168       }
0169       return false;
0170    }
0171    bool operator!=(const u32_to_u16_iterator& that)const
0172    {
0173       return !(*this == that);
0174    }
0175    u32_to_u16_iterator& operator++()
0176    {
0177       // if we have a pending read then read now, so that we know whether
0178       // to skip a position, or move to a low-surrogate:
0179       if(m_current == 2)
0180       {
0181          // pending read:
0182          extract_current();
0183       }
0184       // move to the next surrogate position:
0185       ++m_current;
0186       // if we've reached the end skip a position:
0187       if(m_values[m_current] == 0)
0188       {
0189          m_current = 2;
0190          ++m_position;
0191       }
0192       return *this;
0193    }
0194    u32_to_u16_iterator operator++(int)
0195    {
0196       u32_to_u16_iterator r(*this);
0197       ++(*this);
0198       return r;
0199    }
0200    u32_to_u16_iterator& operator--()
0201    {
0202       if(m_current != 1)
0203       {
0204          // decrementing an iterator always leads to a valid position:
0205          --m_position;
0206          extract_current();
0207          m_current = m_values[1] ? 1 : 0;
0208       }
0209       else
0210       {
0211          m_current = 0;
0212       }
0213       return *this;
0214    }
0215    u32_to_u16_iterator operator--(int)
0216    {
0217       u32_to_u16_iterator r(*this);
0218       --(*this);
0219       return r;
0220    }
0221    BaseIterator base()const
0222    {
0223       return m_position;
0224    }
0225    // construct:
0226    u32_to_u16_iterator() : m_position(), m_current(0)
0227    {
0228       m_values[0] = 0;
0229       m_values[1] = 0;
0230       m_values[2] = 0;
0231    }
0232    u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
0233    {
0234       m_values[0] = 0;
0235       m_values[1] = 0;
0236       m_values[2] = 0;
0237    }
0238 private:
0239 
0240    void extract_current()const
0241    {
0242       // begin by checking for a code point out of range:
0243       std::uint32_t v = *m_position;
0244       if(v >= 0x10000u)
0245       {
0246          if(v > 0x10FFFFu)
0247             detail::invalid_utf32_code_point(*m_position);
0248          // split into two surrogates:
0249          m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
0250          m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
0251          m_current = 0;
0252          BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
0253          BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
0254       }
0255       else
0256       {
0257          // 16-bit code point:
0258          m_values[0] = static_cast<U16Type>(*m_position);
0259          m_values[1] = 0;
0260          m_current = 0;
0261          // value must not be a surrogate:
0262          if(detail::is_surrogate(m_values[0]))
0263             detail::invalid_utf32_code_point(*m_position);
0264       }
0265    }
0266    BaseIterator m_position;
0267    mutable U16Type m_values[3];
0268    mutable unsigned m_current;
0269 };
0270 
0271 template <class BaseIterator, class U32Type = std::uint32_t>
0272 class u16_to_u32_iterator
0273 {
0274    // special values for pending iterator reads:
0275    static const U32Type pending_read = 0xffffffffu;
0276 
0277    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0278 
0279    static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
0280    static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
0281 
0282 public:
0283    typedef std::ptrdiff_t     difference_type;
0284    typedef U32Type            value_type;
0285    typedef value_type const*  pointer;
0286    typedef value_type const   reference;
0287    typedef std::bidirectional_iterator_tag iterator_category;
0288 
0289    reference operator*()const
0290    {
0291       if(m_value == pending_read)
0292          extract_current();
0293       return m_value;
0294    }
0295    bool operator==(const u16_to_u32_iterator& that)const
0296    {
0297       return m_position == that.m_position;
0298    }
0299    bool operator!=(const u16_to_u32_iterator& that)const
0300    {
0301       return !(*this == that);
0302    }
0303    u16_to_u32_iterator& operator++()
0304    {
0305       // skip high surrogate first if there is one:
0306       if(detail::is_high_surrogate(*m_position)) ++m_position;
0307       ++m_position;
0308       m_value = pending_read;
0309       return *this;
0310    }
0311    u16_to_u32_iterator operator++(int)
0312    {
0313       u16_to_u32_iterator r(*this);
0314       ++(*this);
0315       return r;
0316    }
0317    u16_to_u32_iterator& operator--()
0318    {
0319       --m_position;
0320       // if we have a low surrogate then go back one more:
0321       if(detail::is_low_surrogate(*m_position)) 
0322          --m_position;
0323       m_value = pending_read;
0324       return *this;
0325    }
0326    u16_to_u32_iterator operator--(int)
0327    {
0328       u16_to_u32_iterator r(*this);
0329       --(*this);
0330       return r;
0331    }
0332    BaseIterator base()const
0333    {
0334       return m_position;
0335    }
0336    // construct:
0337    u16_to_u32_iterator() : m_position()
0338    {
0339       m_value = pending_read;
0340    }
0341    u16_to_u32_iterator(BaseIterator b) : m_position(b)
0342    {
0343       m_value = pending_read;
0344    }
0345    //
0346    // Range checked version:
0347    //
0348    u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
0349    {
0350       m_value = pending_read;
0351       //
0352       // The range must not start with a low surrogate, or end in a high surrogate,
0353       // otherwise we run the risk of running outside the underlying input range.
0354       // Likewise b must not be located at a low surrogate.
0355       //
0356       std::uint16_t val;
0357       if(start != end)
0358       {
0359          if((b != start) && (b != end))
0360          {
0361             val = *b;
0362             if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
0363                invalid_code_point(val);
0364          }
0365          val = *start;
0366          if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
0367             invalid_code_point(val);
0368          val = *--end;
0369          if(detail::is_high_surrogate(val))
0370             invalid_code_point(val);
0371       }
0372    }
0373 private:
0374    static void invalid_code_point(std::uint16_t val)
0375    {
0376       std::stringstream ss;
0377       ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
0378       std::out_of_range e(ss.str());
0379 #ifndef BOOST_REGEX_STANDALONE
0380       boost::throw_exception(e);
0381 #else
0382       throw e;
0383 #endif
0384    }
0385    void extract_current()const
0386    {
0387       m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
0388       // if the last value is a high surrogate then adjust m_position and m_value as needed:
0389       if(detail::is_high_surrogate(*m_position))
0390       {
0391          // precondition; next value must have be a low-surrogate:
0392          BaseIterator next(m_position);
0393          std::uint16_t t = *++next;
0394          if((t & 0xFC00u) != 0xDC00u)
0395             invalid_code_point(t);
0396          m_value = (m_value - detail::high_surrogate_base) << 10;
0397          m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
0398       }
0399       // postcondition; result must not be a surrogate:
0400       if(detail::is_surrogate(m_value))
0401          invalid_code_point(static_cast< std::uint16_t>(m_value));
0402    }
0403    BaseIterator m_position;
0404    mutable U32Type m_value;
0405 };
0406 
0407 template <class BaseIterator, class U8Type = std::uint8_t>
0408 class u32_to_u8_iterator
0409 {
0410    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0411 
0412    static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
0413    static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
0414 
0415 public:
0416    typedef std::ptrdiff_t     difference_type;
0417    typedef U8Type             value_type;
0418    typedef value_type const*  pointer;
0419    typedef value_type const   reference;
0420    typedef std::bidirectional_iterator_tag iterator_category;
0421 
0422    reference operator*()const
0423    {
0424       if(m_current == 4)
0425          extract_current();
0426       return m_values[m_current];
0427    }
0428    bool operator==(const u32_to_u8_iterator& that)const
0429    {
0430       if(m_position == that.m_position)
0431       {
0432          // either the m_current's must be equal, or one must be 0 and 
0433          // the other 4: which means neither must have bits 1 or 2 set:
0434          return (m_current == that.m_current)
0435             || (((m_current | that.m_current) & 3) == 0);
0436       }
0437       return false;
0438    }
0439    bool operator!=(const u32_to_u8_iterator& that)const
0440    {
0441       return !(*this == that);
0442    }
0443    u32_to_u8_iterator& operator++()
0444    {
0445       // if we have a pending read then read now, so that we know whether
0446       // to skip a position, or move to a low-surrogate:
0447       if(m_current == 4)
0448       {
0449          // pending read:
0450          extract_current();
0451       }
0452       // move to the next surrogate position:
0453       ++m_current;
0454       // if we've reached the end skip a position:
0455       if(m_values[m_current] == 0)
0456       {
0457          m_current = 4;
0458          ++m_position;
0459       }
0460       return *this;
0461    }
0462    u32_to_u8_iterator operator++(int)
0463    {
0464       u32_to_u8_iterator r(*this);
0465       ++(*this);
0466       return r;
0467    }
0468    u32_to_u8_iterator& operator--()
0469    {
0470       if((m_current & 3) == 0)
0471       {
0472          --m_position;
0473          extract_current();
0474          m_current = 3;
0475          while(m_current && (m_values[m_current] == 0))
0476             --m_current;
0477       }
0478       else
0479          --m_current;
0480       return *this;
0481    }
0482    u32_to_u8_iterator operator--(int)
0483    {
0484       u32_to_u8_iterator r(*this);
0485       --(*this);
0486       return r;
0487    }
0488    BaseIterator base()const
0489    {
0490       return m_position;
0491    }
0492    // construct:
0493    u32_to_u8_iterator() : m_position(), m_current(0)
0494    {
0495       m_values[0] = 0;
0496       m_values[1] = 0;
0497       m_values[2] = 0;
0498       m_values[3] = 0;
0499       m_values[4] = 0;
0500    }
0501    u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
0502    {
0503       m_values[0] = 0;
0504       m_values[1] = 0;
0505       m_values[2] = 0;
0506       m_values[3] = 0;
0507       m_values[4] = 0;
0508    }
0509 private:
0510 
0511    void extract_current()const
0512    {
0513       std::uint32_t c = *m_position;
0514       if(c > 0x10FFFFu)
0515          detail::invalid_utf32_code_point(c);
0516       if(c < 0x80u)
0517       {
0518          m_values[0] = static_cast<unsigned char>(c);
0519          m_values[1] = static_cast<unsigned char>(0u);
0520          m_values[2] = static_cast<unsigned char>(0u);
0521          m_values[3] = static_cast<unsigned char>(0u);
0522       }
0523       else if(c < 0x800u)
0524       {
0525          m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
0526          m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0527          m_values[2] = static_cast<unsigned char>(0u);
0528          m_values[3] = static_cast<unsigned char>(0u);
0529       }
0530       else if(c < 0x10000u)
0531       {
0532          m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
0533          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0534          m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0535          m_values[3] = static_cast<unsigned char>(0u);
0536       }
0537       else
0538       {
0539          m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
0540          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
0541          m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0542          m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0543       }
0544       m_current= 0;
0545    }
0546    BaseIterator m_position;
0547    mutable U8Type m_values[5];
0548    mutable unsigned m_current;
0549 };
0550 
0551 template <class BaseIterator, class U32Type = std::uint32_t>
0552 class u8_to_u32_iterator
0553 {
0554    // special values for pending iterator reads:
0555    static const U32Type pending_read = 0xffffffffu;
0556 
0557    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0558 
0559    static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
0560    static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
0561 
0562 public:
0563    typedef std::ptrdiff_t     difference_type;
0564    typedef U32Type            value_type;
0565    typedef value_type const*  pointer;
0566    typedef value_type const   reference;
0567    typedef std::bidirectional_iterator_tag iterator_category;
0568 
0569    reference operator*()const
0570    {
0571       if(m_value == pending_read)
0572          extract_current();
0573       return m_value;
0574    }
0575    bool operator==(const u8_to_u32_iterator& that)const
0576    {
0577       return m_position == that.m_position;
0578    }
0579    bool operator!=(const u8_to_u32_iterator& that)const
0580    {
0581       return !(*this == that);
0582    }
0583    u8_to_u32_iterator& operator++()
0584    {
0585       // We must not start with a continuation character:
0586       if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
0587          invalid_sequence();
0588       // skip high surrogate first if there is one:
0589       unsigned c = detail::utf8_byte_count(*m_position);
0590       if(m_value == pending_read)
0591       {
0592          // Since we haven't read in a value, we need to validate the code points:
0593          for(unsigned i = 0; i < c; ++i)
0594          {
0595             ++m_position;
0596             // We must have a continuation byte:
0597             if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
0598                invalid_sequence();
0599          }
0600       }
0601       else
0602       {
0603          std::advance(m_position, c);
0604       }
0605       m_value = pending_read;
0606       return *this;
0607    }
0608    u8_to_u32_iterator operator++(int)
0609    {
0610       u8_to_u32_iterator r(*this);
0611       ++(*this);
0612       return r;
0613    }
0614    u8_to_u32_iterator& operator--()
0615    {
0616       // Keep backtracking until we don't have a trailing character:
0617       unsigned count = 0;
0618       while((*--m_position & 0xC0u) == 0x80u) ++count;
0619       // now check that the sequence was valid:
0620       if(count != detail::utf8_trailing_byte_count(*m_position))
0621          invalid_sequence();
0622       m_value = pending_read;
0623       return *this;
0624    }
0625    u8_to_u32_iterator operator--(int)
0626    {
0627       u8_to_u32_iterator r(*this);
0628       --(*this);
0629       return r;
0630    }
0631    BaseIterator base()const
0632    {
0633       return m_position;
0634    }
0635    // construct:
0636    u8_to_u32_iterator() : m_position()
0637    {
0638       m_value = pending_read;
0639    }
0640    u8_to_u32_iterator(BaseIterator b) : m_position(b)
0641    {
0642       m_value = pending_read;
0643    }
0644    //
0645    // Checked constructor:
0646    //
0647    u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
0648    {
0649       m_value = pending_read;
0650       //
0651       // We must not start with a continuation character, or end with a 
0652       // truncated UTF-8 sequence otherwise we run the risk of going past
0653       // the start/end of the underlying sequence:
0654       //
0655       if(start != end)
0656       {
0657          unsigned char v = *start;
0658          if((v & 0xC0u) == 0x80u)
0659             invalid_sequence();
0660          if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
0661             invalid_sequence();
0662          BaseIterator pos = end;
0663          do
0664          {
0665             v = *--pos;
0666          }
0667          while((start != pos) && ((v & 0xC0u) == 0x80u));
0668          std::ptrdiff_t extra = detail::utf8_byte_count(v);
0669          if(std::distance(pos, end) < extra)
0670             invalid_sequence();
0671       }
0672    }
0673 private:
0674    static void invalid_sequence()
0675    {
0676       std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
0677 #ifndef BOOST_REGEX_STANDALONE
0678       boost::throw_exception(e);
0679 #else
0680       throw e;
0681 #endif
0682    }
0683    void extract_current()const
0684    {
0685       m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
0686       // we must not have a continuation character:
0687       if((m_value & 0xC0u) == 0x80u)
0688          invalid_sequence();
0689       // see how many extra bytes we have:
0690       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
0691       // extract the extra bits, 6 from each extra byte:
0692       BaseIterator next(m_position);
0693       for(unsigned c = 0; c < extra; ++c)
0694       {
0695          ++next;
0696          m_value <<= 6;
0697          // We must have a continuation byte:
0698          if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
0699             invalid_sequence();
0700          m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
0701       }
0702       // we now need to remove a few of the leftmost bits, but how many depends
0703       // upon how many extra bytes we've extracted:
0704       static const std::uint32_t masks[4] = 
0705       {
0706          0x7Fu,
0707          0x7FFu,
0708          0xFFFFu,
0709          0x1FFFFFu,
0710       };
0711       m_value &= masks[extra];
0712       // check the result is in range:
0713       if(m_value > static_cast<U32Type>(0x10FFFFu))
0714          invalid_sequence();
0715       // The result must not be a surrogate:
0716       if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
0717          invalid_sequence();
0718       // We should not have had an invalidly encoded UTF8 sequence:
0719       if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
0720          invalid_sequence();
0721    }
0722    BaseIterator m_position;
0723    mutable U32Type m_value;
0724 };
0725 
0726 template <class BaseIterator>
0727 class utf16_output_iterator
0728 {
0729 public:
0730    typedef void                                   difference_type;
0731    typedef void                                   value_type;
0732    typedef std::uint32_t*                         pointer;
0733    typedef std::uint32_t&                         reference;
0734    typedef std::output_iterator_tag               iterator_category;
0735 
0736    utf16_output_iterator(const BaseIterator& b)
0737       : m_position(b){}
0738    utf16_output_iterator(const utf16_output_iterator& that)
0739       : m_position(that.m_position){}
0740    utf16_output_iterator& operator=(const utf16_output_iterator& that)
0741    {
0742       m_position = that.m_position;
0743       return *this;
0744    }
0745    const utf16_output_iterator& operator*()const
0746    {
0747       return *this;
0748    }
0749    void operator=(std::uint32_t val)const
0750    {
0751       push(val);
0752    }
0753    utf16_output_iterator& operator++()
0754    {
0755       return *this;
0756    }
0757    utf16_output_iterator& operator++(int)
0758    {
0759       return *this;
0760    }
0761    BaseIterator base()const
0762    {
0763       return m_position;
0764    }
0765 private:
0766    void push(std::uint32_t v)const
0767    {
0768       if(v >= 0x10000u)
0769       {
0770          // begin by checking for a code point out of range:
0771          if(v > 0x10FFFFu)
0772             detail::invalid_utf32_code_point(v);
0773          // split into two surrogates:
0774          *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
0775          *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
0776       }
0777       else
0778       {
0779          // 16-bit code point:
0780          // value must not be a surrogate:
0781          if(detail::is_surrogate(v))
0782             detail::invalid_utf32_code_point(v);
0783          *m_position++ = static_cast<std::uint16_t>(v);
0784       }
0785    }
0786    mutable BaseIterator m_position;
0787 };
0788 
0789 template <class BaseIterator>
0790 class utf8_output_iterator
0791 {
0792 public:
0793    typedef void                                   difference_type;
0794    typedef void                                   value_type;
0795    typedef std::uint32_t*                       pointer;
0796    typedef std::uint32_t&                       reference;
0797    typedef std::output_iterator_tag               iterator_category;
0798 
0799    utf8_output_iterator(const BaseIterator& b)
0800       : m_position(b){}
0801    utf8_output_iterator(const utf8_output_iterator& that)
0802       : m_position(that.m_position){}
0803    utf8_output_iterator& operator=(const utf8_output_iterator& that)
0804    {
0805       m_position = that.m_position;
0806       return *this;
0807    }
0808    const utf8_output_iterator& operator*()const
0809    {
0810       return *this;
0811    }
0812    void operator=(std::uint32_t val)const
0813    {
0814       push(val);
0815    }
0816    utf8_output_iterator& operator++()
0817    {
0818       return *this;
0819    }
0820    utf8_output_iterator& operator++(int)
0821    {
0822       return *this;
0823    }
0824    BaseIterator base()const
0825    {
0826       return m_position;
0827    }
0828 private:
0829    void push(std::uint32_t c)const
0830    {
0831       if(c > 0x10FFFFu)
0832          detail::invalid_utf32_code_point(c);
0833       if(c < 0x80u)
0834       {
0835          *m_position++ = static_cast<unsigned char>(c);
0836       }
0837       else if(c < 0x800u)
0838       {
0839          *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
0840          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0841       }
0842       else if(c < 0x10000u)
0843       {
0844          *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
0845          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0846          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0847       }
0848       else
0849       {
0850          *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
0851          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
0852          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0853          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0854       }
0855    }
0856    mutable BaseIterator m_position;
0857 };
0858 
0859 } // namespace boost
0860 
0861 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
0862