Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:51:27

0001 /*
0002  *
0003  * Copyright (c) 2004
0004  * John Maddock
0005  *
0006  * Use, modification and distribution are subject to the 
0007  * Boost Software License, Version 1.0. (See accompanying file 
0008  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0009  *
0010  */
0011  
0012  /*
0013   *   LOCATION:    see http://www.boost.org for most recent version.
0014   *   FILE         unicode_iterator.hpp
0015   *   VERSION      see <boost/version.hpp>
0016   *   DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
0017   */
0018 
0019 /****************************************************************************
0020 
0021 Contents:
0022 ~~~~~~~~~
0023 
0024 1) Read Only, Input Adapters:
0025 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0026 
0027 template <class BaseIterator, class U8Type = ::boost::uint8_t>
0028 class u32_to_u8_iterator;
0029 
0030 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
0031 
0032 template <class BaseIterator, class U32Type = ::boost::uint32_t>
0033 class u8_to_u32_iterator;
0034 
0035 Adapts a sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
0036 
0037 template <class BaseIterator, class U16Type = ::boost::uint16_t>
0038 class u32_to_u16_iterator;
0039 
0040 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
0041 
0042 template <class BaseIterator, class U32Type = ::boost::uint32_t>
0043 class u16_to_u32_iterator;
0044 
0045 Adapts a sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
0046 
0047 2) Single pass output iterator adapters:
0048 
0049 template <class BaseIterator>
0050 class utf8_output_iterator;
0051 
0052 Accepts UTF-32 code points and forwards them on as UTF-8 code units.
0053 
0054 template <class BaseIterator>
0055 class utf16_output_iterator;
0056 
0057 Accepts UTF-32 code points and forwards them on as UTF-16 code units.
0058 
0059 ****************************************************************************/
0060 
0061 #ifndef BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
0062 #define BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
0063 #include <boost/cstdint.hpp>
0064 #include <boost/regex/config.hpp>
0065 #include <boost/static_assert.hpp>
0066 #include <boost/throw_exception.hpp>
0067 #include <stdexcept>
0068 #ifndef BOOST_NO_STD_LOCALE
0069 #include <sstream>
0070 #include <ios>
0071 #endif
0072 #include <limits.h> // CHAR_BIT
0073 
0074 #ifdef BOOST_REGEX_CXX03
0075 
0076 #else
0077 #endif
0078 
0079 namespace boost{
0080 
0081 namespace detail{
0082 
0083 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
0084 static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
0085 static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
0086 
0087 inline bool is_high_surrogate(::boost::uint16_t v)
0088 {
0089    return (v & 0xFFFFFC00u) == 0xd800u;
0090 }
0091 inline bool is_low_surrogate(::boost::uint16_t v)
0092 {
0093    return (v & 0xFFFFFC00u) == 0xdc00u;
0094 }
// Returns true when v is any surrogate value, i.e. when bits 11..31 of v
// are exactly those of 0xD800 (the low eleven bits are ignored).
template <class T>
inline bool is_surrogate(T v)
{
   // XOR-ing with 0xD800 clears the matching bits; anything left under the
   // mask means v is outside 0xD800..0xDFFF.
   return ((v ^ 0xD800u) & 0xFFFFF800u) == 0u;
}
0100 
0101 inline unsigned utf8_byte_count(boost::uint8_t c)
0102 {
0103    // if the most significant bit with a zero in it is in position
0104    // 8-N then there are N bytes in this UTF-8 sequence:
0105    boost::uint8_t mask = 0x80u;
0106    unsigned result = 0;
0107    while(c & mask)
0108    {
0109       ++result;
0110       mask >>= 1;
0111    }
0112    return (result == 0) ? 1 : ((result > 4) ? 4 : result);
0113 }
0114 
0115 inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
0116 {
0117    return utf8_byte_count(c) - 1;
0118 }
0119 
0120 #ifdef BOOST_MSVC
0121 #pragma warning(push)
0122 #pragma warning(disable:4100)
0123 #endif
0124 #ifndef BOOST_NO_EXCEPTIONS
0125 BOOST_NORETURN
0126 #endif
0127 inline void invalid_utf32_code_point(::boost::uint32_t val)
0128 {
0129 #ifndef BOOST_NO_STD_LOCALE
0130    std::stringstream ss;
0131    ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
0132    std::out_of_range e(ss.str());
0133 #else
0134    std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
0135 #endif
0136    boost::throw_exception(e);
0137 }
0138 #ifdef BOOST_MSVC
0139 #pragma warning(pop)
0140 #endif
0141 
0142 
0143 } // namespace detail
0144 
0145 template <class BaseIterator, class U16Type = ::boost::uint16_t>
0146 class u32_to_u16_iterator
0147 {
0148 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
0149    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0150 
0151    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
0152    BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
0153 #endif
0154 
0155 public:
0156    typedef std::ptrdiff_t     difference_type;
0157    typedef U16Type            value_type;
0158    typedef value_type const*  pointer;
0159    typedef value_type const   reference;
0160    typedef std::bidirectional_iterator_tag iterator_category;
0161 
0162    reference operator*()const
0163    {
0164       if(m_current == 2)
0165          extract_current();
0166       return m_values[m_current];
0167    }
0168    bool operator==(const u32_to_u16_iterator& that)const
0169    {
0170       if(m_position == that.m_position)
0171       {
0172          // Both m_currents must be equal, or both even
0173          // this is the same as saying their sum must be even:
0174          return (m_current + that.m_current) & 1u ? false : true;
0175       }
0176       return false;
0177    }
0178    bool operator!=(const u32_to_u16_iterator& that)const
0179    {
0180       return !(*this == that);
0181    }
0182    u32_to_u16_iterator& operator++()
0183    {
0184       // if we have a pending read then read now, so that we know whether
0185       // to skip a position, or move to a low-surrogate:
0186       if(m_current == 2)
0187       {
0188          // pending read:
0189          extract_current();
0190       }
0191       // move to the next surrogate position:
0192       ++m_current;
0193       // if we've reached the end skip a position:
0194       if(m_values[m_current] == 0)
0195       {
0196          m_current = 2;
0197          ++m_position;
0198       }
0199       return *this;
0200    }
0201    u32_to_u16_iterator operator++(int)
0202    {
0203       u32_to_u16_iterator r(*this);
0204       ++(*this);
0205       return r;
0206    }
0207    u32_to_u16_iterator& operator--()
0208    {
0209       if(m_current != 1)
0210       {
0211          // decrementing an iterator always leads to a valid position:
0212          --m_position;
0213          extract_current();
0214          m_current = m_values[1] ? 1 : 0;
0215       }
0216       else
0217       {
0218          m_current = 0;
0219       }
0220       return *this;
0221    }
0222    u32_to_u16_iterator operator--(int)
0223    {
0224       u32_to_u16_iterator r(*this);
0225       --(*this);
0226       return r;
0227    }
0228    BaseIterator base()const
0229    {
0230       return m_position;
0231    }
0232    // construct:
0233    u32_to_u16_iterator() : m_position(), m_current(0)
0234    {
0235       m_values[0] = 0;
0236       m_values[1] = 0;
0237       m_values[2] = 0;
0238    }
0239    u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
0240    {
0241       m_values[0] = 0;
0242       m_values[1] = 0;
0243       m_values[2] = 0;
0244    }
0245 private:
0246 
0247    void extract_current()const
0248    {
0249       // begin by checking for a code point out of range:
0250       ::boost::uint32_t v = *m_position;
0251       if(v >= 0x10000u)
0252       {
0253          if(v > 0x10FFFFu)
0254             detail::invalid_utf32_code_point(*m_position);
0255          // split into two surrogates:
0256          m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
0257          m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
0258          m_current = 0;
0259          BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
0260          BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
0261       }
0262       else
0263       {
0264          // 16-bit code point:
0265          m_values[0] = static_cast<U16Type>(*m_position);
0266          m_values[1] = 0;
0267          m_current = 0;
0268          // value must not be a surrogate:
0269          if(detail::is_surrogate(m_values[0]))
0270             detail::invalid_utf32_code_point(*m_position);
0271       }
0272    }
0273    BaseIterator m_position;
0274    mutable U16Type m_values[3];
0275    mutable unsigned m_current;
0276 };
0277 
0278 template <class BaseIterator, class U32Type = ::boost::uint32_t>
0279 class u16_to_u32_iterator
0280 {
0281    // special values for pending iterator reads:
0282    BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
0283 
0284 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
0285    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0286 
0287    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
0288    BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
0289 #endif
0290 
0291 public:
0292    typedef std::ptrdiff_t     difference_type;
0293    typedef U32Type            value_type;
0294    typedef value_type const*  pointer;
0295    typedef value_type const   reference;
0296    typedef std::bidirectional_iterator_tag iterator_category;
0297 
0298    reference operator*()const
0299    {
0300       if(m_value == pending_read)
0301          extract_current();
0302       return m_value;
0303    }
0304    bool operator==(const u16_to_u32_iterator& that)const
0305    {
0306       return m_position == that.m_position;
0307    }
0308    bool operator!=(const u16_to_u32_iterator& that)const
0309    {
0310       return !(*this == that);
0311    }
0312    u16_to_u32_iterator& operator++()
0313    {
0314       // skip high surrogate first if there is one:
0315       if(detail::is_high_surrogate(*m_position)) ++m_position;
0316       ++m_position;
0317       m_value = pending_read;
0318       return *this;
0319    }
0320    u16_to_u32_iterator operator++(int)
0321    {
0322       u16_to_u32_iterator r(*this);
0323       ++(*this);
0324       return r;
0325    }
0326    u16_to_u32_iterator& operator--()
0327    {
0328       --m_position;
0329       // if we have a low surrogate then go back one more:
0330       if(detail::is_low_surrogate(*m_position)) 
0331          --m_position;
0332       m_value = pending_read;
0333       return *this;
0334    }
0335    u16_to_u32_iterator operator--(int)
0336    {
0337       u16_to_u32_iterator r(*this);
0338       --(*this);
0339       return r;
0340    }
0341    BaseIterator base()const
0342    {
0343       return m_position;
0344    }
0345    // construct:
0346    u16_to_u32_iterator() : m_position()
0347    {
0348       m_value = pending_read;
0349    }
0350    u16_to_u32_iterator(BaseIterator b) : m_position(b)
0351    {
0352       m_value = pending_read;
0353    }
0354    //
0355    // Range checked version:
0356    //
0357    u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
0358    {
0359       m_value = pending_read;
0360       //
0361       // The range must not start with a low surrogate, or end in a high surrogate,
0362       // otherwise we run the risk of running outside the underlying input range.
0363       // Likewise b must not be located at a low surrogate.
0364       //
0365       boost::uint16_t val;
0366       if(start != end)
0367       {
0368          if((b != start) && (b != end))
0369          {
0370             val = *b;
0371             if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
0372                invalid_code_point(val);
0373          }
0374          val = *start;
0375          if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
0376             invalid_code_point(val);
0377          val = *--end;
0378          if(detail::is_high_surrogate(val))
0379             invalid_code_point(val);
0380       }
0381    }
0382 private:
0383    static void invalid_code_point(::boost::uint16_t val)
0384    {
0385 #ifndef BOOST_NO_STD_LOCALE
0386       std::stringstream ss;
0387       ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
0388       std::out_of_range e(ss.str());
0389 #else
0390       std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
0391 #endif
0392       boost::throw_exception(e);
0393    }
0394    void extract_current()const
0395    {
0396       m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
0397       // if the last value is a high surrogate then adjust m_position and m_value as needed:
0398       if(detail::is_high_surrogate(*m_position))
0399       {
0400          // precondition; next value must have be a low-surrogate:
0401          BaseIterator next(m_position);
0402          ::boost::uint16_t t = *++next;
0403          if((t & 0xFC00u) != 0xDC00u)
0404             invalid_code_point(t);
0405          m_value = (m_value - detail::high_surrogate_base) << 10;
0406          m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
0407       }
0408       // postcondition; result must not be a surrogate:
0409       if(detail::is_surrogate(m_value))
0410          invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
0411    }
0412    BaseIterator m_position;
0413    mutable U32Type m_value;
0414 };
0415 
0416 template <class BaseIterator, class U8Type = ::boost::uint8_t>
0417 class u32_to_u8_iterator
0418 {
0419 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
0420    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0421 
0422    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
0423    BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
0424 #endif
0425 
0426 public:
0427    typedef std::ptrdiff_t     difference_type;
0428    typedef U8Type             value_type;
0429    typedef value_type const*  pointer;
0430    typedef value_type const   reference;
0431    typedef std::bidirectional_iterator_tag iterator_category;
0432 
0433    reference operator*()const
0434    {
0435       if(m_current == 4)
0436          extract_current();
0437       return m_values[m_current];
0438    }
0439    bool operator==(const u32_to_u8_iterator& that)const
0440    {
0441       if(m_position == that.m_position)
0442       {
0443          // either the m_current's must be equal, or one must be 0 and 
0444          // the other 4: which means neither must have bits 1 or 2 set:
0445          return (m_current == that.m_current)
0446             || (((m_current | that.m_current) & 3) == 0);
0447       }
0448       return false;
0449    }
0450    bool operator!=(const u32_to_u8_iterator& that)const
0451    {
0452       return !(*this == that);
0453    }
0454    u32_to_u8_iterator& operator++()
0455    {
0456       // if we have a pending read then read now, so that we know whether
0457       // to skip a position, or move to a low-surrogate:
0458       if(m_current == 4)
0459       {
0460          // pending read:
0461          extract_current();
0462       }
0463       // move to the next surrogate position:
0464       ++m_current;
0465       // if we've reached the end skip a position:
0466       if(m_values[m_current] == 0)
0467       {
0468          m_current = 4;
0469          ++m_position;
0470       }
0471       return *this;
0472    }
0473    u32_to_u8_iterator operator++(int)
0474    {
0475       u32_to_u8_iterator r(*this);
0476       ++(*this);
0477       return r;
0478    }
0479    u32_to_u8_iterator& operator--()
0480    {
0481       if((m_current & 3) == 0)
0482       {
0483          --m_position;
0484          extract_current();
0485          m_current = 3;
0486          while(m_current && (m_values[m_current] == 0))
0487             --m_current;
0488       }
0489       else
0490          --m_current;
0491       return *this;
0492    }
0493    u32_to_u8_iterator operator--(int)
0494    {
0495       u32_to_u8_iterator r(*this);
0496       --(*this);
0497       return r;
0498    }
0499    BaseIterator base()const
0500    {
0501       return m_position;
0502    }
0503    // construct:
0504    u32_to_u8_iterator() : m_position(), m_current(0)
0505    {
0506       m_values[0] = 0;
0507       m_values[1] = 0;
0508       m_values[2] = 0;
0509       m_values[3] = 0;
0510       m_values[4] = 0;
0511    }
0512    u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
0513    {
0514       m_values[0] = 0;
0515       m_values[1] = 0;
0516       m_values[2] = 0;
0517       m_values[3] = 0;
0518       m_values[4] = 0;
0519    }
0520 private:
0521 
0522    void extract_current()const
0523    {
0524       boost::uint32_t c = *m_position;
0525       if(c > 0x10FFFFu)
0526          detail::invalid_utf32_code_point(c);
0527       if(c < 0x80u)
0528       {
0529          m_values[0] = static_cast<unsigned char>(c);
0530          m_values[1] = static_cast<unsigned char>(0u);
0531          m_values[2] = static_cast<unsigned char>(0u);
0532          m_values[3] = static_cast<unsigned char>(0u);
0533       }
0534       else if(c < 0x800u)
0535       {
0536          m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
0537          m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0538          m_values[2] = static_cast<unsigned char>(0u);
0539          m_values[3] = static_cast<unsigned char>(0u);
0540       }
0541       else if(c < 0x10000u)
0542       {
0543          m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
0544          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0545          m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0546          m_values[3] = static_cast<unsigned char>(0u);
0547       }
0548       else
0549       {
0550          m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
0551          m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
0552          m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0553          m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0554       }
0555       m_current= 0;
0556    }
0557    BaseIterator m_position;
0558    mutable U8Type m_values[5];
0559    mutable unsigned m_current;
0560 };
0561 
0562 template <class BaseIterator, class U32Type = ::boost::uint32_t>
0563 class u8_to_u32_iterator
0564 {
0565    // special values for pending iterator reads:
0566    BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
0567 
0568 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
0569    typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
0570 
0571    BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
0572    BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
0573 #endif
0574 
0575 public:
0576    typedef std::ptrdiff_t     difference_type;
0577    typedef U32Type            value_type;
0578    typedef value_type const*  pointer;
0579    typedef value_type const   reference;
0580    typedef std::bidirectional_iterator_tag iterator_category;
0581 
0582    reference operator*()const
0583    {
0584       if(m_value == pending_read)
0585          extract_current();
0586       return m_value;
0587    }
0588    bool operator==(const u8_to_u32_iterator& that)const
0589    {
0590       return m_position == that.m_position;
0591    }
0592    bool operator!=(const u8_to_u32_iterator& that)const
0593    {
0594       return !(*this == that);
0595    }
0596    u8_to_u32_iterator& operator++()
0597    {
0598       // We must not start with a continuation character:
0599       if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
0600          invalid_sequence();
0601       // skip high surrogate first if there is one:
0602       unsigned c = detail::utf8_byte_count(*m_position);
0603       if(m_value == pending_read)
0604       {
0605          // Since we haven't read in a value, we need to validate the code points:
0606          for(unsigned i = 0; i < c; ++i)
0607          {
0608             ++m_position;
0609             // We must have a continuation byte:
0610             if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
0611                invalid_sequence();
0612          }
0613       }
0614       else
0615       {
0616          std::advance(m_position, c);
0617       }
0618       m_value = pending_read;
0619       return *this;
0620    }
0621    u8_to_u32_iterator operator++(int)
0622    {
0623       u8_to_u32_iterator r(*this);
0624       ++(*this);
0625       return r;
0626    }
0627    u8_to_u32_iterator& operator--()
0628    {
0629       // Keep backtracking until we don't have a trailing character:
0630       unsigned count = 0;
0631       while((*--m_position & 0xC0u) == 0x80u) ++count;
0632       // now check that the sequence was valid:
0633       if(count != detail::utf8_trailing_byte_count(*m_position))
0634          invalid_sequence();
0635       m_value = pending_read;
0636       return *this;
0637    }
0638    u8_to_u32_iterator operator--(int)
0639    {
0640       u8_to_u32_iterator r(*this);
0641       --(*this);
0642       return r;
0643    }
0644    BaseIterator base()const
0645    {
0646       return m_position;
0647    }
0648    // construct:
0649    u8_to_u32_iterator() : m_position()
0650    {
0651       m_value = pending_read;
0652    }
0653    u8_to_u32_iterator(BaseIterator b) : m_position(b)
0654    {
0655       m_value = pending_read;
0656    }
0657    //
0658    // Checked constructor:
0659    //
0660    u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
0661    {
0662       m_value = pending_read;
0663       //
0664       // We must not start with a continuation character, or end with a 
0665       // truncated UTF-8 sequence otherwise we run the risk of going past
0666       // the start/end of the underlying sequence:
0667       //
0668       if(start != end)
0669       {
0670          unsigned char v = *start;
0671          if((v & 0xC0u) == 0x80u)
0672             invalid_sequence();
0673          if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
0674             invalid_sequence();
0675          BaseIterator pos = end;
0676          do
0677          {
0678             v = *--pos;
0679          }
0680          while((start != pos) && ((v & 0xC0u) == 0x80u));
0681          std::ptrdiff_t extra = detail::utf8_byte_count(v);
0682          if(std::distance(pos, end) < extra)
0683             invalid_sequence();
0684       }
0685    }
0686 private:
0687    static void invalid_sequence()
0688    {
0689       std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
0690       boost::throw_exception(e);
0691    }
0692    void extract_current()const
0693    {
0694       m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
0695       // we must not have a continuation character:
0696       if((m_value & 0xC0u) == 0x80u)
0697          invalid_sequence();
0698       // see how many extra bytes we have:
0699       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
0700       // extract the extra bits, 6 from each extra byte:
0701       BaseIterator next(m_position);
0702       for(unsigned c = 0; c < extra; ++c)
0703       {
0704          ++next;
0705          m_value <<= 6;
0706          // We must have a continuation byte:
0707          if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
0708             invalid_sequence();
0709          m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
0710       }
0711       // we now need to remove a few of the leftmost bits, but how many depends
0712       // upon how many extra bytes we've extracted:
0713       static const boost::uint32_t masks[4] = 
0714       {
0715          0x7Fu,
0716          0x7FFu,
0717          0xFFFFu,
0718          0x1FFFFFu,
0719       };
0720       m_value &= masks[extra];
0721       // check the result is in range:
0722       if(m_value > static_cast<U32Type>(0x10FFFFu))
0723          invalid_sequence();
0724       // The result must not be a surrogate:
0725       if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
0726          invalid_sequence();
0727       // We should not have had an invalidly encoded UTF8 sequence:
0728       if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
0729          invalid_sequence();
0730    }
0731    BaseIterator m_position;
0732    mutable U32Type m_value;
0733 };
0734 
0735 template <class BaseIterator>
0736 class utf16_output_iterator
0737 {
0738 public:
0739    typedef void                                   difference_type;
0740    typedef void                                   value_type;
0741    typedef boost::uint32_t*                       pointer;
0742    typedef boost::uint32_t&                       reference;
0743    typedef std::output_iterator_tag               iterator_category;
0744 
0745    utf16_output_iterator(const BaseIterator& b)
0746       : m_position(b){}
0747    utf16_output_iterator(const utf16_output_iterator& that)
0748       : m_position(that.m_position){}
0749    utf16_output_iterator& operator=(const utf16_output_iterator& that)
0750    {
0751       m_position = that.m_position;
0752       return *this;
0753    }
0754    const utf16_output_iterator& operator*()const
0755    {
0756       return *this;
0757    }
0758    void operator=(boost::uint32_t val)const
0759    {
0760       push(val);
0761    }
0762    utf16_output_iterator& operator++()
0763    {
0764       return *this;
0765    }
0766    utf16_output_iterator& operator++(int)
0767    {
0768       return *this;
0769    }
0770    BaseIterator base()const
0771    {
0772       return m_position;
0773    }
0774 private:
0775    void push(boost::uint32_t v)const
0776    {
0777       if(v >= 0x10000u)
0778       {
0779          // begin by checking for a code point out of range:
0780          if(v > 0x10FFFFu)
0781             detail::invalid_utf32_code_point(v);
0782          // split into two surrogates:
0783          *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
0784          *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
0785       }
0786       else
0787       {
0788          // 16-bit code point:
0789          // value must not be a surrogate:
0790          if(detail::is_surrogate(v))
0791             detail::invalid_utf32_code_point(v);
0792          *m_position++ = static_cast<boost::uint16_t>(v);
0793       }
0794    }
0795    mutable BaseIterator m_position;
0796 };
0797 
0798 template <class BaseIterator>
0799 class utf8_output_iterator
0800 {
0801 public:
0802    typedef void                                   difference_type;
0803    typedef void                                   value_type;
0804    typedef boost::uint32_t*                       pointer;
0805    typedef boost::uint32_t&                       reference;
0806    typedef std::output_iterator_tag               iterator_category;
0807 
0808    utf8_output_iterator(const BaseIterator& b)
0809       : m_position(b){}
0810    utf8_output_iterator(const utf8_output_iterator& that)
0811       : m_position(that.m_position){}
0812    utf8_output_iterator& operator=(const utf8_output_iterator& that)
0813    {
0814       m_position = that.m_position;
0815       return *this;
0816    }
0817    const utf8_output_iterator& operator*()const
0818    {
0819       return *this;
0820    }
0821    void operator=(boost::uint32_t val)const
0822    {
0823       push(val);
0824    }
0825    utf8_output_iterator& operator++()
0826    {
0827       return *this;
0828    }
0829    utf8_output_iterator& operator++(int)
0830    {
0831       return *this;
0832    }
0833    BaseIterator base()const
0834    {
0835       return m_position;
0836    }
0837 private:
0838    void push(boost::uint32_t c)const
0839    {
0840       if(c > 0x10FFFFu)
0841          detail::invalid_utf32_code_point(c);
0842       if(c < 0x80u)
0843       {
0844          *m_position++ = static_cast<unsigned char>(c);
0845       }
0846       else if(c < 0x800u)
0847       {
0848          *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
0849          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0850       }
0851       else if(c < 0x10000u)
0852       {
0853          *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
0854          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0855          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0856       }
0857       else
0858       {
0859          *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
0860          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
0861          *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
0862          *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
0863       }
0864    }
0865    mutable BaseIterator m_position;
0866 };
0867 
0868 } // namespace boost
0869 
0870 #endif // BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
0871