detail/dynamic/parse_charset.hpp

0001 ///////////////////////////////////////////////////////////////////////////////
0002 // parse_charset.hpp
0003 //
0004 //  Copyright 2008 Eric Niebler. Distributed under the Boost
0005 //  Software License, Version 1.0. (See accompanying file
0006 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0007
0008 #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
0009 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
0010
0011 // MS compatible compilers support #pragma once
0012 #if defined(_MSC_VER)
0013 # pragma once
0014 #endif
0015
0016 #include <boost/config.hpp>
0017 #include <boost/integer.hpp>
0018 #include <boost/mpl/bool.hpp>
0019 #include <boost/throw_exception.hpp>
0020 #include <boost/numeric/conversion/converter.hpp>
0021 #include <boost/xpressive/detail/detail_fwd.hpp>
0022 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
0023 #include <boost/xpressive/detail/utility/literals.hpp>
0024 #include <boost/xpressive/detail/utility/chset/chset.hpp>
0025 #include <boost/xpressive/regex_constants.hpp>
0026
0027 namespace boost { namespace xpressive { namespace detail
0028 {
0029
///////////////////////////////////////////////////////////////////////////////
// escape_type
//
//  Tag stored in escape_value::type_ saying which member of the escape_value
//  is meaningful.
//
enum escape_type
{
    escape_char  = 0,   // a single character; see escape_value::ch_
    escape_mark  = 1,   // a numbered mark; see escape_value::mark_nbr_
    escape_class = 2    // a character class; see escape_value::class_
};
0036
0037 ///////////////////////////////////////////////////////////////////////////////
0038 // escape_value
0039 //
0040 template<typename Char, typename Class>
0041 struct escape_value
0042 {
0043     Char ch_;
0044     int mark_nbr_;
0045     Class class_;
0046     escape_type type_;
0047 };
0048
0049 ///////////////////////////////////////////////////////////////////////////////
0050 // char_overflow_handler
0051 //
0052 struct char_overflow_handler
0053 {
0054     void operator ()(numeric::range_check_result result) const // throw(regex_error)
0055     {
0056         if(numeric::cInRange != result)
0057         {
0058             BOOST_THROW_EXCEPTION(
0059                 regex_error(
0060                     regex_constants::error_escape
0061                   , "character escape too large to fit in target character type"
0062                 )
0063             );
0064         }
0065     }
0066 };
0067
0068 ///////////////////////////////////////////////////////////////////////////////
0069 // parse_escape
0070 //
0071 template<typename FwdIter, typename CompilerTraits>
0072 escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
0073 parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr)
0074 {
0075     using namespace regex_constants;
0076     typedef typename iterator_value<FwdIter>::type char_type;
0077     typedef typename CompilerTraits::regex_traits regex_traits;
0078     typedef typename regex_traits::char_class_type char_class_type;
0079
0080     // define an unsigned type the same size as char_type
0081     typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
0082     BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
0083     typedef numeric::conversion_traits<uchar_t, int> converstion_traits;
0084
0085     BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found");
0086     numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
0087     escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
0088     bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
0089     regex_traits const &rxtraits = tr.traits();
0090     FwdIter tmp;
0091
0092     esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
0093     if(0 != esc.class_)
0094     {
0095         esc.type_ = escape_class;
0096         return esc;
0097     }
0098
0099     if(-1 != rxtraits.value(*begin, 8))
0100     {
0101         esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
0102         return esc;
0103     }
0104
0105     switch(*begin)
0106     {
0107     // bell character
0108     case BOOST_XPR_CHAR_(char_type, 'a'):
0109         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
0110         ++begin;
0111         break;
0112     // escape character
0113     case BOOST_XPR_CHAR_(char_type, 'e'):
0114         esc.ch_ = converter(27);
0115         ++begin;
0116         break;
0117     // control character
0118     case BOOST_XPR_CHAR_(char_type, 'c'):
0119         BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
0120         BOOST_XPR_ENSURE_
0121         (
0122             rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
0123          || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
0124           , error_escape
0125           , "invalid escape control letter; must be one of a-z or A-Z"
0126         );
0127         // Convert to character according to ECMA-262, section 15.10.2.10:
0128         esc.ch_ = converter(*begin % 32);
0129         ++begin;
0130         break;
0131     // formfeed character
0132     case BOOST_XPR_CHAR_(char_type, 'f'):
0133         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
0134         ++begin;
0135         break;
0136     // newline
0137     case BOOST_XPR_CHAR_(char_type, 'n'):
0138         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
0139         ++begin;
0140         break;
0141     // return
0142     case BOOST_XPR_CHAR_(char_type, 'r'):
0143         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
0144         ++begin;
0145         break;
0146     // horizontal tab
0147     case BOOST_XPR_CHAR_(char_type, 't'):
0148         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
0149         ++begin;
0150         break;
0151     // vertical tab
0152     case BOOST_XPR_CHAR_(char_type, 'v'):
0153         esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
0154         ++begin;
0155         break;
0156     // hex escape sequence
0157     case BOOST_XPR_CHAR_(char_type, 'x'):
0158         BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
0159         tmp = begin;
0160         esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
0161         BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
0162             "must be \\x HexDigit HexDigit");
0163         break;
0164     // Unicode escape sequence
0165     case BOOST_XPR_CHAR_(char_type, 'u'):
0166         BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
0167         tmp = begin;
0168         esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
0169         BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
0170             "must be \\u HexDigit HexDigit HexDigit HexDigit");
0171         break;
0172     // backslash
0173     case BOOST_XPR_CHAR_(char_type, '\\'):
0174         //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
0175         //++begin;
0176         //break;
0177     // all other escaped characters represent themselves
0178     default:
0179         esc.ch_ = *begin;
0180         ++begin;
0181         break;
0182     }
0183
0184     return esc;
0185 }
0186
0187 //////////////////////////////////////////////////////////////////////////
0188 // parse_charset
0189 //
0190 template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
0191 inline void parse_charset
0192 (
0193     FwdIter &begin
0194   , FwdIter end
0195   , compound_charset<RegexTraits> &chset
0196   , CompilerTraits &tr
0197 )
0198 {
0199     using namespace regex_constants;
0200     typedef typename RegexTraits::char_type char_type;
0201     typedef typename RegexTraits::char_class_type char_class_type;
0202     BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
0203     RegexTraits const &rxtraits = tr.traits();
0204     bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
0205     FwdIter iprev = FwdIter();
0206     escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
0207     bool invert = false;
0208
0209     // check to see if we have an inverse charset
0210     if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end))
0211     {
0212         begin = iprev;
0213         invert = true;
0214     }
0215
0216     // skip the end token if-and-only-if it is the first token in the charset
0217     if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end))
0218     {
0219         for(; begin != iprev; ++begin)
0220         {
0221             chset.set_char(*begin, rxtraits, icase);
0222         }
0223     }
0224
0225     compiler_token_type tok;
0226     char_type ch_prev = char_type(), ch_next = char_type();
0227     bool have_prev = false;
0228
0229     BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
0230
0231     // remember the current position and grab the next token
0232     iprev = begin;
0233     tok = tr.get_charset_token(begin, end);
0234     do
0235     {
0236         BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
0237
0238         if(token_charset_hyphen == tok && have_prev)
0239         {
0240             // remember the current position
0241             FwdIter iprev2 = begin;
0242             have_prev = false;
0243
0244             // ch_prev is lower bound of a range
0245             switch(tr.get_charset_token(begin, end))
0246             {
0247             case token_charset_hyphen:
0248             case token_charset_invert:
0249                 begin = iprev2; // un-get these tokens and fall through
0250                 BOOST_FALLTHROUGH;
0251             case token_literal:
0252                 ch_next = *begin++;
0253                 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
0254                 chset.set_range(ch_prev, ch_next, rxtraits, icase);
0255                 continue;
0256             case token_charset_backspace:
0257                 ch_next = char_type(8); // backspace
0258                 BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
0259                 chset.set_range(ch_prev, ch_next, rxtraits, icase);
0260                 continue;
0261             case token_escape:
0262                 esc = parse_escape(begin, end, tr);
0263                 if(escape_char == esc.type_)
0264                 {
0265                     BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range");
0266                     chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
0267                     continue;
0268                 }
0269                 BOOST_FALLTHROUGH;
0270             case token_charset_end:
0271             default:                // not a range.
0272                 begin = iprev;      // backup to hyphen token
0273                 chset.set_char(ch_prev, rxtraits, icase);
0274                 chset.set_char(*begin++, rxtraits, icase);
0275                 continue;
0276             }
0277         }
0278
0279         if(have_prev)
0280         {
0281             chset.set_char(ch_prev, rxtraits, icase);
0282             have_prev = false;
0283         }
0284
0285         switch(tok)
0286         {
0287         case token_charset_hyphen:
0288         case token_charset_invert:
0289         case token_charset_end:
0290         case token_posix_charset_end:
0291             begin = iprev; // un-get these tokens
0292             ch_prev = *begin++;
0293             have_prev = true;
0294             continue;
0295
0296         case token_charset_backspace:
0297             ch_prev = char_type(8); // backspace
0298             have_prev = true;
0299             continue;
0300
0301         case token_posix_charset_begin:
0302             {
0303                 FwdIter tmp = begin, start = begin;
0304                 bool invert = (token_charset_invert == tr.get_charset_token(tmp, end));
0305                 if(invert)
0306                 {
0307                     begin = start = tmp;
0308                 }
0309                 while(token_literal == (tok = tr.get_charset_token(begin, end)))
0310                 {
0311                     tmp = ++begin;
0312                     BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
0313                 }
0314                 if(token_posix_charset_end == tok)
0315                 {
0316                     char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
0317                     BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name");
0318                     chset.set_class(chclass, invert);
0319                     continue;
0320                 }
0321                 begin = iprev; // un-get this token
0322                 ch_prev = *begin++;
0323                 have_prev = true;
0324             }
0325             continue;
0326
0327         case token_escape:
0328             esc = parse_escape(begin, end, tr);
0329             if(escape_char == esc.type_)
0330             {
0331                 ch_prev = esc.ch_;
0332                 have_prev = true;
0333             }
0334             else if(escape_class == esc.type_)
0335             {
0336                 char_class_type upper_ = lookup_classname(rxtraits, "upper");
0337                 BOOST_ASSERT(0 != upper_);
0338                 chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
0339             }
0340             else
0341             {
0342                 BOOST_ASSERT(false);
0343             }
0344             continue;
0345
0346         default:
0347             ch_prev = *begin++;
0348             have_prev = true;
0349             continue;
0350         }
0351     }
0352     while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
0353           token_charset_end != (tok = tr.get_charset_token(begin, end)));
0354
0355     if(have_prev)
0356     {
0357         chset.set_char(ch_prev, rxtraits, icase);
0358     }
0359
0360     if(invert)
0361     {
0362         chset.inverse();
0363     }
0364 }
0365
0366 }}} // namespace boost::xpressive::detail
0367
0368 #endif