parser/tokeniser/re_tokeniser.hpp

0001 // tokeniser.hpp
0002 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
0003 //
0004 // Distributed under the Boost Software License, Version 1.0. (See accompanying
0005 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0006 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HPP
0007 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HPP
0008
0009 // memcpy()
0010 #include <cstring>
0011 #include <map>
0012 #include "num_token.hpp"
0013 #include "../../runtime_error.hpp"
0014 #include "../../size_t.hpp"
0015 #include <sstream>
0016 #include "../../string_token.hpp"
0017 #include "re_tokeniser_helper.hpp"
0018
0019 namespace boost
0020 {
0021 namespace lexer
0022 {
0023 namespace detail
0024 {
0025 template<typename CharT>
0026 class basic_re_tokeniser
0027 {
0028 public:
0029     typedef basic_num_token<CharT> num_token;
0030     typedef basic_re_tokeniser_state<CharT> state;
0031     typedef basic_string_token<CharT> string_token;
0032     typedef typename string_token::string string;
0033     typedef std::map<string_token, std::size_t> token_map;
0034     typedef std::pair<string_token, std::size_t> token_pair;
0035
0036     static void next (state &state_, token_map &map_, num_token &token_)
0037     {
0038         CharT ch_ = 0;
0039         bool eos_ = state_.next (ch_);
0040
0041         token_.min_max (0, false, 0);
0042
0043         while (!eos_ && ch_ == '"')
0044         {
0045             state_._in_string ^= 1;
0046             eos_ = state_.next (ch_);
0047         }
0048
0049         if (eos_)
0050         {
0051             if (state_._in_string)
0052             {
0053                 throw runtime_error ("Unexpected end of regex "
0054                     "(missing '\"').");
0055             }
0056
0057             if (state_._paren_count)
0058             {
0059                 throw runtime_error ("Unexpected end of regex "
0060                     "(missing ')').");
0061             }
0062
0063             token_.set (num_token::END, null_token);
0064         }
0065         else
0066         {
0067             if (ch_ == '\\')
0068             {
0069                 // Even if we are in a string, respect escape sequences...
0070                 escape (state_, map_, token_);
0071             }
0072             else if (state_._in_string)
0073             {
0074                 // All other meta characters lose their special meaning
0075                 // inside a string.
0076                 create_charset_token (string (1, ch_), false, map_, token_);
0077             }
0078             else
0079             {
0080                 // Not an escape sequence and not inside a string, so
0081                 // check for meta characters.
0082                 switch (ch_)
0083                 {
0084                 case '(':
0085                     token_.set (num_token::OPENPAREN, null_token);
0086                     ++state_._paren_count;
0087                     read_options (state_);
0088                     break;
0089                 case ')':
0090                     --state_._paren_count;
0091
0092                     if (state_._paren_count < 0)
0093                     {
0094                         std::ostringstream ss_;
0095
0096                         ss_ << "Number of open parenthesis < 0 at index " <<
0097                             state_.index () - 1 << '.';
0098                         throw runtime_error (ss_.str ().c_str ());
0099                     }
0100
0101                     token_.set (num_token::CLOSEPAREN, null_token);
0102
0103                     if (!state_._flags_stack.empty ())
0104                     {
0105                         state_._flags = state_._flags_stack.top ();
0106                         state_._flags_stack.pop ();
0107                     }
0108                     break;
0109                 case '?':
0110                     if (!state_.eos () && *state_._curr == '?')
0111                     {
0112                         token_.set (num_token::AOPT, null_token);
0113                         state_.increment ();
0114                     }
0115                     else
0116                     {
0117                         token_.set (num_token::OPT, null_token);
0118                     }
0119
0120                     break;
0121                 case '*':
0122                     if (!state_.eos () && *state_._curr == '?')
0123                     {
0124                         token_.set (num_token::AZEROORMORE, null_token);
0125                         state_.increment ();
0126                     }
0127                     else
0128                     {
0129                         token_.set (num_token::ZEROORMORE, null_token);
0130                     }
0131
0132                     break;
0133                 case '+':
0134                     if (!state_.eos () && *state_._curr == '?')
0135                     {
0136                         token_.set (num_token::AONEORMORE, null_token);
0137                         state_.increment ();
0138                     }
0139                     else
0140                     {
0141                         token_.set (num_token::ONEORMORE, null_token);
0142                     }
0143
0144                     break;
0145                 case '{':
0146                     open_curly (state_, token_);
0147                     break;
0148                 case '|':
0149                     token_.set (num_token::OR, null_token);
0150                     break;
0151                 case '^':
0152                     if (state_._curr - 1 == state_._start)
0153                     {
0154                         token_.set (num_token::CHARSET, bol_token);
0155                         state_._seen_BOL_assertion = true;
0156                     }
0157                     else
0158                     {
0159                         create_charset_token (string (1, ch_), false,
0160                             map_, token_);
0161                     }
0162
0163                     break;
0164                 case '$':
0165                     if (state_._curr == state_._end)
0166                     {
0167                         token_.set (num_token::CHARSET, eol_token);
0168                         state_._seen_EOL_assertion = true;
0169                     }
0170                     else
0171                     {
0172                         create_charset_token (string (1, ch_), false,
0173                             map_, token_);
0174                     }
0175
0176                     break;
0177                 case '.':
0178                 {
0179                     string dot_;
0180
0181                     if (state_._flags & dot_not_newline)
0182                     {
0183                         dot_ = '\n';
0184                     }
0185
0186                     create_charset_token (dot_, true, map_, token_);
0187                     break;
0188                 }
0189                 case '[':
0190                 {
0191                     charset (state_, map_, token_);
0192                     break;
0193                 }
0194                 case '/':
0195                     throw runtime_error("Lookahead ('/') is not supported yet.");
0196                     break;
0197                 default:
0198                     if ((state_._flags & icase) &&
0199                         (std::isupper (ch_, state_._locale) ||
0200                         std::islower (ch_, state_._locale)))
0201                     {
0202                         CharT upper_ = std::toupper (ch_, state_._locale);
0203                         CharT lower_ = std::tolower (ch_, state_._locale);
0204
0205                         string str_ (1, upper_);
0206
0207                         str_ += lower_;
0208                         create_charset_token (str_, false, map_, token_);
0209                     }
0210                     else
0211                     {
0212                         create_charset_token (string (1, ch_), false,
0213                             map_, token_);
0214                     }
0215
0216                     break;
0217                 }
0218             }
0219         }
0220     }
0221
0222 private:
0223     typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
0224
0225     static void read_options (state &state_)
0226     {
0227         if (!state_.eos () && *state_._curr == '?')
0228         {
0229             CharT ch_ = 0;
0230             bool eos_ = false;
0231             bool negate_ = false;
0232
0233             state_.increment ();
0234             eos_ = state_.next (ch_);
0235             state_._flags_stack.push (state_._flags);
0236
0237             while (!eos_ && ch_ != ':')
0238             {
0239                 switch (ch_)
0240                 {
0241                 case '-':
0242                     negate_ ^= 1;
0243                     break;
0244                 case 'i':
0245                     if (negate_)
0246                     {
0247                         state_._flags = static_cast<regex_flags>
0248                             (state_._flags & ~icase);
0249                     }
0250                     else
0251                     {
0252                         state_._flags = static_cast<regex_flags>
0253                             (state_._flags | icase);
0254                     }
0255
0256                     negate_ = false;
0257                     break;
0258                 case 's':
0259                     if (negate_)
0260                     {
0261                         state_._flags = static_cast<regex_flags>
0262                             (state_._flags | dot_not_newline);
0263                     }
0264                     else
0265                     {
0266                         state_._flags = static_cast<regex_flags>
0267                             (state_._flags & ~dot_not_newline);
0268                     }
0269
0270                     negate_ = false;
0271                     break;
0272                 default:
0273                 {
0274                     std::ostringstream ss_;
0275
0276                     ss_ << "Unknown option at index " <<
0277                         state_.index () - 1 << '.';
0278                     throw runtime_error (ss_.str ().c_str ());
0279                 }
0280                 }
0281
0282                 eos_ = state_.next (ch_);
0283             }
0284
0285             // End of string handler will handle early termination
0286         }
0287         else if (!state_._flags_stack.empty ())
0288         {
0289             state_._flags_stack.push (state_._flags);
0290         }
0291     }
0292
0293     static void escape (state &state_, token_map &map_, num_token &token_)
0294     {
0295         CharT ch_ = 0;
0296         std::size_t str_len_ = 0;
0297         const CharT *str_ = tokeniser_helper::escape_sequence (state_,
0298             ch_, str_len_);
0299
0300         if (str_)
0301         {
0302             state state2_ (str_ + 1, str_ + str_len_, state_._flags,
0303                 state_._locale);
0304
0305             charset (state2_, map_, token_);
0306         }
0307         else
0308         {
0309             create_charset_token (string (1, ch_), false, map_, token_);
0310         }
0311     }
0312
0313     static void charset (state &state_, token_map &map_, num_token &token_)
0314     {
0315         string chars_;
0316         bool negated_ = false;
0317
0318         tokeniser_helper::charset (state_, chars_, negated_);
0319         create_charset_token (chars_, negated_, map_, token_);
0320     }
0321
0322     static void create_charset_token (const string &charset_,
0323         const bool negated_, token_map &map_, num_token &token_)
0324     {
0325         std::size_t id_ = null_token;
0326         string_token stok_ (negated_, charset_);
0327
0328         stok_.remove_duplicates ();
0329         stok_.normalise ();
0330
0331         typename token_map::const_iterator iter_ = map_.find (stok_);
0332
0333         if (iter_ == map_.end ())
0334         {
0335             id_ = map_.size ();
0336             map_.insert (token_pair (stok_, id_));
0337         }
0338         else
0339         {
0340             id_ = iter_->second;
0341         }
0342
0343         token_.set (num_token::CHARSET, id_);
0344     }
0345
0346     static void open_curly (state &state_, num_token &token_)
0347     {
0348         if (state_.eos ())
0349         {
0350             throw runtime_error ("Unexpected end of regex "
0351                 "(missing '}').");
0352         }
0353         else if (*state_._curr >= '0' && *state_._curr <= '9')
0354         {
0355             repeat_n (state_, token_);
0356
0357             if (!state_.eos () && *state_._curr == '?')
0358             {
0359                 token_._type = num_token::AREPEATN;
0360                 state_.increment ();
0361             }
0362         }
0363         else
0364         {
0365             macro (state_, token_);
0366         }
0367     }
0368
0369     // SYNTAX:
0370     //   {n[,[n]]}
0371     // SEMANTIC RULES:
0372     //   {0} - INVALID (throw exception)
0373     //   {0,} = *
0374     //   {0,0} - INVALID (throw exception)
0375     //   {0,1} = ?
0376     //   {1,} = +
0377     //   {min,max} where min == max - {min}
0378     //   {min,max} where max < min - INVALID (throw exception)
0379     static void repeat_n (state &state_, num_token &token_)
0380     {
0381         CharT ch_ = 0;
0382         bool eos_ = state_.next (ch_);
0383
0384         while (!eos_ && ch_ >= '0' && ch_ <= '9')
0385         {
0386             token_._min *= 10;
0387             token_._min += ch_ - '0';
0388             eos_ = state_.next (ch_);
0389         }
0390
0391         if (eos_)
0392         {
0393             throw runtime_error ("Unexpected end of regex "
0394                 "(missing '}').");
0395         }
0396
0397         bool min_max_ = false;
0398         bool repeatn_ = true;
0399
0400         token_._comma = ch_ == ',';
0401
0402         if (token_._comma)
0403         {
0404             eos_ = state_.next (ch_);
0405
0406             if (eos_)
0407             {
0408                 throw runtime_error ("Unexpected end of regex "
0409                     "(missing '}').");
0410             }
0411
0412             if (ch_ == '}')
0413             {
0414                 // Small optimisation: Check for '*' equivalency.
0415                 if (token_._min == 0)
0416                 {
0417                     token_.set (num_token::ZEROORMORE, null_token);
0418                     repeatn_ = false;
0419                 }
0420                 // Small optimisation: Check for '+' equivalency.
0421                 else if (token_._min == 1)
0422                 {
0423                     token_.set (num_token::ONEORMORE, null_token);
0424                     repeatn_ = false;
0425                 }
0426             }
0427             else
0428             {
0429                 if (ch_ < '0' || ch_ > '9')
0430                 {
0431                     std::ostringstream ss_;
0432
0433                     ss_ << "Missing '}' at index " <<
0434                         state_.index () - 1 << '.';
0435                     throw runtime_error (ss_.str ().c_str ());
0436                 }
0437
0438                 min_max_ = true;
0439
0440                 do
0441                 {
0442                     token_._max *= 10;
0443                     token_._max += ch_ - '0';
0444                     eos_ = state_.next (ch_);
0445                 } while (!eos_ && ch_ >= '0' && ch_ <= '9');
0446
0447                 if (eos_)
0448                 {
0449                     throw runtime_error ("Unexpected end of regex "
0450                         "(missing '}').");
0451                 }
0452
0453                 // Small optimisation: Check for '?' equivalency.
0454                 if (token_._min == 0 && token_._max == 1)
0455                 {
0456                     token_.set (num_token::OPT, null_token);
0457                     repeatn_ = false;
0458                 }
0459                 // Small optimisation: if min == max, then min.
0460                 else if (token_._min == token_._max)
0461                 {
0462                     token_._comma = false;
0463                     min_max_ = false;
0464                     token_._max = 0;
0465                 }
0466             }
0467         }
0468
0469         if (ch_ != '}')
0470         {
0471             std::ostringstream ss_;
0472
0473             ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
0474             throw runtime_error (ss_.str ().c_str ());
0475         }
0476
0477         if (repeatn_)
0478         {
0479             // SEMANTIC VALIDATION follows:
0480             // NOTE: {0,} has already become *
0481             // therefore we don't check for a comma.
0482             if (token_._min == 0 && token_._max == 0)
0483             {
0484                 std::ostringstream ss_;
0485
0486                 ss_ << "Cannot have exactly zero repeats preceding index " <<
0487                     state_.index () << '.';
0488                 throw runtime_error (ss_.str ().c_str ());
0489             }
0490
0491             if (min_max_ && token_._max < token_._min)
0492             {
0493                 std::ostringstream ss_;
0494
0495                 ss_ << "Max less than min preceding index " <<
0496                     state_.index () << '.';
0497                 throw runtime_error (ss_.str ().c_str ());
0498             }
0499
0500             token_.set (num_token::REPEATN, null_token);
0501         }
0502     }
0503
0504     static void macro (state &state_, num_token &token_)
0505     {
0506         CharT ch_ = 0;
0507         bool eos_ = false;
0508         const CharT *start_ = state_._curr;
0509
0510         state_.next (ch_);
0511
0512         if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
0513             !(ch_ >= 'a' && ch_ <= 'z'))
0514         {
0515             std::ostringstream ss_;
0516
0517             ss_ << "Invalid MACRO name at index " <<
0518                 state_.index () - 1 << '.';
0519             throw runtime_error (ss_.str ().c_str ());
0520         }
0521
0522         do
0523         {
0524             eos_ = state_.next (ch_);
0525
0526             if (eos_)
0527             {
0528                 throw runtime_error ("Unexpected end of regex "
0529                     "(missing '}').");
0530             }
0531         } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
0532             (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
0533
0534         if (ch_ != '}')
0535         {
0536             std::ostringstream ss_;
0537
0538             ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
0539             throw runtime_error (ss_.str ().c_str ());
0540         }
0541
0542         std::size_t len_ = state_._curr - 1 - start_;
0543
0544         if (len_ > max_macro_len)
0545         {
0546             std::basic_stringstream<CharT> ss_;
0547             std::ostringstream os_;
0548
0549             os_ << "MACRO name '";
0550
0551             while (len_)
0552             {
0553                 os_ << ss_.narrow (*start_++, ' ');
0554                 --len_;
0555             }
0556
0557             os_ << "' too long.";
0558             throw runtime_error (os_.str ());
0559         }
0560
0561         token_.set (num_token::MACRO, null_token);
0562
0563         // Some systems have memcpy in namespace std.
0564         using namespace std;
0565
0566         memcpy (token_._macro, start_, len_ * sizeof (CharT));
0567         token_._macro[len_] = 0;
0568     }
0569 };
0570 }
0571 }
0572 }
0573
0574 #endif