File indexing completed on 2025-01-19 09:47:49
0001
0002
0003
0004
0005
0006 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HPP
0007 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_TOKENISER_RE_TOKENISER_HPP
0008
0009
0010 #include <cstring>
0011 #include <map>
0012 #include "num_token.hpp"
0013 #include "../../runtime_error.hpp"
0014 #include "../../size_t.hpp"
0015 #include <sstream>
0016 #include "../../string_token.hpp"
0017 #include "re_tokeniser_helper.hpp"
0018
0019 namespace boost
0020 {
0021 namespace lexer
0022 {
0023 namespace detail
0024 {
0025 template<typename CharT>
0026 class basic_re_tokeniser
0027 {
0028 public:
0029 typedef basic_num_token<CharT> num_token;
0030 typedef basic_re_tokeniser_state<CharT> state;
0031 typedef basic_string_token<CharT> string_token;
0032 typedef typename string_token::string string;
0033 typedef std::map<string_token, std::size_t> token_map;
0034 typedef std::pair<string_token, std::size_t> token_pair;
0035
0036 static void next (state &state_, token_map &map_, num_token &token_)
0037 {
0038 CharT ch_ = 0;
0039 bool eos_ = state_.next (ch_);
0040
0041 token_.min_max (0, false, 0);
0042
0043 while (!eos_ && ch_ == '"')
0044 {
0045 state_._in_string ^= 1;
0046 eos_ = state_.next (ch_);
0047 }
0048
0049 if (eos_)
0050 {
0051 if (state_._in_string)
0052 {
0053 throw runtime_error ("Unexpected end of regex "
0054 "(missing '\"').");
0055 }
0056
0057 if (state_._paren_count)
0058 {
0059 throw runtime_error ("Unexpected end of regex "
0060 "(missing ')').");
0061 }
0062
0063 token_.set (num_token::END, null_token);
0064 }
0065 else
0066 {
0067 if (ch_ == '\\')
0068 {
0069
0070 escape (state_, map_, token_);
0071 }
0072 else if (state_._in_string)
0073 {
0074
0075
0076 create_charset_token (string (1, ch_), false, map_, token_);
0077 }
0078 else
0079 {
0080
0081
0082 switch (ch_)
0083 {
0084 case '(':
0085 token_.set (num_token::OPENPAREN, null_token);
0086 ++state_._paren_count;
0087 read_options (state_);
0088 break;
0089 case ')':
0090 --state_._paren_count;
0091
0092 if (state_._paren_count < 0)
0093 {
0094 std::ostringstream ss_;
0095
0096 ss_ << "Number of open parenthesis < 0 at index " <<
0097 state_.index () - 1 << '.';
0098 throw runtime_error (ss_.str ().c_str ());
0099 }
0100
0101 token_.set (num_token::CLOSEPAREN, null_token);
0102
0103 if (!state_._flags_stack.empty ())
0104 {
0105 state_._flags = state_._flags_stack.top ();
0106 state_._flags_stack.pop ();
0107 }
0108 break;
0109 case '?':
0110 if (!state_.eos () && *state_._curr == '?')
0111 {
0112 token_.set (num_token::AOPT, null_token);
0113 state_.increment ();
0114 }
0115 else
0116 {
0117 token_.set (num_token::OPT, null_token);
0118 }
0119
0120 break;
0121 case '*':
0122 if (!state_.eos () && *state_._curr == '?')
0123 {
0124 token_.set (num_token::AZEROORMORE, null_token);
0125 state_.increment ();
0126 }
0127 else
0128 {
0129 token_.set (num_token::ZEROORMORE, null_token);
0130 }
0131
0132 break;
0133 case '+':
0134 if (!state_.eos () && *state_._curr == '?')
0135 {
0136 token_.set (num_token::AONEORMORE, null_token);
0137 state_.increment ();
0138 }
0139 else
0140 {
0141 token_.set (num_token::ONEORMORE, null_token);
0142 }
0143
0144 break;
0145 case '{':
0146 open_curly (state_, token_);
0147 break;
0148 case '|':
0149 token_.set (num_token::OR, null_token);
0150 break;
0151 case '^':
0152 if (state_._curr - 1 == state_._start)
0153 {
0154 token_.set (num_token::CHARSET, bol_token);
0155 state_._seen_BOL_assertion = true;
0156 }
0157 else
0158 {
0159 create_charset_token (string (1, ch_), false,
0160 map_, token_);
0161 }
0162
0163 break;
0164 case '$':
0165 if (state_._curr == state_._end)
0166 {
0167 token_.set (num_token::CHARSET, eol_token);
0168 state_._seen_EOL_assertion = true;
0169 }
0170 else
0171 {
0172 create_charset_token (string (1, ch_), false,
0173 map_, token_);
0174 }
0175
0176 break;
0177 case '.':
0178 {
0179 string dot_;
0180
0181 if (state_._flags & dot_not_newline)
0182 {
0183 dot_ = '\n';
0184 }
0185
0186 create_charset_token (dot_, true, map_, token_);
0187 break;
0188 }
0189 case '[':
0190 {
0191 charset (state_, map_, token_);
0192 break;
0193 }
0194 case '/':
0195 throw runtime_error("Lookahead ('/') is not supported yet.");
0196 break;
0197 default:
0198 if ((state_._flags & icase) &&
0199 (std::isupper (ch_, state_._locale) ||
0200 std::islower (ch_, state_._locale)))
0201 {
0202 CharT upper_ = std::toupper (ch_, state_._locale);
0203 CharT lower_ = std::tolower (ch_, state_._locale);
0204
0205 string str_ (1, upper_);
0206
0207 str_ += lower_;
0208 create_charset_token (str_, false, map_, token_);
0209 }
0210 else
0211 {
0212 create_charset_token (string (1, ch_), false,
0213 map_, token_);
0214 }
0215
0216 break;
0217 }
0218 }
0219 }
0220 }
0221
0222 private:
0223 typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
0224
0225 static void read_options (state &state_)
0226 {
0227 if (!state_.eos () && *state_._curr == '?')
0228 {
0229 CharT ch_ = 0;
0230 bool eos_ = false;
0231 bool negate_ = false;
0232
0233 state_.increment ();
0234 eos_ = state_.next (ch_);
0235 state_._flags_stack.push (state_._flags);
0236
0237 while (!eos_ && ch_ != ':')
0238 {
0239 switch (ch_)
0240 {
0241 case '-':
0242 negate_ ^= 1;
0243 break;
0244 case 'i':
0245 if (negate_)
0246 {
0247 state_._flags = static_cast<regex_flags>
0248 (state_._flags & ~icase);
0249 }
0250 else
0251 {
0252 state_._flags = static_cast<regex_flags>
0253 (state_._flags | icase);
0254 }
0255
0256 negate_ = false;
0257 break;
0258 case 's':
0259 if (negate_)
0260 {
0261 state_._flags = static_cast<regex_flags>
0262 (state_._flags | dot_not_newline);
0263 }
0264 else
0265 {
0266 state_._flags = static_cast<regex_flags>
0267 (state_._flags & ~dot_not_newline);
0268 }
0269
0270 negate_ = false;
0271 break;
0272 default:
0273 {
0274 std::ostringstream ss_;
0275
0276 ss_ << "Unknown option at index " <<
0277 state_.index () - 1 << '.';
0278 throw runtime_error (ss_.str ().c_str ());
0279 }
0280 }
0281
0282 eos_ = state_.next (ch_);
0283 }
0284
0285
0286 }
0287 else if (!state_._flags_stack.empty ())
0288 {
0289 state_._flags_stack.push (state_._flags);
0290 }
0291 }
0292
0293 static void escape (state &state_, token_map &map_, num_token &token_)
0294 {
0295 CharT ch_ = 0;
0296 std::size_t str_len_ = 0;
0297 const CharT *str_ = tokeniser_helper::escape_sequence (state_,
0298 ch_, str_len_);
0299
0300 if (str_)
0301 {
0302 state state2_ (str_ + 1, str_ + str_len_, state_._flags,
0303 state_._locale);
0304
0305 charset (state2_, map_, token_);
0306 }
0307 else
0308 {
0309 create_charset_token (string (1, ch_), false, map_, token_);
0310 }
0311 }
0312
0313 static void charset (state &state_, token_map &map_, num_token &token_)
0314 {
0315 string chars_;
0316 bool negated_ = false;
0317
0318 tokeniser_helper::charset (state_, chars_, negated_);
0319 create_charset_token (chars_, negated_, map_, token_);
0320 }
0321
0322 static void create_charset_token (const string &charset_,
0323 const bool negated_, token_map &map_, num_token &token_)
0324 {
0325 std::size_t id_ = null_token;
0326 string_token stok_ (negated_, charset_);
0327
0328 stok_.remove_duplicates ();
0329 stok_.normalise ();
0330
0331 typename token_map::const_iterator iter_ = map_.find (stok_);
0332
0333 if (iter_ == map_.end ())
0334 {
0335 id_ = map_.size ();
0336 map_.insert (token_pair (stok_, id_));
0337 }
0338 else
0339 {
0340 id_ = iter_->second;
0341 }
0342
0343 token_.set (num_token::CHARSET, id_);
0344 }
0345
0346 static void open_curly (state &state_, num_token &token_)
0347 {
0348 if (state_.eos ())
0349 {
0350 throw runtime_error ("Unexpected end of regex "
0351 "(missing '}').");
0352 }
0353 else if (*state_._curr >= '0' && *state_._curr <= '9')
0354 {
0355 repeat_n (state_, token_);
0356
0357 if (!state_.eos () && *state_._curr == '?')
0358 {
0359 token_._type = num_token::AREPEATN;
0360 state_.increment ();
0361 }
0362 }
0363 else
0364 {
0365 macro (state_, token_);
0366 }
0367 }
0368
0369
0370
0371
0372
0373
0374
0375
0376
0377
0378
0379 static void repeat_n (state &state_, num_token &token_)
0380 {
0381 CharT ch_ = 0;
0382 bool eos_ = state_.next (ch_);
0383
0384 while (!eos_ && ch_ >= '0' && ch_ <= '9')
0385 {
0386 token_._min *= 10;
0387 token_._min += ch_ - '0';
0388 eos_ = state_.next (ch_);
0389 }
0390
0391 if (eos_)
0392 {
0393 throw runtime_error ("Unexpected end of regex "
0394 "(missing '}').");
0395 }
0396
0397 bool min_max_ = false;
0398 bool repeatn_ = true;
0399
0400 token_._comma = ch_ == ',';
0401
0402 if (token_._comma)
0403 {
0404 eos_ = state_.next (ch_);
0405
0406 if (eos_)
0407 {
0408 throw runtime_error ("Unexpected end of regex "
0409 "(missing '}').");
0410 }
0411
0412 if (ch_ == '}')
0413 {
0414
0415 if (token_._min == 0)
0416 {
0417 token_.set (num_token::ZEROORMORE, null_token);
0418 repeatn_ = false;
0419 }
0420
0421 else if (token_._min == 1)
0422 {
0423 token_.set (num_token::ONEORMORE, null_token);
0424 repeatn_ = false;
0425 }
0426 }
0427 else
0428 {
0429 if (ch_ < '0' || ch_ > '9')
0430 {
0431 std::ostringstream ss_;
0432
0433 ss_ << "Missing '}' at index " <<
0434 state_.index () - 1 << '.';
0435 throw runtime_error (ss_.str ().c_str ());
0436 }
0437
0438 min_max_ = true;
0439
0440 do
0441 {
0442 token_._max *= 10;
0443 token_._max += ch_ - '0';
0444 eos_ = state_.next (ch_);
0445 } while (!eos_ && ch_ >= '0' && ch_ <= '9');
0446
0447 if (eos_)
0448 {
0449 throw runtime_error ("Unexpected end of regex "
0450 "(missing '}').");
0451 }
0452
0453
0454 if (token_._min == 0 && token_._max == 1)
0455 {
0456 token_.set (num_token::OPT, null_token);
0457 repeatn_ = false;
0458 }
0459
0460 else if (token_._min == token_._max)
0461 {
0462 token_._comma = false;
0463 min_max_ = false;
0464 token_._max = 0;
0465 }
0466 }
0467 }
0468
0469 if (ch_ != '}')
0470 {
0471 std::ostringstream ss_;
0472
0473 ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
0474 throw runtime_error (ss_.str ().c_str ());
0475 }
0476
0477 if (repeatn_)
0478 {
0479
0480
0481
0482 if (token_._min == 0 && token_._max == 0)
0483 {
0484 std::ostringstream ss_;
0485
0486 ss_ << "Cannot have exactly zero repeats preceding index " <<
0487 state_.index () << '.';
0488 throw runtime_error (ss_.str ().c_str ());
0489 }
0490
0491 if (min_max_ && token_._max < token_._min)
0492 {
0493 std::ostringstream ss_;
0494
0495 ss_ << "Max less than min preceding index " <<
0496 state_.index () << '.';
0497 throw runtime_error (ss_.str ().c_str ());
0498 }
0499
0500 token_.set (num_token::REPEATN, null_token);
0501 }
0502 }
0503
0504 static void macro (state &state_, num_token &token_)
0505 {
0506 CharT ch_ = 0;
0507 bool eos_ = false;
0508 const CharT *start_ = state_._curr;
0509
0510 state_.next (ch_);
0511
0512 if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
0513 !(ch_ >= 'a' && ch_ <= 'z'))
0514 {
0515 std::ostringstream ss_;
0516
0517 ss_ << "Invalid MACRO name at index " <<
0518 state_.index () - 1 << '.';
0519 throw runtime_error (ss_.str ().c_str ());
0520 }
0521
0522 do
0523 {
0524 eos_ = state_.next (ch_);
0525
0526 if (eos_)
0527 {
0528 throw runtime_error ("Unexpected end of regex "
0529 "(missing '}').");
0530 }
0531 } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
0532 (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
0533
0534 if (ch_ != '}')
0535 {
0536 std::ostringstream ss_;
0537
0538 ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
0539 throw runtime_error (ss_.str ().c_str ());
0540 }
0541
0542 std::size_t len_ = state_._curr - 1 - start_;
0543
0544 if (len_ > max_macro_len)
0545 {
0546 std::basic_stringstream<CharT> ss_;
0547 std::ostringstream os_;
0548
0549 os_ << "MACRO name '";
0550
0551 while (len_)
0552 {
0553 os_ << ss_.narrow (*start_++, ' ');
0554 --len_;
0555 }
0556
0557 os_ << "' too long.";
0558 throw runtime_error (os_.str ());
0559 }
0560
0561 token_.set (num_token::MACRO, null_token);
0562
0563
0564 using namespace std;
0565
0566 memcpy (token_._macro, start_, len_ * sizeof (CharT));
0567 token_._macro[len_] = 0;
0568 }
0569 };
0570 }
0571 }
0572 }
0573
0574 #endif