File indexing completed on 2024-11-15 09:34:08
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
0011 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
0012
0013
0014 #if defined(_MSC_VER)
0015 # pragma once
0016 #endif
0017
0018 #include <map>
0019 #include <boost/config.hpp>
0020 #include <boost/assert.hpp>
0021 #include <boost/next_prior.hpp>
0022 #include <boost/range/begin.hpp>
0023 #include <boost/range/end.hpp>
0024 #include <boost/mpl/assert.hpp>
0025 #include <boost/throw_exception.hpp>
0026 #include <boost/type_traits/is_same.hpp>
0027 #include <boost/type_traits/is_pointer.hpp>
0028 #include <boost/utility/enable_if.hpp>
0029 #include <boost/iterator/iterator_traits.hpp>
0030 #include <boost/xpressive/basic_regex.hpp>
0031 #include <boost/xpressive/detail/dynamic/parser.hpp>
0032 #include <boost/xpressive/detail/dynamic/parse_charset.hpp>
0033 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
0034 #include <boost/xpressive/detail/dynamic/parser_traits.hpp>
0035 #include <boost/xpressive/detail/core/linker.hpp>
0036 #include <boost/xpressive/detail/core/optimize.hpp>
0037
0038 namespace boost { namespace xpressive
0039 {
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053 template<typename BidiIter, typename RegexTraits, typename CompilerTraits>
0054 struct regex_compiler
0055 {
0056 typedef BidiIter iterator_type;
0057 typedef typename iterator_value<BidiIter>::type char_type;
0058 typedef regex_constants::syntax_option_type flag_type;
0059 typedef RegexTraits traits_type;
0060 typedef typename traits_type::string_type string_type;
0061 typedef typename traits_type::locale_type locale_type;
0062 typedef typename traits_type::char_class_type char_class_type;
0063
0064 explicit regex_compiler(RegexTraits const &traits = RegexTraits())
0065 : mark_count_(0)
0066 , hidden_mark_count_(0)
0067 , traits_(traits)
0068 , upper_(0)
0069 , self_()
0070 , rules_()
0071 {
0072 this->upper_ = lookup_classname(this->rxtraits(), "upper");
0073 }
0074
0075
0076
0077
0078
0079
0080
0081 locale_type imbue(locale_type loc)
0082 {
0083 locale_type oldloc = this->traits_.imbue(loc);
0084 this->upper_ = lookup_classname(this->rxtraits(), "upper");
0085 return oldloc;
0086 }
0087
0088
0089
0090
0091
0092
0093 locale_type getloc() const
0094 {
0095 return this->traits_.getloc();
0096 }
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116 template<typename InputIter>
0117 basic_regex<BidiIter>
0118 compile(InputIter begin, InputIter end, flag_type flags = regex_constants::ECMAScript)
0119 {
0120 typedef typename iterator_category<InputIter>::type category;
0121 return this->compile_(begin, end, flags, category());
0122 }
0123
0124
0125
0126 template<typename InputRange>
0127 typename disable_if<is_pointer<InputRange>, basic_regex<BidiIter> >::type
0128 compile(InputRange const &pat, flag_type flags = regex_constants::ECMAScript)
0129 {
0130 return this->compile(boost::begin(pat), boost::end(pat), flags);
0131 }
0132
0133
0134
0135 basic_regex<BidiIter>
0136 compile(char_type const *begin, flag_type flags = regex_constants::ECMAScript)
0137 {
0138 BOOST_ASSERT(0 != begin);
0139 char_type const *end = begin + std::char_traits<char_type>::length(begin);
0140 return this->compile(begin, end, flags);
0141 }
0142
0143
0144
0145 basic_regex<BidiIter> compile(char_type const *begin, std::size_t size, flag_type flags)
0146 {
0147 BOOST_ASSERT(0 != begin);
0148 char_type const *end = begin + size;
0149 return this->compile(begin, end, flags);
0150 }
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161 basic_regex<BidiIter> &operator [](string_type const &name)
0162 {
0163 BOOST_ASSERT(!name.empty());
0164 return this->rules_[name];
0165 }
0166
0167
0168
0169 basic_regex<BidiIter> const &operator [](string_type const &name) const
0170 {
0171 BOOST_ASSERT(!name.empty());
0172 return this->rules_[name];
0173 }
0174
0175 private:
0176
0177 typedef detail::escape_value<char_type, char_class_type> escape_value;
0178 typedef detail::alternate_matcher<detail::alternates_vector<BidiIter>, RegexTraits> alternate_matcher;
0179
0180
0181
0182
0183 template<typename FwdIter>
0184 basic_regex<BidiIter> compile_(FwdIter begin, FwdIter end, flag_type flags, std::forward_iterator_tag)
0185 {
0186 BOOST_MPL_ASSERT((is_same<char_type, typename iterator_value<FwdIter>::type>));
0187 using namespace regex_constants;
0188 this->reset();
0189 this->traits_.flags(flags);
0190
0191 basic_regex<BidiIter> rextmp, *prex = &rextmp;
0192 FwdIter tmp = begin;
0193
0194
0195 string_type name;
0196 if(token_group_begin == this->traits_.get_token(tmp, end) &&
0197 BOOST_XPR_ENSURE_(tmp != end, error_paren, "mismatched parenthesis") &&
0198 token_rule_assign == this->traits_.get_group_type(tmp, end, name))
0199 {
0200 begin = tmp;
0201 BOOST_XPR_ENSURE_
0202 (
0203 begin != end && token_group_end == this->traits_.get_token(begin, end)
0204 , error_paren
0205 , "mismatched parenthesis"
0206 );
0207 prex = &this->rules_[name];
0208 }
0209
0210 this->self_ = detail::core_access<BidiIter>::get_regex_impl(*prex);
0211
0212
0213 detail::sequence<BidiIter> seq = this->parse_alternates(begin, end);
0214 BOOST_XPR_ENSURE_(begin == end, error_paren, "mismatched parenthesis");
0215
0216
0217 seq += detail::make_dynamic<BidiIter>(detail::end_matcher());
0218
0219
0220 detail::common_compile(seq.xpr().matchable(), *this->self_, this->rxtraits());
0221
0222 this->self_->traits_ = new detail::traits_holder<RegexTraits>(this->rxtraits());
0223 this->self_->mark_count_ = this->mark_count_;
0224 this->self_->hidden_mark_count_ = this->hidden_mark_count_;
0225
0226
0227 this->self_->tracking_update();
0228 this->self_.reset();
0229 return *prex;
0230 }
0231
0232
0233
0234
0235 template<typename InputIter>
0236 basic_regex<BidiIter> compile_(InputIter begin, InputIter end, flag_type flags, std::input_iterator_tag)
0237 {
0238 string_type pat(begin, end);
0239 return this->compile_(boost::begin(pat), boost::end(pat), flags, std::forward_iterator_tag());
0240 }
0241
0242
0243
0244
0245 void reset()
0246 {
0247 this->mark_count_ = 0;
0248 this->hidden_mark_count_ = 0;
0249 this->traits_.flags(regex_constants::ECMAScript);
0250 }
0251
0252
0253
0254
0255 traits_type &rxtraits()
0256 {
0257 return this->traits_.traits();
0258 }
0259
0260
0261
0262
0263 traits_type const &rxtraits() const
0264 {
0265 return this->traits_.traits();
0266 }
0267
0268
0269
0270
0271 template<typename FwdIter>
0272 detail::sequence<BidiIter> parse_alternates(FwdIter &begin, FwdIter end)
0273 {
0274 using namespace regex_constants;
0275 int count = 0;
0276 FwdIter tmp = begin;
0277 detail::sequence<BidiIter> seq;
0278
0279 do switch(++count)
0280 {
0281 case 1:
0282 seq = this->parse_sequence(tmp, end);
0283 break;
0284 case 2:
0285 seq = detail::make_dynamic<BidiIter>(alternate_matcher()) | seq;
0286 BOOST_FALLTHROUGH;
0287 default:
0288 seq |= this->parse_sequence(tmp, end);
0289 }
0290 while((begin = tmp) != end && token_alternate == this->traits_.get_token(tmp, end));
0291
0292 return seq;
0293 }
0294
0295 // parse_group: parses the remainder of a group. parse_atom calls this after it has
0296 // consumed token_group_begin. The group type reported by the traits selects the handling.
0297 // begin is in/out and is advanced past everything consumed; returns the resulting sequence.
0298 template<typename FwdIter>
0299 detail::sequence<BidiIter> parse_group(FwdIter &begin, FwdIter end)
0300 {
0301 using namespace regex_constants;
0302 int mark_nbr = 0; // only assigned for capturing groups and named back-references
0303 bool keeper = false; // set for token_independent_sub_expression
0304 bool lookahead = false; // set for either lookahead token
0305 bool lookbehind = false; // set for either lookbehind token
0306 bool negative = false; // set for the negative lookahead/lookbehind tokens
0307 string_type name; // filled in by get_group_type for the named group kinds
0308 // seq collects the group contents; seq_end stays empty unless the group captures.
0309 detail::sequence<BidiIter> seq, seq_end;
0310 FwdIter tmp = FwdIter();
0311 // Remember the flags on entry; they are put back just before the normal return below.
0312 syntax_option_type old_flags = this->traits_.flags();
0313
0314 switch(this->traits_.get_group_type(begin, end, name))
0315 {
0316 case token_no_mark:
0317 // Peek at the next token through a copy. If the group closes immediately it has no body:
0318 // commit the peeked position and return the atom that follows (flags are not restored here).
0319 if(token_group_end == this->traits_.get_token(tmp = begin, end))
0320 {
0321 return this->parse_atom(begin = tmp, end);
0322 }
0323 break;
0324
0325 case token_negative_lookahead:
0326 negative = true;
0327 BOOST_FALLTHROUGH;
0328 case token_positive_lookahead:
0329 lookahead = true;
0330 break;
0331
0332 case token_negative_lookbehind:
0333 negative = true;
0334 BOOST_FALLTHROUGH;
0335 case token_positive_lookbehind:
0336 lookbehind = true;
0337 break;
0338
0339 case token_independent_sub_expression:
0340 keeper = true;
0341 break;
0342
0343 case token_comment: // skip tokens up to the closing token; running off the end is an error
0344 while(BOOST_XPR_ENSURE_(begin != end, error_paren, "mismatched parenthesis"))
0345 {
0346 switch(this->traits_.get_token(begin, end))
0347 {
0348 case token_group_end: // comment finished: the result is whatever atom comes next
0349 return this->parse_atom(begin, end);
0350 case token_escape: // an escape must be followed by a character, which is skipped too
0351 BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence");
0352 BOOST_FALLTHROUGH;
0353 case token_literal:
0354 ++begin;
0355 break;
0356 default: // NOTE(review): presumably get_token has already advanced past these tokens - confirm
0357 break;
0358 }
0359 }
0360 break;
0361
0362 case token_recurse: // must close immediately; yields a by-reference matcher on the regex being compiled
0363 BOOST_XPR_ENSURE_
0364 (
0365 begin != end && token_group_end == this->traits_.get_token(begin, end)
0366 , error_paren
0367 , "mismatched parenthesis"
0368 );
0369 return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(this->self_));
0370
0371 case token_rule_assign: // compile_ handles an assignment at the front; anywhere else is an error
0372 BOOST_THROW_EXCEPTION(
0373 regex_error(error_badrule, "rule assignments must be at the front of the regex")
0374 );
0375 break;
0376
0377 case token_rule_ref: // reference to a named rule; must close immediately
0378 {
0379 typedef detail::core_access<BidiIter> access;
0380 BOOST_XPR_ENSURE_
0381 (
0382 begin != end && token_group_end == this->traits_.get_token(begin, end)
0383 , error_paren
0384 , "mismatched parenthesis"
0385 );
0386 basic_regex<BidiIter> &rex = this->rules_[name]; // std::map operator[]: inserts an entry if the name is new
0387 shared_ptr<detail::regex_impl<BidiIter> > impl = access::get_regex_impl(rex);
0388 this->self_->track_reference(*impl); // register the referenced impl with the regex being compiled
0389 return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(impl));
0390 }
0391
0392 case token_named_mark: // named capture: takes the next mark number; the name must be unused so far
0393 mark_nbr = static_cast<int>(++this->mark_count_);
0394 for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i)
0395 {
0396 BOOST_XPR_ENSURE_(this->self_->named_marks_[i].name_ != name, error_badmark, "named mark already exists");
0397 }
0398 this->self_->named_marks_.push_back(detail::named_mark<char_type>(name, this->mark_count_));
0399 seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr));
0400 seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr));
0401 break;
0402
0403 case token_named_mark_ref: // back-reference by name; must close immediately
0404 BOOST_XPR_ENSURE_
0405 (
0406 begin != end && token_group_end == this->traits_.get_token(begin, end)
0407 , error_paren
0408 , "mismatched parenthesis"
0409 );
0410 for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) // linear search of the marks recorded so far
0411 {
0412 if(this->self_->named_marks_[i].name_ == name)
0413 {
0414 mark_nbr = static_cast<int>(this->self_->named_marks_[i].mark_nbr_);
0415 return detail::make_backref_xpression<BidiIter>
0416 (
0417 mark_nbr, this->traits_.flags(), this->rxtraits()
0418 );
0419 }
0420 }
0421 BOOST_THROW_EXCEPTION(regex_error(error_badmark, "invalid named back-reference"));
0422 break;
0423
0424 default: // any other group type: a capturing group that takes the next mark number
0425 mark_nbr = static_cast<int>(++this->mark_count_);
0426 seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr));
0427 seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr));
0428 break;
0429 }
0430
0431 // Parse the group body, append the mark-end matcher (if any), and require the closing token.
0432 seq += this->parse_alternates(begin, end);
0433 seq += seq_end;
0434 BOOST_XPR_ENSURE_
0435 (
0436 begin != end && token_group_end == this->traits_.get_token(begin, end)
0437 , error_paren
0438 , "mismatched parenthesis"
0439 );
0440 // Lookahead, lookbehind and keeper groups: end the body with an independent-end expression, then wrap it.
0441 typedef detail::shared_matchable<BidiIter> xpr_type;
0442 if(lookahead)
0443 {
0444 seq += detail::make_independent_end_xpression<BidiIter>(seq.pure());
0445 detail::lookahead_matcher<xpr_type> lam(seq.xpr(), negative, seq.pure());
0446 seq = detail::make_dynamic<BidiIter>(lam);
0447 }
0448 else if(lookbehind)
0449 {
0450 seq += detail::make_independent_end_xpression<BidiIter>(seq.pure());
0451 detail::lookbehind_matcher<xpr_type> lbm(seq.xpr(), seq.width().value(), negative, seq.pure()); // also receives the body's width
0452 seq = detail::make_dynamic<BidiIter>(lbm);
0453 }
0454 else if(keeper)
0455 {
0456 seq += detail::make_independent_end_xpression<BidiIter>(seq.pure());
0457 detail::keeper_matcher<xpr_type> km(seq.xpr(), seq.pure());
0458 seq = detail::make_dynamic<BidiIter>(km);
0459 }
0460
0461 // Put back the flags saved on entry (the early returns above skip this).
0462 this->traits_.flags(old_flags);
0463 return seq;
0464 }
0465
0466
0467
0468
0469 template<typename FwdIter>
0470 detail::sequence<BidiIter> parse_charset(FwdIter &begin, FwdIter end)
0471 {
0472 detail::compound_charset<traits_type> chset;
0473
0474
0475 detail::parse_charset(begin, end, chset, this->traits_);
0476
0477 return detail::make_charset_xpression<BidiIter>
0478 (
0479 chset
0480 , this->rxtraits()
0481 , this->traits_.flags()
0482 );
0483 }
0484
0485 // parse_atom: reads one token at begin and returns the sequence for the atom it starts.
0486 // Returns an empty sequence at end of pattern, or for a token that does not start an atom
0487 // (in that case begin is rewound so the caller sees the token again). Throws regex_error on misuse.
0488 template<typename FwdIter>
0489 detail::sequence<BidiIter> parse_atom(FwdIter &begin, FwdIter end)
0490 {
0491 using namespace regex_constants;
0492 escape_value esc = { 0, 0, 0, detail::escape_char };
0493 FwdIter old_begin = begin; // kept so the default case can un-read the token
0494
0495 switch(this->traits_.get_token(begin, end))
0496 {
0497 case token_literal: // a run of literal characters, gathered by parse_literal
0498 return detail::make_literal_xpression<BidiIter>
0499 (
0500 this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits()
0501 );
0502
0503 case token_any:
0504 return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits());
0505
0506 case token_assert_begin_sequence:
0507 return detail::make_dynamic<BidiIter>(detail::assert_bos_matcher());
0508
0509 case token_assert_end_sequence:
0510 return detail::make_dynamic<BidiIter>(detail::assert_eos_matcher());
0511
0512 case token_assert_begin_line:
0513 return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits());
0514
0515 case token_assert_end_line:
0516 return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits());
0517
0518 case token_assert_word_boundary:
0519 return detail::make_assert_word<BidiIter>(detail::word_boundary<mpl::true_>(), this->rxtraits());
0520
0521 case token_assert_not_word_boundary:
0522 return detail::make_assert_word<BidiIter>(detail::word_boundary<mpl::false_>(), this->rxtraits());
0523
0524 case token_assert_word_begin:
0525 return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits());
0526
0527 case token_assert_word_end:
0528 return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits());
0529
0530 case token_escape: // parse_escape classifies the escape as a back-reference, a character or a class
0531 esc = this->parse_escape(begin, end);
0532 switch(esc.type_)
0533 {
0534 case detail::escape_mark:
0535 return detail::make_backref_xpression<BidiIter>
0536 (
0537 esc.mark_nbr_, this->traits_.flags(), this->rxtraits()
0538 );
0539 case detail::escape_char:
0540 return detail::make_char_xpression<BidiIter>
0541 (
0542 esc.ch_, this->traits_.flags(), this->rxtraits()
0543 );
0544 case detail::escape_class:
0545 return detail::make_posix_charset_xpression<BidiIter>
0546 (
0547 esc.class_
0548 , this->is_upper_(*begin++) // reads and consumes one character; NOTE(review): presumably an upper-case letter negates the class - confirm
0549 , this->traits_.flags()
0550 , this->rxtraits()
0551 );
0552 } // NOTE(review): no default and no break - any other esc.type_ would fall into token_group_begin; presumably the three kinds are exhaustive
0553
0554 case token_group_begin:
0555 return this->parse_group(begin, end);
0556
0557 case token_charset_begin:
0558 return this->parse_charset(begin, end);
0559
0560 case token_invalid_quantifier: // a quantifier where an atom was expected
0561 BOOST_THROW_EXCEPTION(regex_error(error_badrepeat, "quantifier not expected"));
0562 break;
0563
0564 case token_quote_meta_begin: // the quoted text is taken verbatim as a literal
0565 return detail::make_literal_xpression<BidiIter>
0566 (
0567 this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits()
0568 );
0569
0570 case token_quote_meta_end:
0571 BOOST_THROW_EXCEPTION(
0572 regex_error(
0573 error_escape
0574 , "found quote-meta end without corresponding quote-meta begin"
0575 )
0576 );
0577 break;
0578
0579 case token_end_of_pattern: // nothing left: fall out to the empty sequence
0580 break;
0581
0582 default: // not the start of an atom: rewind so the caller sees this token again
0583 begin = old_begin;
0584 break;
0585 }
0586
0587 return detail::sequence<BidiIter>();
0588 }
0589
0590
0591
0592
0593 template<typename FwdIter>
0594 detail::sequence<BidiIter> parse_quant(FwdIter &begin, FwdIter end)
0595 {
0596 BOOST_ASSERT(begin != end);
0597 detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ };
0598 detail::sequence<BidiIter> seq = this->parse_atom(begin, end);
0599
0600
0601 if(!seq.empty() && begin != end && detail::quant_none != seq.quant())
0602 {
0603 if(this->traits_.get_quant_spec(begin, end, spec))
0604 {
0605 BOOST_ASSERT(spec.min_ <= spec.max_);
0606
0607 if(0 == spec.max_)
0608 {
0609 seq = this->parse_quant(begin, end);
0610 }
0611 else
0612 {
0613 seq.repeat(spec);
0614 }
0615 }
0616 }
0617
0618 return seq;
0619 }
0620
0621
0622
0623
0624 template<typename FwdIter>
0625 detail::sequence<BidiIter> parse_sequence(FwdIter &begin, FwdIter end)
0626 {
0627 detail::sequence<BidiIter> seq;
0628
0629 while(begin != end)
0630 {
0631 detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end);
0632
0633
0634 if(seq_quant.empty())
0635 break;
0636
0637
0638 seq += seq_quant;
0639 }
0640
0641 return seq;
0642 }
0643
0644
0645 // parse_literal: gathers a run of literal characters (and character escapes) starting at begin
0646 // into a string. Stops before anything else; if a quantifier follows a run of two or more
0647 // characters, the last character is handed back so the quantifier applies to it alone.
0648 template<typename FwdIter>
0649 string_type parse_literal(FwdIter &begin, FwdIter end)
0650 {
0651 using namespace regex_constants;
0652 BOOST_ASSERT(begin != end);
0653 BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); // NOTE(review): get_token takes begin by reference; presumably it does not advance on a literal - confirm
0654 escape_value esc = { 0, 0, 0, detail::escape_char };
0655 string_type literal(1, *begin); // the first character is always part of the literal
0656 // prev = start of the last character added, begin = just past it, tmp = look-ahead cursor.
0657 for(FwdIter prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp)
0658 {
0659 detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ };
0660 if(this->traits_.get_quant_spec(tmp, end, spec)) // a quantifier follows; only tmp moved, begin still precedes it
0661 {
0662 if(literal.size() != 1) // more than one character gathered: give the last one back
0663 {
0664 begin = prev;
0665 literal.erase(boost::prior(literal.end()));
0666 }
0667 return literal;
0668 }
0669 else switch(this->traits_.get_token(tmp, end)) // NOTE(review): assumes a failed get_quant_spec left tmp unchanged - confirm
0670 {
0671 case token_escape:
0672 esc = this->parse_escape(tmp, end);
0673 if(detail::escape_char != esc.type_) return literal; // not a plain character: stop with begin still before the escape
0674 literal.insert(literal.end(), esc.ch_);
0675 break;
0676 case token_literal:
0677 literal.insert(literal.end(), *tmp++);
0678 break;
0679 default: // any other token ends the literal; begin is left in front of it
0680 return literal;
0681 }
0682 }
0683
0684 return literal;
0685 }
0686
0687
0688
0689
0690
0691 template<typename FwdIter>
0692 string_type parse_quote_meta(FwdIter &begin, FwdIter end)
0693 {
0694 using namespace regex_constants;
0695 FwdIter old_begin = begin, old_end;
0696 while(end != (old_end = begin))
0697 {
0698 switch(this->traits_.get_token(begin, end))
0699 {
0700 case token_quote_meta_end:
0701 return string_type(old_begin, old_end);
0702 case token_escape:
0703 BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence");
0704 BOOST_FALLTHROUGH;
0705 case token_invalid_quantifier:
0706 case token_literal:
0707 ++begin;
0708 break;
0709 default:
0710 break;
0711 }
0712 }
0713 return string_type(old_begin, begin);
0714 }
0715
0716
0717
0718
0719 template<typename FwdIter>
0720 escape_value parse_escape(FwdIter &begin, FwdIter end)
0721 {
0722 BOOST_XPR_ENSURE_(begin != end, regex_constants::error_escape, "incomplete escape sequence");
0723
0724
0725 if(0 < this->rxtraits().value(*begin, 10))
0726 {
0727
0728 FwdIter tmp = begin;
0729 int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999);
0730
0731
0732 if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_))
0733 {
0734 begin = tmp;
0735 escape_value esc = {0, mark_nbr, 0, detail::escape_mark};
0736 return esc;
0737 }
0738 }
0739
0740
0741 return detail::parse_escape(begin, end, this->traits_);
0742 }
0743
0744 bool is_upper_(char_type ch) const
0745 {
0746 return 0 != this->upper_ && this->rxtraits().isctype(ch, this->upper_);
0747 }
0748
0749 std::size_t mark_count_;
0750 std::size_t hidden_mark_count_;
0751 CompilerTraits traits_;
0752 typename RegexTraits::char_class_type upper_;
0753 shared_ptr<detail::regex_impl<BidiIter> > self_;
0754 std::map<string_type, basic_regex<BidiIter> > rules_;
0755 };
0756
0757 }}
0758
0759 #endif