Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-19 09:47:49

0001 // parser.hpp
0002 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
0003 //
0004 // Distributed under the Boost Software License, Version 1.0. (See accompanying
0005 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0006 #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP
0007 #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP
0008 
0009 #include <boost/assert.hpp>
0010 #include "tree/end_node.hpp"
0011 #include "tree/iteration_node.hpp"
0012 #include "tree/leaf_node.hpp"
0013 #include "../runtime_error.hpp"
0014 #include "tree/selection_node.hpp"
0015 #include "tree/sequence_node.hpp"
0016 #include "../size_t.hpp"
0017 #include "tokeniser/re_tokeniser.hpp"
0018 #include <sstream>
0019 
0020 namespace boost
0021 {
0022 namespace lexer
0023 {
0024 namespace detail
0025 {
0026 template<typename CharT>
0027 class basic_parser
0028 {
0029 public:
0030     typedef basic_re_tokeniser<CharT> tokeniser;
0031     typedef typename tokeniser::string string;
0032     typedef std::map<string, const node *> macro_map;
0033     typedef node::node_ptr_vector node_ptr_vector;
0034     typedef typename tokeniser::num_token token;
0035 
0036 /*
0037     General principles of regex parsing:
0038     - Every regex is a sequence of sub-regexes.
0039     - Regexes consist of operands and operators
0040     - All operators decompose to sequence, selection ('|') and iteration ('*')
0041     - Regex tokens are stored on the stack.
0042     - When a complete sequence of regex tokens is on the stack it is processed.
0043 
0044 Grammar:
0045 
0046 <REGEX>      -> <OREXP>
0047 <OREXP>      -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
0048 <SEQUENCE>   -> <SUB>
0049 <SUB>        -> <EXPRESSION> | <SUB><EXPRESSION>
0050 <EXPRESSION> -> <REPEAT>
0051 <REPEAT>     -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
0052 <DUPLICATE>  -> '?' | '*' | '+' | '{n[,[m]]}'
0053 */
0054     static node *parse (const CharT *start_, const CharT * const end_,
0055         const std::size_t id_, const std::size_t unique_id_,
0056         const std::size_t dfa_state_, const regex_flags flags_,
0057         const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
0058         const macro_map &macromap_, typename tokeniser::token_map &map_,
0059         bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
0060     {
0061         node *root_ = 0;
0062         state state_ (start_, end_, flags_, locale_);
0063         token lhs_token_;
0064         token rhs_token_;
0065         token_stack token_stack_;
0066         tree_node_stack tree_node_stack_;
0067         char action_ = 0;
0068 
0069         token_stack_.push (rhs_token_);
0070         tokeniser::next (state_, map_, rhs_token_);
0071 
0072         do
0073         {
0074             lhs_token_ = token_stack_.top ();
0075             action_ = lhs_token_.precedence (rhs_token_._type);
0076 
0077             switch (action_)
0078             {
0079             case '<':
0080             case '=':
0081                 token_stack_.push (rhs_token_);
0082                 tokeniser::next (state_, map_, rhs_token_);
0083                 break;
0084             case '>':
0085                 reduce (token_stack_, macromap_, node_ptr_vector_,
0086                     tree_node_stack_);
0087                 break;
0088             default:
0089                 std::ostringstream ss_;
0090 
0091                 ss_ << "A syntax error occurred: '" <<
0092                     lhs_token_.precedence_string () <<
0093                     "' against '" << rhs_token_.precedence_string () <<
0094                     "' at index " << state_.index () << ".";
0095                 throw runtime_error (ss_.str ().c_str ());
0096                 break;
0097             }
0098         } while (!token_stack_.empty ());
0099 
0100         if (tree_node_stack_.empty ())
0101         {
0102             throw runtime_error ("Empty rules are not allowed.");
0103         }
0104 
0105         BOOST_ASSERT(tree_node_stack_.size () == 1);
0106 
0107         node *lhs_node_ = tree_node_stack_.top ();
0108 
0109         tree_node_stack_.pop ();
0110 
0111         if (id_ == 0)
0112         {
0113             // Macros have no end state...
0114             root_ = lhs_node_;
0115         }
0116         else
0117         {
0118             node_ptr_vector_->push_back (static_cast<end_node *>(0));
0119 
0120             node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
0121 
0122             node_ptr_vector_->back () = rhs_node_;
0123             node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
0124             node_ptr_vector_->back () = new sequence_node
0125                 (lhs_node_, rhs_node_);
0126             root_ = node_ptr_vector_->back ();
0127         }
0128 
0129         // Done this way as bug in VC++ 6 prevents |= operator working
0130         // properly!
0131         if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
0132 
0133         if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
0134 
0135         return root_;
0136     }
0137 
0138 private:
0139     typedef typename tokeniser::state state;
0140     typedef std::stack<token> token_stack;
0141     typedef node::node_stack tree_node_stack;
0142 
0143     static void reduce (token_stack &token_stack_,
0144         const macro_map &macromap_, node_ptr_vector &node_vector_ptr_,
0145         tree_node_stack &tree_node_stack_)
0146     {
0147         typename tokeniser::num_token lhs_;
0148         typename tokeniser::num_token rhs_;
0149         token_stack handle_;
0150         char action_ = 0;
0151 
0152         do
0153         {
0154             rhs_ = token_stack_.top ();
0155             token_stack_.pop ();
0156             handle_.push (rhs_);
0157 
0158             if (!token_stack_.empty ())
0159             {
0160                 lhs_ = token_stack_.top ();
0161                 action_ = lhs_.precedence (rhs_._type);
0162             }
0163         } while (!token_stack_.empty () && action_ == '=');
0164 
0165         BOOST_ASSERT(token_stack_.empty () || action_ == '<');
0166 
0167         switch (rhs_._type)
0168         {
0169         case token::BEGIN:
0170             // finished processing so exit
0171             break;
0172         case token::REGEX:
0173             // finished parsing, nothing to do
0174             break;
0175         case token::OREXP:
0176             orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
0177             break;
0178         case token::SEQUENCE:
0179             token_stack_.push (token::OREXP);
0180             break;
0181         case token::SUB:
0182             sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
0183             break;
0184         case token::EXPRESSION:
0185             token_stack_.push (token::SUB);
0186             break;
0187         case token::REPEAT:
0188             repeat (handle_, token_stack_);
0189             break;
0190         case token::CHARSET:
0191             charset (handle_, token_stack_, node_vector_ptr_,
0192                 tree_node_stack_);
0193             break;
0194         case token::MACRO:
0195             macro (handle_, token_stack_, macromap_, node_vector_ptr_,
0196                 tree_node_stack_);
0197             break;
0198         case token::OPENPAREN:
0199             openparen (handle_, token_stack_);
0200             break;
0201         case token::OPT:
0202         case token::AOPT:
0203             optional (rhs_._type == token::OPT, node_vector_ptr_,
0204                 tree_node_stack_);
0205             token_stack_.push (token::DUP);
0206             break;
0207         case token::ZEROORMORE:
0208         case token::AZEROORMORE:
0209             zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
0210                 tree_node_stack_);
0211             token_stack_.push (token::DUP);
0212             break;
0213         case token::ONEORMORE:
0214         case token::AONEORMORE:
0215             one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
0216                 tree_node_stack_);
0217             token_stack_.push (token::DUP);
0218             break;
0219         case token::REPEATN:
0220         case token::AREPEATN:
0221             repeatn (rhs_._type == token::REPEATN, handle_.top (),
0222                 node_vector_ptr_, tree_node_stack_);
0223             token_stack_.push (token::DUP);
0224             break;
0225         default:
0226             throw runtime_error
0227                 ("Internal error regex_parser::reduce");
0228             break;
0229         }
0230     }
0231 
0232     static void orexp (token_stack &handle_, token_stack &token_stack_,
0233         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0234     {
0235         BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
0236             (handle_.size () == 1 || handle_.size () == 3));
0237 
0238         if (handle_.size () == 1)
0239         {
0240             token_stack_.push (token::REGEX);
0241         }
0242         else
0243         {
0244             handle_.pop ();
0245             BOOST_ASSERT(handle_.top ()._type == token::OR);
0246             handle_.pop ();
0247             BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
0248             perform_or (node_ptr_vector_, tree_node_stack_);
0249             token_stack_.push (token::OREXP);
0250         }
0251     }
0252 
0253     static void sub (token_stack &handle_, token_stack &token_stack_,
0254         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0255     {
0256         BOOST_ASSERT(handle_.top ()._type == token::SUB &&
0257             (handle_.size () == 1 || handle_.size () == 2));
0258 
0259         if (handle_.size () == 1)
0260         {
0261             token_stack_.push (token::SEQUENCE);
0262         }
0263         else
0264         {
0265             handle_.pop ();
0266             BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
0267             // perform join
0268             sequence (node_ptr_vector_, tree_node_stack_);
0269             token_stack_.push (token::SUB);
0270         }
0271     }
0272 
0273     static void repeat (token_stack &handle_, token_stack &token_stack_)
0274     {
0275         BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
0276             handle_.size () >= 1 && handle_.size () <= 3);
0277 
0278         if (handle_.size () == 1)
0279         {
0280             token_stack_.push (token::EXPRESSION);
0281         }
0282         else
0283         {
0284             handle_.pop ();
0285             BOOST_ASSERT(handle_.top ()._type == token::DUP);
0286             token_stack_.push (token::REPEAT);
0287         }
0288     }
0289 
0290     static void charset (token_stack &handle_, token_stack &token_stack_,
0291         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0292     {
0293         BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
0294             handle_.size () == 1);
0295         // store charset
0296         node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
0297 
0298         const size_t id_ = handle_.top ()._id;
0299 
0300         node_ptr_vector_->back () = new leaf_node (id_, true);
0301         tree_node_stack_.push (node_ptr_vector_->back ());
0302         token_stack_.push (token::REPEAT);
0303     }
0304 
0305     static void macro (token_stack &handle_, token_stack &token_stack_,
0306         const macro_map &macromap_, node_ptr_vector &node_ptr_vector_,
0307         tree_node_stack &tree_node_stack_)
0308     {
0309         token &top_ = handle_.top ();
0310 
0311         BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
0312 
0313         typename macro_map::const_iterator iter_ =
0314             macromap_.find (top_._macro);
0315 
0316         if (iter_ == macromap_.end ())
0317         {
0318             const CharT *name_ = top_._macro;
0319             std::basic_stringstream<CharT> ss_;
0320             std::ostringstream os_;
0321 
0322             os_ << "Unknown MACRO name '";
0323 
0324             while (*name_)
0325             {
0326                 os_ << ss_.narrow (*name_++, ' ');
0327             }
0328 
0329             os_ << "'.";
0330             throw runtime_error (os_.str ());
0331         }
0332 
0333         tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
0334         token_stack_.push (token::REPEAT);
0335     }
0336 
0337     static void openparen (token_stack &handle_, token_stack &token_stack_)
0338     {
0339         BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
0340             handle_.size () == 3);
0341         handle_.pop ();
0342         BOOST_ASSERT(handle_.top ()._type == token::REGEX);
0343         handle_.pop ();
0344         BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
0345         token_stack_.push (token::REPEAT);
0346     }
0347 
0348     static void perform_or (node_ptr_vector &node_ptr_vector_,
0349         tree_node_stack &tree_node_stack_)
0350     {
0351         // perform or
0352         node *rhs_ = tree_node_stack_.top ();
0353 
0354         tree_node_stack_.pop ();
0355 
0356         node *lhs_ = tree_node_stack_.top ();
0357 
0358         node_ptr_vector_->push_back (static_cast<selection_node *>(0));
0359         node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
0360         tree_node_stack_.top () = node_ptr_vector_->back ();
0361     }
0362 
0363     static void sequence (node_ptr_vector &node_ptr_vector_,
0364         tree_node_stack &tree_node_stack_)
0365     {
0366         node *rhs_ = tree_node_stack_.top ();
0367 
0368         tree_node_stack_.pop ();
0369 
0370         node *lhs_ = tree_node_stack_.top ();
0371 
0372         node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
0373         node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
0374         tree_node_stack_.top () = node_ptr_vector_->back ();
0375     }
0376 
0377     static void optional (const bool greedy_,
0378         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0379     {
0380         // perform ?
0381         node *lhs_ = tree_node_stack_.top ();
0382         // You don't know if lhs_ is a leaf_node, so get firstpos.
0383         node::node_vector &firstpos_ = lhs_->firstpos ();
0384 
0385         for (node::node_vector::iterator iter_ = firstpos_.begin (),
0386             end_ = firstpos_.end (); iter_ != end_; ++iter_)
0387         {
0388             // These are leaf_nodes!
0389             (*iter_)->greedy (greedy_);
0390         }
0391 
0392         node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
0393 
0394         node *rhs_ = new leaf_node (null_token, greedy_);
0395 
0396         node_ptr_vector_->back () = rhs_;
0397         node_ptr_vector_->push_back (static_cast<selection_node *>(0));
0398         node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
0399         tree_node_stack_.top () = node_ptr_vector_->back ();
0400     }
0401 
0402     static void zero_or_more (const bool greedy_,
0403         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0404     {
0405         // perform *
0406         node *ptr_ = tree_node_stack_.top ();
0407 
0408         node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
0409         node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
0410         tree_node_stack_.top () = node_ptr_vector_->back ();
0411     }
0412 
0413     static void one_or_more (const bool greedy_,
0414         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0415     {
0416         // perform +
0417         node *lhs_ = tree_node_stack_.top ();
0418         node *copy_ = lhs_->copy (node_ptr_vector_);
0419 
0420         node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
0421 
0422         node *rhs_ = new iteration_node (copy_, greedy_);
0423 
0424         node_ptr_vector_->back () = rhs_;
0425         node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
0426         node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
0427         tree_node_stack_.top () = node_ptr_vector_->back ();
0428     }
0429 
0430     // This is one of the most mind bending routines in this code...
0431     static void repeatn (const bool greedy_, const token &token_,
0432         node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
0433     {
0434         // perform {n[,[m]]}
0435         // Semantic checks have already been performed.
0436         // {0,}  = *
0437         // {0,1} = ?
0438         // {1,}  = +
0439         // therefore we do not check for these cases.
0440         if (!(token_._min == 1 && !token_._comma))
0441         {
0442             const std::size_t top_ = token_._min > 0 ?
0443                 token_._min : token_._max;
0444 
0445             if (token_._min == 0)
0446             {
0447                 optional (greedy_, node_ptr_vector_, tree_node_stack_);
0448             }
0449 
0450             node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
0451             node *curr_ = 0;
0452 
0453             for (std::size_t i_ = 2; i_ < top_; ++i_)
0454             {
0455                 curr_ = prev_->copy (node_ptr_vector_);
0456                 tree_node_stack_.push (static_cast<node *>(0));
0457                 tree_node_stack_.top () = prev_;
0458                 sequence (node_ptr_vector_, tree_node_stack_);
0459                 prev_ = curr_;
0460             }
0461 
0462             if (token_._comma && token_._min > 0)
0463             {
0464                 if (token_._min > 1)
0465                 {
0466                     curr_ = prev_->copy (node_ptr_vector_);
0467                     tree_node_stack_.push (static_cast<node *>(0));
0468                     tree_node_stack_.top () = prev_;
0469                     sequence (node_ptr_vector_, tree_node_stack_);
0470                     prev_ = curr_;
0471                 }
0472 
0473                 if (token_._comma && token_._max)
0474                 {
0475                     tree_node_stack_.push (static_cast<node *>(0));
0476                     tree_node_stack_.top () = prev_;
0477                     optional (greedy_, node_ptr_vector_, tree_node_stack_);
0478                     prev_ = tree_node_stack_.top ();
0479                     tree_node_stack_.pop ();
0480 
0481                     const std::size_t count_ = token_._max - token_._min;
0482 
0483                     for (std::size_t i_ = 1; i_ < count_; ++i_)
0484                     {
0485                         curr_ = prev_->copy (node_ptr_vector_);
0486                         tree_node_stack_.push (static_cast<node *>(0));
0487                         tree_node_stack_.top () = prev_;
0488                         sequence (node_ptr_vector_, tree_node_stack_);
0489                         prev_ = curr_;
0490                     }
0491                 }
0492                 else
0493                 {
0494                     tree_node_stack_.push (static_cast<node *>(0));
0495                     tree_node_stack_.top () = prev_;
0496                     zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
0497                     prev_ = tree_node_stack_.top ();
0498                     tree_node_stack_.pop ();
0499                 }
0500             }
0501 
0502             tree_node_stack_.push (static_cast<node *>(0));
0503             tree_node_stack_.top () = prev_;
0504             sequence (node_ptr_vector_, tree_node_stack_);
0505         }
0506     }
0507 };
0508 }
0509 }
0510 }
0511 
0512 #endif