/*=============================================================================
    Boost.Wave: A Standard compliant C++ preprocessor library

    Re2C based C++ lexer

    http://www.boost.org/

    Distributed under the Boost Software License, Version 1.0. (See
    accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
=============================================================================*/
#if !defined(BOOST_CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)
#define BOOST_CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED

#include <string>
#include <cstdio>
#include <cstdarg>
#if defined(BOOST_SPIRIT_DEBUG)
#include <iostream>
#endif

#include <boost/concept_check.hpp>
#include <boost/assert.hpp>

#include <boost/wave/wave_config.hpp>
#include <boost/wave/language_support.hpp>
#include <boost/wave/token_ids.hpp>
#include <boost/wave/util/file_position.hpp>
#include <boost/wave/cpplexer/validate_universal_char.hpp>
#include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
#include <boost/wave/cpplexer/token_cache.hpp>
#include <boost/wave/cpplexer/convert_trigraphs.hpp>

#include <boost/wave/cpplexer/cpp_lex_interface.hpp>
#include <boost/wave/cpplexer/re2clex/scanner.hpp>
#include <boost/wave/cpplexer/re2clex/cpp_re.hpp>
#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
#include <boost/wave/cpplexer/detect_include_guards.hpp>
#endif

#include <boost/wave/cpplexer/cpp_lex_interface_generator.hpp>


// this must occur after all of the includes and before any code appears
#ifdef BOOST_HAS_ABI_HEADERS
#include BOOST_ABI_PREFIX
#endif

///////////////////////////////////////////////////////////////////////////////
namespace boost {
namespace wave {
namespace cpplexer {
namespace re2clex {

///////////////////////////////////////////////////////////////////////////////
//
//  encapsulation of the re2c based C++ lexer
//
///////////////////////////////////////////////////////////////////////////////
template <typename IteratorT,
    typename PositionT = boost::wave::util::file_position_type,
    typename TokenT = lex_token<PositionT> >
class lexer
{
public:
    typedef TokenT token_type;
    typedef typename token_type::string_type string_type;

    lexer(IteratorT const &first, IteratorT const &last,
        PositionT const &pos, boost::wave::language_support language_);
    ~lexer();

    token_type& get(token_type&);
    void set_position(PositionT const &pos)
    {
        // set_position has to change the file name and line number only
        filename = pos.get_file();
        scanner.line = pos.get_line();
        scanner.file_name = filename.c_str();
    }
#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
    bool has_include_guards(std::string& guard_name) const
    {
        return guards.detected(guard_name);
    }
#endif

    // error reporting from the re2c generated lexer
    static int report_error(Scanner<IteratorT> const* s, int code, char const *, ...);

private:
    static char const *tok_names[];

    Scanner<IteratorT> scanner;
    string_type filename;
    string_type value;
    bool at_eof;
    boost::wave::language_support language;
#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
    include_guards<token_type> guards;
#endif

#if BOOST_WAVE_SUPPORT_THREADING == 0
    static token_cache<string_type> const cache;
#else
    token_cache<string_type> const cache;
#endif
};
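
///////////////////////////////////////////////////////////////////////////////
//  Usage note (illustrative sketch, not part of the original header): the
//  lexer<> template can be driven directly by constructing it over an
//  iterator range and pulling tokens until end of input. The read_file()
//  helper and the "input.cpp" name below are hypothetical; lex_token<> comes
//  from <boost/wave/cpplexer/cpp_lex_token.hpp>, and the input buffer should
//  ideally end in a newline.
//
//      using position_type = boost::wave::util::file_position_type;
//      std::string src = read_file("input.cpp");   // hypothetical helper
//      lexer<char const*, position_type> lex(
//          src.c_str(), src.c_str() + src.size(),
//          position_type("input.cpp"), boost::wave::support_cpp);
//
//      lex_token<position_type> tok;
//      for (;;) {
//          boost::wave::token_id id = boost::wave::token_id(lex.get(tok));
//          if (T_EOF == id || T_EOI == id)
//              break;
//          // ... process tok.get_value() here ...
//      }
///////////////////////////////////////////////////////////////////////////////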

///////////////////////////////////////////////////////////////////////////////
//  initialize the cpp lexer
template <typename IteratorT, typename PositionT, typename TokenT>
inline
lexer<IteratorT, PositionT, TokenT>::lexer(IteratorT const &first,
        IteratorT const &last, PositionT const &pos,
        boost::wave::language_support language_)
  : scanner(first, last),
    filename(pos.get_file()), at_eof(false), language(language_)
#if BOOST_WAVE_SUPPORT_THREADING != 0
  , cache()
#endif
{
    using namespace std;    // some systems put the C library names in std only
    scanner.line = pos.get_line();
    scanner.column = scanner.curr_column = pos.get_column();
    scanner.error_proc = report_error;
    scanner.file_name = filename.c_str();

#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
    scanner.enable_ms_extensions = true;
#else
    scanner.enable_ms_extensions = false;
#endif

#if BOOST_WAVE_SUPPORT_VARIADICS_PLACEMARKERS != 0
    scanner.act_in_c99_mode = boost::wave::need_c99(language_);
#endif

#if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
    scanner.enable_import_keyword = !boost::wave::need_c99(language_);
#else
    scanner.enable_import_keyword = false;
#endif

    scanner.detect_pp_numbers = boost::wave::need_prefer_pp_numbers(language_);
    scanner.single_line_only = boost::wave::need_single_line(language_);

#if BOOST_WAVE_SUPPORT_CPP0X != 0
    scanner.act_in_cpp0x_mode = boost::wave::need_cpp0x(language_);
#else
    scanner.act_in_cpp0x_mode = false;
#endif

#if BOOST_WAVE_SUPPORT_CPP2A != 0
    scanner.act_in_cpp2a_mode = boost::wave::need_cpp2a(language_);
    scanner.act_in_cpp0x_mode = boost::wave::need_cpp2a(language_)
        || boost::wave::need_cpp0x(language_);
#else
    scanner.act_in_cpp2a_mode = false;
#endif
}

template <typename IteratorT, typename PositionT, typename TokenT>
inline
lexer<IteratorT, PositionT, TokenT>::~lexer()
{
    using namespace std;    // some systems have free in namespace std
    free(scanner.bot);
}

///////////////////////////////////////////////////////////////////////////////
//  get the next token from the input stream
template <typename IteratorT, typename PositionT, typename TokenT>
inline TokenT&
lexer<IteratorT, PositionT, TokenT>::get(TokenT& result)
{
    if (at_eof)
        return result = token_type();   // return a default (end-of-input) token

    std::size_t actline = scanner.line;
    token_id id = token_id(scan(&scanner));

    switch (id) {
    case T_IDENTIFIER:
        // test identifier characters for validity (throws if invalid chars found)
        value = string_type((char const *)scanner.tok,
            scanner.cur-scanner.tok);
        if (!boost::wave::need_no_character_validation(language))
            impl::validate_identifier_name(value, actline, scanner.column, filename);
        break;

    case T_STRINGLIT:
    case T_CHARLIT:
    case T_RAWSTRINGLIT:
        // test literal characters for validity (throws if invalid chars found)
        value = string_type((char const *)scanner.tok,
            scanner.cur-scanner.tok);
        if (boost::wave::need_convert_trigraphs(language))
            value = impl::convert_trigraphs(value);
        if (!boost::wave::need_no_character_validation(language))
            impl::validate_literal(value, actline, scanner.column, filename);
        break;

    case T_PP_HHEADER:
    case T_PP_QHEADER:
    case T_PP_INCLUDE:
        // convert to the corresponding ..._next token, if appropriate
        {
            value = string_type((char const *)scanner.tok,
                scanner.cur-scanner.tok);

#if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
            // skip '#' and whitespace and check whether this is an
            // 'include_next' directive
            typename string_type::size_type start = value.find("include");
            if (value.compare(start, 12, "include_next", 12) == 0)
                id = token_id(id | AltTokenType);
#endif
            break;
        }

    case T_LONGINTLIT:
        value = string_type((char const *)scanner.tok,
            scanner.cur-scanner.tok);
        if (!boost::wave::need_long_long(language)) {
            // 'long long' literals are not allowed without long_long support
            BOOST_WAVE_LEXER_THROW(lexing_exception, invalid_long_long_literal,
                value.c_str(), actline, scanner.column, filename.c_str());
        }
        break;

    case T_OCTALINT:
    case T_DECIMALINT:
    case T_HEXAINT:
    case T_INTLIT:
    case T_FLOATLIT:
    case T_FIXEDPOINTLIT:
    case T_CCOMMENT:
    case T_CPPCOMMENT:
    case T_SPACE:
    case T_SPACE2:
    case T_ANY:
    case T_PP_NUMBER:
        value = string_type((char const *)scanner.tok,
            scanner.cur-scanner.tok);
        break;

    case T_EOF:
        // T_EOF is returned as a valid token; the next call to get() will
        // return the actual end-of-input token
        at_eof = true;
        value.clear();
        break;

    case T_OR_TRIGRAPH:
    case T_XOR_TRIGRAPH:
    case T_LEFTBRACE_TRIGRAPH:
    case T_RIGHTBRACE_TRIGRAPH:
    case T_LEFTBRACKET_TRIGRAPH:
    case T_RIGHTBRACKET_TRIGRAPH:
    case T_COMPL_TRIGRAPH:
    case T_POUND_TRIGRAPH:
        if (boost::wave::need_convert_trigraphs(language)) {
            value = cache.get_token_value(BASEID_FROM_TOKEN(id));
        }
        else {
            value = string_type((char const *)scanner.tok,
                scanner.cur-scanner.tok);
        }
        break;

    case T_ANY_TRIGRAPH:
        if (boost::wave::need_convert_trigraphs(language)) {
            value = impl::convert_trigraph(
                string_type((char const *)scanner.tok,
                    scanner.cur-scanner.tok));
        }
        else {
            value = string_type((char const *)scanner.tok,
                scanner.cur-scanner.tok);
        }
        break;

    default:
        if (CATEGORY_FROM_TOKEN(id) != EXTCATEGORY_FROM_TOKEN(id) ||
            IS_CATEGORY(id, UnknownTokenType))
        {
            value = string_type((char const *)scanner.tok,
                scanner.cur-scanner.tok);
        }
        else {
            value = cache.get_token_value(id);
        }
        break;
    }

    // 'actline' was captured before the scan, because the re2c lexer already
    // reports the new line number for newline tokens
    result = token_type(id, value, PositionT(filename, actline, scanner.column));

#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
    return guards.detect_guard(result);
#else
    return result;
#endif
}

template <typename IteratorT, typename PositionT, typename TokenT>
inline int
lexer<IteratorT, PositionT, TokenT>::report_error(Scanner<IteratorT> const *s, int errcode,
    char const *msg, ...)
{
    BOOST_ASSERT(0 != s);
    BOOST_ASSERT(0 != msg);

    using namespace std;    // some systems put vsnprintf in namespace std only

    constexpr std::size_t bufsize = 200;
    char buffer[bufsize];
    va_list params;
    va_start(params, msg);
    vsnprintf(buffer, bufsize, msg, params);
    va_end(params);

    BOOST_WAVE_LEXER_THROW_VAR(lexing_exception, errcode, buffer, s->line,
        s->column, s->file_name);

    return 0;
}

///////////////////////////////////////////////////////////////////////////////
//
//  lex_functor
//
///////////////////////////////////////////////////////////////////////////////

template <typename IteratorT,
    typename PositionT = boost::wave::util::file_position_type,
    typename TokenT = typename lexer<IteratorT, PositionT>::token_type>
class lex_functor
  : public lex_input_interface_generator<TokenT>
{
public:
    typedef TokenT token_type;

    lex_functor(IteratorT const &first, IteratorT const &last,
            PositionT const &pos, boost::wave::language_support language)
      : re2c_lexer(first, last, pos, language)
    {}
    virtual ~lex_functor() {}

    // get the next token from the input stream
    token_type& get(token_type& result) BOOST_OVERRIDE { return re2c_lexer.get(result); }
    void set_position(PositionT const &pos) BOOST_OVERRIDE { re2c_lexer.set_position(pos); }
#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
    bool has_include_guards(std::string& guard_name) const BOOST_OVERRIDE
        { return re2c_lexer.has_include_guards(guard_name); }
#endif

private:
    lexer<IteratorT, PositionT, TokenT> re2c_lexer;
};
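
///////////////////////////////////////////////////////////////////////////////
//  Note (added sketch, not part of the original header): lex_functor adapts
//  the concrete lexer<> above to the abstract lex_input_interface<TokenT>, so
//  callers can hold the lexer behind an interface pointer and dispatch
//  virtually. The names first/last/pos below are placeholders for an iterator
//  range and a start position, as in the constructors above.
//
//      lex_input_interface<token_type>* f =
//          new lex_functor<char const*>(first, last, pos,
//              boost::wave::support_cpp);
//      token_type tok;
//      f->get(tok);        // virtual dispatch into lexer<>::get()
//      delete f;           // lex_input_interface has a virtual destructor
///////////////////////////////////////////////////////////////////////////////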

#if BOOST_WAVE_SUPPORT_THREADING == 0
// the static token cache is shared by all lexer instances (single threaded builds only)
template <typename IteratorT, typename PositionT, typename TokenT>
token_cache<typename lexer<IteratorT, PositionT, TokenT>::string_type> const
    lexer<IteratorT, PositionT, TokenT>::cache =
        token_cache<typename lexer<IteratorT, PositionT, TokenT>::string_type>();
#endif

}   // namespace re2clex

///////////////////////////////////////////////////////////////////////////////
//
//  If the lex_functor is instantiated separately from the lex_iterator
//  (i.e. BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION is set), the
//  new_lexer_gen<>::new_lexer function below must not be defined inline;
//  otherwise it is. Separate (explicit) instantiation helps to reduce
//  compilation time.
//
///////////////////////////////////////////////////////////////////////////////

#if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION != 0
#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE
#else
#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE inline
#endif

///////////////////////////////////////////////////////////////////////////////
//
//  The 'new_lexer' function allows the opaque generation of a new lexer
//  object: it creates a re2clex::lex_functor and returns it through the
//  abstract lex_input_interface<TokenT>*, so callers never need to see the
//  lexer implementation defined in this header.
//
///////////////////////////////////////////////////////////////////////////////
template <typename IteratorT, typename PositionT, typename TokenT>
BOOST_WAVE_RE2C_NEW_LEXER_INLINE
lex_input_interface<TokenT> *
new_lexer_gen<IteratorT, PositionT, TokenT>::new_lexer(IteratorT const &first,
    IteratorT const &last, PositionT const &pos,
    boost::wave::language_support language)
{
    using re2clex::lex_functor;
    return new lex_functor<IteratorT, PositionT, TokenT>(first, last, pos, language);
}
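
///////////////////////////////////////////////////////////////////////////////
//  Note (added, not part of the original header): client code normally does
//  not call new_lexer() directly; Wave's lex_iterator machinery ends up
//  calling this factory to obtain a lex_input_interface<TokenT>* for the
//  underlying input range. A direct call would look roughly like the sketch
//  below, where first/last/pos are placeholders as before:
//
//      lex_input_interface<token_type>* p =
//          new_lexer_gen<char const*, position_type, token_type>::new_lexer(
//              first, last, pos, boost::wave::support_cpp);
///////////////////////////////////////////////////////////////////////////////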

#undef BOOST_WAVE_RE2C_NEW_LEXER_INLINE

///////////////////////////////////////////////////////////////////////////////
}   // namespace cpplexer
}   // namespace wave
}   // namespace boost

// the suffix header occurs after all of the code
#ifdef BOOST_HAS_ABI_HEADERS
#include BOOST_ABI_SUFFIX
#endif

#endif // !defined(BOOST_CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)