Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:51:26

0001 /*
0002  *
0003  * Copyright (c) 1998-2002
0004  * John Maddock
0005  *
0006  * Use, modification and distribution are subject to the 
0007  * Boost Software License, Version 1.0. (See accompanying file 
0008  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0009  *
0010  */
0011 
0012  /*
0013   *   LOCATION:    see http://www.boost.org for most recent version.
0014   *   FILE         states.hpp
0015   *   VERSION      see <boost/version.hpp>
0016   *   DESCRIPTION: Declares internal state machine structures.
0017   */
0018 
0019 #ifndef BOOST_REGEX_V4_STATES_HPP
0020 #define BOOST_REGEX_V4_STATES_HPP
0021 
0022 #ifdef BOOST_MSVC
0023 #pragma warning(push)
0024 #pragma warning(disable: 4103)
0025 #endif
0026 #ifdef BOOST_HAS_ABI_HEADERS
0027 #  include BOOST_ABI_PREFIX
0028 #endif
0029 #ifdef BOOST_MSVC
0030 #pragma warning(pop)
0031 #endif
0032 
0033 namespace boost{
0034 namespace BOOST_REGEX_DETAIL_NS{
0035 
/*** mask_type *******************************************************
Wherever the machine has to pick between two alternatives, a byte array
records, for each possible input character, which of the two may be
followed.  A set mask_take bit means the next state can be taken; a set
mask_skip bit means the alternative can be taken.
***********************************************************************/
enum mask_type
{
   mask_take = 1 << 0,                  // the next state is viable
   mask_skip = 1 << 1,                  // the alternative is viable
   mask_init = 1 << 2,
   mask_any  = mask_take | mask_skip,   // either route is viable
   mask_all  = mask_any                 // synonym for mask_any
};
0050 
/*** helpers **********************************************************
Tag types plus a small trait that let overload resolution distinguish
narrow character strings from wide ones: is_byte<charT>::width_type is
_narrow_type for char, unsigned char and signed char, and _wide_type
for every other character type.
***********************************************************************/
struct _narrow_type{};
struct _wide_type{};

// General case: anything that is not one of the three char flavours is wide.
template <class charT>
struct is_byte
{
   typedef _wide_type width_type;
};

// The three single-byte character types are narrow.
template <>
struct is_byte<char>
{
   typedef _narrow_type width_type;
};

template <>
struct is_byte<unsigned char>
{
   typedef _narrow_type width_type;
};

template <>
struct is_byte<signed char>
{
   typedef _narrow_type width_type;
};
0062 
/*** enum syntax_element_type ******************************************
Tags every record in the state machine with the kind of state it is.
The enumerators are numbered consecutively from zero in the order
listed, so new kinds should only ever be appended.
***********************************************************************/
enum syntax_element_type
{
   syntax_element_startmark = 0,        //  0: start of a marked sub-expression, or perl-style (?...) extension
   syntax_element_endmark,              //  1: end of a marked sub-expression, or perl-style (?...) extension
   syntax_element_literal,              //  2: any sequence of literal characters
   syntax_element_start_line,           //  3: start of line assertion: ^
   syntax_element_end_line,             //  4: end of line assertion: $
   syntax_element_wild,                 //  5: match any character: .
   syntax_element_match,                //  6: end of expression: reaching this state means we have a match
   syntax_element_word_boundary,        //  7: perl style word boundary: \b
   syntax_element_within_word,          //  8: perl style within word boundary: \B
   syntax_element_word_start,           //  9: start of word assertion: \<
   syntax_element_word_end,             // 10: end of word assertion: \>
   syntax_element_buffer_start,         // 11: start of buffer assertion: \`
   syntax_element_buffer_end,           // 12: end of buffer assertion: \'
   syntax_element_backref,              // 13: backreference to a previously matched sub-expression
   syntax_element_long_set,             // 14: a wide character set [..], or one with multicharacter collating elements
   syntax_element_set,                  // 15: narrow character set: [...]
   syntax_element_jump,                 // 16: jump to a new state in the machine
   syntax_element_alt,                  // 17: choose between two production states
   syntax_element_rep,                  // 18: a repeat
   syntax_element_combining,            // 19: match a combining character sequence
   syntax_element_soft_buffer_end,      // 20: perl style soft buffer end: \z
   syntax_element_restart_continue,     // 21: perl style continuation: \G

   // single character repeats:
   syntax_element_dot_rep,              // 22
   syntax_element_char_rep,             // 23
   syntax_element_short_set_rep,        // 24
   syntax_element_long_set_rep,         // 25

   syntax_element_backstep,             // 26: a backstep for lookbehind repeats
   syntax_element_assert_backref,       // 27: an assertion that a mark was matched
   syntax_element_toggle_case,          // 28: NOTE(review): presumably the state type used by re_case - confirm
   syntax_element_recurse,              // 29: a recursive expression

   // Verbs:
   syntax_element_fail,                 // 30
   syntax_element_accept,               // 31
   syntax_element_commit,               // 32
   syntax_element_then                  // 33
};
0130 
#ifdef BOOST_REGEX_DEBUG
// Stream inserter for syntax_element_type, declared only when
// BOOST_REGEX_DEBUG is defined.
// dwa 09/26/00 - the explicit overload is needed to suppress warnings
// about an ambiguous conversion.
std::ostream& operator<<(std::ostream& os, syntax_element_type type);
#endif
0135 
// Declared ahead of its definition so that offset_type can point at it.
struct re_syntax_base;

/*** union offset_type ************************************************
A link to another state in the machine.  While the machine is being
constructed the link is held as an integral offset (i); before the
machine is executed it is converted to a real pointer (p).
***********************************************************************/
union offset_type
{
   re_syntax_base*   p;   // pointer form: valid once the machine is ready to execute
   std::ptrdiff_t    i;   // offset form: used during machine construction
};
0148 
0149 /*** struct re_syntax_base ********************************************
0150 Base class for all states in the machine.
0151 ***********************************************************************/
0152 struct re_syntax_base
0153 {
0154    syntax_element_type   type;         // what kind of state this is
0155    offset_type           next;         // next state in the machine
0156 };
0157 
0158 /*** struct re_brace **************************************************
0159 A marked parenthesis.
0160 ***********************************************************************/
0161 struct re_brace : public re_syntax_base
0162 {
0163    // The index to match, can be zero (don't mark the sub-expression)
0164    // or negative (for perl style (?...) extensions):
0165    int index;
0166    bool icase;
0167 };
0168 
/*** struct re_dot ****************************************************
Match anything.

The unnamed enumeration below supplies the newline-handling constants.
NOTE(review): presumably these are the values stored in, and tested
against, re_dot::mask - confirm against the matcher.
***********************************************************************/
enum
{
   // values that may be forced/selected:
   force_not_newline = 0,
   dont_care         = 1,
   force_newline     = 2,

   // values used when testing:
   test_not_newline  = 2,
   test_newline      = 3
};
0181 struct re_dot : public re_syntax_base
0182 {
0183    unsigned char mask;
0184 };
0185 
0186 /*** struct re_literal ************************************************
0187 A string of literals, following this structure will be an 
0188 array of characters: charT[length]
0189 ***********************************************************************/
0190 struct re_literal : public re_syntax_base
0191 {
0192    unsigned int length;
0193 };
0194 
0195 /*** struct re_case ************************************************
0196 Indicates whether we are moving to a case insensive block or not
0197 ***********************************************************************/
0198 struct re_case : public re_syntax_base
0199 {
0200    bool icase;
0201 };
0202 
0203 /*** struct re_set_long ***********************************************
0204 A wide character set of characters, following this structure will be
0205 an array of type charT:
0206 First csingles null-terminated strings
0207 Then 2 * cranges NULL terminated strings
0208 Then cequivalents NULL terminated strings
0209 ***********************************************************************/
0210 template <class mask_type>
0211 struct re_set_long : public re_syntax_base
0212 {
0213    unsigned int            csingles, cranges, cequivalents;
0214    mask_type               cclasses;
0215    mask_type               cnclasses;
0216    bool                    isnot;
0217    bool                    singleton;
0218 };
0219 
0220 /*** struct re_set ****************************************************
0221 A set of narrow-characters, matches any of _map which is none-zero
0222 ***********************************************************************/
0223 struct re_set : public re_syntax_base
0224 {
0225    unsigned char _map[1 << CHAR_BIT];
0226 };
0227 
0228 /*** struct re_jump ***************************************************
0229 Jump to a new location in the machine (not next).
0230 ***********************************************************************/
0231 struct re_jump : public re_syntax_base
0232 {
0233    offset_type     alt;                 // location to jump to
0234 };
0235 
0236 /*** struct re_alt ***************************************************
0237 Jump to a new location in the machine (possibly next).
0238 ***********************************************************************/
0239 struct re_alt : public re_jump
0240 {
0241    unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
0242    unsigned int    can_be_null;         // true if we match a NULL string
0243 };
0244 
0245 /*** struct re_repeat *************************************************
0246 Repeat a section of the machine
0247 ***********************************************************************/
0248 struct re_repeat : public re_alt
0249 {
0250    std::size_t   min, max;  // min and max allowable repeats
0251    int           state_id;        // Unique identifier for this repeat
0252    bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)
0253    bool          greedy;    // True if this is a greedy repeat
0254 };
0255 
0256 /*** struct re_recurse ************************************************
0257 Recurse to a particular subexpression.
0258 **********************************************************************/
0259 struct re_recurse : public re_jump
0260 {
0261    int state_id;             // identifier of first nested repeat within the recursion.
0262 };
0263 
/*** struct re_commit *************************************************
Used for the PRUNE, SKIP and COMMIT verbs.  The three basically differ
only in what happens when no match is found and the search starts moving
forward; commit_type names which of the three applies.
**********************************************************************/
enum commit_type
{
   commit_prune  = 0,
   commit_skip   = 1,
   commit_commit = 2
};
0274 struct re_commit : public re_syntax_base
0275 {
0276    commit_type action;
0277 };
0278 
0279 /*** enum re_jump_size_type *******************************************
0280 Provides compiled size of re_jump structure (allowing for trailing alignment).
0281 We provide this so we know how manybytes to insert when constructing the machine
0282 (The value of padding_mask is defined in regex_raw_buffer.hpp).
0283 ***********************************************************************/
0284 enum re_jump_size_type
0285 {
0286    re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
0287    re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
0288    re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
0289 };
0290 
0291 /*** proc re_is_set_member *********************************************
0292 Forward declaration: we'll need this one later...
0293 ***********************************************************************/
0294 
0295 template<class charT, class traits>
0296 struct regex_data;
0297 
0298 template <class iterator, class charT, class traits_type, class char_classT>
0299 iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 
0300                           iterator last, 
0301                           const re_set_long<char_classT>* set_, 
0302                           const regex_data<charT, traits_type>& e, bool icase);
0303 
0304 } // namespace BOOST_REGEX_DETAIL_NS
0305 
0306 } // namespace boost
0307 
0308 #ifdef BOOST_MSVC
0309 #pragma warning(push)
0310 #pragma warning(disable: 4103)
0311 #endif
0312 #ifdef BOOST_HAS_ABI_HEADERS
0313 #  include BOOST_ABI_SUFFIX
0314 #endif
0315 #ifdef BOOST_MSVC
0316 #pragma warning(pop)
0317 #endif
0318 
0319 #endif
0320 
0321