|
||||
File indexing completed on 2025-01-18 09:51:26
0001 /* 0002 * 0003 * Copyright (c) 1998-2002 0004 * John Maddock 0005 * 0006 * Use, modification and distribution are subject to the 0007 * Boost Software License, Version 1.0. (See accompanying file 0008 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 0009 * 0010 */ 0011 0012 /* 0013 * LOCATION: see http://www.boost.org for most recent version. 0014 * FILE states.cpp 0015 * VERSION see <boost/version.hpp> 0016 * DESCRIPTION: Declares internal state machine structures. 0017 */ 0018 0019 #ifndef BOOST_REGEX_V4_STATES_HPP 0020 #define BOOST_REGEX_V4_STATES_HPP 0021 0022 #ifdef BOOST_MSVC 0023 #pragma warning(push) 0024 #pragma warning(disable: 4103) 0025 #endif 0026 #ifdef BOOST_HAS_ABI_HEADERS 0027 # include BOOST_ABI_PREFIX 0028 #endif 0029 #ifdef BOOST_MSVC 0030 #pragma warning(pop) 0031 #endif 0032 0033 namespace boost{ 0034 namespace BOOST_REGEX_DETAIL_NS{ 0035 0036 /*** mask_type ******************************************************* 0037 Whenever we have a choice of two alternatives, we use an array of bytes 0038 to indicate which of the two alternatives it is possible to take for any 0039 given input character. If mask_take is set, then we can take the next 0040 state, and if mask_skip is set then we can take the alternative. 0041 ***********************************************************************/ 0042 enum mask_type 0043 { 0044 mask_take = 1, 0045 mask_skip = 2, 0046 mask_init = 4, 0047 mask_any = mask_skip | mask_take, 0048 mask_all = mask_any 0049 }; 0050 0051 /*** helpers ********************************************************** 0052 These helpers let us use function overload resolution to detect whether 0053 we have narrow or wide character strings: 0054 ***********************************************************************/ 0055 struct _narrow_type{}; 0056 struct _wide_type{}; 0057 template <class charT> struct is_byte; 0058 template<> struct is_byte<char> { typedef _narrow_type width_type; }; 0059 template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; }; 0060 template<> struct is_byte<signed char> { typedef _narrow_type width_type; }; 0061 template <class charT> struct is_byte { typedef _wide_type width_type; }; 0062 0063 /*** enum syntax_element_type ****************************************** 0064 Every record in the state machine falls into one of the following types: 0065 ***********************************************************************/ 0066 enum syntax_element_type 0067 { 0068 // start of a marked sub-expression, or perl-style (?...) extension 0069 syntax_element_startmark = 0, 0070 // end of a marked sub-expression, or perl-style (?...) extension 0071 syntax_element_endmark = syntax_element_startmark + 1, 0072 // any sequence of literal characters 0073 syntax_element_literal = syntax_element_endmark + 1, 0074 // start of line assertion: ^ 0075 syntax_element_start_line = syntax_element_literal + 1, 0076 // end of line assertion $ 0077 syntax_element_end_line = syntax_element_start_line + 1, 0078 // match any character: . 0079 syntax_element_wild = syntax_element_end_line + 1, 0080 // end of expression: we have a match when we get here 0081 syntax_element_match = syntax_element_wild + 1, 0082 // perl style word boundary: \b 0083 syntax_element_word_boundary = syntax_element_match + 1, 0084 // perl style within word boundary: \B 0085 syntax_element_within_word = syntax_element_word_boundary + 1, 0086 // start of word assertion: \< 0087 syntax_element_word_start = syntax_element_within_word + 1, 0088 // end of word assertion: \> 0089 syntax_element_word_end = syntax_element_word_start + 1, 0090 // start of buffer assertion: \` 0091 syntax_element_buffer_start = syntax_element_word_end + 1, 0092 // end of buffer assertion: \' 0093 syntax_element_buffer_end = syntax_element_buffer_start + 1, 0094 // backreference to previously matched sub-expression 0095 syntax_element_backref = syntax_element_buffer_end + 1, 0096 // either a wide character set [..] or one with multicharacter collating elements: 0097 syntax_element_long_set = syntax_element_backref + 1, 0098 // narrow character set: [...] 0099 syntax_element_set = syntax_element_long_set + 1, 0100 // jump to a new state in the machine: 0101 syntax_element_jump = syntax_element_set + 1, 0102 // choose between two production states: 0103 syntax_element_alt = syntax_element_jump + 1, 0104 // a repeat 0105 syntax_element_rep = syntax_element_alt + 1, 0106 // match a combining character sequence 0107 syntax_element_combining = syntax_element_rep + 1, 0108 // perl style soft buffer end: \z 0109 syntax_element_soft_buffer_end = syntax_element_combining + 1, 0110 // perl style continuation: \G 0111 syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, 0112 // single character repeats: 0113 syntax_element_dot_rep = syntax_element_restart_continue + 1, 0114 syntax_element_char_rep = syntax_element_dot_rep + 1, 0115 syntax_element_short_set_rep = syntax_element_char_rep + 1, 0116 syntax_element_long_set_rep = syntax_element_short_set_rep + 1, 0117 // a backstep for lookbehind repeats: 0118 syntax_element_backstep = syntax_element_long_set_rep + 1, 0119 // an assertion that a mark was matched: 0120 syntax_element_assert_backref = syntax_element_backstep + 1, 0121 syntax_element_toggle_case = syntax_element_assert_backref + 1, 0122 // a recursive expression: 0123 syntax_element_recurse = syntax_element_toggle_case + 1, 0124 // Verbs: 0125 syntax_element_fail = syntax_element_recurse + 1, 0126 syntax_element_accept = syntax_element_fail + 1, 0127 syntax_element_commit = syntax_element_accept + 1, 0128 syntax_element_then = syntax_element_commit + 1 0129 }; 0130 0131 #ifdef BOOST_REGEX_DEBUG 0132 // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion 0133 std::ostream& operator<<(std::ostream&, syntax_element_type); 0134 #endif 0135 0136 struct re_syntax_base; 0137 0138 /*** union offset_type ************************************************ 0139 Points to another state in the machine. During machine construction 0140 we use integral offsets, but these are converted to pointers before 0141 execution of the machine. 0142 ***********************************************************************/ 0143 union offset_type 0144 { 0145 re_syntax_base* p; 0146 std::ptrdiff_t i; 0147 }; 0148 0149 /*** struct re_syntax_base ******************************************** 0150 Base class for all states in the machine. 0151 ***********************************************************************/ 0152 struct re_syntax_base 0153 { 0154 syntax_element_type type; // what kind of state this is 0155 offset_type next; // next state in the machine 0156 }; 0157 0158 /*** struct re_brace ************************************************** 0159 A marked parenthesis. 0160 ***********************************************************************/ 0161 struct re_brace : public re_syntax_base 0162 { 0163 // The index to match, can be zero (don't mark the sub-expression) 0164 // or negative (for perl style (?...) extensions): 0165 int index; 0166 bool icase; 0167 }; 0168 0169 /*** struct re_dot ************************************************** 0170 Match anything. 0171 ***********************************************************************/ 0172 enum 0173 { 0174 dont_care = 1, 0175 force_not_newline = 0, 0176 force_newline = 2, 0177 0178 test_not_newline = 2, 0179 test_newline = 3 0180 }; 0181 struct re_dot : public re_syntax_base 0182 { 0183 unsigned char mask; 0184 }; 0185 0186 /*** struct re_literal ************************************************ 0187 A string of literals, following this structure will be an 0188 array of characters: charT[length] 0189 ***********************************************************************/ 0190 struct re_literal : public re_syntax_base 0191 { 0192 unsigned int length; 0193 }; 0194 0195 /*** struct re_case ************************************************ 0196 Indicates whether we are moving to a case insensive block or not 0197 ***********************************************************************/ 0198 struct re_case : public re_syntax_base 0199 { 0200 bool icase; 0201 }; 0202 0203 /*** struct re_set_long *********************************************** 0204 A wide character set of characters, following this structure will be 0205 an array of type charT: 0206 First csingles null-terminated strings 0207 Then 2 * cranges NULL terminated strings 0208 Then cequivalents NULL terminated strings 0209 ***********************************************************************/ 0210 template <class mask_type> 0211 struct re_set_long : public re_syntax_base 0212 { 0213 unsigned int csingles, cranges, cequivalents; 0214 mask_type cclasses; 0215 mask_type cnclasses; 0216 bool isnot; 0217 bool singleton; 0218 }; 0219 0220 /*** struct re_set **************************************************** 0221 A set of narrow-characters, matches any of _map which is none-zero 0222 ***********************************************************************/ 0223 struct re_set : public re_syntax_base 0224 { 0225 unsigned char _map[1 << CHAR_BIT]; 0226 }; 0227 0228 /*** struct re_jump *************************************************** 0229 Jump to a new location in the machine (not next). 0230 ***********************************************************************/ 0231 struct re_jump : public re_syntax_base 0232 { 0233 offset_type alt; // location to jump to 0234 }; 0235 0236 /*** struct re_alt *************************************************** 0237 Jump to a new location in the machine (possibly next). 0238 ***********************************************************************/ 0239 struct re_alt : public re_jump 0240 { 0241 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump 0242 unsigned int can_be_null; // true if we match a NULL string 0243 }; 0244 0245 /*** struct re_repeat ************************************************* 0246 Repeat a section of the machine 0247 ***********************************************************************/ 0248 struct re_repeat : public re_alt 0249 { 0250 std::size_t min, max; // min and max allowable repeats 0251 int state_id; // Unique identifier for this repeat 0252 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) 0253 bool greedy; // True if this is a greedy repeat 0254 }; 0255 0256 /*** struct re_recurse ************************************************ 0257 Recurse to a particular subexpression. 0258 **********************************************************************/ 0259 struct re_recurse : public re_jump 0260 { 0261 int state_id; // identifier of first nested repeat within the recursion. 0262 }; 0263 0264 /*** struct re_commit ************************************************* 0265 Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens 0266 if no match is found and we start searching forward. 0267 **********************************************************************/ 0268 enum commit_type 0269 { 0270 commit_prune, 0271 commit_skip, 0272 commit_commit 0273 }; 0274 struct re_commit : public re_syntax_base 0275 { 0276 commit_type action; 0277 }; 0278 0279 /*** enum re_jump_size_type ******************************************* 0280 Provides compiled size of re_jump structure (allowing for trailing alignment). 0281 We provide this so we know how manybytes to insert when constructing the machine 0282 (The value of padding_mask is defined in regex_raw_buffer.hpp). 0283 ***********************************************************************/ 0284 enum re_jump_size_type 0285 { 0286 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), 0287 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), 0288 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) 0289 }; 0290 0291 /*** proc re_is_set_member ********************************************* 0292 Forward declaration: we'll need this one later... 0293 ***********************************************************************/ 0294 0295 template<class charT, class traits> 0296 struct regex_data; 0297 0298 template <class iterator, class charT, class traits_type, class char_classT> 0299 iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 0300 iterator last, 0301 const re_set_long<char_classT>* set_, 0302 const regex_data<charT, traits_type>& e, bool icase); 0303 0304 } // namespace BOOST_REGEX_DETAIL_NS 0305 0306 } // namespace boost 0307 0308 #ifdef BOOST_MSVC 0309 #pragma warning(push) 0310 #pragma warning(disable: 4103) 0311 #endif 0312 #ifdef BOOST_HAS_ABI_HEADERS 0313 # include BOOST_ABI_SUFFIX 0314 #endif 0315 #ifdef BOOST_MSVC 0316 #pragma warning(pop) 0317 #endif 0318 0319 #endif 0320 0321
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |