|
||||
File indexing completed on 2025-01-31 10:12:02
0001 // Protocol Buffers - Google's data interchange format 0002 // Copyright 2008 Google Inc. All rights reserved. 0003 // 0004 // Use of this source code is governed by a BSD-style 0005 // license that can be found in the LICENSE file or at 0006 // https://developers.google.com/open-source/licenses/bsd 0007 0008 // Author: kenton@google.com (Kenton Varda) 0009 // Based on original Protocol Buffers design by 0010 // Sanjay Ghemawat, Jeff Dean, and others. 0011 // 0012 // Class for parsing tokenized text from a ZeroCopyInputStream. 0013 0014 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 0015 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 0016 0017 #include <string> 0018 #include <vector> 0019 0020 #include "google/protobuf/stubs/common.h" 0021 #include "absl/log/absl_log.h" 0022 #include "absl/strings/string_view.h" 0023 #include "google/protobuf/port.h" 0024 0025 // Must be included last. 0026 #include "google/protobuf/port_def.inc" 0027 0028 namespace google { 0029 namespace protobuf { 0030 namespace io { 0031 0032 class ZeroCopyInputStream; // zero_copy_stream.h 0033 0034 // Defined in this file. 0035 class ErrorCollector; 0036 class Tokenizer; 0037 0038 // By "column number", the proto compiler refers to a count of the number 0039 // of bytes before a given byte, except that a tab character advances to 0040 // the next multiple of 8 bytes. Note in particular that column numbers 0041 // are zero-based, while many user interfaces use one-based column numbers. 0042 typedef int ColumnNumber; 0043 0044 // Abstract interface for an object which collects the errors that occur 0045 // during parsing. A typical implementation might simply print the errors 0046 // to stdout. 0047 class PROTOBUF_EXPORT ErrorCollector { 0048 public: 0049 inline ErrorCollector() {} 0050 ErrorCollector(const ErrorCollector&) = delete; 0051 ErrorCollector& operator=(const ErrorCollector&) = delete; 0052 virtual ~ErrorCollector(); 0053 0054 // Indicates that there was an error in the input at the given line and 0055 // column numbers. The numbers are zero-based, so you may want to add 0056 // 1 to each before printing them. 0057 virtual void RecordError(int line, ColumnNumber column, 0058 absl::string_view message) 0059 = 0; 0060 0061 // Indicates that there was a warning in the input at the given line and 0062 // column numbers. The numbers are zero-based, so you may want to add 0063 // 1 to each before printing them. 0064 virtual void RecordWarning(int line, ColumnNumber column, 0065 absl::string_view message) { 0066 } 0067 0068 }; 0069 0070 // This class converts a stream of raw text into a stream of tokens for 0071 // the protocol definition parser to parse. The tokens recognized are 0072 // similar to those that make up the C language; see the TokenType enum for 0073 // precise descriptions. Whitespace and comments are skipped. By default, 0074 // C- and C++-style comments are recognized, but other styles can be used by 0075 // calling set_comment_style(). 0076 class PROTOBUF_EXPORT Tokenizer { 0077 public: 0078 // Construct a Tokenizer that reads and tokenizes text from the given 0079 // input stream and writes errors to the given error_collector. 0080 // The caller keeps ownership of input and error_collector. 0081 Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector); 0082 Tokenizer(const Tokenizer&) = delete; 0083 Tokenizer& operator=(const Tokenizer&) = delete; 0084 ~Tokenizer(); 0085 0086 enum TokenType { 0087 TYPE_START, // Next() has not yet been called. 0088 TYPE_END, // End of input reached. "text" is empty. 0089 0090 TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not 0091 // starting with a digit. It is an error for a number 0092 // to be followed by an identifier with no space in 0093 // between. 0094 TYPE_INTEGER, // A sequence of digits representing an integer. Normally 0095 // the digits are decimal, but a prefix of "0x" indicates 0096 // a hex number and a leading zero indicates octal, just 0097 // like with C numeric literals. A leading negative sign 0098 // is NOT included in the token; it's up to the parser to 0099 // interpret the unary minus operator on its own. 0100 TYPE_FLOAT, // A floating point literal, with a fractional part and/or 0101 // an exponent. Always in decimal. Again, never 0102 // negative. 0103 TYPE_STRING, // A quoted sequence of escaped characters. Either single 0104 // or double quotes can be used, but they must match. 0105 // A string literal cannot cross a line break. 0106 TYPE_SYMBOL, // Any other printable character, like '!' or '+'. 0107 // Symbols are always a single character, so "!+$%" is 0108 // four tokens. 0109 TYPE_WHITESPACE, // A sequence of whitespace. This token type is only 0110 // produced if report_whitespace() is true. It is not 0111 // reported for whitespace within comments or strings. 0112 TYPE_NEWLINE, // A newline (\n). This token type is only 0113 // produced if report_whitespace() is true and 0114 // report_newlines() is true. It is not reported for 0115 // newlines in comments or strings. 0116 }; 0117 0118 // Structure representing a token read from the token stream. 0119 struct Token { 0120 TokenType type; 0121 std::string text; // The exact text of the token as it appeared in 0122 // the input. e.g. tokens of TYPE_STRING will still 0123 // be escaped and in quotes. 0124 0125 // "line" and "column" specify the position of the first character of 0126 // the token within the input stream. They are zero-based. 0127 int line; 0128 ColumnNumber column; 0129 ColumnNumber end_column; 0130 }; 0131 0132 // Get the current token. This is updated when Next() is called. Before 0133 // the first call to Next(), current() has type TYPE_START and no contents. 0134 const Token& current() const; 0135 0136 // Return the previous token -- i.e. what current() returned before the 0137 // previous call to Next(). 0138 const Token& previous() const; 0139 0140 // Advance to the next token. Returns false if the end of the input is 0141 // reached. 0142 bool Next(); 0143 0144 // Like Next(), but also collects comments which appear between the previous 0145 // and next tokens. 0146 // 0147 // Comments which appear to be attached to the previous token are stored 0148 // in *prev_tailing_comments. Comments which appear to be attached to the 0149 // next token are stored in *next_leading_comments. Comments appearing in 0150 // between which do not appear to be attached to either will be added to 0151 // detached_comments. Any of these parameters can be NULL to simply discard 0152 // the comments. 0153 // 0154 // A series of line comments appearing on consecutive lines, with no other 0155 // tokens appearing on those lines, will be treated as a single comment. 0156 // 0157 // Only the comment content is returned; comment markers (e.g. //) are 0158 // stripped out. For block comments, leading whitespace and an asterisk will 0159 // be stripped from the beginning of each line other than the first. Newlines 0160 // are included in the output. 0161 // 0162 // Examples: 0163 // 0164 // optional int32 foo = 1; // Comment attached to foo. 0165 // // Comment attached to bar. 0166 // optional int32 bar = 2; 0167 // 0168 // optional string baz = 3; 0169 // // Comment attached to baz. 0170 // // Another line attached to baz. 0171 // 0172 // // Comment attached to qux. 0173 // // 0174 // // Another line attached to qux. 0175 // optional double qux = 4; 0176 // 0177 // // Detached comment. This is not attached to qux or corge 0178 // // because there are blank lines separating it from both. 0179 // 0180 // optional string corge = 5; 0181 // /* Block comment attached 0182 // * to corge. Leading asterisks 0183 // * will be removed. */ 0184 // /* Block comment attached to 0185 // * grault. */ 0186 // optional int32 grault = 6; 0187 bool NextWithComments(std::string* prev_trailing_comments, 0188 std::vector<std::string>* detached_comments, 0189 std::string* next_leading_comments); 0190 0191 // Parse helpers --------------------------------------------------- 0192 0193 // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 0194 // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 0195 // result is undefined (possibly an assert failure). 0196 static double ParseFloat(const std::string& text); 0197 0198 // Parses given text as if it were a TYPE_FLOAT token. Returns false if the 0199 // given text is not actually a valid float literal. 0200 static bool TryParseFloat(const std::string& text, double* result); 0201 0202 // Parses a TYPE_STRING token. This never fails, so long as the text actually 0203 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 0204 // result is undefined (possibly an assert failure). 0205 static void ParseString(const std::string& text, std::string* output); 0206 0207 // Identical to ParseString, but appends to output. 0208 static void ParseStringAppend(const std::string& text, std::string* output); 0209 0210 // Parses a TYPE_INTEGER token. Returns false if the result would be 0211 // greater than max_value. Otherwise, returns true and sets *output to the 0212 // result. If the text is not from a Token of type TYPE_INTEGER originally 0213 // parsed by a Tokenizer, the result is undefined (possibly an assert 0214 // failure). 0215 static bool ParseInteger(const std::string& text, uint64_t max_value, 0216 uint64_t* output); 0217 0218 // Options --------------------------------------------------------- 0219 0220 // Set true to allow floats to be suffixed with the letter 'f'. Tokens 0221 // which would otherwise be integers but which have the 'f' suffix will be 0222 // forced to be interpreted as floats. For all other purposes, the 'f' is 0223 // ignored. 0224 void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; } 0225 0226 // Valid values for set_comment_style(). 0227 enum CommentStyle { 0228 // Line comments begin with "//", block comments are delimited by "/*" and 0229 // "*/". 0230 CPP_COMMENT_STYLE, 0231 // Line comments begin with "#". No way to write block comments. 0232 SH_COMMENT_STYLE 0233 }; 0234 0235 // Sets the comment style. 0236 void set_comment_style(CommentStyle style) { comment_style_ = style; } 0237 0238 // Whether to require whitespace between a number and a field name. 0239 // Default is true. Do not use this; for Google-internal cleanup only. 0240 void set_require_space_after_number(bool require) { 0241 require_space_after_number_ = require; 0242 } 0243 0244 // Whether to allow string literals to span multiple lines. Default is false. 0245 // Do not use this; for Google-internal cleanup only. 0246 void set_allow_multiline_strings(bool allow) { 0247 allow_multiline_strings_ = allow; 0248 } 0249 0250 // If true, whitespace tokens are reported by Next(). 0251 // Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`. 0252 bool report_whitespace() const; 0253 void set_report_whitespace(bool report); 0254 0255 // If true, newline tokens are reported by Next(). 0256 // Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`. 0257 bool report_newlines() const; 0258 void set_report_newlines(bool report); 0259 0260 // External helper: validate an identifier. 0261 static bool IsIdentifier(const std::string& text); 0262 0263 // ----------------------------------------------------------------- 0264 private: 0265 Token current_; // Returned by current(). 0266 Token previous_; // Returned by previous(). 0267 0268 ZeroCopyInputStream* input_; 0269 ErrorCollector* error_collector_; 0270 0271 char current_char_; // == buffer_[buffer_pos_], updated by NextChar(). 0272 const char* buffer_; // Current buffer returned from input_. 0273 int buffer_size_; // Size of buffer_. 0274 int buffer_pos_; // Current position within the buffer. 0275 bool read_error_; // Did we previously encounter a read error? 0276 0277 // Line and column number of current_char_ within the whole input stream. 0278 int line_; 0279 ColumnNumber column_; 0280 0281 // String to which text should be appended as we advance through it. 0282 // Call RecordTo(&str) to start recording and StopRecording() to stop. 0283 // E.g. StartToken() calls RecordTo(¤t_.text). record_start_ is the 0284 // position within the current buffer where recording started. 0285 std::string* record_target_; 0286 int record_start_; 0287 0288 // Options. 0289 bool allow_f_after_float_; 0290 CommentStyle comment_style_; 0291 bool require_space_after_number_; 0292 bool allow_multiline_strings_; 0293 bool report_whitespace_ = false; 0294 bool report_newlines_ = false; 0295 0296 // Since we count columns we need to interpret tabs somehow. We'll take 0297 // the standard 8-character definition for lack of any way to do better. 0298 // This must match the documentation of ColumnNumber. 0299 static const int kTabWidth = 8; 0300 0301 // ----------------------------------------------------------------- 0302 // Helper methods. 0303 0304 // Consume this character and advance to the next one. 0305 void NextChar(); 0306 0307 // Read a new buffer from the input. 0308 void Refresh(); 0309 0310 inline void RecordTo(std::string* target); 0311 inline void StopRecording(); 0312 0313 // Called when the current character is the first character of a new 0314 // token (not including whitespace or comments). 0315 inline void StartToken(); 0316 // Called when the current character is the first character after the 0317 // end of the last token. After this returns, current_.text will 0318 // contain all text consumed since StartToken() was called. 0319 inline void EndToken(); 0320 0321 // Convenience method to add an error at the current line and column. 0322 void AddError(const std::string& message) { 0323 error_collector_->RecordError(line_, column_, message); 0324 } 0325 0326 // ----------------------------------------------------------------- 0327 // The following four methods are used to consume tokens of specific 0328 // types. They are actually used to consume all characters *after* 0329 // the first, since the calling function consumes the first character 0330 // in order to decide what kind of token is being read. 0331 0332 // Read and consume a string, ending when the given delimiter is 0333 // consumed. 0334 void ConsumeString(char delimiter); 0335 0336 // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER 0337 // depending on what was read. This needs to know if the first 0338 // character was a zero in order to correctly recognize hex and octal 0339 // numbers. 0340 // It also needs to know if the first character was a . to parse floating 0341 // point correctly. 0342 TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); 0343 0344 // Consume the rest of a line. 0345 void ConsumeLineComment(std::string* content); 0346 // Consume until "*/". 0347 void ConsumeBlockComment(std::string* content); 0348 0349 enum NextCommentStatus { 0350 // Started a line comment. 0351 LINE_COMMENT, 0352 0353 // Started a block comment. 0354 BLOCK_COMMENT, 0355 0356 // Consumed a slash, then realized it wasn't a comment. current_ has 0357 // been filled in with a slash token. The caller should return it. 0358 SLASH_NOT_COMMENT, 0359 0360 // We do not appear to be starting a comment here. 0361 NO_COMMENT 0362 }; 0363 0364 // If we're at the start of a new comment, consume it and return what kind 0365 // of comment it is. 0366 NextCommentStatus TryConsumeCommentStart(); 0367 0368 // If we're looking at a TYPE_WHITESPACE token and `report_whitespace_` is 0369 // true, consume it and return true. 0370 bool TryConsumeWhitespace(); 0371 0372 // If we're looking at a TYPE_NEWLINE token and `report_newlines_` is true, 0373 // consume it and return true. 0374 bool TryConsumeNewline(); 0375 0376 // ----------------------------------------------------------------- 0377 // These helper methods make the parsing code more readable. The 0378 // "character classes" referred to are defined at the top of the .cc file. 0379 // Basically it is a C++ class with one method: 0380 // static bool InClass(char c); 0381 // The method returns true if c is a member of this "class", like "Letter" 0382 // or "Digit". 0383 0384 // Returns true if the current character is of the given character 0385 // class, but does not consume anything. 0386 template <typename CharacterClass> 0387 inline bool LookingAt(); 0388 0389 // If the current character is in the given class, consume it and return 0390 // true. Otherwise return false. 0391 // e.g. TryConsumeOne<Letter>() 0392 template <typename CharacterClass> 0393 inline bool TryConsumeOne(); 0394 0395 // Like above, but try to consume the specific character indicated. 0396 inline bool TryConsume(char c); 0397 0398 // Consume zero or more of the given character class. 0399 template <typename CharacterClass> 0400 inline void ConsumeZeroOrMore(); 0401 0402 // Consume one or more of the given character class or log the given 0403 // error message. 0404 // e.g. ConsumeOneOrMore<Digit>("Expected digits."); 0405 template <typename CharacterClass> 0406 inline void ConsumeOneOrMore(const char* error); 0407 }; 0408 0409 // inline methods ==================================================== 0410 inline const Tokenizer::Token& Tokenizer::current() const { return current_; } 0411 0412 inline const Tokenizer::Token& Tokenizer::previous() const { return previous_; } 0413 0414 inline void Tokenizer::ParseString(const std::string& text, 0415 std::string* output) { 0416 output->clear(); 0417 ParseStringAppend(text, output); 0418 } 0419 0420 } // namespace io 0421 } // namespace protobuf 0422 } // namespace google 0423 0424 #include "google/protobuf/port_undef.inc" 0425 0426 #endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |