protobuf/io/tokenizer.h

0001 // Protocol Buffers - Google's data interchange format
0002 // Copyright 2008 Google Inc.  All rights reserved.
0003 //
0004 // Use of this source code is governed by a BSD-style
0005 // license that can be found in the LICENSE file or at
0006 // https://developers.google.com/open-source/licenses/bsd
0007
0008 // Author: kenton@google.com (Kenton Varda)
0009 //  Based on original Protocol Buffers design by
0010 //  Sanjay Ghemawat, Jeff Dean, and others.
0011 //
0012 // Class for parsing tokenized text from a ZeroCopyInputStream.
0013
0014 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
0015 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
0016
0017 #include <string>
0018 #include <vector>
0019
0020 #include "google/protobuf/stubs/common.h"
0021 #include "absl/log/absl_log.h"
0022 #include "absl/strings/string_view.h"
0023 #include "google/protobuf/port.h"
0024
0025 // Must be included last.
0026 #include "google/protobuf/port_def.inc"
0027
0028 namespace google {
0029 namespace protobuf {
0030 namespace io {
0031
0032 class ZeroCopyInputStream;  // zero_copy_stream.h
0033
0034 // Defined in this file.
0035 class ErrorCollector;
0036 class Tokenizer;
0037
0038 // By "column number", the proto compiler refers to a count of the number
0039 // of bytes before a given byte, except that a tab character advances to
0040 // the next multiple of 8 bytes.  Note in particular that column numbers
0041 // are zero-based, while many user interfaces use one-based column numbers.
0042 typedef int ColumnNumber;
0043
0044 // Abstract interface for an object which collects the errors that occur
0045 // during parsing.  A typical implementation might simply print the errors
0046 // to stdout.
0047 class PROTOBUF_EXPORT ErrorCollector {
0048  public:
0049   inline ErrorCollector() {}
0050   ErrorCollector(const ErrorCollector&) = delete;  // Not copy-constructible.
0051   ErrorCollector& operator=(const ErrorCollector&) = delete;  // Not assignable.
0052   virtual ~ErrorCollector();  // Virtual: this class is a polymorphic base.
0053
0054   // Indicates that there was an error in the input at the given line and
0055   // column numbers.  The numbers are zero-based, so you may want to add
0056   // 1 to each before printing them.
0057   virtual void RecordError(int line, ColumnNumber column,
0058                            absl::string_view message)
0059       = 0;  // Pure virtual: every concrete collector must implement this.
0060
0061   // Indicates that there was a warning in the input at the given line and
0062   // column numbers.  The numbers are zero-based, so you may want to add
0063   // 1 to each before printing them.
0064   virtual void RecordWarning(int line, ColumnNumber column,
0065                              absl::string_view message) {
0066   }  // Default is a no-op: warnings are dropped unless a subclass overrides.
0067
0068 };
0069
0070 // This class converts a stream of raw text into a stream of tokens for
0071 // the protocol definition parser to parse.  The tokens recognized are
0072 // similar to those that make up the C language; see the TokenType enum for
0073 // precise descriptions.  Whitespace and comments are skipped.  By default,
0074 // C- and C++-style comments are recognized, but other styles can be used by
0075 // calling set_comment_style().
0076 class PROTOBUF_EXPORT Tokenizer {
0077  public:
0078   // Construct a Tokenizer that reads and tokenizes text from the given
0079   // input stream and writes errors to the given error_collector.
0080   // The caller keeps ownership of input and error_collector.
0081   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
0082   Tokenizer(const Tokenizer&) = delete;  // Not copy-constructible.
0083   Tokenizer& operator=(const Tokenizer&) = delete;  // Not assignable.
0084   ~Tokenizer();
0085
0086   enum TokenType {  // Kinds of token; stored in Token::type.
0087     TYPE_START,  // Next() has not yet been called.
0088     TYPE_END,    // End of input reached.  "text" is empty.
0089
0090     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
0091                       // starting with a digit.  It is an error for a number
0092                       // to be followed by an identifier with no space in
0093                       // between.
0094     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
0095                       // the digits are decimal, but a prefix of "0x" indicates
0096                       // a hex number and a leading zero indicates octal, just
0097                       // like with C numeric literals.  A leading negative sign
0098                       // is NOT included in the token; it's up to the parser to
0099                       // interpret the unary minus operator on its own.
0100     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
0101                       // an exponent.  Always in decimal.  Again, never
0102                       // negative.
0103     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
0104                       // or double quotes can be used, but they must match.
0105                       // A string literal cannot cross a line break.
0106     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
0107                       // Symbols are always a single character, so "!+$%" is
0108                       // four tokens.
0109     TYPE_WHITESPACE,  // A sequence of whitespace.  This token type is only
0110                       // produced if report_whitespace() is true.  It is not
0111                       // reported for whitespace within comments or strings.
0112     TYPE_NEWLINE,     // A newline (\n).  This token type is only
0113                       // produced if report_whitespace() is true and
0114                       // report_newlines() is true.  It is not reported for
0115                       // newlines in comments or strings.
0116   };
0117
0118   // Structure representing a token read from the token stream.
0119   struct Token {
0120     TokenType type;
0121     std::string text;  // The exact text of the token as it appeared in
0122                        // the input.  e.g. tokens of TYPE_STRING will still
0123                        // be escaped and in quotes.
0124
0125     // "line" and "column" specify the position of the first character of
0126     // the token within the input stream.  They are zero-based.
0127     int line;
0128     ColumnNumber column;
0129     ColumnNumber end_column;  // Presumably the column just past the token's end -- TODO confirm.
0130   };
0131
0132   // Get the current token.  This is updated when Next() is called.  Before
0133   // the first call to Next(), current() has type TYPE_START and no contents.
0134   const Token& current() const;
0135
0136   // Return the previous token -- i.e. what current() returned before the
0137   // previous call to Next().
0138   const Token& previous() const;
0139
0140   // Advance to the next token.  Returns false if the end of the input is
0141   // reached.
0142   bool Next();
0143
0144   // Like Next(), but also collects comments which appear between the previous
0145   // and next tokens.
0146   //
0147   // Comments which appear to be attached to the previous token are stored
0148   // in *prev_trailing_comments.  Comments which appear to be attached to the
0149   // next token are stored in *next_leading_comments.  Comments appearing in
0150   // between which do not appear to be attached to either will be added to
0151   // *detached_comments.  Any of these parameters can be null to simply discard
0152   // the comments.
0153   //
0154   // A series of line comments appearing on consecutive lines, with no other
0155   // tokens appearing on those lines, will be treated as a single comment.
0156   //
0157   // Only the comment content is returned; comment markers (e.g. //) are
0158   // stripped out.  For block comments, leading whitespace and an asterisk will
0159   // be stripped from the beginning of each line other than the first.  Newlines
0160   // are included in the output.
0161   //
0162   // Examples:
0163   //
0164   //   optional int32 foo = 1;  // Comment attached to foo.
0165   //   // Comment attached to bar.
0166   //   optional int32 bar = 2;
0167   //
0168   //   optional string baz = 3;
0169   //   // Comment attached to baz.
0170   //   // Another line attached to baz.
0171   //
0172   //   // Comment attached to qux.
0173   //   //
0174   //   // Another line attached to qux.
0175   //   optional double qux = 4;
0176   //
0177   //   // Detached comment.  This is not attached to qux or corge
0178   //   // because there are blank lines separating it from both.
0179   //
0180   //   optional string corge = 5;
0181   //   /* Block comment attached
0182   //    * to corge.  Leading asterisks
0183   //    * will be removed. */
0184   //   /* Block comment attached to
0185   //    * grault. */
0186   //   optional int32 grault = 6;
0187   bool NextWithComments(std::string* prev_trailing_comments,
0188                         std::vector<std::string>* detached_comments,
0189                         std::string* next_leading_comments);
0190
0191   // Parse helpers ---------------------------------------------------
0192
0193   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
0194   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
0195   // result is undefined (possibly an assert failure).
0196   static double ParseFloat(const std::string& text);
0197
0198   // Parses given text as if it were a TYPE_FLOAT token.  Returns false if the
0199   // given text is not actually a valid float literal.
0200   static bool TryParseFloat(const std::string& text, double* result);
0201
0202   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
0203   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
0204   // result is undefined (possibly an assert failure).
0205   static void ParseString(const std::string& text, std::string* output);
0206
0207   // Identical to ParseString, but appends to output.
0208   static void ParseStringAppend(const std::string& text, std::string* output);
0209
0210   // Parses a TYPE_INTEGER token.  Returns false if the result would be
0211   // greater than max_value.  Otherwise, returns true and sets *output to the
0212   // result.  If the text is not from a Token of type TYPE_INTEGER originally
0213   // parsed by a Tokenizer, the result is undefined (possibly an assert
0214   // failure).
0215   static bool ParseInteger(const std::string& text, uint64_t max_value,
0216                            uint64_t* output);
0217
0218   // Options ---------------------------------------------------------
0219
0220   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
0221   // which would otherwise be integers but which have the 'f' suffix will be
0222   // forced to be interpreted as floats.  For all other purposes, the 'f' is
0223   // ignored.
0224   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
0225
0226   // Valid values for set_comment_style().
0227   enum CommentStyle {
0228     // Line comments begin with "//", block comments are delimited by "/*" and
0229     // "*/".
0230     CPP_COMMENT_STYLE,
0231     // Line comments begin with "#".  No way to write block comments.
0232     SH_COMMENT_STYLE
0233   };
0234
0235   // Sets the comment style (C- and C++-style comments are recognized by default).
0236   void set_comment_style(CommentStyle style) { comment_style_ = style; }
0237
0238   // Whether to require whitespace between a number and a field name.
0239   // Default is true. Do not use this; for Google-internal cleanup only.
0240   void set_require_space_after_number(bool require) {
0241     require_space_after_number_ = require;
0242   }
0243
0244   // Whether to allow string literals to span multiple lines. Default is false.
0245   // Do not use this; for Google-internal cleanup only.
0246   void set_allow_multiline_strings(bool allow) {
0247     allow_multiline_strings_ = allow;
0248   }
0249
0250   // If true, whitespace tokens are reported by Next().
0251   // Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
0252   bool report_whitespace() const;
0253   void set_report_whitespace(bool report);
0254
0255   // If true, newline tokens are reported by Next().
0256   // Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
0257   bool report_newlines() const;
0258   void set_report_newlines(bool report);
0259
0260   // External helper: reports whether `text` is a valid identifier.
0261   static bool IsIdentifier(const std::string& text);
0262
0263   // -----------------------------------------------------------------
0264  private:
0265   Token current_;   // Returned by current().
0266   Token previous_;  // Returned by previous().
0267
0268   ZeroCopyInputStream* input_;  // Presumably the constructor's `input`, which the caller owns.
0269   ErrorCollector* error_collector_;  // Target of AddError(); presumably caller-owned as well.
0270
0271   char current_char_;   // == buffer_[buffer_pos_], updated by NextChar().
0272   const char* buffer_;  // Current buffer returned from input_.
0273   int buffer_size_;     // Size of buffer_.
0274   int buffer_pos_;      // Current position within the buffer.
0275   bool read_error_;     // Did we previously encounter a read error?
0276
0277   // Line and column number of current_char_ within the whole input stream.
0278   int line_;
0279   ColumnNumber column_;
0280
0281   // String to which text should be appended as we advance through it.
0282   // Call RecordTo(&str) to start recording and StopRecording() to stop.
0283   // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
0284   // position within the current buffer where recording started.
0285   std::string* record_target_;
0286   int record_start_;
0287
0288   // Options.
0289   bool allow_f_after_float_;  // Written by set_allow_f_after_float().
0290   CommentStyle comment_style_;  // Written by set_comment_style().
0291   bool require_space_after_number_;  // Written by set_require_space_after_number().
0292   bool allow_multiline_strings_;  // Written by set_allow_multiline_strings().
0293   bool report_whitespace_ = false;  // Defaults to off.
0294   bool report_newlines_ = false;  // Defaults to off.
0295
0296   // Since we count columns we need to interpret tabs somehow.  We'll take
0297   // the standard 8-character definition for lack of any way to do better.
0298   // This must match the documentation of ColumnNumber.
0299   static const int kTabWidth = 8;
0300
0301   // -----------------------------------------------------------------
0302   // Helper methods.
0303
0304   // Consume this character and advance to the next one.
0305   void NextChar();
0306
0307   // Read a new buffer from the input.
0308   void Refresh();
0309
0310   inline void RecordTo(std::string* target);  // Start recording consumed text into *target.
0311   inline void StopRecording();  // Stop the recording started by RecordTo().
0312
0313   // Called when the current character is the first character of a new
0314   // token (not including whitespace or comments).
0315   inline void StartToken();
0316   // Called when the current character is the first character after the
0317   // end of the last token.  After this returns, current_.text will
0318   // contain all text consumed since StartToken() was called.
0319   inline void EndToken();
0320
0321   // Convenience method to add an error at the current line and column.
0322   void AddError(const std::string& message) {
0323     error_collector_->RecordError(line_, column_, message);
0324   }
0325
0326   // -----------------------------------------------------------------
0327   // The following four methods are used to consume tokens of specific
0328   // types.  They are actually used to consume all characters *after*
0329   // the first, since the calling function consumes the first character
0330   // in order to decide what kind of token is being read.
0331
0332   // Read and consume a string, ending when the given delimiter is
0333   // consumed.
0334   void ConsumeString(char delimiter);
0335
0336   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
0337   // depending on what was read.  This needs to know if the first
0338   // character was a zero in order to correctly recognize hex and octal
0339   // numbers.
0340   // It also needs to know if the first character was a . to parse floating
0341   // point correctly.
0342   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
0343
0344   // Consume the rest of a line.
0345   void ConsumeLineComment(std::string* content);
0346   // Consume until "*/".
0347   void ConsumeBlockComment(std::string* content);
0348
0349   enum NextCommentStatus {  // Result of TryConsumeCommentStart().
0350     // Started a line comment.
0351     LINE_COMMENT,
0352
0353     // Started a block comment.
0354     BLOCK_COMMENT,
0355
0356     // Consumed a slash, then realized it wasn't a comment.  current_ has
0357     // been filled in with a slash token.  The caller should return it.
0358     SLASH_NOT_COMMENT,
0359
0360     // We do not appear to be starting a comment here.
0361     NO_COMMENT
0362   };
0363
0364   // If we're at the start of a new comment, consume it and return what kind
0365   // of comment it is.
0366   NextCommentStatus TryConsumeCommentStart();
0367
0368   // If we're looking at a TYPE_WHITESPACE token and `report_whitespace_` is
0369   // true, consume it and return true.
0370   bool TryConsumeWhitespace();
0371
0372   // If we're looking at a TYPE_NEWLINE token and `report_newlines_` is true,
0373   // consume it and return true.
0374   bool TryConsumeNewline();
0375
0376   // -----------------------------------------------------------------
0377   // These helper methods make the parsing code more readable.  The
0378   // "character classes" referred to are defined at the top of the .cc file.
0379   // Basically it is a C++ class with one method:
0380   //   static bool InClass(char c);
0381   // The method returns true if c is a member of this "class", like "Letter"
0382   // or "Digit".
0383
0384   // Returns true if the current character is of the given character
0385   // class, but does not consume anything.
0386   template <typename CharacterClass>
0387   inline bool LookingAt();
0388
0389   // If the current character is in the given class, consume it and return
0390   // true.  Otherwise return false.
0391   // e.g. TryConsumeOne<Letter>()
0392   template <typename CharacterClass>
0393   inline bool TryConsumeOne();
0394
0395   // Like above, but try to consume the specific character indicated.
0396   inline bool TryConsume(char c);
0397
0398   // Consume zero or more of the given character class.
0399   template <typename CharacterClass>
0400   inline void ConsumeZeroOrMore();
0401
0402   // Consume one or more of the given character class or log the given
0403   // error message.
0404   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
0405   template <typename CharacterClass>
0406   inline void ConsumeOneOrMore(const char* error);
0407 };
0408
0409 // inline methods ====================================================
0410 inline auto Tokenizer::current() const -> const Token& { return current_; }
0411
0412 inline auto Tokenizer::previous() const -> const Token& { return previous_; }
0413
0414 inline void Tokenizer::ParseString(
0415     const std::string& text, std::string* output) {
0416   output->clear();  // Start empty: ParseStringAppend() only ever appends.
0417   Tokenizer::ParseStringAppend(text, output);
0418 }
0419
0420 }  // namespace io
0421 }  // namespace protobuf
0422 }  // namespace google
0423
0424 #include "google/protobuf/port_undef.inc"
0425
0426 #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__