Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-02-28 10:10:19

0001 // Protocol Buffers - Google's data interchange format
0002 // Copyright 2008 Google Inc.  All rights reserved.
0003 //
0004 // Use of this source code is governed by a BSD-style
0005 // license that can be found in the LICENSE file or at
0006 // https://developers.google.com/open-source/licenses/bsd
0007 
0008 // Internal JSON tokenization utilities; not public API.
0009 #ifndef GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
0010 #define GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
0011 
0012 #include <array>
0013 #include <cfloat>
0014 #include <cmath>
0015 #include <cstdint>
0016 #include <iostream>
0017 #include <limits>
0018 #include <ostream>
0019 #include <string>
0020 #include <utility>
0021 
0022 #include "absl/status/status.h"
0023 #include "absl/status/statusor.h"
0024 #include "absl/strings/match.h"
0025 #include "absl/strings/str_format.h"
0026 #include "absl/strings/string_view.h"
0027 #include "google/protobuf/descriptor.h"
0028 #include "google/protobuf/io/zero_copy_stream.h"
0029 #include "google/protobuf/json/internal/message_path.h"
0030 #include "google/protobuf/json/internal/zero_copy_buffered_stream.h"
0031 #include "google/protobuf/stubs/status_macros.h"
0032 
0033 
0034 // Must be included last.
0035 #include "google/protobuf/port_def.inc"
0036 
0037 namespace google {
0038 namespace protobuf {
0039 namespace json_internal {
0040 // This is a duplicate of JsonParseOptions from json_util.h; it must be
0041 // re-defined here so that :json_lexer does not need to depend on :json_util.
struct ParseOptions {
  // When set, fields that are not recognized are tolerated rather than
  // rejected. (Mirrors the JsonParseOptions field of the same name.)
  bool ignore_unknown_fields = false;

  // When set, enum value names are matched without regard to case.
  // (Mirrors the JsonParseOptions field of the same name.)
  bool case_insensitive_enum_parsing = false;

  // Default nesting budget used to initialize `recursion_depth`.
  static constexpr size_t kDefaultDepth = 100;

  // Remaining number of nested values the parser may descend into before it
  // gives up, as a guard against pathological input.
  int recursion_depth = static_cast<int>(kDefaultDepth);

  // Enables the non-standard syntax extensions that the original json_util2
  // parser accepted.
  //
  // The set of extensions is deliberately undocumented beyond what the unit
  // tests exercise; this switch is intended to be removed eventually. See
  // b/234868512.
  bool allow_legacy_syntax = false;
};
0060 
0061 // A position in JSON input, for error context.
0062 struct JsonLocation {
0063   // This type exists to work around an absl type that has not yet been
0064   // released.
0065   struct SourceLocation {
0066     static SourceLocation current() { return {}; }
0067   };
0068 
0069   // Line and column are both zero-indexed in-memory.
0070   size_t offset = 0;
0071   size_t line = 0;
0072   size_t col = 0;
0073   const MessagePath* path = nullptr;
0074 
0075   // Creates an absl::InvalidArgumentError with line/column information.
0076   absl::Status Invalid(absl::string_view message,
0077                        SourceLocation sl = SourceLocation::current()) const;
0078 };
0079 
0080 template <typename T>
0081 struct LocationWith {
0082   T value;
0083   JsonLocation loc;
0084 };
0085 
0086 class JsonLexer {
0087  public:
0088   // A kind of token that PeekKind() can detect.
0089   enum Kind {
0090     kObj,
0091     kArr,
0092     kStr,
0093     kNum,
0094     kTrue,
0095     kFalse,
0096     kNull,
0097   };
0098 
0099   using SourceLocation = JsonLocation::SourceLocation;
0100 
0101   JsonLexer(io::ZeroCopyInputStream* stream, const ParseOptions& options,
0102             MessagePath* path = nullptr, JsonLocation start = {})
0103       : stream_(stream), options_(options), json_loc_(start), path_(path) {
0104     json_loc_.path = path_;
0105   }
0106 
0107   const ParseOptions& options() const { return options_; }
0108 
0109   const MessagePath& path() const { return *path_; }
0110   MessagePath& path() { return *path_; }
0111 
0112   // Creates an absl::InvalidArgumentError with line/column information.
0113   absl::Status Invalid(absl::string_view message,
0114                        SourceLocation sl = SourceLocation::current()) {
0115     return json_loc_.Invalid(message, sl);
0116   }
0117 
0118   // Expects the next bytes to be parsed (after consuming whitespace) to be
0119   // exactly `literal`. If they are, consumes them; otherwise returns an error.
0120   absl::Status Expect(absl::string_view literal,
0121                       SourceLocation sl = SourceLocation::current()) {
0122     RETURN_IF_ERROR(SkipToToken());
0123     auto buffering = stream_.BufferAtLeast(literal.size());
0124     RETURN_IF_ERROR(buffering.status());
0125 
0126     if (!absl::StartsWith(stream_.Unread(), literal)) {
0127       return Invalid(
0128           absl::StrFormat("unexpected character: '%c'; expected '%s'",
0129                           stream_.PeekChar(), literal),
0130           sl);
0131     }
0132 
0133     return Advance(literal.size());
0134   }
0135 
0136   // Like Expect(), but returns a boolean. This makes it clear that the
0137   // lookahead is failible.
0138   bool Peek(absl::string_view literal) {
0139     // Suppress the error; this can only fail on EOF in which case we would
0140     // return false regardless.
0141     (void)SkipToToken();
0142     auto ignored = stream_.BufferAtLeast(literal.size());
0143     if (!absl::StartsWith(stream_.Unread(), literal)) {
0144       return false;
0145     }
0146 
0147     // We just ensured we had enough buffered so we can suppress this error.
0148     (void)Advance(literal.size());
0149     return true;
0150   }
0151 
0152   // Like Peek(string), but returns true if and only if a token of the given
0153   // kind can be lexed next. Returns false on EOF, just like Peek(string).
0154   bool Peek(Kind needle) {
0155     auto kind = PeekKind();
0156     return kind.ok() && *kind == needle;
0157   }
0158 
0159   // Consumes all whitespace and other ignored characters until the next
0160   // token.
0161   //
0162   // This function returns an error on EOF, so PeekChar() can be safely
0163   // called if it returns ok.
0164   absl::Status SkipToToken();
0165 
0166   // Returns which kind of value token (i.e., something that can occur after
0167   // a `:`) is next up to be parsed.
0168   absl::StatusOr<Kind> PeekKind();
0169 
0170   // Parses a JSON number.
0171   absl::StatusOr<LocationWith<double>> ParseNumber();
0172 
0173   // Parses a number as a string, without turning it into an integer.
0174   absl::StatusOr<LocationWith<MaybeOwnedString>> ParseRawNumber();
0175 
0176   // Parses a UTF-8 string. If the contents of the string happen to actually be
0177   // UTF-8, it will return a zero-copy view; otherwise it will allocate.
0178   absl::StatusOr<LocationWith<MaybeOwnedString>> ParseUtf8();
0179 
0180   // Walks over an array, calling `f` each time an element is reached.
0181   //
0182   // `f` should have type `() -> absl::Status`.
0183   template <typename F>
0184   absl::Status VisitArray(F f);
0185 
0186   // Walks over an object, calling `f` just after parsing each `:`.
0187   //
0188   // `f` should have type `(absl::string_view) -> absl::Status`.
0189   template <typename F>
0190   absl::Status VisitObject(F f);
0191 
0192   // Parses a single value and discards it.
0193   absl::Status SkipValue();
0194 
0195   // Forwards of functions from ZeroCopyBufferedStream.
0196 
0197   bool AtEof() {
0198     // Ignore whitespace for the purposes of finding the EOF. This will return
0199     // an error if we hit EOF, so we discard it.
0200     (void)SkipToToken();
0201     return stream_.AtEof();
0202   }
0203 
0204   absl::StatusOr<LocationWith<MaybeOwnedString>> Take(size_t len) {
0205     JsonLocation loc = json_loc_;
0206     auto taken = stream_.Take(len);
0207     RETURN_IF_ERROR(taken.status());
0208     return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
0209   }
0210 
0211   template <typename Pred>
0212   absl::StatusOr<LocationWith<MaybeOwnedString>> TakeWhile(Pred p) {
0213     JsonLocation loc = json_loc_;
0214     auto taken = stream_.TakeWhile(std::move(p));
0215     RETURN_IF_ERROR(taken.status());
0216     return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
0217   }
0218 
0219   LocationWith<Mark> BeginMark() { return {stream_.BeginMark(), json_loc_}; }
0220 
0221  private:
0222   friend BufferingGuard;
0223   friend Mark;
0224   friend MaybeOwnedString;
0225 
0226   absl::Status Push() {
0227     if (options_.recursion_depth == 0) {
0228       return Invalid("JSON content was too deeply nested");
0229     }
0230     --options_.recursion_depth;
0231     return absl::OkStatus();
0232   }
0233 
0234   void Pop() { ++options_.recursion_depth; }
0235 
0236   // Parses the next four bytes as a 16-bit hex numeral.
0237   absl::StatusOr<uint16_t> ParseU16HexCodepoint();
0238 
0239   // Parses a Unicode escape (\uXXXX); this may be a surrogate pair, so it may
0240   // consume the character that follows. Both are encoded as utf8 into
0241   // `out_utf8`; returns the number of bytes written.
0242   absl::StatusOr<size_t> ParseUnicodeEscape(char out_utf8[4]);
0243 
0244   // Parses an alphanumeric "identifier", for use with the non-standard
0245   // "unquoted keys" extension.
0246   absl::StatusOr<LocationWith<MaybeOwnedString>> ParseBareWord();
0247 
0248   absl::Status Advance(size_t bytes) {
0249     RETURN_IF_ERROR(stream_.Advance(bytes));
0250     json_loc_.offset += static_cast<int>(bytes);
0251     json_loc_.col += static_cast<int>(bytes);
0252     return absl::OkStatus();
0253   }
0254 
0255   ZeroCopyBufferedStream stream_;
0256 
0257   ParseOptions options_;
0258   JsonLocation json_loc_;
0259   MessagePath* path_;
0260 };
0261 
0262 template <typename F>
0263 absl::Status JsonLexer::VisitArray(F f) {
0264   RETURN_IF_ERROR(Expect("["));
0265   RETURN_IF_ERROR(Push());
0266 
0267   if (Peek("]")) {
0268     Pop();
0269     return absl::OkStatus();
0270   }
0271 
0272   bool has_comma = true;
0273   do {
0274     if (!has_comma) {
0275       return Invalid("expected ','");
0276     }
0277     RETURN_IF_ERROR(f());
0278     has_comma = Peek(",");
0279   } while (!Peek("]"));
0280 
0281   if (!options_.allow_legacy_syntax && has_comma) {
0282     return Invalid("expected ']'");
0283   }
0284 
0285   Pop();
0286   return absl::OkStatus();
0287 }
0288 
0289 // Walks over an object, calling `f` just after parsing each `:`.
0290 //
0291 // `f` should have type `(MaybeOwnedString&) -> absl::Status`.
0292 template <typename F>
0293 absl::Status JsonLexer::VisitObject(F f) {
0294   RETURN_IF_ERROR(Expect("{"));
0295   RETURN_IF_ERROR(Push());
0296 
0297   if (Peek("}")) {
0298     Pop();
0299     return absl::OkStatus();
0300   }
0301 
0302   bool has_comma = true;
0303   do {
0304     if (!has_comma) {
0305       return Invalid("expected ','");
0306     }
0307     RETURN_IF_ERROR(SkipToToken());
0308 
0309     absl::StatusOr<LocationWith<MaybeOwnedString>> key;
0310     if (stream_.PeekChar() == '"' || stream_.PeekChar() == '\'') {
0311       key = ParseUtf8();
0312     } else if (options_.allow_legacy_syntax) {
0313       key = ParseBareWord();
0314     } else {
0315       return Invalid("expected '\"'");
0316     }
0317 
0318     RETURN_IF_ERROR(key.status());
0319     RETURN_IF_ERROR(Expect(":"));
0320     RETURN_IF_ERROR(f(*key));
0321     has_comma = Peek(",");
0322   } while (!Peek("}"));
0323   Pop();
0324 
0325   if (!options_.allow_legacy_syntax && has_comma) {
0326     return Invalid("expected '}'");
0327   }
0328 
0329   return absl::OkStatus();
0330 }
0331 }  // namespace json_internal
0332 }  // namespace protobuf
0333 }  // namespace google
0334 
0335 #include "google/protobuf/port_undef.inc"
0336 #endif  // GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__