Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:36:56

0001 //===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 //  This file defines the Lexer interface.
0010 //
0011 //===----------------------------------------------------------------------===//
0012 
0013 #ifndef LLVM_CLANG_LEX_LEXER_H
0014 #define LLVM_CLANG_LEX_LEXER_H
0015 
0016 #include "clang/Basic/LangOptions.h"
0017 #include "clang/Basic/SourceLocation.h"
0018 #include "clang/Basic/TokenKinds.h"
0019 #include "clang/Lex/DependencyDirectivesScanner.h"
0020 #include "clang/Lex/PreprocessorLexer.h"
0021 #include "clang/Lex/Token.h"
0022 #include "llvm/ADT/SmallVector.h"
0023 #include "llvm/ADT/StringRef.h"
0024 #include <cassert>
0025 #include <cstdint>
0026 #include <optional>
0027 #include <string>
0028 
0029 namespace llvm {
0030 
0031 class MemoryBufferRef;
0032 
0033 } // namespace llvm
0034 
0035 namespace clang {
0036 
0037 class DiagnosticBuilder;
0038 class Preprocessor;
0039 class SourceManager;
0040 class LangOptions;
0041 
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,

  /// A normal or diff3 conflict marker: opened by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and closed by at least 7 ">"s.
  CMK_Normal,

  /// A Perforce-style conflict marker: opened by 4 ">"s,
  /// separated by 4 "="s, and closed by 4 "<"s.
  CMK_Perforce
};
0056 
/// Describes the bounds (start, size) of the preamble and a flag required by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// The preamble includes the BOM, if any.
struct PreambleBounds {
  /// Size of the preamble in bytes.
  unsigned Size;

  /// Whether the preamble ends at the start of a new line.
  ///
  /// Used to inform the lexer as to whether it's starting at the beginning of
  /// a line after skipping the preamble.
  bool PreambleEndsAtStartOfLine;

  PreambleBounds(unsigned PreambleSize, bool EndsAtStartOfLine)
      : Size(PreambleSize), PreambleEndsAtStartOfLine(EndsAtStartOfLine) {}
};
0073 
0074 /// Lexer - This provides a simple interface that turns a text buffer into a
0075 /// stream of tokens.  This provides no support for file reading or buffering,
0076 /// or buffering/seeking of tokens, only forward lexing is supported.  It relies
0077 /// on the specified Preprocessor object to handle preprocessor directives, etc.
0078 class Lexer : public PreprocessorLexer {
0079   friend class Preprocessor;
0080 
0081   void anchor() override;
0082 
0083   //===--------------------------------------------------------------------===//
0084   // Constant configuration values for this lexer.
0085 
0086   // Start of the buffer.
0087   const char *BufferStart;
0088 
0089   // End of the buffer.
0090   const char *BufferEnd;
0091 
0092   // Location for start of file.
0093   SourceLocation FileLoc;
0094 
0095   // LangOpts enabled by this language.
0096   // Storing LangOptions as reference here is important from performance point
0097   // of view. Lack of reference means that LangOptions copy constructor would be
0098   // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
0099   // Lexer objects are created thousands times (in Lexer::getRawToken,
0100   // Preprocessor::EnterSourceFile and other places) during single module
0101   // processing in frontend it would make std::vector<std::string> copy
0102   // constructors surprisingly hot.
0103   const LangOptions &LangOpts;
0104 
0105   // True if '//' line comments are enabled.
0106   bool LineComment;
0107 
0108   // True if lexer for _Pragma handling.
0109   bool Is_PragmaLexer;
0110 
0111   //===--------------------------------------------------------------------===//
0112   // Context-specific lexing flags set by the preprocessor.
0113   //
0114 
0115   /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
0116   /// and return them as tokens.  This is used for -C and -CC modes, and
0117   /// whitespace preservation can be useful for some clients that want to lex
0118   /// the file in raw mode and get every character from the file.
0119   ///
0120   /// When this is set to 2 it returns comments and whitespace.  When set to 1
0121   /// it returns comments, when it is set to 0 it returns normal tokens only.
0122   unsigned char ExtendedTokenMode;
0123 
0124   //===--------------------------------------------------------------------===//
0125   // Context that changes as the file is lexed.
0126   // NOTE: any state that mutates when in raw mode must have save/restore code
0127   // in Lexer::isNextPPTokenLParen.
0128 
0129   // BufferPtr - Current pointer into the buffer.  This is the next character
0130   // to be lexed.
0131   const char *BufferPtr;
0132 
0133   // IsAtStartOfLine - True if the next lexed token should get the "start of
0134   // line" flag set on it.
0135   bool IsAtStartOfLine;
0136 
0137   bool IsAtPhysicalStartOfLine;
0138 
0139   bool HasLeadingSpace;
0140 
0141   bool HasLeadingEmptyMacro;
0142 
0143   /// True if this is the first time we're lexing the input file.
0144   bool IsFirstTimeLexingFile;
0145 
0146   // NewLinePtr - A pointer to the new line character '\n' being lexed. For
0147   // '\r\n', it also points to the '\n'.
0148   const char *NewLinePtr;
0149 
0150   // CurrentConflictMarkerState - The kind of conflict marker we are handling.
0151   ConflictMarkerKind CurrentConflictMarkerState;
0152 
0153   /// Non-empty if this \p Lexer is \p isDependencyDirectivesLexer().
0154   ArrayRef<dependency_directives_scan::Directive> DepDirectives;
0155 
0156   /// If this \p Lexer is \p isDependencyDirectivesLexer(), it represents the
0157   /// next token to use from the current dependency directive.
0158   unsigned NextDepDirectiveTokenIndex = 0;
0159 
0160   void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
0161 
0162 public:
0163   /// Lexer constructor - Create a new lexer object for the specified buffer
0164   /// with the specified preprocessor managing the lexing process.  This lexer
0165   /// assumes that the associated file buffer and Preprocessor objects will
0166   /// outlive it, so it doesn't take ownership of either of them.
0167   Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
0168         bool IsFirstIncludeOfFile = true);
0169 
0170   /// Lexer constructor - Create a new raw lexer object.  This object is only
0171   /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
0172   /// text range will outlive it, so it doesn't take ownership of it.
0173   Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
0174         const char *BufStart, const char *BufPtr, const char *BufEnd,
0175         bool IsFirstIncludeOfFile = true);
0176 
0177   /// Lexer constructor - Create a new raw lexer object.  This object is only
0178   /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
0179   /// text range will outlive it, so it doesn't take ownership of it.
0180   Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
0181         const SourceManager &SM, const LangOptions &LangOpts,
0182         bool IsFirstIncludeOfFile = true);
0183 
0184   Lexer(const Lexer &) = delete;
0185   Lexer &operator=(const Lexer &) = delete;
0186 
0187   /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
0188   /// _Pragma expansion.  This has a variety of magic semantics that this method
0189   /// sets up.  It returns a new'd Lexer that must be delete'd when done.
0190   static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
0191                                    SourceLocation ExpansionLocStart,
0192                                    SourceLocation ExpansionLocEnd,
0193                                    unsigned TokLen, Preprocessor &PP);
0194 
  /// getFileLoc - Return the File Location for the file we are lexing out of.
  /// The physical location encodes the location where the characters come from,
  /// the virtual location encodes where we should *claim* the characters came
  /// from.  Currently this is only used by _Pragma handling.
  /// (FileLoc is fixed at construction; this accessor never mutates state.)
  SourceLocation getFileLoc() const { return FileLoc; }
0200 
0201   /// Lex - Return the next token in the file.  If this is the end of file, it
0202   /// return the tok::eof token.  This implicitly involves the preprocessor.
0203   bool Lex(Token &Result);
0204 
0205 private:
0206   /// Called when the preprocessor is in 'dependency scanning lexing mode'.
0207   bool LexDependencyDirectiveToken(Token &Result);
0208 
0209   /// Called when the preprocessor is in 'dependency scanning lexing mode' and
0210   /// is skipping a conditional block.
0211   bool LexDependencyDirectiveTokenWhileSkipping(Token &Result);
0212 
  /// True when the preprocessor is in 'dependency scanning lexing mode' and
  /// created this \p Lexer for lexing a set of dependency directive tokens.
  /// Determined by whether DepDirectives was populated for this lexer.
  bool isDependencyDirectivesLexer() const { return !DepDirectives.empty(); }
0216 
0217   /// Initializes \p Result with data from \p DDTok and advances \p BufferPtr to
0218   /// the position just after the token.
0219   /// \returns the buffer pointer at the beginning of the token.
0220   const char *convertDependencyDirectiveToken(
0221       const dependency_directives_scan::Token &DDTok, Token &Result);
0222 
0223 public:
  /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
  /// (Simply reports the Is_PragmaLexer flag established at construction.)
  bool isPragmaLexer() const { return Is_PragmaLexer; }
0226 
0227 private:
  /// IndirectLex - An indirect call to 'Lex' that can be invoked via
  ///  the PreprocessorLexer interface.
  /// Note: the boolean result of Lex is intentionally discarded here.
  void IndirectLex(Token &Result) override { Lex(Result); }
0231 
0232 public:
0233   /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
0234   /// associated preprocessor object.  Return true if the 'next character to
0235   /// read' pointer points at the end of the lexer buffer, false otherwise.
0236   bool LexFromRawLexer(Token &Result) {
0237     assert(LexingRawMode && "Not already in raw mode!");
0238     Lex(Result);
0239     // Note that lexing to the end of the buffer doesn't implicitly delete the
0240     // lexer when in raw mode.
0241     return BufferPtr == BufferEnd;
0242   }
0243 
0244   /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
0245   /// every character in the file, including whitespace and comments.  This
0246   /// should only be used in raw mode, as the preprocessor is not prepared to
0247   /// deal with the excess tokens.
0248   bool isKeepWhitespaceMode() const {
0249     return ExtendedTokenMode > 1;
0250   }
0251 
0252   /// SetKeepWhitespaceMode - This method lets clients enable or disable
0253   /// whitespace retention mode.
0254   void SetKeepWhitespaceMode(bool Val) {
0255     assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
0256            "Can only retain whitespace in raw mode or -traditional-cpp");
0257     ExtendedTokenMode = Val ? 2 : 0;
0258   }
0259 
0260   /// inKeepCommentMode - Return true if the lexer should return comments as
0261   /// tokens.
0262   bool inKeepCommentMode() const {
0263     return ExtendedTokenMode > 0;
0264   }
0265 
0266   /// SetCommentRetentionMode - Change the comment retention mode of the lexer
0267   /// to the specified mode.  This is really only useful when lexing in raw
0268   /// mode, because otherwise the lexer needs to manage this.
0269   void SetCommentRetentionState(bool Mode) {
0270     assert(!isKeepWhitespaceMode() &&
0271            "Can't play with comment retention state when retaining whitespace");
0272     ExtendedTokenMode = Mode ? 1 : 0;
0273   }
0274 
0275   /// Sets the extended token mode back to its initial value, according to the
0276   /// language options and preprocessor. This controls whether the lexer
0277   /// produces comment and whitespace tokens.
0278   ///
0279   /// This requires the lexer to have an associated preprocessor. A standalone
0280   /// lexer has nothing to reset to.
0281   void resetExtendedTokenMode();
0282 
0283   /// Gets source code buffer.
0284   StringRef getBuffer() const {
0285     return StringRef(BufferStart, BufferEnd - BufferStart);
0286   }
0287 
0288   /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
0289   /// uninterpreted string.  This switches the lexer out of directive mode.
0290   void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
0291 
0292 
0293   /// Diag - Forwarding function for diagnostics.  This translate a source
0294   /// position in the current buffer into a SourceLocation object for rendering.
0295   DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
0296 
0297   /// getSourceLocation - Return a source location identifier for the specified
0298   /// offset in the current file.
0299   SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
0300 
0301   /// getSourceLocation - Return a source location for the next character in
0302   /// the current file.
0303   SourceLocation getSourceLocation() override {
0304     return getSourceLocation(BufferPtr);
0305   }
0306 
  /// Return the current location in the buffer, i.e. the next character that
  /// will be lexed.
  const char *getBufferLocation() const { return BufferPtr; }
0309 
0310   /// Returns the current lexing offset.
0311   unsigned getCurrentBufferOffset() {
0312     assert(BufferPtr >= BufferStart && "Invalid buffer state");
0313     return BufferPtr - BufferStart;
0314   }
0315 
0316   /// Set the lexer's buffer pointer to \p Offset.
0317   void seek(unsigned Offset, bool IsAtStartOfLine);
0318 
0319   /// Stringify - Convert the specified string into a C string by i) escaping
0320   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
0321   /// If Charify is true, this escapes the ' character instead of ".
0322   static std::string Stringify(StringRef Str, bool Charify = false);
0323 
0324   /// Stringify - Convert the specified string into a C string by i) escaping
0325   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
0326   static void Stringify(SmallVectorImpl<char> &Str);
0327 
0328   /// getSpelling - This method is used to get the spelling of a token into a
0329   /// preallocated buffer, instead of as an std::string.  The caller is required
0330   /// to allocate enough space for the token, which is guaranteed to be at least
0331   /// Tok.getLength() bytes long.  The length of the actual result is returned.
0332   ///
0333   /// Note that this method may do two possible things: it may either fill in
0334   /// the buffer specified with characters, or it may *change the input pointer*
0335   /// to point to a constant buffer with the data already in it (avoiding a
0336   /// copy).  The caller is not allowed to modify the returned buffer pointer
0337   /// if an internal buffer is returned.
0338   static unsigned getSpelling(const Token &Tok, const char *&Buffer,
0339                               const SourceManager &SourceMgr,
0340                               const LangOptions &LangOpts,
0341                               bool *Invalid = nullptr);
0342 
0343   /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
0344   /// token is the characters used to represent the token in the source file
0345   /// after trigraph expansion and escaped-newline folding.  In particular, this
0346   /// wants to get the true, uncanonicalized, spelling of things like digraphs
0347   /// UCNs, etc.
0348   static std::string getSpelling(const Token &Tok,
0349                                  const SourceManager &SourceMgr,
0350                                  const LangOptions &LangOpts,
0351                                  bool *Invalid = nullptr);
0352 
0353   /// getSpelling - This method is used to get the spelling of the
0354   /// token at the given source location.  If, as is usually true, it
0355   /// is not necessary to copy any data, then the returned string may
0356   /// not point into the provided buffer.
0357   ///
0358   /// This method lexes at the expansion depth of the given
0359   /// location and does not jump to the expansion or spelling
0360   /// location.
0361   static StringRef getSpelling(SourceLocation loc,
0362                                SmallVectorImpl<char> &buffer,
0363                                const SourceManager &SM,
0364                                const LangOptions &options,
0365                                bool *invalid = nullptr);
0366 
0367   /// MeasureTokenLength - Relex the token at the specified location and return
0368   /// its length in bytes in the input file.  If the token needs cleaning (e.g.
0369   /// includes a trigraph or an escaped newline) then this count includes bytes
0370   /// that are part of that.
0371   static unsigned MeasureTokenLength(SourceLocation Loc,
0372                                      const SourceManager &SM,
0373                                      const LangOptions &LangOpts);
0374 
0375   /// Relex the token at the specified location.
0376   /// \returns true if there was a failure, false on success.
0377   static bool getRawToken(SourceLocation Loc, Token &Result,
0378                           const SourceManager &SM,
0379                           const LangOptions &LangOpts,
0380                           bool IgnoreWhiteSpace = false);
0381 
0382   /// Given a location any where in a source buffer, find the location
0383   /// that corresponds to the beginning of the token in which the original
0384   /// source location lands.
0385   static SourceLocation GetBeginningOfToken(SourceLocation Loc,
0386                                             const SourceManager &SM,
0387                                             const LangOptions &LangOpts);
0388 
0389   /// Get the physical length (including trigraphs and escaped newlines) of the
0390   /// first \p Characters characters of the token starting at TokStart.
0391   static unsigned getTokenPrefixLength(SourceLocation TokStart,
0392                                        unsigned CharNo,
0393                                        const SourceManager &SM,
0394                                        const LangOptions &LangOpts);
0395 
0396   /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
0397   /// location at the start of a token, return a new location that specifies a
0398   /// character within the token.  This handles trigraphs and escaped newlines.
0399   static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
0400                                                 unsigned Characters,
0401                                                 const SourceManager &SM,
0402                                                 const LangOptions &LangOpts) {
0403     return TokStart.getLocWithOffset(
0404         getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
0405   }
0406 
0407   /// Computes the source location just past the end of the
0408   /// token at this source location.
0409   ///
0410   /// This routine can be used to produce a source location that
0411   /// points just past the end of the token referenced by \p Loc, and
0412   /// is generally used when a diagnostic needs to point just after a
0413   /// token where it expected something different that it received. If
0414   /// the returned source location would not be meaningful (e.g., if
0415   /// it points into a macro), this routine returns an invalid
0416   /// source location.
0417   ///
0418   /// \param Offset an offset from the end of the token, where the source
0419   /// location should refer to. The default offset (0) produces a source
0420   /// location pointing just past the end of the token; an offset of 1 produces
0421   /// a source location pointing to the last character in the token, etc.
0422   static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
0423                                             const SourceManager &SM,
0424                                             const LangOptions &LangOpts);
0425 
0426   /// Given a token range, produce a corresponding CharSourceRange that
0427   /// is not a token range. This allows the source range to be used by
0428   /// components that don't have access to the lexer and thus can't find the
0429   /// end of the range for themselves.
0430   static CharSourceRange getAsCharRange(SourceRange Range,
0431                                         const SourceManager &SM,
0432                                         const LangOptions &LangOpts) {
0433     SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
0434     return End.isInvalid() ? CharSourceRange()
0435                            : CharSourceRange::getCharRange(
0436                                  Range.getBegin(), End);
0437   }
0438   static CharSourceRange getAsCharRange(CharSourceRange Range,
0439                                         const SourceManager &SM,
0440                                         const LangOptions &LangOpts) {
0441     return Range.isTokenRange()
0442                ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
0443                : Range;
0444   }
0445 
0446   /// Returns true if the given MacroID location points at the first
0447   /// token of the macro expansion.
0448   ///
0449   /// \param MacroBegin If non-null and function returns true, it is set to
0450   /// begin location of the macro.
0451   static bool isAtStartOfMacroExpansion(SourceLocation loc,
0452                                         const SourceManager &SM,
0453                                         const LangOptions &LangOpts,
0454                                         SourceLocation *MacroBegin = nullptr);
0455 
0456   /// Returns true if the given MacroID location points at the last
0457   /// token of the macro expansion.
0458   ///
0459   /// \param MacroEnd If non-null and function returns true, it is set to
0460   /// end location of the macro.
0461   static bool isAtEndOfMacroExpansion(SourceLocation loc,
0462                                       const SourceManager &SM,
0463                                       const LangOptions &LangOpts,
0464                                       SourceLocation *MacroEnd = nullptr);
0465 
0466   /// Accepts a range and returns a character range with file locations.
0467   ///
0468   /// Returns a null range if a part of the range resides inside a macro
0469   /// expansion or the range does not reside on the same FileID.
0470   ///
0471   /// This function is trying to deal with macros and return a range based on
0472   /// file locations. The cases where it can successfully handle macros are:
0473   ///
0474   /// -begin or end range lies at the start or end of a macro expansion, in
0475   ///  which case the location will be set to the expansion point, e.g:
0476   ///    \#define M 1 2
0477   ///    a M
0478   /// If you have a range [a, 2] (where 2 came from the macro), the function
0479   /// will return a range for "a M"
0480   /// if you have range [a, 1], the function will fail because the range
0481   /// overlaps with only a part of the macro
0482   ///
0483   /// -The macro is a function macro and the range can be mapped to the macro
0484   ///  arguments, e.g:
0485   ///    \#define M 1 2
0486   ///    \#define FM(x) x
0487   ///    FM(a b M)
0488   /// if you have range [b, 2], the function will return the file range "b M"
0489   /// inside the macro arguments.
0490   /// if you have range [a, 2], the function will return the file range
0491   /// "FM(a b M)" since the range includes all of the macro expansion.
0492   static CharSourceRange makeFileCharRange(CharSourceRange Range,
0493                                            const SourceManager &SM,
0494                                            const LangOptions &LangOpts);
0495 
0496   /// Returns a string for the source that the range encompasses.
0497   static StringRef getSourceText(CharSourceRange Range,
0498                                  const SourceManager &SM,
0499                                  const LangOptions &LangOpts,
0500                                  bool *Invalid = nullptr);
0501 
0502   /// Retrieve the name of the immediate macro expansion.
0503   ///
0504   /// This routine starts from a source location, and finds the name of the macro
0505   /// responsible for its immediate expansion. It looks through any intervening
0506   /// macro argument expansions to compute this. It returns a StringRef which
0507   /// refers to the SourceManager-owned buffer of the source where that macro
0508   /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
0509   static StringRef getImmediateMacroName(SourceLocation Loc,
0510                                          const SourceManager &SM,
0511                                          const LangOptions &LangOpts);
0512 
0513   /// Retrieve the name of the immediate macro expansion.
0514   ///
0515   /// This routine starts from a source location, and finds the name of the
0516   /// macro responsible for its immediate expansion. It looks through any
0517   /// intervening macro argument expansions to compute this. It returns a
0518   /// StringRef which refers to the SourceManager-owned buffer of the source
0519   /// where that macro name is spelled. Thus, the result shouldn't out-live
0520   /// that SourceManager.
0521   ///
0522   /// This differs from Lexer::getImmediateMacroName in that any macro argument
0523   /// location will result in the topmost function macro that accepted it.
0524   /// e.g.
0525   /// \code
0526   ///   MAC1( MAC2(foo) )
0527   /// \endcode
0528   /// for location of 'foo' token, this function will return "MAC1" while
0529   /// Lexer::getImmediateMacroName will return "MAC2".
0530   static StringRef getImmediateMacroNameForDiagnostics(
0531       SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
0532 
0533   /// Compute the preamble of the given file.
0534   ///
0535   /// The preamble of a file contains the initial comments, include directives,
0536   /// and other preprocessor directives that occur before the code in this
0537   /// particular file actually begins. The preamble of the main source file is
0538   /// a potential prefix header.
0539   ///
0540   /// \param Buffer The memory buffer containing the file's contents.
0541   ///
0542   /// \param MaxLines If non-zero, restrict the length of the preamble
0543   /// to fewer than this number of lines.
0544   ///
0545   /// \returns The offset into the file where the preamble ends and the rest
0546   /// of the file begins along with a boolean value indicating whether
0547   /// the preamble ends at the beginning of a new line.
0548   static PreambleBounds ComputePreamble(StringRef Buffer,
0549                                         const LangOptions &LangOpts,
0550                                         unsigned MaxLines = 0);
0551 
0552   /// Finds the token that comes right after the given location.
0553   ///
0554   /// Returns the next token, or std::nullopt if the location is inside a macro.
0555   static std::optional<Token> findNextToken(SourceLocation Loc,
0556                                             const SourceManager &SM,
0557                                             const LangOptions &LangOpts,
0558                                             bool IncludeComments = false);
0559 
0560   /// Finds the token that comes before the given location.
0561   static std::optional<Token> findPreviousToken(SourceLocation Loc,
0562                                                 const SourceManager &SM,
0563                                                 const LangOptions &LangOpts,
0564                                                 bool IncludeComments);
0565 
0566   /// Checks that the given token is the first token that occurs after
0567   /// the given location (this excludes comments and whitespace). Returns the
0568   /// location immediately after the specified token. If the token is not found
0569   /// or the location is inside a macro, the returned source location will be
0570   /// invalid.
0571   static SourceLocation findLocationAfterToken(SourceLocation loc,
0572                                          tok::TokenKind TKind,
0573                                          const SourceManager &SM,
0574                                          const LangOptions &LangOpts,
0575                                          bool SkipTrailingWhitespaceAndNewLine);
0576 
0577   /// Returns true if the given character could appear in an identifier.
0578   static bool isAsciiIdentifierContinueChar(char c,
0579                                             const LangOptions &LangOpts);
0580 
0581   /// Checks whether new line pointed by Str is preceded by escape
0582   /// sequence.
0583   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
0584 
  /// Represents a char and the number of bytes parsed to produce it.
  /// Returned by getCharAndSizeNoWarn, where Size may exceed 1 when the
  /// character was spelled via a trigraph, UCN, or escaped newline.
  struct SizedChar {
    char Char;      // The resulting character value.
    unsigned Size;  // Number of input bytes consumed to produce Char.
  };
0590 
0591   /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
0592   /// emit a warning.
0593   static inline SizedChar getCharAndSizeNoWarn(const char *Ptr,
0594                                                const LangOptions &LangOpts) {
0595     // If this is not a trigraph and not a UCN or escaped newline, return
0596     // quickly.
0597     if (isObviouslySimpleCharacter(Ptr[0])) {
0598       return {*Ptr, 1u};
0599     }
0600 
0601     return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
0602   }
0603 
0604   /// Returns the leading whitespace for line that corresponds to the given
0605   /// location \p Loc.
0606   static StringRef getIndentationForLine(SourceLocation Loc,
0607                                          const SourceManager &SM);
0608 
  /// Check if this is the first time we're lexing the input file.
  /// (Reflects the IsFirstIncludeOfFile flag passed to the constructor.)
  bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; }
0611 
0612 private:
0613   //===--------------------------------------------------------------------===//
0614   // Internal implementation interfaces.
0615 
0616   /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
0617   /// by Lex.
0618   ///
0619   bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
0620 
0621   bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
0622 
0623   bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
0624 
0625   /// FormTokenWithChars - When we lex a token, we have identified a span
0626   /// starting at BufferPtr, going to TokEnd that forms the token.  This method
0627   /// takes that range and assigns it to the token as its location and size.  In
0628   /// addition, since tokens cannot overlap, this also updates BufferPtr to be
0629   /// TokEnd.
0630   void FormTokenWithChars(Token &Result, const char *TokEnd,
0631                           tok::TokenKind Kind) {
0632     unsigned TokLen = TokEnd-BufferPtr;
0633     Result.setLength(TokLen);
0634     Result.setLocation(getSourceLocation(BufferPtr, TokLen));
0635     Result.setKind(Kind);
0636     BufferPtr = TokEnd;
0637   }
0638 
  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
  /// tok::l_paren token, 0 if it is something else and 2 if there are no more
  /// tokens in the buffer controlled by this lexer.
  /// NOTE(review): presumably used for function-like macro invocation
  /// detection — confirm at the call sites.
  unsigned isNextPPTokenLParen();
0643 
0644   //===--------------------------------------------------------------------===//
0645   // Lexer character reading interfaces.
0646 
0647   // This lexer is built on two interfaces for reading characters, both of which
0648   // automatically provide phase 1/2 translation.  getAndAdvanceChar is used
0649   // when we know that we will be reading a character from the input buffer and
  // that this character will be part of the result token. This occurs in (e.g.)
0651   // string processing, because we know we need to read until we find the
0652   // closing '"' character.
0653   //
0654   // The second interface is the combination of getCharAndSize with
0655   // ConsumeChar.  getCharAndSize reads a phase 1/2 translated character,
0656   // returning it and its size.  If the lexer decides that this character is
0657   // part of the current token, it calls ConsumeChar on it.  This two stage
0658   // approach allows us to emit diagnostics for characters (e.g. warnings about
0659   // trigraphs), knowing that they only are emitted if the character is
0660   // consumed.
0661 
0662   /// isObviouslySimpleCharacter - Return true if the specified character is
0663   /// obviously the same in translation phase 1 and translation phase 3.  This
0664   /// can return false for characters that end up being the same, but it will
0665   /// never return true for something that needs to be mapped.
0666   static bool isObviouslySimpleCharacter(char C) {
0667     return C != '?' && C != '\\';
0668   }
0669 
0670   /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
0671   /// advance over it, and return it.  This is tricky in several cases.  Here we
0672   /// just handle the trivial case and fall-back to the non-inlined
0673   /// getCharAndSizeSlow method to handle the hard case.
0674   inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
0675     // If this is not a trigraph and not a UCN or escaped newline, return
0676     // quickly.
0677     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
0678 
0679     auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
0680     Ptr += Size;
0681     return C;
0682   }
0683 
  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
  /// and added to a given token, check to see if there are diagnostics that
  /// need to be emitted or flags that need to be set on the token.  If so, do
  /// it.
  ///
  /// \param Ptr  Start of the character being consumed.
  /// \param Size Its byte size as previously reported by getCharAndSize.
  /// \param Tok  The token the character is being added to.
  /// \return Pointer just past the consumed character.
  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
    // Normal case: a single-byte character needed no phase-1/2 translation,
    // so no diagnostic is possible.  Just advance past it.
    if (Size == 1)
      return Ptr+Size;

    // Otherwise, re-lex the character with a current token, allowing
    // diagnostics to be emitted and flags to be set.
    return Ptr + getCharAndSizeSlow(Ptr, &Tok).Size;
  }
0697 
0698   /// getCharAndSize - Peek a single 'character' from the specified buffer,
0699   /// get its size, and return it.  This is tricky in several cases.  Here we
0700   /// just handle the trivial case and fall-back to the non-inlined
0701   /// getCharAndSizeSlow method to handle the hard case.
0702   inline char getCharAndSize(const char *Ptr, unsigned &Size) {
0703     // If this is not a trigraph and not a UCN or escaped newline, return
0704     // quickly.
0705     if (isObviouslySimpleCharacter(Ptr[0])) {
0706       Size = 1;
0707       return *Ptr;
0708     }
0709 
0710     auto CharAndSize = getCharAndSizeSlow(Ptr);
0711     Size = CharAndSize.Size;
0712     return CharAndSize.Char;
0713   }
0714 
  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
  /// method.
  /// \param Tok If non-null, the token being formed; this allows diagnostics
  ///            to be emitted and token flags to be set (see ConsumeChar).
  SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);

  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
  /// to this function.
  static unsigned getEscapedNewLineSize(const char *P);

  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  /// them), skip over them and return the first non-escaped-newline found,
  /// otherwise return P.
  static const char *SkipEscapedNewLines(const char *P);

  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
  /// diagnostic.
  static SizedChar getCharAndSizeSlowNoWarn(const char *Ptr,
                                            const LangOptions &LangOpts);
0733 
  //===--------------------------------------------------------------------===//
  // Other lexer functions.

  /// Move the lexer to the given byte \p Offset in the buffer —
  /// NOTE(review): semantics of \p StartOfLine inferred from the name;
  /// confirm against the implementation.
  void SetByteOffset(unsigned Offset, bool StartOfLine);

  void PropagateLineStartLeadingSpaceInfo(Token &Result);

  /// Lex a user-defined literal suffix following a string or character
  /// literal; \p IsStringLiteral presumably distinguishes the two cases —
  /// confirm at the call sites.
  const char *LexUDSuffix(Token &Result, const char *CurPtr,
                          bool IsStringLiteral);

  // Helper functions to lex the remainder of a token of the specific type.
  // (CurPtr generally points just past the portion already consumed —
  // confirm per function.)

  // This function handles both ASCII and Unicode identifiers after
  // the first codepoint of the identifier has been parsed.
  bool LexIdentifierContinue(Token &Result, const char *CurPtr);

  bool LexNumericConstant    (Token &Result, const char *CurPtr);
  bool LexStringLiteral      (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexRawStringLiteral   (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
  bool LexCharConstant       (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexEndOfFile          (Token &Result, const char *CurPtr);
  bool SkipWhitespace        (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SkipLineComment       (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SkipBlockComment      (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SaveLineComment       (Token &Result, const char *CurPtr);

  // Version-control conflict-marker detection/recovery; see the
  // ConflictMarkerKind enum at the top of this file.
  bool IsStartOfConflictMarker(const char *CurPtr);
  bool HandleEndOfConflictMarker(const char *CurPtr);

  bool lexEditorPlaceholder(Token &Result, const char *CurPtr);

  bool isCodeCompletionPoint(const char *CurPtr) const;

  /// Abandon the rest of the buffer: subsequent lexing sees end-of-buffer.
  void cutOffLexing() { BufferPtr = BufferEnd; }

  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);

  void codeCompleteIncludedFile(const char *PathStart,
                                const char *CompletionPoint, bool IsAngled);

  /// Try to read the numeric form of a UCN; see tryReadUCN for the shared
  /// parameter contract.
  std::optional<uint32_t>
  tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
  /// Try to read a UCN given by name; see tryReadUCN for the shared
  /// parameter contract.
  std::optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
                                          const char *SlashLoc, Token *Result);
0784 
  /// Read a universal character name.
  ///
  /// \param StartPtr The position in the source buffer after the initial '\'.
  ///                 If the UCN is syntactically well-formed (but not
  ///                 necessarily valid), this parameter will be updated to
  ///                 point to the character after the UCN.
  /// \param SlashLoc The position in the source buffer of the '\'.
  /// \param Result   The token being formed. Pass \c nullptr to suppress
  ///                 diagnostics and handle token formation in the caller.
  ///
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);

  /// Try to consume a UCN as part of an identifier at the current
  /// location.
  /// \param CurPtr Initially points to the range of characters in the source
  ///               buffer containing the '\'. Updated to point past the end of
  ///               the UCN on success.
  /// \param Size The number of characters occupied by the '\' (including
  ///             trigraphs and escaped newlines), typically as reported by
  ///             getCharAndSize.
  /// \param Result The token being produced. Marked as containing a UCN on
  ///               success.
  /// \return \c true if a UCN was lexed and it produced an acceptable
  ///         identifier character, \c false otherwise.
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                               Token &Result);

  /// Try to consume an identifier character encoded in UTF-8.
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  ///        sequence. On success, updated to point past the end of it.
  /// \param Result The token being formed.
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  ///         character was lexed, \c false otherwise.
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result);
0820 };
0821 
0822 } // namespace clang
0823 
0824 #endif // LLVM_CLANG_LEX_LEXER_H