Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:37:12

0001 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 // Record tokens that a preprocessor emits and define operations to map between
0009 // the tokens written in a file and tokens produced by the preprocessor.
0010 //
0011 // When running the compiler, there are two token streams we are interested in:
0012 //   - "spelled" tokens directly correspond to a substring written in some
0013 //     source file.
//   - "expanded" tokens represent the result of preprocessing; the parser
//     consumes this token stream to produce the AST.
0016 //
0017 // Expanded tokens correspond directly to locations found in the AST, allowing
0018 // to find subranges of the token stream covered by various AST nodes. Spelled
0019 // tokens correspond directly to the source code written by the user.
0020 //
0021 // To allow composing these two use-cases, we also define operations that map
0022 // between expanded and spelled tokens that produced them (macro calls,
0023 // directives, etc).
0024 //
0025 //===----------------------------------------------------------------------===//
0026 
0027 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
0028 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
0029 
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
0042 
0043 namespace clang {
0044 class Preprocessor;
0045 
0046 namespace syntax {
0047 
0048 /// A half-open character range inside a particular file, the start offset is
0049 /// included and the end offset is excluded from the range.
0050 struct FileRange {
0051   /// EXPECTS: File.isValid() && Begin <= End.
0052   FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
0053   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
0054   FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
0055   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
0056   ///          are the same.
0057   FileRange(const SourceManager &SM, SourceLocation BeginLoc,
0058             SourceLocation EndLoc);
0059 
0060   FileID file() const { return File; }
0061   /// Start is a start offset (inclusive) in the corresponding file.
0062   unsigned beginOffset() const { return Begin; }
0063   /// End offset (exclusive) in the corresponding file.
0064   unsigned endOffset() const { return End; }
0065 
0066   unsigned length() const { return End - Begin; }
0067 
0068   /// Check if \p Offset is inside the range.
0069   bool contains(unsigned Offset) const {
0070     return Begin <= Offset && Offset < End;
0071   }
0072   /// Check \p Offset is inside the range or equal to its endpoint.
0073   bool touches(unsigned Offset) const {
0074     return Begin <= Offset && Offset <= End;
0075   }
0076 
0077   /// Gets the substring that this FileRange refers to.
0078   llvm::StringRef text(const SourceManager &SM) const;
0079 
0080   /// Convert to the clang range. The returned range is always a char range,
0081   /// never a token range.
0082   CharSourceRange toCharRange(const SourceManager &SM) const;
0083 
0084   friend bool operator==(const FileRange &L, const FileRange &R) {
0085     return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
0086   }
0087   friend bool operator!=(const FileRange &L, const FileRange &R) {
0088     return !(L == R);
0089   }
0090 
0091 private:
0092   FileID File;
0093   unsigned Begin;
0094   unsigned End;
0095 };
0096 
0097 /// For debugging purposes.
0098 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
0099 
0100 /// A token coming directly from a file or from a macro invocation. Has just
0101 /// enough information to locate the token in the source code.
0102 /// Can represent both expanded and spelled tokens.
0103 class Token {
0104 public:
0105   Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
0106   /// EXPECTS: clang::Token is not an annotation token.
0107   explicit Token(const clang::Token &T);
0108 
0109   tok::TokenKind kind() const { return Kind; }
0110   /// Location of the first character of a token.
0111   SourceLocation location() const { return Location; }
0112   /// Location right after the last character of a token.
0113   SourceLocation endLocation() const {
0114     return Location.getLocWithOffset(Length);
0115   }
0116   unsigned length() const { return Length; }
0117 
0118   /// Get the substring covered by the token. Note that will include all
0119   /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
0120   ///    in\
0121   ///    t
0122   /// both have the same kind tok::kw_int, but results of text() are different.
0123   llvm::StringRef text(const SourceManager &SM) const;
0124 
0125   /// Gets a range of this token.
0126   /// EXPECTS: token comes from a file, not from a macro expansion.
0127   FileRange range(const SourceManager &SM) const;
0128 
0129   /// Given two tokens inside the same file, returns a file range that starts at
0130   /// \p First and ends at \p Last.
0131   /// EXPECTS: First and Last are file tokens from the same file, Last starts
0132   ///          after First.
0133   static FileRange range(const SourceManager &SM, const syntax::Token &First,
0134                          const syntax::Token &Last);
0135 
0136   std::string dumpForTests(const SourceManager &SM) const;
0137   /// For debugging purposes.
0138   std::string str() const;
0139 
0140 private:
0141   SourceLocation Location;
0142   unsigned Length;
0143   tok::TokenKind Kind;
0144 };
0145 /// For debugging purposes. Equivalent to a call to Token::str().
0146 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
0147 
/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate a difference between those two:
///     #define FOO 10
///     int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end, the
/// spelled tokens never store an 'eof' token.
///
/// The full list of expanded tokens can be obtained with expandedTokens().
/// Spelled tokens for each of the files can be obtained via
/// spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens use spelledForExpanded()
/// and expandedForSpelled().
///
/// To build a token buffer use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
///
/// FIXME: allow mappings into macro arguments.
class TokenBuffer {
public:
  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}

  // Movable but not copyable: the buffer owns large token vectors.
  TokenBuffer(TokenBuffer &&) = default;
  TokenBuffer(const TokenBuffer &) = delete;
  TokenBuffer &operator=(TokenBuffer &&) = default;
  TokenBuffer &operator=(const TokenBuffer &) = delete;

  /// All tokens produced by the preprocessor after all macro replacements,
  /// directives, etc. Source locations found in the clang AST will always
  /// point to one of these tokens.
  /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
  ///        into two '>' tokens by the parser. However, TokenBuffer currently
  ///        keeps it as a single '>>' token.
  llvm::ArrayRef<syntax::Token> expandedTokens() const {
    return ExpandedTokens;
  }

  /// Builds a cache to make future calls to expandedTokens(SourceRange)
  /// faster. Creates an index only once. Further calls to it will be no-op.
  void indexExpandedTokens();

  /// Returns the subrange of expandedTokens() corresponding to the closed
  /// token range R.
  /// Consider calling indexExpandedTokens() before for faster lookups.
  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;

  /// Returns the subrange of spelled tokens corresponding to AST node spanning
  /// \p Expanded. This is the text that should be replaced if a refactoring
  /// were to rewrite the node. If \p Expanded is empty, the returned value is
  /// std::nullopt.
  ///
  /// Will fail if the expanded tokens do not correspond to a sequence of
  /// spelled tokens. E.g. for the following example:
  ///
  ///   #define FIRST f1 f2 f3
  ///   #define SECOND s1 s2 s3
  ///   #define ID2(X, Y) X Y
  ///
  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
  ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
  ///
  /// the results would be:
  ///   expanded   => spelled
  ///   ------------------------
  ///            a => a
  ///     s1 s2 s3 => SECOND
  ///   a f1 f2 f3 => a FIRST
  ///         a f1 => can't map
  ///        s1 s2 => can't map
  ///         e f  => e f
  ///         g h  => can't map
  ///
  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
  /// Complexity is logarithmic.
  std::optional<llvm::ArrayRef<syntax::Token>>
  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;

  /// Find the subranges of expanded tokens, corresponding to \p Spelled.
  ///
  /// Some spelled tokens may not be present in the expanded token stream, so
  /// this function can return an empty vector, e.g. for tokens of macro
  /// directives or disabled preprocessor branches.
  ///
  /// Some spelled tokens can be duplicated in the expanded token stream
  /// multiple times and this function will return multiple results in those
  /// cases. This happens when \p Spelled is inside a macro argument.
  ///
  /// FIXME: return correct results on macro arguments. For now, we return an
  ///        empty list.
  ///
  /// (!) will return empty vector on tokens from #define body:
  /// E.g. for the following example:
  ///
  ///   #define FIRST(A) f1 A = A f2
  ///   #define SECOND s
  ///
  ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
  /// The results would be
  ///   spelled           => expanded
  ///   ------------------------
  ///   #define FIRST     => {}
  ///   a FIRST(arg)      => {a f1 arg = arg f2}
  ///   arg               => {arg, arg} // arg #1 is before `=` and arg #2 is
  ///                                   // after `=` in the expanded tokens.
  llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
  expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// An expansion produced by the preprocessor, includes macro expansions and
  /// preprocessor directives. Preprocessor always maps a non-empty range of
  /// spelled tokens to a (possibly empty) range of expanded tokens. Here are a
  /// few examples of expansions:
  ///    #pragma once      // Expands to an empty range.
  ///    #define FOO 1 2 3 // Expands to an empty range.
  ///    FOO               // Expands to "1 2 3".
  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
  ///    #include <vector> // Expands to tokens produced by the include.
  struct Expansion {
    llvm::ArrayRef<syntax::Token> Spelled;
    llvm::ArrayRef<syntax::Token> Expanded;
  };
  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
  /// a preprocessor directive) return the subrange of expanded tokens that the
  /// macro expands to.
  std::optional<Expansion>
  expansionStartingAt(const syntax::Token *Spelled) const;
  /// Returns all expansions (partially) expanded from the specified tokens.
  /// These are the expansions whose Spelled range intersects \p Spelled.
  std::vector<Expansion>
  expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Lexed tokens of a file before preprocessing. E.g. for the following input
  ///     #define DECL(name) int name = 10
  ///     DECL(a);
  /// spelledTokens() returns
  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
  ///     "DECL", "(", "a", ")", ";"}
  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;

  /// Returns the spelled Token containing the Loc, if there are no such tokens
  /// returns nullptr.
  const syntax::Token *spelledTokenContaining(SourceLocation Loc) const;

  /// Get all tokens that expand a macro in \p FID. For the following input
  ///     #define FOO B
  ///     #define FOO2(X) int X
  ///     FOO2(XY)
  ///     int B;
  ///     FOO;
  /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
  /// respectively).
  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;

  /// The SourceManager the tokens of this buffer refer to.
  const SourceManager &sourceManager() const { return *SourceMgr; }

  /// For debugging purposes.
  std::string dumpForTests() const;

private:
  /// Describes a mapping between a contiguous subrange of spelled tokens and
  /// expanded tokens. Represents macro expansions, preprocessor directives,
  /// conditionally disabled pp regions, etc.
  ///   #define FOO 1+2
  ///   #define BAR(a) a + 1
  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
  ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
  ///                            macroTokens = {'BAR', '(', '1', ')'}.
  struct Mapping {
    // Positions in the corresponding spelled token stream. The corresponding
    // range is never empty.
    unsigned BeginSpelled = 0;
    unsigned EndSpelled = 0;
    // Positions in the expanded token stream. The corresponding range can be
    // empty.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;

    /// For debugging purposes.
    std::string str() const;
  };
  /// Spelled tokens of the file with information about the subranges.
  struct MarkedFile {
    /// Lexed, but not preprocessed, tokens of the file. These map directly to
    /// text in the corresponding files and include tokens of all preprocessor
    /// directives.
    /// FIXME: spelled tokens don't change across FileID that map to the same
    ///        FileEntry. We could consider deduplicating them to save memory.
    std::vector<syntax::Token> SpelledTokens;
    /// A sorted list to convert between the spelled and expanded token streams.
    std::vector<Mapping> Mappings;
    /// The first expanded token produced for this FileID.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;
  };

  friend class TokenCollector;

  /// Maps a single expanded token to its spelled counterpart or a mapping that
  /// produced it.
  std::pair<const syntax::Token *, const Mapping *>
  spelledForExpandedToken(const syntax::Token *Expanded) const;

  /// Returns a mapping starting before \p Spelled token, or nullptr if no
  /// such mapping exists.
  static const Mapping *
  mappingStartingBeforeSpelled(const MarkedFile &F,
                               const syntax::Token *Spelled);

  /// Convert a private Mapping to a public Expansion.
  Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
  /// Returns the file that the Spelled tokens are taken from.
  /// Asserts that they are non-empty, from a tracked file, and in-bounds.
  const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Token stream produced after preprocessing, conceptually this captures the
  /// same stream as 'clang -E' (excluding the preprocessor directives like
  /// #file, etc.).
  std::vector<syntax::Token> ExpandedTokens;
  // Index of ExpandedTokens for faster lookups by SourceLocation.
  llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
  llvm::DenseMap<FileID, MarkedFile> Files;
  // The value is never null, pointer instead of reference to avoid disabling
  // implicit assignment operator.
  const SourceManager *SourceMgr;
};
0380 
/// The spelled tokens that overlap or touch a spelling location Loc.
/// This always returns 0-2 tokens (a location on the boundary between two
/// adjacent tokens touches both of them).
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);
0387 
/// The identifier token that overlaps or touches a spelling location Loc.
/// If there is none, returns nullptr.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          llvm::ArrayRef<syntax::Token> Tokens);
/// Overload that searches the spelled tokens of a TokenBuffer.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          const syntax::TokenBuffer &Tokens);
0396 
/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by lexer in raw mode). This is a very low-level function, most users should
/// prefer to use TokenCollector. Lexing in raw mode produces wildly different
/// results from what one might expect when running a C++ frontend, e.g. the
/// preprocessor does not run at all.
/// The result will *not* have an 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);
/// Similar to the one above; instead of the whole file, tokenizes a part of
/// it. Note that the first token might be incomplete if FR.beginOffset is not
/// at the beginning of a token, and the last token returned will start before
/// FR.endOffset but might end after it.
std::vector<syntax::Token>
tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
0413 
/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created on
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes.
class TokenCollector {
public:
  /// Adds the hooks to collect the tokens. Should be called before the
  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
  /// CreateASTConsumer().
  TokenCollector(Preprocessor &P);

  /// Finalizes token collection. Should be called after preprocessing is
  /// finished, i.e. after running Execute().
  [[nodiscard]] TokenBuffer consume() &&;

private:
  /// Maps from a start to an end spelling location of transformations
  /// performed by the preprocessor. These include:
  ///   1. range from '#' to the last token in the line for PP directives,
  ///   2. macro name and arguments for macro expansions.
  /// Note that we record only top-level macro expansions, intermediate
  /// expansions (e.g. inside macro arguments) are ignored.
  ///
  /// Used to find correct boundaries of macro calls and directives when
  /// building mappings from spelled to expanded tokens.
  ///
  /// Logically, at each point of the preprocessor execution there is a stack of
  /// macro expansions being processed and we could use it to recover the
  /// location information we need. However, the public preprocessor API only
  /// exposes the points when macro expansions start (when we push a macro onto
  /// the stack) and not when they end (when we pop a macro from the stack).
  /// To work around this limitation, we rely on source location information
  /// stored in this map.
  using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
  class Builder;
  class CollectPPExpansions;

  // Expanded tokens recorded while preprocessing runs.
  std::vector<syntax::Token> Expanded;
  // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
  PPExpansions Expansions;
  // The preprocessor the collection hooks were registered on.
  Preprocessor &PP;
  // NOTE(review): ownership/lifetime of this callback object is not visible in
  // this header — presumably registered with (and owned by) the preprocessor;
  // confirm against the .cpp before relying on it.
  CollectPPExpansions *Collector;
};
0457 
0458 } // namespace syntax
0459 } // namespace clang
0460 
0461 #endif