|
|
|||
File indexing completed on 2026-05-10 08:37:12
//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing, the parser
//     consumes this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, allowing
// to find subranges of the token stream covered by various AST nodes. Spelled
// tokens correspond directly to the source code written by the user.
//
// To allow composing these two use-cases, we also define operations that map
// between expanded and spelled tokens that produced them (macro calls,
// directives, etc).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <tuple>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file, the start offset is
/// included and the end offset is excluded from the range.
struct FileRange {
  /// EXPECTS: File.isValid() && Begin <= End.
  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
  /// are the same.
  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
            SourceLocation EndLoc);

  FileID file() const { return File; }
  /// Start is a start offset (inclusive) in the corresponding file.
  unsigned beginOffset() const { return Begin; }
  /// End offset (exclusive) in the corresponding file.
  unsigned endOffset() const { return End; }

  unsigned length() const { return End - Begin; }

  /// Check if \p Offset is inside the range.
  bool contains(unsigned Offset) const {
    return Begin <= Offset && Offset < End;
  }
  /// Check if \p Offset is inside the range or equal to its endpoint.
  bool touches(unsigned Offset) const {
    return Begin <= Offset && Offset <= End;
  }

  /// Gets the substring that this FileRange refers to.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Convert to the clang range. The returned range is always a char range,
  /// never a token range.
  CharSourceRange toCharRange(const SourceManager &SM) const;

  friend bool operator==(const FileRange &L, const FileRange &R) {
    return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
  }
  friend bool operator!=(const FileRange &L, const FileRange &R) {
    return !(L == R);
  }

private:
  FileID File;
  unsigned Begin;
  unsigned End;
};

/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);

/// A token coming directly from a file or from a macro invocation. Has just
/// enough information to locate the token in the source code.
/// Can represent both expanded and spelled tokens.
class Token {
public:
  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
  /// EXPECTS: clang::Token is not an annotation token.
  explicit Token(const clang::Token &T);

  tok::TokenKind kind() const { return Kind; }
  /// Location of the first character of a token.
  SourceLocation location() const { return Location; }
  /// Location right after the last character of a token.
  SourceLocation endLocation() const {
    return Location.getLocWithOffset(Length);
  }
  unsigned length() const { return Length; }

  /// Get the substring covered by the token. Note that will include all
  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
  ///   in\
  ///   t
  /// both have the same kind tok::kw_int, but results of text() are different.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Gets a range of this token.
  /// EXPECTS: token comes from a file, not from a macro expansion.
  FileRange range(const SourceManager &SM) const;

  /// Given two tokens inside the same file, returns a file range that starts at
  /// \p First and ends at \p Last.
  /// EXPECTS: First and Last are file tokens from the same file, Last starts
  /// after First.
  static FileRange range(const SourceManager &SM, const syntax::Token &First,
                         const syntax::Token &Last);

  std::string dumpForTests(const SourceManager &SM) const;
  /// For debugging purposes.
  std::string str() const;

private:
  SourceLocation Location;
  unsigned Length;
  tok::TokenKind Kind;
};
/// For debugging purposes. Equivalent to a call to Token::str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);

/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate a difference between those two:
///     #define FOO 10
///     int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end, the
/// spelled tokens never store an 'eof' token.
///
/// The full list of expanded tokens can be obtained with expandedTokens().
/// Spelled tokens for each of the files can be obtained via
/// spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens use findSpelledByExpanded().
///
/// To build a token buffer use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
///
/// FIXME: allow mappings into macro arguments.
class TokenBuffer {
public:
  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}

  TokenBuffer(TokenBuffer &&) = default;
  TokenBuffer(const TokenBuffer &) = delete;
  TokenBuffer &operator=(TokenBuffer &&) = default;
  TokenBuffer &operator=(const TokenBuffer &) = delete;

  /// All tokens produced by the preprocessor after all macro replacements,
  /// directives, etc. Source locations found in the clang AST will always
  /// point to one of these tokens.
  /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
  ///        into two '>' tokens by the parser. However, TokenBuffer currently
  ///        keeps it as a single '>>' token.
  llvm::ArrayRef<syntax::Token> expandedTokens() const {
    return ExpandedTokens;
  }

  /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
  /// Creates an index only once. Further calls to it will be no-op.
  void indexExpandedTokens();

  /// Returns the subrange of expandedTokens() corresponding to the closed
  /// token range R.
  /// Consider calling indexExpandedTokens() before for faster lookups.
  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;

  /// Returns the subrange of spelled tokens corresponding to AST node spanning
  /// \p Expanded. This is the text that should be replaced if a refactoring
  /// were to rewrite the node. If \p Expanded is empty, the returned value is
  /// std::nullopt.
  ///
  /// Will fail if the expanded tokens do not correspond to a sequence of
  /// spelled tokens. E.g. for the following example:
  ///
  ///   #define FIRST f1 f2 f3
  ///   #define SECOND s1 s2 s3
  ///   #define ID2(X, Y) X Y
  ///
  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
  ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
  ///
  /// the results would be:
  ///   expanded   =>  spelled
  ///   ------------------------
  ///            a  =>  a
  ///     s1 s2 s3  =>  SECOND
  ///   a f1 f2 f3  =>  a FIRST
  ///         a f1  =>  can't map
  ///        s1 s2  =>  can't map
  ///          e f  =>  e f
  ///          g h  =>  can't map
  ///
  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
  /// Complexity is logarithmic.
  std::optional<llvm::ArrayRef<syntax::Token>>
  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;

  /// Find the subranges of expanded tokens, corresponding to \p Spelled.
  ///
  /// Some spelled tokens may not be present in the expanded token stream, so
  /// this function can return an empty vector, e.g. for tokens of macro
  /// directives or disabled preprocessor branches.
  ///
  /// Some spelled tokens can be duplicated in the expanded token stream
  /// multiple times and this function will return multiple results in those
  /// cases. This happens when \p Spelled is inside a macro argument.
  ///
  /// FIXME: return correct results on macro arguments. For now, we return an
  ///        empty list.
  ///
  /// (!) will return empty vector on tokens from #define body:
  /// E.g. for the following example:
  ///
  ///   #define FIRST(A) f1 A = A f2
  ///   #define SECOND s
  ///
  ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
  /// The results would be
  ///   spelled           =>   expanded
  ///   ------------------------
  ///   #define FIRST     =>   {}
  ///   a FIRST(arg)      =>   {a f1 arg = arg f2}
  ///   arg               =>   {arg, arg} // arg #1 is before `=` and arg #2 is
  ///                                     // after `=` in the expanded tokens.
  llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
  expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// An expansion produced by the preprocessor, includes macro expansions and
  /// preprocessor directives. Preprocessor always maps a non-empty range of
  /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
  /// few examples of expansions:
  ///    #pragma once      // Expands to an empty range.
  ///    #define FOO 1 2 3 // Expands to an empty range.
  ///    FOO               // Expands to "1 2 3".
  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
  ///    #include <vector> // Expands to tokens produced by the include.
  struct Expansion {
    llvm::ArrayRef<syntax::Token> Spelled;
    llvm::ArrayRef<syntax::Token> Expanded;
  };
  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
  /// a preprocessor directive) return the subrange of expanded tokens that the
  /// macro expands to.
  std::optional<Expansion>
  expansionStartingAt(const syntax::Token *Spelled) const;
  /// Returns all expansions (partially) expanded from the specified tokens.
  /// This is the expansions whose Spelled range intersects \p Spelled.
  std::vector<Expansion>
  expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Lexed tokens of a file before preprocessing. E.g. for the following input
  ///     #define DECL(name) int name = 10
  ///     DECL(a);
  /// spelledTokens() returns
  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
  ///     "DECL", "(", "a", ")", ";"}
  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;

  /// Returns the spelled Token containing the Loc, if there are no such tokens
  /// returns nullptr.
  const syntax::Token *spelledTokenContaining(SourceLocation Loc) const;

  /// Get all tokens that expand a macro in \p FID. For the following input
  ///     #define FOO B
  ///     #define FOO2(X) int X
  ///     FOO2(XY)
  ///     int B;
  ///     FOO;
  /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
  /// respectively).
  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;

  const SourceManager &sourceManager() const { return *SourceMgr; }

  std::string dumpForTests() const;

private:
  /// Describes a mapping between a continuous subrange of spelled tokens and
  /// expanded tokens. Represents macro expansions, preprocessor directives,
  /// conditionally disabled pp regions, etc.
  ///   #define FOO 1+2
  ///   #define BAR(a) a + 1
  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
  ///   BAR(1) // invocation #2, tokens = {'1','+','1'},
  ///          //                macroTokens = {'BAR', '(', '1', ')'}.
  struct Mapping {
    // Positions in the corresponding spelled token stream. The corresponding
    // range is never empty.
    unsigned BeginSpelled = 0;
    unsigned EndSpelled = 0;
    // Positions in the expanded token stream. The corresponding range can be
    // empty.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;

    /// For debugging purposes.
    std::string str() const;
  };
  /// Spelled tokens of the file with information about the subranges.
  struct MarkedFile {
    /// Lexed, but not preprocessed, tokens of the file. These map directly to
    /// text in the corresponding files and include tokens of all preprocessor
    /// directives.
    /// FIXME: spelled tokens don't change across FileID that map to the same
    ///        FileEntry. We could consider deduplicating them to save memory.
    std::vector<syntax::Token> SpelledTokens;
    /// A sorted list to convert between the spelled and expanded token streams.
    std::vector<Mapping> Mappings;
    /// The first expanded token produced for this FileID.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;
  };

  friend class TokenCollector;

  /// Maps a single expanded token to its spelled counterpart or a mapping that
  /// produced it.
  std::pair<const syntax::Token *, const Mapping *>
  spelledForExpandedToken(const syntax::Token *Expanded) const;

  /// Returns a mapping starting before \p Spelled token, or nullptr if no
  /// such mapping exists.
  static const Mapping *
  mappingStartingBeforeSpelled(const MarkedFile &F,
                               const syntax::Token *Spelled);

  /// Convert a private Mapping to a public Expansion.
  Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
  /// Returns the file that the Spelled tokens are taken from.
  /// Asserts that they are non-empty, from a tracked file, and in-bounds.
  const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Token stream produced after preprocessing, conceptually this captures the
  /// same stream as 'clang -E' (excluding the preprocessor directives like
  /// #file, etc.).
  std::vector<syntax::Token> ExpandedTokens;
  // Index of ExpandedTokens for faster lookups by SourceLocation.
  llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
  llvm::DenseMap<FileID, MarkedFile> Files;
  // The value is never null, pointer instead of reference to avoid disabling
  // implicit assignment operator.
  const SourceManager *SourceMgr;
};

/// The spelled tokens that overlap or touch a spelling location Loc.
/// This always returns 0-2 tokens.
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);

/// The identifier token that overlaps or touches a spelling location Loc.
/// If there is none, returns nullptr.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          llvm::ArrayRef<syntax::Token> Tokens);
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          const syntax::TokenBuffer &Tokens);

/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by lexer in raw mode). This is a very low-level function, most users should
/// prefer to use TokenCollector. Lexing in raw mode produces wildly different
/// results from what one might expect when running a C++ frontend, e.g.
/// preprocessor does not run at all.
/// The result will *not* have an 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);
/// Similar to the one above; instead of the whole file, tokenizes a part of
/// it. Note that the first token might be incomplete if FR.beginOffset is not
/// at the beginning of a token, and the last token returned will start before
/// the FR.endOffset but might end after it.
std::vector<syntax::Token>
tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);

/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created on
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes.
class TokenCollector {
public:
  /// Adds the hooks to collect the tokens. Should be called before the
  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
  /// CreateASTConsumer().
  TokenCollector(Preprocessor &P);

  /// Finalizes token collection. Should be called after preprocessing is
  /// finished, i.e. after running Execute().
  [[nodiscard]] TokenBuffer consume() &&;

private:
  /// Maps from a start to an end spelling location of transformations
  /// performed by the preprocessor. These include:
  ///   1. range from '#' to the last token in the line for PP directives,
  ///   2. macro name and arguments for macro expansions.
  /// Note that we record only top-level macro expansions, intermediate
  /// expansions (e.g. inside macro arguments) are ignored.
  ///
  /// Used to find correct boundaries of macro calls and directives when
  /// building mappings from spelled to expanded tokens.
  ///
  /// Logically, at each point of the preprocessor execution there is a stack of
  /// macro expansions being processed and we could use it to recover the
  /// location information we need. However, the public preprocessor API only
  /// exposes the points when macro expansions start (when we push a macro onto
  /// the stack) and not when they end (when we pop a macro from the stack).
  /// To work around this limitation, we rely on source location information
  /// stored in this map.
  using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
  class Builder;
  class CollectPPExpansions;

  std::vector<syntax::Token> Expanded;
  // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
  PPExpansions Expansions;
  Preprocessor &PP;
  CollectPPExpansions *Collector;
};

} // namespace syntax
} // namespace clang

#endif
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|