clang/Lex/Token.h

0001 //===--- Token.h - Token interface ------------------------------*- C++ -*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 //  This file defines the Token interface.
0010 //
0011 //===----------------------------------------------------------------------===//
0012
0013 #ifndef LLVM_CLANG_LEX_TOKEN_H
0014 #define LLVM_CLANG_LEX_TOKEN_H
0015
0016 #include "clang/Basic/SourceLocation.h"
0017 #include "clang/Basic/TokenKinds.h"
0018 #include "llvm/ADT/ArrayRef.h"
0019 #include "llvm/ADT/StringRef.h"
0020 #include <cassert>
0021
0022 namespace clang {
0023
0024 class IdentifierInfo;
0025 class LangOptions;
0026
0027 /// Token - This structure provides full information about a lexed token.
0028 /// It is not intended to be space efficient, it is intended to return as much
0029 /// information as possible about each returned token.  This is expected to be
0030 /// compressed into a smaller form if memory footprint is important.
0031 ///
0032 /// The parser can create a special "annotation token" representing a stream of
0033 /// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>"
0034 /// can be represented by a single typename annotation token that carries
0035 /// information about the SourceRange of the tokens and the type object.
0036 class Token {
0037   /// The location of the token. This is actually a SourceLocation.
0038   SourceLocation::UIntTy Loc;
0039
0040   // Conceptually these next two fields could be in a union.  However, this
0041   // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical
0042   // routine. Keeping as separate members with casts until a more beautiful fix
0043   // presents itself.
0044
0045   /// UintData - This holds either the length of the token text, when
0046   /// a normal token, or the end of the SourceRange when an annotation
0047   /// token.
0048   SourceLocation::UIntTy UintData;
0049
0050   /// PtrData - This is a union of four different pointer types, which depends
0051   /// on what type of token this is:
0052   ///  Identifiers, keywords, etc:
0053   ///    This is an IdentifierInfo*, which contains the uniqued identifier
0054   ///    spelling.
0055   ///  Literals:  isLiteral() returns true.
0056   ///    This is a pointer to the start of the token in a text buffer, which
0057   ///    may be dirty (have trigraphs / escaped newlines).
0058   ///  Annotations (resolved type names, C++ scopes, etc): isAnnotation().
0059   ///    This is a pointer to sema-specific data for the annotation token.
0060   ///  Eof:
0061   ///    This is a pointer to a Decl.
0062   ///  Other:
0063   ///    This is null.
0064   void *PtrData;
0065
0066   /// Kind - The actual flavor of token this is.
0067   tok::TokenKind Kind;
0068
0069   /// Flags - Bits we track about this token, members of the TokenFlags enum.
0070   unsigned short Flags;
0071
0072 public:
0073   // Various flags set per token:
0074   enum TokenFlags {
0075     StartOfLine = 0x01,   // At start of line or only after whitespace
0076                           // (considering the line after macro expansion).
0077     LeadingSpace = 0x02,  // Whitespace exists before this token (considering
0078                           // whitespace after macro expansion).
0079     DisableExpand = 0x04, // This identifier may never be macro expanded.
0080     NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
0081     LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
0082     HasUDSuffix = 0x20,  // This string or character literal has a ud-suffix.
0083     HasUCN = 0x40,       // This identifier contains a UCN.
0084     IgnoredComma = 0x80, // This comma is not a macro argument separator (MS).
0085     StringifiedInMacro = 0x100, // This string or character literal is formed by
0086                                 // macro stringizing or charizing operator.
0087     CommaAfterElided = 0x200, // The comma following this token was elided (MS).
0088     IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
0089     IsReinjected = 0x800, // A phase 4 token that was produced before and
0090                           // re-added, e.g. via EnterTokenStream. Annotation
0091                           // tokens are *not* reinjected.
0092   };
0093   /// Accessors for the kind of this token.
0094   tok::TokenKind getKind() const { return Kind; }
0095   void setKind(tok::TokenKind K) { Kind = K; }
0096
0097   /// is/isNot - Predicates to check if this token is a specific kind, as in
0098   /// "if (Tok.is(tok::l_brace)) {...}".
0099   bool is(tok::TokenKind K) const { return Kind == K; }
0100   bool isNot(tok::TokenKind K) const { return Kind != K; }
0101   bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
0102     return is(K1) || is(K2);
0103   }
0104   template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const {
0105     return is(K1) || isOneOf(Ks...);
0106   }
0107
0108   /// Return true if this is a raw identifier (when lexing
0109   /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode).
0110   bool isAnyIdentifier() const {
0111     return tok::isAnyIdentifier(getKind());
0112   }
0113
0114   /// Return true if this is a "literal", like a numeric
0115   /// constant, string, etc.
0116   bool isLiteral() const {
0117     return tok::isLiteral(getKind());
0118   }
0119
0120   /// Return true if this is any of tok::annot_* kind tokens.
0121   bool isAnnotation() const { return tok::isAnnotation(getKind()); }
0122
0123   /// Return true if the token is a keyword that is parsed in the same
0124   /// position as a standard attribute, but that has semantic meaning
0125   /// and so cannot be a true attribute.
0126   bool isRegularKeywordAttribute() const {
0127     return tok::isRegularKeywordAttribute(getKind());
0128   }
0129
0130   /// Return the location of this token, rebuilt from the raw
0131   /// encoding stored in Loc.
0132   SourceLocation getLocation() const {
0133     return SourceLocation::getFromRawEncoding(Loc);
0134   }
0135   unsigned getLength() const {
0136     assert(!isAnnotation() && "Annotation tokens have no length field");
0137     return UintData;
0138   }
0139   /// Set the token's location from a SourceLocation's raw encoding.
0140   void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }
0141   void setLength(unsigned Len) {
0142     assert(!isAnnotation() && "Annotation tokens have no length field");
0143     UintData = Len;
0144   }
0145   /// Annotation end location; falls back to the token location if zero.
0146   SourceLocation getAnnotationEndLoc() const {
0147     assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token");
0148     return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc);
0149   }
0150   void setAnnotationEndLoc(SourceLocation L) {
0151     assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token");
0152     UintData = L.getRawEncoding();
0153   }
0154   /// Annotation end location, or the start location for other tokens.
0155   SourceLocation getLastLoc() const {
0156     return isAnnotation() ? getAnnotationEndLoc() : getLocation();
0157   }
0158   /// Annotation end location, or location + length for other tokens.
0159   SourceLocation getEndLoc() const {
0160     return isAnnotation() ? getAnnotationEndLoc()
0161                           : getLocation().getLocWithOffset(getLength());
0162   }
0163
0164   /// SourceRange of the group of tokens that this annotation token
0165   /// represents.
0166   SourceRange getAnnotationRange() const {
0167     return SourceRange(getLocation(), getAnnotationEndLoc());
0168   }
0169   void setAnnotationRange(SourceRange R) {
0170     setLocation(R.getBegin());
0171     setAnnotationEndLoc(R.getEnd());
0172   }
0173   /// Return the name tok::getTokenName() reports for this token's kind.
0174   const char *getName() const { return tok::getTokenName(Kind); }
0175
0176   /// Reset the token: unknown kind, no flags or data, default location.
0177   void startToken() {
0178     Kind = tok::unknown;
0179     Flags = 0;
0180     PtrData = nullptr;
0181     UintData = 0;
0182     Loc = SourceLocation().getRawEncoding();
0183   }
0184   /// Return true if PtrData is non-null.
0185   bool hasPtrData() const { return PtrData != nullptr; }
0186   /// Return PtrData as an IdentifierInfo*; null for literals and eof.
0187   IdentifierInfo *getIdentifierInfo() const {
0188     assert(isNot(tok::raw_identifier) &&
0189            "getIdentifierInfo() on a tok::raw_identifier token!");
0190     assert(!isAnnotation() &&
0191            "getIdentifierInfo() on an annotation token!");
0192     if (isLiteral()) return nullptr;
0193     if (is(tok::eof)) return nullptr;
0194     return (IdentifierInfo*) PtrData;
0195   }
0196   void setIdentifierInfo(IdentifierInfo *II) {
0197     PtrData = (void*) II;
0198   }
0199   /// Accessors for the pointer payload of a tok::eof token.
0200   const void *getEofData() const {
0201     assert(is(tok::eof));
0202     return reinterpret_cast<const void *>(PtrData);
0203   }
0204   void setEofData(const void *D) {
0205     assert(is(tok::eof));
0206     assert(!PtrData);
0207     PtrData = const_cast<void *>(D);
0208   }
0209
0210   /// getRawIdentifier - For a raw identifier token (i.e., an identifier
0211   /// lexed in raw mode), returns a reference to the text substring in the
0212   /// buffer if known.
0213   StringRef getRawIdentifier() const {
0214     assert(is(tok::raw_identifier));
0215     return StringRef(reinterpret_cast<const char *>(PtrData), getLength());
0216   }
0217   void setRawIdentifierData(const char *Ptr) {
0218     assert(is(tok::raw_identifier));
0219     PtrData = const_cast<char*>(Ptr);
0220   }
0221
0222   /// getLiteralData - For a literal token (numeric constant, string, etc), this
0223   /// returns a pointer to the start of it in the text buffer if known, null
0224   /// otherwise.
0225   const char *getLiteralData() const {
0226     assert(isLiteral() && "Cannot get literal data of non-literal");
0227     return reinterpret_cast<const char*>(PtrData);
0228   }
0229   void setLiteralData(const char *Ptr) {
0230     assert(isLiteral() && "Cannot set literal data of non-literal");
0231     PtrData = const_cast<char*>(Ptr);
0232   }
0233   /// Accessors for the opaque payload of an annotation token.
0234   void *getAnnotationValue() const {
0235     assert(isAnnotation() && "Used AnnotVal on non-annotation token");
0236     return PtrData;
0237   }
0238   void setAnnotationValue(void *val) {
0239     assert(isAnnotation() && "Used AnnotVal on non-annotation token");
0240     PtrData = val;
0241   }
0242
0243   /// Set the specified flag.
0244   void setFlag(TokenFlags Flag) {
0245     Flags |= Flag;
0246   }
0247
0248   /// Get the specified flag.
0249   bool getFlag(TokenFlags Flag) const {
0250     return (Flags & Flag) != 0;
0251   }
0252
0253   /// Unset the specified flag.
0254   void clearFlag(TokenFlags Flag) {
0255     Flags &= ~Flag;
0256   }
0257
0258   /// Return the internal representation of the flags.
0259   ///
0260   /// This is only intended for low-level operations such as writing tokens to
0261   /// disk.
0262   unsigned getFlags() const {
0263     return Flags;
0264   }
0265
0266   /// Set a flag to either true or false.
0267   void setFlagValue(TokenFlags Flag, bool Val) {
0268     if (Val)
0269       setFlag(Flag);
0270     else
0271       clearFlag(Flag);
0272   }
0273
0274   /// isAtStartOfLine - Return true if this token is at the start of a line.
0275   ///
0276   bool isAtStartOfLine() const { return getFlag(StartOfLine); }
0277
0278   /// Return true if this token has whitespace before it.
0279   ///
0280   bool hasLeadingSpace() const { return getFlag(LeadingSpace); }
0281
0282   /// Return true if this identifier token should never
0283   /// be expanded in the future, due to C99 6.10.3.4p2.
0284   bool isExpandDisabled() const { return getFlag(DisableExpand); }
0285
0286   /// Return true if we have an ObjC keyword identifier.
0287   bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const;
0288
0289   /// Return the ObjC keyword kind.
0290   tok::ObjCKeywordKind getObjCKeywordID() const;
0291
0292   bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const;
0293
0294   /// Return true if this token has trigraphs or escaped newlines in it.
0295   bool needsCleaning() const { return getFlag(NeedsCleaning); }
0296
0297   /// Return true if this token has an empty macro before it.
0298   ///
0299   bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); }
0300
0301   /// Return true if this token is a string or character literal which
0302   /// has a ud-suffix.
0303   bool hasUDSuffix() const { return getFlag(HasUDSuffix); }
0304
0305   /// Returns true if this token contains a universal character name.
0306   bool hasUCN() const { return getFlag(HasUCN); }
0307
0308   /// Returns true if this token is formed by the macro stringizing or
0309   /// charizing operator.
0310   bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); }
0311
0312   /// Returns true if the comma after this token was elided.
0313   bool commaAfterElided() const { return getFlag(CommaAfterElided); }
0314
0315   /// Returns true if this token is an editor placeholder.
0316   ///
0317   /// Editor placeholders are produced by the code-completion engine and are
0318   /// represented as characters between '<#' and '#>' in the source code. The
0319   /// lexer uses identifier tokens to represent placeholders.
0320   bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
0321 };
0322
0323 /// Information about the conditional stack (\#if directives)
0324 /// currently active.
0325 struct PPConditionalInfo {
0326   /// Location where the conditional started.
0327   SourceLocation IfLoc;
0328
0329   /// True if this was contained in a skipping directive, e.g.,
0330   /// in a "\#if 0" block.
0331   bool WasSkipping;
0332
0333   /// True if we have emitted tokens already, and now we're in,
0334   /// e.g., an \#else block.  Only useful in Skipping blocks.
0335   bool FoundNonSkip;
0336
0337   /// True if we've seen a \#else in this block.  If so,
0338   /// \#elif/\#else directives are not allowed.
0339   bool FoundElse;
0340 };
0341
0342 // Extra information needed for annotation tokens.
0343 struct PragmaLoopHintInfo {
0344   Token PragmaName;
0345   Token Option;
0346   ArrayRef<Token> Toks;
0347 };
0348 } // end namespace clang
0349
0350 #endif // LLVM_CLANG_LEX_TOKEN_H