Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:36:30

0001 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 //  This file defines the lexer for structured comments and the supporting token class.
0010 //
0011 //===----------------------------------------------------------------------===//
0012 
0013 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
0014 #define LLVM_CLANG_AST_COMMENTLEXER_H
0015 
0016 #include "clang/Basic/Diagnostic.h"
0017 #include "clang/Basic/SourceManager.h"
0018 #include "llvm/ADT/SmallString.h"
0019 #include "llvm/ADT/StringRef.h"
0020 #include "llvm/Support/Allocator.h"
0021 #include "llvm/Support/raw_ostream.h"
0022 
0023 namespace clang {
0024 namespace comments {
0025 
0026 class Lexer;                // Defined later in this header; declared a friend of Token.
0027 class TextTokenRetokenizer; // Declared a friend of Token; not defined in this header.
0028 struct CommandInfo;         // Passed by pointer to Lexer's setupAndLexVerbatim* helpers.
0029 class CommandTraits;        // Held by const reference inside Lexer.
0030 
0031 namespace tok {  // Token kinds stored in comments::Token::Kind.
0032 enum TokenKind {
0033   eof,
0034   newline,
0035   text,              // Plain text; payload read with Token::getText().
0036   unknown_command,   // Command that does not have an ID.
0037   backslash_command, // Command with an ID, that used backslash marker.
0038   at_command,        // Command with an ID, that used 'at' marker.
0039   verbatim_block_begin, // Carries an ID; see Token::getVerbatimBlockID().
0040   verbatim_block_line,  // Carries text; see Token::getVerbatimBlockText().
0041   verbatim_block_end,   // Carries an ID; see Token::getVerbatimBlockID().
0042   verbatim_line_name,   // Carries an ID; see Token::getVerbatimLineID().
0043   verbatim_line_text,   // Carries text; see Token::getVerbatimLineText().
0044   html_start_tag,     // <tag
0045   html_ident,         // attr
0046   html_equals,        // =
0047   html_quoted_string, // "blah\"blah" or 'blah\'blah'
0048   html_greater,       // >
0049   html_slash_greater, // />
0050   html_end_tag        // </tag
0051 };
0052 } // end namespace tok
0053 
0054 /// Comment token: a location, a kind, a spelling length and a kind-dependent payload.
0055 class Token {
0056   friend class Lexer;
0057   friend class TextTokenRetokenizer;
0058 
0059   /// The location of the token.
0060   SourceLocation Loc;
0061 
0062   /// The actual kind of the token.
0063   tok::TokenKind Kind;
0064 
0065   /// Integer value associated with a token.
0066   ///
0067   /// If the token is a known command, contains command ID and TextPtr is
0068   /// unused (command spelling can be found with CommandTraits).  Otherwise,
0069   /// contains the length of the string that starts at TextPtr.
0070   unsigned IntVal;  // Also holds the ID for verbatim block begin/end and verbatim line name tokens.
0071 
0072   /// Length of the token spelling in comment.  Can be 0 for synthesized
0073   /// tokens.
0074   unsigned Length;
0075 
0076   /// Start of the text value associated with a token (its length is in IntVal).
0077   const char *TextPtr;  // Setters store StringRef::data() as given; the characters are not copied.
0078 
0079 public:  // Kind-specific accessors below assert that the token has the matching kind.
0080   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
0081   void setLocation(SourceLocation SL) { Loc = SL; }
0082 
0083   SourceLocation getEndLocation() const LLVM_READONLY {
0084     if (Length == 0 || Length == 1)  // Empty or single-character token: end coincides with start.
0085       return Loc;
0086     return Loc.getLocWithOffset(Length - 1);  // Location of the last character (inclusive end).
0087   }
0088 
0089   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
0090   void setKind(tok::TokenKind K) { Kind = K; }
0091 
0092   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
0093   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
0094 
0095   unsigned getLength() const LLVM_READONLY { return Length; }
0096   void setLength(unsigned L) { Length = L; }
0097 
0098   StringRef getText() const LLVM_READONLY {
0099     assert(is(tok::text));
0100     return StringRef(TextPtr, IntVal);
0101   }
0102 
0103   void setText(StringRef Text) {
0104     assert(is(tok::text));
0105     TextPtr = Text.data();
0106     IntVal = Text.size();
0107   }
0108 
0109   StringRef getUnknownCommandName() const LLVM_READONLY {
0110     assert(is(tok::unknown_command));
0111     return StringRef(TextPtr, IntVal);
0112   }
0113 
0114   void setUnknownCommandName(StringRef Name) {
0115     assert(is(tok::unknown_command));
0116     TextPtr = Name.data();
0117     IntVal = Name.size();
0118   }
0119 
0120   unsigned getCommandID() const LLVM_READONLY {
0121     assert(is(tok::backslash_command) || is(tok::at_command));
0122     return IntVal;
0123   }
0124 
0125   void setCommandID(unsigned ID) {
0126     assert(is(tok::backslash_command) || is(tok::at_command));
0127     IntVal = ID;
0128   }
0129 
0130   unsigned getVerbatimBlockID() const LLVM_READONLY {
0131     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
0132     return IntVal;
0133   }
0134 
0135   void setVerbatimBlockID(unsigned ID) {
0136     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
0137     IntVal = ID;
0138   }
0139 
0140   StringRef getVerbatimBlockText() const LLVM_READONLY {
0141     assert(is(tok::verbatim_block_line));
0142     return StringRef(TextPtr, IntVal);
0143   }
0144 
0145   void setVerbatimBlockText(StringRef Text) {
0146     assert(is(tok::verbatim_block_line));
0147     TextPtr = Text.data();
0148     IntVal = Text.size();
0149   }
0150 
0151   unsigned getVerbatimLineID() const LLVM_READONLY {
0152     assert(is(tok::verbatim_line_name));
0153     return IntVal;
0154   }
0155 
0156   void setVerbatimLineID(unsigned ID) {
0157     assert(is(tok::verbatim_line_name));
0158     IntVal = ID;
0159   }
0160 
0161   StringRef getVerbatimLineText() const LLVM_READONLY {
0162     assert(is(tok::verbatim_line_text));
0163     return StringRef(TextPtr, IntVal);
0164   }
0165 
0166   void setVerbatimLineText(StringRef Text) {
0167     assert(is(tok::verbatim_line_text));
0168     TextPtr = Text.data();
0169     IntVal = Text.size();
0170   }
0171 
0172   StringRef getHTMLTagStartName() const LLVM_READONLY {
0173     assert(is(tok::html_start_tag));
0174     return StringRef(TextPtr, IntVal);
0175   }
0176 
0177   void setHTMLTagStartName(StringRef Name) {
0178     assert(is(tok::html_start_tag));
0179     TextPtr = Name.data();
0180     IntVal = Name.size();
0181   }
0182 
0183   StringRef getHTMLIdent() const LLVM_READONLY {
0184     assert(is(tok::html_ident));
0185     return StringRef(TextPtr, IntVal);
0186   }
0187 
0188   void setHTMLIdent(StringRef Name) {
0189     assert(is(tok::html_ident));
0190     TextPtr = Name.data();
0191     IntVal = Name.size();
0192   }
0193 
0194   StringRef getHTMLQuotedString() const LLVM_READONLY {
0195     assert(is(tok::html_quoted_string));
0196     return StringRef(TextPtr, IntVal);
0197   }
0198 
0199   void setHTMLQuotedString(StringRef Str) {
0200     assert(is(tok::html_quoted_string));
0201     TextPtr = Str.data();
0202     IntVal = Str.size();
0203   }
0204 
0205   StringRef getHTMLTagEndName() const LLVM_READONLY {
0206     assert(is(tok::html_end_tag));
0207     return StringRef(TextPtr, IntVal);
0208   }
0209 
0210   void setHTMLTagEndName(StringRef Name) {
0211     assert(is(tok::html_end_tag));
0212     TextPtr = Name.data();
0213     IntVal = Name.size();
0214   }
0215 
0216   void dump(const Lexer &L, const SourceManager &SM) const;  // Declared only; not defined in this header.
0217 };
0218 
0219 /// Comment lexer.  Works on the buffer delimited by BufferStart and BufferEnd; see lex().
0220 class Lexer {
0221 private:
0222   Lexer(const Lexer &) = delete;  // Not copyable.
0223   void operator=(const Lexer &) = delete;
0224 
0225   /// Allocator for strings that are semantic values of tokens and have to be
0226   /// computed (for example, resolved decimal character references).
0227   llvm::BumpPtrAllocator &Allocator;
0228 
0229   DiagnosticsEngine &Diags;
0230 
0231   const CommandTraits &Traits;
0232 
0233   const char *const BufferStart;  // Fixed bounds of the lexed buffer (checked in getSourceLocation).
0234   const char *const BufferEnd;
0235 
0236   const char *BufferPtr;  // Current position; formTextToken takes the token text starting here.
0237 
0238   /// One past end pointer for the current comment.  For BCPL comments points
0239   /// to newline or BufferEnd, for C comments points to star in '*/'.
0240   const char *CommentEnd;
0241 
0242   SourceLocation FileLoc;  // Base location; getSourceLocation adds the offset from BufferStart to it.
0243 
0244   /// If true, the commands, html tags, etc will be parsed and reported as
0245   /// separate tokens inside the comment body. If false, the comment text will
0246   /// be parsed into text and newline tokens.
0247   bool ParseCommands;
0248 
0249   enum LexerCommentState : uint8_t {
0250     LCS_BeforeComment,
0251     LCS_InsideBCPLComment,
0252     LCS_InsideCComment,
0253     LCS_BetweenComments
0254   };
0255 
0256   /// Low-level lexer state, track if we are inside or outside of comment.
0257   LexerCommentState CommentState;
0258 
0259   enum LexerState : uint8_t {
0260     /// Lexing normal comment text
0261     LS_Normal,
0262 
0263     /// Finished lexing verbatim block beginning command, will lex first body
0264     /// line.
0265     LS_VerbatimBlockFirstLine,
0266 
0267     /// Lexing verbatim block body line-by-line, skipping line-starting
0268     /// decorations.
0269     LS_VerbatimBlockBody,
0270 
0271     /// Finished lexing verbatim line beginning command, will lex text (one
0272     /// line).
0273     LS_VerbatimLineText,
0274 
0275     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
0276     LS_HTMLStartTag,
0277 
0278     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
0279     LS_HTMLEndTag
0280   };
0281 
0282   /// Current lexing mode.
0283   LexerState State;
0284 
0285   /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains the name of verbatim end
0286   /// command, including command marker.
0287   SmallString<16> VerbatimBlockEndCommandName;
0288 
0289   /// Given a character reference name (e.g., "lt"), return the character that
0290   /// it stands for (e.g., "<").
0291   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
0292 
0293   /// Given a Unicode codepoint as base-10 integer, return the character.
0294   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
0295 
0296   /// Given a Unicode codepoint as base-16 integer, return the character.
0297   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
0298 
0299   void formTokenWithChars(Token &Result, const char *TokEnd,
0300                           tok::TokenKind Kind);
0301 
0302   void formTextToken(Token &Result, const char *TokEnd) {  // Forms a tok::text token spanning [BufferPtr, TokEnd).
0303     StringRef Text(BufferPtr, TokEnd - BufferPtr);  // Captured before formTokenWithChars is called.
0304     formTokenWithChars(Result, TokEnd, tok::text);
0305     Result.setText(Text);  // setText asserts tok::text; presumably set by the call above -- confirm in the .cpp.
0306   }
0307 
0308   SourceLocation getSourceLocation(const char *Loc) const {  // Maps a pointer into the buffer to a SourceLocation.
0309     assert(Loc >= BufferStart && Loc <= BufferEnd &&
0310            "Location out of range for this buffer!");
0311 
0312     const unsigned CharNo = Loc - BufferStart;  // Offset of Loc from the start of the buffer.
0313     return FileLoc.getLocWithOffset(CharNo);
0314   }
0315 
0316   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {  // Shorthand for Diags.Report.
0317     return Diags.Report(Loc, DiagID);
0318   }
0319 
0320   /// Eat string matching regexp \code \s*\* \endcode.
0321   void skipLineStartingDecorations();
0322 
0323   /// Skip over pure text.
0324   const char *skipTextToken();
0325 
0326   /// Lex comment text, including commands if ParseCommands is set to true.
0327   void lexCommentText(Token &T);
0328 
0329   void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
0330                                 const CommandInfo *Info);
0331 
0332   void lexVerbatimBlockFirstLine(Token &T);
0333 
0334   void lexVerbatimBlockBody(Token &T);
0335 
0336   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
0337                                const CommandInfo *Info);
0338 
0339   void lexVerbatimLineText(Token &T);
0340 
0341   void lexHTMLCharacterReference(Token &T);
0342 
0343   void setupAndLexHTMLStartTag(Token &T);
0344 
0345   void lexHTMLStartTag(Token &T);
0346 
0347   void setupAndLexHTMLEndTag(Token &T);
0348 
0349   void lexHTMLEndTag(Token &T);
0350 
0351 public:
0352   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
0353         const CommandTraits &Traits, SourceLocation FileLoc,
0354         const char *BufferStart, const char *BufferEnd,
0355         bool ParseCommands = true);  // ParseCommands: see the member of the same name above.
0356 
0357   void lex(Token &T);  // NOTE(review): presumably lexes the next token into T -- confirm in the .cpp.
0358 
0359   StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
0360 };
0361 
0362 } // end namespace comments
0363 } // end namespace clang
0364 
0365 #endif
0366