Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:36:26

0001 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 ///
0009 /// \file
0010 /// This file defines classes for searching and analyzing source code clones.
0011 ///
0012 //===----------------------------------------------------------------------===//
0013 
0014 #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
0015 #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
0016 
0017 #include "clang/AST/StmtVisitor.h"
0018 #include "llvm/Support/Regex.h"
0019 #include <vector>
0020 
0021 namespace clang {
0022 
0023 class Stmt;
0024 class Decl;
0025 class VarDecl;
0026 class ASTContext;
0027 class CompoundStmt;
0028 
0029 /// Identifies a list of statements.
0030 ///
0031 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
0032 /// child statements inside a CompoundStmt or no statements at all.
0033 class StmtSequence {
0034   /// If this object identifies a sequence of statements inside a CompoundStmt,
0035   /// S points to this CompoundStmt. If this object only identifies a single
0036   /// Stmt, then S is a pointer to this Stmt.
0037   const Stmt *S;
0038 
0039   /// The declaration that contains the statements.
0040   const Decl *D;
0041 
0042   /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
0043   /// instance is representing the CompoundStmt children inside the array
0044   /// [StartIndex, EndIndex).
0045   unsigned StartIndex;
0046   unsigned EndIndex;
0047 
0048 public:
0049   /// Constructs a StmtSequence holding multiple statements.
0050   ///
0051   /// The resulting StmtSequence identifies a continuous sequence of statements
0052   /// in the body of the given CompoundStmt. Which statements of the body should
0053   /// be identified needs to be specified by providing a start and end index
0054   /// that describe a non-empty sub-array in the body of the given CompoundStmt.
0055   ///
0056   /// \param Stmt A CompoundStmt that contains all statements in its body.
0057   /// \param D The Decl containing this Stmt.
0058   /// \param StartIndex The inclusive start index in the children array of
0059   ///                   \p Stmt
0060   /// \param EndIndex The exclusive end index in the children array of \p Stmt.
0061   StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
0062                unsigned EndIndex);
0063 
0064   /// Constructs a StmtSequence holding a single statement.
0065   ///
0066   /// \param Stmt An arbitrary Stmt.
0067   /// \param D The Decl containing this Stmt.
0068   StmtSequence(const Stmt *Stmt, const Decl *D);
0069 
0070   /// Constructs an empty StmtSequence.
0071   StmtSequence();
0072 
0073   typedef const Stmt *const *iterator;
0074 
0075   /// Returns an iterator pointing to the first statement in this sequence.
0076   iterator begin() const;
0077 
0078   /// Returns an iterator pointing behind the last statement in this sequence.
0079   iterator end() const;
0080 
0081   /// Returns the first statement in this sequence.
0082   ///
0083   /// This method should only be called on a non-empty StmtSequence object.
0084   const Stmt *front() const {
0085     assert(!empty());
0086     return begin()[0];
0087   }
0088 
0089   /// Returns the last statement in this sequence.
0090   ///
0091   /// This method should only be called on a non-empty StmtSequence object.
0092   const Stmt *back() const {
0093     assert(!empty());
0094     return begin()[size() - 1];
0095   }
0096 
0097   /// Returns the number of statements this object holds.
0098   unsigned size() const {
0099     if (holdsSequence())
0100       return EndIndex - StartIndex;
0101     if (S == nullptr)
0102       return 0;
0103     return 1;
0104   }
0105 
0106   /// Returns true if and only if this StmtSequence contains no statements.
0107   bool empty() const { return size() == 0; }
0108 
0109   /// Returns the related ASTContext for the stored Stmts.
0110   ASTContext &getASTContext() const;
0111 
0112   /// Returns the declaration that contains the stored Stmts.
0113   const Decl *getContainingDecl() const {
0114     assert(D);
0115     return D;
0116   }
0117 
0118   /// Returns true if this objects holds a list of statements.
0119   bool holdsSequence() const { return EndIndex != 0; }
0120 
0121   /// Returns the start sourcelocation of the first statement in this sequence.
0122   ///
0123   /// This method should only be called on a non-empty StmtSequence object.
0124   SourceLocation getBeginLoc() const;
0125 
0126   /// Returns the end sourcelocation of the last statement in this sequence.
0127   ///
0128   /// This method should only be called on a non-empty StmtSequence object.
0129   SourceLocation getEndLoc() const;
0130 
0131   /// Returns the source range of the whole sequence - from the beginning
0132   /// of the first statement to the end of the last statement.
0133   SourceRange getSourceRange() const;
0134 
0135   bool operator==(const StmtSequence &Other) const {
0136     return std::tie(S, StartIndex, EndIndex) ==
0137            std::tie(Other.S, Other.StartIndex, Other.EndIndex);
0138   }
0139 
0140   bool operator!=(const StmtSequence &Other) const {
0141     return std::tie(S, StartIndex, EndIndex) !=
0142            std::tie(Other.S, Other.StartIndex, Other.EndIndex);
0143   }
0144 
0145   /// Returns true if and only if this sequence covers a source range that
0146   /// contains the source range of the given sequence \p Other.
0147   ///
0148   /// This method should only be called on a non-empty StmtSequence object
0149   /// and passed a non-empty StmtSequence object.
0150   bool contains(const StmtSequence &Other) const;
0151 };
0152 
0153 /// Searches for similar subtrees in the AST.
0154 ///
0155 /// First, this class needs several declarations with statement bodies which
0156 /// can be passed via analyzeCodeBody. Afterwards all statements can be
0157 /// searched for clones by calling findClones with a given list of constraints
0158 /// that should specify the wanted properties of the clones.
0159 ///
0160 /// The result of findClones can be further constrained with the constrainClones
0161 /// method.
0162 ///
0163 /// This class only searches for clones in executable source code
0164 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
0165 /// are not supported.
0166 class CloneDetector {
0167 
0168 public:
0169   /// A collection of StmtSequences that share an arbitrary property.
0170   typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
0171 
0172   /// Generates and stores search data for all statements in the body of
0173   /// the given Decl.
0174   void analyzeCodeBody(const Decl *D);
0175 
0176   /// Constrains the given list of clone groups with the given constraint.
0177   ///
0178   /// The constraint is expected to have a method with the signature
0179   ///     `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
0180   /// as this is the interface that the CloneDetector uses for applying the
0181   /// constraint. The constraint is supposed to directly modify the passed list
0182   /// so that all clones in the list fulfill the specific property this
0183   /// constraint ensures.
0184   template <typename T>
0185   static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
0186     C.constrain(CloneGroups);
0187   }
0188 
0189   /// Constrains the given list of clone groups with the given list of
0190   /// constraints.
0191   ///
0192   /// The constraints are applied in sequence in the order in which they are
0193   /// passed to this function.
0194   template <typename T1, typename... Ts>
0195   static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
0196                               Ts... ConstraintList) {
0197     constrainClones(CloneGroups, C);
0198     constrainClones(CloneGroups, ConstraintList...);
0199   }
0200 
0201   /// Searches for clones in all previously passed statements.
0202   /// \param Result Output parameter to which all created clone groups are
0203   ///               added.
0204   /// \param ConstraintList The constraints that should be applied to the
0205   //         result.
0206   template <typename... Ts>
0207   void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
0208     // The initial assumption is that there is only one clone group and every
0209     // statement is a clone of the others. This clone group will then be
0210     // split up with the help of the constraints.
0211     Result.push_back(Sequences);
0212 
0213     constrainClones(Result, ConstraintList...);
0214   }
0215 
0216 private:
0217   CloneGroup Sequences;
0218 };
0219 
0220 /// This class is a utility class that contains utility functions for building
0221 /// custom constraints.
0222 class CloneConstraint {
0223 public:
0224   /// Removes all groups by using a filter function.
0225   /// \param CloneGroups The list of CloneGroups that is supposed to be
0226   ///                    filtered.
0227   /// \param Filter The filter function that should return true for all groups
0228   ///               that should be removed from the list.
0229   static void filterGroups(
0230       std::vector<CloneDetector::CloneGroup> &CloneGroups,
0231       llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
0232     llvm::erase_if(CloneGroups, Filter);
0233   }
0234 
0235   /// Splits the given CloneGroups until the given Compare function returns true
0236   /// for all clones in a single group.
0237   /// \param CloneGroups A list of CloneGroups that should be modified.
0238   /// \param Compare The comparison function that all clones are supposed to
0239   ///                pass. Should return true if and only if two clones belong
0240   ///                to the same CloneGroup.
0241   static void splitCloneGroups(
0242       std::vector<CloneDetector::CloneGroup> &CloneGroups,
0243       llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
0244           Compare);
0245 };
0246 
0247 /// This constraint moves clones into clone groups of type II via hashing.
0248 ///
0249 /// Clones with different hash values are moved into separate clone groups.
0250 /// Collisions are possible, and this constraint does nothing to address this
0251 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
0252 /// constraint chain, not necessarily immediately, to eliminate hash collisions
0253 /// through a more detailed analysis.
0254 class RecursiveCloneTypeIIHashConstraint {
0255 public:
0256   void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
0257 };
0258 
0259 /// This constraint moves clones into clone groups of type II by comparing them.
0260 ///
0261 /// Clones that aren't type II clones are moved into separate clone groups.
0262 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
0263 /// group are guaranteed to be type II clones of each other, but it is too
0264 /// slow to efficiently handle large amounts of clones.
0265 class RecursiveCloneTypeIIVerifyConstraint {
0266 public:
0267   void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
0268 };
0269 
0270 /// Ensures that every clone has at least the given complexity.
0271 ///
0272 /// Complexity is here defined as the total amount of children of a statement.
0273 /// This constraint assumes the first statement in the group is representative
0274 /// for all other statements in the group in terms of complexity.
0275 class MinComplexityConstraint {
0276   unsigned MinComplexity;
0277 
0278 public:
0279   MinComplexityConstraint(unsigned MinComplexity)
0280       : MinComplexity(MinComplexity) {}
0281 
0282   /// Calculates the complexity of the given StmtSequence.
0283   /// \param Limit The limit of complexity we probe for. After reaching
0284   ///              this limit during calculation, this method is exiting
0285   ///              early to improve performance and returns this limit.
0286   size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
0287                                  const std::string &ParentMacroStack = "");
0288 
0289   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
0290     CloneConstraint::filterGroups(
0291         CloneGroups, [this](const CloneDetector::CloneGroup &A) {
0292           if (!A.empty())
0293             return calculateStmtComplexity(A.front(), MinComplexity) <
0294                    MinComplexity;
0295           else
0296             return false;
0297         });
0298   }
0299 };
0300 
0301 /// Ensures that all clone groups contain at least the given amount of clones.
0302 class MinGroupSizeConstraint {
0303   unsigned MinGroupSize;
0304 
0305 public:
0306   MinGroupSizeConstraint(unsigned MinGroupSize = 2)
0307       : MinGroupSize(MinGroupSize) {}
0308 
0309   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
0310     CloneConstraint::filterGroups(CloneGroups,
0311                                   [this](const CloneDetector::CloneGroup &A) {
0312                                     return A.size() < MinGroupSize;
0313                                   });
0314   }
0315 };
0316 
0317 /// Ensures that no clone group fully contains another clone group.
0318 struct OnlyLargestCloneConstraint {
0319   void constrain(std::vector<CloneDetector::CloneGroup> &Result);
0320 };
0321 
0322 struct FilenamePatternConstraint {
0323   StringRef IgnoredFilesPattern;
0324   std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
0325 
0326   FilenamePatternConstraint(StringRef IgnoredFilesPattern)
0327       : IgnoredFilesPattern(IgnoredFilesPattern) {
0328     IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
0329         IgnoredFilesPattern.str() + "$)");
0330   }
0331 
0332   bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
0333 
0334   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
0335     CloneConstraint::filterGroups(
0336         CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
0337           return isAutoGenerated(Group);
0338         });
0339   }
0340 };
0341 
0342 /// Analyzes the pattern of the referenced variables in a statement.
0343 class VariablePattern {
0344 
0345   /// Describes an occurrence of a variable reference in a statement.
0346   struct VariableOccurence {
0347     /// The index of the associated VarDecl in the Variables vector.
0348     size_t KindID;
0349     /// The statement in the code where the variable was referenced.
0350     const Stmt *Mention;
0351 
0352     VariableOccurence(size_t KindID, const Stmt *Mention)
0353         : KindID(KindID), Mention(Mention) {}
0354   };
0355 
0356   /// All occurrences of referenced variables in the order of appearance.
0357   std::vector<VariableOccurence> Occurences;
0358   /// List of referenced variables in the order of appearance.
0359   /// Every item in this list is unique.
0360   std::vector<const VarDecl *> Variables;
0361 
0362   /// Adds a new variable referenced to this pattern.
0363   /// \param VarDecl The declaration of the variable that is referenced.
0364   /// \param Mention The SourceRange where this variable is referenced.
0365   void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
0366 
0367   /// Adds each referenced variable from the given statement.
0368   void addVariables(const Stmt *S);
0369 
0370 public:
0371   /// Creates an VariablePattern object with information about the given
0372   /// StmtSequence.
0373   VariablePattern(const StmtSequence &Sequence) {
0374     for (const Stmt *S : Sequence)
0375       addVariables(S);
0376   }
0377 
0378   /// Describes two clones that reference their variables in a different pattern
0379   /// which could indicate a programming error.
0380   struct SuspiciousClonePair {
0381     /// Utility class holding the relevant information about a single
0382     /// clone in this pair.
0383     struct SuspiciousCloneInfo {
0384       /// The variable which referencing in this clone was against the pattern.
0385       const VarDecl *Variable;
0386       /// Where the variable was referenced.
0387       const Stmt *Mention;
0388       /// The variable that should have been referenced to follow the pattern.
0389       /// If Suggestion is a nullptr then it's not possible to fix the pattern
0390       /// by referencing a different variable in this clone.
0391       const VarDecl *Suggestion;
0392       SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
0393                           const VarDecl *Suggestion)
0394           : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
0395       SuspiciousCloneInfo() {}
0396     };
0397     /// The first clone in the pair which always has a suggested variable.
0398     SuspiciousCloneInfo FirstCloneInfo;
0399     /// This other clone in the pair which can have a suggested variable.
0400     SuspiciousCloneInfo SecondCloneInfo;
0401   };
0402 
0403   /// Counts the differences between this pattern and the given one.
0404   /// \param Other The given VariablePattern to compare with.
0405   /// \param FirstMismatch Output parameter that will be filled with information
0406   ///        about the first difference between the two patterns. This parameter
0407   ///        can be a nullptr, in which case it will be ignored.
0408   /// \return Returns the number of differences between the pattern this object
0409   ///         is following and the given VariablePattern.
0410   ///
0411   /// For example, the following statements all have the same pattern and this
0412   /// function would return zero:
0413   ///
0414   ///   if (a < b) return a; return b;
0415   ///   if (x < y) return x; return y;
0416   ///   if (u2 < u1) return u2; return u1;
0417   ///
0418   /// But the following statement has a different pattern (note the changed
0419   /// variables in the return statements) and would have two differences when
0420   /// compared with one of the statements above.
0421   ///
0422   ///   if (a < b) return b; return a;
0423   ///
0424   /// This function should only be called if the related statements of the given
0425   /// pattern and the statements of this objects are clones of each other.
0426   unsigned countPatternDifferences(
0427       const VariablePattern &Other,
0428       VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
0429 };
0430 
0431 /// Ensures that all clones reference variables in the same pattern.
0432 struct MatchingVariablePatternConstraint {
0433   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
0434 };
0435 
0436 } // end namespace clang
0437 
0438 #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H