|
|
|||
File indexing completed on 2026-05-10 08:36:26
0001 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// 0002 // 0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 0004 // See https://llvm.org/LICENSE.txt for license information. 0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 0006 // 0007 //===----------------------------------------------------------------------===// 0008 /// 0009 /// \file 0010 /// This file defines classes for searching and analyzing source code clones. 0011 /// 0012 //===----------------------------------------------------------------------===// 0013 0014 #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H 0015 #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H 0016 0017 #include "clang/AST/StmtVisitor.h" 0018 #include "llvm/Support/Regex.h" 0019 #include <vector> 0020 0021 namespace clang { 0022 0023 class Stmt; 0024 class Decl; 0025 class VarDecl; 0026 class ASTContext; 0027 class CompoundStmt; 0028 0029 /// Identifies a list of statements. 0030 /// 0031 /// Can either identify a single arbitrary Stmt object, a continuous sequence of 0032 /// child statements inside a CompoundStmt or no statements at all. 0033 class StmtSequence { 0034 /// If this object identifies a sequence of statements inside a CompoundStmt, 0035 /// S points to this CompoundStmt. If this object only identifies a single 0036 /// Stmt, then S is a pointer to this Stmt. 0037 const Stmt *S; 0038 0039 /// The declaration that contains the statements. 0040 const Decl *D; 0041 0042 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence 0043 /// instance is representing the CompoundStmt children inside the array 0044 /// [StartIndex, EndIndex). 0045 unsigned StartIndex; 0046 unsigned EndIndex; 0047 0048 public: 0049 /// Constructs a StmtSequence holding multiple statements. 0050 /// 0051 /// The resulting StmtSequence identifies a continuous sequence of statements 0052 /// in the body of the given CompoundStmt. Which statements of the body should 0053 /// be identified needs to be specified by providing a start and end index 0054 /// that describe a non-empty sub-array in the body of the given CompoundStmt. 0055 /// 0056 /// \param Stmt A CompoundStmt that contains all statements in its body. 0057 /// \param D The Decl containing this Stmt. 0058 /// \param StartIndex The inclusive start index in the children array of 0059 /// \p Stmt 0060 /// \param EndIndex The exclusive end index in the children array of \p Stmt. 0061 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, 0062 unsigned EndIndex); 0063 0064 /// Constructs a StmtSequence holding a single statement. 0065 /// 0066 /// \param Stmt An arbitrary Stmt. 0067 /// \param D The Decl containing this Stmt. 0068 StmtSequence(const Stmt *Stmt, const Decl *D); 0069 0070 /// Constructs an empty StmtSequence. 0071 StmtSequence(); 0072 0073 typedef const Stmt *const *iterator; 0074 0075 /// Returns an iterator pointing to the first statement in this sequence. 0076 iterator begin() const; 0077 0078 /// Returns an iterator pointing behind the last statement in this sequence. 0079 iterator end() const; 0080 0081 /// Returns the first statement in this sequence. 0082 /// 0083 /// This method should only be called on a non-empty StmtSequence object. 0084 const Stmt *front() const { 0085 assert(!empty()); 0086 return begin()[0]; 0087 } 0088 0089 /// Returns the last statement in this sequence. 0090 /// 0091 /// This method should only be called on a non-empty StmtSequence object. 0092 const Stmt *back() const { 0093 assert(!empty()); 0094 return begin()[size() - 1]; 0095 } 0096 0097 /// Returns the number of statements this object holds. 0098 unsigned size() const { 0099 if (holdsSequence()) 0100 return EndIndex - StartIndex; 0101 if (S == nullptr) 0102 return 0; 0103 return 1; 0104 } 0105 0106 /// Returns true if and only if this StmtSequence contains no statements. 0107 bool empty() const { return size() == 0; } 0108 0109 /// Returns the related ASTContext for the stored Stmts. 0110 ASTContext &getASTContext() const; 0111 0112 /// Returns the declaration that contains the stored Stmts. 0113 const Decl *getContainingDecl() const { 0114 assert(D); 0115 return D; 0116 } 0117 0118 /// Returns true if this objects holds a list of statements. 0119 bool holdsSequence() const { return EndIndex != 0; } 0120 0121 /// Returns the start sourcelocation of the first statement in this sequence. 0122 /// 0123 /// This method should only be called on a non-empty StmtSequence object. 0124 SourceLocation getBeginLoc() const; 0125 0126 /// Returns the end sourcelocation of the last statement in this sequence. 0127 /// 0128 /// This method should only be called on a non-empty StmtSequence object. 0129 SourceLocation getEndLoc() const; 0130 0131 /// Returns the source range of the whole sequence - from the beginning 0132 /// of the first statement to the end of the last statement. 0133 SourceRange getSourceRange() const; 0134 0135 bool operator==(const StmtSequence &Other) const { 0136 return std::tie(S, StartIndex, EndIndex) == 0137 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 0138 } 0139 0140 bool operator!=(const StmtSequence &Other) const { 0141 return std::tie(S, StartIndex, EndIndex) != 0142 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 0143 } 0144 0145 /// Returns true if and only if this sequence covers a source range that 0146 /// contains the source range of the given sequence \p Other. 0147 /// 0148 /// This method should only be called on a non-empty StmtSequence object 0149 /// and passed a non-empty StmtSequence object. 0150 bool contains(const StmtSequence &Other) const; 0151 }; 0152 0153 /// Searches for similar subtrees in the AST. 0154 /// 0155 /// First, this class needs several declarations with statement bodies which 0156 /// can be passed via analyzeCodeBody. Afterwards all statements can be 0157 /// searched for clones by calling findClones with a given list of constraints 0158 /// that should specify the wanted properties of the clones. 0159 /// 0160 /// The result of findClones can be further constrained with the constrainClones 0161 /// method. 0162 /// 0163 /// This class only searches for clones in executable source code 0164 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) 0165 /// are not supported. 0166 class CloneDetector { 0167 0168 public: 0169 /// A collection of StmtSequences that share an arbitrary property. 0170 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; 0171 0172 /// Generates and stores search data for all statements in the body of 0173 /// the given Decl. 0174 void analyzeCodeBody(const Decl *D); 0175 0176 /// Constrains the given list of clone groups with the given constraint. 0177 /// 0178 /// The constraint is expected to have a method with the signature 0179 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` 0180 /// as this is the interface that the CloneDetector uses for applying the 0181 /// constraint. The constraint is supposed to directly modify the passed list 0182 /// so that all clones in the list fulfill the specific property this 0183 /// constraint ensures. 0184 template <typename T> 0185 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { 0186 C.constrain(CloneGroups); 0187 } 0188 0189 /// Constrains the given list of clone groups with the given list of 0190 /// constraints. 0191 /// 0192 /// The constraints are applied in sequence in the order in which they are 0193 /// passed to this function. 0194 template <typename T1, typename... Ts> 0195 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, 0196 Ts... ConstraintList) { 0197 constrainClones(CloneGroups, C); 0198 constrainClones(CloneGroups, ConstraintList...); 0199 } 0200 0201 /// Searches for clones in all previously passed statements. 0202 /// \param Result Output parameter to which all created clone groups are 0203 /// added. 0204 /// \param ConstraintList The constraints that should be applied to the 0205 // result. 0206 template <typename... Ts> 0207 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { 0208 // The initial assumption is that there is only one clone group and every 0209 // statement is a clone of the others. This clone group will then be 0210 // split up with the help of the constraints. 0211 Result.push_back(Sequences); 0212 0213 constrainClones(Result, ConstraintList...); 0214 } 0215 0216 private: 0217 CloneGroup Sequences; 0218 }; 0219 0220 /// This class is a utility class that contains utility functions for building 0221 /// custom constraints. 0222 class CloneConstraint { 0223 public: 0224 /// Removes all groups by using a filter function. 0225 /// \param CloneGroups The list of CloneGroups that is supposed to be 0226 /// filtered. 0227 /// \param Filter The filter function that should return true for all groups 0228 /// that should be removed from the list. 0229 static void filterGroups( 0230 std::vector<CloneDetector::CloneGroup> &CloneGroups, 0231 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { 0232 llvm::erase_if(CloneGroups, Filter); 0233 } 0234 0235 /// Splits the given CloneGroups until the given Compare function returns true 0236 /// for all clones in a single group. 0237 /// \param CloneGroups A list of CloneGroups that should be modified. 0238 /// \param Compare The comparison function that all clones are supposed to 0239 /// pass. Should return true if and only if two clones belong 0240 /// to the same CloneGroup. 0241 static void splitCloneGroups( 0242 std::vector<CloneDetector::CloneGroup> &CloneGroups, 0243 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 0244 Compare); 0245 }; 0246 0247 /// This constraint moves clones into clone groups of type II via hashing. 0248 /// 0249 /// Clones with different hash values are moved into separate clone groups. 0250 /// Collisions are possible, and this constraint does nothing to address this 0251 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the 0252 /// constraint chain, not necessarily immediately, to eliminate hash collisions 0253 /// through a more detailed analysis. 0254 class RecursiveCloneTypeIIHashConstraint { 0255 public: 0256 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 0257 }; 0258 0259 /// This constraint moves clones into clone groups of type II by comparing them. 0260 /// 0261 /// Clones that aren't type II clones are moved into separate clone groups. 0262 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone 0263 /// group are guaranteed to be type II clones of each other, but it is too 0264 /// slow to efficiently handle large amounts of clones. 0265 class RecursiveCloneTypeIIVerifyConstraint { 0266 public: 0267 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 0268 }; 0269 0270 /// Ensures that every clone has at least the given complexity. 0271 /// 0272 /// Complexity is here defined as the total amount of children of a statement. 0273 /// This constraint assumes the first statement in the group is representative 0274 /// for all other statements in the group in terms of complexity. 0275 class MinComplexityConstraint { 0276 unsigned MinComplexity; 0277 0278 public: 0279 MinComplexityConstraint(unsigned MinComplexity) 0280 : MinComplexity(MinComplexity) {} 0281 0282 /// Calculates the complexity of the given StmtSequence. 0283 /// \param Limit The limit of complexity we probe for. After reaching 0284 /// this limit during calculation, this method is exiting 0285 /// early to improve performance and returns this limit. 0286 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, 0287 const std::string &ParentMacroStack = ""); 0288 0289 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 0290 CloneConstraint::filterGroups( 0291 CloneGroups, [this](const CloneDetector::CloneGroup &A) { 0292 if (!A.empty()) 0293 return calculateStmtComplexity(A.front(), MinComplexity) < 0294 MinComplexity; 0295 else 0296 return false; 0297 }); 0298 } 0299 }; 0300 0301 /// Ensures that all clone groups contain at least the given amount of clones. 0302 class MinGroupSizeConstraint { 0303 unsigned MinGroupSize; 0304 0305 public: 0306 MinGroupSizeConstraint(unsigned MinGroupSize = 2) 0307 : MinGroupSize(MinGroupSize) {} 0308 0309 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 0310 CloneConstraint::filterGroups(CloneGroups, 0311 [this](const CloneDetector::CloneGroup &A) { 0312 return A.size() < MinGroupSize; 0313 }); 0314 } 0315 }; 0316 0317 /// Ensures that no clone group fully contains another clone group. 0318 struct OnlyLargestCloneConstraint { 0319 void constrain(std::vector<CloneDetector::CloneGroup> &Result); 0320 }; 0321 0322 struct FilenamePatternConstraint { 0323 StringRef IgnoredFilesPattern; 0324 std::shared_ptr<llvm::Regex> IgnoredFilesRegex; 0325 0326 FilenamePatternConstraint(StringRef IgnoredFilesPattern) 0327 : IgnoredFilesPattern(IgnoredFilesPattern) { 0328 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + 0329 IgnoredFilesPattern.str() + "$)"); 0330 } 0331 0332 bool isAutoGenerated(const CloneDetector::CloneGroup &Group); 0333 0334 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 0335 CloneConstraint::filterGroups( 0336 CloneGroups, [this](const CloneDetector::CloneGroup &Group) { 0337 return isAutoGenerated(Group); 0338 }); 0339 } 0340 }; 0341 0342 /// Analyzes the pattern of the referenced variables in a statement. 0343 class VariablePattern { 0344 0345 /// Describes an occurrence of a variable reference in a statement. 0346 struct VariableOccurence { 0347 /// The index of the associated VarDecl in the Variables vector. 0348 size_t KindID; 0349 /// The statement in the code where the variable was referenced. 0350 const Stmt *Mention; 0351 0352 VariableOccurence(size_t KindID, const Stmt *Mention) 0353 : KindID(KindID), Mention(Mention) {} 0354 }; 0355 0356 /// All occurrences of referenced variables in the order of appearance. 0357 std::vector<VariableOccurence> Occurences; 0358 /// List of referenced variables in the order of appearance. 0359 /// Every item in this list is unique. 0360 std::vector<const VarDecl *> Variables; 0361 0362 /// Adds a new variable referenced to this pattern. 0363 /// \param VarDecl The declaration of the variable that is referenced. 0364 /// \param Mention The SourceRange where this variable is referenced. 0365 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); 0366 0367 /// Adds each referenced variable from the given statement. 0368 void addVariables(const Stmt *S); 0369 0370 public: 0371 /// Creates an VariablePattern object with information about the given 0372 /// StmtSequence. 0373 VariablePattern(const StmtSequence &Sequence) { 0374 for (const Stmt *S : Sequence) 0375 addVariables(S); 0376 } 0377 0378 /// Describes two clones that reference their variables in a different pattern 0379 /// which could indicate a programming error. 0380 struct SuspiciousClonePair { 0381 /// Utility class holding the relevant information about a single 0382 /// clone in this pair. 0383 struct SuspiciousCloneInfo { 0384 /// The variable which referencing in this clone was against the pattern. 0385 const VarDecl *Variable; 0386 /// Where the variable was referenced. 0387 const Stmt *Mention; 0388 /// The variable that should have been referenced to follow the pattern. 0389 /// If Suggestion is a nullptr then it's not possible to fix the pattern 0390 /// by referencing a different variable in this clone. 0391 const VarDecl *Suggestion; 0392 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, 0393 const VarDecl *Suggestion) 0394 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} 0395 SuspiciousCloneInfo() {} 0396 }; 0397 /// The first clone in the pair which always has a suggested variable. 0398 SuspiciousCloneInfo FirstCloneInfo; 0399 /// This other clone in the pair which can have a suggested variable. 0400 SuspiciousCloneInfo SecondCloneInfo; 0401 }; 0402 0403 /// Counts the differences between this pattern and the given one. 0404 /// \param Other The given VariablePattern to compare with. 0405 /// \param FirstMismatch Output parameter that will be filled with information 0406 /// about the first difference between the two patterns. This parameter 0407 /// can be a nullptr, in which case it will be ignored. 0408 /// \return Returns the number of differences between the pattern this object 0409 /// is following and the given VariablePattern. 0410 /// 0411 /// For example, the following statements all have the same pattern and this 0412 /// function would return zero: 0413 /// 0414 /// if (a < b) return a; return b; 0415 /// if (x < y) return x; return y; 0416 /// if (u2 < u1) return u2; return u1; 0417 /// 0418 /// But the following statement has a different pattern (note the changed 0419 /// variables in the return statements) and would have two differences when 0420 /// compared with one of the statements above. 0421 /// 0422 /// if (a < b) return b; return a; 0423 /// 0424 /// This function should only be called if the related statements of the given 0425 /// pattern and the statements of this objects are clones of each other. 0426 unsigned countPatternDifferences( 0427 const VariablePattern &Other, 0428 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); 0429 }; 0430 0431 /// Ensures that all clones reference variables in the same pattern. 0432 struct MatchingVariablePatternConstraint { 0433 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); 0434 }; 0435 0436 } // end namespace clang 0437 0438 #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|