Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:44:34

0001 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 // A data structure for fast substring queries.
0009 //
0010 // Suffix trees represent the suffixes of their input strings in their leaves.
0011 // A suffix tree is a type of compressed trie structure where each node
0012 // represents an entire substring rather than a single character. Each leaf
0013 // of the tree is a suffix.
0014 //
0015 // A suffix tree can be seen as a type of state machine where each state is a
0016 // substring of the full string. The tree is structured so that, for a string
0017 // of length N, there are exactly N leaves in the tree. This structure allows
0018 // us to quickly find repeated substrings of the input string.
0019 //
0020 // In this implementation, a "string" is a vector of unsigned integers.
0021 // These integers may result from hashing some data type. A suffix tree can
0022 // contain 1 or many strings, which can then be queried as one large string.
0023 //
0024 // The suffix tree is implemented using Ukkonen's algorithm for linear-time
0025 // suffix tree construction. Ukkonen's algorithm is explained in more detail
0026 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The
0027 // paper is available at
0028 //
0029 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
0030 //===----------------------------------------------------------------------===//
0031 
0032 #ifndef LLVM_SUPPORT_SUFFIXTREE_H
0033 #define LLVM_SUPPORT_SUFFIXTREE_H
0034 
0035 #include "llvm/ADT/ArrayRef.h"
0036 #include "llvm/Support/Allocator.h"
0037 #include "llvm/Support/SuffixTreeNode.h"
0038 
0039 namespace llvm {
0040 class SuffixTree {
0041 public:
0042   /// Each element is an integer representing an instruction in the module.
0043   ArrayRef<unsigned> Str;
0044 
0045   /// Whether to consider leaf descendants or only leaf children.
0046   bool OutlinerLeafDescendants;
0047 
0048   /// A repeated substring in the tree.
0049   struct RepeatedSubstring {
0050     /// The length of the string.
0051     unsigned Length;
0052 
0053     /// The start indices of each occurrence.
0054     SmallVector<unsigned> StartIndices;
0055   };
0056 
0057 private:
0058   /// Maintains internal nodes in the tree.
0059   SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
0060   /// Maintains leaf nodes in the tree.
0061   SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
0062 
0063   /// The root of the suffix tree.
0064   ///
0065   /// The root represents the empty string. It is maintained by the
0066   /// \p NodeAllocator like every other node in the tree.
0067   SuffixTreeInternalNode *Root = nullptr;
0068 
0069   /// The end index of each leaf in the tree.
0070   unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
0071 
0072   /// Helper struct which keeps track of the next insertion point in
0073   /// Ukkonen's algorithm.
0074   struct ActiveState {
0075     /// The next node to insert at.
0076     SuffixTreeInternalNode *Node = nullptr;
0077 
0078     /// The index of the first character in the substring currently being added.
0079     unsigned Idx = SuffixTreeNode::EmptyIdx;
0080 
0081     /// The length of the substring we have to add at the current step.
0082     unsigned Len = 0;
0083   };
0084 
0085   /// The point the next insertion will take place at in the
0086   /// construction algorithm.
0087   ActiveState Active;
0088 
0089   /// Allocate a leaf node and add it to the tree.
0090   ///
0091   /// \param Parent The parent of this node.
0092   /// \param StartIdx The start index of this node's associated string.
0093   /// \param Edge The label on the edge leaving \p Parent to this node.
0094   ///
0095   /// \returns A pointer to the allocated leaf node.
0096   SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
0097                              unsigned Edge);
0098 
0099   /// Allocate an internal node and add it to the tree.
0100   ///
0101   /// \param Parent The parent of this node. Only null when allocating the root.
0102   /// \param StartIdx The start index of this node's associated string.
0103   /// \param EndIdx The end index of this node's associated string.
0104   /// \param Edge The label on the edge leaving \p Parent to this node.
0105   ///
0106   /// \returns A pointer to the allocated internal node.
0107   SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
0108                                              unsigned StartIdx, unsigned EndIdx,
0109                                              unsigned Edge);
0110 
0111   /// Allocate the root node and add it to the tree.
0112   ///
0113   /// \returns A pointer to the root.
0114   SuffixTreeInternalNode *insertRoot();
0115 
0116   /// Set the suffix indices of the leaves to the start indices of their
0117   /// respective suffixes.
0118   void setSuffixIndices();
0119 
0120   /// Construct the suffix tree for the prefix of the input ending at
0121   /// \p EndIdx.
0122   ///
0123   /// Used to construct the full suffix tree iteratively. At the end of each
0124   /// step, the constructed suffix tree is either a valid suffix tree, or a
0125   /// suffix tree with implicit suffixes. At the end of the final step, the
0126   /// suffix tree is a valid tree.
0127   ///
0128   /// \param EndIdx The end index of the current prefix in the main string.
0129   /// \param SuffixesToAdd The number of suffixes that must be added
0130   /// to complete the suffix tree at the current phase.
0131   ///
0132   /// \returns The number of suffixes that have not been added at the end of
0133   /// this step.
0134   unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
0135 
0136   /// This vector contains all leaf nodes of this suffix tree. These leaf nodes
0137   /// are identified using post-order depth-first traversal, so that the order
0138   /// of these leaf nodes in the vector matches the order of the leaves in the
0139   /// tree from left to right if one were to draw the tree on paper.
0140   std::vector<SuffixTreeLeafNode *> LeafNodes;
0141 
0142   /// Perform a post-order depth-first traversal of the tree and perform two
0143   /// tasks during the traversal. The first is to populate LeafNodes, adding
0144   /// nodes in order of the traversal. The second is to keep track of the leaf
0145   /// descendants of every internal node by assigning values to LeftLeafIndex
0146   /// and RightLefIndex fields of SuffixTreeNode for all internal nodes.
0147   void setLeafNodes();
0148 
0149 public:
0150   /// Construct a suffix tree from a sequence of unsigned integers.
0151   ///
0152   /// \param Str The string to construct the suffix tree for.
0153   /// \param OutlinerLeafDescendants Whether to consider leaf descendants or
0154   /// only leaf children (used by Machine Outliner).
0155   SuffixTree(const ArrayRef<unsigned> &Str,
0156              bool OutlinerLeafDescendants = false);
0157 
0158   /// Iterator for finding all repeated substrings in the suffix tree.
0159   struct RepeatedSubstringIterator {
0160   private:
0161     /// The current node we're visiting.
0162     SuffixTreeNode *N = nullptr;
0163 
0164     /// The repeated substring associated with this node.
0165     RepeatedSubstring RS;
0166 
0167     /// The nodes left to visit.
0168     SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
0169 
0170     /// The minimum length of a repeated substring to find.
0171     /// Since we're outlining, we want at least two instructions in the range.
0172     /// FIXME: This may not be true for targets like X86 which support many
0173     /// instruction lengths.
0174     const unsigned MinLength = 2;
0175 
0176     /// Vector of leaf nodes of the suffix tree.
0177     const std::vector<SuffixTreeLeafNode *> &LeafNodes;
0178 
0179     /// Whether to consider leaf descendants or only leaf children.
0180     bool OutlinerLeafDescendants = !LeafNodes.empty();
0181 
0182     /// Move the iterator to the next repeated substring.
0183     void advance();
0184 
0185   public:
0186     /// Return the current repeated substring.
0187     RepeatedSubstring &operator*() { return RS; }
0188 
0189     RepeatedSubstringIterator &operator++() {
0190       advance();
0191       return *this;
0192     }
0193 
0194     RepeatedSubstringIterator operator++(int I) {
0195       RepeatedSubstringIterator It(*this);
0196       advance();
0197       return It;
0198     }
0199 
0200     bool operator==(const RepeatedSubstringIterator &Other) const {
0201       return N == Other.N;
0202     }
0203     bool operator!=(const RepeatedSubstringIterator &Other) const {
0204       return !(*this == Other);
0205     }
0206 
0207     RepeatedSubstringIterator(
0208         SuffixTreeInternalNode *N,
0209         const std::vector<SuffixTreeLeafNode *> &LeafNodes = {})
0210         : N(N), LeafNodes(LeafNodes) {
0211       // Do we have a non-null node?
0212       if (!N)
0213         return;
0214       // Yes. At the first step, we need to visit all of N's children.
0215       // Note: This means that we visit N last.
0216       InternalNodesToVisit.push_back(N);
0217       advance();
0218     }
0219   };
0220 
0221   typedef RepeatedSubstringIterator iterator;
0222   iterator begin() { return iterator(Root, LeafNodes); }
0223   iterator end() { return iterator(nullptr); }
0224 };
0225 
0226 } // namespace llvm
0227 
0228 #endif // LLVM_SUPPORT_SUFFIXTREE_H