|
|
|||
File indexing completed on 2026-05-10 08:44:34
0001 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// 0002 // 0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 0004 // See https://llvm.org/LICENSE.txt for license information. 0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 0006 // 0007 //===----------------------------------------------------------------------===// 0008 // A data structure for fast substring queries. 0009 // 0010 // Suffix trees represent the suffixes of their input strings in their leaves. 0011 // A suffix tree is a type of compressed trie structure where each node 0012 // represents an entire substring rather than a single character. Each leaf 0013 // of the tree is a suffix. 0014 // 0015 // A suffix tree can be seen as a type of state machine where each state is a 0016 // substring of the full string. The tree is structured so that, for a string 0017 // of length N, there are exactly N leaves in the tree. This structure allows 0018 // us to quickly find repeated substrings of the input string. 0019 // 0020 // In this implementation, a "string" is a vector of unsigned integers. 0021 // These integers may result from hashing some data type. A suffix tree can 0022 // contain 1 or many strings, which can then be queried as one large string. 0023 // 0024 // The suffix tree is implemented using Ukkonen's algorithm for linear-time 0025 // suffix tree construction. Ukkonen's algorithm is explained in more detail 0026 // in the paper by Esko Ukkonen "On-line construction of suffix trees. The 0027 // paper is available at 0028 // 0029 // https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf 0030 //===----------------------------------------------------------------------===// 0031 0032 #ifndef LLVM_SUPPORT_SUFFIXTREE_H 0033 #define LLVM_SUPPORT_SUFFIXTREE_H 0034 0035 #include "llvm/ADT/ArrayRef.h" 0036 #include "llvm/Support/Allocator.h" 0037 #include "llvm/Support/SuffixTreeNode.h" 0038 0039 namespace llvm { 0040 class SuffixTree { 0041 public: 0042 /// Each element is an integer representing an instruction in the module. 0043 ArrayRef<unsigned> Str; 0044 0045 /// Whether to consider leaf descendants or only leaf children. 0046 bool OutlinerLeafDescendants; 0047 0048 /// A repeated substring in the tree. 0049 struct RepeatedSubstring { 0050 /// The length of the string. 0051 unsigned Length; 0052 0053 /// The start indices of each occurrence. 0054 SmallVector<unsigned> StartIndices; 0055 }; 0056 0057 private: 0058 /// Maintains internal nodes in the tree. 0059 SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator; 0060 /// Maintains leaf nodes in the tree. 0061 SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator; 0062 0063 /// The root of the suffix tree. 0064 /// 0065 /// The root represents the empty string. It is maintained by the 0066 /// \p NodeAllocator like every other node in the tree. 0067 SuffixTreeInternalNode *Root = nullptr; 0068 0069 /// The end index of each leaf in the tree. 0070 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx; 0071 0072 /// Helper struct which keeps track of the next insertion point in 0073 /// Ukkonen's algorithm. 0074 struct ActiveState { 0075 /// The next node to insert at. 0076 SuffixTreeInternalNode *Node = nullptr; 0077 0078 /// The index of the first character in the substring currently being added. 0079 unsigned Idx = SuffixTreeNode::EmptyIdx; 0080 0081 /// The length of the substring we have to add at the current step. 0082 unsigned Len = 0; 0083 }; 0084 0085 /// The point the next insertion will take place at in the 0086 /// construction algorithm. 0087 ActiveState Active; 0088 0089 /// Allocate a leaf node and add it to the tree. 0090 /// 0091 /// \param Parent The parent of this node. 0092 /// \param StartIdx The start index of this node's associated string. 0093 /// \param Edge The label on the edge leaving \p Parent to this node. 0094 /// 0095 /// \returns A pointer to the allocated leaf node. 0096 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx, 0097 unsigned Edge); 0098 0099 /// Allocate an internal node and add it to the tree. 0100 /// 0101 /// \param Parent The parent of this node. Only null when allocating the root. 0102 /// \param StartIdx The start index of this node's associated string. 0103 /// \param EndIdx The end index of this node's associated string. 0104 /// \param Edge The label on the edge leaving \p Parent to this node. 0105 /// 0106 /// \returns A pointer to the allocated internal node. 0107 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent, 0108 unsigned StartIdx, unsigned EndIdx, 0109 unsigned Edge); 0110 0111 /// Allocate the root node and add it to the tree. 0112 /// 0113 /// \returns A pointer to the root. 0114 SuffixTreeInternalNode *insertRoot(); 0115 0116 /// Set the suffix indices of the leaves to the start indices of their 0117 /// respective suffixes. 0118 void setSuffixIndices(); 0119 0120 /// Construct the suffix tree for the prefix of the input ending at 0121 /// \p EndIdx. 0122 /// 0123 /// Used to construct the full suffix tree iteratively. At the end of each 0124 /// step, the constructed suffix tree is either a valid suffix tree, or a 0125 /// suffix tree with implicit suffixes. At the end of the final step, the 0126 /// suffix tree is a valid tree. 0127 /// 0128 /// \param EndIdx The end index of the current prefix in the main string. 0129 /// \param SuffixesToAdd The number of suffixes that must be added 0130 /// to complete the suffix tree at the current phase. 0131 /// 0132 /// \returns The number of suffixes that have not been added at the end of 0133 /// this step. 0134 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); 0135 0136 /// This vector contains all leaf nodes of this suffix tree. These leaf nodes 0137 /// are identified using post-order depth-first traversal, so that the order 0138 /// of these leaf nodes in the vector matches the order of the leaves in the 0139 /// tree from left to right if one were to draw the tree on paper. 0140 std::vector<SuffixTreeLeafNode *> LeafNodes; 0141 0142 /// Perform a post-order depth-first traversal of the tree and perform two 0143 /// tasks during the traversal. The first is to populate LeafNodes, adding 0144 /// nodes in order of the traversal. The second is to keep track of the leaf 0145 /// descendants of every internal node by assigning values to LeftLeafIndex 0146 /// and RightLefIndex fields of SuffixTreeNode for all internal nodes. 0147 void setLeafNodes(); 0148 0149 public: 0150 /// Construct a suffix tree from a sequence of unsigned integers. 0151 /// 0152 /// \param Str The string to construct the suffix tree for. 0153 /// \param OutlinerLeafDescendants Whether to consider leaf descendants or 0154 /// only leaf children (used by Machine Outliner). 0155 SuffixTree(const ArrayRef<unsigned> &Str, 0156 bool OutlinerLeafDescendants = false); 0157 0158 /// Iterator for finding all repeated substrings in the suffix tree. 0159 struct RepeatedSubstringIterator { 0160 private: 0161 /// The current node we're visiting. 0162 SuffixTreeNode *N = nullptr; 0163 0164 /// The repeated substring associated with this node. 0165 RepeatedSubstring RS; 0166 0167 /// The nodes left to visit. 0168 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit; 0169 0170 /// The minimum length of a repeated substring to find. 0171 /// Since we're outlining, we want at least two instructions in the range. 0172 /// FIXME: This may not be true for targets like X86 which support many 0173 /// instruction lengths. 0174 const unsigned MinLength = 2; 0175 0176 /// Vector of leaf nodes of the suffix tree. 0177 const std::vector<SuffixTreeLeafNode *> &LeafNodes; 0178 0179 /// Whether to consider leaf descendants or only leaf children. 0180 bool OutlinerLeafDescendants = !LeafNodes.empty(); 0181 0182 /// Move the iterator to the next repeated substring. 0183 void advance(); 0184 0185 public: 0186 /// Return the current repeated substring. 0187 RepeatedSubstring &operator*() { return RS; } 0188 0189 RepeatedSubstringIterator &operator++() { 0190 advance(); 0191 return *this; 0192 } 0193 0194 RepeatedSubstringIterator operator++(int I) { 0195 RepeatedSubstringIterator It(*this); 0196 advance(); 0197 return It; 0198 } 0199 0200 bool operator==(const RepeatedSubstringIterator &Other) const { 0201 return N == Other.N; 0202 } 0203 bool operator!=(const RepeatedSubstringIterator &Other) const { 0204 return !(*this == Other); 0205 } 0206 0207 RepeatedSubstringIterator( 0208 SuffixTreeInternalNode *N, 0209 const std::vector<SuffixTreeLeafNode *> &LeafNodes = {}) 0210 : N(N), LeafNodes(LeafNodes) { 0211 // Do we have a non-null node? 0212 if (!N) 0213 return; 0214 // Yes. At the first step, we need to visit all of N's children. 0215 // Note: This means that we visit N last. 0216 InternalNodesToVisit.push_back(N); 0217 advance(); 0218 } 0219 }; 0220 0221 typedef RepeatedSubstringIterator iterator; 0222 iterator begin() { return iterator(Root, LeafNodes); } 0223 iterator end() { return iterator(nullptr); } 0224 }; 0225 0226 } // namespace llvm 0227 0228 #endif // LLVM_SUPPORT_SUFFIXTREE_H
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|