|
|
|||
File indexing completed on 2026-05-10 08:44:34
0001 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===// 0002 // 0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 0004 // See https://llvm.org/LICENSE.txt for license information. 0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 0006 // 0007 //===----------------------------------------------------------------------===// 0008 // 0009 // This file defines nodes for use within a SuffixTree. 0010 // 0011 // Each node has either no children or at least two children, with the root 0012 // being a exception in the empty tree. 0013 // 0014 // Children are represented as a map between unsigned integers and nodes. If 0015 // a node N has a child M on unsigned integer k, then the mapping represented 0016 // by N is a proper prefix of the mapping represented by M. Note that this, 0017 // although similar to a trie is somewhat different: each node stores a full 0018 // substring of the full mapping rather than a single character state. 0019 // 0020 // Each internal node contains a pointer to the internal node representing 0021 // the same string, but with the first character chopped off. This is stored 0022 // in \p Link. Each leaf node stores the start index of its respective 0023 // suffix in \p SuffixIdx. 0024 //===----------------------------------------------------------------------===// 0025 0026 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H 0027 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H 0028 #include "llvm/ADT/DenseMap.h" 0029 0030 namespace llvm { 0031 0032 /// A node in a suffix tree which represents a substring or suffix. 0033 struct SuffixTreeNode { 0034 public: 0035 /// Represents an undefined index in the suffix tree. 0036 static const unsigned EmptyIdx = -1; 0037 enum class NodeKind { ST_Leaf, ST_Internal }; 0038 0039 private: 0040 const NodeKind Kind; 0041 0042 /// The start index of this node's substring in the main string. 0043 unsigned StartIdx = EmptyIdx; 0044 0045 /// The length of the string formed by concatenating the edge labels from 0046 /// the root to this node. 0047 unsigned ConcatLen = 0; 0048 0049 /// These two indices give a range of indices for its leaf descendants. 0050 /// Imagine drawing a tree on paper and assigning a unique index to each leaf 0051 /// node in monotonically increasing order from left to right. This way of 0052 /// numbering the leaf nodes allows us to associate a continuous range of 0053 /// indices with each internal node. For example, if a node has leaf 0054 /// descendants with indices i, i+1, ..., j, then its LeftLeafIdx is i and 0055 /// its RightLeafIdx is j. These indices are for LeafNodes in the SuffixTree 0056 /// class, which is constructed using post-order depth-first traversal. 0057 unsigned LeftLeafIdx = EmptyIdx; 0058 unsigned RightLeafIdx = EmptyIdx; 0059 0060 public: 0061 // LLVM RTTI boilerplate. 0062 NodeKind getKind() const { return Kind; } 0063 0064 /// \return the start index of this node's substring in the entire string. 0065 unsigned getStartIdx() const; 0066 0067 /// \returns the end index of this node. 0068 virtual unsigned getEndIdx() const = 0; 0069 0070 /// \return the index of this node's left most leaf node. 0071 unsigned getLeftLeafIdx() const; 0072 0073 /// \return the index of this node's right most leaf node. 0074 unsigned getRightLeafIdx() const; 0075 0076 /// Set the index of the left most leaf node of this node to \p Idx. 0077 void setLeftLeafIdx(unsigned Idx); 0078 0079 /// Set the index of the right most leaf node of this node to \p Idx. 0080 void setRightLeafIdx(unsigned Idx); 0081 0082 /// Advance this node's StartIdx by \p Inc. 0083 void incrementStartIdx(unsigned Inc); 0084 0085 /// Set the length of the string from the root to this node to \p Len. 0086 void setConcatLen(unsigned Len); 0087 0088 /// \returns the length of the string from the root to this node. 0089 unsigned getConcatLen() const; 0090 0091 SuffixTreeNode(NodeKind Kind, unsigned StartIdx) 0092 : Kind(Kind), StartIdx(StartIdx) {} 0093 virtual ~SuffixTreeNode() = default; 0094 }; 0095 0096 // A node with two or more children, or the root. 0097 struct SuffixTreeInternalNode : SuffixTreeNode { 0098 private: 0099 /// The end index of this node's substring in the main string. 0100 /// 0101 /// Every leaf node must have its \p EndIdx incremented at the end of every 0102 /// step in the construction algorithm. To avoid having to update O(N) 0103 /// nodes individually at the end of every step, the end index is stored 0104 /// as a pointer. 0105 unsigned EndIdx = EmptyIdx; 0106 0107 /// A pointer to the internal node representing the same sequence with the 0108 /// first character chopped off. 0109 /// 0110 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that 0111 /// Ukkonen's algorithm does to achieve linear-time construction is 0112 /// keep track of which node the next insert should be at. This makes each 0113 /// insert O(1), and there are a total of O(N) inserts. The suffix link 0114 /// helps with inserting children of internal nodes. 0115 /// 0116 /// Say we add a child to an internal node with associated mapping S. The 0117 /// next insertion must be at the node representing S - its first character. 0118 /// This is given by the way that we iteratively build the tree in Ukkonen's 0119 /// algorithm. The main idea is to look at the suffixes of each prefix in the 0120 /// string, starting with the longest suffix of the prefix, and ending with 0121 /// the shortest. Therefore, if we keep pointers between such nodes, we can 0122 /// move to the next insertion point in O(1) time. If we don't, then we'd 0123 /// have to query from the root, which takes O(N) time. This would make the 0124 /// construction algorithm O(N^2) rather than O(N). 0125 SuffixTreeInternalNode *Link = nullptr; 0126 0127 public: 0128 // LLVM RTTI boilerplate. 0129 static bool classof(const SuffixTreeNode *N) { 0130 return N->getKind() == NodeKind::ST_Internal; 0131 } 0132 0133 /// \returns true if this node is the root of its owning \p SuffixTree. 0134 bool isRoot() const; 0135 0136 /// \returns the end index of this node's substring in the entire string. 0137 unsigned getEndIdx() const override; 0138 0139 /// Sets \p Link to \p L. Assumes \p L is not null. 0140 void setLink(SuffixTreeInternalNode *L); 0141 0142 /// \returns the pointer to the Link node. 0143 SuffixTreeInternalNode *getLink() const; 0144 0145 /// The children of this node. 0146 /// 0147 /// A child existing on an unsigned integer implies that from the mapping 0148 /// represented by the current node, there is a way to reach another 0149 /// mapping by tacking that character on the end of the current string. 0150 DenseMap<unsigned, SuffixTreeNode *> Children; 0151 0152 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, 0153 SuffixTreeInternalNode *Link) 0154 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx), 0155 Link(Link) {} 0156 0157 virtual ~SuffixTreeInternalNode() = default; 0158 }; 0159 0160 // A node representing a suffix. 0161 struct SuffixTreeLeafNode : SuffixTreeNode { 0162 private: 0163 /// The start index of the suffix represented by this leaf. 0164 unsigned SuffixIdx = EmptyIdx; 0165 0166 /// The end index of this node's substring in the main string. 0167 /// 0168 /// Every leaf node must have its \p EndIdx incremented at the end of every 0169 /// step in the construction algorithm. To avoid having to update O(N) 0170 /// nodes individually at the end of every step, the end index is stored 0171 /// as a pointer. 0172 unsigned *EndIdx = nullptr; 0173 0174 public: 0175 // LLVM RTTI boilerplate. 0176 static bool classof(const SuffixTreeNode *N) { 0177 return N->getKind() == NodeKind::ST_Leaf; 0178 } 0179 0180 /// \returns the end index of this node's substring in the entire string. 0181 unsigned getEndIdx() const override; 0182 0183 /// \returns the start index of the suffix represented by this leaf. 0184 unsigned getSuffixIdx() const; 0185 0186 /// Sets the start index of the suffix represented by this leaf to \p Idx. 0187 void setSuffixIdx(unsigned Idx); 0188 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx) 0189 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {} 0190 0191 virtual ~SuffixTreeLeafNode() = default; 0192 }; 0193 } // namespace llvm 0194 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|