Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-10 08:44:34

0001 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
0002 //
0003 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
0004 // See https://llvm.org/LICENSE.txt for license information.
0005 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0006 //
0007 //===----------------------------------------------------------------------===//
0008 //
0009 // This file defines nodes for use within a SuffixTree.
0010 //
0011 // Each node has either no children or at least two children, with the root
0012 // being a exception in the empty tree.
0013 //
0014 // Children are represented as a map between unsigned integers and nodes. If
0015 // a node N has a child M on unsigned integer k, then the mapping represented
0016 // by N is a proper prefix of the mapping represented by M. Note that this,
0017 // although similar to a trie is somewhat different: each node stores a full
0018 // substring of the full mapping rather than a single character state.
0019 //
0020 // Each internal node contains a pointer to the internal node representing
0021 // the same string, but with the first character chopped off. This is stored
0022 // in \p Link. Each leaf node stores the start index of its respective
0023 // suffix in \p SuffixIdx.
0024 //===----------------------------------------------------------------------===//
0025 
0026 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
0027 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H
0028 #include "llvm/ADT/DenseMap.h"
0029 
0030 namespace llvm {
0031 
0032 /// A node in a suffix tree which represents a substring or suffix.
0033 struct SuffixTreeNode {
0034 public:
0035   /// Represents an undefined index in the suffix tree.
0036   static const unsigned EmptyIdx = -1;
0037   enum class NodeKind { ST_Leaf, ST_Internal };
0038 
0039 private:
0040   const NodeKind Kind;
0041 
0042   /// The start index of this node's substring in the main string.
0043   unsigned StartIdx = EmptyIdx;
0044 
0045   /// The length of the string formed by concatenating the edge labels from
0046   /// the root to this node.
0047   unsigned ConcatLen = 0;
0048 
0049   /// These two indices give a range of indices for its leaf descendants.
0050   /// Imagine drawing a tree on paper and assigning a unique index to each leaf
0051   /// node in monotonically increasing order from left to right. This way of
0052   /// numbering the leaf nodes allows us to associate a continuous range of
0053   /// indices with each internal node. For example, if a node has leaf
0054   /// descendants with indices i, i+1, ..., j, then its LeftLeafIdx is i and
0055   /// its RightLeafIdx is j. These indices are for LeafNodes in the SuffixTree
0056   /// class, which is constructed using post-order depth-first traversal.
0057   unsigned LeftLeafIdx = EmptyIdx;
0058   unsigned RightLeafIdx = EmptyIdx;
0059 
0060 public:
0061   // LLVM RTTI boilerplate.
0062   NodeKind getKind() const { return Kind; }
0063 
0064   /// \return the start index of this node's substring in the entire string.
0065   unsigned getStartIdx() const;
0066 
0067   /// \returns the end index of this node.
0068   virtual unsigned getEndIdx() const = 0;
0069 
0070   /// \return the index of this node's left most leaf node.
0071   unsigned getLeftLeafIdx() const;
0072 
0073   /// \return the index of this node's right most leaf node.
0074   unsigned getRightLeafIdx() const;
0075 
0076   /// Set the index of the left most leaf node of this node to \p Idx.
0077   void setLeftLeafIdx(unsigned Idx);
0078 
0079   /// Set the index of the right most leaf node of this node to \p Idx.
0080   void setRightLeafIdx(unsigned Idx);
0081 
0082   /// Advance this node's StartIdx by \p Inc.
0083   void incrementStartIdx(unsigned Inc);
0084 
0085   /// Set the length of the string from the root to this node to \p Len.
0086   void setConcatLen(unsigned Len);
0087 
0088   /// \returns the length of the string from the root to this node.
0089   unsigned getConcatLen() const;
0090 
0091   SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
0092       : Kind(Kind), StartIdx(StartIdx) {}
0093   virtual ~SuffixTreeNode() = default;
0094 };
0095 
0096 // A node with two or more children, or the root.
0097 struct SuffixTreeInternalNode : SuffixTreeNode {
0098 private:
0099   /// The end index of this node's substring in the main string.
0100   ///
0101   /// Every leaf node must have its \p EndIdx incremented at the end of every
0102   /// step in the construction algorithm. To avoid having to update O(N)
0103   /// nodes individually at the end of every step, the end index is stored
0104   /// as a pointer.
0105   unsigned EndIdx = EmptyIdx;
0106 
0107   /// A pointer to the internal node representing the same sequence with the
0108   /// first character chopped off.
0109   ///
0110   /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
0111   /// Ukkonen's algorithm does to achieve linear-time construction is
0112   /// keep track of which node the next insert should be at. This makes each
0113   /// insert O(1), and there are a total of O(N) inserts. The suffix link
0114   /// helps with inserting children of internal nodes.
0115   ///
0116   /// Say we add a child to an internal node with associated mapping S. The
0117   /// next insertion must be at the node representing S - its first character.
0118   /// This is given by the way that we iteratively build the tree in Ukkonen's
0119   /// algorithm. The main idea is to look at the suffixes of each prefix in the
0120   /// string, starting with the longest suffix of the prefix, and ending with
0121   /// the shortest. Therefore, if we keep pointers between such nodes, we can
0122   /// move to the next insertion point in O(1) time. If we don't, then we'd
0123   /// have to query from the root, which takes O(N) time. This would make the
0124   /// construction algorithm O(N^2) rather than O(N).
0125   SuffixTreeInternalNode *Link = nullptr;
0126 
0127 public:
0128   // LLVM RTTI boilerplate.
0129   static bool classof(const SuffixTreeNode *N) {
0130     return N->getKind() == NodeKind::ST_Internal;
0131   }
0132 
0133   /// \returns true if this node is the root of its owning \p SuffixTree.
0134   bool isRoot() const;
0135 
0136   /// \returns the end index of this node's substring in the entire string.
0137   unsigned getEndIdx() const override;
0138 
0139   /// Sets \p Link to \p L. Assumes \p L is not null.
0140   void setLink(SuffixTreeInternalNode *L);
0141 
0142   /// \returns the pointer to the Link node.
0143   SuffixTreeInternalNode *getLink() const;
0144 
0145   /// The children of this node.
0146   ///
0147   /// A child existing on an unsigned integer implies that from the mapping
0148   /// represented by the current node, there is a way to reach another
0149   /// mapping by tacking that character on the end of the current string.
0150   DenseMap<unsigned, SuffixTreeNode *> Children;
0151 
0152   SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
0153                          SuffixTreeInternalNode *Link)
0154       : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
0155         Link(Link) {}
0156 
0157   virtual ~SuffixTreeInternalNode() = default;
0158 };
0159 
0160 // A node representing a suffix.
0161 struct SuffixTreeLeafNode : SuffixTreeNode {
0162 private:
0163   /// The start index of the suffix represented by this leaf.
0164   unsigned SuffixIdx = EmptyIdx;
0165 
0166   /// The end index of this node's substring in the main string.
0167   ///
0168   /// Every leaf node must have its \p EndIdx incremented at the end of every
0169   /// step in the construction algorithm. To avoid having to update O(N)
0170   /// nodes individually at the end of every step, the end index is stored
0171   /// as a pointer.
0172   unsigned *EndIdx = nullptr;
0173 
0174 public:
0175   // LLVM RTTI boilerplate.
0176   static bool classof(const SuffixTreeNode *N) {
0177     return N->getKind() == NodeKind::ST_Leaf;
0178   }
0179 
0180   /// \returns the end index of this node's substring in the entire string.
0181   unsigned getEndIdx() const override;
0182 
0183   /// \returns the start index of the suffix represented by this leaf.
0184   unsigned getSuffixIdx() const;
0185 
0186   /// Sets the start index of the suffix represented by this leaf to \p Idx.
0187   void setSuffixIdx(unsigned Idx);
0188   SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
0189       : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
0190 
0191   virtual ~SuffixTreeLeafNode() = default;
0192 };
0193 } // namespace llvm
0194 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H