File indexing completed on 2025-08-28 08:27:10
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <cassert>
0021 #include <cstdint>
0022 #include <cstring>
0023 #include <iosfwd>
0024 #include <limits>
0025 #include <string>
0026 #include <string_view>
0027 #include <utility>
0028 #include <vector>
0029
0030 #include "arrow/status.h"
0031 #include "arrow/util/macros.h"
0032 #include "arrow/util/visibility.h"
0033
0034 namespace arrow {
0035 namespace internal {
0036
0037
0038
0039
0040
0041 template <uint8_t N>
0042 class SmallString {
0043 public:
0044 SmallString() : length_(0) {}
0045
0046 template <typename T>
0047 SmallString(const T& v) {
0048 *this = std::string_view(v);
0049 }
0050
0051 SmallString& operator=(const std::string_view s) {
0052 #ifndef NDEBUG
0053 CheckSize(s.size());
0054 #endif
0055 length_ = static_cast<uint8_t>(s.size());
0056 std::memcpy(data_, s.data(), length_);
0057 return *this;
0058 }
0059
0060 SmallString& operator=(const std::string& s) {
0061 *this = std::string_view(s);
0062 return *this;
0063 }
0064
0065 SmallString& operator=(const char* s) {
0066 *this = std::string_view(s);
0067 return *this;
0068 }
0069
0070 explicit operator std::string_view() const { return std::string_view(data_, length_); }
0071
0072 const char* data() const { return data_; }
0073 size_t length() const { return length_; }
0074 bool empty() const { return length_ == 0; }
0075 char operator[](size_t pos) const {
0076 #ifdef NDEBUG
0077 assert(pos <= length_);
0078 #endif
0079 return data_[pos];
0080 }
0081
0082 SmallString substr(size_t pos) const {
0083 return SmallString(std::string_view(*this).substr(pos));
0084 }
0085
0086 SmallString substr(size_t pos, size_t count) const {
0087 return SmallString(std::string_view(*this).substr(pos, count));
0088 }
0089
0090 template <typename T>
0091 bool operator==(T&& other) const {
0092 return std::string_view(*this) == std::string_view(std::forward<T>(other));
0093 }
0094
0095 template <typename T>
0096 bool operator!=(T&& other) const {
0097 return std::string_view(*this) != std::string_view(std::forward<T>(other));
0098 }
0099
0100 protected:
0101 uint8_t length_;
0102 char data_[N];
0103
0104 void CheckSize(size_t n) { assert(n <= N); }
0105 };
0106
0107 template <uint8_t N>
0108 std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
0109 return os << std::string_view(str);
0110 }
0111
0112
0113
0114 class ARROW_EXPORT Trie {
0115 using index_type = int16_t;
0116 using fast_index_type = int_fast16_t;
0117 static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
0118
0119 public:
0120 Trie() : size_(0) {}
0121 Trie(Trie&&) = default;
0122 Trie& operator=(Trie&&) = default;
0123
0124 int32_t Find(std::string_view s) const {
0125 const Node* node = &nodes_[0];
0126 fast_index_type pos = 0;
0127 if (s.length() > static_cast<size_t>(kMaxIndex)) {
0128 return -1;
0129 }
0130 fast_index_type remaining = static_cast<fast_index_type>(s.length());
0131
0132 while (remaining > 0) {
0133 auto substring_length = node->substring_length();
0134 if (substring_length > 0) {
0135 auto substring_data = node->substring_data();
0136 if (remaining < substring_length) {
0137
0138 return -1;
0139 }
0140 for (fast_index_type i = 0; i < substring_length; ++i) {
0141 if (s[pos++] != substring_data[i]) {
0142
0143 return -1;
0144 }
0145 --remaining;
0146 }
0147 if (remaining == 0) {
0148
0149 return node->found_index_;
0150 }
0151 }
0152
0153 if (node->child_lookup_ == -1) {
0154
0155 return -1;
0156 }
0157 auto c = static_cast<uint8_t>(s[pos++]);
0158 --remaining;
0159 auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
0160 if (child_index == -1) {
0161
0162 return -1;
0163 }
0164 node = &nodes_[child_index];
0165 }
0166
0167
0168 if (node->substring_.empty()) {
0169
0170 return node->found_index_;
0171 } else {
0172 return -1;
0173 }
0174 }
0175
0176 Status Validate() const;
0177
0178 void Dump() const;
0179
0180 protected:
0181 static constexpr size_t kNodeSize = 16;
0182 static constexpr auto kMaxSubstringLength =
0183 kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);
0184
0185 struct Node {
0186
0187 index_type found_index_;
0188
0189 index_type child_lookup_;
0190
0191 SmallString<kMaxSubstringLength> substring_;
0192
0193 fast_index_type substring_length() const {
0194 return static_cast<fast_index_type>(substring_.length());
0195 }
0196 const char* substring_data() const { return substring_.data(); }
0197 };
0198
0199 static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");
0200
0201 ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);
0202
0203 void Dump(const Node* node, const std::string& indent) const;
0204
0205
0206 std::vector<Node> nodes_;
0207
0208
0209 std::vector<index_type> lookup_table_;
0210
0211
0212 index_type size_;
0213
0214 friend class TrieBuilder;
0215 };
0216
0217 class ARROW_EXPORT TrieBuilder {
0218 using index_type = Trie::index_type;
0219 using fast_index_type = Trie::fast_index_type;
0220
0221 public:
0222 TrieBuilder();
0223 Status Append(std::string_view s, bool allow_duplicate = false);
0224 Trie Finish();
0225
0226 protected:
0227
0228 Status ExtendLookupTable(index_type* out_lookup_index);
0229
0230 Status SplitNode(fast_index_type node_index, fast_index_type split_at);
0231
0232 Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
0233
0234 Status CreateChildNode(Trie::Node* parent, uint8_t ch, std::string_view substring);
0235 Status CreateChildNode(Trie::Node* parent, char ch, std::string_view substring);
0236
0237 Trie trie_;
0238
0239 static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
0240 };
0241
0242 }
0243 }