Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:27:10

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cassert>
0021 #include <cstdint>
0022 #include <cstring>
0023 #include <iosfwd>
0024 #include <limits>
0025 #include <string>
0026 #include <string_view>
0027 #include <utility>
0028 #include <vector>
0029 
0030 #include "arrow/status.h"
0031 #include "arrow/util/macros.h"
0032 #include "arrow/util/visibility.h"
0033 
0034 namespace arrow {
0035 namespace internal {
0036 
0037 // A non-zero-terminated small string class.
0038 // std::string usually has a small string optimization
0039 // (see review at https://shaharmike.com/cpp/std-string/)
0040 // but this one allows tight control and optimization of memory layout.
0041 template <uint8_t N>
0042 class SmallString {
0043  public:
0044   SmallString() : length_(0) {}
0045 
0046   template <typename T>
0047   SmallString(const T& v) {  // NOLINT implicit constructor
0048     *this = std::string_view(v);
0049   }
0050 
0051   SmallString& operator=(const std::string_view s) {
0052 #ifndef NDEBUG
0053     CheckSize(s.size());
0054 #endif
0055     length_ = static_cast<uint8_t>(s.size());
0056     std::memcpy(data_, s.data(), length_);
0057     return *this;
0058   }
0059 
0060   SmallString& operator=(const std::string& s) {
0061     *this = std::string_view(s);
0062     return *this;
0063   }
0064 
0065   SmallString& operator=(const char* s) {
0066     *this = std::string_view(s);
0067     return *this;
0068   }
0069 
0070   explicit operator std::string_view() const { return std::string_view(data_, length_); }
0071 
0072   const char* data() const { return data_; }
0073   size_t length() const { return length_; }
0074   bool empty() const { return length_ == 0; }
0075   char operator[](size_t pos) const {
0076 #ifdef NDEBUG
0077     assert(pos <= length_);
0078 #endif
0079     return data_[pos];
0080   }
0081 
0082   SmallString substr(size_t pos) const {
0083     return SmallString(std::string_view(*this).substr(pos));
0084   }
0085 
0086   SmallString substr(size_t pos, size_t count) const {
0087     return SmallString(std::string_view(*this).substr(pos, count));
0088   }
0089 
0090   template <typename T>
0091   bool operator==(T&& other) const {
0092     return std::string_view(*this) == std::string_view(std::forward<T>(other));
0093   }
0094 
0095   template <typename T>
0096   bool operator!=(T&& other) const {
0097     return std::string_view(*this) != std::string_view(std::forward<T>(other));
0098   }
0099 
0100  protected:
0101   uint8_t length_;
0102   char data_[N];
0103 
0104   void CheckSize(size_t n) { assert(n <= N); }
0105 };
0106 
0107 template <uint8_t N>
0108 std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
0109   return os << std::string_view(str);
0110 }
0111 
0112 // A trie class for byte strings, optimized for small sets of short strings.
0113 // This class is immutable by design, use a TrieBuilder to construct it.
0114 class ARROW_EXPORT Trie {
0115   using index_type = int16_t;
0116   using fast_index_type = int_fast16_t;
0117   static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
0118 
0119  public:
0120   Trie() : size_(0) {}
0121   Trie(Trie&&) = default;
0122   Trie& operator=(Trie&&) = default;
0123 
0124   int32_t Find(std::string_view s) const {
0125     const Node* node = &nodes_[0];
0126     fast_index_type pos = 0;
0127     if (s.length() > static_cast<size_t>(kMaxIndex)) {
0128       return -1;
0129     }
0130     fast_index_type remaining = static_cast<fast_index_type>(s.length());
0131 
0132     while (remaining > 0) {
0133       auto substring_length = node->substring_length();
0134       if (substring_length > 0) {
0135         auto substring_data = node->substring_data();
0136         if (remaining < substring_length) {
0137           // Input too short
0138           return -1;
0139         }
0140         for (fast_index_type i = 0; i < substring_length; ++i) {
0141           if (s[pos++] != substring_data[i]) {
0142             // Mismatching substring
0143             return -1;
0144           }
0145           --remaining;
0146         }
0147         if (remaining == 0) {
0148           // Matched node exactly
0149           return node->found_index_;
0150         }
0151       }
0152       // Lookup child using next input character
0153       if (node->child_lookup_ == -1) {
0154         // Input too long
0155         return -1;
0156       }
0157       auto c = static_cast<uint8_t>(s[pos++]);
0158       --remaining;
0159       auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
0160       if (child_index == -1) {
0161         // Child not found
0162         return -1;
0163       }
0164       node = &nodes_[child_index];
0165     }
0166 
0167     // Input exhausted
0168     if (node->substring_.empty()) {
0169       // Matched node exactly
0170       return node->found_index_;
0171     } else {
0172       return -1;
0173     }
0174   }
0175 
0176   Status Validate() const;
0177 
0178   void Dump() const;
0179 
0180  protected:
0181   static constexpr size_t kNodeSize = 16;
0182   static constexpr auto kMaxSubstringLength =
0183       kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);
0184 
0185   struct Node {
0186     // If this node is a valid end of string, index of found string, otherwise -1
0187     index_type found_index_;
0188     // Base index for child lookup in lookup_table_ (-1 if no child nodes)
0189     index_type child_lookup_;
0190     // The substring for this node.
0191     SmallString<kMaxSubstringLength> substring_;
0192 
0193     fast_index_type substring_length() const {
0194       return static_cast<fast_index_type>(substring_.length());
0195     }
0196     const char* substring_data() const { return substring_.data(); }
0197   };
0198 
0199   static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");
0200 
0201   ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);
0202 
0203   void Dump(const Node* node, const std::string& indent) const;
0204 
0205   // Node table: entry 0 is the root node
0206   std::vector<Node> nodes_;
0207 
0208   // Indexed lookup structure: gives index in node table, or -1 if not found
0209   std::vector<index_type> lookup_table_;
0210 
0211   // Number of entries
0212   index_type size_;
0213 
0214   friend class TrieBuilder;
0215 };
0216 
0217 class ARROW_EXPORT TrieBuilder {
0218   using index_type = Trie::index_type;
0219   using fast_index_type = Trie::fast_index_type;
0220 
0221  public:
0222   TrieBuilder();
0223   Status Append(std::string_view s, bool allow_duplicate = false);
0224   Trie Finish();
0225 
0226  protected:
0227   // Extend the lookup table by 256 entries, return the index of the new span
0228   Status ExtendLookupTable(index_type* out_lookup_index);
0229   // Split the node given by the index at the substring index `split_at`
0230   Status SplitNode(fast_index_type node_index, fast_index_type split_at);
0231   // Append an already constructed child node to the parent
0232   Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
0233   // Create a matching child node from this parent
0234   Status CreateChildNode(Trie::Node* parent, uint8_t ch, std::string_view substring);
0235   Status CreateChildNode(Trie::Node* parent, char ch, std::string_view substring);
0236 
0237   Trie trie_;
0238 
0239   static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
0240 };
0241 
0242 }  // namespace internal
0243 }  // namespace arrow