// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Private header, not to be exported

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>
#include <vector>

#include "arrow/array/builder_binary.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"

#define XXH_INLINE_ALL

#include "arrow/vendored/xxhash.h"  // IWYU pragma: keep

namespace arrow {
namespace internal {

// XXX would it help to have a 32-bit hash value on large datasets?
typedef uint64_t hash_t;

// Notes about the choice of a hash function.
// - XXH3 is extremely fast on most data sizes, from small to huge;
//   faster even than HW CRC-based hashing schemes
// - our custom hash function for tiny values (< 16 bytes) is still
//   significantly faster (~30%), at least on this machine and compiler

template <uint64_t AlgNum>
inline hash_t ComputeStringHash(const void* data, int64_t length);

/// \brief A hash function for bitmaps that can handle offsets and lengths in
/// terms of number of bits. The hash only depends on the bits actually hashed.
///
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
/// readable from the bitmap.
///
/// \pre bits_offset >= 0
/// \pre num_bits >= 0
/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap
///
/// \param bitmap The pointer to the bitmap.
/// \param seed The seed for the hash function (useful when chaining hash functions).
/// \param bits_offset The offset in bits relative to the start of the bitmap.
/// \param num_bits The number of bits after the offset to be hashed.
ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
                                      int64_t bits_offset, int64_t num_bits);

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
  static bool CompareScalars(Scalar u, Scalar v) { return u == v; }

  static hash_t ComputeHash(const Scalar& value) {
    // Generic hash computation for scalars.  Simply apply the string hash
    // to the bit representation of the value.

    // XXX in the case of FP values, we'd like equal values to have the same hash,
    // even if they have different bit representations...
    return ComputeStringHash<AlgNum>(&value, sizeof(value));
  }
};

template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for integers

  static hash_t ComputeHash(const Scalar& value) {
    // Faster hash computation for integers.

    // Two of xxhash's prime multipliers (which are chosen for their
    // bit dispersion properties)
    static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
                                               14029467366897019727ULL};

    // Multiplying by the prime number mixes the low bits into the high bits,
    // then byte-swapping (which is a single CPU instruction) allows the
    // combined high and low bits to participate in the initial hash table index.
    auto h = static_cast<hash_t>(value);
    return bit_util::ByteSwap(multipliers[AlgNum] * h);
  }
};

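// Illustrative sketch (not part of the Arrow API; the function name is
// hypothetical): the specialization above reduces integer hashing to one
// multiply and one byte swap.  The multiply spreads the key's low bits into
// the high bits, and the byte swap moves them back down so that masking the
// hash with (table capacity - 1) sees well-mixed bits.
inline hash_t ExampleIntegerHash(int32_t key) {
  // With AlgNum = 0 this computes bit_util::ByteSwap(multipliers[0] * key).
  return ScalarHelper<int32_t, 0>::ComputeHash(key);
}
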
template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum,
                    enable_if_t<std::is_same<std::string_view, Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for std::string_view

  static hash_t ComputeHash(std::string_view value) {
    return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
  }
};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for reals

  static bool CompareScalars(Scalar u, Scalar v) {
    if (std::isnan(u)) {
      // XXX should we do a bit-precise comparison?
      return std::isnan(v);
    }
    return u == v;
  }
};

template <uint64_t AlgNum = 0>
hash_t ComputeStringHash(const void* data, int64_t length) {
  if (ARROW_PREDICT_TRUE(length <= 16)) {
    // Specialize for small hash strings, as they are quite common as
    // hash table keys.  Even XXH3 isn't quite as fast.
    auto p = reinterpret_cast<const uint8_t*>(data);
    auto n = static_cast<uint32_t>(length);
    if (n <= 8) {
      if (n <= 3) {
        if (n == 0) {
          return 1U;
        }
        uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
        return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      }
      // 4 <= length <= 8
      // We can read the string as two overlapping 32-bit ints, apply
      // different hash functions to each of them in parallel, then XOR
      // the results
      uint32_t x, y;
      hash_t hx, hy;
      x = util::SafeLoadAs<uint32_t>(p + n - 4);
      y = util::SafeLoadAs<uint32_t>(p);
      hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
      return n ^ hx ^ hy;
    }
    // 9 <= length <= 16
    // Apply the same principle as above
    uint64_t x, y;
    hash_t hx, hy;
    x = util::SafeLoadAs<uint64_t>(p + n - 8);
    y = util::SafeLoadAs<uint64_t>(p);
    hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
    hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
    return n ^ hx ^ hy;
  }

#if XXH3_SECRET_SIZE_MIN != 136
#  error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
#endif

  // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow.
  // Instead, we use hard-coded random secrets.  To maximize cache efficiency,
  // they reuse the same memory area.
  static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
      0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f,
      0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24,
      0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26,
      0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75,
      0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce,
      0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3,
      0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42,
      0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1,
      0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5,
      0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};

  static_assert(AlgNum < 2, "AlgNum too large");
  static constexpr auto secret = kXxh3Secrets + AlgNum;
  return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
                                XXH3_SECRET_SIZE_MIN);
}

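// Usage sketch (illustrative only; the function name is hypothetical): both
// the small-string fast path and the XXH3 path are reached through the same
// entry point.  AlgNum selects one of two independent hash families, which is
// what the AlgNum ^ 1 trick above relies on.
inline void ExampleStringHashes(std::string_view key) {
  hash_t h0 = ComputeStringHash<0>(key.data(), static_cast<int64_t>(key.size()));
  hash_t h1 = ComputeStringHash<1>(key.data(), static_cast<int64_t>(key.size()));
  ARROW_UNUSED(h0);  // two different hash values for the same key
  ARROW_UNUSED(h1);
}
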
// XXX add a HashEq<ArrowType> struct with both hash and compare functions?

// ----------------------------------------------------------------------
// An open-addressing insert-only hash table (no deletes)

template <typename Payload>
class HashTable {
 public:
  static constexpr hash_t kSentinel = 0ULL;
  static constexpr int64_t kLoadFactor = 2UL;

  struct Entry {
    hash_t h;
    Payload payload;

    // An entry is valid if the hash is different from the sentinel value
    operator bool() const { return h != kSentinel; }
  };

  HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
    DCHECK_NE(pool, nullptr);
    // Minimum of 32 elements
    capacity = std::max<uint64_t>(capacity, 32UL);
    capacity_ = bit_util::NextPower2(capacity);
    capacity_mask_ = capacity_ - 1;
    size_ = 0;

    DCHECK_OK(UpsizeBuffer(capacity_));
  }

  // Lookup with non-linear probing
  // cmp_func should have signature bool(const Payload*).
  // Return a (Entry*, found) pair.
  template <typename CmpFunc>
  std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  template <typename CmpFunc>
  std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  Status Insert(Entry* entry, hash_t h, const Payload& payload) {
    // Ensure entry is empty before inserting
    assert(!*entry);
    entry->h = FixHash(h);
    entry->payload = payload;
    ++size_;

    if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
      // Resize less frequently since it is expensive
      return Upsize(capacity_ * kLoadFactor * 2);
    }
    return Status::OK();
  }

  uint64_t size() const { return size_; }

  // Visit all non-empty entries in the table
  // The visit_func should have signature void(const Entry*)
  template <typename VisitFunc>
  void VisitEntries(VisitFunc&& visit_func) const {
    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = entries_[i];
      if (entry) {
        visit_func(&entry);
      }
    }
  }

 protected:
  // NoCompare is for when the value is known not to exist in the table
  enum CompareKind { DoCompare, NoCompare };

  // The workhorse lookup function
  template <CompareKind CKind, typename CmpFunc>
  std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
                                   CmpFunc&& cmp_func) const {
    static constexpr uint8_t perturb_shift = 5;

    uint64_t index, perturb;
    const Entry* entry;

    h = FixHash(h);
    index = h & size_mask;
    perturb = (h >> perturb_shift) + 1U;

    while (true) {
      entry = &entries[index];
      if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) {
        // Found
        return {index, true};
      }
      if (entry->h == kSentinel) {
        // Empty slot
        return {index, false};
      }

      // Perturbation logic inspired from CPython's set / dict object.
      // The goal is that all 64 bits of the unmasked hash value eventually
      // participate in the probing sequence, to minimize clustering.
      index = (index + perturb) & size_mask;
      perturb = (perturb >> perturb_shift) + 1U;
    }
  }

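  // Probe-sequence sketch (numbers are illustrative, not normative): with
  // size_mask = 63 the first probe is at (h & 63), the second at
  // (index + (h >> 5) + 1) & 63, and each further step shifts `perturb`
  // right by another 5 bits.  Once `perturb` reaches 1 it stays at 1, so the
  // sequence degenerates into linear probing after all 64 bits of the hash
  // have influenced it.
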
  template <CompareKind CKind, typename CmpFunc>
  bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
    if (CKind == NoCompare) {
      return false;
    } else {
      return entry->h == h && cmp_func(&entry->payload);
    }
  }

  bool NeedUpsizing() const {
    // Keep the load factor <= 1/2
    return size_ * kLoadFactor >= capacity_;
  }

  Status UpsizeBuffer(uint64_t capacity) {
    RETURN_NOT_OK(entries_builder_.Resize(capacity));
    entries_ = entries_builder_.mutable_data();
    memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));

    return Status::OK();
  }

  Status Upsize(uint64_t new_capacity) {
    assert(new_capacity > capacity_);
    uint64_t new_mask = new_capacity - 1;
    assert((new_capacity & new_mask) == 0);  // it's a power of two

    // Stash old entries and seal builder, effectively resetting the Buffer
    const Entry* old_entries = entries_;
    ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
    // Allocate new buffer
    RETURN_NOT_OK(UpsizeBuffer(new_capacity));

    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = old_entries[i];
      if (entry) {
        // Dummy compare function will not be called
        auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
                                   [](const Payload*) { return false; });
        // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an
        // empty slot is always returned
        assert(!p.second);
        entries_[p.first] = entry;
      }
    }
    capacity_ = new_capacity;
    capacity_mask_ = new_mask;

    return Status::OK();
  }

  hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }

  // The number of slots available in the hash table array.
  uint64_t capacity_;
  uint64_t capacity_mask_;
  // The number of used slots in the hash table array.
  uint64_t size_;

  Entry* entries_;
  TypedBufferBuilder<Entry> entries_builder_;
};

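// Usage sketch (illustrative only; ExampleIntPayload and the function name
// are hypothetical): a typical Lookup/Insert round trip.  Lookup returns the
// candidate slot whether or not the key was found, so a miss can be inserted
// in place without probing a second time.
struct ExampleIntPayload {
  int64_t value;
};

inline Status ExampleHashTableRoundTrip(MemoryPool* pool) {
  HashTable<ExampleIntPayload> table(pool, /*capacity=*/32);
  const int64_t key = 1234;
  const hash_t h = ScalarHelper<int64_t, 0>::ComputeHash(key);
  auto p = table.Lookup(h, [&](const ExampleIntPayload* payload) {
    // Equal hashes may still be a collision; compare the payload itself.
    return payload->value == key;
  });
  if (!p.second) {
    RETURN_NOT_OK(table.Insert(p.first, h, {key}));
  }
  return Status::OK();
}
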
// XXX typedef memo_index_t int32_t ?

constexpr int32_t kKeyNotFound = -1;

// ----------------------------------------------------------------------
// A base class for memoization tables.

class MemoTable {
 public:
  virtual ~MemoTable() = default;

  virtual int32_t size() const = 0;
};

// ----------------------------------------------------------------------
// A memoization table for memory-cheap scalar values.

// The memoization table remembers the insertion index of each key
// and allows looking it up later.

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class ScalarMemoTable : public MemoTable {
 public:
  explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
      : hash_table_(pool, static_cast<uint64_t>(entries)) {}

  int32_t Get(const Scalar& value) const {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index != kKeyNotFound) {
      on_found(memo_index);
    } else {
      null_index_ = memo_index = size();
      on_not_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table, plus one if a null value was
  // inserted (this is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, Scalar* out_data) const {
    hash_table_.VisitEntries([=](const HashTableEntry* entry) {
      int32_t index = entry->payload.memo_index - start;
      if (index >= 0) {
        out_data[index] = entry->payload.value;
      }
    });
    // Zero-initialize the null entry
    if (null_index_ != kKeyNotFound) {
      int32_t index = null_index_ - start;
      if (index >= 0) {
        out_data[index] = Scalar{};
      }
    }
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

 protected:
  struct Payload {
    Scalar value;
    int32_t memo_index;
  };

  using HashTableType = HashTableTemplateType<Payload>;
  using HashTableEntry = typename HashTableType::Entry;
  HashTableType hash_table_;
  int32_t null_index_ = kKeyNotFound;

  hash_t ComputeHash(const Scalar& value) const {
    return ScalarHelper<Scalar, 0>::ComputeHash(value);
  }

 public:
  // defined here so that `HashTableType` is visible
  // Merge entries from `other_table` into `this->hash_table_`.
  Status MergeTable(const ScalarMemoTable& other_table) {
    const HashTableType& other_hashtable = other_table.hash_table_;

    other_hashtable.VisitEntries([this](const HashTableEntry* other_entry) {
      int32_t unused;
      DCHECK_OK(this->GetOrInsert(other_entry->payload.value, &unused));
    });
    // TODO: ARROW-17074 - implement proper error handling
    return Status::OK();
  }
};

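// Usage sketch (illustrative only; the function name and values are
// assumptions): keys receive dense memo indices in insertion order, which is
// the order later recovered by CopyValues().
inline Status ExampleScalarMemoization(MemoryPool* pool) {
  ScalarMemoTable<int32_t> table(pool);
  int32_t index = -1;
  RETURN_NOT_OK(table.GetOrInsert(7, &index));   // new key: index == 0
  RETURN_NOT_OK(table.GetOrInsert(42, &index));  // new key: index == 1
  RETURN_NOT_OK(table.GetOrInsert(7, &index));   // already present: index == 0
  std::vector<int32_t> dictionary(table.size());
  table.CopyValues(dictionary.data());  // dictionary == {7, 42}
  return Status::OK();
}
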
// ----------------------------------------------------------------------
// A memoization table for small scalar values, using direct indexing

template <typename Scalar, typename Enable = void>
struct SmallScalarTraits {};

template <>
struct SmallScalarTraits<bool> {
  static constexpr int32_t cardinality = 2;

  static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
};

template <typename Scalar>
struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
  using Unsigned = typename std::make_unsigned<Scalar>::type;

  static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();

  static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
};

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class SmallScalarMemoTable : public MemoTable {
 public:
  explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
    std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
    index_to_value_.reserve(cardinality);
  }

  int32_t Get(const Scalar value) const {
    auto value_index = AsIndex(value);
    return value_to_index_[value_index];
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto value_index = AsIndex(value);
    auto memo_index = value_to_index_[value_index];
    if (memo_index == kKeyNotFound) {
      memo_index = static_cast<int32_t>(index_to_value_.size());
      index_to_value_.push_back(value);
      value_to_index_[value_index] = memo_index;
      DCHECK_LT(memo_index, cardinality + 1);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  int32_t GetNull() const { return value_to_index_[cardinality]; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    auto memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = value_to_index_[cardinality] = size();
      index_to_value_.push_back(0);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index)
  int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }

  // Merge entries from `other_table` into `this`.
  Status MergeTable(const SmallScalarMemoTable& other_table) {
    for (const Scalar& other_val : other_table.index_to_value_) {
      int32_t unused;
      RETURN_NOT_OK(this->GetOrInsert(other_val, &unused));
    }
    return Status::OK();
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, Scalar* out_data) const {
    DCHECK_GE(start, 0);
    DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
    // Pointer arithmetic on Scalar* already scales by sizeof(Scalar),
    // so index by element rather than by byte.
    memcpy(out_data, index_to_value_.data() + start, (size() - start) * sizeof(Scalar));
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

  const std::vector<Scalar>& values() const { return index_to_value_; }

 protected:
  static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
  static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");

  uint32_t AsIndex(Scalar value) const {
    return SmallScalarTraits<Scalar>::AsIndex(value);
  }

  // The last index is reserved for the null element.
  int32_t value_to_index_[cardinality + 1];
  std::vector<Scalar> index_to_value_;
};

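// Usage sketch (illustrative only; the function name is hypothetical): for
// uint8_t keys the table is a plain 257-slot array (256 possible values plus
// the reserved null slot), so lookups are a single array load with no hashing
// involved.
inline Status ExampleSmallScalarMemoization(MemoryPool* pool) {
  SmallScalarMemoTable<uint8_t> table(pool);
  int32_t index = -1;
  RETURN_NOT_OK(table.GetOrInsert(uint8_t{200}, &index));  // index == 0
  const int32_t found = table.Get(uint8_t{200});           // found == 0
  const int32_t missing = table.Get(uint8_t{3});           // missing == kKeyNotFound
  ARROW_UNUSED(found);
  ARROW_UNUSED(missing);
  return Status::OK();
}
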
// ----------------------------------------------------------------------
// A memoization table for variable-sized binary data.

template <typename BinaryBuilderT>
class BinaryMemoTable : public MemoTable {
 public:
  using builder_offset_type = typename BinaryBuilderT::offset_type;
  explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
                           int64_t values_size = -1)
      : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
    const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
    DCHECK_OK(binary_builder_.Resize(entries));
    DCHECK_OK(binary_builder_.ReserveData(data_size));
  }

  int32_t Get(const void* data, builder_offset_type length) const {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  int32_t Get(std::string_view value) const {
    return Get(value.data(), static_cast<builder_offset_type>(value.length()));
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
                     Func2&& on_not_found, int32_t* out_memo_index) {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      // Insert string value
      RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length));
      // Insert hash entry
      RETURN_NOT_OK(
          hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));

      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(std::string_view value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
                       out_memo_index);
  }

  Status GetOrInsert(const void* data, builder_offset_type length,
                     int32_t* out_memo_index) {
    return GetOrInsert(
        data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  Status GetOrInsert(std::string_view value, int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       out_memo_index);
  }

  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = null_index_ = size();
      DCHECK_OK(binary_builder_.AppendNull());
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
  }

  int64_t values_size() const { return binary_builder_.value_data_length(); }

  // Copy the (n + 1) offsets for the n values starting at index `start`
  // into `out_data`
  template <class Offset>
  void CopyOffsets(int32_t start, Offset* out_data) const {
    DCHECK_LE(start, size());

    const builder_offset_type* offsets = binary_builder_.offsets_data();
    const builder_offset_type delta =
        start < binary_builder_.length() ? offsets[start] : 0;
    for (int32_t i = start; i < size(); ++i) {
      const builder_offset_type adjusted_offset = offsets[i] - delta;
      Offset cast_offset = static_cast<Offset>(adjusted_offset);
      assert(static_cast<builder_offset_type>(cast_offset) ==
             adjusted_offset);  // avoid truncation
      *out_data++ = cast_offset;
    }

    // Copy the last offset manually, since BinaryBuilder only materializes
    // it in Finish()
    *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
  }

  template <class Offset>
  void CopyOffsets(Offset* out_data) const {
    CopyOffsets(0, out_data);
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, uint8_t* out_data) const {
    CopyValues(start, -1, out_data);
  }

  // Same as above, but check output size in debug mode
  void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
    DCHECK_LE(start, size());

    // The absolute byte offset of the `start` value in the binary buffer.
    const builder_offset_type offset = binary_builder_.offset(start);
    const auto length = binary_builder_.value_data_length() - static_cast<size_t>(offset);

    if (out_size != -1) {
      assert(static_cast<int64_t>(length) <= out_size);
    }

    auto view = binary_builder_.GetView(start);
    memcpy(out_data, view.data(), length);
  }

  void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }

  void CopyValues(int64_t out_size, uint8_t* out_data) const {
    CopyValues(0, out_size, out_data);
  }

  void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
                            uint8_t* out_data) const {
    // This method exists to cope with the fact that the BinaryMemoTable does
    // not know the fixed width when inserting the null value.  The data
    // buffer holds a zero-length string for the null value (if present).
    //
    // Thus, this method injects a zero-filled value of the proper width
    // into the output buffer.
    //
    if (start >= size()) {
      return;
    }

    int32_t null_index = GetNull();
    if (null_index < start) {
      // Nothing to skip, proceed as usual.
      CopyValues(start, out_size, out_data);
      return;
    }

    builder_offset_type left_offset = binary_builder_.offset(start);

    // Ensure that the data length is exactly width_size bytes short of the
    // expected output size (n_values * width_size); the missing bytes are
    // the null entry's slot.
#ifndef NDEBUG
    int64_t data_length = values_size() - static_cast<size_t>(left_offset);
    assert(data_length + width_size == out_size);
    ARROW_UNUSED(data_length);
#endif

    auto in_data = binary_builder_.value_data() + left_offset;
    // The null value occupies zero bytes in the data buffer; copy the data
    // in two parts and skip width_size bytes in out_data:
    // [part_1][width_size][part_2]
    auto null_data_offset = binary_builder_.offset(null_index);
    auto left_size = null_data_offset - left_offset;
    if (left_size > 0) {
      // `in_data` already accounts for `left_offset`
      memcpy(out_data, in_data, left_size);
    }
    // Zero-initialize the null entry
    memset(out_data + left_size, 0, width_size);

    auto right_size = values_size() - static_cast<size_t>(null_data_offset);
    if (right_size > 0) {
      // Skip the null fixed-size value; part_2 starts `left_size` bytes
      // into `in_data`.
      auto out_offset = left_size + width_size;
      assert(out_data + out_offset + right_size == out_data + out_size);
      memcpy(out_data + out_offset, in_data + left_size, right_size);
    }
  }

  // Visit the stored values in insertion order.
  // The visitor function should have the signature `void(std::string_view)`
  // or `void(const std::string_view&)`.
  template <typename VisitFunc>
  void VisitValues(int32_t start, VisitFunc&& visit) const {
    for (int32_t i = start; i < size(); ++i) {
      visit(binary_builder_.GetView(i));
    }
  }

  // Visit the stored value at a specific index in insertion order.
  // The visitor function should have the signature `void(std::string_view)`
  // or `void(const std::string_view&)`.
  template <typename VisitFunc>
  void VisitValue(int32_t idx, VisitFunc&& visit) const {
    visit(binary_builder_.GetView(idx));
  }

 protected:
  struct Payload {
    int32_t memo_index;
  };

  using HashTableType = HashTable<Payload>;
  using HashTableEntry = typename HashTable<Payload>::Entry;
  HashTableType hash_table_;
  BinaryBuilderT binary_builder_;

  int32_t null_index_ = kKeyNotFound;

  std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
                                                builder_offset_type length) const {
    auto cmp_func = [&](const Payload* payload) {
      std::string_view lhs = binary_builder_.GetView(payload->memo_index);
      std::string_view rhs(static_cast<const char*>(data), length);
      return lhs == rhs;
    };
    return hash_table_.Lookup(h, cmp_func);
  }

 public:
  Status MergeTable(const BinaryMemoTable& other_table) {
    other_table.VisitValues(0, [this](std::string_view other_value) {
      int32_t unused;
      DCHECK_OK(this->GetOrInsert(other_value, &unused));
    });
    return Status::OK();
  }
};

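// Usage sketch (illustrative only; the function name is hypothetical):
// variable-length keys are deduplicated the same way, with the bytes stored
// in the internal builder in insertion order.
inline Status ExampleBinaryMemoization(MemoryPool* pool) {
  BinaryMemoTable<BinaryBuilder> table(pool);
  int32_t index = -1;
  RETURN_NOT_OK(table.GetOrInsert(std::string_view("foo"), &index));  // index == 0
  RETURN_NOT_OK(table.GetOrInsert(std::string_view("bar"), &index));  // index == 1
  RETURN_NOT_OK(table.GetOrInsert(std::string_view("foo"), &index));  // index == 0
  // n values yield n + 1 offsets; here: {0, 3, 6} for "foo" and "bar".
  std::vector<int32_t> offsets(static_cast<size_t>(table.size()) + 1);
  table.CopyOffsets(offsets.data());
  return Status::OK();
}
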
template <typename T, typename Enable = void>
struct HashTraits {};

template <>
struct HashTraits<BooleanType> {
  using MemoTableType = SmallScalarMemoTable<bool>;
};

template <typename T>
struct HashTraits<T, enable_if_8bit_int<T>> {
  using c_type = typename T::c_type;
  using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
  using c_type = typename T::c_type;
  using MemoTableType = ScalarMemoTable<c_type, HashTable>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
                                 !std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_decimal<T>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
};

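// Illustrative sketch (the function name is hypothetical): HashTraits maps an
// Arrow type to the memo table implementation used for it, so code such as
// dictionary-encoding kernels can be written generically against
// HashTraits<T>::MemoTableType.
inline Status ExampleTraitsDispatch(MemoryPool* pool) {
  HashTraits<Int32Type>::MemoTableType int_table(pool);   // ScalarMemoTable<int32_t>
  HashTraits<StringType>::MemoTableType str_table(pool);  // BinaryMemoTable<BinaryBuilder>
  int32_t index = -1;
  RETURN_NOT_OK(int_table.GetOrInsert(11, &index));
  RETURN_NOT_OK(str_table.GetOrInsert(std::string_view("abc"), &index));
  return Status::OK();
}
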
template <typename MemoTableType>
static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
                                       int64_t start_offset, int64_t* null_count,
                                       std::shared_ptr<Buffer>* null_bitmap) {
  int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
  int64_t null_index = memo_table.GetNull();

  *null_count = 0;
  *null_bitmap = nullptr;

  if (null_index != kKeyNotFound && null_index >= start_offset) {
    null_index -= start_offset;
    *null_count = 1;
    ARROW_ASSIGN_OR_RAISE(*null_bitmap,
                          internal::BitmapAllButOne(pool, dict_length, null_index));
  }

  return Status::OK();
}

struct StringViewHash {
  // std::hash compatible hasher for use with std::unordered_*
  // (the std::hash specialization provided by nonstd constructs std::string
  // temporaries then invokes std::hash<std::string> against those)
  hash_t operator()(std::string_view value) const {
    return ComputeStringHash<0>(value.data(), static_cast<int64_t>(value.size()));
  }
};

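// Usage sketch (illustrative only; requires <unordered_set>, which this
// header does not itself include):
//
//   std::unordered_set<std::string_view, StringViewHash> seen;
//   seen.insert("foo");
//   const bool found = seen.count("foo") > 0;
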
}  // namespace internal
}  // namespace arrow