// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Private header, not to be exported

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>
#include <vector>

#include "arrow/array/builder_binary.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"

#define XXH_INLINE_ALL

#include "arrow/vendored/xxhash.h"  // IWYU pragma: keep

namespace arrow {
namespace internal {

// XXX would it help to have a 32-bit hash value on large datasets?
typedef uint64_t hash_t;

// Notes about the choice of a hash function.
// - XXH3 is extremely fast on most data sizes, from small to huge;
//   faster even than HW CRC-based hashing schemes
// - our custom hash function for tiny values (< 16 bytes) is still
//   significantly faster (~30%), at least on this machine and compiler

template <uint64_t AlgNum>
inline hash_t ComputeStringHash(const void* data, int64_t length);

/// \brief A hash function for bitmaps that can handle offsets and lengths in
/// terms of number of bits. The hash only depends on the bits actually hashed.
///
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
/// readable from the bitmap.
///
/// \pre bits_offset >= 0
/// \pre num_bits >= 0
/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap
///
/// \param bitmap The pointer to the bitmap.
/// \param seed The seed for the hash function (useful when chaining hash functions).
/// \param bits_offset The offset in bits relative to the start of the bitmap.
/// \param num_bits The number of bits after the offset to be hashed.
ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
                                      int64_t bits_offset, int64_t num_bits);

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
  static bool CompareScalars(Scalar u, Scalar v) { return u == v; }

  static hash_t ComputeHash(const Scalar& value) {
    // Generic hash computation for scalars.  Simply apply the string hash
    // to the bit representation of the value.

    // XXX in the case of FP values, we'd like equal values to have the same hash,
    // even if they have different bit representations...
    return ComputeStringHash<AlgNum>(&value, sizeof(value));
  }
};

template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for integers

  static hash_t ComputeHash(const Scalar& value) {
    // Faster hash computation for integers.

    // Two of xxhash's prime multipliers (which are chosen for their
    // bit dispersion properties)
    static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
                                               14029467366897019727ULL};

    // Multiplying by the prime number mixes the low bits into the high bits,
    // then byte-swapping (which is a single CPU instruction) allows the
    // combined high and low bits to participate in the initial hash table index.
    auto h = static_cast<hash_t>(value);
    return bit_util::ByteSwap(multipliers[AlgNum] * h);
  }
};

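// Illustrative sketch (not part of the Arrow API; the function name is
// hypothetical): the specialization above reduces integer hashing to one
// multiply and one byte swap.  The multiply spreads the key's low bits into
// the high bits, and the byte swap moves them back down so that masking the
// hash with (table capacity - 1) sees well-mixed bits.
inline hash_t ExampleIntegerHash(int32_t key) {
  // With AlgNum = 0 this computes bit_util::ByteSwap(multipliers[0] * key).
  return ScalarHelper<int32_t, 0>::ComputeHash(key);
}
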
template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum,
                    enable_if_t<std::is_same<std::string_view, Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for std::string_view

  static hash_t ComputeHash(std::string_view value) {
    return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
  }
};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // ScalarHelper specialization for reals

  static bool CompareScalars(Scalar u, Scalar v) {
    if (std::isnan(u)) {
      // XXX should we do a bit-precise comparison?
      return std::isnan(v);
    }
    return u == v;
  }
};

template <uint64_t AlgNum = 0>
hash_t ComputeStringHash(const void* data, int64_t length) {
  if (ARROW_PREDICT_TRUE(length <= 16)) {
    // Specialize for small hash strings, as they are quite common as
    // hash table keys.  Even XXH3 isn't quite as fast.
    auto p = reinterpret_cast<const uint8_t*>(data);
    auto n = static_cast<uint32_t>(length);
    if (n <= 8) {
      if (n <= 3) {
        if (n == 0) {
          return 1U;
        }
        uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
        return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      }
      // 4 <= length <= 8
      // We can read the string as two overlapping 32-bit ints, apply
      // different hash functions to each of them in parallel, then XOR
      // the results
      uint32_t x, y;
      hash_t hx, hy;
      x = util::SafeLoadAs<uint32_t>(p + n - 4);
      y = util::SafeLoadAs<uint32_t>(p);
      hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
      return n ^ hx ^ hy;
    }
    // 9 <= length <= 16
    // Apply the same principle as above
    uint64_t x, y;
    hash_t hx, hy;
    x = util::SafeLoadAs<uint64_t>(p + n - 8);
    y = util::SafeLoadAs<uint64_t>(p);
    hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
    hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
    return n ^ hx ^ hy;
  }

#if XXH3_SECRET_SIZE_MIN != 136
#  error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
#endif

  // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow.
  // Instead, we use hard-coded random secrets.  To maximize cache efficiency,
  // they reuse the same memory area.
  static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
      0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f,
      0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24,
      0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26,
      0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75,
      0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce,
      0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3,
      0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42,
      0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1,
      0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5,
      0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};

  static_assert(AlgNum < 2, "AlgNum too large");
  static constexpr auto secret = kXxh3Secrets + AlgNum;
  return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
                                XXH3_SECRET_SIZE_MIN);
}

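// Usage sketch (illustrative only; the function name is hypothetical): both
// the small-string fast path and the XXH3 path are reached through the same
// entry point.  AlgNum selects one of two independent hash families, which is
// what the AlgNum ^ 1 trick above relies on.
inline void ExampleStringHashes(std::string_view key) {
  hash_t h0 = ComputeStringHash<0>(key.data(), static_cast<int64_t>(key.size()));
  hash_t h1 = ComputeStringHash<1>(key.data(), static_cast<int64_t>(key.size()));
  ARROW_UNUSED(h0);  // two different hash values for the same key
  ARROW_UNUSED(h1);
}
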
// XXX add a HashEq<ArrowType> struct with both hash and compare functions?

// ----------------------------------------------------------------------
// An open-addressing insert-only hash table (no deletes)

template <typename Payload>
class HashTable {
 public:
  static constexpr hash_t kSentinel = 0ULL;
  static constexpr int64_t kLoadFactor = 2UL;

  struct Entry {
    hash_t h;
    Payload payload;

    // An entry is valid if the hash is different from the sentinel value
    operator bool() const { return h != kSentinel; }
  };

  HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
    DCHECK_NE(pool, nullptr);
    // Minimum of 32 elements
    capacity = std::max<uint64_t>(capacity, 32UL);
    capacity_ = bit_util::NextPower2(capacity);
    capacity_mask_ = capacity_ - 1;
    size_ = 0;

    DCHECK_OK(UpsizeBuffer(capacity_));
  }

  // Lookup with non-linear probing
  // cmp_func should have signature bool(const Payload*).
  // Return a (Entry*, found) pair.
  template <typename CmpFunc>
  std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  template <typename CmpFunc>
  std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  Status Insert(Entry* entry, hash_t h, const Payload& payload) {
    // Ensure entry is empty before inserting
    assert(!*entry);
    entry->h = FixHash(h);
    entry->payload = payload;
    ++size_;

    if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
      // Resize less frequently since it is expensive
      return Upsize(capacity_ * kLoadFactor * 2);
    }
    return Status::OK();
  }

  uint64_t size() const { return size_; }

  // Visit all non-empty entries in the table
  // The visit_func should have signature void(const Entry*)
  template <typename VisitFunc>
  void VisitEntries(VisitFunc&& visit_func) const {
    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = entries_[i];
      if (entry) {
        visit_func(&entry);
      }
    }
  }

 protected:
  // NoCompare is for when the value is known not to exist in the table
  enum CompareKind { DoCompare, NoCompare };

  // The workhorse lookup function
  template <CompareKind CKind, typename CmpFunc>
  std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
                                   CmpFunc&& cmp_func) const {
    static constexpr uint8_t perturb_shift = 5;

    uint64_t index, perturb;
    const Entry* entry;

    h = FixHash(h);
    index = h & size_mask;
    perturb = (h >> perturb_shift) + 1U;

    while (true) {
      entry = &entries[index];
      if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) {
        // Found
        return {index, true};
      }
      if (entry->h == kSentinel) {
        // Empty slot
        return {index, false};
      }

      // Perturbation logic inspired from CPython's set / dict object.
      // The goal is that all 64 bits of the unmasked hash value eventually
      // participate in the probing sequence, to minimize clustering.
      index = (index + perturb) & size_mask;
      perturb = (perturb >> perturb_shift) + 1U;
    }
  }

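  // Probe-sequence sketch (numbers are illustrative, not normative): with
  // size_mask = 63 the first probe is at (h & 63), the second at
  // (index + (h >> 5) + 1) & 63, and each further step shifts `perturb`
  // right by another 5 bits.  Once `perturb` reaches 1 it stays at 1, so the
  // sequence degenerates into linear probing after all 64 bits of the hash
  // have influenced it.
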
  template <CompareKind CKind, typename CmpFunc>
  bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
    if (CKind == NoCompare) {
      return false;
    } else {
      return entry->h == h && cmp_func(&entry->payload);
    }
  }

  bool NeedUpsizing() const {
    // Keep the load factor <= 1/2
    return size_ * kLoadFactor >= capacity_;
  }

  Status UpsizeBuffer(uint64_t capacity) {
    RETURN_NOT_OK(entries_builder_.Resize(capacity));
    entries_ = entries_builder_.mutable_data();
    memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));

    return Status::OK();
  }

  Status Upsize(uint64_t new_capacity) {
    assert(new_capacity > capacity_);
    uint64_t new_mask = new_capacity - 1;
    assert((new_capacity & new_mask) == 0);  // it's a power of two

    // Stash old entries and seal builder, effectively resetting the Buffer
    const Entry* old_entries = entries_;
    ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
    // Allocate new buffer
    RETURN_NOT_OK(UpsizeBuffer(new_capacity));

    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = old_entries[i];
      if (entry) {
        // Dummy compare function will not be called
        auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
                                   [](const Payload*) { return false; });
        // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an
        // empty slot is always returned
        assert(!p.second);
        entries_[p.first] = entry;
      }
    }
    capacity_ = new_capacity;
    capacity_mask_ = new_mask;

    return Status::OK();
  }

  hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }

  // The number of slots available in the hash table array.
  uint64_t capacity_;
  uint64_t capacity_mask_;
  // The number of used slots in the hash table array.
  uint64_t size_;

  Entry* entries_;
  TypedBufferBuilder<Entry> entries_builder_;
};

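// Usage sketch (illustrative only; ExampleIntPayload and the function name
// are hypothetical): a typical Lookup/Insert round trip.  Lookup returns the
// candidate slot whether or not the key was found, so a miss can be inserted
// in place without probing a second time.
struct ExampleIntPayload {
  int64_t value;
};

inline Status ExampleHashTableRoundTrip(MemoryPool* pool) {
  HashTable<ExampleIntPayload> table(pool, /*capacity=*/32);
  const int64_t key = 1234;
  const hash_t h = ScalarHelper<int64_t, 0>::ComputeHash(key);
  auto p = table.Lookup(h, [&](const ExampleIntPayload* payload) {
    // Equal hashes may still be a collision; compare the payload itself.
    return payload->value == key;
  });
  if (!p.second) {
    RETURN_NOT_OK(table.Insert(p.first, h, {key}));
  }
  return Status::OK();
}
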
// XXX typedef memo_index_t int32_t ?

constexpr int32_t kKeyNotFound = -1;

// ----------------------------------------------------------------------
// A base class for memoization tables.

class MemoTable {
 public:
  virtual ~MemoTable() = default;

  virtual int32_t size() const = 0;
};

// ----------------------------------------------------------------------
// A memoization table for memory-cheap scalar values.

// The memoization table remembers the insertion index of each key
// and allows looking it up later.

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class ScalarMemoTable : public MemoTable {
 public:
  explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
      : hash_table_(pool, static_cast<uint64_t>(entries)) {}

  int32_t Get(const Scalar& value) const {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index != kKeyNotFound) {
      on_found(memo_index);
    } else {
      null_index_ = memo_index = size();
      on_not_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table, plus one if a null value was
  // inserted (this is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, Scalar* out_data) const {
    hash_table_.VisitEntries([=](const HashTableEntry* entry) {
      int32_t index = entry->payload.memo_index - start;
      if (index >= 0) {
        out_data[index] = entry->payload.value;
      }
    });
    // Zero-initialize the null entry
    if (null_index_ != kKeyNotFound) {
      int32_t index = null_index_ - start;
      if (index >= 0) {
        out_data[index] = Scalar{};
      }
    }
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

 protected:
  struct Payload {
    Scalar value;
    int32_t memo_index;
  };

  using HashTableType = HashTableTemplateType<Payload>;
  using HashTableEntry = typename HashTableType::Entry;
  HashTableType hash_table_;
  int32_t null_index_ = kKeyNotFound;

  hash_t ComputeHash(const Scalar& value) const {
    return ScalarHelper<Scalar, 0>::ComputeHash(value);
  }

 public:
  // defined here so that `HashTableType` is visible
  // Merge entries from `other_table` into `this->hash_table_`.
  Status MergeTable(const ScalarMemoTable& other_table) {
    const HashTableType& other_hashtable = other_table.hash_table_;

    other_hashtable.VisitEntries([this](const HashTableEntry* other_entry) {
      int32_t unused;
      DCHECK_OK(this->GetOrInsert(other_entry->payload.value, &unused));
    });
    // TODO: ARROW-17074 - implement proper error handling
    return Status::OK();
  }
};

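// Usage sketch (illustrative only; the function name and values are
// assumptions): keys receive dense memo indices in insertion order, which is
// the order later recovered by CopyValues().
inline Status ExampleScalarMemoization(MemoryPool* pool) {
  ScalarMemoTable<int32_t> table(pool);
  int32_t index = -1;
  RETURN_NOT_OK(table.GetOrInsert(7, &index));   // new key: index == 0
  RETURN_NOT_OK(table.GetOrInsert(42, &index));  // new key: index == 1
  RETURN_NOT_OK(table.GetOrInsert(7, &index));   // already present: index == 0
  std::vector<int32_t> dictionary(table.size());
  table.CopyValues(dictionary.data());  // dictionary == {7, 42}
  return Status::OK();
}
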
// ----------------------------------------------------------------------
// A memoization table for small scalar values, using direct indexing

template <typename Scalar, typename Enable = void>
struct SmallScalarTraits {};

template <>
struct SmallScalarTraits<bool> {
  static constexpr int32_t cardinality = 2;

  static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
};

template <typename Scalar>
struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
  using Unsigned = typename std::make_unsigned<Scalar>::type;

  static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();

  static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
};

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class SmallScalarMemoTable : public MemoTable {
 public:
  explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
    std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
    index_to_value_.reserve(cardinality);
  }

  int32_t Get(const Scalar value) const {
    auto value_index = AsIndex(value);
    return value_to_index_[value_index];
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto value_index = AsIndex(value);
    auto memo_index = value_to_index_[value_index];
    if (memo_index == kKeyNotFound) {
      memo_index = static_cast<int32_t>(index_to_value_.size());
      index_to_value_.push_back(value);
      value_to_index_[value_index] = memo_index;
      DCHECK_LT(memo_index, cardinality + 1);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  int32_t GetNull() const { return value_to_index_[cardinality]; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    auto memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = value_to_index_[cardinality] = size();
      index_to_value_.push_back(0);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index)
  int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }

  // Merge entries from `other_table` into `this`.
  Status MergeTable(const SmallScalarMemoTable& other_table) {
    for (const Scalar& other_val : other_table.index_to_value_) {
      int32_t unused;
      RETURN_NOT_OK(this->GetOrInsert(other_val, &unused));
    }
    return Status::OK();
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, Scalar* out_data) const {
    DCHECK_GE(start, 0);
    DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
    // Pointer arithmetic on Scalar* already scales by sizeof(Scalar),
    // so index by element rather than by byte.
    memcpy(out_data, index_to_value_.data() + start, (size() - start) * sizeof(Scalar));
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

  const std::vector<Scalar>& values() const { return index_to_value_; }

 protected:
  static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
  static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");

  uint32_t AsIndex(Scalar value) const {
    return SmallScalarTraits<Scalar>::AsIndex(value);
  }

  // The last index is reserved for the null element.
  int32_t value_to_index_[cardinality + 1];
  std::vector<Scalar> index_to_value_;
};

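// Usage sketch (illustrative only; the function name is hypothetical): for
// uint8_t keys the table is a plain 257-slot array (256 possible values plus
// the reserved null slot), so lookups are a single array load with no hashing
// involved.
inline Status ExampleSmallScalarMemoization(MemoryPool* pool) {
  SmallScalarMemoTable<uint8_t> table(pool);
  int32_t index = -1;
  RETURN_NOT_OK(table.GetOrInsert(uint8_t{200}, &index));  // index == 0
  const int32_t found = table.Get(uint8_t{200});           // found == 0
  const int32_t missing = table.Get(uint8_t{3});           // missing == kKeyNotFound
  ARROW_UNUSED(found);
  ARROW_UNUSED(missing);
  return Status::OK();
}
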
// ----------------------------------------------------------------------
// A memoization table for variable-sized binary data.

template <typename BinaryBuilderT>
class BinaryMemoTable : public MemoTable {
 public:
  using builder_offset_type = typename BinaryBuilderT::offset_type;
  explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
                           int64_t values_size = -1)
      : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
    const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
    DCHECK_OK(binary_builder_.Resize(entries));
    DCHECK_OK(binary_builder_.ReserveData(data_size));
  }

  int32_t Get(const void* data, builder_offset_type length) const {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  int32_t Get(std::string_view value) const {
    return Get(value.data(), static_cast<builder_offset_type>(value.length()));
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
                     Func2&& on_not_found, int32_t* out_memo_index) {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      // Insert string value
      RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length));
      // Insert hash entry
      RETURN_NOT_OK(
          hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));

      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(std::string_view value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
                       out_memo_index);
  }

  Status GetOrInsert(const void* data, builder_offset_type length,
                     int32_t* out_memo_index) {
    return GetOrInsert(
        data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  Status GetOrInsert(std::string_view value, int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       out_memo_index);
  }

  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = null_index_ = size();
      DCHECK_OK(binary_builder_.AppendNull());
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index)
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
  }

  int64_t values_size() const { return binary_builder_.value_data_length(); }

  // Copy the (n + 1) offsets for the n values starting at index `start`
  // into `out_data`
  template <class Offset>
  void CopyOffsets(int32_t start, Offset* out_data) const {
    DCHECK_LE(start, size());

    const builder_offset_type* offsets = binary_builder_.offsets_data();
    const builder_offset_type delta =
        start < binary_builder_.length() ? offsets[start] : 0;
    for (int32_t i = start; i < size(); ++i) {
      const builder_offset_type adjusted_offset = offsets[i] - delta;
      Offset cast_offset = static_cast<Offset>(adjusted_offset);
      assert(static_cast<builder_offset_type>(cast_offset) ==
             adjusted_offset);  // avoid truncation
      *out_data++ = cast_offset;
    }

    // Copy the last offset manually, since BinaryBuilder only materializes
    // it in Finish()
    *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
  }

  template <class Offset>
  void CopyOffsets(Offset* out_data) const {
    CopyOffsets(0, out_data);
  }

  // Copy values starting from index `start` into `out_data`
  void CopyValues(int32_t start, uint8_t* out_data) const {
    CopyValues(start, -1, out_data);
  }

  // Same as above, but check output size in debug mode
  void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
    DCHECK_LE(start, size());

    // The absolute byte offset of the `start` value in the binary buffer.
    const builder_offset_type offset = binary_builder_.offset(start);
    const auto length = binary_builder_.value_data_length() - static_cast<size_t>(offset);

    if (out_size != -1) {
      assert(static_cast<int64_t>(length) <= out_size);
    }

    auto view = binary_builder_.GetView(start);
    memcpy(out_data, view.data(), length);
  }

  void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }

  void CopyValues(int64_t out_size, uint8_t* out_data) const {
    CopyValues(0, out_size, out_data);
  }

  void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
                            uint8_t* out_data) const {
    // This method exists to cope with the fact that the BinaryMemoTable does
    // not know the fixed width when inserting the null value.  The data
    // buffer holds a zero-length string for the null value (if present).
    //
    // Thus, this method injects a zero-filled value of the proper width
    // into the output buffer.
    //
    if (start >= size()) {
      return;
    }

    int32_t null_index = GetNull();
    if (null_index < start) {
      // Nothing to skip, proceed as usual.
      CopyValues(start, out_size, out_data);
      return;
    }

    builder_offset_type left_offset = binary_builder_.offset(start);

    // Ensure that the data length is exactly width_size bytes short of the
    // expected output size (n_values * width_size); the missing bytes are
    // the null entry's slot.
#ifndef NDEBUG
    int64_t data_length = values_size() - static_cast<size_t>(left_offset);
    assert(data_length + width_size == out_size);
    ARROW_UNUSED(data_length);
#endif

    auto in_data = binary_builder_.value_data() + left_offset;
    // The null value occupies zero bytes in the data buffer; copy the data
    // in two parts and skip width_size bytes in out_data:
    // [part_1][width_size][part_2]
    auto null_data_offset = binary_builder_.offset(null_index);
    auto left_size = null_data_offset - left_offset;
    if (left_size > 0) {
      // `in_data` already accounts for `left_offset`
      memcpy(out_data, in_data, left_size);
    }
    // Zero-initialize the null entry
    memset(out_data + left_size, 0, width_size);

    auto right_size = values_size() - static_cast<size_t>(null_data_offset);
    if (right_size > 0) {
      // Skip the null fixed-size value; part_2 starts `left_size` bytes
      // into `in_data`.
      auto out_offset = left_size + width_size;
      assert(out_data + out_offset + right_size == out_data + out_size);
      memcpy(out_data + out_offset, in_data + left_size, right_size);
    }
  }

  // Visit the stored values in insertion order.
  // The visitor function should have the signature `void(std::string_view)`
  // or `void(const std::string_view&)`.
  template <typename VisitFunc>
  void VisitValues(int32_t start, VisitFunc&& visit) const {
    for (int32_t i = start; i < size(); ++i) {
      visit(binary_builder_.GetView(i));
    }
  }

  // Visit the stored value at a specific index in insertion order.
  // The visitor function should have the signature `void(std::string_view)`
  // or `void(const std::string_view&)`.
  template <typename VisitFunc>
  void VisitValue(int32_t idx, VisitFunc&& visit) const {
    visit(binary_builder_.GetView(idx));
  }

 protected:
  struct Payload {
    int32_t memo_index;
  };

  using HashTableType = HashTable<Payload>;
  using HashTableEntry = typename HashTable<Payload>::Entry;
  HashTableType hash_table_;
  BinaryBuilderT binary_builder_;

  int32_t null_index_ = kKeyNotFound;

  std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
                                                builder_offset_type length) const {
    auto cmp_func = [&](const Payload* payload) {
      std::string_view lhs = binary_builder_.GetView(payload->memo_index);
      std::string_view rhs(static_cast<const char*>(data), length);
      return lhs == rhs;
    };
    return hash_table_.Lookup(h, cmp_func);
  }

 public:
  Status MergeTable(const BinaryMemoTable& other_table) {
    other_table.VisitValues(0, [this](std::string_view other_value) {
      int32_t unused;
      DCHECK_OK(this->GetOrInsert(other_value, &unused));
    });
    return Status::OK();
  }
};

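// Usage sketch (illustrative only; the function name is hypothetical):
// variable-length keys are deduplicated the same way, with the bytes stored
// in the internal builder in insertion order.
inline Status ExampleBinaryMemoization(MemoryPool* pool) {
  BinaryMemoTable<BinaryBuilder> table(pool);
  int32_t index = -1;
  RETURN_NOT_OK(table.GetOrInsert(std::string_view("foo"), &index));  // index == 0
  RETURN_NOT_OK(table.GetOrInsert(std::string_view("bar"), &index));  // index == 1
  RETURN_NOT_OK(table.GetOrInsert(std::string_view("foo"), &index));  // index == 0
  // n values yield n + 1 offsets; here: {0, 3, 6} for "foo" and "bar".
  std::vector<int32_t> offsets(static_cast<size_t>(table.size()) + 1);
  table.CopyOffsets(offsets.data());
  return Status::OK();
}
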
template <typename T, typename Enable = void>
struct HashTraits {};

template <>
struct HashTraits<BooleanType> {
  using MemoTableType = SmallScalarMemoTable<bool>;
};

template <typename T>
struct HashTraits<T, enable_if_8bit_int<T>> {
  using c_type = typename T::c_type;
  using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
  using c_type = typename T::c_type;
  using MemoTableType = ScalarMemoTable<c_type, HashTable>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
                                 !std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_decimal<T>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
};

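// Illustrative sketch (the function name is hypothetical): HashTraits maps an
// Arrow type to the memo table implementation used for it, so code such as
// dictionary-encoding kernels can be written generically against
// HashTraits<T>::MemoTableType.
inline Status ExampleTraitsDispatch(MemoryPool* pool) {
  HashTraits<Int32Type>::MemoTableType int_table(pool);   // ScalarMemoTable<int32_t>
  HashTraits<StringType>::MemoTableType str_table(pool);  // BinaryMemoTable<BinaryBuilder>
  int32_t index = -1;
  RETURN_NOT_OK(int_table.GetOrInsert(11, &index));
  RETURN_NOT_OK(str_table.GetOrInsert(std::string_view("abc"), &index));
  return Status::OK();
}
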
template <typename MemoTableType>
static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
                                       int64_t start_offset, int64_t* null_count,
                                       std::shared_ptr<Buffer>* null_bitmap) {
  int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
  int64_t null_index = memo_table.GetNull();

  *null_count = 0;
  *null_bitmap = nullptr;

  if (null_index != kKeyNotFound && null_index >= start_offset) {
    null_index -= start_offset;
    *null_count = 1;
    ARROW_ASSIGN_OR_RAISE(*null_bitmap,
                          internal::BitmapAllButOne(pool, dict_length, null_index));
  }

  return Status::OK();
}

struct StringViewHash {
  // std::hash compatible hasher for use with std::unordered_*
  // (the std::hash specialization provided by nonstd constructs std::string
  // temporaries then invokes std::hash<std::string> against those)
  hash_t operator()(std::string_view value) const {
    return ComputeStringHash<0>(value.data(), static_cast<int64_t>(value.size()));
  }
};

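// Usage sketch (illustrative only; requires <unordered_set>, which this
// header does not itself include):
//
//   std::unordered_set<std::string_view, StringViewHash> seen;
//   seen.insert("foo");
//   const bool found = seen.count("foo") > 0;
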
}  // namespace internal
}  // namespace arrow