#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>
#include <vector>

#include "arrow/array/builder_binary.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"

#define XXH_INLINE_ALL

#include "arrow/vendored/xxhash.h"  // IWYU pragma: keep

namespace arrow {
namespace internal {

typedef uint64_t hash_t;

// Forward declaration: hash a byte string of `length` bytes with hash
// algorithm variant `AlgNum` (0 or 1).
template <uint64_t AlgNum>
inline hash_t ComputeStringHash(const void* data, int64_t length);

// Hash `num_bits` bits of `bitmap` starting at `bits_offset`, combining the
// result with `seed`.
ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
                                      int64_t bits_offset, int64_t num_bits);

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
  static bool CompareScalars(Scalar u, Scalar v) { return u == v; }

  static hash_t ComputeHash(const Scalar& value) {
    // Generic hash computation for scalars: hash the value's bit
    // representation with the string hash.
    return ComputeStringHash<AlgNum>(&value, sizeof(value));
  }
};

template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // Faster hash computation for integers.

  static hash_t ComputeHash(const Scalar& value) {
    // Two of xxhash's prime multipliers (chosen for their bit dispersion
    // properties).
    static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
                                               14029467366897019727ULL};

    // Multiplying by the prime mixes the low bits into the high bits; the
    // byte swap (a single CPU instruction) then lets those mixed bits
    // participate in the masked hash table index.
    auto h = static_cast<hash_t>(value);
    return bit_util::ByteSwap(multipliers[AlgNum] * h);
  }
};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum,
                    enable_if_t<std::is_same<std::string_view, Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // Hash string views by hashing the underlying bytes.
  static hash_t ComputeHash(std::string_view value) {
    return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
  }
};

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
    : public ScalarHelperBase<Scalar, AlgNum> {
  // NaN != NaN under operator==, but for memoization purposes any NaN
  // should compare equal to any other NaN.
  static bool CompareScalars(Scalar u, Scalar v) {
    if (std::isnan(u)) {
      return std::isnan(v);
    }
    return u == v;
  }
};

template <uint64_t AlgNum = 0>
hash_t ComputeStringHash(const void* data, int64_t length) {
  if (ARROW_PREDICT_TRUE(length <= 16)) {
    // Specialize for small strings, as they are quite common as hash table keys.
    auto p = reinterpret_cast<const uint8_t*>(data);
    auto n = static_cast<uint32_t>(length);
    if (n <= 8) {
      if (n <= 3) {
        if (n == 0) {
          return 1U;
        }
        uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
        return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      }
      // 4 <= length <= 8: read the string as two overlapping 32-bit loads,
      // hash each with a different algorithm variant, then combine.
      uint32_t x, y;
      hash_t hx, hy;
      x = util::SafeLoadAs<uint32_t>(p + n - 4);
      y = util::SafeLoadAs<uint32_t>(p);
      hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
      hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
      return n ^ hx ^ hy;
    }
    // 9 <= length <= 16: same scheme with two overlapping 64-bit loads.
    uint64_t x, y;
    hash_t hx, hy;
    x = util::SafeLoadAs<uint64_t>(p + n - 8);
    y = util::SafeLoadAs<uint64_t>(p);
    hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
    hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
    return n ^ hx ^ hy;
  }

#if XXH3_SECRET_SIZE_MIN != 136
#error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
#endif

  // Hard-coded XXH3 secrets sharing the same memory area: the secret for
  // AlgNum == 1 is the AlgNum == 0 secret shifted by one byte, hence the
  // array is XXH3_SECRET_SIZE_MIN + 1 bytes long.
  static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
      0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f,
      0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24,
      0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26,
      0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75,
      0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce,
      0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3,
      0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42,
      0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1,
      0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5,
      0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};

  static_assert(AlgNum < 2, "AlgNum too large");
  static constexpr auto secret = kXxh3Secrets + AlgNum;
  return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
                                XXH3_SECRET_SIZE_MIN);
}
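
// Illustrative sketch (hypothetical caller code, not part of this header's
// declarations): hashing the same key with both algorithm variants, e.g. to
// derive two independent probe sequences.
//
//   std::string_view key = "arrow";
//   hash_t h0 = ComputeStringHash<0>(key.data(), static_cast<int64_t>(key.size()));
//   hash_t h1 = ComputeStringHash<1>(key.data(), static_cast<int64_t>(key.size()));
//   // h0 and h1 are independent 64-bit hashes of the same bytes.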

// An open-addressing insert-only hash table (no deletes).

template <typename Payload>
class HashTable {
 public:
  static constexpr hash_t kSentinel = 0ULL;
  static constexpr int64_t kLoadFactor = 2UL;

  struct Entry {
    hash_t h;
    Payload payload;

    // An entry is valid if its hash is different from the sentinel value.
    operator bool() const { return h != kSentinel; }
  };

  HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
    DCHECK_NE(pool, nullptr);
    // Minimum of 32 slots, rounded up to a power of two.
    capacity = std::max<uint64_t>(capacity, 32UL);
    capacity_ = bit_util::NextPower2(capacity);
    capacity_mask_ = capacity_ - 1;
    size_ = 0;

    DCHECK_OK(UpsizeBuffer(capacity_));
  }

  // Lookup with non-linear probing.  `cmp_func` should have signature
  // bool(const Payload*).  Returns an (Entry*, found) pair.
  template <typename CmpFunc>
  std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  template <typename CmpFunc>
  std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
                                        std::forward<CmpFunc>(cmp_func));
    return {&entries_[p.first], p.second};
  }

  Status Insert(Entry* entry, hash_t h, const Payload& payload) {
    // The entry must be empty before inserting.
    assert(!*entry);
    entry->h = FixHash(h);
    entry->payload = payload;
    ++size_;

    if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
      // Resize less frequently since it is expensive.
      return Upsize(capacity_ * kLoadFactor * 2);
    }
    return Status::OK();
  }

  uint64_t size() const { return size_; }

  // Visit all non-empty entries in the table.
  // The visit_func should have signature void(const Entry*).
  template <typename VisitFunc>
  void VisitEntries(VisitFunc&& visit_func) const {
    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = entries_[i];
      if (entry) {
        visit_func(&entry);
      }
    }
  }

 protected:
  // NoCompare is for when the value is known not to exist in the table.
  enum CompareKind { DoCompare, NoCompare };

  // The workhorse lookup function.
  template <CompareKind CKind, typename CmpFunc>
  std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
                                   CmpFunc&& cmp_func) const {
    static constexpr uint8_t perturb_shift = 5;

    uint64_t index, perturb;
    const Entry* entry;

    h = FixHash(h);
    index = h & size_mask;
    perturb = (h >> perturb_shift) + 1U;

    while (true) {
      entry = &entries[index];
      if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) {
        // Found
        return {index, true};
      }
      if (entry->h == kSentinel) {
        // Empty slot
        return {index, false};
      }

      // Perturbation logic: mix in the upper hash bits so that the probe
      // sequence eventually depends on all bits of the hash, which limits
      // clustering.
      index = (index + perturb) & size_mask;
      perturb = (perturb >> perturb_shift) + 1U;
    }
  }

  template <CompareKind CKind, typename CmpFunc>
  bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
    if (CKind == NoCompare) {
      return false;
    } else {
      return entry->h == h && cmp_func(&entry->payload);
    }
  }

  bool NeedUpsizing() const {
    // Keep the load factor <= 1/2.
    return size_ * kLoadFactor >= capacity_;
  }

  Status UpsizeBuffer(uint64_t capacity) {
    RETURN_NOT_OK(entries_builder_.Resize(capacity));
    entries_ = entries_builder_.mutable_data();
    memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));

    return Status::OK();
  }

  Status Upsize(uint64_t new_capacity) {
    assert(new_capacity > capacity_);
    uint64_t new_mask = new_capacity - 1;
    assert((new_capacity & new_mask) == 0);  // must be a power of two

    // Stash the old entries (kept alive by `previous`) while the builder is
    // resized, then reinsert them into the larger table.
    const Entry* old_entries = entries_;
    ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));

    RETURN_NOT_OK(UpsizeBuffer(new_capacity));

    for (uint64_t i = 0; i < capacity_; i++) {
      const auto& entry = old_entries[i];
      if (entry) {
        // Dummy compare function (will not be called).
        auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
                                   [](const Payload*) { return false; });
        // Lookup<NoCompare> never reports a match, only an empty slot.
        assert(!p.second);
        entries_[p.first] = entry;
      }
    }
    capacity_ = new_capacity;
    capacity_mask_ = new_mask;

    return Status::OK();
  }

  hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }

  // The number of slots in the hash table array (always a power of two).
  uint64_t capacity_;
  uint64_t capacity_mask_;
  // The number of occupied slots.
  uint64_t size_;

  Entry* entries_;
  TypedBufferBuilder<Entry> entries_builder_;
};
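
// Illustrative sketch (hypothetical caller code) of the lookup-then-insert
// protocol the memo tables below follow; `MyPayload` and `PayloadEquals` are
// placeholder names, not part of this header.
//
//   HashTable<MyPayload> table(default_memory_pool(), /*capacity=*/32);
//   hash_t h = ComputeStringHash<0>(data, length);
//   auto p = table.Lookup(h, [&](const MyPayload* payload) {
//     return PayloadEquals(*payload, data, length);
//   });
//   if (!p.second) {
//     // Not found: p.first points at the empty slot where the entry belongs.
//     RETURN_NOT_OK(table.Insert(p.first, h, MyPayload{/*...*/}));
//   }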

// Sentinel index returned by the memo tables when a key is not found.
constexpr int32_t kKeyNotFound = -1;

// Common type-erased base class for the memo table implementations below.
class MemoTable {
 public:
  virtual ~MemoTable() = default;

  virtual int32_t size() const = 0;
};

// A memoization table for memory-cheap scalar values.
// The table remembers the insertion index (memo index) of each distinct key
// and allows looking it up later.

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class ScalarMemoTable : public MemoTable {
 public:
  explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
      : hash_table_(pool, static_cast<uint64_t>(entries)) {}

  // Return the memo index of `value`, or kKeyNotFound if it has not been
  // inserted.
  int32_t Get(const Scalar& value) const {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  // Look up `value`, inserting it if absent.  Exactly one of `on_found` /
  // `on_not_found` is called with the memo index, which is also written to
  // `out_memo_index`.
  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto cmp_func = [value](const Payload* payload) -> bool {
      return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
    };
    hash_t h = ComputeHash(value);
    auto p = hash_table_.Lookup(h, cmp_func);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  // The memo index of the null value, or kKeyNotFound if null has not been
  // inserted.
  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index != kKeyNotFound) {
      on_found(memo_index);
    } else {
      null_index_ = memo_index = size();
      on_not_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table, including the null entry if
  // inserted (which is also 1 + the largest memo index).
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
  }

  // Copy values starting from index `start` into `out_data`.
  void CopyValues(int32_t start, Scalar* out_data) const {
    hash_table_.VisitEntries([=](const HashTableEntry* entry) {
      int32_t index = entry->payload.memo_index - start;
      if (index >= 0) {
        out_data[index] = entry->payload.value;
      }
    });
    // Zero-initialize the null entry, if any.
    if (null_index_ != kKeyNotFound) {
      int32_t index = null_index_ - start;
      if (index >= 0) {
        out_data[index] = Scalar{};
      }
    }
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

 protected:
  struct Payload {
    Scalar value;
    int32_t memo_index;
  };

  using HashTableType = HashTableTemplateType<Payload>;
  using HashTableEntry = typename HashTableType::Entry;
  HashTableType hash_table_;
  int32_t null_index_ = kKeyNotFound;

  hash_t ComputeHash(const Scalar& value) const {
    return ScalarHelper<Scalar, 0>::ComputeHash(value);
  }

 public:
  // Merge the entries of `other_table` into this table.
  Status MergeTable(const ScalarMemoTable& other_table) {
    const HashTableType& other_hashtable = other_table.hash_table_;

    other_hashtable.VisitEntries([this](const HashTableEntry* other_entry) {
      int32_t unused;
      DCHECK_OK(this->GetOrInsert(other_entry->payload.value, &unused));
    });
    return Status::OK();
  }
};
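
// Illustrative sketch (hypothetical caller code, inside a Status-returning
// function): memoizing int64 keys into dense dictionary indices.
//
//   ScalarMemoTable<int64_t> memo(default_memory_pool());
//   int32_t index;
//   RETURN_NOT_OK(memo.GetOrInsert(int64_t{42}, &index));  // index == 0
//   RETURN_NOT_OK(memo.GetOrInsert(int64_t{7}, &index));   // index == 1
//   RETURN_NOT_OK(memo.GetOrInsert(int64_t{42}, &index));  // index == 0 again
//   std::vector<int64_t> dictionary(memo.size());
//   memo.CopyValues(dictionary.data());                    // {42, 7}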

// A memoization table for small scalar values, using direct indexing.

template <typename Scalar, typename Enable = void>
struct SmallScalarTraits {};

template <>
struct SmallScalarTraits<bool> {
  static constexpr int32_t cardinality = 2;

  static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
};

template <typename Scalar>
struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
  using Unsigned = typename std::make_unsigned<Scalar>::type;

  static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();

  static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
};

template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
class SmallScalarMemoTable : public MemoTable {
 public:
  explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
    std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
    index_to_value_.reserve(cardinality);
  }

  int32_t Get(const Scalar value) const {
    auto value_index = AsIndex(value);
    return value_to_index_[value_index];
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    auto value_index = AsIndex(value);
    auto memo_index = value_to_index_[value_index];
    if (memo_index == kKeyNotFound) {
      memo_index = static_cast<int32_t>(index_to_value_.size());
      index_to_value_.push_back(value);
      value_to_index_[value_index] = memo_index;
      DCHECK_LT(memo_index, cardinality + 1);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
    return GetOrInsert(
        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  // The memo index of the null value, or kKeyNotFound if null has not been
  // inserted (the last slot of value_to_index_ is reserved for it).
  int32_t GetNull() const { return value_to_index_[cardinality]; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    auto memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = value_to_index_[cardinality] = size();
      index_to_value_.push_back(0);
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table
  // (which is also 1 + the largest memo index).
  int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }

  // Merge the entries of `other_table` into this table.
  Status MergeTable(const SmallScalarMemoTable& other_table) {
    for (const Scalar& other_val : other_table.index_to_value_) {
      int32_t unused;
      RETURN_NOT_OK(this->GetOrInsert(other_val, &unused));
    }
    return Status::OK();
  }

  // Copy values starting from index `start` into `out_data`.  The byte offset
  // below equals the element offset because sizeof(Scalar) == 1 for all
  // supported small scalar types (see the cardinality static_assert).
  void CopyValues(int32_t start, Scalar* out_data) const {
    DCHECK_GE(start, 0);
    DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
    int64_t offset = start * static_cast<int32_t>(sizeof(Scalar));
    memcpy(out_data, index_to_value_.data() + offset, (size() - start) * sizeof(Scalar));
  }

  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }

  const std::vector<Scalar>& values() const { return index_to_value_; }

 protected:
  static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
  static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");

  uint32_t AsIndex(Scalar value) const {
    return SmallScalarTraits<Scalar>::AsIndex(value);
  }

  // One slot per possible value, plus one extra slot for the null value.
  int32_t value_to_index_[cardinality + 1];
  std::vector<Scalar> index_to_value_;
};
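
// Illustrative sketch (hypothetical caller code): for 8-bit types the memo
// table is a direct-addressed array, so Get() is a single array access.
//
//   SmallScalarMemoTable<uint8_t> memo(default_memory_pool());
//   int32_t index;
//   RETURN_NOT_OK(memo.GetOrInsert(uint8_t{3}, &index));  // index == 0
//   RETURN_NOT_OK(memo.GetOrInsert(uint8_t{9}, &index));  // index == 1
//   memo.Get(uint8_t{3});                                 // == 0
//   memo.Get(uint8_t{250});                               // == kKeyNotFound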

// A memoization table for variable-sized binary data.

template <typename BinaryBuilderT>
class BinaryMemoTable : public MemoTable {
 public:
  using builder_offset_type = typename BinaryBuilderT::offset_type;

  explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
                           int64_t values_size = -1)
      : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
    // If no estimate of the total value bytes is given, assume ~4 bytes per entry.
    const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
    DCHECK_OK(binary_builder_.Resize(entries));
    DCHECK_OK(binary_builder_.ReserveData(data_size));
  }

  // Return the memo index of the value pointed to by `data`, or kKeyNotFound
  // if it has not been inserted.
  int32_t Get(const void* data, builder_offset_type length) const {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    if (p.second) {
      return p.first->payload.memo_index;
    } else {
      return kKeyNotFound;
    }
  }

  int32_t Get(std::string_view value) const {
    return Get(value.data(), static_cast<builder_offset_type>(value.length()));
  }

  // Look up the value, inserting it if absent.  Exactly one of `on_found` /
  // `on_not_found` is called with the memo index, which is also written to
  // `out_memo_index`.
  template <typename Func1, typename Func2>
  Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
                     Func2&& on_not_found, int32_t* out_memo_index) {
    hash_t h = ComputeStringHash<0>(data, length);
    auto p = Lookup(h, data, length);
    int32_t memo_index;
    if (p.second) {
      memo_index = p.first->payload.memo_index;
      on_found(memo_index);
    } else {
      memo_index = size();
      // Append the value bytes...
      RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length));
      // ...then record the hash table entry.
      RETURN_NOT_OK(
          hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));

      on_not_found(memo_index);
    }
    *out_memo_index = memo_index;
    return Status::OK();
  }

  template <typename Func1, typename Func2>
  Status GetOrInsert(std::string_view value, Func1&& on_found, Func2&& on_not_found,
                     int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
                       out_memo_index);
  }

  Status GetOrInsert(const void* data, builder_offset_type length,
                     int32_t* out_memo_index) {
    return GetOrInsert(
        data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
  }

  Status GetOrInsert(std::string_view value, int32_t* out_memo_index) {
    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
                       out_memo_index);
  }

  // The memo index of the null value, or kKeyNotFound if null has not been
  // inserted.
  int32_t GetNull() const { return null_index_; }

  template <typename Func1, typename Func2>
  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
    int32_t memo_index = GetNull();
    if (memo_index == kKeyNotFound) {
      memo_index = null_index_ = size();
      DCHECK_OK(binary_builder_.AppendNull());
      on_not_found(memo_index);
    } else {
      on_found(memo_index);
    }
    return memo_index;
  }

  int32_t GetOrInsertNull() {
    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
  }

  // The number of entries in the memo table, including the null entry if
  // inserted (which is also 1 + the largest memo index).
  int32_t size() const override {
    return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
  }

  // Total number of value bytes stored so far.
  int64_t values_size() const { return binary_builder_.value_data_length(); }

  // Copy (size() - start + 1) offsets, rebased on the offset of `start`, into
  // `out_data`; the final offset is the total size of the copied values.
  template <class Offset>
  void CopyOffsets(int32_t start, Offset* out_data) const {
    DCHECK_LE(start, size());

    const builder_offset_type* offsets = binary_builder_.offsets_data();
    const builder_offset_type delta =
        start < binary_builder_.length() ? offsets[start] : 0;
    for (int32_t i = start; i < size(); ++i) {
      const builder_offset_type adjusted_offset = offsets[i] - delta;
      Offset cast_offset = static_cast<Offset>(adjusted_offset);
      assert(static_cast<builder_offset_type>(cast_offset) ==
             adjusted_offset);  // avoid truncation
      *out_data++ = cast_offset;
    }

    // Write the final offset.
    *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
  }

  template <class Offset>
  void CopyOffsets(Offset* out_data) const {
    CopyOffsets(0, out_data);
  }

  // Copy the value bytes starting from index `start` into `out_data`.
  void CopyValues(int32_t start, uint8_t* out_data) const {
    CopyValues(start, -1, out_data);
  }

  // Same as above, but checks the output size in debug mode.
  void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
    DCHECK_LE(start, size());

    // The absolute byte offset of the `start` value in the value buffer.
    const builder_offset_type offset = binary_builder_.offset(start);
    const auto length =
        binary_builder_.value_data_length() - static_cast<size_t>(offset);

    if (out_size != -1) {
      assert(static_cast<int64_t>(length) <= out_size);
    }

    auto view = binary_builder_.GetView(start);
    memcpy(out_data, view.data(), length);
  }

  void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }

  void CopyValues(int64_t out_size, uint8_t* out_data) const {
    CopyValues(0, out_size, out_data);
  }

  void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
                            uint8_t* out_data) const {
    // This method exists because the BinaryMemoTable does not know the fixed
    // width when the null value is inserted: the value buffer only holds a
    // zero-length string for it.  When copying out, inject a zeroed value of
    // the proper width at the null entry's position.
    if (start >= size()) {
      return;
    }

    int32_t null_index = GetNull();
    if (null_index < start) {
      // Nothing to skip, proceed as usual.
      CopyValues(start, out_size, out_data);
      return;
    }

    builder_offset_type left_offset = binary_builder_.offset(start);

    // Ensure that the data length is exactly missing width_size bytes to fit
    // in the expected output (n_values * width_size).
#ifndef NDEBUG
    int64_t data_length = values_size() - static_cast<size_t>(left_offset);
    assert(data_length + width_size == out_size);
    ARROW_UNUSED(data_length);
#endif

    auto in_data = binary_builder_.value_data() + left_offset;
    // The null value occupies 0 bytes in the data buffer: copy the data in
    // two slices and leave a `width_size`-byte gap in `out_data` in between.
    auto null_data_offset = binary_builder_.offset(null_index);
    auto left_size = null_data_offset - left_offset;
    if (left_size > 0) {
      memcpy(out_data, in_data + left_offset, left_size);
    }
    // Zero-initialize the null entry.
    memset(out_data + left_size, 0, width_size);

    auto right_size = values_size() - static_cast<size_t>(null_data_offset);
    if (right_size > 0) {
      // Skip the null fixed-size value.
      auto out_offset = left_size + width_size;
      assert(out_data + out_offset + right_size == out_data + out_size);
      memcpy(out_data + out_offset, in_data + null_data_offset, right_size);
    }
  }

  // Visit the stored values in insertion order, starting at index `start`.
  // The visitor should have signature void(std::string_view).
  template <typename VisitFunc>
  void VisitValues(int32_t start, VisitFunc&& visit) const {
    for (int32_t i = start; i < size(); ++i) {
      visit(binary_builder_.GetView(i));
    }
  }

  // Visit the stored value at the given memo index.
  // The visitor should have signature void(std::string_view).
  template <typename VisitFunc>
  void VisitValue(int32_t idx, VisitFunc&& visit) const {
    visit(binary_builder_.GetView(idx));
  }

 protected:
  struct Payload {
    int32_t memo_index;
  };

  using HashTableType = HashTable<Payload>;
  using HashTableEntry = typename HashTable<Payload>::Entry;
  HashTableType hash_table_;
  BinaryBuilderT binary_builder_;

  int32_t null_index_ = kKeyNotFound;

  std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
                                                builder_offset_type length) const {
    auto cmp_func = [&](const Payload* payload) {
      std::string_view lhs = binary_builder_.GetView(payload->memo_index);
      std::string_view rhs(static_cast<const char*>(data), length);
      return lhs == rhs;
    };
    return hash_table_.Lookup(h, cmp_func);
  }

 public:
  // Merge the entries of `other_table` into this table.
  Status MergeTable(const BinaryMemoTable& other_table) {
    other_table.VisitValues(0, [this](std::string_view other_value) {
      int32_t unused;
      DCHECK_OK(this->GetOrInsert(other_value, &unused));
    });
    return Status::OK();
  }
};
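
// Illustrative sketch (hypothetical caller code, inside a Status-returning
// function): memoizing strings, then materializing the dictionary offsets
// and value bytes.
//
//   BinaryMemoTable<BinaryBuilder> memo(default_memory_pool());
//   int32_t index;
//   RETURN_NOT_OK(memo.GetOrInsert(std::string_view("foo"), &index));  // 0
//   RETURN_NOT_OK(memo.GetOrInsert(std::string_view("bar"), &index));  // 1
//   std::vector<int32_t> offsets(memo.size() + 1);
//   memo.CopyOffsets(offsets.data());                  // {0, 3, 6}
//   std::vector<uint8_t> values(memo.values_size());
//   memo.CopyValues(values.data());                    // "foobar"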

// Map a concrete Arrow type to the memo table implementation used to hash it.
template <typename T, typename Enable = void>
struct HashTraits {};

template <>
struct HashTraits<BooleanType> {
  using MemoTableType = SmallScalarMemoTable<bool>;
};

template <typename T>
struct HashTraits<T, enable_if_8bit_int<T>> {
  using c_type = typename T::c_type;
  using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
  using c_type = typename T::c_type;
  using MemoTableType = ScalarMemoTable<c_type, HashTable>;
};

template <typename T>
struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
                                 !std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_decimal<T>> {
  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};

template <typename T>
struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
  using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
};

// Compute the validity bitmap for a dictionary built from `memo_table`,
// marking only the (optional) null entry as invalid.
template <typename MemoTableType>
static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
                                       int64_t start_offset, int64_t* null_count,
                                       std::shared_ptr<Buffer>* null_bitmap) {
  int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
  int64_t null_index = memo_table.GetNull();

  *null_count = 0;
  *null_bitmap = nullptr;

  if (null_index != kKeyNotFound && null_index >= start_offset) {
    null_index -= start_offset;
    *null_count = 1;
    ARROW_ASSIGN_OR_RAISE(*null_bitmap,
                          internal::BitmapAllButOne(pool, dict_length, null_index));
  }

  return Status::OK();
}

struct StringViewHash {
  // std::hash-compatible hasher for std::string_view keys, for use with
  // std::unordered_* containers.
  hash_t operator()(std::string_view value) const {
    return ComputeStringHash<0>(value.data(), static_cast<int64_t>(value.size()));
  }
};
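
// Illustrative sketch (hypothetical caller code, requires <unordered_map>):
// StringViewHash can serve as the hasher of standard unordered containers
// keyed by std::string_view.
//
//   std::unordered_map<std::string_view, int32_t, StringViewHash> first_seen;
//   first_seen.emplace("foo", 0);
//   bool present = first_seen.count("foo") > 0;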

}  // namespace internal
}  // namespace arrow