File indexing completed on 2025-08-28 08:27:02
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <algorithm>
0021 #include <cstdint>
0022 #include <limits>
0023 #include <memory>
0024
0025 #include "arrow/buffer.h"
0026 #include "arrow/status.h"
0027 #include "arrow/util/bit_util.h"
0028 #include "arrow/util/endian.h"
0029 #include "arrow/util/macros.h"
0030 #include "arrow/util/ubsan.h"
0031 #include "arrow/util/visibility.h"
0032
0033 namespace arrow {
0034 namespace internal {
0035 namespace detail {
0036
0037 inline uint64_t LoadWord(const uint8_t* bytes) {
0038 return bit_util::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
0039 }
0040
0041 inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
0042 if (shift == 0) {
0043 return current;
0044 }
0045 return (current >> shift) | (next << (64 - shift));
0046 }
0047
0048
0049
0050 template <typename T>
0051 constexpr T BitNot(T x) {
0052 return ~x;
0053 }
0054
0055 template <>
0056 constexpr bool BitNot(bool x) {
0057 return !x;
0058 }
0059
0060 struct BitBlockAnd {
0061 template <typename T>
0062 static constexpr T Call(T left, T right) {
0063 return left & right;
0064 }
0065 };
0066
0067 struct BitBlockAndNot {
0068 template <typename T>
0069 static constexpr T Call(T left, T right) {
0070 return left & BitNot(right);
0071 }
0072 };
0073
0074 struct BitBlockOr {
0075 template <typename T>
0076 static constexpr T Call(T left, T right) {
0077 return left | right;
0078 }
0079 };
0080
0081 struct BitBlockOrNot {
0082 template <typename T>
0083 static constexpr T Call(T left, T right) {
0084 return left | BitNot(right);
0085 }
0086 };
0087
0088 }
0089
0090
0091
0092 struct BitBlockCount {
0093 int16_t length;
0094 int16_t popcount;
0095
0096 bool NoneSet() const { return this->popcount == 0; }
0097 bool AllSet() const { return this->length == this->popcount; }
0098 };
0099
0100
0101
0102
0103 class ARROW_EXPORT BitBlockCounter {
0104 public:
0105 BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0106 : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
0107 bits_remaining_(length),
0108 offset_(start_offset % 8) {}
0109
0110
0111 static constexpr int64_t kWordBits = 64;
0112
0113
0114 static constexpr int64_t kFourWordsBits = kWordBits * 4;
0115
0116
0117
0118
0119
0120
0121 BitBlockCount NextFourWords() {
0122 using detail::LoadWord;
0123 using detail::ShiftWord;
0124
0125 if (!bits_remaining_) {
0126 return {0, 0};
0127 }
0128 int64_t total_popcount = 0;
0129 if (offset_ == 0) {
0130 if (bits_remaining_ < kFourWordsBits) {
0131 return GetBlockSlow(kFourWordsBits);
0132 }
0133 total_popcount += bit_util::PopCount(LoadWord(bitmap_));
0134 total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 8));
0135 total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 16));
0136 total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 24));
0137 } else {
0138
0139
0140 if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
0141 return GetBlockSlow(kFourWordsBits);
0142 }
0143 auto current = LoadWord(bitmap_);
0144 auto next = LoadWord(bitmap_ + 8);
0145 total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
0146 current = next;
0147 next = LoadWord(bitmap_ + 16);
0148 total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
0149 current = next;
0150 next = LoadWord(bitmap_ + 24);
0151 total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
0152 current = next;
0153 next = LoadWord(bitmap_ + 32);
0154 total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
0155 }
0156 bitmap_ += bit_util::BytesForBits(kFourWordsBits);
0157 bits_remaining_ -= kFourWordsBits;
0158 return {256, static_cast<int16_t>(total_popcount)};
0159 }
0160
0161
0162
0163
0164
0165
0166 BitBlockCount NextWord() {
0167 using detail::LoadWord;
0168 using detail::ShiftWord;
0169
0170 if (!bits_remaining_) {
0171 return {0, 0};
0172 }
0173 int64_t popcount = 0;
0174 if (offset_ == 0) {
0175 if (bits_remaining_ < kWordBits) {
0176 return GetBlockSlow(kWordBits);
0177 }
0178 popcount = bit_util::PopCount(LoadWord(bitmap_));
0179 } else {
0180
0181
0182 if (bits_remaining_ < 2 * kWordBits - offset_) {
0183 return GetBlockSlow(kWordBits);
0184 }
0185 popcount = bit_util::PopCount(
0186 ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
0187 }
0188 bitmap_ += kWordBits / 8;
0189 bits_remaining_ -= kWordBits;
0190 return {64, static_cast<int16_t>(popcount)};
0191 }
0192
0193 private:
0194
0195
0196 BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
0197
0198 const uint8_t* bitmap_;
0199 int64_t bits_remaining_;
0200 int64_t offset_;
0201 };
0202
0203
0204
0205
0206 class ARROW_EXPORT OptionalBitBlockCounter {
0207 public:
0208
0209 OptionalBitBlockCounter(const uint8_t* validity_bitmap, int64_t offset, int64_t length);
0210
0211
0212 OptionalBitBlockCounter(const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset,
0213 int64_t length);
0214
0215
0216
0217
0218 BitBlockCount NextBlock() {
0219 static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
0220 if (has_bitmap_) {
0221 BitBlockCount block = counter_.NextWord();
0222 position_ += block.length;
0223 return block;
0224 } else {
0225 int16_t block_size =
0226 static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
0227 position_ += block_size;
0228
0229 return {block_size, block_size};
0230 }
0231 }
0232
0233
0234
0235 BitBlockCount NextWord() {
0236 static constexpr int64_t kWordSize = 64;
0237 if (has_bitmap_) {
0238 BitBlockCount block = counter_.NextWord();
0239 position_ += block.length;
0240 return block;
0241 } else {
0242 int16_t block_size = static_cast<int16_t>(std::min(kWordSize, length_ - position_));
0243 position_ += block_size;
0244
0245 return {block_size, block_size};
0246 }
0247 }
0248
0249 private:
0250 const bool has_bitmap_;
0251 int64_t position_;
0252 int64_t length_;
0253 BitBlockCounter counter_;
0254 };
0255
0256
0257
0258
0259
0260 class ARROW_EXPORT BinaryBitBlockCounter {
0261 public:
0262 BinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
0263 const uint8_t* right_bitmap, int64_t right_offset, int64_t length)
0264 : left_bitmap_(util::MakeNonNull(left_bitmap) + left_offset / 8),
0265 left_offset_(left_offset % 8),
0266 right_bitmap_(util::MakeNonNull(right_bitmap) + right_offset / 8),
0267 right_offset_(right_offset % 8),
0268 bits_remaining_(length) {}
0269
0270
0271
0272
0273
0274
0275 BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
0276
0277
0278 BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
0279
0280
0281 BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
0282
0283
0284 BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
0285
0286 private:
0287 template <class Op>
0288 BitBlockCount NextWord() {
0289 using detail::LoadWord;
0290 using detail::ShiftWord;
0291
0292 if (!bits_remaining_) {
0293 return {0, 0};
0294 }
0295
0296
0297 constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
0298 const int64_t bits_required_to_use_words =
0299 std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
0300 right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
0301 if (bits_remaining_ < bits_required_to_use_words) {
0302 const int16_t run_length =
0303 static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
0304 int16_t popcount = 0;
0305 for (int64_t i = 0; i < run_length; ++i) {
0306 if (Op::Call(bit_util::GetBit(left_bitmap_, left_offset_ + i),
0307 bit_util::GetBit(right_bitmap_, right_offset_ + i))) {
0308 ++popcount;
0309 }
0310 }
0311
0312
0313 left_bitmap_ += run_length / 8;
0314 right_bitmap_ += run_length / 8;
0315 bits_remaining_ -= run_length;
0316 return {run_length, popcount};
0317 }
0318
0319 int64_t popcount = 0;
0320 if (left_offset_ == 0 && right_offset_ == 0) {
0321 popcount =
0322 bit_util::PopCount(Op::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
0323 } else {
0324 auto left_word =
0325 ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
0326 auto right_word =
0327 ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
0328 popcount = bit_util::PopCount(Op::Call(left_word, right_word));
0329 }
0330 left_bitmap_ += kWordBits / 8;
0331 right_bitmap_ += kWordBits / 8;
0332 bits_remaining_ -= kWordBits;
0333 return {64, static_cast<int16_t>(popcount)};
0334 }
0335
0336 const uint8_t* left_bitmap_;
0337 int64_t left_offset_;
0338 const uint8_t* right_bitmap_;
0339 int64_t right_offset_;
0340 int64_t bits_remaining_;
0341 };
0342
0343 class ARROW_EXPORT OptionalBinaryBitBlockCounter {
0344 public:
0345
0346 OptionalBinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
0347 const uint8_t* right_bitmap, int64_t right_offset,
0348 int64_t length);
0349
0350
0351 OptionalBinaryBitBlockCounter(const std::shared_ptr<Buffer>& left_bitmap,
0352 int64_t left_offset,
0353 const std::shared_ptr<Buffer>& right_bitmap,
0354 int64_t right_offset, int64_t length);
0355
0356 BitBlockCount NextAndBlock() {
0357 static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
0358 switch (has_bitmap_) {
0359 case HasBitmap::BOTH: {
0360 BitBlockCount block = binary_counter_.NextAndWord();
0361 position_ += block.length;
0362 return block;
0363 }
0364 case HasBitmap::ONE: {
0365 BitBlockCount block = unary_counter_.NextWord();
0366 position_ += block.length;
0367 return block;
0368 }
0369 case HasBitmap::NONE:
0370 default: {
0371 const int16_t block_size =
0372 static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
0373 position_ += block_size;
0374
0375 return {block_size, block_size};
0376 }
0377 }
0378 }
0379
0380 BitBlockCount NextOrNotBlock() {
0381 static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
0382 switch (has_bitmap_) {
0383 case HasBitmap::BOTH: {
0384 BitBlockCount block = binary_counter_.NextOrNotWord();
0385 position_ += block.length;
0386 return block;
0387 }
0388 case HasBitmap::ONE: {
0389 BitBlockCount block = unary_counter_.NextWord();
0390 position_ += block.length;
0391 return block;
0392 }
0393 case HasBitmap::NONE:
0394 default: {
0395 const int16_t block_size =
0396 static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
0397 position_ += block_size;
0398
0399 return {block_size, block_size};
0400 }
0401 }
0402 }
0403
0404 private:
0405 enum class HasBitmap : int { BOTH, ONE, NONE };
0406
0407 const HasBitmap has_bitmap_;
0408 int64_t position_;
0409 int64_t length_;
0410 BitBlockCounter unary_counter_;
0411 BinaryBitBlockCounter binary_counter_;
0412
0413 static HasBitmap HasBitmapFromBitmaps(bool has_left, bool has_right) {
0414 switch (static_cast<int>(has_left) + static_cast<int>(has_right)) {
0415 case 0:
0416 return HasBitmap::NONE;
0417 case 1:
0418 return HasBitmap::ONE;
0419 default:
0420 return HasBitmap::BOTH;
0421 }
0422 }
0423 };
0424
0425
0426
0427 template <typename VisitNotNull, typename VisitNull>
0428 static Status VisitBitBlocks(const uint8_t* bitmap, int64_t offset, int64_t length,
0429 VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
0430 internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
0431 int64_t position = 0;
0432 while (position < length) {
0433 internal::BitBlockCount block = bit_counter.NextBlock();
0434 if (block.AllSet()) {
0435 for (int64_t i = 0; i < block.length; ++i, ++position) {
0436 ARROW_RETURN_NOT_OK(visit_not_null(position));
0437 }
0438 } else if (block.NoneSet()) {
0439 for (int64_t i = 0; i < block.length; ++i, ++position) {
0440 ARROW_RETURN_NOT_OK(visit_null());
0441 }
0442 } else {
0443 for (int64_t i = 0; i < block.length; ++i, ++position) {
0444 if (bit_util::GetBit(bitmap, offset + position)) {
0445 ARROW_RETURN_NOT_OK(visit_not_null(position));
0446 } else {
0447 ARROW_RETURN_NOT_OK(visit_null());
0448 }
0449 }
0450 }
0451 }
0452 return Status::OK();
0453 }
0454
0455 template <typename VisitNotNull, typename VisitNull>
0456 static void VisitBitBlocksVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
0457 VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
0458 internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
0459 int64_t position = 0;
0460 while (position < length) {
0461 internal::BitBlockCount block = bit_counter.NextBlock();
0462 if (block.AllSet()) {
0463 for (int64_t i = 0; i < block.length; ++i, ++position) {
0464 visit_not_null(position);
0465 }
0466 } else if (block.NoneSet()) {
0467 for (int64_t i = 0; i < block.length; ++i, ++position) {
0468 visit_null();
0469 }
0470 } else {
0471 for (int64_t i = 0; i < block.length; ++i, ++position) {
0472 if (bit_util::GetBit(bitmap, offset + position)) {
0473 visit_not_null(position);
0474 } else {
0475 visit_null();
0476 }
0477 }
0478 }
0479 }
0480 }
0481
0482 template <typename VisitNotNull, typename VisitNull>
0483 static Status VisitTwoBitBlocks(const uint8_t* left_bitmap, int64_t left_offset,
0484 const uint8_t* right_bitmap, int64_t right_offset,
0485 int64_t length, VisitNotNull&& visit_not_null,
0486 VisitNull&& visit_null) {
0487 if (left_bitmap == NULLPTR || right_bitmap == NULLPTR) {
0488
0489 if (left_bitmap == NULLPTR) {
0490 return VisitBitBlocks(right_bitmap, right_offset, length,
0491 std::forward<VisitNotNull>(visit_not_null),
0492 std::forward<VisitNull>(visit_null));
0493 } else {
0494 return VisitBitBlocks(left_bitmap, left_offset, length,
0495 std::forward<VisitNotNull>(visit_not_null),
0496 std::forward<VisitNull>(visit_null));
0497 }
0498 }
0499 BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
0500 length);
0501 int64_t position = 0;
0502 while (position < length) {
0503 BitBlockCount block = bit_counter.NextAndWord();
0504 if (block.AllSet()) {
0505 for (int64_t i = 0; i < block.length; ++i, ++position) {
0506 ARROW_RETURN_NOT_OK(visit_not_null(position));
0507 }
0508 } else if (block.NoneSet()) {
0509 for (int64_t i = 0; i < block.length; ++i, ++position) {
0510 ARROW_RETURN_NOT_OK(visit_null());
0511 }
0512 } else {
0513 for (int64_t i = 0; i < block.length; ++i, ++position) {
0514 if (bit_util::GetBit(left_bitmap, left_offset + position) &&
0515 bit_util::GetBit(right_bitmap, right_offset + position)) {
0516 ARROW_RETURN_NOT_OK(visit_not_null(position));
0517 } else {
0518 ARROW_RETURN_NOT_OK(visit_null());
0519 }
0520 }
0521 }
0522 }
0523 return Status::OK();
0524 }
0525
0526 template <typename VisitNotNull, typename VisitNull>
0527 static void VisitTwoBitBlocksVoid(const uint8_t* left_bitmap, int64_t left_offset,
0528 const uint8_t* right_bitmap, int64_t right_offset,
0529 int64_t length, VisitNotNull&& visit_not_null,
0530 VisitNull&& visit_null) {
0531 if (left_bitmap == NULLPTR || right_bitmap == NULLPTR) {
0532
0533 if (left_bitmap == NULLPTR) {
0534 return VisitBitBlocksVoid(right_bitmap, right_offset, length,
0535 std::forward<VisitNotNull>(visit_not_null),
0536 std::forward<VisitNull>(visit_null));
0537 } else {
0538 return VisitBitBlocksVoid(left_bitmap, left_offset, length,
0539 std::forward<VisitNotNull>(visit_not_null),
0540 std::forward<VisitNull>(visit_null));
0541 }
0542 }
0543 BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
0544 length);
0545 int64_t position = 0;
0546 while (position < length) {
0547 BitBlockCount block = bit_counter.NextAndWord();
0548 if (block.AllSet()) {
0549 for (int64_t i = 0; i < block.length; ++i, ++position) {
0550 visit_not_null(position);
0551 }
0552 } else if (block.NoneSet()) {
0553 for (int64_t i = 0; i < block.length; ++i, ++position) {
0554 visit_null();
0555 }
0556 } else {
0557 for (int64_t i = 0; i < block.length; ++i, ++position) {
0558 if (bit_util::GetBit(left_bitmap, left_offset + position) &&
0559 bit_util::GetBit(right_bitmap, right_offset + position)) {
0560 visit_not_null(position);
0561 } else {
0562 visit_null();
0563 }
0564 }
0565 }
0566 }
0567 }
0568
0569 }
0570 }