File indexing completed on 2025-08-28 08:27:03
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <cassert>
0021 #include <cstdint>
0022 #include <cstring>
0023 #include <string>
0024
0025 #include "arrow/util/bit_util.h"
0026 #include "arrow/util/bitmap_reader.h"
0027 #include "arrow/util/endian.h"
0028 #include "arrow/util/macros.h"
0029 #include "arrow/util/visibility.h"
0030
0031 namespace arrow {
0032 namespace internal {
0033
0034 struct BitRun {
0035 int64_t length;
0036
0037 bool set;
0038
0039 std::string ToString() const {
0040 return std::string("{Length: ") + std::to_string(length) +
0041 ", set=" + std::to_string(set) + "}";
0042 }
0043 };
0044
0045 inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
0046 return lhs.length == rhs.length && lhs.set == rhs.set;
0047 }
0048
0049 inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
0050 return lhs.length != rhs.length || lhs.set != rhs.set;
0051 }
0052
0053 class BitRunReaderLinear {
0054 public:
0055 BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0056 : reader_(bitmap, start_offset, length) {}
0057
0058 BitRun NextRun() {
0059 BitRun rl = {0, reader_.IsSet()};
0060
0061 while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) {
0062 rl.length++;
0063 reader_.Next();
0064 }
0065 return rl;
0066 }
0067
0068 private:
0069 BitmapReader reader_;
0070 };
0071
0072 #if ARROW_LITTLE_ENDIAN
0073
0074
0075 class ARROW_EXPORT BitRunReader {
0076 public:
0077
0078
0079
0080
0081
0082 BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
0083
0084
0085
0086
0087 BitRun NextRun() {
0088 if (ARROW_PREDICT_FALSE(position_ >= length_)) {
0089 return {0, false};
0090 }
0091
0092
0093
0094
0095
0096
0097
0098
0099 current_run_bit_set_ = !current_run_bit_set_;
0100
0101 int64_t start_position = position_;
0102 int64_t start_bit_offset = start_position & 63;
0103
0104
0105 word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
0106
0107
0108 int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
0109 position_ += new_bits;
0110
0111 if (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) &&
0112 ARROW_PREDICT_TRUE(position_ < length_)) {
0113
0114
0115 AdvanceUntilChange();
0116 }
0117
0118 return {position_ - start_position, current_run_bit_set_};
0119 }
0120
0121 private:
0122 void AdvanceUntilChange() {
0123 int64_t new_bits = 0;
0124 do {
0125
0126 bitmap_ += sizeof(uint64_t);
0127 LoadNextWord();
0128 new_bits = bit_util::CountTrailingZeros(word_);
0129
0130 position_ += new_bits;
0131 } while (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) &&
0132 ARROW_PREDICT_TRUE(position_ < length_) && new_bits > 0);
0133 }
0134
0135 void LoadNextWord() { return LoadWord(length_ - position_); }
0136
0137
0138 void LoadWord(int64_t bits_remaining) {
0139 word_ = 0;
0140
0141 if (ARROW_PREDICT_TRUE(bits_remaining >= 64)) {
0142 std::memcpy(&word_, bitmap_, 8);
0143 } else {
0144 int64_t bytes_to_load = bit_util::BytesForBits(bits_remaining);
0145 auto word_ptr = reinterpret_cast<uint8_t*>(&word_);
0146 std::memcpy(word_ptr, bitmap_, bytes_to_load);
0147
0148
0149 bit_util::SetBitTo(word_ptr, bits_remaining,
0150 !bit_util::GetBit(word_ptr, bits_remaining - 1));
0151 }
0152
0153
0154
0155
0156
0157 if (current_run_bit_set_) {
0158 word_ = ~word_;
0159 }
0160 }
0161 const uint8_t* bitmap_;
0162 int64_t position_;
0163 int64_t length_;
0164 uint64_t word_;
0165 bool current_run_bit_set_;
0166 };
0167 #else
0168 using BitRunReader = BitRunReaderLinear;
0169 #endif
0170
0171 struct SetBitRun {
0172 int64_t position;
0173 int64_t length;
0174
0175 bool AtEnd() const { return length == 0; }
0176
0177 std::string ToString() const {
0178 return std::string("{pos=") + std::to_string(position) +
0179 ", len=" + std::to_string(length) + "}";
0180 }
0181
0182 bool operator==(const SetBitRun& other) const {
0183 return position == other.position && length == other.length;
0184 }
0185 bool operator!=(const SetBitRun& other) const {
0186 return position != other.position || length != other.length;
0187 }
0188 };
0189
0190 template <bool Reverse>
0191 class BaseSetBitRunReader {
0192 public:
0193
0194
0195
0196
0197
0198 ARROW_NOINLINE
0199 BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0200 : bitmap_(util::MakeNonNull(bitmap)),
0201 length_(length),
0202 remaining_(length_),
0203 current_word_(0),
0204 current_num_bits_(0) {
0205 if (Reverse) {
0206 bitmap_ += (start_offset + length) / 8;
0207 const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
0208 if (length > 0 && end_bit_offset) {
0209
0210 ++bitmap_;
0211 current_num_bits_ =
0212 std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
0213 current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
0214 }
0215 } else {
0216 bitmap_ += start_offset / 8;
0217 const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
0218 if (length > 0 && bit_offset) {
0219
0220 current_num_bits_ =
0221 std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
0222 current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
0223 }
0224 }
0225 }
0226
0227 ARROW_NOINLINE
0228 SetBitRun NextRun() {
0229 int64_t pos = 0;
0230 int64_t len = 0;
0231 if (current_num_bits_) {
0232 const auto run = FindCurrentRun();
0233 assert(remaining_ >= 0);
0234 if (run.length && current_num_bits_) {
0235
0236 return AdjustRun(run);
0237 }
0238 pos = run.position;
0239 len = run.length;
0240 }
0241 if (!len) {
0242
0243
0244 SkipNextZeros();
0245 if (remaining_ == 0) {
0246 return {0, 0};
0247 }
0248 assert(current_num_bits_);
0249 pos = position();
0250 } else if (!current_num_bits_) {
0251 if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
0252 current_word_ = LoadFullWord();
0253 current_num_bits_ = 64;
0254 } else if (remaining_ > 0) {
0255 current_word_ = LoadPartialWord(0, remaining_);
0256 current_num_bits_ = static_cast<int32_t>(remaining_);
0257 } else {
0258
0259 return AdjustRun({pos, len});
0260 }
0261
0262 if (!(current_word_ & kFirstBit)) {
0263 return AdjustRun({pos, len});
0264 }
0265 }
0266
0267 len += CountNextOnes();
0268 return AdjustRun({pos, len});
0269 }
0270
0271 protected:
0272 int64_t position() const {
0273 if (Reverse) {
0274 return remaining_;
0275 } else {
0276 return length_ - remaining_;
0277 }
0278 }
0279
0280 SetBitRun AdjustRun(SetBitRun run) {
0281 if (Reverse) {
0282 assert(run.position >= run.length);
0283 run.position -= run.length;
0284 }
0285 return run;
0286 }
0287
0288 uint64_t LoadFullWord() {
0289 uint64_t word;
0290 if (Reverse) {
0291 bitmap_ -= 8;
0292 }
0293 memcpy(&word, bitmap_, 8);
0294 if (!Reverse) {
0295 bitmap_ += 8;
0296 }
0297 return bit_util::ToLittleEndian(word);
0298 }
0299
0300 uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
0301 assert(num_bits > 0);
0302 uint64_t word = 0;
0303 const int64_t num_bytes = bit_util::BytesForBits(num_bits);
0304 if (Reverse) {
0305
0306 bitmap_ -= num_bytes;
0307 memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
0308
0309 return (bit_util::ToLittleEndian(word) << bit_offset) &
0310 ~bit_util::LeastSignificantBitMask(64 - num_bits);
0311 } else {
0312 memcpy(&word, bitmap_, num_bytes);
0313 bitmap_ += num_bytes;
0314 return (bit_util::ToLittleEndian(word) >> bit_offset) &
0315 bit_util::LeastSignificantBitMask(num_bits);
0316 }
0317 }
0318
0319 void SkipNextZeros() {
0320 assert(current_num_bits_ == 0);
0321 while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
0322 current_word_ = LoadFullWord();
0323 const auto num_zeros = CountFirstZeros(current_word_);
0324 if (num_zeros < 64) {
0325
0326 current_word_ = ConsumeBits(current_word_, num_zeros);
0327 current_num_bits_ = 64 - num_zeros;
0328 remaining_ -= num_zeros;
0329 assert(remaining_ >= 0);
0330 assert(current_num_bits_ >= 0);
0331 return;
0332 }
0333 remaining_ -= 64;
0334 }
0335
0336 if (remaining_ > 0) {
0337 current_word_ = LoadPartialWord(0, remaining_);
0338 current_num_bits_ = static_cast<int32_t>(remaining_);
0339 const auto num_zeros =
0340 std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
0341 current_word_ = ConsumeBits(current_word_, num_zeros);
0342 current_num_bits_ -= num_zeros;
0343 remaining_ -= num_zeros;
0344 assert(remaining_ >= 0);
0345 assert(current_num_bits_ >= 0);
0346 }
0347 }
0348
0349 int64_t CountNextOnes() {
0350 assert(current_word_ & kFirstBit);
0351
0352 int64_t len;
0353 if (~current_word_) {
0354 const auto num_ones = CountFirstZeros(~current_word_);
0355 assert(num_ones <= current_num_bits_);
0356 assert(num_ones <= remaining_);
0357 remaining_ -= num_ones;
0358 current_word_ = ConsumeBits(current_word_, num_ones);
0359 current_num_bits_ -= num_ones;
0360 if (current_num_bits_) {
0361
0362 return num_ones;
0363 }
0364 len = num_ones;
0365 } else {
0366
0367 remaining_ -= 64;
0368 current_num_bits_ = 0;
0369 len = 64;
0370 }
0371
0372 while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
0373 current_word_ = LoadFullWord();
0374 const auto num_ones = CountFirstZeros(~current_word_);
0375 len += num_ones;
0376 remaining_ -= num_ones;
0377 if (num_ones < 64) {
0378
0379 current_word_ = ConsumeBits(current_word_, num_ones);
0380 current_num_bits_ = 64 - num_ones;
0381 return len;
0382 }
0383 }
0384
0385 if (remaining_ > 0) {
0386 current_word_ = LoadPartialWord(0, remaining_);
0387 current_num_bits_ = static_cast<int32_t>(remaining_);
0388 const auto num_ones = CountFirstZeros(~current_word_);
0389 assert(num_ones <= current_num_bits_);
0390 assert(num_ones <= remaining_);
0391 current_word_ = ConsumeBits(current_word_, num_ones);
0392 current_num_bits_ -= num_ones;
0393 remaining_ -= num_ones;
0394 len += num_ones;
0395 }
0396 return len;
0397 }
0398
0399 SetBitRun FindCurrentRun() {
0400
0401 const auto num_zeros = CountFirstZeros(current_word_);
0402 if (num_zeros >= current_num_bits_) {
0403 remaining_ -= current_num_bits_;
0404 current_word_ = 0;
0405 current_num_bits_ = 0;
0406 return {0, 0};
0407 }
0408 assert(num_zeros <= remaining_);
0409 current_word_ = ConsumeBits(current_word_, num_zeros);
0410 current_num_bits_ -= num_zeros;
0411 remaining_ -= num_zeros;
0412 const int64_t pos = position();
0413
0414 const auto num_ones = CountFirstZeros(~current_word_);
0415 assert(num_ones <= current_num_bits_);
0416 assert(num_ones <= remaining_);
0417 current_word_ = ConsumeBits(current_word_, num_ones);
0418 current_num_bits_ -= num_ones;
0419 remaining_ -= num_ones;
0420 return {pos, num_ones};
0421 }
0422
0423 inline int CountFirstZeros(uint64_t word);
0424 inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
0425
0426 const uint8_t* bitmap_;
0427 const int64_t length_;
0428 int64_t remaining_;
0429 uint64_t current_word_;
0430 int32_t current_num_bits_;
0431
0432 static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
0433 };
0434
0435 template <>
0436 inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
0437 return bit_util::CountTrailingZeros(word);
0438 }
0439
0440 template <>
0441 inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
0442 return bit_util::CountLeadingZeros(word);
0443 }
0444
0445 template <>
0446 inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
0447 return word >> num_bits;
0448 }
0449
0450 template <>
0451 inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
0452 return word << num_bits;
0453 }
0454
0455 using SetBitRunReader = BaseSetBitRunReader<false>;
0456 using ReverseSetBitRunReader = BaseSetBitRunReader<true>;
0457
0458
0459
0460
0461
0462
0463
0464 template <typename Visit>
0465 inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
0466 Visit&& visit) {
0467 if (bitmap == NULLPTR) {
0468
0469 return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
0470 }
0471 SetBitRunReader reader(bitmap, offset, length);
0472 while (true) {
0473 const auto run = reader.NextRun();
0474 if (run.length == 0) {
0475 break;
0476 }
0477 ARROW_RETURN_NOT_OK(visit(run.position, run.length));
0478 }
0479 return Status::OK();
0480 }
0481
0482 template <typename Visit>
0483 inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
0484 Visit&& visit) {
0485 if (bitmap == NULLPTR) {
0486
0487 visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
0488 return;
0489 }
0490 SetBitRunReader reader(bitmap, offset, length);
0491 while (true) {
0492 const auto run = reader.NextRun();
0493 if (run.length == 0) {
0494 break;
0495 }
0496 visit(run.position, run.length);
0497 }
0498 }
0499
0500 template <typename Visit>
0501 inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
0502 int64_t length, Visit&& visit) {
0503 return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
0504 std::forward<Visit>(visit));
0505 }
0506
0507 template <typename Visit>
0508 inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
0509 int64_t length, Visit&& visit) {
0510 VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
0511 std::forward<Visit>(visit));
0512 }
0513
0514 }
0515 }