File indexing completed on 2025-08-28 08:27:03
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <algorithm>
0021 #include <array>
0022 #include <bitset>
0023 #include <cassert>
0024 #include <cstdint>
0025 #include <cstring>
0026 #include <memory>
0027 #include <string>
0028 #include <string_view>
0029 #include <utility>
0030
0031 #include "arrow/buffer.h"
0032 #include "arrow/util/bit_util.h"
0033 #include "arrow/util/bitmap_ops.h"
0034 #include "arrow/util/bitmap_reader.h"
0035 #include "arrow/util/bitmap_writer.h"
0036 #include "arrow/util/compare.h"
0037 #include "arrow/util/endian.h"
0038 #include "arrow/util/functional.h"
0039 #include "arrow/util/span.h"
0040 #include "arrow/util/string_builder.h"
0041 #include "arrow/util/visibility.h"
0042
0043 namespace arrow {
0044
0045 class BooleanArray;
0046
0047 namespace internal {
0048
0049 class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
0050 public util::EqualityComparable<Bitmap> {
0051 public:
0052 Bitmap() = default;
0053
0054 Bitmap(const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length)
0055 : data_(buffer->data()), offset_(offset), length_(length) {
0056 if (buffer->is_mutable()) {
0057 mutable_data_ = buffer->mutable_data();
0058 }
0059 }
0060
0061 Bitmap(const void* data, int64_t offset, int64_t length)
0062 : data_(reinterpret_cast<const uint8_t*>(data)), offset_(offset), length_(length) {}
0063
0064 Bitmap(void* data, int64_t offset, int64_t length)
0065 : data_(reinterpret_cast<const uint8_t*>(data)),
0066 mutable_data_(reinterpret_cast<uint8_t*>(data)),
0067 offset_(offset),
0068 length_(length) {}
0069
0070 Bitmap Slice(int64_t offset) const {
0071 if (mutable_data_ != NULLPTR) {
0072 return {mutable_data_, offset_ + offset, length_ - offset};
0073 } else {
0074 return {data_, offset_ + offset, length_ - offset};
0075 }
0076 }
0077
0078 Bitmap Slice(int64_t offset, int64_t length) const {
0079 if (mutable_data_ != NULLPTR) {
0080 return {mutable_data_, offset_ + offset, length};
0081 } else {
0082 return {data_, offset_ + offset, length};
0083 }
0084 }
0085
0086 std::string ToString() const;
0087
0088 bool Equals(const Bitmap& other) const;
0089
0090 std::string Diff(const Bitmap& other) const;
0091
0092 bool GetBit(int64_t i) const { return bit_util::GetBit(data_, i + offset_); }
0093
0094 bool operator[](int64_t i) const { return GetBit(i); }
0095
0096 void SetBitTo(int64_t i, bool v) const {
0097 bit_util::SetBitTo(mutable_data_, i + offset_, v);
0098 }
0099
0100 void SetBitsTo(bool v) { bit_util::SetBitsTo(mutable_data_, offset_, length_, v); }
0101
0102 void CopyFrom(const Bitmap& other);
0103 void CopyFromInverted(const Bitmap& other);
0104
0105
0106
0107
0108 template <size_t N, typename Visitor>
0109 static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
0110 int64_t bit_length = BitLength(bitmaps, N);
0111 std::bitset<N> bits;
0112 for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
0113 for (size_t i = 0; i < N; ++i) {
0114 bits[i] = bitmaps[i].GetBit(bit_i);
0115 }
0116 visitor(bits);
0117 }
0118 }
0119
0120
0121
0122
0123 template <size_t N, typename Visitor>
0124 static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
0125 int64_t bit_length = BitLength(bitmaps);
0126 std::bitset<N> bits;
0127 for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
0128 for (size_t i = 0; i < N; ++i) {
0129 bits[i] = bitmaps[i].GetBit(bit_i);
0130 }
0131 visitor(bits);
0132 }
0133 }
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148 template <size_t N, typename Visitor,
0149 typename Word = typename std::decay<
0150 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
0151 static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
0152 constexpr int64_t kBitWidth = sizeof(Word) * 8;
0153
0154
0155 Bitmap bitmaps[N];
0156 int64_t offsets[N];
0157 int64_t bit_length = BitLength(bitmaps_arg, N);
0158 util::span<const Word> words[N];
0159 for (size_t i = 0; i < N; ++i) {
0160 bitmaps[i] = bitmaps_arg[i];
0161 offsets[i] = bitmaps[i].template word_offset<Word>();
0162 assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
0163 words[i] = bitmaps[i].template words<Word>();
0164 }
0165
0166 auto consume = [&](int64_t consumed_bits) {
0167 for (size_t i = 0; i < N; ++i) {
0168 bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
0169 offsets[i] = bitmaps[i].template word_offset<Word>();
0170 assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
0171 words[i] = bitmaps[i].template words<Word>();
0172 }
0173 bit_length -= consumed_bits;
0174 };
0175
0176 std::array<Word, N> visited_words;
0177 visited_words.fill(0);
0178
0179 if (bit_length <= kBitWidth * 2) {
0180
0181 while (bit_length > 0) {
0182 auto leading_bits = std::min(bit_length, kBitWidth);
0183 SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
0184 visitor(visited_words);
0185 consume(leading_bits);
0186 }
0187 return 0;
0188 }
0189
0190 int64_t max_offset = *std::max_element(offsets, offsets + N);
0191 int64_t min_offset = *std::min_element(offsets, offsets + N);
0192 if (max_offset > 0) {
0193
0194 auto leading_bits = kBitWidth - min_offset;
0195 SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
0196 visitor(visited_words);
0197 consume(leading_bits);
0198 }
0199 assert(*std::min_element(offsets, offsets + N) == 0);
0200
0201 int64_t whole_word_count = bit_length / kBitWidth;
0202 assert(whole_word_count >= 1);
0203
0204 if (min_offset == max_offset) {
0205
0206 assert(
0207 std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
0208
0209 for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
0210 for (size_t i = 0; i < N; ++i) {
0211 visited_words[i] = words[i][word_i];
0212 }
0213 visitor(visited_words);
0214 }
0215 consume(whole_word_count * kBitWidth);
0216 } else {
0217
0218
0219
0220
0221 for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
0222 for (size_t i = 0; i < N; ++i) {
0223 if (offsets[i] == 0) {
0224 visited_words[i] = words[i][word_i];
0225 } else {
0226 auto words0 = bit_util::ToLittleEndian(words[i][word_i]);
0227 auto words1 = bit_util::ToLittleEndian(words[i][word_i + 1]);
0228 visited_words[i] = bit_util::FromLittleEndian(
0229 (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
0230 }
0231 }
0232 visitor(visited_words);
0233 }
0234 consume((whole_word_count - 1) * kBitWidth);
0235
0236 SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
0237
0238 visitor(visited_words);
0239 consume(kBitWidth);
0240 }
0241
0242
0243 if (bit_length > 0) {
0244 SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
0245 visitor(visited_words);
0246 }
0247
0248 return min_offset;
0249 }
0250
0251 template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
0252 typename Word = typename std::decay<
0253 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
0254 static void RunVisitWordsAndWriteLoop(int64_t bit_length,
0255 std::array<ReaderT, N>& readers,
0256 std::array<WriterT, M>& writers,
0257 Visitor&& visitor) {
0258 constexpr int64_t kBitWidth = sizeof(Word) * 8;
0259
0260 std::array<Word, N> visited_words;
0261 std::array<Word, M> output_words;
0262
0263
0264
0265
0266 auto n_words = readers[0].words();
0267 bit_length -= n_words * kBitWidth;
0268 while (n_words--) {
0269
0270 for (size_t i = 0; i < N; i++) {
0271 visited_words[i] = readers[i].NextWord();
0272 }
0273 visitor(visited_words, &output_words);
0274 for (size_t i = 0; i < M; i++) {
0275 writers[i].PutNextWord(output_words[i]);
0276 }
0277 }
0278
0279
0280
0281
0282
0283 if (bit_length) {
0284
0285 auto byte_visitor = [&](const std::array<uint8_t, N>& in,
0286 std::array<uint8_t, M>* out) {
0287 std::array<Word, N> in_words;
0288 std::array<Word, M> out_words;
0289 std::copy(in.begin(), in.end(), in_words.begin());
0290 visitor(in_words, &out_words);
0291 for (size_t i = 0; i < M; i++) {
0292 out->at(i) = static_cast<uint8_t>(out_words[i]);
0293 }
0294 };
0295
0296 std::array<uint8_t, N> visited_bytes;
0297 std::array<uint8_t, M> output_bytes;
0298 int n_bytes = readers[0].trailing_bytes();
0299 while (n_bytes--) {
0300 visited_bytes.fill(0);
0301 output_bytes.fill(0);
0302 int valid_bits;
0303 for (size_t i = 0; i < N; i++) {
0304 visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
0305 }
0306 byte_visitor(visited_bytes, &output_bytes);
0307 for (size_t i = 0; i < M; i++) {
0308 writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
0309 }
0310 }
0311 }
0312 }
0313
0314
0315
0316
0317
0318
0319
0320
0321
0322
0323
0324
0325
0326
0327
0328
0329 template <size_t N, size_t M, typename Visitor,
0330 typename Word = typename std::decay<
0331 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
0332 static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
0333 std::array<Bitmap, M>* out_bitmaps_arg,
0334 Visitor&& visitor) {
0335 int64_t bit_length = BitLength(bitmaps_arg);
0336 assert(bit_length == BitLength(*out_bitmaps_arg));
0337
0338
0339 if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
0340 [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
0341 std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
0342 [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
0343 std::array<BitmapWordReader<Word, false>, N> readers;
0344 for (size_t i = 0; i < N; ++i) {
0345 const Bitmap& in_bitmap = bitmaps_arg[i];
0346 readers[i] = BitmapWordReader<Word, false>(
0347 in_bitmap.data_, in_bitmap.offset_, in_bitmap.length_);
0348 }
0349
0350 std::array<BitmapWordWriter<Word, false>, M> writers;
0351 for (size_t i = 0; i < M; ++i) {
0352 const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
0353 writers[i] = BitmapWordWriter<Word, false>(
0354 out_bitmap.mutable_data_, out_bitmap.offset_, out_bitmap.length_);
0355 }
0356
0357 RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
0358 } else {
0359 std::array<BitmapWordReader<Word>, N> readers;
0360 for (size_t i = 0; i < N; ++i) {
0361 const Bitmap& in_bitmap = bitmaps_arg[i];
0362 readers[i] =
0363 BitmapWordReader<Word>(in_bitmap.data_, in_bitmap.offset_, in_bitmap.length_);
0364 }
0365
0366 std::array<BitmapWordWriter<Word>, M> writers;
0367 for (size_t i = 0; i < M; ++i) {
0368 const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
0369 writers[i] = BitmapWordWriter<Word>(out_bitmap.mutable_data_, out_bitmap.offset_,
0370 out_bitmap.length_);
0371 }
0372
0373 RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
0374 }
0375 }
0376
0377 const uint8_t* data() const { return data_; }
0378 uint8_t* mutable_data() { return mutable_data_; }
0379
0380
0381 int64_t offset() const { return offset_; }
0382
0383
0384 int64_t length() const { return length_; }
0385
0386
0387 util::span<const uint8_t> bytes() const {
0388 auto byte_offset = offset_ / 8;
0389 auto byte_count = bit_util::CeilDiv(offset_ + length_, 8) - byte_offset;
0390 return {data_ + byte_offset, static_cast<size_t>(byte_count)};
0391 }
0392
0393 private:
0394
0395
0396
0397
0398
0399
0400
0401
0402
0403
0404
0405
0406 template <typename Word>
0407 util::span<const Word> words() const {
0408 auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
0409 auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
0410 auto word_byte_count =
0411 bit_util::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
0412 static_cast<int64_t>(sizeof(Word))) -
0413 words_addr;
0414 return {reinterpret_cast<const Word*>(words_addr),
0415 static_cast<size_t>(word_byte_count / sizeof(Word))};
0416 }
0417
0418
0419 template <typename Word>
0420 int64_t word_offset() const {
0421 return offset_ + 8 * (reinterpret_cast<intptr_t>(data_) -
0422 reinterpret_cast<intptr_t>(words<Word>().data()));
0423 }
0424
0425
0426 template <size_t N, typename Word>
0427 static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
0428 int64_t out_length, bool set_trailing_bits,
0429 std::array<Word, N>* out) {
0430 out->fill(0);
0431
0432 int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;
0433
0434 Bitmap slices[N], out_bitmaps[N];
0435 for (size_t i = 0; i < N; ++i) {
0436 slices[i] = bitmaps[i].Slice(offset, out_length);
0437 out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
0438 }
0439
0440 int64_t bit_i = 0;
0441 Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
0442 for (size_t i = 0; i < N; ++i) {
0443 out_bitmaps[i].SetBitTo(bit_i, bits[i]);
0444 }
0445 ++bit_i;
0446 });
0447 }
0448
0449
0450 static int64_t BitLength(const Bitmap* bitmaps, size_t N);
0451
0452 template <size_t N>
0453 static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
0454 for (size_t i = 1; i < N; ++i) {
0455 assert(bitmaps[i].length() == bitmaps[0].length());
0456 }
0457 return bitmaps[0].length();
0458 }
0459
0460 const uint8_t* data_ = NULLPTR;
0461 uint8_t* mutable_data_ = NULLPTR;
0462 int64_t offset_ = 0, length_ = 0;
0463 };
0464
0465 }
0466 }