File indexing completed on 2025-08-28 08:27:03
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <cassert>
0021 #include <cstdint>
0022 #include <cstring>
0023
0024 #include "arrow/buffer.h"
0025 #include "arrow/util/bit_util.h"
0026 #include "arrow/util/endian.h"
0027 #include "arrow/util/macros.h"
0028
0029 namespace arrow {
0030 namespace internal {
0031
0032 class BitmapReader {
0033 public:
0034 BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0035 : bitmap_(bitmap), position_(0), length_(length) {
0036 current_byte_ = 0;
0037 byte_offset_ = start_offset / 8;
0038 bit_offset_ = start_offset % 8;
0039 if (length > 0) {
0040 current_byte_ = bitmap[byte_offset_];
0041 }
0042 }
0043
0044 bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
0045
0046 bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
0047
0048 void Next() {
0049 ++bit_offset_;
0050 ++position_;
0051 if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
0052 bit_offset_ = 0;
0053 ++byte_offset_;
0054 if (ARROW_PREDICT_TRUE(position_ < length_)) {
0055 current_byte_ = bitmap_[byte_offset_];
0056 }
0057 }
0058 }
0059
0060 int64_t position() const { return position_; }
0061
0062 int64_t length() const { return length_; }
0063
0064 private:
0065 const uint8_t* bitmap_;
0066 int64_t position_;
0067 int64_t length_;
0068
0069 uint8_t current_byte_;
0070 int64_t byte_offset_;
0071 int64_t bit_offset_;
0072 };
0073
0074
0075
0076
0077 class BitmapUInt64Reader {
0078 public:
0079 BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0080 : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
0081 num_carry_bits_(8 - start_offset % 8),
0082 length_(length),
0083 remaining_length_(length_),
0084 carry_bits_(0) {
0085 if (length_ > 0) {
0086
0087 if (length_ >= num_carry_bits_) {
0088 carry_bits_ =
0089 LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
0090 } else {
0091 carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
0092 }
0093 }
0094 }
0095
0096 uint64_t NextWord() {
0097 if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
0098
0099 uint64_t next_word = LoadFullWord();
0100
0101 uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
0102 carry_bits_ = next_word >> (64 - num_carry_bits_);
0103 remaining_length_ -= 64;
0104 return word;
0105 } else if (remaining_length_ > num_carry_bits_) {
0106
0107 uint64_t next_word =
0108 LoadPartialWord(0, remaining_length_ - num_carry_bits_);
0109 uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
0110 carry_bits_ = next_word >> (64 - num_carry_bits_);
0111 remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
0112 return word;
0113 } else {
0114 remaining_length_ = 0;
0115 return carry_bits_;
0116 }
0117 }
0118
0119 int64_t position() const { return length_ - remaining_length_; }
0120
0121 int64_t length() const { return length_; }
0122
0123 private:
0124 uint64_t LoadFullWord() {
0125 uint64_t word;
0126 memcpy(&word, bitmap_, 8);
0127 bitmap_ += 8;
0128 return bit_util::ToLittleEndian(word);
0129 }
0130
0131 uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
0132 uint64_t word = 0;
0133 const int64_t num_bytes = bit_util::BytesForBits(num_bits);
0134 memcpy(&word, bitmap_, num_bytes);
0135 bitmap_ += num_bytes;
0136 return (bit_util::ToLittleEndian(word) >> bit_offset) &
0137 bit_util::LeastSignificantBitMask(num_bits);
0138 }
0139
0140 const uint8_t* bitmap_;
0141 const int64_t num_carry_bits_;
0142 const int64_t length_;
0143 int64_t remaining_length_;
0144 uint64_t carry_bits_;
0145 };
0146
0147
0148
0149
0150
0151 template <typename Word, bool may_have_byte_offset = true>
0152 class BitmapWordReader {
0153 public:
0154 BitmapWordReader() = default;
0155 BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
0156 : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
0157 bitmap_(bitmap + offset / 8),
0158 bitmap_end_(bitmap_ + bit_util::BytesForBits(offset_ + length)) {
0159
0160 nwords_ = length / (sizeof(Word) * 8) - 1;
0161 if (nwords_ < 0) {
0162 nwords_ = 0;
0163 }
0164 trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
0165 trailing_bytes_ = static_cast<int>(bit_util::BytesForBits(trailing_bits_));
0166
0167 if (nwords_ > 0) {
0168 current_data.word_ = load<Word>(bitmap_);
0169 } else if (length > 0) {
0170 current_data.epi.byte_ = load<uint8_t>(bitmap_);
0171 }
0172 }
0173
0174 Word NextWord() {
0175 bitmap_ += sizeof(Word);
0176 const Word next_word = load<Word>(bitmap_);
0177 Word word = current_data.word_;
0178 if (may_have_byte_offset && offset_) {
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190 word >>= offset_;
0191 word |= next_word << (sizeof(Word) * 8 - offset_);
0192 }
0193 current_data.word_ = next_word;
0194 return word;
0195 }
0196
0197 uint8_t NextTrailingByte(int& valid_bits) {
0198 uint8_t byte;
0199 assert(trailing_bits_ > 0);
0200
0201 if (trailing_bits_ <= 8) {
0202
0203 valid_bits = trailing_bits_;
0204 trailing_bits_ = 0;
0205 byte = 0;
0206 internal::BitmapReader reader(bitmap_, offset_, valid_bits);
0207 for (int i = 0; i < valid_bits; ++i) {
0208 byte >>= 1;
0209 if (reader.IsSet()) {
0210 byte |= 0x80;
0211 }
0212 reader.Next();
0213 }
0214 byte >>= (8 - valid_bits);
0215 } else {
0216 ++bitmap_;
0217 const uint8_t next_byte = load<uint8_t>(bitmap_);
0218 byte = current_data.epi.byte_;
0219 if (may_have_byte_offset && offset_) {
0220 byte >>= offset_;
0221 byte |= next_byte << (8 - offset_);
0222 }
0223 current_data.epi.byte_ = next_byte;
0224 trailing_bits_ -= 8;
0225 trailing_bytes_--;
0226 valid_bits = 8;
0227 }
0228 return byte;
0229 }
0230
0231 int64_t words() const { return nwords_; }
0232 int trailing_bytes() const { return trailing_bytes_; }
0233
0234 private:
0235 int64_t offset_;
0236 const uint8_t* bitmap_;
0237
0238 const uint8_t* bitmap_end_;
0239 int64_t nwords_;
0240 int trailing_bits_;
0241 int trailing_bytes_;
0242 union {
0243 Word word_;
0244 struct {
0245 #if ARROW_LITTLE_ENDIAN == 0
0246 uint8_t padding_bytes_[sizeof(Word) - 1];
0247 #endif
0248 uint8_t byte_;
0249 } epi;
0250 } current_data;
0251
0252 template <typename DType>
0253 DType load(const uint8_t* bitmap) {
0254 assert(bitmap + sizeof(DType) <= bitmap_end_);
0255 return bit_util::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
0256 }
0257 };
0258
0259
0260 struct OptionalBitIndexer {
0261 const uint8_t* bitmap;
0262 const int64_t offset;
0263
0264 explicit OptionalBitIndexer(const uint8_t* buffer = NULLPTR, int64_t offset = 0)
0265 : bitmap(buffer), offset(offset) {}
0266
0267 bool operator[](int64_t i) const {
0268 return bitmap == NULLPTR || bit_util::GetBit(bitmap, offset + i);
0269 }
0270 };
0271
0272 }
0273 }