Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:27:03

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cassert>
0021 #include <cstdint>
0022 #include <cstring>
0023 
0024 #include "arrow/buffer.h"
0025 #include "arrow/util/bit_util.h"
0026 #include "arrow/util/endian.h"
0027 #include "arrow/util/macros.h"
0028 
0029 namespace arrow {
0030 namespace internal {
0031 
0032 class BitmapReader {
0033  public:
0034   BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0035       : bitmap_(bitmap), position_(0), length_(length) {
0036     current_byte_ = 0;
0037     byte_offset_ = start_offset / 8;
0038     bit_offset_ = start_offset % 8;
0039     if (length > 0) {
0040       current_byte_ = bitmap[byte_offset_];
0041     }
0042   }
0043 
0044   bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
0045 
0046   bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
0047 
0048   void Next() {
0049     ++bit_offset_;
0050     ++position_;
0051     if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
0052       bit_offset_ = 0;
0053       ++byte_offset_;
0054       if (ARROW_PREDICT_TRUE(position_ < length_)) {
0055         current_byte_ = bitmap_[byte_offset_];
0056       }
0057     }
0058   }
0059 
0060   int64_t position() const { return position_; }
0061 
0062   int64_t length() const { return length_; }
0063 
0064  private:
0065   const uint8_t* bitmap_;
0066   int64_t position_;
0067   int64_t length_;
0068 
0069   uint8_t current_byte_;
0070   int64_t byte_offset_;
0071   int64_t bit_offset_;
0072 };
0073 
0074 // XXX Cannot name it BitmapWordReader because the name is already used
0075 // in bitmap_ops.cc
0076 
0077 class BitmapUInt64Reader {
0078  public:
0079   BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
0080       : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
0081         num_carry_bits_(8 - start_offset % 8),
0082         length_(length),
0083         remaining_length_(length_),
0084         carry_bits_(0) {
0085     if (length_ > 0) {
0086       // Load carry bits from the first byte's MSBs
0087       if (length_ >= num_carry_bits_) {
0088         carry_bits_ =
0089             LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
0090       } else {
0091         carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
0092       }
0093     }
0094   }
0095 
0096   uint64_t NextWord() {
0097     if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
0098       // We can load a full word
0099       uint64_t next_word = LoadFullWord();
0100       // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
0101       uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
0102       carry_bits_ = next_word >> (64 - num_carry_bits_);
0103       remaining_length_ -= 64;
0104       return word;
0105     } else if (remaining_length_ > num_carry_bits_) {
0106       // We can load a partial word
0107       uint64_t next_word =
0108           LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
0109       uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
0110       carry_bits_ = next_word >> (64 - num_carry_bits_);
0111       remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
0112       return word;
0113     } else {
0114       remaining_length_ = 0;
0115       return carry_bits_;
0116     }
0117   }
0118 
0119   int64_t position() const { return length_ - remaining_length_; }
0120 
0121   int64_t length() const { return length_; }
0122 
0123  private:
0124   uint64_t LoadFullWord() {
0125     uint64_t word;
0126     memcpy(&word, bitmap_, 8);
0127     bitmap_ += 8;
0128     return bit_util::ToLittleEndian(word);
0129   }
0130 
0131   uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
0132     uint64_t word = 0;
0133     const int64_t num_bytes = bit_util::BytesForBits(num_bits);
0134     memcpy(&word, bitmap_, num_bytes);
0135     bitmap_ += num_bytes;
0136     return (bit_util::ToLittleEndian(word) >> bit_offset) &
0137            bit_util::LeastSignificantBitMask(num_bits);
0138   }
0139 
0140   const uint8_t* bitmap_;
0141   const int64_t num_carry_bits_;  // in [1, 8]
0142   const int64_t length_;
0143   int64_t remaining_length_;
0144   uint64_t carry_bits_;
0145 };
0146 
0147 // BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
0148 // on sufficiently large inputs.  However, it has a larger prolog / epilog overhead
0149 // and should probably not be used for small bitmaps.
0150 
0151 template <typename Word, bool may_have_byte_offset = true>
0152 class BitmapWordReader {
0153  public:
0154   BitmapWordReader() = default;
0155   BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
0156       : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
0157         bitmap_(bitmap + offset / 8),
0158         bitmap_end_(bitmap_ + bit_util::BytesForBits(offset_ + length)) {
0159     // decrement word count by one as we may touch two adjacent words in one iteration
0160     nwords_ = length / (sizeof(Word) * 8) - 1;
0161     if (nwords_ < 0) {
0162       nwords_ = 0;
0163     }
0164     trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
0165     trailing_bytes_ = static_cast<int>(bit_util::BytesForBits(trailing_bits_));
0166 
0167     if (nwords_ > 0) {
0168       current_data.word_ = load<Word>(bitmap_);
0169     } else if (length > 0) {
0170       current_data.epi.byte_ = load<uint8_t>(bitmap_);
0171     }
0172   }
0173 
0174   Word NextWord() {
0175     bitmap_ += sizeof(Word);
0176     const Word next_word = load<Word>(bitmap_);
0177     Word word = current_data.word_;
0178     if (may_have_byte_offset && offset_) {
0179       // combine two adjacent words into one word
0180       // |<------ next ----->|<---- current ---->|
0181       // +-------------+-----+-------------+-----+
0182       // |     ---     |  A  |      B      | --- |
0183       // +-------------+-----+-------------+-----+
0184       //                  |         |       offset
0185       //                  v         v
0186       //               +-----+-------------+
0187       //               |  A  |      B      |
0188       //               +-----+-------------+
0189       //               |<------ word ----->|
0190       word >>= offset_;
0191       word |= next_word << (sizeof(Word) * 8 - offset_);
0192     }
0193     current_data.word_ = next_word;
0194     return word;
0195   }
0196 
0197   uint8_t NextTrailingByte(int& valid_bits) {
0198     uint8_t byte;
0199     assert(trailing_bits_ > 0);
0200 
0201     if (trailing_bits_ <= 8) {
0202       // last byte
0203       valid_bits = trailing_bits_;
0204       trailing_bits_ = 0;
0205       byte = 0;
0206       internal::BitmapReader reader(bitmap_, offset_, valid_bits);
0207       for (int i = 0; i < valid_bits; ++i) {
0208         byte >>= 1;
0209         if (reader.IsSet()) {
0210           byte |= 0x80;
0211         }
0212         reader.Next();
0213       }
0214       byte >>= (8 - valid_bits);
0215     } else {
0216       ++bitmap_;
0217       const uint8_t next_byte = load<uint8_t>(bitmap_);
0218       byte = current_data.epi.byte_;
0219       if (may_have_byte_offset && offset_) {
0220         byte >>= offset_;
0221         byte |= next_byte << (8 - offset_);
0222       }
0223       current_data.epi.byte_ = next_byte;
0224       trailing_bits_ -= 8;
0225       trailing_bytes_--;
0226       valid_bits = 8;
0227     }
0228     return byte;
0229   }
0230 
0231   int64_t words() const { return nwords_; }
0232   int trailing_bytes() const { return trailing_bytes_; }
0233 
0234  private:
0235   int64_t offset_;
0236   const uint8_t* bitmap_;
0237 
0238   const uint8_t* bitmap_end_;
0239   int64_t nwords_;
0240   int trailing_bits_;
0241   int trailing_bytes_;
0242   union {
0243     Word word_;
0244     struct {
0245 #if ARROW_LITTLE_ENDIAN == 0
0246       uint8_t padding_bytes_[sizeof(Word) - 1];
0247 #endif
0248       uint8_t byte_;
0249     } epi;
0250   } current_data;
0251 
0252   template <typename DType>
0253   DType load(const uint8_t* bitmap) {
0254     assert(bitmap + sizeof(DType) <= bitmap_end_);
0255     return bit_util::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
0256   }
0257 };
0258 
0259 /// \brief Index into a possibly nonexistent bitmap
0260 struct OptionalBitIndexer {
0261   const uint8_t* bitmap;
0262   const int64_t offset;
0263 
0264   explicit OptionalBitIndexer(const uint8_t* buffer = NULLPTR, int64_t offset = 0)
0265       : bitmap(buffer), offset(offset) {}
0266 
0267   bool operator[](int64_t i) const {
0268     return bitmap == NULLPTR || bit_util::GetBit(bitmap, offset + i);
0269   }
0270 };
0271 
0272 }  // namespace internal
0273 }  // namespace arrow