File indexing completed on 2025-08-27 08:47:18
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <atomic>
0021 #include <cassert>
0022 #include <cstdint>
0023 #include <limits>
0024 #include <type_traits>
0025 #include <vector>
0026
0027 #include "arrow/type_fwd.h"
0028 #include "arrow/util/macros.h"
0029 #include "arrow/util/span.h"
0030
0031 namespace arrow {
0032
0033 class ChunkResolver;
0034
0035 template <typename IndexType>
0036 struct ARROW_EXPORT TypedChunkLocation {
0037
0038
0039
0040
0041 IndexType chunk_index = 0;
0042
0043
0044
0045
0046 IndexType index_in_chunk = 0;
0047
0048 TypedChunkLocation() = default;
0049
0050 TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk)
0051 : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {
0052 static_assert(sizeof(TypedChunkLocation<IndexType>) == 2 * sizeof(IndexType));
0053 static_assert(alignof(TypedChunkLocation<IndexType>) == alignof(IndexType));
0054 }
0055
0056 bool operator==(TypedChunkLocation other) const {
0057 return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk;
0058 }
0059 };
0060
0061 using ChunkLocation = TypedChunkLocation<int64_t>;
0062
0063
0064
/// \brief A utility that resolves logical indices in a chunked array into
/// (chunk index, index-in-chunk) locations, caching the last resolved chunk
/// to speed up sequential access patterns.
class ARROW_EXPORT ChunkResolver {
 private:
  /// \brief Sorted array of `num_chunks() + 1` offsets.
  ///
  /// Invariants (asserted in debug builds by the offsets constructor below):
  /// offsets_[0] == 0, entries are non-decreasing, and the number of chunks
  /// (offsets_.size() - 1) fits in an int32_t. offsets_.back() is the logical
  /// length of the chunked array.
  std::vector<int64_t> offsets_;

  /// \brief Index of the last chunk resolved by Resolve().
  ///
  /// Accessed only with relaxed atomic loads/stores, so concurrent Resolve()
  /// calls race benignly on this cache hint and never on offsets_.
  mutable std::atomic<int32_t> cached_chunk_;

 public:
  // Out-of-line constructors that derive offsets_ from chunk/batch lengths
  // (defined in the accompanying .cc file).
  explicit ChunkResolver(const ArrayVector& chunks) noexcept;
  explicit ChunkResolver(util::span<const Array* const> chunks) noexcept;
  explicit ChunkResolver(const RecordBatchVector& batches) noexcept;

  /// \brief Construct a ChunkResolver from a pre-computed offsets vector.
  ///
  /// \pre offsets.size() >= 1 and offsets[0] == 0
  /// \pre offsets is non-decreasing
  /// \pre offsets.size() - 1 fits in an int32_t
  /// (all checked only in debug builds)
  explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
      : offsets_(std::move(offsets)), cached_chunk_(0) {
#ifndef NDEBUG
    assert(offsets_.size() >= 1);
    assert(offsets_[0] == 0);
    for (size_t i = 1; i < offsets_.size(); i++) {
      assert(offsets_[i] >= offsets_[i - 1]);
    }
    assert(offsets_.size() - 1 <=
           static_cast<size_t>(std::numeric_limits<int32_t>::max()));
#endif
  }

  // Copy/move members are out-of-line (presumably to handle the atomic
  // member, which is neither copyable nor movable by default — confirm
  // against the .cc file).
  ChunkResolver(ChunkResolver&& other) noexcept;
  ChunkResolver& operator=(ChunkResolver&& other) noexcept;

  ChunkResolver(const ChunkResolver& other) noexcept;
  ChunkResolver& operator=(const ChunkResolver& other) noexcept;

  /// \brief Return the logical length of the chunked array (the last offset).
  int64_t logical_array_length() const { return offsets_.back(); }
  /// \brief Return the number of chunks (offsets_.size() - 1).
  int32_t num_chunks() const { return static_cast<int32_t>(offsets_.size() - 1); }

  /// \brief Return the length of the chunk at chunk_index.
  ///
  /// \pre 0 <= chunk_index < num_chunks() (not checked)
  int64_t chunk_length(int64_t chunk_index) const {
    return offsets_[chunk_index + 1] - offsets_[chunk_index];
  }

  /// \brief Resolve a logical index to a ChunkLocation.
  ///
  /// The returned location holds the chunk index and the index inside that
  /// chunk. The resolved chunk is remembered in cached_chunk_ so consecutive
  /// calls with nearby indices can skip the binary search.
  ///
  /// \pre index >= 0 (a negative index is cast to a huge uint64_t by Bisect)
  /// \post result.chunk_index is in [0, num_chunks()]; it equals num_chunks()
  ///       when index >= logical_array_length()
  ///
  /// \param index The logical index to resolve
  /// \return ChunkLocation with the chunk index and the index inside the chunk
  inline ChunkLocation Resolve(int64_t index) const {
    const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed);
    const auto chunk_index =
        ResolveChunkIndex<true>(index, cached_chunk);
    return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
  }

  /// \brief Resolve a logical index to a ChunkLocation using a caller-provided
  /// hint instead of the shared cache.
  ///
  /// Unlike Resolve(), this neither reads nor updates cached_chunk_, so it is
  /// the better choice when each caller tracks its own last location.
  ///
  /// \pre index >= 0
  /// \pre hint.chunk_index <= num_chunks() (asserted in debug builds)
  ///
  /// \param index The logical index to resolve
  /// \param hint A previously resolved location expected to be close to index
  /// \return ChunkLocation with the chunk index and the index inside the chunk
  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const {
    assert(hint.chunk_index < static_cast<uint32_t>(offsets_.size()));
    const auto chunk_index = ResolveChunkIndex<false>(
        index, static_cast<int32_t>(hint.chunk_index));
    return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
  }

  /// \brief Resolve `n_indices` logical indices to chunk locations in bulk.
  ///
  /// \param n_indices Number of indices to resolve
  /// \param logical_index_vec The logical indices to resolve
  /// \param out_chunk_location_vec Pre-allocated output array with room for
  ///        n_indices locations
  /// \param chunk_hint Chunk index to seed the search with (defaults to 0)
  /// \return false iff num_chunks() does not fit in IndexType (possible only
  ///         for index types narrower than 32 bits), in which case nothing is
  ///         resolved; true otherwise
  template <typename IndexType>
  [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec,
                                 TypedChunkLocation<IndexType>* out_chunk_location_vec,
                                 IndexType chunk_hint = 0) const {
    if constexpr (sizeof(IndexType) < sizeof(uint32_t)) {
      // Largest value representable in IndexType.
      constexpr int64_t kMaxIndexTypeValue = std::numeric_limits<IndexType>::max();
      // Bail out if a chunk index could overflow IndexType. Note the check
      // allows a chunk index equal to num_chunks() itself (the value produced
      // for logical indices beyond the last chunk).
      const bool chunk_index_fits_on_type = num_chunks() <= kMaxIndexTypeValue;
      if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) {
        return false;
      }
      // An index-in-chunk can never exceed the logical index it came from,
      // so it fits in IndexType whenever the input index does.
    }
    if constexpr (std::is_signed_v<IndexType>) {
      // Reinterpret signed indices as unsigned: the bit patterns are the
      // same, and TypedChunkLocation<U> has the same size/alignment as
      // TypedChunkLocation<IndexType> (see the static_asserts in its
      // constructor), so the reinterpret_casts below are layout-safe.
      // NOTE(review): a negative input becomes a huge unsigned value;
      // callers are expected to pass non-negative indices — confirm how
      // ResolveManyImpl in the .cc file treats out-of-range values.
      using U = std::make_unsigned_t<IndexType>;
      ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
                      reinterpret_cast<TypedChunkLocation<U>*>(out_chunk_location_vec),
                      static_cast<int32_t>(chunk_hint));
    } else {
      static_assert(std::is_unsigned_v<IndexType>);
      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec,
                      static_cast<int32_t>(chunk_hint));
    }
    return true;
  }

 private:
  /// \brief Resolve a logical index to a chunk index, trying cached_chunk
  /// first and falling back to a binary search over offsets_.
  ///
  /// \tparam StoreCachedChunk When true, a chunk found via binary search is
  ///         written back to cached_chunk_ (relaxed store).
  template <bool StoreCachedChunk>
  inline int64_t ResolveChunkIndex(int64_t index, int32_t cached_chunk) const {
    // Fast path: index falls inside the cached/hinted chunk. The second
    // condition also accepts any index at or past the last offset when the
    // hint is the last chunk (cached_chunk + 1 == num_offsets).
    const auto num_offsets = static_cast<uint32_t>(offsets_.size());
    const int64_t* offsets = offsets_.data();
    if (ARROW_PREDICT_TRUE(index >= offsets[cached_chunk]) &&
        (static_cast<uint32_t>(cached_chunk + 1) == num_offsets ||
         index < offsets[cached_chunk + 1])) {
      return cached_chunk;
    }
    // Slow path: binary search over the whole offsets array.
    const auto chunk_index = Bisect(index, offsets, 0, num_offsets);
    if constexpr (StoreCachedChunk) {
      assert(static_cast<uint32_t>(chunk_index) < static_cast<uint32_t>(offsets_.size()));
      cached_chunk_.store(chunk_index, std::memory_order_relaxed);
    }
    return chunk_index;
  }

  /// \brief Out-of-line implementations of ResolveMany() for every supported
  /// unsigned index width (defined in the accompanying .cc file).
  void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation<uint8_t>*,
                       int32_t) const;
  void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation<uint16_t>*,
                       int32_t) const;
  void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation<uint32_t>*,
                       int32_t) const;
  void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation<uint64_t>*,
                       int32_t) const;

 public:
  /// \brief Find i in [lo, hi) such that offsets[i] <= index < offsets[i + 1].
  ///
  /// If index >= offsets[hi - 1], hi - 1 is returned (for the full offsets
  /// array this is num_chunks(), the value used for out-of-bounds indices).
  ///
  /// This signed overload forwards to the unsigned one; aliasing the int64_t
  /// array as uint64_t is valid because signed and unsigned variants of the
  /// same integer type may alias each other.
  ///
  /// \pre index >= 0 (the cast to uint64_t makes negative values huge)
  /// \pre lo < hi and both are valid indices into offsets
  static inline int32_t Bisect(int64_t index, const int64_t* offsets, int32_t lo,
                               int32_t hi) {
    return Bisect(static_cast<uint64_t>(index),
                  reinterpret_cast<const uint64_t*>(offsets), static_cast<uint32_t>(lo),
                  static_cast<uint32_t>(hi));
  }

  /// \brief Unsigned binary search over [lo, hi), narrowing the interval
  /// until a single offset slot remains; returns that slot's index.
  static inline int32_t Bisect(uint64_t index, const uint64_t* offsets, uint32_t lo,
                               uint32_t hi) {
    // n is the number of candidate slots still in play.
    auto n = hi - lo;
    // NOTE(review): this asserts hi - lo >= 2, which is slightly stronger
    // than the "lo < hi" the message states — confirm callers always pass at
    // least two offsets (they do for any resolver with >= 1 chunk).
    assert(n > 1 && "lo < hi is a precondition of Bisect");
    do {
      const uint32_t m = n >> 1;
      const uint32_t mid = lo + m;
      if (index >= offsets[mid]) {
        // index is at or past offsets[mid]: keep the right half, including mid.
        lo = mid;
        n -= m;
      } else {
        // index is before offsets[mid]: keep the left half.
        n = m;
      }
    } while (n > 1);
    return lo;
  }
};
0284
0285
// Explicit template instantiation definitions for every index type used by
// ChunkResolver::ResolveMany().
// NOTE(review): this file uses `#pragma once`, i.e. it is a header; explicit
// instantiation *definitions* here would be emitted by every including TU.
// These would normally be `extern template` declarations (with the matching
// definitions in a single .cc file) — confirm against the accompanying source.
template struct TypedChunkLocation<int32_t>;
template struct TypedChunkLocation<int16_t>;
template struct TypedChunkLocation<int8_t>;
template struct TypedChunkLocation<uint8_t>;
template struct TypedChunkLocation<uint16_t>;
template struct TypedChunkLocation<uint32_t>;
template struct TypedChunkLocation<int64_t>;
template struct TypedChunkLocation<uint64_t>;
}  // namespace arrow