File indexing completed on 2025-08-28 08:27:10
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <algorithm>
0021 #include <cassert>
0022 #include <cstdint>
0023
0024 #include "arrow/array/data.h"
0025 #include "arrow/type_traits.h"
0026 #include "arrow/util/checked_cast.h"
0027 #include "arrow/util/macros.h"
0028
0029 namespace arrow {
0030 namespace ree_util {
0031
0032
0033 inline const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; }
0034
0035
0036 inline const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[1]; }
0037
0038
0039 template <typename RunEndCType>
0040 const RunEndCType* RunEnds(const ArraySpan& span) {
0041 assert(RunEndsArray(span).type->id() == CTypeTraits<RunEndCType>::ArrowType::type_id);
0042 return RunEndsArray(span).GetValues<RunEndCType>(1);
0043 }
0044
0045
0046
0047
0048
0049
0050
0051 Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type,
0052 int64_t logical_length,
0053 const std::shared_ptr<ArrayData>& run_ends_data,
0054 const std::shared_ptr<ArrayData>& values_data,
0055 int64_t null_count, int64_t logical_offset);
0056
0057
0058 int64_t LogicalNullCount(const ArraySpan& span);
0059
0060 namespace internal {
0061
0062
0063
0064
0065
0066
0067 template <typename RunEndCType>
0068 int64_t FindPhysicalIndex(const RunEndCType* run_ends, int64_t run_ends_size, int64_t i,
0069 int64_t absolute_offset) {
0070 assert(absolute_offset + i >= 0);
0071 auto it = std::upper_bound(run_ends, run_ends + run_ends_size, absolute_offset + i);
0072 int64_t result = std::distance(run_ends, it);
0073 assert(result <= run_ends_size);
0074 return result;
0075 }
0076
0077
0078
0079
0080
0081
0082 template <typename RunEndCType>
0083 std::pair<int64_t, int64_t> FindPhysicalRange(const RunEndCType* run_ends,
0084 int64_t run_ends_size, int64_t length,
0085 int64_t offset) {
0086 const int64_t physical_offset =
0087 FindPhysicalIndex<RunEndCType>(run_ends, run_ends_size, 0, offset);
0088
0089
0090 if (length == 0) {
0091 return {physical_offset, 0};
0092 }
0093 const int64_t physical_index_of_last = FindPhysicalIndex<RunEndCType>(
0094 run_ends + physical_offset, run_ends_size - physical_offset, length - 1, offset);
0095
0096 assert(physical_index_of_last < run_ends_size - physical_offset);
0097 return {physical_offset, physical_index_of_last + 1};
0098 }
0099
0100
0101
0102
0103 template <typename RunEndCType>
0104 int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size,
0105 int64_t length, int64_t offset) {
0106 auto [_, physical_length] =
0107 FindPhysicalRange<RunEndCType>(run_ends, run_ends_size, length, offset);
0108
0109
0110
0111
0112 ARROW_UNUSED(_);
0113 return physical_length;
0114 }
0115
0116
0117
0118
0119 template <typename RunEndCType>
0120 int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) {
0121 const int64_t run_ends_size = RunEndsArray(span).length;
0122 return FindPhysicalIndex(RunEnds<RunEndCType>(span), run_ends_size, i, absolute_offset);
0123 }
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134 template <typename RunEndCType>
0135 int64_t FindPhysicalLength(const ArraySpan& span) {
0136 return FindPhysicalLength(
0137 RunEnds<RunEndCType>(span),
0138 RunEndsArray(span).length,
0139 span.length,
0140 span.offset);
0141 }
0142
0143 template <typename RunEndCType>
0144 struct PhysicalIndexFinder;
0145
0146
0147 ARROW_EXPORT int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder<int16_t>& self,
0148 int64_t i);
0149 ARROW_EXPORT int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder<int32_t>& self,
0150 int64_t i);
0151 ARROW_EXPORT int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder<int64_t>& self,
0152 int64_t i);
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174 template <typename RunEndCType>
0175 struct PhysicalIndexFinder {
0176 const ArraySpan array_span;
0177 const RunEndCType* run_ends;
0178 int64_t last_physical_index = 0;
0179
0180 explicit PhysicalIndexFinder(const ArrayData& data)
0181 : array_span(data),
0182 run_ends(RunEndsArray(array_span).template GetValues<RunEndCType>(1)) {
0183 assert(CTypeTraits<RunEndCType>::ArrowType::type_id ==
0184 ::arrow::internal::checked_cast<const RunEndEncodedType&>(*data.type)
0185 .run_end_type()
0186 ->id());
0187 }
0188
0189
0190
0191
0192
0193
0194 int64_t FindPhysicalIndex(int64_t i) {
0195 if constexpr (std::is_same_v<RunEndCType, int16_t>) {
0196 return FindPhysicalIndexImpl16(*this, i);
0197 } else if constexpr (std::is_same_v<RunEndCType, int32_t>) {
0198 return FindPhysicalIndexImpl32(*this, i);
0199 } else {
0200 static_assert(std::is_same_v<RunEndCType, int64_t>, "Unsupported RunEndCType.");
0201 return FindPhysicalIndexImpl64(*this, i);
0202 }
0203 }
0204 };
0205
0206 }
0207
0208
0209
0210
0211 ARROW_EXPORT int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i,
0212 int64_t absolute_offset);
0213
0214
0215
0216
0217
0218
0219
0220
0221
0222
0223 ARROW_EXPORT int64_t FindPhysicalLength(const ArraySpan& span);
0224
0225
0226
0227
0228
0229 ARROW_EXPORT std::pair<int64_t, int64_t> FindPhysicalRange(const ArraySpan& span,
0230 int64_t offset,
0231 int64_t length);
0232
0233
0234 template <typename RunEndCType>
0235 using PhysicalIndexFinder = internal::PhysicalIndexFinder<RunEndCType>;
0236
0237 template <typename RunEndCType>
0238 class RunEndEncodedArraySpan {
0239 private:
0240 struct PrivateTag {};
0241
0242 public:
0243
0244
0245 class Iterator {
0246 public:
0247 Iterator(PrivateTag, const RunEndEncodedArraySpan& span, int64_t logical_pos,
0248 int64_t physical_pos)
0249 : span(span), logical_pos_(logical_pos), physical_pos_(physical_pos) {}
0250
0251
0252
0253
0254
0255
0256
0257
0258 int64_t index_into_array() const { return physical_pos_; }
0259
0260
0261
0262
0263
0264 int64_t logical_position() const { return logical_pos_; }
0265
0266
0267
0268
0269 int64_t run_end() const { return span.run_end(physical_pos_); }
0270
0271
0272
0273
0274 int64_t run_length() const { return run_end() - logical_pos_; }
0275
0276
0277
0278
0279
0280
0281
0282 bool is_end(const RunEndEncodedArraySpan& span) const {
0283 return logical_pos_ >= span.length();
0284 }
0285
0286 Iterator& operator++() {
0287 logical_pos_ = span.run_end(physical_pos_);
0288 physical_pos_ += 1;
0289 return *this;
0290 }
0291
0292 Iterator operator++(int) {
0293 const Iterator prev = *this;
0294 ++(*this);
0295 return prev;
0296 }
0297
0298 Iterator& operator--() {
0299 physical_pos_ -= 1;
0300 logical_pos_ = (physical_pos_ > 0) ? span.run_end(physical_pos_ - 1) : 0;
0301 return *this;
0302 }
0303
0304 Iterator operator--(int) {
0305 const Iterator prev = *this;
0306 --(*this);
0307 return prev;
0308 }
0309
0310 bool operator==(const Iterator& other) const {
0311 return logical_pos_ == other.logical_pos_;
0312 }
0313
0314 bool operator!=(const Iterator& other) const {
0315 return logical_pos_ != other.logical_pos_;
0316 }
0317
0318 public:
0319 const RunEndEncodedArraySpan& span;
0320
0321 private:
0322 int64_t logical_pos_;
0323 int64_t physical_pos_;
0324 };
0325
0326
0327
0328 explicit RunEndEncodedArraySpan(const ArrayData& data) = delete;
0329
0330
0331
0332
0333
0334
0335
0336
0337
0338
0339
0340
0341 explicit RunEndEncodedArraySpan(const ArraySpan& array_span, int64_t offset,
0342 int64_t length)
0343 : array_span_{array_span},
0344 run_ends_(RunEnds<RunEndCType>(array_span_)),
0345 length_(length),
0346 offset_(offset) {
0347 assert(array_span_.type->id() == Type::RUN_END_ENCODED);
0348 }
0349
0350 explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
0351 : RunEndEncodedArraySpan(array_span, array_span.offset, array_span.length) {}
0352
0353 int64_t offset() const { return offset_; }
0354 int64_t length() const { return length_; }
0355
0356 int64_t PhysicalIndex(int64_t logical_pos) const {
0357 return internal::FindPhysicalIndex(run_ends_, RunEndsArray(array_span_).length,
0358 logical_pos, offset_);
0359 }
0360
0361
0362
0363
0364
0365
0366 Iterator iterator(int64_t logical_pos, int64_t physical_offset) const {
0367 return Iterator{PrivateTag{}, *this, logical_pos, physical_offset};
0368 }
0369
0370
0371
0372
0373 Iterator iterator(int64_t logical_pos) const {
0374 if (logical_pos < length()) {
0375 return iterator(logical_pos, PhysicalIndex(logical_pos));
0376 }
0377
0378
0379
0380
0381 return (length() == 0) ? iterator(0, PhysicalIndex(0))
0382 : iterator(length(), PhysicalIndex(length() - 1) + 1);
0383 }
0384
0385
0386
0387 Iterator begin() const { return iterator(0, PhysicalIndex(0)); }
0388
0389
0390
0391
0392
0393
0394
0395
0396
0397
0398
0399
0400
0401
0402
0403
0404
0405
0406
0407
0408
0409 Iterator end() const {
0410 return iterator(length(),
0411 (length() == 0) ? PhysicalIndex(0) : PhysicalIndex(length() - 1) + 1);
0412 }
0413
0414
0415 inline int64_t run_end(int64_t physical_pos) const {
0416 assert(physical_pos < RunEndsArray(array_span_).length);
0417
0418 const int64_t logical_run_end =
0419 std::max<int64_t>(static_cast<int64_t>(run_ends_[physical_pos]) - offset(), 0);
0420
0421 return std::min(logical_run_end, length());
0422 }
0423
0424 private:
0425 const ArraySpan& array_span_;
0426 const RunEndCType* run_ends_;
0427 const int64_t length_;
0428 const int64_t offset_;
0429 };
0430
0431
0432
0433
0434
0435
0436 template <typename Left, typename Right>
0437 class MergedRunsIterator {
0438 private:
0439 using LeftIterator = typename Left::Iterator;
0440 using RightIterator = typename Right::Iterator;
0441
0442 MergedRunsIterator(LeftIterator left_it, RightIterator right_it,
0443 int64_t common_logical_length, int64_t common_logical_pos)
0444 : ree_iterators_{std::move(left_it), std::move(right_it)},
0445 logical_length_(common_logical_length),
0446 logical_pos_(common_logical_pos) {}
0447
0448 public:
0449
0450
0451
0452 MergedRunsIterator(const Left& left, const Right& right)
0453 : MergedRunsIterator(left.begin(), right.begin(), left.length(), 0) {
0454 assert(left.length() == right.length());
0455 }
0456
0457 static Result<MergedRunsIterator> MakeBegin(const Left& left, const Right& right) {
0458 if (left.length() != right.length()) {
0459 return Status::Invalid(
0460 "MergedRunsIterator expects RunEndEncodedArraySpans of the same length");
0461 }
0462 return MergedRunsIterator(left, right);
0463 }
0464
0465 static Result<MergedRunsIterator> MakeEnd(const Left& left, const Right& right) {
0466 if (left.length() != right.length()) {
0467 return Status::Invalid(
0468 "MergedRunsIterator expects RunEndEncodedArraySpans of the same length");
0469 }
0470 return MergedRunsIterator(left.end(), right.end(), left.length(), left.length());
0471 }
0472
0473
0474 const Left& left() const { return std::get<0>(ree_iterators_).span; }
0475
0476
0477 const Right& right() const { return std::get<1>(ree_iterators_).span; }
0478
0479
0480
0481
0482 int64_t logical_position() const { return logical_pos_; }
0483
0484
0485 bool is_begin() const { return logical_pos_ == 0; }
0486
0487
0488 bool is_end() const { return logical_pos_ == logical_length_; }
0489
0490
0491
0492
0493 int64_t run_end() const {
0494 const auto& left_it = std::get<0>(ree_iterators_);
0495 const auto& right_it = std::get<1>(ree_iterators_);
0496 return std::min(left_it.run_end(), right_it.run_end());
0497 }
0498
0499
0500
0501
0502 int64_t run_length() const { return run_end() - logical_pos_; }
0503
0504
0505
0506 template <size_t input_id>
0507 int64_t index_into_array() const {
0508 return std::get<input_id>(ree_iterators_).index_into_array();
0509 }
0510
0511 int64_t index_into_left_array() const { return index_into_array<0>(); }
0512 int64_t index_into_right_array() const { return index_into_array<1>(); }
0513
0514 MergedRunsIterator& operator++() {
0515 auto& left_it = std::get<0>(ree_iterators_);
0516 auto& right_it = std::get<1>(ree_iterators_);
0517
0518 const int64_t left_run_end = left_it.run_end();
0519 const int64_t right_run_end = right_it.run_end();
0520
0521 if (left_run_end < right_run_end) {
0522 logical_pos_ = left_run_end;
0523 ++left_it;
0524 } else if (left_run_end > right_run_end) {
0525 logical_pos_ = right_run_end;
0526 ++right_it;
0527 } else {
0528 logical_pos_ = left_run_end;
0529 ++left_it;
0530 ++right_it;
0531 }
0532 return *this;
0533 }
0534
0535 MergedRunsIterator operator++(int) {
0536 MergedRunsIterator prev = *this;
0537 ++(*this);
0538 return prev;
0539 }
0540
0541 MergedRunsIterator& operator--() {
0542 auto& left_it = std::get<0>(ree_iterators_);
0543 auto& right_it = std::get<1>(ree_iterators_);
0544
0545
0546 const int64_t left_logical_pos = left_it.logical_position();
0547 const int64_t right_logical_pos = right_it.logical_position();
0548
0549 if (left_logical_pos < right_logical_pos) {
0550 --right_it;
0551 logical_pos_ = std::max(left_logical_pos, right_it.logical_position());
0552 } else if (left_logical_pos > right_logical_pos) {
0553 --left_it;
0554 logical_pos_ = std::max(left_it.logical_position(), right_logical_pos);
0555 } else {
0556 --left_it;
0557 --right_it;
0558 logical_pos_ = std::max(left_it.logical_position(), right_it.logical_position());
0559 }
0560 return *this;
0561 }
0562
0563 MergedRunsIterator operator--(int) {
0564 MergedRunsIterator prev = *this;
0565 --(*this);
0566 return prev;
0567 }
0568
0569 bool operator==(const MergedRunsIterator& other) const {
0570 return logical_pos_ == other.logical_position();
0571 }
0572
0573 bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); }
0574
0575 private:
0576 std::tuple<LeftIterator, RightIterator> ree_iterators_;
0577 const int64_t logical_length_;
0578 int64_t logical_pos_;
0579 };
0580
0581 }
0582 }