File indexing completed on 2025-08-28 08:26:55
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <atomic> // IWYU pragma: export
0021 #include <cassert>
0022 #include <cstdint>
0023 #include <memory>
0024 #include <utility>
0025 #include <vector>
0026
0027 #include "arrow/array/statistics.h"
0028 #include "arrow/buffer.h"
0029 #include "arrow/result.h"
0030 #include "arrow/type.h"
0031 #include "arrow/type_fwd.h"
0032 #include "arrow/util/bit_util.h"
0033 #include "arrow/util/macros.h"
0034 #include "arrow/util/span.h"
0035 #include "arrow/util/visibility.h"
0036
0037 namespace arrow {
0038
0039 namespace internal {
0040
0041
0042
// Out-of-line null checks for arrays that have no validity bitmap of their own.
// ArrayData::IsValid() calls these (negating the result) when buffers[0] is
// null and the type id is SPARSE_UNION, DENSE_UNION or RUN_END_ENCODED
// respectively; `i` is forwarded unchanged from IsValid(). Definitions are not
// in this header.
0043 ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
0044 ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
0045 ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);
0046
// Out-of-line helpers behind ArrayData::MayHaveLogicalNulls(). They are only
// consulted when buffers[0] is null and the type id is a union (sparse or
// dense), RUN_END_ENCODED, or DICTIONARY respectively. Definitions are not in
// this header.
0047 ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
0048 ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
0049 ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
0050
0051 }
0052
0053
0054
0055
0056
// Sentinel null count (-1). It is the default `null_count` argument of the
// ArrayData constructors and Make() overloads, the initial value of
// ArraySpan::null_count, and the value ArraySpan::SetSlice() resets the count
// to when the sliced span may contain nulls.
0057 constexpr int64_t kUnknownNullCount = -1;
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093 struct ARROW_EXPORT ArrayData {
0094 ArrayData() = default;
0095
0096 ArrayData(std::shared_ptr<DataType> type, int64_t length,
0097 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
0098 : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
0099
0100 ArrayData(std::shared_ptr<DataType> type, int64_t length,
0101 std::vector<std::shared_ptr<Buffer>> buffers,
0102 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
0103 : ArrayData(std::move(type), length, null_count, offset) {
0104 this->buffers = std::move(buffers);
0105 #ifndef NDEBUG
0106
0107
0108 ARROW_UNUSED(this->device_type());
0109 #endif
0110 }
0111
0112 ArrayData(std::shared_ptr<DataType> type, int64_t length,
0113 std::vector<std::shared_ptr<Buffer>> buffers,
0114 std::vector<std::shared_ptr<ArrayData>> child_data,
0115 int64_t null_count = kUnknownNullCount, int64_t offset = 0)
0116 : ArrayData(std::move(type), length, null_count, offset) {
0117 this->buffers = std::move(buffers);
0118 this->child_data = std::move(child_data);
0119 #ifndef NDEBUG
0120
0121
0122
0123 ARROW_UNUSED(this->device_type());
0124 #endif
0125 }
0126
0127 static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
0128 std::vector<std::shared_ptr<Buffer>> buffers,
0129 int64_t null_count = kUnknownNullCount,
0130 int64_t offset = 0);
0131
0132 static std::shared_ptr<ArrayData> Make(
0133 std::shared_ptr<DataType> type, int64_t length,
0134 std::vector<std::shared_ptr<Buffer>> buffers,
0135 std::vector<std::shared_ptr<ArrayData>> child_data,
0136 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0137
0138 static std::shared_ptr<ArrayData> Make(
0139 std::shared_ptr<DataType> type, int64_t length,
0140 std::vector<std::shared_ptr<Buffer>> buffers,
0141 std::vector<std::shared_ptr<ArrayData>> child_data,
0142 std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
0143 int64_t offset = 0);
0144
0145 static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
0146 int64_t null_count = kUnknownNullCount,
0147 int64_t offset = 0);
0148
0149
0150 ArrayData(ArrayData&& other) noexcept
0151 : type(std::move(other.type)),
0152 length(other.length),
0153 null_count(other.null_count.load()),
0154 offset(other.offset),
0155 buffers(std::move(other.buffers)),
0156 child_data(std::move(other.child_data)),
0157 dictionary(std::move(other.dictionary)),
0158 statistics(std::move(other.statistics)) {}
0159
0160
0161 ArrayData(const ArrayData& other) noexcept
0162 : type(other.type),
0163 length(other.length),
0164 null_count(other.null_count.load()),
0165 offset(other.offset),
0166 buffers(other.buffers),
0167 child_data(other.child_data),
0168 dictionary(other.dictionary),
0169 statistics(other.statistics) {}
0170
0171
0172 ArrayData& operator=(ArrayData&& other) {
0173 type = std::move(other.type);
0174 length = other.length;
0175 SetNullCount(other.null_count);
0176 offset = other.offset;
0177 buffers = std::move(other.buffers);
0178 child_data = std::move(other.child_data);
0179 dictionary = std::move(other.dictionary);
0180 statistics = std::move(other.statistics);
0181 return *this;
0182 }
0183
0184
0185 ArrayData& operator=(const ArrayData& other) {
0186 type = other.type;
0187 length = other.length;
0188 SetNullCount(other.null_count);
0189 offset = other.offset;
0190 buffers = other.buffers;
0191 child_data = other.child_data;
0192 dictionary = other.dictionary;
0193 statistics = other.statistics;
0194 return *this;
0195 }
0196
0197 std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
0198
0199
0200
0201
0202
0203
0204 Result<std::shared_ptr<ArrayData>> CopyTo(
0205 const std::shared_ptr<MemoryManager>& to) const;
0206
0207
0208
0209
0210
0211 Result<std::shared_ptr<ArrayData>> ViewOrCopyTo(
0212 const std::shared_ptr<MemoryManager>& to) const;
0213
0214 bool IsNull(int64_t i) const { return !IsValid(i); }
0215
0216 bool IsValid(int64_t i) const {
0217 if (buffers[0] != NULLPTR) {
0218 return bit_util::GetBit(buffers[0]->data(), i + offset);
0219 }
0220 const auto type = this->type->id();
0221 if (type == Type::SPARSE_UNION) {
0222 return !internal::IsNullSparseUnion(*this, i);
0223 }
0224 if (type == Type::DENSE_UNION) {
0225 return !internal::IsNullDenseUnion(*this, i);
0226 }
0227 if (type == Type::RUN_END_ENCODED) {
0228 return !internal::IsNullRunEndEncoded(*this, i);
0229 }
0230 return null_count.load() != length;
0231 }
0232
0233
0234 template <typename T>
0235 inline const T* GetValues(int i, int64_t absolute_offset) const {
0236 if (buffers[i]) {
0237 return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
0238 } else {
0239 return NULLPTR;
0240 }
0241 }
0242
0243 template <typename T>
0244 inline const T* GetValues(int i) const {
0245 return GetValues<T>(i, offset);
0246 }
0247
0248
0249
0250 template <typename T>
0251 inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
0252 if (buffers[i] && buffers[i]->is_cpu()) {
0253 return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
0254 } else {
0255 return NULLPTR;
0256 }
0257 }
0258
0259 template <typename T>
0260 inline const T* GetValuesSafe(int i) const {
0261 return GetValuesSafe<T>(i, offset);
0262 }
0263
0264
0265 template <typename T>
0266 inline T* GetMutableValues(int i, int64_t absolute_offset) {
0267 if (buffers[i]) {
0268 return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
0269 } else {
0270 return NULLPTR;
0271 }
0272 }
0273
0274 template <typename T>
0275 inline T* GetMutableValues(int i) {
0276 return GetMutableValues<T>(i, offset);
0277 }
0278
0279
0280
0281
0282
0283
0284
0285
0286
0287
0288
0289
0290
0291
0292 std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
0293
0294
0295
0296
0297
0298 Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
0299
0300 void SetNullCount(int64_t v) { null_count.store(v); }
0301
0302
0303 int64_t GetNullCount() const;
0304
0305
0306
0307
0308
0309
0310
0311
0312
0313
0314 bool MayHaveNulls() const {
0315
0316
0317 return null_count.load() != 0 && buffers[0] != NULLPTR;
0318 }
0319
0320
0321 bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }
0322
0323
0324
0325
0326
0327
0328
0329
0330
0331
0332
0333
0334
0335
0336
0337
0338
0339
0340
0341
0342
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358
0359 bool MayHaveLogicalNulls() const {
0360 if (buffers[0] != NULLPTR) {
0361 return null_count.load() != 0;
0362 }
0363 const auto t = type->id();
0364 if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
0365 return internal::UnionMayHaveLogicalNulls(*this);
0366 }
0367 if (t == Type::RUN_END_ENCODED) {
0368 return internal::RunEndEncodedMayHaveLogicalNulls(*this);
0369 }
0370 if (t == Type::DICTIONARY) {
0371 return internal::DictionaryMayHaveLogicalNulls(*this);
0372 }
0373 return null_count.load() != 0;
0374 }
0375
0376
0377
0378
0379
0380
0381
0382
0383
0384
0385 int64_t ComputeLogicalNullCount() const;
0386
0387
0388
0389
0390
0391
0392
0393
0394
0395 DeviceAllocationType device_type() const;
0396
0397 std::shared_ptr<DataType> type;
0398 int64_t length = 0;
0399 mutable std::atomic<int64_t> null_count{0};
0400
0401
0402 int64_t offset = 0;
0403 std::vector<std::shared_ptr<Buffer>> buffers;
0404 std::vector<std::shared_ptr<ArrayData>> child_data;
0405
0406
0407 std::shared_ptr<ArrayData> dictionary;
0408
0409
0410 std::shared_ptr<ArrayStatistics> statistics;
0411 };
0412
0413
0414 struct ARROW_EXPORT BufferSpan {
0415
0416
0417
0418 uint8_t* data = NULLPTR;
0419 int64_t size = 0;
0420
0421 const std::shared_ptr<Buffer>* owner = NULLPTR;
0422
0423 template <typename T>
0424 const T* data_as() const {
0425 return reinterpret_cast<const T*>(data);
0426 }
0427 template <typename T>
0428 T* mutable_data_as() {
0429 return reinterpret_cast<T*>(data);
0430 }
0431 };
0432
0433
0434
0435
0436 struct ARROW_EXPORT ArraySpan {
0437 const DataType* type = NULLPTR;
0438 int64_t length = 0;
0439 mutable int64_t null_count = kUnknownNullCount;
0440 int64_t offset = 0;
0441 BufferSpan buffers[3];
0442
0443 ArraySpan() = default;
0444
0445 explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}
0446
0447 ArraySpan(const ArrayData& data) {
0448 SetMembers(data);
0449 }
0450 explicit ArraySpan(const Scalar& data) { FillFromScalar(data); }
0451
0452
0453 std::vector<ArraySpan> child_data;
0454
0455
0456
0457 void FillFromScalar(const Scalar& value);
0458
0459 void SetMembers(const ArrayData& data);
0460
0461 void SetBuffer(int index, const std::shared_ptr<Buffer>& buffer) {
0462 this->buffers[index].data = const_cast<uint8_t*>(buffer->data());
0463 this->buffers[index].size = buffer->size();
0464 this->buffers[index].owner = &buffer;
0465 }
0466
0467 const ArraySpan& dictionary() const { return child_data[0]; }
0468
0469
0470
0471 int num_buffers() const;
0472
0473
0474 template <typename T>
0475 inline T* GetValues(int i, int64_t absolute_offset) {
0476 return reinterpret_cast<T*>(buffers[i].data) + absolute_offset;
0477 }
0478
0479 template <typename T>
0480 inline T* GetValues(int i) {
0481 return GetValues<T>(i, this->offset);
0482 }
0483
0484
0485 template <typename T>
0486 inline const T* GetValues(int i, int64_t absolute_offset) const {
0487 return reinterpret_cast<const T*>(buffers[i].data) + absolute_offset;
0488 }
0489
0490 template <typename T>
0491 inline const T* GetValues(int i) const {
0492 return GetValues<T>(i, this->offset);
0493 }
0494
0495
0496
0497
0498
0499
0500
0501
0502
0503 template <typename T>
0504 util::span<const T> GetSpan(int i, int64_t length) const {
0505 const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
0506 assert(i > 0 && length + offset <= buffer_length);
0507 ARROW_UNUSED(buffer_length);
0508 return util::span<const T>(buffers[i].data_as<T>() + this->offset, length);
0509 }
0510
0511
0512
0513
0514
0515
0516
0517
0518
0519 template <typename T>
0520 util::span<T> GetSpan(int i, int64_t length) {
0521 const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
0522 assert(i > 0 && length + offset <= buffer_length);
0523 ARROW_UNUSED(buffer_length);
0524 return util::span<T>(buffers[i].mutable_data_as<T>() + this->offset, length);
0525 }
0526
0527 inline bool IsNull(int64_t i) const { return !IsValid(i); }
0528
0529 inline bool IsValid(int64_t i) const {
0530 if (this->buffers[0].data != NULLPTR) {
0531 return bit_util::GetBit(this->buffers[0].data, i + this->offset);
0532 } else {
0533 const auto type = this->type->id();
0534 if (type == Type::SPARSE_UNION) {
0535 return !IsNullSparseUnion(i);
0536 }
0537 if (type == Type::DENSE_UNION) {
0538 return !IsNullDenseUnion(i);
0539 }
0540 if (type == Type::RUN_END_ENCODED) {
0541 return !IsNullRunEndEncoded(i);
0542 }
0543 return this->null_count != this->length;
0544 }
0545 }
0546
0547 std::shared_ptr<ArrayData> ToArrayData() const;
0548
0549 std::shared_ptr<Array> ToArray() const;
0550
0551 std::shared_ptr<Buffer> GetBuffer(int index) const {
0552 const BufferSpan& buf = this->buffers[index];
0553 if (buf.owner) {
0554 return *buf.owner;
0555 } else if (buf.data != NULLPTR) {
0556
0557 return std::make_shared<Buffer>(buf.data, buf.size);
0558 } else {
0559 return NULLPTR;
0560 }
0561 }
0562
0563 void SetSlice(int64_t offset, int64_t length) {
0564 this->offset = offset;
0565 this->length = length;
0566 if (this->type->id() == Type::NA) {
0567 this->null_count = this->length;
0568 } else if (this->MayHaveNulls()) {
0569 this->null_count = kUnknownNullCount;
0570 } else {
0571 this->null_count = 0;
0572 }
0573 }
0574
0575
0576 int64_t GetNullCount() const;
0577
0578
0579
0580
0581
0582
0583
0584
0585
0586
0587 bool MayHaveNulls() const {
0588
0589
0590 return null_count != 0 && buffers[0].data != NULLPTR;
0591 }
0592
0593
0594 bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; }
0595
0596
0597
0598
0599
0600
0601 bool MayHaveLogicalNulls() const {
0602 if (buffers[0].data != NULLPTR) {
0603 return null_count != 0;
0604 }
0605 const auto t = type->id();
0606 if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
0607 return UnionMayHaveLogicalNulls();
0608 }
0609 if (t == Type::RUN_END_ENCODED) {
0610 return RunEndEncodedMayHaveLogicalNulls();
0611 }
0612 if (t == Type::DICTIONARY) {
0613 return DictionaryMayHaveLogicalNulls();
0614 }
0615 return null_count != 0;
0616 }
0617
0618
0619
0620
0621
0622
0623
0624
0625
0626
0627 int64_t ComputeLogicalNullCount() const;
0628
0629
0630
0631
0632
0633
0634
0635
0636 util::span<const std::shared_ptr<Buffer>> GetVariadicBuffers() const;
0637 bool HasVariadicBuffers() const;
0638
0639 private:
0640 ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& data,
0641 int64_t i);
0642
0643 bool IsNullSparseUnion(int64_t i) const;
0644 bool IsNullDenseUnion(int64_t i) const;
0645
0646
0647
0648
0649
0650
0651
0652 bool IsNullRunEndEncoded(int64_t i) const;
0653
0654 bool UnionMayHaveLogicalNulls() const;
0655 bool RunEndEncodedMayHaveLogicalNulls() const;
0656 bool DictionaryMayHaveLogicalNulls() const;
0657 };
0658
0659 namespace internal {
0660
// Declared without ARROW_EXPORT; defined outside this header.
// NOTE(review): presumably populates `span` as an empty (length 0) array of
// `type` -- confirm against the definition.
0661 void FillZeroLengthArray(const DataType* type, ArraySpan* span);
0662
0663
0664
0665
0666
0667
0668
// Exported; defined outside this header. Takes an ArrayData and a target type
// and returns a Result holding an ArrayData.
// NOTE(review): presumably a view of `data` reinterpreted as `type`, with an
// error Result when that is not possible -- confirm against the definition.
0669 ARROW_EXPORT
0670 Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
0671 const std::shared_ptr<DataType>& type);
0672
0673 }
0674 }