File indexing completed on 2025-08-28 08:26:54
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <array>
0021 #include <cstddef>
0022 #include <cstdint>
0023 #include <cstring>
0024 #include <limits>
0025 #include <memory>
0026 #include <numeric>
0027 #include <string>
0028 #include <string_view>
0029 #include <vector>
0030
0031 #include "arrow/array/array_base.h"
0032 #include "arrow/array/array_binary.h"
0033 #include "arrow/array/builder_base.h"
0034 #include "arrow/array/data.h"
0035 #include "arrow/buffer.h"
0036 #include "arrow/buffer_builder.h"
0037 #include "arrow/status.h"
0038 #include "arrow/type.h"
0039 #include "arrow/util/binary_view_util.h"
0040 #include "arrow/util/macros.h"
0041 #include "arrow/util/visibility.h"
0042
0043 namespace arrow {
0044
0045
0046
0047
0048
0049
0050
0051
0052 template <typename TYPE>
0053 class BaseBinaryBuilder
0054 : public ArrayBuilder,
0055 public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
0056 public:
0057 using TypeClass = TYPE;
0058 using offset_type = typename TypeClass::offset_type;
0059
0060 explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
0061 int64_t alignment = kDefaultBufferAlignment)
0062 : ArrayBuilder(pool, alignment),
0063 offsets_builder_(pool, alignment),
0064 value_data_builder_(pool, alignment) {}
0065
0066 BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
0067 : BaseBinaryBuilder(pool) {}
0068
0069 Status Append(const uint8_t* value, offset_type length) {
0070 ARROW_RETURN_NOT_OK(Reserve(1));
0071 UnsafeAppendNextOffset();
0072
0073 if (ARROW_PREDICT_TRUE(length > 0)) {
0074 ARROW_RETURN_NOT_OK(ValidateOverflow(length));
0075 ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
0076 }
0077
0078 UnsafeAppendToBitmap(true);
0079 return Status::OK();
0080 }
0081
0082 Status Append(const char* value, offset_type length) {
0083 return Append(reinterpret_cast<const uint8_t*>(value), length);
0084 }
0085
0086 Status Append(std::string_view value) {
0087 return Append(value.data(), static_cast<offset_type>(value.size()));
0088 }
0089
0090
0091
0092
0093 Status ExtendCurrent(const uint8_t* value, offset_type length) {
0094
0095 if (ARROW_PREDICT_TRUE(length > 0)) {
0096 ARROW_RETURN_NOT_OK(ValidateOverflow(length));
0097 ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
0098 }
0099 return Status::OK();
0100 }
0101
0102 Status ExtendCurrent(std::string_view value) {
0103 return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
0104 static_cast<offset_type>(value.size()));
0105 }
0106
0107 Status AppendNulls(int64_t length) final {
0108 const int64_t num_bytes = value_data_builder_.length();
0109 ARROW_RETURN_NOT_OK(Reserve(length));
0110 for (int64_t i = 0; i < length; ++i) {
0111 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0112 }
0113 UnsafeAppendToBitmap(length, false);
0114 return Status::OK();
0115 }
0116
0117 Status AppendNull() final {
0118 ARROW_RETURN_NOT_OK(Reserve(1));
0119 UnsafeAppendNextOffset();
0120 UnsafeAppendToBitmap(false);
0121 return Status::OK();
0122 }
0123
0124 Status AppendEmptyValue() final {
0125 ARROW_RETURN_NOT_OK(Reserve(1));
0126 UnsafeAppendNextOffset();
0127 UnsafeAppendToBitmap(true);
0128 return Status::OK();
0129 }
0130
0131 Status AppendEmptyValues(int64_t length) final {
0132 const int64_t num_bytes = value_data_builder_.length();
0133 ARROW_RETURN_NOT_OK(Reserve(length));
0134 for (int64_t i = 0; i < length; ++i) {
0135 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0136 }
0137 UnsafeAppendToBitmap(length, true);
0138 return Status::OK();
0139 }
0140
0141
0142
0143
0144
0145 void UnsafeAppend(const uint8_t* value, offset_type length) {
0146 UnsafeAppendNextOffset();
0147 value_data_builder_.UnsafeAppend(value, length);
0148 UnsafeAppendToBitmap(true);
0149 }
0150
0151 void UnsafeAppend(const char* value, offset_type length) {
0152 UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
0153 }
0154
0155 void UnsafeAppend(const std::string& value) {
0156 UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
0157 }
0158
0159 void UnsafeAppend(std::string_view value) {
0160 UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
0161 }
0162
0163
0164 void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
0165 value_data_builder_.UnsafeAppend(value, length);
0166 }
0167
0168 void UnsafeExtendCurrent(std::string_view value) {
0169 UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
0170 static_cast<offset_type>(value.size()));
0171 }
0172
0173 void UnsafeAppendNull() {
0174 const int64_t num_bytes = value_data_builder_.length();
0175 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0176 UnsafeAppendToBitmap(false);
0177 }
0178
0179 void UnsafeAppendEmptyValue() {
0180 const int64_t num_bytes = value_data_builder_.length();
0181 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0182 UnsafeAppendToBitmap(true);
0183 }
0184
0185
0186
0187
0188
0189
0190
0191 Status AppendValues(const std::vector<std::string>& values,
0192 const uint8_t* valid_bytes = NULLPTR) {
0193 std::size_t total_length = std::accumulate(
0194 values.begin(), values.end(), 0ULL,
0195 [](uint64_t sum, const std::string& str) { return sum + str.size(); });
0196 ARROW_RETURN_NOT_OK(Reserve(values.size()));
0197 ARROW_RETURN_NOT_OK(ReserveData(total_length));
0198
0199 if (valid_bytes != NULLPTR) {
0200 for (std::size_t i = 0; i < values.size(); ++i) {
0201 UnsafeAppendNextOffset();
0202 if (valid_bytes[i]) {
0203 value_data_builder_.UnsafeAppend(
0204 reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
0205 }
0206 }
0207 } else {
0208 for (const auto& value : values) {
0209 UnsafeAppendNextOffset();
0210 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
0211 value.size());
0212 }
0213 }
0214
0215 UnsafeAppendToBitmap(valid_bytes, values.size());
0216 return Status::OK();
0217 }
0218
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228 Status AppendValues(const char** values, int64_t length,
0229 const uint8_t* valid_bytes = NULLPTR) {
0230 std::size_t total_length = 0;
0231 std::vector<std::size_t> value_lengths(length);
0232 bool have_null_value = false;
0233 for (int64_t i = 0; i < length; ++i) {
0234 if (values[i] != NULLPTR) {
0235 auto value_length = strlen(values[i]);
0236 value_lengths[i] = value_length;
0237 total_length += value_length;
0238 } else {
0239 have_null_value = true;
0240 }
0241 }
0242 ARROW_RETURN_NOT_OK(Reserve(length));
0243 ARROW_RETURN_NOT_OK(ReserveData(total_length));
0244
0245 if (valid_bytes) {
0246 int64_t valid_bytes_offset = 0;
0247 for (int64_t i = 0; i < length; ++i) {
0248 UnsafeAppendNextOffset();
0249 if (valid_bytes[i]) {
0250 if (values[i]) {
0251 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
0252 value_lengths[i]);
0253 } else {
0254 UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
0255 i - valid_bytes_offset);
0256 UnsafeAppendToBitmap(false);
0257 valid_bytes_offset = i + 1;
0258 }
0259 }
0260 }
0261 UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
0262 } else {
0263 if (have_null_value) {
0264 std::vector<uint8_t> valid_vector(length, 0);
0265 for (int64_t i = 0; i < length; ++i) {
0266 UnsafeAppendNextOffset();
0267 if (values[i]) {
0268 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
0269 value_lengths[i]);
0270 valid_vector[i] = 1;
0271 }
0272 }
0273 UnsafeAppendToBitmap(valid_vector.data(), length);
0274 } else {
0275 for (int64_t i = 0; i < length; ++i) {
0276 UnsafeAppendNextOffset();
0277 value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
0278 value_lengths[i]);
0279 }
0280 UnsafeAppendToBitmap(NULLPTR, length);
0281 }
0282 }
0283 return Status::OK();
0284 }
0285
0286 Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0287 int64_t length) override {
0288 auto bitmap = array.GetValues<uint8_t>(0, 0);
0289 auto offsets = array.GetValues<offset_type>(1);
0290 auto data = array.GetValues<uint8_t>(2, 0);
0291 auto total_length = offsets[offset + length] - offsets[offset];
0292 ARROW_RETURN_NOT_OK(Reserve(length));
0293 ARROW_RETURN_NOT_OK(ReserveData(total_length));
0294 for (int64_t i = 0; i < length; i++) {
0295 if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
0296 const offset_type start = offsets[offset + i];
0297 const offset_type end = offsets[offset + i + 1];
0298 UnsafeAppend(data + start, end - start);
0299 } else {
0300 UnsafeAppendNull();
0301 }
0302 }
0303 return Status::OK();
0304 }
0305
0306 void Reset() override {
0307 ArrayBuilder::Reset();
0308 offsets_builder_.Reset();
0309 value_data_builder_.Reset();
0310 }
0311
0312 Status ValidateOverflow(int64_t new_bytes) {
0313 auto new_size = value_data_builder_.length() + new_bytes;
0314 if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
0315 return Status::CapacityError("array cannot contain more than ", memory_limit(),
0316 " bytes, have ", new_size);
0317 } else {
0318 return Status::OK();
0319 }
0320 }
0321
0322 Status Resize(int64_t capacity) override {
0323 ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
0324
0325 ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
0326 return ArrayBuilder::Resize(capacity);
0327 }
0328
0329
0330
0331 Status ReserveData(int64_t elements) {
0332 ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
0333 return value_data_builder_.Reserve(elements);
0334 }
0335
0336 Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
0337
0338 ARROW_RETURN_NOT_OK(AppendNextOffset());
0339
0340
0341 std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
0342 ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
0343 ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
0344 ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
0345
0346 *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
0347 null_count_, 0);
0348 Reset();
0349 return Status::OK();
0350 }
0351
0352
0353 const uint8_t* value_data() const { return value_data_builder_.data(); }
0354
0355 int64_t value_data_length() const { return value_data_builder_.length(); }
0356
0357 int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
0358
0359
0360 const offset_type* offsets_data() const { return offsets_builder_.data(); }
0361
0362
0363
0364
0365 const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
0366 const offset_type* offsets = offsets_builder_.data();
0367 const auto offset = offsets[i];
0368 if (i == (length_ - 1)) {
0369 *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
0370 } else {
0371 *out_length = offsets[i + 1] - offset;
0372 }
0373 return value_data_builder_.data() + offset;
0374 }
0375
0376 offset_type offset(int64_t i) const { return offsets_data()[i]; }
0377
0378
0379
0380
0381 std::string_view GetView(int64_t i) const {
0382 offset_type value_length;
0383 const uint8_t* value_data = GetValue(i, &value_length);
0384 return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
0385 }
0386
0387
0388 static constexpr int64_t memory_limit() {
0389 return std::numeric_limits<offset_type>::max() - 1;
0390 }
0391
0392 protected:
0393 TypedBufferBuilder<offset_type> offsets_builder_;
0394 TypedBufferBuilder<uint8_t> value_data_builder_;
0395
0396 Status AppendNextOffset() {
0397 const int64_t num_bytes = value_data_builder_.length();
0398 return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
0399 }
0400
0401 void UnsafeAppendNextOffset() {
0402 const int64_t num_bytes = value_data_builder_.length();
0403 offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0404 }
0405 };
0406
0407
0408
0409 class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
0410 public:
0411 using BaseBinaryBuilder::BaseBinaryBuilder;
0412
0413
0414 using ArrayBuilder::Finish;
0415
0416
0417 Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
0418
0419 std::shared_ptr<DataType> type() const override { return binary(); }
0420 };
0421
0422
0423
0424 class ARROW_EXPORT StringBuilder : public BinaryBuilder {
0425 public:
0426 using BinaryBuilder::BinaryBuilder;
0427
0428
0429 using ArrayBuilder::Finish;
0430
0431
0432 Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
0433
0434 std::shared_ptr<DataType> type() const override { return utf8(); }
0435 };
0436
0437
0438
0439 class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
0440 public:
0441 using BaseBinaryBuilder::BaseBinaryBuilder;
0442
0443
0444 using ArrayBuilder::Finish;
0445
0446
0447 Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
0448
0449 std::shared_ptr<DataType> type() const override { return large_binary(); }
0450 };
0451
0452
0453
0454 class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
0455 public:
0456 using LargeBinaryBuilder::LargeBinaryBuilder;
0457
0458
0459 using ArrayBuilder::Finish;
0460
0461
0462 Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
0463
0464 std::shared_ptr<DataType> type() const override { return large_utf8(); }
0465 };
0466
0467
0468
0469
0470
0471
0472 namespace internal {
0473
0474
0475
0476
0477
0478
0479
0480
0481 class ARROW_EXPORT StringHeapBuilder {
0482 public:
0483 static constexpr int64_t kDefaultBlocksize = 32 << 10;
0484
0485 StringHeapBuilder(MemoryPool* pool, int64_t alignment)
0486 : pool_(pool), alignment_(alignment) {}
0487
0488 void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
0489
0490 using c_type = BinaryViewType::c_type;
0491
0492 template <bool Safe>
0493 std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
0494 int64_t length) {
0495 if (length <= BinaryViewType::kInlineSize) {
0496 return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
0497 }
0498
0499 if constexpr (Safe) {
0500 ARROW_RETURN_NOT_OK(Reserve(length));
0501 }
0502
0503 auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
0504 static_cast<int32_t>(blocks_.size() - 1),
0505 current_offset_);
0506
0507 memcpy(current_out_buffer_, value, static_cast<size_t>(length));
0508 current_out_buffer_ += length;
0509 current_remaining_bytes_ -= length;
0510 current_offset_ += static_cast<int32_t>(length);
0511 return v;
0512 }
0513
0514 static constexpr int64_t ValueSizeLimit() {
0515 return std::numeric_limits<int32_t>::max();
0516 }
0517
0518
0519
0520 Status Reserve(int64_t num_bytes) {
0521 if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
0522 return Status::CapacityError(
0523 "BinaryView or StringView elements cannot reference "
0524 "strings larger than 2GB");
0525 }
0526 if (num_bytes > current_remaining_bytes_) {
0527 ARROW_RETURN_NOT_OK(FinishLastBlock());
0528 current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
0529 ARROW_ASSIGN_OR_RAISE(
0530 std::shared_ptr<ResizableBuffer> new_block,
0531 AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
0532 current_offset_ = 0;
0533 current_out_buffer_ = new_block->mutable_data();
0534 blocks_.emplace_back(std::move(new_block));
0535 }
0536 return Status::OK();
0537 }
0538
0539 void Reset() {
0540 current_offset_ = 0;
0541 current_out_buffer_ = NULLPTR;
0542 current_remaining_bytes_ = 0;
0543 blocks_.clear();
0544 }
0545
0546 int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
0547
0548 Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
0549 if (!blocks_.empty()) {
0550 ARROW_RETURN_NOT_OK(FinishLastBlock());
0551 }
0552 current_offset_ = 0;
0553 current_out_buffer_ = NULLPTR;
0554 current_remaining_bytes_ = 0;
0555 return std::move(blocks_);
0556 }
0557
0558 private:
0559 Status FinishLastBlock() {
0560 if (current_remaining_bytes_ > 0) {
0561
0562 ARROW_RETURN_NOT_OK(
0563 blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
0564 true));
0565 blocks_.back()->ZeroPadding();
0566 }
0567 return Status::OK();
0568 }
0569
0570 MemoryPool* pool_;
0571 int64_t alignment_;
0572 int64_t blocksize_ = kDefaultBlocksize;
0573 std::vector<std::shared_ptr<ResizableBuffer>> blocks_;
0574
0575 int32_t current_offset_ = 0;
0576 uint8_t* current_out_buffer_ = NULLPTR;
0577 int64_t current_remaining_bytes_ = 0;
0578 };
0579
0580 }
0581
0582 class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
0583 public:
0584 using TypeClass = BinaryViewType;
0585
0586
0587 BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);
0588
0589 explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
0590 int64_t alignment = kDefaultBufferAlignment)
0591 : ArrayBuilder(pool, alignment),
0592 data_builder_(pool, alignment),
0593 data_heap_builder_(pool, alignment) {}
0594
0595
0596
0597
0598
0599
0600
0601 void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }
0602
0603
0604
0605 int64_t current_block_bytes_remaining() const {
0606 return data_heap_builder_.current_remaining_bytes();
0607 }
0608
0609 Status Append(const uint8_t* value, int64_t length) {
0610 ARROW_RETURN_NOT_OK(Reserve(1));
0611 UnsafeAppendToBitmap(true);
0612 ARROW_ASSIGN_OR_RAISE(auto v,
0613 data_heap_builder_.Append<true>(value, length));
0614 data_builder_.UnsafeAppend(v);
0615 return Status::OK();
0616 }
0617
0618 Status Append(const char* value, int64_t length) {
0619 return Append(reinterpret_cast<const uint8_t*>(value), length);
0620 }
0621
0622 Status Append(std::string_view value) {
0623 return Append(value.data(), static_cast<int64_t>(value.size()));
0624 }
0625
0626
0627
0628
0629
0630 void UnsafeAppend(const uint8_t* value, int64_t length) {
0631 UnsafeAppendToBitmap(true);
0632 auto v = data_heap_builder_.Append<false>(value, length);
0633 data_builder_.UnsafeAppend(v);
0634 }
0635
0636 void UnsafeAppend(const char* value, int64_t length) {
0637 UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
0638 }
0639
0640 void UnsafeAppend(const std::string& value) {
0641 UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
0642 }
0643
0644 void UnsafeAppend(std::string_view value) {
0645 UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
0646 }
0647
0648
0649
0650
0651 Status ReserveData(int64_t length);
0652
0653 Status AppendNulls(int64_t length) final {
0654 ARROW_RETURN_NOT_OK(Reserve(length));
0655 data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
0656 UnsafeSetNull(length);
0657 return Status::OK();
0658 }
0659
0660
0661 Status AppendNull() final {
0662 ARROW_RETURN_NOT_OK(Reserve(1));
0663 data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0664 UnsafeAppendToBitmap(false);
0665 return Status::OK();
0666 }
0667
0668
0669 Status AppendEmptyValue() final {
0670 ARROW_RETURN_NOT_OK(Reserve(1));
0671 data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0672 UnsafeAppendToBitmap(true);
0673 return Status::OK();
0674 }
0675
0676
0677 Status AppendEmptyValues(int64_t length) final {
0678 ARROW_RETURN_NOT_OK(Reserve(length));
0679 data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
0680 UnsafeSetNotNull(length);
0681 return Status::OK();
0682 }
0683
0684 void UnsafeAppendNull() {
0685 data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0686 UnsafeAppendToBitmap(false);
0687 }
0688
0689 void UnsafeAppendEmptyValue() {
0690 data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0691 UnsafeAppendToBitmap(true);
0692 }
0693
0694
0695
0696 Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0697 int64_t length) override;
0698
0699 void Reset() override;
0700
0701 Status Resize(int64_t capacity) override {
0702 ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
0703 capacity = std::max(capacity, kMinBuilderCapacity);
0704 ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
0705 return ArrayBuilder::Resize(capacity);
0706 }
0707
0708 Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
0709
0710 std::shared_ptr<DataType> type() const override { return binary_view(); }
0711
0712 protected:
0713 TypedBufferBuilder<BinaryViewType::c_type> data_builder_;
0714
0715
0716
0717 internal::StringHeapBuilder data_heap_builder_;
0718 };
0719
0720 class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
0721 public:
0722 using BinaryViewBuilder::BinaryViewBuilder;
0723 std::shared_ptr<DataType> type() const override { return utf8_view(); }
0724 };
0725
0726
0727
0728
0729 class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
0730 public:
0731 using TypeClass = FixedSizeBinaryType;
0732
0733 explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
0734 MemoryPool* pool = default_memory_pool(),
0735 int64_t alignment = kDefaultBufferAlignment);
0736
0737 Status Append(const uint8_t* value) {
0738 ARROW_RETURN_NOT_OK(Reserve(1));
0739 UnsafeAppend(value);
0740 return Status::OK();
0741 }
0742
0743 Status Append(const char* value) {
0744 return Append(reinterpret_cast<const uint8_t*>(value));
0745 }
0746
0747 Status Append(std::string_view view) {
0748 ARROW_RETURN_NOT_OK(Reserve(1));
0749 UnsafeAppend(view);
0750 return Status::OK();
0751 }
0752
0753 Status Append(const std::string& s) {
0754 ARROW_RETURN_NOT_OK(Reserve(1));
0755 UnsafeAppend(s);
0756 return Status::OK();
0757 }
0758
0759 Status Append(const Buffer& s) {
0760 ARROW_RETURN_NOT_OK(Reserve(1));
0761 UnsafeAppend(s);
0762 return Status::OK();
0763 }
0764
0765 Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
0766
0767 template <size_t NBYTES>
0768 Status Append(const std::array<uint8_t, NBYTES>& value) {
0769 ARROW_RETURN_NOT_OK(Reserve(1));
0770 UnsafeAppend(
0771 std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
0772 return Status::OK();
0773 }
0774
0775 Status AppendValues(const uint8_t* data, int64_t length,
0776 const uint8_t* valid_bytes = NULLPTR);
0777
0778 Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
0779 int64_t bitmap_offset);
0780
0781 Status AppendNull() final;
0782 Status AppendNulls(int64_t length) final;
0783
0784 Status AppendEmptyValue() final;
0785 Status AppendEmptyValues(int64_t length) final;
0786
0787 Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0788 int64_t length) override {
0789 return AppendValues(
0790 array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
0791 array.GetValues<uint8_t>(0, 0), array.offset + offset);
0792 }
0793
0794 void UnsafeAppend(const uint8_t* value) {
0795 UnsafeAppendToBitmap(true);
0796 if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
0797 byte_builder_.UnsafeAppend(value, byte_width_);
0798 }
0799 }
0800
0801 void UnsafeAppend(const char* value) {
0802 UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
0803 }
0804
0805 void UnsafeAppend(std::string_view value) {
0806 #ifndef NDEBUG
0807 CheckValueSize(static_cast<size_t>(value.size()));
0808 #endif
0809 UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
0810 }
0811
0812 void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }
0813
0814 void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
0815
0816 void UnsafeAppendNull() {
0817 UnsafeAppendToBitmap(false);
0818 byte_builder_.UnsafeAppend(byte_width_, 0);
0819 }
0820
0821 Status ValidateOverflow(int64_t new_bytes) const {
0822 auto new_size = byte_builder_.length() + new_bytes;
0823 if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
0824 return Status::CapacityError("array cannot contain more than ", memory_limit(),
0825 " bytes, have ", new_size);
0826 } else {
0827 return Status::OK();
0828 }
0829 }
0830
0831
0832
0833 Status ReserveData(int64_t elements) {
0834 ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
0835 return byte_builder_.Reserve(elements);
0836 }
0837
0838 void Reset() override;
0839 Status Resize(int64_t capacity) override;
0840 Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
0841
0842
0843 using ArrayBuilder::Finish;
0844
0845
0846 Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }
0847
0848
0849 int64_t value_data_length() const { return byte_builder_.length(); }
0850
0851 int32_t byte_width() const { return byte_width_; }
0852
0853
0854
0855
0856 const uint8_t* GetValue(int64_t i) const;
0857
0858
0859
0860
0861 std::string_view GetView(int64_t i) const;
0862
0863 static constexpr int64_t memory_limit() {
0864 return std::numeric_limits<int64_t>::max() - 1;
0865 }
0866
0867 std::shared_ptr<DataType> type() const override {
0868 return fixed_size_binary(byte_width_);
0869 }
0870
0871 protected:
0872 int32_t byte_width_;
0873 BufferBuilder byte_builder_;
0874
0875
0876
0877
0878 uint8_t* GetMutableValue(int64_t i) {
0879 uint8_t* data_ptr = byte_builder_.mutable_data();
0880 return data_ptr + i * byte_width_;
0881 }
0882
0883 void CheckValueSize(int64_t size);
0884 };
0885
0886
0887
0888
0889
0890
0891
0892 namespace internal {
0893
0894 class ARROW_EXPORT ChunkedBinaryBuilder {
0895 public:
0896 explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
0897 MemoryPool* pool = default_memory_pool());
0898
0899 ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
0900 MemoryPool* pool = default_memory_pool());
0901
0902 virtual ~ChunkedBinaryBuilder() = default;
0903
0904 Status Append(const uint8_t* value, int32_t length) {
0905 if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
0906 max_chunk_value_length_)) {
0907 if (builder_->value_data_length() == 0) {
0908
0909
0910 ARROW_RETURN_NOT_OK(builder_->Append(value, length));
0911 return NextChunk();
0912 }
0913
0914
0915
0916 ARROW_RETURN_NOT_OK(NextChunk());
0917 return Append(value, length);
0918 }
0919
0920 if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
0921
0922
0923 ARROW_RETURN_NOT_OK(NextChunk());
0924 }
0925
0926 return builder_->Append(value, length);
0927 }
0928
0929 Status Append(std::string_view value) {
0930 return Append(reinterpret_cast<const uint8_t*>(value.data()),
0931 static_cast<int32_t>(value.size()));
0932 }
0933
0934 Status AppendNull() {
0935 if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
0936 ARROW_RETURN_NOT_OK(NextChunk());
0937 }
0938 return builder_->AppendNull();
0939 }
0940
0941 Status Reserve(int64_t values);
0942
0943 virtual Status Finish(ArrayVector* out);
0944
0945 protected:
0946 Status NextChunk();
0947
0948
0949 int64_t max_chunk_value_length_;
0950
0951
0952 int64_t max_chunk_length_ = kListMaximumElements;
0953
0954
0955
0956 int64_t extra_capacity_ = 0;
0957
0958 std::unique_ptr<BinaryBuilder> builder_;
0959 std::vector<std::shared_ptr<Array>> chunks_;
0960 };
0961
0962 class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
0963 public:
0964 using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
0965
0966 Status Finish(ArrayVector* out) override;
0967 };
0968
0969 }
0970
0971 }