Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:54

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <array>
0021 #include <cstddef>
0022 #include <cstdint>
0023 #include <cstring>
0024 #include <limits>
0025 #include <memory>
0026 #include <numeric>
0027 #include <string>
0028 #include <string_view>
0029 #include <vector>
0030 
0031 #include "arrow/array/array_base.h"
0032 #include "arrow/array/array_binary.h"
0033 #include "arrow/array/builder_base.h"
0034 #include "arrow/array/data.h"
0035 #include "arrow/buffer.h"
0036 #include "arrow/buffer_builder.h"
0037 #include "arrow/status.h"
0038 #include "arrow/type.h"
0039 #include "arrow/util/binary_view_util.h"
0040 #include "arrow/util/macros.h"
0041 #include "arrow/util/visibility.h"
0042 
0043 namespace arrow {
0044 
0045 /// \addtogroup binary-builders
0046 ///
0047 /// @{
0048 
0049 // ----------------------------------------------------------------------
0050 // Binary and String
0051 
0052 template <typename TYPE>
0053 class BaseBinaryBuilder
0054     : public ArrayBuilder,
0055       public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
0056  public:
0057   using TypeClass = TYPE;
0058   using offset_type = typename TypeClass::offset_type;
0059 
0060   explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
0061                              int64_t alignment = kDefaultBufferAlignment)
0062       : ArrayBuilder(pool, alignment),
0063         offsets_builder_(pool, alignment),
0064         value_data_builder_(pool, alignment) {}
0065 
0066   BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
0067       : BaseBinaryBuilder(pool) {}
0068 
0069   Status Append(const uint8_t* value, offset_type length) {
0070     ARROW_RETURN_NOT_OK(Reserve(1));
0071     UnsafeAppendNextOffset();
0072     // Safety check for UBSAN.
0073     if (ARROW_PREDICT_TRUE(length > 0)) {
0074       ARROW_RETURN_NOT_OK(ValidateOverflow(length));
0075       ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
0076     }
0077 
0078     UnsafeAppendToBitmap(true);
0079     return Status::OK();
0080   }
0081 
0082   Status Append(const char* value, offset_type length) {
0083     return Append(reinterpret_cast<const uint8_t*>(value), length);
0084   }
0085 
0086   Status Append(std::string_view value) {
0087     return Append(value.data(), static_cast<offset_type>(value.size()));
0088   }
0089 
0090   /// Extend the last appended value by appending more data at the end
0091   ///
0092   /// Unlike Append, this does not create a new offset.
0093   Status ExtendCurrent(const uint8_t* value, offset_type length) {
0094     // Safety check for UBSAN.
0095     if (ARROW_PREDICT_TRUE(length > 0)) {
0096       ARROW_RETURN_NOT_OK(ValidateOverflow(length));
0097       ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
0098     }
0099     return Status::OK();
0100   }
0101 
0102   Status ExtendCurrent(std::string_view value) {
0103     return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
0104                          static_cast<offset_type>(value.size()));
0105   }
0106 
0107   Status AppendNulls(int64_t length) final {
0108     const int64_t num_bytes = value_data_builder_.length();
0109     ARROW_RETURN_NOT_OK(Reserve(length));
0110     for (int64_t i = 0; i < length; ++i) {
0111       offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0112     }
0113     UnsafeAppendToBitmap(length, false);
0114     return Status::OK();
0115   }
0116 
0117   Status AppendNull() final {
0118     ARROW_RETURN_NOT_OK(Reserve(1));
0119     UnsafeAppendNextOffset();
0120     UnsafeAppendToBitmap(false);
0121     return Status::OK();
0122   }
0123 
0124   Status AppendEmptyValue() final {
0125     ARROW_RETURN_NOT_OK(Reserve(1));
0126     UnsafeAppendNextOffset();
0127     UnsafeAppendToBitmap(true);
0128     return Status::OK();
0129   }
0130 
0131   Status AppendEmptyValues(int64_t length) final {
0132     const int64_t num_bytes = value_data_builder_.length();
0133     ARROW_RETURN_NOT_OK(Reserve(length));
0134     for (int64_t i = 0; i < length; ++i) {
0135       offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0136     }
0137     UnsafeAppendToBitmap(length, true);
0138     return Status::OK();
0139   }
0140 
0141   /// \brief Append without checking capacity
0142   ///
0143   /// Offsets and data should have been presized using Reserve() and
0144   /// ReserveData(), respectively.
0145   void UnsafeAppend(const uint8_t* value, offset_type length) {
0146     UnsafeAppendNextOffset();
0147     value_data_builder_.UnsafeAppend(value, length);
0148     UnsafeAppendToBitmap(true);
0149   }
0150 
0151   void UnsafeAppend(const char* value, offset_type length) {
0152     UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
0153   }
0154 
0155   void UnsafeAppend(const std::string& value) {
0156     UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
0157   }
0158 
0159   void UnsafeAppend(std::string_view value) {
0160     UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
0161   }
0162 
0163   /// Like ExtendCurrent, but do not check capacity
0164   void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
0165     value_data_builder_.UnsafeAppend(value, length);
0166   }
0167 
0168   void UnsafeExtendCurrent(std::string_view value) {
0169     UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
0170                         static_cast<offset_type>(value.size()));
0171   }
0172 
0173   void UnsafeAppendNull() {
0174     const int64_t num_bytes = value_data_builder_.length();
0175     offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0176     UnsafeAppendToBitmap(false);
0177   }
0178 
0179   void UnsafeAppendEmptyValue() {
0180     const int64_t num_bytes = value_data_builder_.length();
0181     offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0182     UnsafeAppendToBitmap(true);
0183   }
0184 
0185   /// \brief Append a sequence of strings in one shot.
0186   ///
0187   /// \param[in] values a vector of strings
0188   /// \param[in] valid_bytes an optional sequence of bytes where non-zero
0189   /// indicates a valid (non-null) value
0190   /// \return Status
0191   Status AppendValues(const std::vector<std::string>& values,
0192                       const uint8_t* valid_bytes = NULLPTR) {
0193     std::size_t total_length = std::accumulate(
0194         values.begin(), values.end(), 0ULL,
0195         [](uint64_t sum, const std::string& str) { return sum + str.size(); });
0196     ARROW_RETURN_NOT_OK(Reserve(values.size()));
0197     ARROW_RETURN_NOT_OK(ReserveData(total_length));
0198 
0199     if (valid_bytes != NULLPTR) {
0200       for (std::size_t i = 0; i < values.size(); ++i) {
0201         UnsafeAppendNextOffset();
0202         if (valid_bytes[i]) {
0203           value_data_builder_.UnsafeAppend(
0204               reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
0205         }
0206       }
0207     } else {
0208       for (const auto& value : values) {
0209         UnsafeAppendNextOffset();
0210         value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
0211                                          value.size());
0212       }
0213     }
0214 
0215     UnsafeAppendToBitmap(valid_bytes, values.size());
0216     return Status::OK();
0217   }
0218 
0219   /// \brief Append a sequence of nul-terminated strings in one shot.
0220   ///        If one of the values is NULL, it is processed as a null
0221   ///        value even if the corresponding valid_bytes entry is 1.
0222   ///
0223   /// \param[in] values a contiguous C array of nul-terminated char *
0224   /// \param[in] length the number of values to append
0225   /// \param[in] valid_bytes an optional sequence of bytes where non-zero
0226   /// indicates a valid (non-null) value
0227   /// \return Status
0228   Status AppendValues(const char** values, int64_t length,
0229                       const uint8_t* valid_bytes = NULLPTR) {
0230     std::size_t total_length = 0;
0231     std::vector<std::size_t> value_lengths(length);
0232     bool have_null_value = false;
0233     for (int64_t i = 0; i < length; ++i) {
0234       if (values[i] != NULLPTR) {
0235         auto value_length = strlen(values[i]);
0236         value_lengths[i] = value_length;
0237         total_length += value_length;
0238       } else {
0239         have_null_value = true;
0240       }
0241     }
0242     ARROW_RETURN_NOT_OK(Reserve(length));
0243     ARROW_RETURN_NOT_OK(ReserveData(total_length));
0244 
0245     if (valid_bytes) {
0246       int64_t valid_bytes_offset = 0;
0247       for (int64_t i = 0; i < length; ++i) {
0248         UnsafeAppendNextOffset();
0249         if (valid_bytes[i]) {
0250           if (values[i]) {
0251             value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
0252                                              value_lengths[i]);
0253           } else {
0254             UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
0255                                  i - valid_bytes_offset);
0256             UnsafeAppendToBitmap(false);
0257             valid_bytes_offset = i + 1;
0258           }
0259         }
0260       }
0261       UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
0262     } else {
0263       if (have_null_value) {
0264         std::vector<uint8_t> valid_vector(length, 0);
0265         for (int64_t i = 0; i < length; ++i) {
0266           UnsafeAppendNextOffset();
0267           if (values[i]) {
0268             value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
0269                                              value_lengths[i]);
0270             valid_vector[i] = 1;
0271           }
0272         }
0273         UnsafeAppendToBitmap(valid_vector.data(), length);
0274       } else {
0275         for (int64_t i = 0; i < length; ++i) {
0276           UnsafeAppendNextOffset();
0277           value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
0278                                            value_lengths[i]);
0279         }
0280         UnsafeAppendToBitmap(NULLPTR, length);
0281       }
0282     }
0283     return Status::OK();
0284   }
0285 
0286   Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0287                           int64_t length) override {
0288     auto bitmap = array.GetValues<uint8_t>(0, 0);
0289     auto offsets = array.GetValues<offset_type>(1);
0290     auto data = array.GetValues<uint8_t>(2, 0);
0291     auto total_length = offsets[offset + length] - offsets[offset];
0292     ARROW_RETURN_NOT_OK(Reserve(length));
0293     ARROW_RETURN_NOT_OK(ReserveData(total_length));
0294     for (int64_t i = 0; i < length; i++) {
0295       if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
0296         const offset_type start = offsets[offset + i];
0297         const offset_type end = offsets[offset + i + 1];
0298         UnsafeAppend(data + start, end - start);
0299       } else {
0300         UnsafeAppendNull();
0301       }
0302     }
0303     return Status::OK();
0304   }
0305 
0306   void Reset() override {
0307     ArrayBuilder::Reset();
0308     offsets_builder_.Reset();
0309     value_data_builder_.Reset();
0310   }
0311 
0312   Status ValidateOverflow(int64_t new_bytes) {
0313     auto new_size = value_data_builder_.length() + new_bytes;
0314     if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
0315       return Status::CapacityError("array cannot contain more than ", memory_limit(),
0316                                    " bytes, have ", new_size);
0317     } else {
0318       return Status::OK();
0319     }
0320   }
0321 
0322   Status Resize(int64_t capacity) override {
0323     ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
0324     // One more than requested for offsets
0325     ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
0326     return ArrayBuilder::Resize(capacity);
0327   }
0328 
0329   /// \brief Ensures there is enough allocated capacity to append the indicated
0330   /// number of bytes to the value data buffer without additional allocations
0331   Status ReserveData(int64_t elements) {
0332     ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
0333     return value_data_builder_.Reserve(elements);
0334   }
0335 
0336   Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
0337     // Write final offset (values length)
0338     ARROW_RETURN_NOT_OK(AppendNextOffset());
0339 
0340     // These buffers' padding zeroed by BufferBuilder
0341     std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
0342     ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
0343     ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
0344     ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
0345 
0346     *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
0347                            null_count_, 0);
0348     Reset();
0349     return Status::OK();
0350   }
0351 
0352   /// \return data pointer of the value date builder
0353   const uint8_t* value_data() const { return value_data_builder_.data(); }
0354   /// \return size of values buffer so far
0355   int64_t value_data_length() const { return value_data_builder_.length(); }
0356   /// \return capacity of values buffer
0357   int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
0358 
0359   /// \return data pointer of the value date builder
0360   const offset_type* offsets_data() const { return offsets_builder_.data(); }
0361 
0362   /// Temporary access to a value.
0363   ///
0364   /// This pointer becomes invalid on the next modifying operation.
0365   const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
0366     const offset_type* offsets = offsets_builder_.data();
0367     const auto offset = offsets[i];
0368     if (i == (length_ - 1)) {
0369       *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
0370     } else {
0371       *out_length = offsets[i + 1] - offset;
0372     }
0373     return value_data_builder_.data() + offset;
0374   }
0375 
0376   offset_type offset(int64_t i) const { return offsets_data()[i]; }
0377 
0378   /// Temporary access to a value.
0379   ///
0380   /// This view becomes invalid on the next modifying operation.
0381   std::string_view GetView(int64_t i) const {
0382     offset_type value_length;
0383     const uint8_t* value_data = GetValue(i, &value_length);
0384     return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
0385   }
0386 
0387   // Cannot make this a static attribute because of linking issues
0388   static constexpr int64_t memory_limit() {
0389     return std::numeric_limits<offset_type>::max() - 1;
0390   }
0391 
0392  protected:
0393   TypedBufferBuilder<offset_type> offsets_builder_;
0394   TypedBufferBuilder<uint8_t> value_data_builder_;
0395 
0396   Status AppendNextOffset() {
0397     const int64_t num_bytes = value_data_builder_.length();
0398     return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
0399   }
0400 
0401   void UnsafeAppendNextOffset() {
0402     const int64_t num_bytes = value_data_builder_.length();
0403     offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
0404   }
0405 };
0406 
0407 /// \class BinaryBuilder
0408 /// \brief Builder class for variable-length binary data
0409 class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
0410  public:
0411   using BaseBinaryBuilder::BaseBinaryBuilder;
0412 
0413   /// \cond FALSE
0414   using ArrayBuilder::Finish;
0415   /// \endcond
0416 
0417   Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
0418 
0419   std::shared_ptr<DataType> type() const override { return binary(); }
0420 };
0421 
0422 /// \class StringBuilder
0423 /// \brief Builder class for UTF8 strings
0424 class ARROW_EXPORT StringBuilder : public BinaryBuilder {
0425  public:
0426   using BinaryBuilder::BinaryBuilder;
0427 
0428   /// \cond FALSE
0429   using ArrayBuilder::Finish;
0430   /// \endcond
0431 
0432   Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
0433 
0434   std::shared_ptr<DataType> type() const override { return utf8(); }
0435 };
0436 
0437 /// \class LargeBinaryBuilder
0438 /// \brief Builder class for large variable-length binary data
0439 class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
0440  public:
0441   using BaseBinaryBuilder::BaseBinaryBuilder;
0442 
0443   /// \cond FALSE
0444   using ArrayBuilder::Finish;
0445   /// \endcond
0446 
0447   Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
0448 
0449   std::shared_ptr<DataType> type() const override { return large_binary(); }
0450 };
0451 
0452 /// \class LargeStringBuilder
0453 /// \brief Builder class for large UTF8 strings
0454 class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
0455  public:
0456   using LargeBinaryBuilder::LargeBinaryBuilder;
0457 
0458   /// \cond FALSE
0459   using ArrayBuilder::Finish;
0460   /// \endcond
0461 
0462   Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
0463 
0464   std::shared_ptr<DataType> type() const override { return large_utf8(); }
0465 };
0466 
0467 // ----------------------------------------------------------------------
0468 // BinaryViewBuilder, StringViewBuilder
0469 //
0470 // These builders do not support building raw pointer view arrays.
0471 
0472 namespace internal {
0473 
0474 // We allocate medium-sized memory chunks and accumulate data in those, which
0475 // may result in some waste if there are many large-ish strings. If a string
0476 // comes along that does not fit into a block, we allocate a new block and
0477 // write into that.
0478 //
0479 // Later we can implement optimizations to continuing filling underfull blocks
0480 // after encountering a large string that required allocating a new block.
0481 class ARROW_EXPORT StringHeapBuilder {
0482  public:
0483   static constexpr int64_t kDefaultBlocksize = 32 << 10;  // 32KB
0484 
0485   StringHeapBuilder(MemoryPool* pool, int64_t alignment)
0486       : pool_(pool), alignment_(alignment) {}
0487 
0488   void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
0489 
0490   using c_type = BinaryViewType::c_type;
0491 
0492   template <bool Safe>
0493   std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
0494                                                           int64_t length) {
0495     if (length <= BinaryViewType::kInlineSize) {
0496       return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
0497     }
0498 
0499     if constexpr (Safe) {
0500       ARROW_RETURN_NOT_OK(Reserve(length));
0501     }
0502 
0503     auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
0504                                          static_cast<int32_t>(blocks_.size() - 1),
0505                                          current_offset_);
0506 
0507     memcpy(current_out_buffer_, value, static_cast<size_t>(length));
0508     current_out_buffer_ += length;
0509     current_remaining_bytes_ -= length;
0510     current_offset_ += static_cast<int32_t>(length);
0511     return v;
0512   }
0513 
0514   static constexpr int64_t ValueSizeLimit() {
0515     return std::numeric_limits<int32_t>::max();
0516   }
0517 
0518   /// \brief Ensure that the indicated number of bytes can be appended via
0519   /// UnsafeAppend operations without the need to allocate more memory
0520   Status Reserve(int64_t num_bytes) {
0521     if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
0522       return Status::CapacityError(
0523           "BinaryView or StringView elements cannot reference "
0524           "strings larger than 2GB");
0525     }
0526     if (num_bytes > current_remaining_bytes_) {
0527       ARROW_RETURN_NOT_OK(FinishLastBlock());
0528       current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
0529       ARROW_ASSIGN_OR_RAISE(
0530           std::shared_ptr<ResizableBuffer> new_block,
0531           AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
0532       current_offset_ = 0;
0533       current_out_buffer_ = new_block->mutable_data();
0534       blocks_.emplace_back(std::move(new_block));
0535     }
0536     return Status::OK();
0537   }
0538 
0539   void Reset() {
0540     current_offset_ = 0;
0541     current_out_buffer_ = NULLPTR;
0542     current_remaining_bytes_ = 0;
0543     blocks_.clear();
0544   }
0545 
0546   int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
0547 
0548   Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
0549     if (!blocks_.empty()) {
0550       ARROW_RETURN_NOT_OK(FinishLastBlock());
0551     }
0552     current_offset_ = 0;
0553     current_out_buffer_ = NULLPTR;
0554     current_remaining_bytes_ = 0;
0555     return std::move(blocks_);
0556   }
0557 
0558  private:
0559   Status FinishLastBlock() {
0560     if (current_remaining_bytes_ > 0) {
0561       // Avoid leaking uninitialized bytes from the allocator
0562       ARROW_RETURN_NOT_OK(
0563           blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
0564                                  /*shrink_to_fit=*/true));
0565       blocks_.back()->ZeroPadding();
0566     }
0567     return Status::OK();
0568   }
0569 
0570   MemoryPool* pool_;
0571   int64_t alignment_;
0572   int64_t blocksize_ = kDefaultBlocksize;
0573   std::vector<std::shared_ptr<ResizableBuffer>> blocks_;
0574 
0575   int32_t current_offset_ = 0;
0576   uint8_t* current_out_buffer_ = NULLPTR;
0577   int64_t current_remaining_bytes_ = 0;
0578 };
0579 
0580 }  // namespace internal
0581 
0582 class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
0583  public:
0584   using TypeClass = BinaryViewType;
0585 
0586   // this constructor provided for MakeBuilder compatibility
0587   BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);
0588 
0589   explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
0590                              int64_t alignment = kDefaultBufferAlignment)
0591       : ArrayBuilder(pool, alignment),
0592         data_builder_(pool, alignment),
0593         data_heap_builder_(pool, alignment) {}
0594 
0595   /// Set the size for future preallocated data buffers.
0596   ///
0597   /// The default size is 32KB, so after each 32KB of string data appended to the builder
0598   /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
0599   /// frequency of allocation, or to a smaller value to lower the overhead of each
0600   /// allocation.
0601   void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }
0602 
0603   /// The number of bytes which can be appended to this builder without allocating another
0604   /// data buffer.
0605   int64_t current_block_bytes_remaining() const {
0606     return data_heap_builder_.current_remaining_bytes();
0607   }
0608 
0609   Status Append(const uint8_t* value, int64_t length) {
0610     ARROW_RETURN_NOT_OK(Reserve(1));
0611     UnsafeAppendToBitmap(true);
0612     ARROW_ASSIGN_OR_RAISE(auto v,
0613                           data_heap_builder_.Append</*Safe=*/true>(value, length));
0614     data_builder_.UnsafeAppend(v);
0615     return Status::OK();
0616   }
0617 
0618   Status Append(const char* value, int64_t length) {
0619     return Append(reinterpret_cast<const uint8_t*>(value), length);
0620   }
0621 
0622   Status Append(std::string_view value) {
0623     return Append(value.data(), static_cast<int64_t>(value.size()));
0624   }
0625 
0626   /// \brief Append without checking capacity
0627   ///
0628   /// Builder should have been presized using Reserve() and ReserveData(),
0629   /// respectively, and the value must not be larger than 2GB
0630   void UnsafeAppend(const uint8_t* value, int64_t length) {
0631     UnsafeAppendToBitmap(true);
0632     auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
0633     data_builder_.UnsafeAppend(v);
0634   }
0635 
0636   void UnsafeAppend(const char* value, int64_t length) {
0637     UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
0638   }
0639 
0640   void UnsafeAppend(const std::string& value) {
0641     UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
0642   }
0643 
0644   void UnsafeAppend(std::string_view value) {
0645     UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
0646   }
0647 
0648   /// \brief Ensures there is enough allocated available capacity in the
0649   /// out-of-line data heap to append the indicated number of bytes without
0650   /// additional allocations
0651   Status ReserveData(int64_t length);
0652 
0653   Status AppendNulls(int64_t length) final {
0654     ARROW_RETURN_NOT_OK(Reserve(length));
0655     data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
0656     UnsafeSetNull(length);
0657     return Status::OK();
0658   }
0659 
0660   /// \brief Append a single null element
0661   Status AppendNull() final {
0662     ARROW_RETURN_NOT_OK(Reserve(1));
0663     data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0664     UnsafeAppendToBitmap(false);
0665     return Status::OK();
0666   }
0667 
0668   /// \brief Append a empty element (length-0 inline string)
0669   Status AppendEmptyValue() final {
0670     ARROW_RETURN_NOT_OK(Reserve(1));
0671     data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0672     UnsafeAppendToBitmap(true);
0673     return Status::OK();
0674   }
0675 
0676   /// \brief Append several empty elements
0677   Status AppendEmptyValues(int64_t length) final {
0678     ARROW_RETURN_NOT_OK(Reserve(length));
0679     data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
0680     UnsafeSetNotNull(length);
0681     return Status::OK();
0682   }
0683 
0684   void UnsafeAppendNull() {
0685     data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0686     UnsafeAppendToBitmap(false);
0687   }
0688 
0689   void UnsafeAppendEmptyValue() {
0690     data_builder_.UnsafeAppend(BinaryViewType::c_type{});
0691     UnsafeAppendToBitmap(true);
0692   }
0693 
0694   /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
0695   /// the underlying out-of-line string memory to avoid memory lifetime issues
0696   Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0697                           int64_t length) override;
0698 
0699   void Reset() override;
0700 
0701   Status Resize(int64_t capacity) override {
0702     ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
0703     capacity = std::max(capacity, kMinBuilderCapacity);
0704     ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
0705     return ArrayBuilder::Resize(capacity);
0706   }
0707 
0708   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
0709 
0710   std::shared_ptr<DataType> type() const override { return binary_view(); }
0711 
0712  protected:
0713   TypedBufferBuilder<BinaryViewType::c_type> data_builder_;
0714 
0715   // Accumulates out-of-line data in fixed-size chunks which are then attached
0716   // to the resulting ArrayData
0717   internal::StringHeapBuilder data_heap_builder_;
0718 };
0719 
0720 class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
0721  public:
0722   using BinaryViewBuilder::BinaryViewBuilder;
0723   std::shared_ptr<DataType> type() const override { return utf8_view(); }
0724 };
0725 
0726 // ----------------------------------------------------------------------
0727 // FixedSizeBinaryBuilder
0728 
0729 class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
0730  public:
0731   using TypeClass = FixedSizeBinaryType;
0732 
0733   explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
0734                                   MemoryPool* pool = default_memory_pool(),
0735                                   int64_t alignment = kDefaultBufferAlignment);
0736 
0737   Status Append(const uint8_t* value) {
0738     ARROW_RETURN_NOT_OK(Reserve(1));
0739     UnsafeAppend(value);
0740     return Status::OK();
0741   }
0742 
0743   Status Append(const char* value) {
0744     return Append(reinterpret_cast<const uint8_t*>(value));
0745   }
0746 
0747   Status Append(std::string_view view) {
0748     ARROW_RETURN_NOT_OK(Reserve(1));
0749     UnsafeAppend(view);
0750     return Status::OK();
0751   }
0752 
0753   Status Append(const std::string& s) {
0754     ARROW_RETURN_NOT_OK(Reserve(1));
0755     UnsafeAppend(s);
0756     return Status::OK();
0757   }
0758 
0759   Status Append(const Buffer& s) {
0760     ARROW_RETURN_NOT_OK(Reserve(1));
0761     UnsafeAppend(s);
0762     return Status::OK();
0763   }
0764 
0765   Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
0766 
0767   template <size_t NBYTES>
0768   Status Append(const std::array<uint8_t, NBYTES>& value) {
0769     ARROW_RETURN_NOT_OK(Reserve(1));
0770     UnsafeAppend(
0771         std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
0772     return Status::OK();
0773   }
0774 
0775   Status AppendValues(const uint8_t* data, int64_t length,
0776                       const uint8_t* valid_bytes = NULLPTR);
0777 
0778   Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
0779                       int64_t bitmap_offset);
0780 
0781   Status AppendNull() final;
0782   Status AppendNulls(int64_t length) final;
0783 
0784   Status AppendEmptyValue() final;
0785   Status AppendEmptyValues(int64_t length) final;
0786 
0787   Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0788                           int64_t length) override {
0789     return AppendValues(
0790         array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
0791         array.GetValues<uint8_t>(0, 0), array.offset + offset);
0792   }
0793 
0794   void UnsafeAppend(const uint8_t* value) {
0795     UnsafeAppendToBitmap(true);
0796     if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
0797       byte_builder_.UnsafeAppend(value, byte_width_);
0798     }
0799   }
0800 
0801   void UnsafeAppend(const char* value) {
0802     UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
0803   }
0804 
0805   void UnsafeAppend(std::string_view value) {
0806 #ifndef NDEBUG
0807     CheckValueSize(static_cast<size_t>(value.size()));
0808 #endif
0809     UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
0810   }
0811 
0812   void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }
0813 
0814   void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
0815 
0816   void UnsafeAppendNull() {
0817     UnsafeAppendToBitmap(false);
0818     byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
0819   }
0820 
0821   Status ValidateOverflow(int64_t new_bytes) const {
0822     auto new_size = byte_builder_.length() + new_bytes;
0823     if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
0824       return Status::CapacityError("array cannot contain more than ", memory_limit(),
0825                                    " bytes, have ", new_size);
0826     } else {
0827       return Status::OK();
0828     }
0829   }
0830 
0831   /// \brief Ensures there is enough allocated capacity to append the indicated
0832   /// number of bytes to the value data buffer without additional allocations
0833   Status ReserveData(int64_t elements) {
0834     ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
0835     return byte_builder_.Reserve(elements);
0836   }
0837 
0838   void Reset() override;
0839   Status Resize(int64_t capacity) override;
0840   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
0841 
0842   /// \cond FALSE
0843   using ArrayBuilder::Finish;
0844   /// \endcond
0845 
0846   Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }
0847 
0848   /// \return size of values buffer so far
0849   int64_t value_data_length() const { return byte_builder_.length(); }
0850 
0851   int32_t byte_width() const { return byte_width_; }
0852 
0853   /// Temporary access to a value.
0854   ///
0855   /// This pointer becomes invalid on the next modifying operation.
0856   const uint8_t* GetValue(int64_t i) const;
0857 
0858   /// Temporary access to a value.
0859   ///
0860   /// This view becomes invalid on the next modifying operation.
0861   std::string_view GetView(int64_t i) const;
0862 
/// \return the largest value data size, in bytes, that ValidateOverflow()
/// accepts: one less than the maximum int64_t
static constexpr int64_t memory_limit() {
  constexpr int64_t kInt64Max = std::numeric_limits<int64_t>::max();
  return kInt64Max - 1;
}
0866 
0867   std::shared_ptr<DataType> type() const override {
0868     return fixed_size_binary(byte_width_);
0869   }
0870 
0871  protected:
0872   int32_t byte_width_;
0873   BufferBuilder byte_builder_;
0874 
0875   /// Temporary access to a value.
0876   ///
0877   /// This pointer becomes invalid on the next modifying operation.
0878   uint8_t* GetMutableValue(int64_t i) {
0879     uint8_t* data_ptr = byte_builder_.mutable_data();
0880     return data_ptr + i * byte_width_;
0881   }
0882 
// Defined out of line. UnsafeAppend(std::string_view) calls it with the
// view's size in builds without NDEBUG.
// NOTE(review): presumably verifies that `size` matches byte_width_ -- confirm.
void CheckValueSize(int64_t size);
0884 };
0885 
0886 /// @}
0887 
0888 // ----------------------------------------------------------------------
0889 // Chunked builders: build a sequence of BinaryArray or StringArray that are
0890 // limited to a particular size (to the upper limit of 2GB)
0891 
0892 namespace internal {
0893 
0894 class ARROW_EXPORT ChunkedBinaryBuilder {
0895  public:
0896   explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
0897                                 MemoryPool* pool = default_memory_pool());
0898 
0899   ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
0900                        MemoryPool* pool = default_memory_pool());
0901 
0902   virtual ~ChunkedBinaryBuilder() = default;
0903 
0904   Status Append(const uint8_t* value, int32_t length) {
0905     if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
0906                             max_chunk_value_length_)) {
0907       if (builder_->value_data_length() == 0) {
0908         // The current item is larger than max_chunk_size_;
0909         // this chunk will be oversize and hold *only* this item
0910         ARROW_RETURN_NOT_OK(builder_->Append(value, length));
0911         return NextChunk();
0912       }
0913       // The current item would cause builder_->value_data_length() to exceed
0914       // max_chunk_size_, so finish this chunk and append the current item to the next
0915       // chunk
0916       ARROW_RETURN_NOT_OK(NextChunk());
0917       return Append(value, length);
0918     }
0919 
0920     if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
0921       // The current item would cause builder_->length() to exceed max_chunk_length_, so
0922       // finish this chunk and append the current item to the next chunk
0923       ARROW_RETURN_NOT_OK(NextChunk());
0924     }
0925 
0926     return builder_->Append(value, length);
0927   }
0928 
0929   Status Append(std::string_view value) {
0930     return Append(reinterpret_cast<const uint8_t*>(value.data()),
0931                   static_cast<int32_t>(value.size()));
0932   }
0933 
0934   Status AppendNull() {
0935     if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
0936       ARROW_RETURN_NOT_OK(NextChunk());
0937     }
0938     return builder_->AppendNull();
0939   }
0940 
0941   Status Reserve(int64_t values);
0942 
0943   virtual Status Finish(ArrayVector* out);
0944 
0945  protected:
0946   Status NextChunk();
0947 
0948   // maximum total character data size per chunk
0949   int64_t max_chunk_value_length_;
0950 
0951   // maximum elements allowed per chunk
0952   int64_t max_chunk_length_ = kListMaximumElements;
0953 
0954   // when Reserve() would cause builder_ to exceed its max_chunk_length_,
0955   // add to extra_capacity_ instead and wait to reserve until the next chunk
0956   int64_t extra_capacity_ = 0;
0957 
0958   std::unique_ptr<BinaryBuilder> builder_;
0959   std::vector<std::shared_ptr<Array>> chunks_;
0960 };
0961 
0962 class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
0963  public:
0964   using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
0965 
0966   Status Finish(ArrayVector* out) override;
0967 };
0968 
0969 }  // namespace internal
0970 
0971 }  // namespace arrow