Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-27 08:47:18

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string>
0023 #include <utility>
0024 #include <vector>
0025 
0026 #include "arrow/chunk_resolver.h"
0027 #include "arrow/compare.h"
0028 #include "arrow/device_allocation_type_set.h"
0029 #include "arrow/result.h"
0030 #include "arrow/status.h"
0031 #include "arrow/type_fwd.h"
0032 #include "arrow/util/macros.h"
0033 #include "arrow/util/visibility.h"
0034 
0035 namespace arrow {
0036 
0037 class Array;
0038 class DataType;
0039 class MemoryPool;
0040 namespace stl {
0041 template <typename T, typename V>
0042 class ChunkedArrayIterator;
0043 }  // namespace stl
0044 
0045 /// \class ChunkedArray
0046 /// \brief A data structure managing a list of primitive Arrow arrays logically
0047 /// as one large array
0048 ///
0049 /// Data chunking is treated throughout this project largely as an
0050 /// implementation detail for performance and memory use optimization.
0051 /// ChunkedArray allows Array objects to be collected and interpreted
0052 /// as a single logical array without requiring an expensive concatenation
0053 /// step.
0054 ///
0055 /// In some cases, data produced by a function may exceed the capacity of an
0056 /// Array (like BinaryArray or StringArray) and so returning multiple Arrays is
0057 /// the only possibility. In these cases, we recommend returning a ChunkedArray
0058 /// instead of vector of Arrays or some alternative.
0059 ///
0060 /// When data is processed in parallel, it may not be practical or possible to
0061 /// create large contiguous memory allocations and write output into them. With
0062 /// some data types, like binary and string types, it is not possible at all to
0063 /// produce non-chunked array outputs without requiring a concatenation step at
0064 /// the end of processing.
0065 ///
0066 /// Application developers may tune chunk sizes based on analysis of
0067 /// performance profiles but many developer-users will not need to be
0068 /// especially concerned with the chunking details.
0069 ///
0070 /// Preserving the chunk layout/sizes in processing steps is generally not
0071 /// considered to be a contract in APIs. A function may decide to alter the
0072 /// chunking of its result. Similarly, APIs accepting multiple ChunkedArray
0073 /// inputs should not expect the chunk layout to be the same in each input.
0074 class ARROW_EXPORT ChunkedArray {
0075  public:
0076   ChunkedArray(ChunkedArray&&) = default;
0077   ChunkedArray& operator=(ChunkedArray&&) = default;
0078 
0079   /// \brief Construct a chunked array from a single Array
0080   explicit ChunkedArray(std::shared_ptr<Array> chunk)
0081       : ChunkedArray(ArrayVector{std::move(chunk)}) {}
0082 
0083   /// \brief Construct a chunked array from a vector of arrays and an optional data type
0084   ///
0085   /// The vector elements must have the same data type.
0086   /// If the data type is passed explicitly, the vector may be empty.
0087   /// If the data type is omitted, the vector must be non-empty.
0088   explicit ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
0089 
0090   // \brief Constructor with basic input validation.
0091   static Result<std::shared_ptr<ChunkedArray>> Make(
0092       ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
0093 
0094   /// \brief Create an empty ChunkedArray of a given type
0095   ///
0096   /// The output ChunkedArray will have one chunk with an empty
0097   /// array of the given type.
0098   ///
0099   /// \param[in] type the data type of the empty ChunkedArray
0100   /// \param[in] pool the memory pool to allocate memory from
0101   /// \return the resulting ChunkedArray
0102   static Result<std::shared_ptr<ChunkedArray>> MakeEmpty(
0103       std::shared_ptr<DataType> type, MemoryPool* pool = default_memory_pool());
0104 
0105   /// \return the total length of the chunked array; computed on construction
0106   int64_t length() const { return length_; }
0107 
0108   /// \return the total number of nulls among all chunks
0109   int64_t null_count() const { return null_count_; }
0110 
0111   /// \return the total number of chunks in the chunked array
0112   int num_chunks() const { return static_cast<int>(chunks_.size()); }
0113 
0114   /// \return chunk a particular chunk from the chunked array
0115   const std::shared_ptr<Array>& chunk(int i) const { return chunks_[i]; }
0116 
0117   /// \return an ArrayVector of chunks
0118   const ArrayVector& chunks() const { return chunks_; }
0119 
0120   /// \return The set of device allocation types used by the chunks in this
0121   /// chunked array.
0122   DeviceAllocationTypeSet device_types() const;
0123 
0124   /// \return true if all chunks are allocated on CPU-accessible memory.
0125   bool is_cpu() const { return device_types().is_cpu_only(); }
0126 
0127   /// \brief Construct a zero-copy slice of the chunked array with the
0128   /// indicated offset and length
0129   ///
0130   /// \param[in] offset the position of the first element in the constructed
0131   /// slice
0132   /// \param[in] length the length of the slice. If there are not enough
0133   /// elements in the chunked array, the length will be adjusted accordingly
0134   ///
0135   /// \return a new object wrapped in std::shared_ptr<ChunkedArray>
0136   std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
0137 
0138   /// \brief Slice from offset until end of the chunked array
0139   std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
0140 
0141   /// \brief Flatten this chunked array as a vector of chunked arrays, one
0142   /// for each struct field
0143   ///
0144   /// \param[in] pool The pool for buffer allocations, if any
0145   Result<std::vector<std::shared_ptr<ChunkedArray>>> Flatten(
0146       MemoryPool* pool = default_memory_pool()) const;
0147 
0148   /// Construct a zero-copy view of this chunked array with the given
0149   /// type. Calls Array::View on each constituent chunk. Always succeeds if
0150   /// there are zero chunks
0151   Result<std::shared_ptr<ChunkedArray>> View(const std::shared_ptr<DataType>& type) const;
0152 
0153   /// \brief Return the type of the chunked array
0154   const std::shared_ptr<DataType>& type() const { return type_; }
0155 
0156   /// \brief Return a Scalar containing the value of this array at index
0157   Result<std::shared_ptr<Scalar>> GetScalar(int64_t index) const;
0158 
0159   /// \brief Determine if two chunked arrays are equal.
0160   ///
0161   /// Two chunked arrays can be equal only if they have equal datatypes.
0162   /// However, they may be equal even if they have different chunkings.
0163   bool Equals(const ChunkedArray& other,
0164               const EqualOptions& opts = EqualOptions::Defaults()) const;
0165   /// \brief Determine if two chunked arrays are equal.
0166   bool Equals(const std::shared_ptr<ChunkedArray>& other,
0167               const EqualOptions& opts = EqualOptions::Defaults()) const;
0168   /// \brief Determine if two chunked arrays approximately equal
0169   bool ApproxEquals(const ChunkedArray& other,
0170                     const EqualOptions& = EqualOptions::Defaults()) const;
0171 
0172   /// \return PrettyPrint representation suitable for debugging
0173   std::string ToString() const;
0174 
0175   /// \brief Perform cheap validation checks to determine obvious inconsistencies
0176   /// within the chunk array's internal data.
0177   ///
0178   /// This is O(k*m) where k is the number of array descendents,
0179   /// and m is the number of chunks.
0180   ///
0181   /// \return Status
0182   Status Validate() const;
0183 
0184   /// \brief Perform extensive validation checks to determine inconsistencies
0185   /// within the chunk array's internal data.
0186   ///
0187   /// This is O(k*n) where k is the number of array descendents,
0188   /// and n is the length in elements.
0189   ///
0190   /// \return Status
0191   Status ValidateFull() const;
0192 
0193  protected:
0194   ArrayVector chunks_;
0195   std::shared_ptr<DataType> type_;
0196   int64_t length_;
0197   int64_t null_count_;
0198 
0199  private:
0200   template <typename T, typename V>
0201   friend class ::arrow::stl::ChunkedArrayIterator;
0202   ChunkResolver chunk_resolver_;
0203   ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
0204 };
0205 
0206 namespace internal {
0207 
0208 /// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous
0209 /// pieces of potentially differently-chunked ChunkedArray objects
0210 class ARROW_EXPORT MultipleChunkIterator {
0211  public:
0212   MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right)
0213       : left_(left),
0214         right_(right),
0215         pos_(0),
0216         length_(left.length()),
0217         chunk_idx_left_(0),
0218         chunk_idx_right_(0),
0219         chunk_pos_left_(0),
0220         chunk_pos_right_(0) {}
0221 
0222   bool Next(std::shared_ptr<Array>* next_left, std::shared_ptr<Array>* next_right);
0223 
0224   int64_t position() const { return pos_; }
0225 
0226  private:
0227   const ChunkedArray& left_;
0228   const ChunkedArray& right_;
0229 
0230   // The amount of the entire ChunkedArray consumed
0231   int64_t pos_;
0232 
0233   // Length of the chunked array(s)
0234   int64_t length_;
0235 
0236   // Current left chunk
0237   int chunk_idx_left_;
0238 
0239   // Current right chunk
0240   int chunk_idx_right_;
0241 
0242   // Offset into the current left chunk
0243   int64_t chunk_pos_left_;
0244 
0245   // Offset into the current right chunk
0246   int64_t chunk_pos_right_;
0247 };
0248 
0249 /// \brief Evaluate binary function on two ChunkedArray objects having possibly
0250 /// different chunk layouts. The passed binary function / functor should have
0251 /// the following signature.
0252 ///
0253 ///    Status(const Array&, const Array&, int64_t)
0254 ///
0255 /// The third argument is the absolute position relative to the start of each
0256 /// ChunkedArray. The function is executed against each contiguous pair of
0257 /// array segments, slicing if necessary.
0258 ///
0259 /// For example, if two arrays have chunk sizes
0260 ///
0261 ///   left: [10, 10, 20]
0262 ///   right: [15, 10, 15]
0263 ///
0264 /// Then the following invocations take place (pseudocode)
0265 ///
0266 ///   func(left.chunk[0][0:10], right.chunk[0][0:10], 0)
0267 ///   func(left.chunk[1][0:5], right.chunk[0][10:15], 10)
0268 ///   func(left.chunk[1][5:10], right.chunk[1][0:5], 15)
0269 ///   func(left.chunk[2][0:5], right.chunk[1][5:10], 20)
0270 ///   func(left.chunk[2][5:20], right.chunk[2][:], 25)
0271 template <typename Action>
0272 Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right,
0273                           Action&& action) {
0274   MultipleChunkIterator iterator(left, right);
0275   std::shared_ptr<Array> left_piece, right_piece;
0276   while (iterator.Next(&left_piece, &right_piece)) {
0277     ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position()));
0278   }
0279   return Status::OK();
0280 }
0281 
0282 }  // namespace internal
0283 }  // namespace arrow