![]() |
|
|||
File indexing completed on 2025-08-27 08:47:18
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <string> 0023 #include <utility> 0024 #include <vector> 0025 0026 #include "arrow/chunk_resolver.h" 0027 #include "arrow/compare.h" 0028 #include "arrow/device_allocation_type_set.h" 0029 #include "arrow/result.h" 0030 #include "arrow/status.h" 0031 #include "arrow/type_fwd.h" 0032 #include "arrow/util/macros.h" 0033 #include "arrow/util/visibility.h" 0034 0035 namespace arrow { 0036 0037 class Array; 0038 class DataType; 0039 class MemoryPool; 0040 namespace stl { 0041 template <typename T, typename V> 0042 class ChunkedArrayIterator; 0043 } // namespace stl 0044 0045 /// \class ChunkedArray 0046 /// \brief A data structure managing a list of primitive Arrow arrays logically 0047 /// as one large array 0048 /// 0049 /// Data chunking is treated throughout this project largely as an 0050 /// implementation detail for performance and memory use optimization. 0051 /// ChunkedArray allows Array objects to be collected and interpreted 0052 /// as a single logical array without requiring an expensive concatenation 0053 /// step. 0054 /// 0055 /// In some cases, data produced by a function may exceed the capacity of an 0056 /// Array (like BinaryArray or StringArray) and so returning multiple Arrays is 0057 /// the only possibility. In these cases, we recommend returning a ChunkedArray 0058 /// instead of vector of Arrays or some alternative. 0059 /// 0060 /// When data is processed in parallel, it may not be practical or possible to 0061 /// create large contiguous memory allocations and write output into them. With 0062 /// some data types, like binary and string types, it is not possible at all to 0063 /// produce non-chunked array outputs without requiring a concatenation step at 0064 /// the end of processing. 0065 /// 0066 /// Application developers may tune chunk sizes based on analysis of 0067 /// performance profiles but many developer-users will not need to be 0068 /// especially concerned with the chunking details. 0069 /// 0070 /// Preserving the chunk layout/sizes in processing steps is generally not 0071 /// considered to be a contract in APIs. A function may decide to alter the 0072 /// chunking of its result. Similarly, APIs accepting multiple ChunkedArray 0073 /// inputs should not expect the chunk layout to be the same in each input. 0074 class ARROW_EXPORT ChunkedArray { 0075 public: 0076 ChunkedArray(ChunkedArray&&) = default; 0077 ChunkedArray& operator=(ChunkedArray&&) = default; 0078 0079 /// \brief Construct a chunked array from a single Array 0080 explicit ChunkedArray(std::shared_ptr<Array> chunk) 0081 : ChunkedArray(ArrayVector{std::move(chunk)}) {} 0082 0083 /// \brief Construct a chunked array from a vector of arrays and an optional data type 0084 /// 0085 /// The vector elements must have the same data type. 0086 /// If the data type is passed explicitly, the vector may be empty. 0087 /// If the data type is omitted, the vector must be non-empty. 0088 explicit ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR); 0089 0090 // \brief Constructor with basic input validation. 0091 static Result<std::shared_ptr<ChunkedArray>> Make( 0092 ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR); 0093 0094 /// \brief Create an empty ChunkedArray of a given type 0095 /// 0096 /// The output ChunkedArray will have one chunk with an empty 0097 /// array of the given type. 0098 /// 0099 /// \param[in] type the data type of the empty ChunkedArray 0100 /// \param[in] pool the memory pool to allocate memory from 0101 /// \return the resulting ChunkedArray 0102 static Result<std::shared_ptr<ChunkedArray>> MakeEmpty( 0103 std::shared_ptr<DataType> type, MemoryPool* pool = default_memory_pool()); 0104 0105 /// \return the total length of the chunked array; computed on construction 0106 int64_t length() const { return length_; } 0107 0108 /// \return the total number of nulls among all chunks 0109 int64_t null_count() const { return null_count_; } 0110 0111 /// \return the total number of chunks in the chunked array 0112 int num_chunks() const { return static_cast<int>(chunks_.size()); } 0113 0114 /// \return chunk a particular chunk from the chunked array 0115 const std::shared_ptr<Array>& chunk(int i) const { return chunks_[i]; } 0116 0117 /// \return an ArrayVector of chunks 0118 const ArrayVector& chunks() const { return chunks_; } 0119 0120 /// \return The set of device allocation types used by the chunks in this 0121 /// chunked array. 0122 DeviceAllocationTypeSet device_types() const; 0123 0124 /// \return true if all chunks are allocated on CPU-accessible memory. 0125 bool is_cpu() const { return device_types().is_cpu_only(); } 0126 0127 /// \brief Construct a zero-copy slice of the chunked array with the 0128 /// indicated offset and length 0129 /// 0130 /// \param[in] offset the position of the first element in the constructed 0131 /// slice 0132 /// \param[in] length the length of the slice. If there are not enough 0133 /// elements in the chunked array, the length will be adjusted accordingly 0134 /// 0135 /// \return a new object wrapped in std::shared_ptr<ChunkedArray> 0136 std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const; 0137 0138 /// \brief Slice from offset until end of the chunked array 0139 std::shared_ptr<ChunkedArray> Slice(int64_t offset) const; 0140 0141 /// \brief Flatten this chunked array as a vector of chunked arrays, one 0142 /// for each struct field 0143 /// 0144 /// \param[in] pool The pool for buffer allocations, if any 0145 Result<std::vector<std::shared_ptr<ChunkedArray>>> Flatten( 0146 MemoryPool* pool = default_memory_pool()) const; 0147 0148 /// Construct a zero-copy view of this chunked array with the given 0149 /// type. Calls Array::View on each constituent chunk. Always succeeds if 0150 /// there are zero chunks 0151 Result<std::shared_ptr<ChunkedArray>> View(const std::shared_ptr<DataType>& type) const; 0152 0153 /// \brief Return the type of the chunked array 0154 const std::shared_ptr<DataType>& type() const { return type_; } 0155 0156 /// \brief Return a Scalar containing the value of this array at index 0157 Result<std::shared_ptr<Scalar>> GetScalar(int64_t index) const; 0158 0159 /// \brief Determine if two chunked arrays are equal. 0160 /// 0161 /// Two chunked arrays can be equal only if they have equal datatypes. 0162 /// However, they may be equal even if they have different chunkings. 0163 bool Equals(const ChunkedArray& other, 0164 const EqualOptions& opts = EqualOptions::Defaults()) const; 0165 /// \brief Determine if two chunked arrays are equal. 0166 bool Equals(const std::shared_ptr<ChunkedArray>& other, 0167 const EqualOptions& opts = EqualOptions::Defaults()) const; 0168 /// \brief Determine if two chunked arrays approximately equal 0169 bool ApproxEquals(const ChunkedArray& other, 0170 const EqualOptions& = EqualOptions::Defaults()) const; 0171 0172 /// \return PrettyPrint representation suitable for debugging 0173 std::string ToString() const; 0174 0175 /// \brief Perform cheap validation checks to determine obvious inconsistencies 0176 /// within the chunk array's internal data. 0177 /// 0178 /// This is O(k*m) where k is the number of array descendents, 0179 /// and m is the number of chunks. 0180 /// 0181 /// \return Status 0182 Status Validate() const; 0183 0184 /// \brief Perform extensive validation checks to determine inconsistencies 0185 /// within the chunk array's internal data. 0186 /// 0187 /// This is O(k*n) where k is the number of array descendents, 0188 /// and n is the length in elements. 0189 /// 0190 /// \return Status 0191 Status ValidateFull() const; 0192 0193 protected: 0194 ArrayVector chunks_; 0195 std::shared_ptr<DataType> type_; 0196 int64_t length_; 0197 int64_t null_count_; 0198 0199 private: 0200 template <typename T, typename V> 0201 friend class ::arrow::stl::ChunkedArrayIterator; 0202 ChunkResolver chunk_resolver_; 0203 ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); 0204 }; 0205 0206 namespace internal { 0207 0208 /// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous 0209 /// pieces of potentially differently-chunked ChunkedArray objects 0210 class ARROW_EXPORT MultipleChunkIterator { 0211 public: 0212 MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right) 0213 : left_(left), 0214 right_(right), 0215 pos_(0), 0216 length_(left.length()), 0217 chunk_idx_left_(0), 0218 chunk_idx_right_(0), 0219 chunk_pos_left_(0), 0220 chunk_pos_right_(0) {} 0221 0222 bool Next(std::shared_ptr<Array>* next_left, std::shared_ptr<Array>* next_right); 0223 0224 int64_t position() const { return pos_; } 0225 0226 private: 0227 const ChunkedArray& left_; 0228 const ChunkedArray& right_; 0229 0230 // The amount of the entire ChunkedArray consumed 0231 int64_t pos_; 0232 0233 // Length of the chunked array(s) 0234 int64_t length_; 0235 0236 // Current left chunk 0237 int chunk_idx_left_; 0238 0239 // Current right chunk 0240 int chunk_idx_right_; 0241 0242 // Offset into the current left chunk 0243 int64_t chunk_pos_left_; 0244 0245 // Offset into the current right chunk 0246 int64_t chunk_pos_right_; 0247 }; 0248 0249 /// \brief Evaluate binary function on two ChunkedArray objects having possibly 0250 /// different chunk layouts. The passed binary function / functor should have 0251 /// the following signature. 0252 /// 0253 /// Status(const Array&, const Array&, int64_t) 0254 /// 0255 /// The third argument is the absolute position relative to the start of each 0256 /// ChunkedArray. The function is executed against each contiguous pair of 0257 /// array segments, slicing if necessary. 0258 /// 0259 /// For example, if two arrays have chunk sizes 0260 /// 0261 /// left: [10, 10, 20] 0262 /// right: [15, 10, 15] 0263 /// 0264 /// Then the following invocations take place (pseudocode) 0265 /// 0266 /// func(left.chunk[0][0:10], right.chunk[0][0:10], 0) 0267 /// func(left.chunk[1][0:5], right.chunk[0][10:15], 10) 0268 /// func(left.chunk[1][5:10], right.chunk[1][0:5], 15) 0269 /// func(left.chunk[2][0:5], right.chunk[1][5:10], 20) 0270 /// func(left.chunk[2][5:20], right.chunk[2][:], 25) 0271 template <typename Action> 0272 Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right, 0273 Action&& action) { 0274 MultipleChunkIterator iterator(left, right); 0275 std::shared_ptr<Array> left_piece, right_piece; 0276 while (iterator.Next(&left_piece, &right_piece)) { 0277 ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position())); 0278 } 0279 return Status::OK(); 0280 } 0281 0282 } // namespace internal 0283 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |