![]() |
|
|||
File indexing completed on 2025-08-27 08:47:21
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <string> 0023 #include <vector> 0024 0025 #include "arrow/chunked_array.h" // IWYU pragma: keep 0026 #include "arrow/record_batch.h" 0027 #include "arrow/status.h" 0028 #include "arrow/type.h" 0029 #include "arrow/type_fwd.h" 0030 #include "arrow/util/macros.h" 0031 #include "arrow/util/visibility.h" 0032 0033 namespace arrow { 0034 0035 class Array; 0036 class ChunkedArray; 0037 class KeyValueMetadata; 0038 class MemoryPool; 0039 0040 /// \class Table 0041 /// \brief Logical table as sequence of chunked arrays 0042 class ARROW_EXPORT Table { 0043 public: 0044 virtual ~Table() = default; 0045 0046 /// \brief Construct a Table from schema and columns 0047 /// 0048 /// If columns is zero-length, the table's number of rows is zero 0049 /// 0050 /// \param[in] schema The table schema (column types) 0051 /// \param[in] columns The table's columns as chunked arrays 0052 /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns 0053 static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema, 0054 std::vector<std::shared_ptr<ChunkedArray>> columns, 0055 int64_t num_rows = -1); 0056 0057 /// \brief Construct a Table from schema and arrays 0058 /// 0059 /// \param[in] schema The table schema (column types) 0060 /// \param[in] arrays The table's columns as arrays 0061 /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns 0062 static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema, 0063 const std::vector<std::shared_ptr<Array>>& arrays, 0064 int64_t num_rows = -1); 0065 0066 /// \brief Create an empty Table of a given schema 0067 /// 0068 /// The output Table will be created with a single empty chunk per column. 0069 /// 0070 /// \param[in] schema the schema of the empty Table 0071 /// \param[in] pool the memory pool to allocate memory from 0072 /// \return the resulting Table 0073 static Result<std::shared_ptr<Table>> MakeEmpty( 0074 std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()); 0075 0076 /// \brief Construct a Table from a RecordBatchReader. 0077 /// 0078 /// \param[in] reader the arrow::RecordBatchReader that produces batches 0079 static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader); 0080 0081 /// \brief Construct a Table from RecordBatches, using schema supplied by the first 0082 /// RecordBatch. 0083 /// 0084 /// \param[in] batches a std::vector of record batches 0085 static Result<std::shared_ptr<Table>> FromRecordBatches( 0086 const std::vector<std::shared_ptr<RecordBatch>>& batches); 0087 0088 /// \brief Construct a Table from RecordBatches, using supplied schema. There may be 0089 /// zero record batches 0090 /// 0091 /// \param[in] schema the arrow::Schema for each batch 0092 /// \param[in] batches a std::vector of record batches 0093 static Result<std::shared_ptr<Table>> FromRecordBatches( 0094 std::shared_ptr<Schema> schema, 0095 const std::vector<std::shared_ptr<RecordBatch>>& batches); 0096 0097 /// \brief Construct a Table from a chunked StructArray. One column will be produced 0098 /// for each field of the StructArray. 0099 /// 0100 /// \param[in] array a chunked StructArray 0101 static Result<std::shared_ptr<Table>> FromChunkedStructArray( 0102 const std::shared_ptr<ChunkedArray>& array); 0103 0104 /// \brief Return the table schema 0105 const std::shared_ptr<Schema>& schema() const { return schema_; } 0106 0107 /// \brief Return a column by index 0108 virtual std::shared_ptr<ChunkedArray> column(int i) const = 0; 0109 0110 /// \brief Return vector of all columns for table 0111 virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0; 0112 0113 /// Return a column's field by index 0114 std::shared_ptr<Field> field(int i) const { return schema_->field(i); } 0115 0116 /// \brief Return vector of all fields for table 0117 std::vector<std::shared_ptr<Field>> fields() const; 0118 0119 /// \brief Construct a zero-copy slice of the table with the 0120 /// indicated offset and length 0121 /// 0122 /// \param[in] offset the index of the first row in the constructed 0123 /// slice 0124 /// \param[in] length the number of rows of the slice. If there are not enough 0125 /// rows in the table, the length will be adjusted accordingly 0126 /// 0127 /// \return a new object wrapped in std::shared_ptr<Table> 0128 virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0; 0129 0130 /// \brief Slice from first row at offset until end of the table 0131 std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); } 0132 0133 /// \brief Return a column by name 0134 /// \param[in] name field name 0135 /// \return an Array or null if no field was found 0136 std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const { 0137 auto i = schema_->GetFieldIndex(name); 0138 return i == -1 ? NULLPTR : column(i); 0139 } 0140 0141 /// \brief Remove column from the table, producing a new Table 0142 virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0; 0143 0144 /// \brief Add column to the table, producing a new Table 0145 virtual Result<std::shared_ptr<Table>> AddColumn( 0146 int i, std::shared_ptr<Field> field_arg, 0147 std::shared_ptr<ChunkedArray> column) const = 0; 0148 0149 /// \brief Replace a column in the table, producing a new Table 0150 virtual Result<std::shared_ptr<Table>> SetColumn( 0151 int i, std::shared_ptr<Field> field_arg, 0152 std::shared_ptr<ChunkedArray> column) const = 0; 0153 0154 /// \brief Return names of all columns 0155 std::vector<std::string> ColumnNames() const; 0156 0157 /// \brief Rename columns with provided names 0158 Result<std::shared_ptr<Table>> RenameColumns( 0159 const std::vector<std::string>& names) const; 0160 0161 /// \brief Return new table with specified columns 0162 Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const; 0163 0164 /// \brief Replace schema key-value metadata with new metadata 0165 /// \since 0.5.0 0166 /// 0167 /// \param[in] metadata new KeyValueMetadata 0168 /// \return new Table 0169 virtual std::shared_ptr<Table> ReplaceSchemaMetadata( 0170 const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0; 0171 0172 /// \brief Flatten the table, producing a new Table. Any column with a 0173 /// struct type will be flattened into multiple columns 0174 /// 0175 /// \param[in] pool The pool for buffer allocations, if any 0176 virtual Result<std::shared_ptr<Table>> Flatten( 0177 MemoryPool* pool = default_memory_pool()) const = 0; 0178 0179 /// \return PrettyPrint representation suitable for debugging 0180 std::string ToString() const; 0181 0182 /// \brief Perform cheap validation checks to determine obvious inconsistencies 0183 /// within the table's schema and internal data. 0184 /// 0185 /// This is O(k*m) where k is the total number of field descendents, 0186 /// and m is the number of chunks. 0187 /// 0188 /// \return Status 0189 virtual Status Validate() const = 0; 0190 0191 /// \brief Perform extensive validation checks to determine inconsistencies 0192 /// within the table's schema and internal data. 0193 /// 0194 /// This is O(k*n) where k is the total number of field descendents, 0195 /// and n is the number of rows. 0196 /// 0197 /// \return Status 0198 virtual Status ValidateFull() const = 0; 0199 0200 /// \brief Return the number of columns in the table 0201 int num_columns() const { return schema_->num_fields(); } 0202 0203 /// \brief Return the number of rows (equal to each column's logical length) 0204 int64_t num_rows() const { return num_rows_; } 0205 0206 /// \brief Determine if tables are equal 0207 /// 0208 /// Two tables can be equal only if they have equal schemas. 0209 /// However, they may be equal even if they have different chunkings. 0210 bool Equals(const Table& other, bool check_metadata = false) const; 0211 0212 /// \brief Make a new table by combining the chunks this table has. 0213 /// 0214 /// All the underlying chunks in the ChunkedArray of each column are 0215 /// concatenated into zero or one chunk. 0216 /// 0217 /// \param[in] pool The pool for buffer allocations 0218 Result<std::shared_ptr<Table>> CombineChunks( 0219 MemoryPool* pool = default_memory_pool()) const; 0220 0221 /// \brief Make a new record batch by combining the chunks this table has. 0222 /// 0223 /// All the underlying chunks in the ChunkedArray of each column are 0224 /// concatenated into a single chunk. 0225 /// 0226 /// \param[in] pool The pool for buffer allocations 0227 Result<std::shared_ptr<RecordBatch>> CombineChunksToBatch( 0228 MemoryPool* pool = default_memory_pool()) const; 0229 0230 protected: 0231 Table(); 0232 0233 std::shared_ptr<Schema> schema_; 0234 int64_t num_rows_; 0235 0236 private: 0237 ARROW_DISALLOW_COPY_AND_ASSIGN(Table); 0238 }; 0239 0240 /// \brief Compute a stream of record batches from a (possibly chunked) Table 0241 /// 0242 /// The conversion is zero-copy: each record batch is a view over a slice 0243 /// of the table's columns. 0244 /// 0245 /// The table is expected to be valid prior to using it with the batch reader. 0246 class ARROW_EXPORT TableBatchReader : public RecordBatchReader { 0247 public: 0248 /// \brief Construct a TableBatchReader for the given table 0249 explicit TableBatchReader(const Table& table); 0250 explicit TableBatchReader(std::shared_ptr<Table> table); 0251 0252 std::shared_ptr<Schema> schema() const override; 0253 0254 Status ReadNext(std::shared_ptr<RecordBatch>* out) override; 0255 0256 /// \brief Set the desired maximum number of rows for record batches 0257 /// 0258 /// The actual number of rows in each record batch may be smaller, depending 0259 /// on actual chunking characteristics of each table column. 0260 void set_chunksize(int64_t chunksize); 0261 0262 private: 0263 std::shared_ptr<Table> owned_table_; 0264 const Table& table_; 0265 std::vector<ChunkedArray*> column_data_; 0266 std::vector<int> chunk_numbers_; 0267 std::vector<int64_t> chunk_offsets_; 0268 int64_t absolute_row_position_; 0269 int64_t max_chunksize_; 0270 }; 0271 0272 /// \defgroup concat-tables ConcatenateTables function. 0273 /// 0274 /// ConcatenateTables function. 0275 /// @{ 0276 0277 /// \brief Controls the behavior of ConcatenateTables(). 0278 struct ARROW_EXPORT ConcatenateTablesOptions { 0279 /// If true, the schemas of the tables will be first unified with fields of 0280 /// the same name being merged, according to `field_merge_options`, then each 0281 /// table will be promoted to the unified schema before being concatenated. 0282 /// Otherwise, all tables should have the same schema. Each column in the output table 0283 /// is the result of concatenating the corresponding columns in all input tables. 0284 bool unify_schemas = false; 0285 0286 /// options to control how fields are merged when unifying schemas 0287 /// 0288 /// This field will be ignored if unify_schemas is false 0289 Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults(); 0290 0291 static ConcatenateTablesOptions Defaults() { return {}; } 0292 }; 0293 0294 /// \brief Construct a new table from multiple input tables. 0295 /// 0296 /// The new table is assembled from existing column chunks without copying, 0297 /// if schemas are identical. If schemas do not match exactly and 0298 /// unify_schemas is enabled in options (off by default), an attempt is 0299 /// made to unify them, and then column chunks are converted to their 0300 /// respective unified datatype, which will probably incur a copy. 0301 /// :func:`arrow::PromoteTableToSchema` is used to unify schemas. 0302 /// 0303 /// Tables are concatenated in order they are provided in and the order of 0304 /// rows within tables will be preserved. 0305 /// 0306 /// \param[in] tables a std::vector of Tables to be concatenated 0307 /// \param[in] options specify how to unify schema of input tables 0308 /// \param[in] memory_pool MemoryPool to be used if null-filled arrays need to 0309 /// be created or if existing column chunks need to endure type conversion 0310 /// \return new Table 0311 0312 ARROW_EXPORT 0313 Result<std::shared_ptr<Table>> ConcatenateTables( 0314 const std::vector<std::shared_ptr<Table>>& tables, 0315 ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(), 0316 MemoryPool* memory_pool = default_memory_pool()); 0317 0318 namespace compute { 0319 class CastOptions; 0320 } 0321 0322 /// \brief Promotes a table to conform to the given schema. 0323 /// 0324 /// If a field in the schema does not have a corresponding column in 0325 /// the table, a column of nulls will be added to the resulting table. 0326 /// If the corresponding column is of type Null, it will be promoted 0327 /// to the type specified by schema, with null values filled. The 0328 /// column will be casted to the type specified by the schema. 0329 /// 0330 /// Returns an error: 0331 /// - if the corresponding column's type is not compatible with the 0332 /// schema. 0333 /// - if there is a column in the table that does not exist in the schema. 0334 /// - if the cast fails or casting would be required but is not available. 0335 /// 0336 /// \param[in] table the input Table 0337 /// \param[in] schema the target schema to promote to 0338 /// \param[in] pool The memory pool to be used if null-filled arrays need to 0339 /// be created. 0340 ARROW_EXPORT 0341 Result<std::shared_ptr<Table>> PromoteTableToSchema( 0342 const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema, 0343 MemoryPool* pool = default_memory_pool()); 0344 0345 /// \brief Promotes a table to conform to the given schema. 0346 /// 0347 /// If a field in the schema does not have a corresponding column in 0348 /// the table, a column of nulls will be added to the resulting table. 0349 /// If the corresponding column is of type Null, it will be promoted 0350 /// to the type specified by schema, with null values filled. The column 0351 /// will be casted to the type specified by the schema. 0352 /// 0353 /// Returns an error: 0354 /// - if the corresponding column's type is not compatible with the 0355 /// schema. 0356 /// - if there is a column in the table that does not exist in the schema. 0357 /// - if the cast fails or casting would be required but is not available. 0358 /// 0359 /// \param[in] table the input Table 0360 /// \param[in] schema the target schema to promote to 0361 /// \param[in] options The cast options to allow promotion of types 0362 /// \param[in] pool The memory pool to be used if null-filled arrays need to 0363 /// be created. 0364 ARROW_EXPORT 0365 Result<std::shared_ptr<Table>> PromoteTableToSchema( 0366 const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema, 0367 const compute::CastOptions& options, MemoryPool* pool = default_memory_pool()); 0368 0369 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |