Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-27 08:47:21

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string>
0023 #include <vector>
0024 
0025 #include "arrow/chunked_array.h"  // IWYU pragma: keep
0026 #include "arrow/record_batch.h"
0027 #include "arrow/status.h"
0028 #include "arrow/type.h"
0029 #include "arrow/type_fwd.h"
0030 #include "arrow/util/macros.h"
0031 #include "arrow/util/visibility.h"
0032 
0033 namespace arrow {
0034 
0035 class Array;
0036 class ChunkedArray;
0037 class KeyValueMetadata;
0038 class MemoryPool;
0039 
0040 /// \class Table
0041 /// \brief Logical table as sequence of chunked arrays
0042 class ARROW_EXPORT Table {
0043  public:
0044   virtual ~Table() = default;
0045 
0046   /// \brief Construct a Table from schema and columns
0047   ///
0048   /// If columns is zero-length, the table's number of rows is zero
0049   ///
0050   /// \param[in] schema The table schema (column types)
0051   /// \param[in] columns The table's columns as chunked arrays
0052   /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
0053   static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
0054                                      std::vector<std::shared_ptr<ChunkedArray>> columns,
0055                                      int64_t num_rows = -1);
0056 
0057   /// \brief Construct a Table from schema and arrays
0058   ///
0059   /// \param[in] schema The table schema (column types)
0060   /// \param[in] arrays The table's columns as arrays
0061   /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
0062   static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
0063                                      const std::vector<std::shared_ptr<Array>>& arrays,
0064                                      int64_t num_rows = -1);
0065 
0066   /// \brief Create an empty Table of a given schema
0067   ///
0068   /// The output Table will be created with a single empty chunk per column.
0069   ///
0070   /// \param[in] schema the schema of the empty Table
0071   /// \param[in] pool the memory pool to allocate memory from
0072   /// \return the resulting Table
0073   static Result<std::shared_ptr<Table>> MakeEmpty(
0074       std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool());
0075 
0076   /// \brief Construct a Table from a RecordBatchReader.
0077   ///
0078   /// \param[in] reader the arrow::RecordBatchReader that produces batches
0079   static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader);
0080 
0081   /// \brief Construct a Table from RecordBatches, using schema supplied by the first
0082   /// RecordBatch.
0083   ///
0084   /// \param[in] batches a std::vector of record batches
0085   static Result<std::shared_ptr<Table>> FromRecordBatches(
0086       const std::vector<std::shared_ptr<RecordBatch>>& batches);
0087 
0088   /// \brief Construct a Table from RecordBatches, using supplied schema. There may be
0089   /// zero record batches
0090   ///
0091   /// \param[in] schema the arrow::Schema for each batch
0092   /// \param[in] batches a std::vector of record batches
0093   static Result<std::shared_ptr<Table>> FromRecordBatches(
0094       std::shared_ptr<Schema> schema,
0095       const std::vector<std::shared_ptr<RecordBatch>>& batches);
0096 
0097   /// \brief Construct a Table from a chunked StructArray. One column will be produced
0098   /// for each field of the StructArray.
0099   ///
0100   /// \param[in] array a chunked StructArray
0101   static Result<std::shared_ptr<Table>> FromChunkedStructArray(
0102       const std::shared_ptr<ChunkedArray>& array);
0103 
0104   /// \brief Return the table schema
0105   const std::shared_ptr<Schema>& schema() const { return schema_; }
0106 
0107   /// \brief Return a column by index
0108   virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
0109 
0110   /// \brief Return vector of all columns for table
0111   virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
0112 
0113   /// \brief Return a column's field by index
0114   std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
0115 
0116   /// \brief Return vector of all fields for table
0117   std::vector<std::shared_ptr<Field>> fields() const;
0118 
0119   /// \brief Construct a zero-copy slice of the table with the
0120   /// indicated offset and length
0121   ///
0122   /// \param[in] offset the index of the first row in the constructed
0123   /// slice
0124   /// \param[in] length the number of rows of the slice. If there are not enough
0125   /// rows in the table, the length will be adjusted accordingly
0126   ///
0127   /// \return a new object wrapped in std::shared_ptr<Table>
0128   virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0;
0129 
0130   /// \brief Slice from first row at offset until end of the table
0131   std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); }
0132 
0133   /// \brief Return a column by name
0134   /// \param[in] name field name
0135   /// \return the column as a ChunkedArray, or null if no field was found
0136   std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const {
0137     auto i = schema_->GetFieldIndex(name);
0138     return i == -1 ? NULLPTR : column(i);
0139   }
0140 
0141   /// \brief Remove column from the table, producing a new Table
0142   virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0;
0143 
0144   /// \brief Add column to the table, producing a new Table
0145   virtual Result<std::shared_ptr<Table>> AddColumn(
0146       int i, std::shared_ptr<Field> field_arg,
0147       std::shared_ptr<ChunkedArray> column) const = 0;
0148 
0149   /// \brief Replace a column in the table, producing a new Table
0150   virtual Result<std::shared_ptr<Table>> SetColumn(
0151       int i, std::shared_ptr<Field> field_arg,
0152       std::shared_ptr<ChunkedArray> column) const = 0;
0153 
0154   /// \brief Return names of all columns
0155   std::vector<std::string> ColumnNames() const;
0156 
0157   /// \brief Rename columns with provided names
0158   Result<std::shared_ptr<Table>> RenameColumns(
0159       const std::vector<std::string>& names) const;
0160 
0161   /// \brief Return new table with specified columns
0162   Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
0163 
0164   /// \brief Replace schema key-value metadata with new metadata
0165   /// \since 0.5.0
0166   ///
0167   /// \param[in] metadata new KeyValueMetadata
0168   /// \return new Table
0169   virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
0170       const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
0171 
0172   /// \brief Flatten the table, producing a new Table.  Any column with a
0173   /// struct type will be flattened into multiple columns
0174   ///
0175   /// \param[in] pool The pool for buffer allocations, if any
0176   virtual Result<std::shared_ptr<Table>> Flatten(
0177       MemoryPool* pool = default_memory_pool()) const = 0;
0178 
0179   /// \return PrettyPrint representation suitable for debugging
0180   std::string ToString() const;
0181 
0182   /// \brief Perform cheap validation checks to determine obvious inconsistencies
0183   /// within the table's schema and internal data.
0184   ///
0185   /// This is O(k*m) where k is the total number of field descendants,
0186   /// and m is the number of chunks.
0187   ///
0188   /// \return Status
0189   virtual Status Validate() const = 0;
0190 
0191   /// \brief Perform extensive validation checks to determine inconsistencies
0192   /// within the table's schema and internal data.
0193   ///
0194   /// This is O(k*n) where k is the total number of field descendants,
0195   /// and n is the number of rows.
0196   ///
0197   /// \return Status
0198   virtual Status ValidateFull() const = 0;
0199 
0200   /// \brief Return the number of columns in the table
0201   int num_columns() const { return schema_->num_fields(); }
0202 
0203   /// \brief Return the number of rows (equal to each column's logical length)
0204   int64_t num_rows() const { return num_rows_; }
0205 
0206   /// \brief Determine if tables are equal
0207   ///
0208   /// Two tables can be equal only if they have equal schemas.
0209   /// However, they may be equal even if they have different chunkings.
0210   bool Equals(const Table& other, bool check_metadata = false) const;
0211 
0212   /// \brief Make a new table by combining the chunks this table has.
0213   ///
0214   /// All the underlying chunks in the ChunkedArray of each column are
0215   /// concatenated into zero or one chunk.
0216   ///
0217   /// \param[in] pool The pool for buffer allocations
0218   Result<std::shared_ptr<Table>> CombineChunks(
0219       MemoryPool* pool = default_memory_pool()) const;
0220 
0221   /// \brief Make a new record batch by combining the chunks this table has.
0222   ///
0223   /// All the underlying chunks in the ChunkedArray of each column are
0224   /// concatenated into a single chunk.
0225   ///
0226   /// \param[in] pool The pool for buffer allocations
0227   Result<std::shared_ptr<RecordBatch>> CombineChunksToBatch(
0228       MemoryPool* pool = default_memory_pool()) const;
0229 
0230  protected:
0231   Table();
0232 
0233   std::shared_ptr<Schema> schema_;
0234   int64_t num_rows_;
0235 
0236  private:
0237   ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
0238 };
0239 
0240 /// \brief Compute a stream of record batches from a (possibly chunked) Table
0241 ///
0242 /// The conversion is zero-copy: each record batch is a view over a slice
0243 /// of the table's columns.
0244 ///
0245 /// The table is expected to be valid prior to using it with the batch reader.
0246 class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
0247  public:
0248   /// \brief Construct a TableBatchReader for the given table
     ///
     /// NOTE(review): the reader keeps a `const Table&` member, so the table
     /// presumably has to outlive the reader -- confirm against the implementation.
0249   explicit TableBatchReader(const Table& table);
     /// \brief Construct a TableBatchReader from a shared pointer to a table
     ///
     /// NOTE(review): presumably the pointer is retained in owned_table_ so the
     /// table stays alive for the reader's lifetime -- confirm against the
     /// implementation.
0250   explicit TableBatchReader(std::shared_ptr<Table> table);
0251 
0252   std::shared_ptr<Schema> schema() const override;
0253 
0254   Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
0255 
0256   /// \brief Set the desired maximum number of rows for record batches
0257   ///
0258   /// The actual number of rows in each record batch may be smaller, depending
0259   /// on actual chunking characteristics of each table column.
0260   void set_chunksize(int64_t chunksize);
0261 
0262  private:
     // NOTE(review): table_ is a reference member; owned_table_ looks like the
     // optional owner of the referenced table -- confirm in the implementation.
0263   std::shared_ptr<Table> owned_table_;
0264   const Table& table_;
0265   std::vector<ChunkedArray*> column_data_;
0266   std::vector<int> chunk_numbers_;
0267   std::vector<int64_t> chunk_offsets_;
0268   int64_t absolute_row_position_;
0269   int64_t max_chunksize_;
0270 };
0271 
0272 /// \defgroup concat-tables ConcatenateTables function.
0273 ///
0274 /// ConcatenateTables function.
0275 /// @{
0276 
0277 /// \brief Controls the behavior of ConcatenateTables().
0278 struct ARROW_EXPORT ConcatenateTablesOptions {
0279   /// If true, the schemas of the tables will be first unified with fields of
0280   /// the same name being merged, according to `field_merge_options`, then each
0281   /// table will be promoted to the unified schema before being concatenated.
0282   /// Otherwise, all tables should have the same schema. Each column in the output table
0283   /// is the result of concatenating the corresponding columns in all input tables.
0284   bool unify_schemas = false;
0285 
0286   /// Options to control how fields are merged when unifying schemas.
0287   ///
0288   /// This field will be ignored if unify_schemas is false.
0289   Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
0290 
     /// \brief Return options with every member at its default value
0291   static ConcatenateTablesOptions Defaults() { return {}; }
0292 };
0293 
0294 /// \brief Construct a new table from multiple input tables.
0295 ///
0296 /// The new table is assembled from existing column chunks without copying,
0297 /// if schemas are identical. If schemas do not match exactly and
0298 /// unify_schemas is enabled in options (off by default), an attempt is
0299 /// made to unify them, and then column chunks are converted to their
0300 /// respective unified datatype, which will probably incur a copy.
0301 /// arrow::PromoteTableToSchema() is used to unify schemas.
0302 ///
0303 /// Tables are concatenated in order they are provided in and the order of
0304 /// rows within tables will be preserved.
0305 ///
0306 /// \param[in] tables a std::vector of Tables to be concatenated
0307 /// \param[in] options specify how to unify schema of input tables
0308 /// \param[in] memory_pool MemoryPool to be used if null-filled arrays need to
0309 /// be created or if existing column chunks need to undergo type conversion
0310 /// \return new Table
0311 
0312 ARROW_EXPORT
0313 Result<std::shared_ptr<Table>> ConcatenateTables(
0314     const std::vector<std::shared_ptr<Table>>& tables,
0315     ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(),
0316     MemoryPool* memory_pool = default_memory_pool());
0317 
     // Forward declaration only: CastOptions is taken by const reference in the
     // PromoteTableToSchema() overload declared later in this header.
0318 namespace compute {
0319 class CastOptions;
0320 }
0321 
0322 /// \brief Promotes a table to conform to the given schema.
0323 ///
0324 /// If a field in the schema does not have a corresponding column in
0325 /// the table, a column of nulls will be added to the resulting table.
0326 /// If the corresponding column is of type Null, it will be promoted
0327 /// to the type specified by schema, with null values filled. The
0328 /// column will be cast to the type specified by the schema.
0329 ///
0330 /// Returns an error:
0331 /// - if the corresponding column's type is not compatible with the
0332 ///   schema.
0333 /// - if there is a column in the table that does not exist in the schema.
0334 /// - if the cast fails or casting would be required but is not available.
0335 ///
0336 /// \param[in] table the input Table
0337 /// \param[in] schema the target schema to promote to
0338 /// \param[in] pool The memory pool to be used if null-filled arrays need to
0339 /// be created.
0340 ARROW_EXPORT
0341 Result<std::shared_ptr<Table>> PromoteTableToSchema(
0342     const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
0343     MemoryPool* pool = default_memory_pool());
0344 
0345 /// \brief Promotes a table to conform to the given schema.
0346 ///
0347 /// If a field in the schema does not have a corresponding column in
0348 /// the table, a column of nulls will be added to the resulting table.
0349 /// If the corresponding column is of type Null, it will be promoted
0350 /// to the type specified by schema, with null values filled. The column
0351 /// will be cast to the type specified by the schema.
0352 ///
0353 /// Returns an error:
0354 /// - if the corresponding column's type is not compatible with the
0355 ///   schema.
0356 /// - if there is a column in the table that does not exist in the schema.
0357 /// - if the cast fails or casting would be required but is not available.
0358 ///
0359 /// \param[in] table the input Table
0360 /// \param[in] schema the target schema to promote to
0361 /// \param[in] options The cast options to allow promotion of types
0362 /// \param[in] pool The memory pool to be used if null-filled arrays need to
0363 /// be created.
0364 ARROW_EXPORT
0365 Result<std::shared_ptr<Table>> PromoteTableToSchema(
0366     const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
0367     const compute::CastOptions& options, MemoryPool* pool = default_memory_pool());

     // Close the concat-tables Doxygen group opened with @{ earlier in this file;
     // without this the group is left open until end of file.
     /// @}
0368 
0369 }  // namespace arrow