Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 
0023 #include "parquet/platform.h"
0024 #include "parquet/properties.h"
0025 
0026 namespace arrow {
0027 
0028 class Array;
0029 class ChunkedArray;
0030 class RecordBatch;
0031 class Schema;
0032 class Table;
0033 
0034 }  // namespace arrow
0035 
0036 namespace parquet {
0037 
0038 class FileMetaData;
0039 class ParquetFileWriter;
0040 
0041 namespace arrow {
0042 
/// \brief Iterative FileWriter class
///
/// For basic usage, can write a Table at a time, creating one or more row
/// groups per write call.
///
/// For advanced usage, can write column-by-column: Start a new RowGroup or
/// Chunk with NewRowGroup, then write column-by-column the whole column chunk.
///
/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
/// value is a nonnegative integer, then it will be used as the field_id in the parquet
/// file.
class PARQUET_EXPORT FileWriter {
 public:
  /// \brief Create a FileWriter wrapping an already-constructed low-level writer.
  ///
  /// \param pool memory pool to use.
  /// \param writer underlying Parquet file writer (ownership is transferred).
  /// \param schema Arrow schema of the data that will be written.
  /// \param arrow_properties Arrow-specific writer properties.
  /// \param[out] out receives the created FileWriter on success.
  static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
                              std::shared_ptr<::arrow::Schema> schema,
                              std::shared_ptr<ArrowWriterProperties> arrow_properties,
                              std::unique_ptr<FileWriter>* out);

  /// \brief Try to create an Arrow to Parquet file writer.
  ///
  /// \param schema schema of data that will be passed.
  /// \param pool memory pool to use.
  /// \param sink output stream to write Parquet data.
  /// \param properties general Parquet writer properties.
  /// \param arrow_properties Arrow-specific writer properties.
  /// \return the created writer, or an error status.
  ///
  /// \since 11.0.0
  static ::arrow::Result<std::unique_ptr<FileWriter>> Open(
      const ::arrow::Schema& schema, MemoryPool* pool,
      std::shared_ptr<::arrow::io::OutputStream> sink,
      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
      std::shared_ptr<ArrowWriterProperties> arrow_properties =
          default_arrow_writer_properties());

  /// Return the Arrow schema to be written to.
  virtual std::shared_ptr<::arrow::Schema> schema() const = 0;

  /// \brief Write a Table to Parquet.
  ///
  /// \param table Arrow table to write.
  /// \param chunk_size maximum number of rows to write per row group.
  virtual ::arrow::Status WriteTable(
      const ::arrow::Table& table, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH) = 0;

  /// \brief Start a new row group.
  ///
  /// Returns an error if not all columns have been written.
  virtual ::arrow::Status NewRowGroup() = 0;

  /// \brief Write ColumnChunk in row group using an array.
  /// \param data array written in its entirety as the next column chunk.
  virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;

  /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
  /// \param data chunked array to slice rows from.
  /// \param offset index of the first row to write.
  /// \param size number of rows to write.
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
      int64_t size) = 0;

  /// \brief Write ColumnChunk in a row group using a ChunkedArray
  /// \param data chunked array written in its entirety.
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;

  /// \brief Start a new buffered row group.
  ///
  /// Returns an error if not all columns have been written.
  virtual ::arrow::Status NewBufferedRowGroup() = 0;

  /// \brief Write a RecordBatch into the buffered row group.
  ///
  /// Multiple RecordBatches can be written into the same row group
  /// through this method.
  ///
  /// WriterProperties.max_row_group_length() is respected and a new
  /// row group will be created if the current row group exceeds the
  /// limit.
  ///
  /// Batches get flushed to the output stream once NewBufferedRowGroup()
  /// or Close() is called.
  ///
  /// WARNING: If you are writing multiple files in parallel in the same
  /// executor, deadlock may occur if ArrowWriterProperties::use_threads
  /// is set to true to write columns in parallel. Please disable use_threads
  /// option in this case.
  ///
  /// \param batch record batch to append to the current buffered row group.
  virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) = 0;

  /// \brief Write the footer and close the file.
  virtual ::arrow::Status Close() = 0;
  virtual ~FileWriter();

  /// \brief Return the memory pool this writer was created with.
  virtual MemoryPool* memory_pool() const = 0;
  /// \brief Add key-value metadata to the file.
  /// \param[in] key_value_metadata the metadata to add.
  /// \note This will overwrite any existing metadata with the same key.
  /// \return Error if Close() has been called.
  ///
  /// WARNING: If `store_schema` is enabled, `ARROW:schema` would be stored
  /// in the key-value metadata. Overwriting this key would result in
  /// `store_schema` being unusable during read.
  virtual ::arrow::Status AddKeyValueMetadata(
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
  /// \brief Return the file metadata, only available after calling Close().
  virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
};
0145 
/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
/// \param file_metadata the metadata to serialize.
/// \param sink output stream the serialized metadata is written to.
PARQUET_EXPORT
::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);
0150 
/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
/// \param file_metadata the metadata to write out.
/// \param sink output stream the metadata-only file is written to.
PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);
0155 
/// \brief Write a Table to Parquet.
///
/// This writes one table in a single shot. To write a Parquet file with
/// multiple tables iteratively, see parquet::arrow::FileWriter.
///
/// \param table Table to write.
/// \param pool memory pool to use.
/// \param sink output stream to write Parquet data.
/// \param chunk_size maximum number of rows to write per row group.
/// \param properties general Parquet writer properties.
/// \param arrow_properties Arrow-specific writer properties.
/// \return Status indicating success, or the first error encountered.
::arrow::Status PARQUET_EXPORT
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
           std::shared_ptr<::arrow::io::OutputStream> sink,
           int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH,
           std::shared_ptr<WriterProperties> properties = default_writer_properties(),
           std::shared_ptr<ArrowWriterProperties> arrow_properties =
               default_arrow_writer_properties());
0174 
0175 }  // namespace arrow
0176 }  // namespace parquet