|
|
|||
File indexing completed on 2026-04-17 08:28:53
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 0023 #include "parquet/platform.h" 0024 #include "parquet/properties.h" 0025 0026 namespace arrow { 0027 0028 class Array; 0029 class ChunkedArray; 0030 class RecordBatch; 0031 class Schema; 0032 class Table; 0033 0034 } // namespace arrow 0035 0036 namespace parquet { 0037 0038 class FileMetaData; 0039 class ParquetFileWriter; 0040 0041 namespace arrow { 0042 0043 /// \brief Iterative FileWriter class 0044 /// 0045 /// For basic usage, can write a Table at a time, creating one or more row 0046 /// groups per write call. 0047 /// 0048 /// For advanced usage, can write column-by-column: Start a new RowGroup or 0049 /// Chunk with NewRowGroup, then write column-by-column the whole column chunk. 0050 /// 0051 /// If PARQUET:field_id is present as a metadata key on a field, and the corresponding 0052 /// value is a nonnegative integer, then it will be used as the field_id in the parquet 0053 /// file. 
class PARQUET_EXPORT FileWriter {
 public:
  /// \brief Create a FileWriter wrapping an already-constructed ParquetFileWriter.
  ///
  /// \param pool memory pool used for buffering column data.
  /// \param writer low-level Parquet writer (takes ownership); it must have been
  ///        opened on the desired output sink.
  /// \param schema Arrow schema of the data that will be written.
  /// \param arrow_properties Arrow-specific writer properties.
  /// \param[out] out the created FileWriter.
  static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
                              std::shared_ptr<::arrow::Schema> schema,
                              std::shared_ptr<ArrowWriterProperties> arrow_properties,
                              std::unique_ptr<FileWriter>* out);

  /// \brief Try to create an Arrow to Parquet file writer.
  ///
  /// \param schema schema of data that will be passed.
  /// \param pool memory pool to use.
  /// \param sink output stream to write Parquet data.
  /// \param properties general Parquet writer properties.
  /// \param arrow_properties Arrow-specific writer properties.
  ///
  /// \return the created FileWriter, or an error Status.
  ///
  /// \since 11.0.0
  static ::arrow::Result<std::unique_ptr<FileWriter>> Open(
      const ::arrow::Schema& schema, MemoryPool* pool,
      std::shared_ptr<::arrow::io::OutputStream> sink,
      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
      std::shared_ptr<ArrowWriterProperties> arrow_properties =
          default_arrow_writer_properties());

  /// Return the Arrow schema to be written to.
  virtual std::shared_ptr<::arrow::Schema> schema() const = 0;

  /// \brief Write a Table to Parquet.
  ///
  /// Creates one or more row groups, each holding at most chunk_size rows.
  ///
  /// \param table Arrow table to write.
  /// \param chunk_size maximum number of rows to write per row group.
  virtual ::arrow::Status WriteTable(
      const ::arrow::Table& table, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH) = 0;

  /// \brief Start a new row group.
  ///
  /// Returns an error if not all columns have been written.
  virtual ::arrow::Status NewRowGroup() = 0;

  /// \brief Write ColumnChunk in row group using an array.
  virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;

  /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
  ///
  /// \param data chunked array to slice from.
  /// \param offset index of the first row to write.
  /// \param size number of rows to write.
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
      int64_t size) = 0;

  /// \brief Write ColumnChunk in a row group using a ChunkedArray
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;

  /// \brief Start a new buffered row group.
  ///
  /// Returns an error if not all columns have been written.
  virtual ::arrow::Status NewBufferedRowGroup() = 0;

  /// \brief Write a RecordBatch into the buffered row group.
  ///
  /// Multiple RecordBatches can be written into the same row group
  /// through this method.
  ///
  /// WriterProperties.max_row_group_length() is respected and a new
  /// row group will be created if the current row group exceeds the
  /// limit.
  ///
  /// Batches get flushed to the output stream once NewBufferedRowGroup()
  /// or Close() is called.
  ///
  /// WARNING: If you are writing multiple files in parallel in the same
  /// executor, deadlock may occur if ArrowWriterProperties::use_threads
  /// is set to true to write columns in parallel. Please disable use_threads
  /// option in this case.
  virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) = 0;

  /// \brief Write the footer and close the file.
  ///
  /// After Close() returns, no further writes are possible; metadata()
  /// becomes available.
  virtual ::arrow::Status Close() = 0;

  // Virtual destructor: FileWriter is a polymorphic base deleted through
  // unique_ptr<FileWriter>, so destruction must dispatch to the derived class.
  virtual ~FileWriter();

  /// Return the memory pool used for buffering writes.
  virtual MemoryPool* memory_pool() const = 0;

  /// \brief Add key-value metadata to the file.
  /// \param[in] key_value_metadata the metadata to add.
  /// \note This will overwrite any existing metadata with the same key.
  /// \return Error if Close() has been called.
  ///
  /// WARNING: If `store_schema` is enabled, `ARROW:schema` would be stored
  /// in the key-value metadata. Overwriting this key would result in
  /// `store_schema` being unusable during read.
  virtual ::arrow::Status AddKeyValueMetadata(
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;

  /// \brief Return the file metadata, only available after calling Close().
  virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
};

/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
///
/// \param file_metadata the metadata to serialize.
/// \param sink stream to receive the serialized footer bytes.
PARQUET_EXPORT
::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);

/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
///
/// Unlike WriteFileMetaData, this produces a complete (data-less) Parquet
/// file, not just the footer payload.
PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);

/// \brief Write a Table to Parquet.
///
/// This writes one table in a single shot. To write a Parquet file with
/// multiple tables iteratively, see parquet::arrow::FileWriter.
///
/// \param table Table to write.
/// \param pool memory pool to use.
/// \param sink output stream to write Parquet data.
/// \param chunk_size maximum number of rows to write per row group.
/// \param properties general Parquet writer properties.
/// \param arrow_properties Arrow-specific writer properties.
::arrow::Status PARQUET_EXPORT
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
           std::shared_ptr<::arrow::io::OutputStream> sink,
           int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH,
           std::shared_ptr<WriterProperties> properties = default_writer_properties(),
           std::shared_ptr<ArrowWriterProperties> arrow_properties =
               default_arrow_writer_properties());

}  // namespace arrow
}  // namespace parquet
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by version 2.3.7 of the LXR engine, maintained by the LXR team. |
|