Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:54

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <utility>
0023 
0024 #include "parquet/metadata.h"
0025 #include "parquet/platform.h"
0026 #include "parquet/properties.h"
0027 #include "parquet/schema.h"
0028 
0029 namespace parquet {
0030 
     // Forward declaration only: this header refers to ColumnWriter solely through
     // ColumnWriter* return types, so the full class definition is not needed here.
0031 class ColumnWriter;
0032 
0033 // FIXME: copied from reader-internal.cc
     //
     // Four-byte magic markers, stored as ASCII character codes in uint8_t arrays.
     // Both arrays are `static` at namespace scope in a header, so every translation
     // unit that includes this file gets its own private copy.
     //
     // kParquetMagic holds the bytes 'P', 'A', 'R', '1' ("PAR1").
0034 static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
     // kParquetEMagic holds the bytes 'P', 'A', 'R', 'E' ("PARE").
     // NOTE(review): presumably the marker for files whose footer is encrypted --
     // confirm against the Parquet format specification.
0035 static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
0036 
     /// \brief Writer for the columns of a single row group.
     ///
     /// Holds a Contents implementation in a std::unique_ptr and declares the same
     /// operations as that interface (presumably forwarding to it; the member
     /// function definitions are not part of this header -- confirm in the .cc file).
0037 class PARQUET_EXPORT RowGroupWriter {
0038  public:
0039   // Abstract interface 'Contents' (every operation is pure virtual), used to aid
0040   // dependency injection and to more easily create test fixtures.
0041   // An implementation of the Contents class is defined in the .cc file
0042   struct Contents {
         // Virtual so that an implementation can be destroyed through a Contents
         // pointer, e.g. by the std::unique_ptr<Contents> that RowGroupWriter holds.
0043     virtual ~Contents() = default;
         // Same names and signatures as RowGroupWriter::num_columns() / num_rows().
0044     virtual int num_columns() const = 0;
0045     virtual int64_t num_rows() const = 0;
0046 
0047     // To be used only with ParquetFileWriter::AppendRowGroup
0048     virtual ColumnWriter* NextColumn() = 0;
0049     // To be used only with ParquetFileWriter::AppendBufferedRowGroup
0050     virtual ColumnWriter* column(int i) = 0;
0051 
         // Const-qualified here, whereas RowGroupWriter::current_column() is not.
0052     virtual int current_column() const = 0;
0053     virtual void Close() = 0;
0054 
0055     /// \brief total uncompressed bytes written by the page writer
0056     virtual int64_t total_bytes_written() const = 0;
0057     /// \brief total bytes still compressed but not written by the page writer
0058     virtual int64_t total_compressed_bytes() const = 0;
0059     /// \brief total compressed bytes written by the page writer
0060     virtual int64_t total_compressed_bytes_written() const = 0;
0061 
         // Counterpart of RowGroupWriter::buffered().
0062     virtual bool buffered() const = 0;
0063   };
0064 
       /// Construct a writer from a Contents implementation.
       ///
       /// \p contents is a std::unique_ptr passed by value, so the caller gives up
       /// ownership. `explicit` rules out implicit conversion from such a pointer.
0065   explicit RowGroupWriter(std::unique_ptr<Contents> contents);
0066 
0067   /// Construct a ColumnWriter for the indicated row group-relative column.
0068   ///
0069   /// To be used only with ParquetFileWriter::AppendRowGroup
0070   /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
0071   /// valid until the next call to NextColumn or Close. As the contents are
0072   /// directly written to the sink, once a new column is started, the contents
0073   /// of the previous one cannot be modified anymore.
0074   ColumnWriter* NextColumn();
0075   /// Index of currently written column. Equal to -1 if NextColumn()
0076   /// has not been called yet.
       ///
       /// NOTE(review): not const-qualified, although Contents::current_column() is.
0077   int current_column();
       /// Close this row group writer.
       ///
       /// As stated on NextColumn() and column(), ColumnWriter pointers obtained from
       /// this object are not valid after Close, and in buffered mode Close is the
       /// point at which the buffered contents are written to the sink.
0078   void Close();
0079 
0080   int num_columns() const;
0081 
0082   /// Construct a ColumnWriter for the indicated row group column.
0083   ///
0084   /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
0085   /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
0086   /// valid until Close. The contents are buffered in memory and written to sink
0087   /// on Close
0088   ColumnWriter* column(int i);
0089 
0090   /**
0091    * Number of rows that shall be written as part of this RowGroup.
0092    */
0093   int64_t num_rows() const;
0094 
0095   /// \brief total uncompressed bytes written by the page writer
0096   int64_t total_bytes_written() const;
0097   /// \brief total bytes still compressed but not written by the page writer.
0098   /// It will always return 0 from the SerializedPageWriter.
0099   int64_t total_compressed_bytes() const;
0100   /// \brief total compressed bytes written by the page writer
0101   int64_t total_compressed_bytes_written() const;
0102 
0103   /// Returns whether the current RowGroupWriter is in the buffered mode and is created
0104   /// by calling ParquetFileWriter::AppendBufferedRowGroup.
0105   bool buffered() const;
0106 
0107  private:
0108   // Sole owner of the Contents implementation passed to the constructor's
       // parameter type; destroyed together with this RowGroupWriter.
0109   std::unique_ptr<Contents> contents_;
0110 };
0111 
     /// Declaration only; the definition is not part of this header.
     ///
     /// Judging by the name and signature, serializes \p file_metadata into \p sink.
     /// The exact bytes emitted are decided by the implementation -- confirm there.
     ///
     /// \param file_metadata metadata object, taken by const reference
     /// \param sink raw pointer to the output stream
0112 PARQUET_EXPORT
0113 void WriteFileMetaData(const FileMetaData& file_metadata,
0114                        ::arrow::io::OutputStream* sink);
0115 
     /// Declaration only; the definition is not part of this header.
     ///
     /// Takes exactly the same parameters as WriteFileMetaData, and the two names
     /// differ only in word order. How the output differs cannot be told from this
     /// header.
     /// NOTE(review): presumably this variant produces a standalone metadata file
     /// rather than a footer -- confirm against the implementation and document it.
     ///
     /// \param file_metadata metadata object, taken by const reference
     /// \param sink raw pointer to the output stream
0116 PARQUET_EXPORT
0117 void WriteMetaDataFile(const FileMetaData& file_metadata,
0118                        ::arrow::io::OutputStream* sink);
0119 
     /// Declaration only; the definition is not part of this header.
     ///
     /// This first declaration spells the stream type as ArrowOutputStream and has
     /// no default arguments.
     /// NOTE(review): if ArrowOutputStream is an alias of ::arrow::io::OutputStream
     /// (the alias is not visible here), this and the next declaration name the same
     /// function; otherwise they are two distinct overloads. Confirm.
     ///
     /// \param file_metadata metadata object, taken by const reference
     /// \param sink raw pointer to the output stream
     /// \param encryptor shared pointer to an Encryptor, taken by const reference
     /// \param encrypt_footer flag; its effect is decided by the implementation
0120 PARQUET_EXPORT
0121 void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
0122                                 ArrowOutputStream* sink,
0123                                 const std::shared_ptr<Encryptor>& encryptor,
0124                                 bool encrypt_footer);
0125 
     /// Same parameter list as the declaration above, with the stream type spelled
     /// as ::arrow::io::OutputStream and with default arguments: \p encryptor
     /// defaults to NULLPTR and \p encrypt_footer defaults to false, so this
     /// declaration allows a call with only \p file_metadata and \p sink.
0126 PARQUET_EXPORT
0127 void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
0128                                 ::arrow::io::OutputStream* sink,
0129                                 const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
0130                                 bool encrypt_footer = false);
     /// Declaration only; the definition is not part of this header.
     ///
     /// Judging by the name and signature, serializes \p crypto_metadata into
     /// \p sink -- confirm against the implementation.
     ///
     /// \param crypto_metadata FileCryptoMetaData object, taken by const reference
     /// \param sink raw pointer to the output stream
0131 PARQUET_EXPORT
0132 void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
0133                              ::arrow::io::OutputStream* sink);
0134 
     /// \brief Writer for a whole file, made up of row groups.
     ///
     /// Row groups are started with AppendRowGroup() or AppendBufferedRowGroup(),
     /// each of which hands back a RowGroupWriter owned by this object. The class
     /// holds a Contents implementation in a std::unique_ptr; member functions are
     /// only declared in this header and defined elsewhere.
0135 class PARQUET_EXPORT ParquetFileWriter {
0136  public:
0137   // Abstract base 'Contents' (most operations are pure virtual), used to aid
0138   // dependency injection and to more easily create test fixtures.
0139   // An implementation of the Contents class is defined in the .cc file
       //
       // Because this is a struct, the data members schema_, key_value_metadata_ and
       // file_metadata_ declared below are public.
0140   struct Contents {
         /// \param schema group node; moved into schema_.Init()
         /// \param key_value_metadata moved into key_value_metadata_
         ///
         /// schema_ is default-constructed first and initialized in the body.
         /// file_metadata_ does not appear in the initializer list, so it starts out
         /// as an empty std::shared_ptr.
0141     Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
0142              std::shared_ptr<const KeyValueMetadata> key_value_metadata)
0143         : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
0144       schema_.Init(std::move(schema));
0145     }
         // Virtual so that an implementation can be destroyed through a Contents
         // pointer, e.g. by the std::unique_ptr<Contents> that ParquetFileWriter holds.
0146     virtual ~Contents() {}
0147     // Perform any cleanup associated with the file contents
0148     virtual void Close() = 0;
0149 
         // Counterparts of ParquetFileWriter::AppendRowGroup() and
         // ParquetFileWriter::AppendBufferedRowGroup().
0150     virtual RowGroupWriter* AppendRowGroup() = 0;
0151     virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
0152 
         // Counterparts of the identically named ParquetFileWriter accessors.
0153     virtual int64_t num_rows() const = 0;
0154     virtual int num_columns() const = 0;
0155     virtual int num_row_groups() const = 0;
0156 
0157     virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
0158 
         // Non-virtual accessor: returns a reference to the member stored in this base.
0159     const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
0160       return key_value_metadata_;
0161     }
0162 
0163     virtual void AddKeyValueMetadata(
0164         const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) = 0;
0165 
0166     // Return const-pointer to make it clear that this object is not to be copied
0167     const SchemaDescriptor* schema() const { return &schema_; }
0168 
         // Schema descriptor, initialized in the constructor from the `schema` argument.
0169     SchemaDescriptor schema_;
0170 
0171     /// This should be the only place this is stored. Everything else is a const reference
0172     std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
0173 
         // Returns a reference to file_metadata_. Nothing in this header assigns that
         // member, so it is empty unless an implementation sets it.
0174     const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
0175     std::shared_ptr<FileMetaData> file_metadata_;
0176   };
0177 
0178   ParquetFileWriter();
0179   ~ParquetFileWriter();
0180 
       /// Create a ParquetFileWriter and return it in a std::unique_ptr.
       ///
       /// \param sink output stream, shared with the caller through a std::shared_ptr
       /// \param schema group node describing the file schema
       /// \param properties writer configuration; defaults to default_writer_properties()
       /// \param key_value_metadata optional file-level metadata; defaults to NULLPTR
0181   static std::unique_ptr<ParquetFileWriter> Open(
0182       std::shared_ptr<::arrow::io::OutputStream> sink,
0183       std::shared_ptr<schema::GroupNode> schema,
0184       std::shared_ptr<WriterProperties> properties = default_writer_properties(),
0185       std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
0186 
       /// Non-static overload taking a ready-made Contents implementation.
       ///
       /// \p contents is a std::unique_ptr passed by value, so the caller gives up
       /// ownership.
0187   void Open(std::unique_ptr<Contents> contents);
       /// Close the writer. Per the notes below, RowGroupWriter pointers handed out
       /// earlier are not valid after this call, AddKeyValueMetadata() throws once
       /// this has been called, and metadata() only becomes available afterwards.
0188   void Close();
0189 
0190   /// Construct a RowGroupWriter with an arbitrary number of rows.
0191   ///
0192   /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
0193   /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
0194   RowGroupWriter* AppendRowGroup();
0195 
0196   /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
0197   /// Use this if you want to write a RowGroup based on a certain size
0198   ///
0199   /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
0200   /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
0201   RowGroupWriter* AppendBufferedRowGroup();
0202 
0203   /// \brief Add key-value metadata to the file.
0204   /// \param[in] key_value_metadata the metadata to add.
0205   /// \note This will overwrite any existing metadata with the same key(s).
0206   /// \throw ParquetException if Close() has been called.
0207   void AddKeyValueMetadata(
0208       const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);
0209 
0210   /// Number of columns.
0211   ///
0212   /// This number is fixed during the lifetime of the writer as it is determined via
0213   /// the schema.
0214   int num_columns() const;
0215 
0216   /// Number of rows in the RowGroups started so far.
0217   ///
0218   /// Changes on the addition of a new RowGroup.
0219   int64_t num_rows() const;
0220 
0221   /// Number of started RowGroups.
0222   int num_row_groups() const;
0223 
0224   /// Configuration passed to the writer, e.g. the used Parquet format version.
0225   const std::shared_ptr<WriterProperties>& properties() const;
0226 
0227   /// Returns the file schema descriptor
0228   const SchemaDescriptor* schema() const;
0229 
0230   /// Returns a column descriptor in schema
       /// \param i index of the column (NOTE(review): valid range not stated here)
0231   const ColumnDescriptor* descr(int i) const;
0232 
0233   /// Returns the file custom metadata
0234   const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
0235 
0236   /// Returns the file metadata, only available after calling Close().
       ///
       /// Unlike Contents::metadata(), this returns the shared_ptr by value (a copy),
       /// not by reference.
0237   const std::shared_ptr<FileMetaData> metadata() const;
0238 
0239  private:
0240   // Holds a pointer to an instance of Contents implementation
0241   std::unique_ptr<Contents> contents_;
       // Separate member from Contents::file_metadata_, although it has the same name.
       // NOTE(review): presumably filled in on Close() so that metadata() can return
       // it -- confirm in the .cc file.
0242   std::shared_ptr<FileMetaData> file_metadata_;
0243 };
0244 
0245 }  // namespace parquet