Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:54

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <map>
0022 #include <memory>
0023 #include <optional>
0024 #include <string>
0025 #include <vector>
0026 
0027 #include "parquet/encryption/type_fwd.h"
0028 #include "parquet/platform.h"
0029 #include "parquet/properties.h"
0030 #include "parquet/type_fwd.h"
0031 
0032 namespace parquet {
0033 
// Alias that lets code in namespace parquet refer to Arrow's KeyValueMetadata
// without the ::arrow:: qualifier.
0034 using KeyValueMetadata = ::arrow::KeyValueMetadata;
0035 
/// \brief Name, build and parsed version of the application that wrote a file.
///
/// All data members are public. The member functions compare two versions and
/// decide whether the statistics written by this version can be trusted.
0036 class PARQUET_EXPORT ApplicationVersion {
0037  public:
0038   // Known Versions with Issues
  // Each accessor returns a reference to a constant ApplicationVersion named after
  // an issue identifier. NOTE(review): presumably the first version in which that
  // issue is fixed -- confirm against the definitions.
0039   static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
0040   static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
0041   static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
0042   static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
0043   static const ApplicationVersion& PARQUET_CPP_10353_FIXED_VERSION();
0044 
0045   // Application that wrote the file. e.g. "IMPALA"
0046   std::string application_;
0047   // Build name
0048   std::string build_;
0049 
0050   // Version of the application that wrote the file, expressed as
0051   // (<major>.<minor>.<patch>). Unmatched parts default to 0.
0052   // "1.2.3"    => {1, 2, 3}
0053   // "1.2"      => {1, 2, 0}
0054   // "1.2-cdh5" => {1, 2, 0}
  //
  // The numeric parts carry explicit zero initializers so that an object created
  // with `ApplicationVersion v;` (default-initialization through the defaulted
  // constructor below) never holds indeterminate integers. The struct is still an
  // aggregate under C++14 and later, so brace initialization such as
  // `version{1, 2, 3, "", "", ""}` keeps working.
0055   struct {
0056     int major = 0;
0057     int minor = 0;
0058     int patch = 0;
    // NOTE(review): the three strings presumably hold the unparsed remainder, the
    // pre-release tag and the build metadata of the version string -- confirm
    // against the created_by parser.
0059     std::string unknown;
0060     std::string pre_release;
0061     std::string build_info;
0062   } version;
0063 
  // Default construction yields empty strings and version {0, 0, 0}.
0064   ApplicationVersion() = default;
  // Construct from a "created_by" string (presumably parsed into the fields above;
  // the parsing rules live in the implementation file).
0065   explicit ApplicationVersion(const std::string& created_by);
  // Construct from an application name and explicit version numbers.
0066   ApplicationVersion(std::string application, int major, int minor, int patch);
0067 
0068   // Returns true if version is strictly less than other_version
0069   bool VersionLt(const ApplicationVersion& other_version) const;
0070 
0071   // Returns true if version is strictly equal with other_version
0072   bool VersionEq(const ApplicationVersion& other_version) const;
0073 
0074   // Checks if the Version has the correct statistics for a given column
  // sort_order defaults to SortOrder::SIGNED when the caller does not supply one.
0075   bool HasCorrectStatistics(Type::type primitive, const EncodedStatistics& statistics,
0076                             SortOrder::type sort_order = SortOrder::SIGNED) const;
0077 };
0078 
/// \brief Read-only accessor for the crypto metadata of a column.
///
/// Instances are obtained through Make(); the constructor is private and the state
/// lives in a forward-declared implementation class.
0079 class PARQUET_EXPORT ColumnCryptoMetaData {
0080  public:
  // Factory. NOTE(review): `metadata` presumably points at the underlying
  // serialized/Thrift crypto metadata; ownership and lifetime requirements are not
  // visible in this header -- confirm before relying on them.
0081   static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
0082   ~ColumnCryptoMetaData();
0083 
  // Compares this object with `other`.
0084   bool Equals(const ColumnCryptoMetaData& other) const;
0085 
  // Path of the column within the schema.
0086   std::shared_ptr<schema::ColumnPath> path_in_schema() const;
  // Whether the column is encrypted with the footer key (as opposed, presumably,
  // to a column-specific key).
0087   bool encrypted_with_footer_key() const;
  // Key metadata associated with the column.
0088   const std::string& key_metadata() const;
0089 
0090  private:
0091   explicit ColumnCryptoMetaData(const uint8_t* metadata);
0092 
  // Implementation class, defined outside this header.
0093   class ColumnCryptoMetaDataImpl;
0094   std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
0095 };
0096 
0097 /// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
///
/// Members have no default initializers; callers must set all three fields.
0098 struct PageEncodingStats {
  /// Type of the pages being counted.
0099   PageType::type page_type;
  /// Encoding used by those pages.
0100   Encoding::type encoding;
  /// Count for this (page_type, encoding) pair -- presumably the number of pages.
0101   int32_t count;
0102 };
0103 
0104 /// \brief Public struct for location to page index in ColumnChunkMetaData.
///
/// Members have no default initializers; callers must set both fields.
0105 struct IndexLocation {
0106   /// File offset of the given index, in bytes
0107   int64_t offset;
0108   /// Length of the given index, in bytes
0109   int32_t length;
0110 };
0111 
0112 /// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
///
/// Instances are created through Make(); the constructor is private and all state
/// is held by a forward-declared implementation class.
0113 class PARQUET_EXPORT ColumnChunkMetaData {
0114  public:
0115   // API convenience to get a MetaData accessor
  //
  // properties defaults to default_reader_properties(); writer_version and
  // file_decryptor default to NULLPTR; both ordinals default to -1.
  // NOTE(review): -1 presumably means "ordinal not provided" -- confirm in the
  // implementation.
0116   static std::unique_ptr<ColumnChunkMetaData> Make(
0117       const void* metadata, const ColumnDescriptor* descr,
0118       const ReaderProperties& properties = default_reader_properties(),
0119       const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
0120       int16_t column_ordinal = -1,
0121       std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0122 
0123   ~ColumnChunkMetaData();
0124 
  // Compares this object with `other`.
0125   bool Equals(const ColumnChunkMetaData& other) const;
0126 
0127   // Byte offset of `ColumnMetaData` in `file_path()`.
0128   //
0129   // Note that the meaning of this field has been inconsistent among implementations
0130   // so its use has since been deprecated in the Parquet specification. Modern
0131   // implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
0132   // contained in the `ColumnChunk` struct.
0133   int64_t file_offset() const;
0134 
0135   // File path of this column chunk; only used when a dataset is spread across multiple files
0136   const std::string& file_path() const;
0137 
0138   // column metadata
  // The is_*_set() predicates report whether the corresponding optional piece of
  // metadata is present.
0139   bool is_metadata_set() const;
0140   Type::type type() const;
0141   int64_t num_values() const;
0142   std::shared_ptr<schema::ColumnPath> path_in_schema() const;
0143   bool is_stats_set() const;
0144   bool is_geo_stats_set() const;
0145   std::shared_ptr<Statistics> statistics() const;
0146   std::shared_ptr<EncodedStatistics> encoded_statistics() const;
0147   std::shared_ptr<SizeStatistics> size_statistics() const;
0148   std::shared_ptr<geospatial::GeoStatistics> geo_statistics() const;
0149 
  // Compression codec of this column chunk.
0150   Compression::type compression() const;
0151   // Indicate if the ColumnChunk compression is supported by the current
0152   // compiled parquet library.
0153   bool can_decompress() const;
0154 
  // Encodings and per-page-type encoding statistics of this column chunk.
0155   const std::vector<Encoding::type>& encodings() const;
0156   const std::vector<PageEncodingStats>& encoding_stats() const;
  // Bloom filter offset and length; std::optional because either may be absent.
0157   std::optional<int64_t> bloom_filter_offset() const;
0158   std::optional<int64_t> bloom_filter_length() const;
  // Page offsets. NOTE(review): the value returned by dictionary_page_offset() /
  // index_page_offset() when the matching has_*_page() is false is not visible
  // here -- check the predicate first.
0159   bool has_dictionary_page() const;
0160   int64_t dictionary_page_offset() const;
0161   int64_t data_page_offset() const;
0162   bool has_index_page() const;
0163   int64_t index_page_offset() const;
  // Total sizes of the column chunk, compressed and uncompressed.
0164   int64_t total_compressed_size() const;
0165   int64_t total_uncompressed_size() const;
  // Crypto metadata accessor for this column chunk.
0166   std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
  // Locations of the column index and offset index; std::optional because either
  // may be absent.
0167   std::optional<IndexLocation> GetColumnIndexLocation() const;
0168   std::optional<IndexLocation> GetOffsetIndexLocation() const;
  // Key/value metadata attached to this column chunk.
0169   const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
0170 
0171  private:
  // Same inputs as Make(), but with the ordinals placed before `properties` and
  // without defaults for them.
0172   explicit ColumnChunkMetaData(
0173       const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
0174       int16_t column_ordinal, const ReaderProperties& properties,
0175       const ApplicationVersion* writer_version = NULLPTR,
0176       std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0177   // PIMPL Idiom
0178   class ColumnChunkMetaDataImpl;
0179   std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
0180 };
0181 
0182 /// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
///
/// Instances are created through Make(); the constructor is private and all state
/// is held by a forward-declared implementation class.
0183 class PARQUET_EXPORT RowGroupMetaData {
0184  public:
0185   /// \brief Create a RowGroupMetaData from a serialized thrift message.
  ///
  /// properties defaults to default_reader_properties(); writer_version and
  /// file_decryptor default to NULLPTR.
0186   static std::unique_ptr<RowGroupMetaData> Make(
0187       const void* metadata, const SchemaDescriptor* schema,
0188       const ReaderProperties& properties = default_reader_properties(),
0189       const ApplicationVersion* writer_version = NULLPTR,
0190       std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0191 
0192   ~RowGroupMetaData();
0193 
  /// \brief Compare this object with `other`.
0194   bool Equals(const RowGroupMetaData& other) const;
0195 
0196   /// \brief The number of columns in this row group. The order must match the
0197   /// parent's column ordering.
0198   int num_columns() const;
0199 
0200   /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
0201   ///
0202   /// WARNING, the returned object references memory location in its parent
0203   /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
0204   /// object.
0205   ///
0206   /// \param[in] index of the ColumnChunkMetaData to retrieve.
0207   ///
0208   /// \throws ParquetException if the index is out of bound.
0209   std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
0210 
0211   /// \brief Number of rows in this row group.
0212   int64_t num_rows() const;
0213 
0214   /// \brief Total byte size of all the uncompressed column data in this row group.
0215   int64_t total_byte_size() const;
0216 
0217   /// \brief Total byte size of all the compressed (and potentially encrypted)
0218   /// column data in this row group.
0219   ///
0220   /// This information is optional and may be 0 if omitted.
0221   int64_t total_compressed_size() const;
0222 
0223   /// \brief Byte offset from beginning of file to first page (data or
0224   /// dictionary) in this row group
0225   ///
0226   /// The file_offset field that this method exposes is optional. This method
0227   /// will return 0 if that field is not set to a meaningful value.
0228   int64_t file_offset() const;
0229   // Return const-pointer to make it clear that this object is not to be copied
0230   const SchemaDescriptor* schema() const;
0231   // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
0232   bool can_decompress() const;
0233   // Sorting columns of the row group if any.
  // Returned by value.
0234   std::vector<SortingColumn> sorting_columns() const;
0235 
0236  private:
  // Same inputs as Make(), but `properties` has no default here.
0237   explicit RowGroupMetaData(
0238       const void* metadata, const SchemaDescriptor* schema,
0239       const ReaderProperties& properties,
0240       const ApplicationVersion* writer_version = NULLPTR,
0241       std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0242   // PIMPL Idiom
0243   class RowGroupMetaDataImpl;
0244   std::unique_ptr<RowGroupMetaDataImpl> impl_;
0245 };
0246 
// Forward declaration: FileMetaData and FileCryptoMetaData name the builder as a
// friend before its full definition appears later in this header.
0247 class FileMetaDataBuilder;
0248 
0249 /// \brief FileMetaData is a proxy around format::FileMetaData.
///
/// Instances are created through Make() (or by the befriended builder/reader
/// classes); constructors are private and all state is held by a forward-declared
/// implementation class.
0250 class PARQUET_EXPORT FileMetaData {
0251  public:
0252   /// \brief Create a FileMetaData from a serialized thrift message.
  ///
  /// properties defaults to default_reader_properties() and file_decryptor to
  /// NULLPTR. NOTE(review): `inout_metadata_len` is an in/out parameter by name;
  /// presumably it carries the buffer length in and the consumed length out --
  /// confirm in the implementation.
0253   static std::shared_ptr<FileMetaData> Make(
0254       const void* serialized_metadata, uint32_t* inout_metadata_len,
0255       const ReaderProperties& properties = default_reader_properties(),
0256       std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0257 
0258   ~FileMetaData();
0259 
  /// \brief Compare this object with `other`.
0260   bool Equals(const FileMetaData& other) const;
0261 
0262   /// \brief The number of parquet "leaf" columns.
0263   ///
0264   /// Parquet thrift definition requires that nested schema elements are
0265   /// flattened. This method returns the number of columns in the flattened
0266   /// version.
0267   /// For instance, if the schema looks like this :
0268   /// 0 foo.bar
0269   ///       foo.bar.baz           0
0270   ///       foo.bar.baz2          1
0271   ///   foo.qux                   2
0272   /// 1 foo2                      3
0273   /// 2 foo3                      4
0274   /// This method will return 5, because there are 5 "leaf" fields (so 5
0275   /// flattened fields)
0276   int num_columns() const;
0277 
0278   /// \brief The number of flattened schema elements.
0279   ///
0280   /// Parquet thrift definition requires that nested schema elements are
0281   /// flattened. This method returns the total number of elements in the
0282   /// flattened list.
0283   int num_schema_elements() const;
0284 
0285   /// \brief The total number of rows.
0286   ///
0287   /// If the FileMetaData was obtained by calling `Subset()`, this is the total
0288   /// number of rows in the selected row groups.
0289   int64_t num_rows() const;
0290 
0291   /// \brief The number of row groups in the file.
0292   ///
0293   /// If the FileMetaData was obtained by calling `Subset()`, this is the number
0294   /// of selected row groups.
0295   int num_row_groups() const;
0296 
0297   /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
0298   ///
0299   /// WARNING, the returned object references memory location in its parent
0300   /// (FileMetaData) object. Hence, the parent must outlive the returned object.
0301   ///
0302   /// \param[in] index of the RowGroup to retrieve.
0303   ///
0304   /// \throws ParquetException if the index is out of bound.
0305   std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
0306 
0307   /// \brief Return the "version" of the file
0308   ///
0309   /// WARNING: The value returned by this method is unreliable as 1) the Parquet
0310   /// file metadata stores the version as a single integer and 2) some producers
0311   /// are known to always write a hardcoded value.  Therefore, you cannot use
0312   /// this value to know which features are used in the file.
0313   ParquetVersion::type version() const;
0314 
0315   /// \brief Return the application's user-agent string of the writer.
0316   const std::string& created_by() const;
0317 
0318   /// \brief Return the application's version of the writer.
0319   const ApplicationVersion& writer_version() const;
0320 
0321   /// \brief Size of the original thrift encoded metadata footer.
0322   uint32_t size() const;
0323 
0324   /// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed.
0325   ///
0326   /// This will return false if any of the RowGroup's page is compressed with a
0327   /// compression format which is not compiled in the current parquet library.
0328   bool can_decompress() const;
0329 
  /// \brief Encryption-related accessors.
  ///
  /// NOTE(review): encryption_algorithm() is presumably only meaningful when
  /// is_encryption_algorithm_set() returns true -- confirm in the implementation.
0330   bool is_encryption_algorithm_set() const;
0331   EncryptionAlgorithm encryption_algorithm() const;
0332   const std::string& footer_signing_key_metadata() const;
0333 
0334   /// \brief Verify signature of FileMetaData when file is encrypted but footer
0335   /// is not encrypted (plaintext footer).
  ///
  /// Unlike the accessors above, this member function is not declared const.
0336   bool VerifySignature(const void* signature);
0337 
  /// \brief Write the metadata to `dst`.
  ///
  /// `encryptor` defaults to NULLPTR (presumably meaning "write unencrypted").
0338   void WriteTo(::arrow::io::OutputStream* dst,
0339                const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
0340 
0341   /// \brief Return Thrift-serialized representation of the metadata as a
0342   /// string
0343   std::string SerializeToString() const;
0344 
0345   // Return const-pointer to make it clear that this object is not to be copied
0346   const SchemaDescriptor* schema() const;
0347 
  /// \brief Key/value metadata attached to the file.
0348   const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
0349 
0350   /// \brief Set a path to all ColumnChunk for all RowGroups.
0351   ///
0352   /// Commonly used by systems (Dask, Spark) that generate a metadata-only
0353   /// parquet file. The path is usually relative to said index file.
0354   ///
0355   /// \param[in] path to set.
0356   void set_file_path(const std::string& path);
0357 
0358   /// \brief Merge row groups from another metadata file into this one.
0359   ///
0360   /// The schema of the input FileMetaData must be equal to the
0361   /// schema of this object.
0362   ///
0363   /// This is used by systems that create an aggregate metadata-only file by
0364   /// concatenating the row groups of multiple files. This newly created
0365   /// metadata file acts as an index of all available row groups.
0366   ///
0367   /// \param[in] other FileMetaData to merge the row groups from.
0368   ///
0369   /// \throws ParquetException if schemas are not equal.
0370   void AppendRowGroups(const FileMetaData& other);
0371 
0372   /// \brief Return a FileMetaData containing a subset of the row groups in this
0373   /// FileMetaData.
  ///
  /// \param[in] row_groups ordinals of the row groups to keep.
0374   std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
0375 
0376   /// \brief Serialize metadata unencrypted as string
0377   ///
0378   /// \param[in] scrub whether to remove sensitive information from the metadata.
0379   /// \param[in] debug whether to serialize the metadata as Thrift (if false) or
0380   /// debug text (if true).
0381   std::string SerializeUnencrypted(bool scrub, bool debug) const;
0382 
0383  private:
  // These classes are granted access to the private constructors and to the
  // decryptor accessors below.
0384   friend FileMetaDataBuilder;
0385   friend class SerializedFile;
0386   friend class SerializedRowGroup;
0387 
  // Same inputs as Make(), but `properties` has no default here.
0388   explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
0389                         const ReaderProperties& properties,
0390                         std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0391 
  // Setter/getter for the file decryptor; private, so only usable by this class
  // and its friends.
0392   void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
0393   const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
0394 
0395   // PIMPL Idiom
  // The default constructor is private as well.
0396   FileMetaData();
0397   class FileMetaDataImpl;
0398   std::unique_ptr<FileMetaDataImpl> impl_;
0399 };
0400 
/// \brief Accessor for file-level crypto metadata.
///
/// Instances are created through Make() or by the befriended FileMetaDataBuilder;
/// constructors are private and state is held by a forward-declared implementation
/// class.
0401 class PARQUET_EXPORT FileCryptoMetaData {
0402  public:
0403   // API convenience to get a MetaData accessor
  // properties defaults to default_reader_properties().
  // NOTE(review): `metadata_len` is a pointer, so it may be updated by the call
  // (e.g. with the number of bytes consumed) -- confirm in the implementation.
0404   static std::shared_ptr<FileCryptoMetaData> Make(
0405       const uint8_t* serialized_metadata, uint32_t* metadata_len,
0406       const ReaderProperties& properties = default_reader_properties());
0407   ~FileCryptoMetaData();
0408 
  // Encryption algorithm and key metadata recorded for the file.
0409   EncryptionAlgorithm encryption_algorithm() const;
0410   const std::string& key_metadata() const;
0411 
  // Write this metadata to `dst`.
0412   void WriteTo(::arrow::io::OutputStream* dst) const;
0413 
0414  private:
  // The builder is granted access to the private constructors below.
0415   friend FileMetaDataBuilder;
0416   FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len,
0417                      const ReaderProperties& properties);
0418 
0419   // PIMPL Idiom
0420   FileCryptoMetaData();
0421   class FileCryptoMetaDataImpl;
0422   std::unique_ptr<FileCryptoMetaDataImpl> impl_;
0423 };
0424 
0425 // Builder API
// ColumnChunkMetaDataBuilder collects the metadata of a single column chunk.
// Instances are created through Make(); constructors are private.
0426 class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
0427  public:
0428   // API convenience to get a MetaData builder
0429   static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
0430       std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
0431 
  // Overload taking `contents`. NOTE(review): presumably caller-provided storage for
  // the underlying metadata struct that the builder fills in -- confirm ownership in
  // the implementation.
0432   static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
0433       std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
0434       void* contents);
0435 
0436   ~ColumnChunkMetaDataBuilder();
0437 
0438   // column chunk
0439   // Used when a dataset is spread across multiple files
0440   void set_file_path(const std::string& path);
0441 
0442   // column metadata
0443   void SetStatistics(const EncodedStatistics& stats);
0444   void SetSizeStatistics(const SizeStatistics& size_stats);
0445 
0446   // column geometry statistics
0447   void SetGeoStatistics(const geospatial::EncodedGeoStatistics& geo_stats);
0448 
  // key/value metadata to attach to the column chunk
0449   void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata> key_value_metadata);
0450 
0451   // get the column descriptor
0452   const ColumnDescriptor* descr() const;
0453 
  // total compressed size of the column chunk
0454   int64_t total_compressed_size() const;
0455 
0456 
  // commit the metadata
  // The two maps carry a count per encoding, one for dictionary pages and one for
  // data pages (by parameter name). `encryptor` defaults to NULLPTR.
0457   void Finish(int64_t num_values, int64_t dictionary_page_offset,
0458               int64_t index_page_offset, int64_t data_page_offset,
0459               int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
0460               bool dictionary_fallback,
0461               const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
0462               const std::map<Encoding::type, int32_t>& data_encoding_stats_,
0463               const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
0464 
0465   // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
0466   const void* contents() const;
0467 
0468   // For writing metadata at end of column chunk
0469   void WriteTo(::arrow::io::OutputStream* sink);
0470 
0471  private:
0472   explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
0473                                       const ColumnDescriptor* column);
0474   explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
0475                                       const ColumnDescriptor* column, void* contents);
0476   // PIMPL Idiom
0477   class ColumnChunkMetaDataBuilderImpl;
0478   std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
0479 };
0480 
// RowGroupMetaDataBuilder collects the metadata of a single row group and hands out
// builders for its column chunks. Instances are created through Make(); the
// constructor is private.
0481 class PARQUET_EXPORT RowGroupMetaDataBuilder {
0482  public:
0483   // API convenience to get a MetaData builder
  // NOTE(review): `contents` is presumably storage for the underlying metadata
  // struct that the builder fills in -- confirm ownership in the implementation.
0484   static std::unique_ptr<RowGroupMetaDataBuilder> Make(
0485       std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
0486       void* contents);
0487 
0488   ~RowGroupMetaDataBuilder();
0489 
  // Returns the builder for the next column chunk as a raw pointer.
  // NOTE(review): presumably owned by this RowGroupMetaDataBuilder -- do not delete
  // without confirming in the implementation.
0490   ColumnChunkMetaDataBuilder* NextColumnChunk();
  // Number of columns / rows of the row group. These two accessors are not declared
  // const, unlike current_column().
0491   int num_columns();
0492   int64_t num_rows();
  // Ordinal of the current column (presumably the one most recently returned by
  // NextColumnChunk()).
0493   int current_column() const;
0494 
  // Set the number of rows of the row group.
0495   void set_num_rows(int64_t num_rows);
0496 
0497   // commit the metadata
  // row_group_ordinal defaults to -1.
0498   void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
0499 
0500  private:
0501   explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
0502                                    const SchemaDescriptor* schema_, void* contents);
0503   // PIMPL Idiom
0504   class RowGroupMetaDataBuilderImpl;
0505   std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
0506 };
0507 
0508 /// \brief Public struct for location to all page indexes in a parquet file.
///
/// Holds two maps of the same shape, one for column indexes and one for offset
/// indexes.
0509 struct PageIndexLocation {
0510   /// Alias type of page index location of a row group. The index location
0511   /// is located by column ordinal. If the column does not have the page index,
0512   /// its value is set to std::nullopt.
0513   using RowGroupIndexLocation = std::vector<std::optional<IndexLocation>>;
0514   /// Alias type of page index location of a parquet file. The index location
0515   /// is located by the row group ordinal.
0516   using FileIndexLocation = std::map<size_t, RowGroupIndexLocation>;
0517   /// Row group column index locations which uses row group ordinal as the key.
0518   FileIndexLocation column_index_location;
0519   /// Row group offset index locations which uses row group ordinal as the key.
0520   FileIndexLocation offset_index_location;
0521 };
0522 
// FileMetaDataBuilder assembles the metadata of a whole file: it hands out row group
// builders and finally produces a FileMetaData. Instances are created through
// Make(); the constructor is private.
0523 class PARQUET_EXPORT FileMetaDataBuilder {
0524  public:
0525   // API convenience to get a MetaData builder
0526   static std::unique_ptr<FileMetaDataBuilder> Make(
0527       const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props);
0528 
0529   ~FileMetaDataBuilder();
0530 
0531   // The prior RowGroupMetaDataBuilder (if any) is destroyed
  // Consequently a pointer returned by an earlier call must not be used after the
  // next call.
0532   RowGroupMetaDataBuilder* AppendRowGroup();
0533 
0534   // Update location to all page indexes in the parquet file
0535   void SetPageIndexLocation(const PageIndexLocation& location);
0536 
0537   // Complete the Thrift structure
  // key_value_metadata defaults to NULLPTR; the finished metadata is returned with
  // unique ownership.
0538   std::unique_ptr<FileMetaData> Finish(
0539       const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
0540 
0541   // crypto metadata
0542   std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
0543 
0544  private:
0545   explicit FileMetaDataBuilder(const SchemaDescriptor* schema,
0546                                std::shared_ptr<WriterProperties> props);
0547   // PIMPL Idiom
0548   class FileMetaDataBuilderImpl;
0549   std::unique_ptr<FileMetaDataBuilderImpl> impl_;
0550 };
0551 
// Returns a string for the given ParquetVersion value. The exact text produced for
// each version is defined in the implementation file.
0552 PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
0553 
0554 }  // namespace parquet