File indexing completed on 2026-04-17 08:28:54
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <cstdint>
0021 #include <map>
0022 #include <memory>
0023 #include <optional>
0024 #include <string>
0025 #include <vector>
0026
0027 #include "parquet/encryption/type_fwd.h"
0028 #include "parquet/platform.h"
0029 #include "parquet/properties.h"
0030 #include "parquet/type_fwd.h"
0031
0032 namespace parquet {
0033 // Alias so code in namespace parquet can name Arrow's KeyValueMetadata type unqualified.
0034 using KeyValueMetadata = ::arrow::KeyValueMetadata;
0035 // An application name plus a structured version. Other classes in this header take or return it as the `writer_version`. All data members are public.
0036 class PARQUET_EXPORT ApplicationVersion {
0037 public:
0038 // Named reference versions, each returned by const reference. NOTE(review): the names suggest releases in which specific issues (PARQUET-251, PARQUET-816, statistics bugs, ...) were fixed -- confirm in the implementation.
0039 static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
0040 static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
0041 static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
0042 static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
0043 static const ApplicationVersion& PARQUET_CPP_10353_FIXED_VERSION();
0044
0045 // Name of the application. NOTE(review): presumably extracted from the created_by string -- confirm.
0046 std::string application_;
0047 // Build string of the application. NOTE(review): its format is defined by the parsing code, which is not visible here.
0048 std::string build_;
0049
0050
0051
0052 // Version split into components: three ints followed by three free-text parts.
0053 // There are no default member initializers, so after default-initialization the ints are
0054 // indeterminate; value-initialization (e.g. ApplicationVersion{}) zeroes them.
0055 struct {
0056 int major;
0057 int minor;
0058 int patch;
0059 std::string unknown;
0060 std::string pre_release;
0061 std::string build_info;
0062 } version;
0063 // Constructors: defaulted; from a created_by string (explicit, so std::string does not convert implicitly); or from an application name and numeric version parts.
0064 ApplicationVersion() = default;
0065 explicit ApplicationVersion(const std::string& created_by);
0066 ApplicationVersion(std::string application, int major, int minor, int patch);
0067
0068 // Compares this version with other_version. NOTE(review): the name suggests strict less-than -- confirm which fields participate.
0069 bool VersionLt(const ApplicationVersion& other_version) const;
0070
0071 // Compares this version with other_version. NOTE(review): the name suggests equality -- confirm which fields participate.
0072 bool VersionEq(const ApplicationVersion& other_version) const;
0073
0074 // NOTE(review): presumably reports whether `statistics` for a column of physical type `primitive` can be trusted for this writer version -- confirm. sort_order defaults to SortOrder::SIGNED.
0075 bool HasCorrectStatistics(Type::type primitive, const EncodedStatistics& statistics,
0076 SortOrder::type sort_order = SortOrder::SIGNED) const;
0077 };
0078 // Read-only view of a column's crypto metadata. The constructor is private, so instances are created through Make().
0079 class PARQUET_EXPORT ColumnCryptoMetaData {
0080 public:
0081 static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
0082 ~ColumnCryptoMetaData();
0083 // Compares against another instance. NOTE(review): which fields are compared is decided by the implementation, not visible here.
0084 bool Equals(const ColumnCryptoMetaData& other) const;
0085 // Accessors. key_metadata() returns a reference, so it must not outlive the object it was obtained from.
0086 std::shared_ptr<schema::ColumnPath> path_in_schema() const;
0087 bool encrypted_with_footer_key() const;
0088 const std::string& key_metadata() const;
0089
0090 private:
0091 explicit ColumnCryptoMetaData(const uint8_t* metadata);
0092 // The implementation type is only forward-declared here, which is why the destructor is declared out of line.
0093 class ColumnCryptoMetaDataImpl;
0094 std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
0095 };
0096
0097 // Plain aggregate pairing a page type and an encoding with a 32-bit count. NOTE(review): presumably the number of pages of that type written with that encoding -- confirm.
0098 struct PageEncodingStats {
0099 PageType::type page_type;
0100 Encoding::type encoding;
0101 int32_t count;
0102 };
0103
0104 // Plain aggregate locating an index by a 64-bit offset and a 32-bit length.
0105 struct IndexLocation {
0106 // NOTE(review): presumably a byte offset from the start of the file -- confirm.
0107 int64_t offset;
0108 // NOTE(review): presumably the serialized length in bytes -- confirm.
0109 int32_t length;
0110 };
0111
0112 // Read-only accessor object for the metadata of one column chunk. The constructor is private, so instances are created through Make().
0113 class PARQUET_EXPORT ColumnChunkMetaData {
0114 public:
0115 // Factory. `metadata` is an untyped pointer whose expected pointee type is not visible in this header. The ordinals default to -1 and writer_version/file_decryptor default to null.
0116 static std::unique_ptr<ColumnChunkMetaData> Make(
0117 const void* metadata, const ColumnDescriptor* descr,
0118 const ReaderProperties& properties = default_reader_properties(),
0119 const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
0120 int16_t column_ordinal = -1,
0121 std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0122
0123 ~ColumnChunkMetaData();
0124
0125 bool Equals(const ColumnChunkMetaData& other) const;
0126
0127
0128
0129
0130
0131
0132 // Offset associated with this column chunk. NOTE(review): the exact meaning (chunk start vs. metadata location) is not visible here -- confirm before relying on it.
0133 int64_t file_offset() const;
0134
0135 // File path recorded for this chunk, returned by reference. NOTE(review): presumably empty when the chunk lives in the same file as the metadata -- confirm.
0136 const std::string& file_path() const;
0137
0138 // Column metadata accessors. NOTE(review): whether the statistics getters return null when is_stats_set()/is_geo_stats_set() is false is not visible here.
0139 bool is_metadata_set() const;
0140 Type::type type() const;
0141 int64_t num_values() const;
0142 std::shared_ptr<schema::ColumnPath> path_in_schema() const;
0143 bool is_stats_set() const;
0144 bool is_geo_stats_set() const;
0145 std::shared_ptr<Statistics> statistics() const;
0146 std::shared_ptr<EncodedStatistics> encoded_statistics() const;
0147 std::shared_ptr<SizeStatistics> size_statistics() const;
0148 std::shared_ptr<geospatial::GeoStatistics> geo_statistics() const;
0149
0150 Compression::type compression() const;
0151
0152 // NOTE(review): presumably true when the codec reported by compression() is usable in this build -- confirm.
0153 bool can_decompress() const;
0154 // Encoding, page-offset and size accessors. The std::optional returns can be empty; the conditions for that are decided by the implementation.
0155 const std::vector<Encoding::type>& encodings() const;
0156 const std::vector<PageEncodingStats>& encoding_stats() const;
0157 std::optional<int64_t> bloom_filter_offset() const;
0158 std::optional<int64_t> bloom_filter_length() const;
0159 bool has_dictionary_page() const;
0160 int64_t dictionary_page_offset() const;
0161 int64_t data_page_offset() const;
0162 bool has_index_page() const;
0163 int64_t index_page_offset() const;
0164 int64_t total_compressed_size() const;
0165 int64_t total_uncompressed_size() const;
0166 std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
0167 std::optional<IndexLocation> GetColumnIndexLocation() const;
0168 std::optional<IndexLocation> GetOffsetIndexLocation() const;
0169 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
0170 // Note: the private constructor takes the two ordinals before `properties`, unlike Make().
0171 private:
0172 explicit ColumnChunkMetaData(
0173 const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
0174 int16_t column_ordinal, const ReaderProperties& properties,
0175 const ApplicationVersion* writer_version = NULLPTR,
0176 std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0177 // The implementation type is only forward-declared here, which is why the destructor is declared out of line.
0178 class ColumnChunkMetaDataImpl;
0179 std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
0180 };
0181
0182 // Read-only accessor object for the metadata of one row group. The constructor is private, so instances are created through Make().
0183 class PARQUET_EXPORT RowGroupMetaData {
0184 public:
0185 // Factory. `metadata` is an untyped pointer whose expected pointee type is not visible in this header. writer_version and file_decryptor default to null.
0186 static std::unique_ptr<RowGroupMetaData> Make(
0187 const void* metadata, const SchemaDescriptor* schema,
0188 const ReaderProperties& properties = default_reader_properties(),
0189 const ApplicationVersion* writer_version = NULLPTR,
0190 std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0191
0192 ~RowGroupMetaData();
0193
0194 bool Equals(const RowGroupMetaData& other) const;
0195
0196
0197 // Number of columns in this row group.
0198 int num_columns() const;
0199
0200
0201
0202
0203
0204
0205
0206
0207 // Returns a newly created metadata object for the column chunk at `index`; the caller owns it.
0208 // NOTE(review): behavior for an out-of-range index (throw vs. undefined) is not visible here -- confirm.
0209 std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
0210
0211 // Number of rows in this row group.
0212 int64_t num_rows() const;
0213
0214 // NOTE(review): presumably the total uncompressed byte size of the row group's column data -- confirm.
0215 int64_t total_byte_size() const;
0216
0217
0218
0219
0220 // NOTE(review): presumably the total compressed byte size; whether it can be 0/unset for some writers is not visible here.
0221 int64_t total_compressed_size() const;
0222
0223
0224
0225
0226
0227 // Offset associated with this row group. NOTE(review): exact meaning is not visible here -- confirm.
0228 int64_t file_offset() const;
0229 // Returns a non-owning schema pointer. NOTE(review): presumably the pointer passed to Make() -- confirm.
0230 const SchemaDescriptor* schema() const;
0231 // NOTE(review): presumably true only if every column chunk in the group can be decompressed -- confirm.
0232 bool can_decompress() const;
0233 // Returns the sorting columns by value (a copy).
0234 std::vector<SortingColumn> sorting_columns() const;
0235
0236 private:
0237 explicit RowGroupMetaData(
0238 const void* metadata, const SchemaDescriptor* schema,
0239 const ReaderProperties& properties,
0240 const ApplicationVersion* writer_version = NULLPTR,
0241 std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0242 // The implementation type is only forward-declared here, which is why the destructor is declared out of line.
0243 class RowGroupMetaDataImpl;
0244 std::unique_ptr<RowGroupMetaDataImpl> impl_;
0245 };
0246 // Forward declaration, needed because FileMetaData and FileCryptoMetaData name this class as a friend before it is defined.
0247 class FileMetaDataBuilder;
0248
0249 // Accessor object for file-level metadata. Both constructors are private, so instances come from Make() or from the friend classes listed in the private section.
0250 class PARQUET_EXPORT FileMetaData {
0251 public:
0252 // Factory. inout_metadata_len is a pointer, so the callee can write back through it. NOTE(review): presumably in = available bytes, out = bytes consumed -- confirm.
0253 static std::shared_ptr<FileMetaData> Make(
0254 const void* serialized_metadata, uint32_t* inout_metadata_len,
0255 const ReaderProperties& properties = default_reader_properties(),
0256 std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0257
0258 ~FileMetaData();
0259
0260 bool Equals(const FileMetaData& other) const;
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275 // Number of columns. NOTE(review): presumably counts leaf (primitive) columns, unlike num_schema_elements() -- confirm.
0276 int num_columns() const;
0277
0278
0279
0280
0281
0282 // Number of schema elements. NOTE(review): presumably counts every node of the schema tree, including groups -- confirm.
0283 int num_schema_elements() const;
0284
0285
0286
0287
0288 // Total number of rows described by this metadata.
0289 int64_t num_rows() const;
0290
0291
0292
0293
0294 // Number of row groups described by this metadata.
0295 int num_row_groups() const;
0296
0297
0298
0299
0300
0301
0302
0303 // Returns a newly created metadata object for the row group at `index`; the caller owns it.
0304 // NOTE(review): behavior for an out-of-range index is not visible here -- confirm.
0305 std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
0306
0307
0308
0309
0310
0311
0312 // Parquet format version recorded for the file.
0313 ParquetVersion::type version() const;
0314
0315 // The created_by string, returned by reference.
0316 const std::string& created_by() const;
0317
0318 // The writer's ApplicationVersion, returned by reference. NOTE(review): presumably parsed from created_by() -- confirm.
0319 const ApplicationVersion& writer_version() const;
0320
0321 // NOTE(review): presumably the serialized metadata size in bytes -- confirm.
0322 uint32_t size() const;
0323
0324
0325
0326
0327 // NOTE(review): presumably true only if every row group can be decompressed -- confirm.
0328 bool can_decompress() const;
0329 // Encryption-related accessors. NOTE(review): whether encryption_algorithm() is meaningful when is_encryption_algorithm_set() is false is not visible here.
0330 bool is_encryption_algorithm_set() const;
0331 EncryptionAlgorithm encryption_algorithm() const;
0332 const std::string& footer_signing_key_metadata() const;
0333
0334 // Checks `signature`. This method is non-const. The expected buffer length is not expressed in the signature.
0335 // NOTE(review): presumably verifies the footer signature of an encrypted file with a plaintext footer -- confirm.
0336 bool VerifySignature(const void* signature);
0337 // Writes this metadata to `dst`. `encryptor` defaults to null.
0338 void WriteTo(::arrow::io::OutputStream* dst,
0339 const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
0340
0341
0342 // Returns the metadata serialized into a std::string.
0343 std::string SerializeToString() const;
0344
0345 // Returns a non-owning schema pointer. NOTE(review): presumably valid only while this object is alive -- confirm.
0346 const SchemaDescriptor* schema() const;
0347
0348 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
0349
0350
0351
0352
0353
0354
0355 // Mutator (non-const). NOTE(review): presumably sets the file path on all column chunks of all row groups -- confirm.
0356 void set_file_path(const std::string& path);
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366
0367
0368
0369 // Mutator (non-const). Appends the row groups of `other`. NOTE(review): presumably requires matching schemas -- confirm the failure mode.
0370 void AppendRowGroups(const FileMetaData& other);
0371
0372
0373 // Returns a new FileMetaData built from the row groups selected by `row_groups`. NOTE(review): presumably zero-based indices; out-of-range handling is not visible here.
0374 std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
0375
0376
0377
0378
0379
0380 // Serializes without encryption. NOTE(review): the semantics of `scrub` and `debug` are defined by the implementation -- confirm.
0381 std::string SerializeUnencrypted(bool scrub, bool debug) const;
0382
0383 private:
0384 friend FileMetaDataBuilder;
0385 friend class SerializedFile;
0386 friend class SerializedRowGroup;
0387
0388 explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
0389 const ReaderProperties& properties,
0390 std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
0391
0392 void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
0393 const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
0394
0395 // Private default constructor. NOTE(review): presumably used by the friend builder -- confirm.
0396 FileMetaData();
0397 class FileMetaDataImpl;
0398 std::unique_ptr<FileMetaDataImpl> impl_;
0399 };
0400 // Accessor object for file-level crypto metadata. Both constructors are private, so instances come from Make() or from the friend FileMetaDataBuilder.
0401 class PARQUET_EXPORT FileCryptoMetaData {
0402 public:
0403 // Factory. metadata_len is a pointer, so the callee can write back through it. NOTE(review): presumably updated to the number of bytes consumed -- confirm.
0404 static std::shared_ptr<FileCryptoMetaData> Make(
0405 const uint8_t* serialized_metadata, uint32_t* metadata_len,
0406 const ReaderProperties& properties = default_reader_properties());
0407 ~FileCryptoMetaData();
0408
0409 EncryptionAlgorithm encryption_algorithm() const;
0410 const std::string& key_metadata() const;
0411 // Writes this metadata to `dst`.
0412 void WriteTo(::arrow::io::OutputStream* dst) const;
0413
0414 private:
0415 friend FileMetaDataBuilder;
0416 FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len,
0417 const ReaderProperties& properties);
0418
0419 // Private default constructor. NOTE(review): presumably used by the friend builder -- confirm.
0420 FileCryptoMetaData();
0421 class FileCryptoMetaDataImpl;
0422 std::unique_ptr<FileCryptoMetaDataImpl> impl_;
0423 };
0424
0425 // Builder for the metadata of one column chunk. The constructors are private, so instances are created through the Make() overloads.
0426 class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
0427 public:
0428 // Two factories: one without and one with a caller-supplied `contents` pointer. NOTE(review): presumably `contents` is existing storage the builder fills in and does not own -- confirm.
0429 static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
0430 std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
0431
0432 static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
0433 std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
0434 void* contents);
0435
0436 ~ColumnChunkMetaDataBuilder();
0437
0438
0439 // Sets the file path recorded for this column chunk.
0440 void set_file_path(const std::string& path);
0441
0442 // Statistics setters. NOTE(review): whether these must be called before Finish() is not visible here -- confirm.
0443 void SetStatistics(const EncodedStatistics& stats);
0444 void SetSizeStatistics(const SizeStatistics& size_stats);
0445
0446
0447 void SetGeoStatistics(const geospatial::EncodedGeoStatistics& geo_stats);
0448 // Takes the shared_ptr by value.
0449 void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata> key_value_metadata);
0450
0451 // Returns a non-owning descriptor pointer. NOTE(review): presumably the `column` passed to Make() -- confirm.
0452 const ColumnDescriptor* descr() const;
0453
0454 int64_t total_compressed_size() const;
0455
0456 // Records the final counts, page offsets, sizes and per-encoding statistics. `encryptor` defaults to null.
0457 void Finish(int64_t num_values, int64_t dictionary_page_offset,
0458 int64_t index_page_offset, int64_t data_page_offset,
0459 int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
0460 bool dictionary_fallback,
0461 const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
0462 const std::map<Encoding::type, int32_t>& data_encoding_stats_,
0463 const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
0464
0465 // Returns an untyped pointer to the builder's contents; the pointee type is not visible in this header.
0466 const void* contents() const;
0467
0468 // Writes the built metadata to `sink`. NOTE(review): presumably only valid after Finish() -- confirm.
0469 void WriteTo(::arrow::io::OutputStream* sink);
0470
0471 private:
0472 explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
0473 const ColumnDescriptor* column);
0474 explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
0475 const ColumnDescriptor* column, void* contents);
0476 // The implementation type is only forward-declared here, which is why the destructor is declared out of line.
0477 class ColumnChunkMetaDataBuilderImpl;
0478 std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
0479 };
0480 // Builder for the metadata of one row group. The constructor is private, so instances are created through Make().
0481 class PARQUET_EXPORT RowGroupMetaDataBuilder {
0482 public:
0483 // Factory. `contents` is an untyped pointer whose expected pointee type is not visible in this header.
0484 static std::unique_ptr<RowGroupMetaDataBuilder> Make(
0485 std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
0486 void* contents);
0487
0488 ~RowGroupMetaDataBuilder();
0489 // NextColumnChunk() returns a raw pointer. NOTE(review): presumably owned by this builder -- confirm. Note that num_columns() and num_rows() are declared non-const, unlike current_column().
0490 ColumnChunkMetaDataBuilder* NextColumnChunk();
0491 int num_columns();
0492 int64_t num_rows();
0493 int current_column() const;
0494
0495 void set_num_rows(int64_t num_rows);
0496
0497 // Finalizes the row group. row_group_ordinal defaults to -1.
0498 void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
0499
0500 private:
0501 explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
0502 const SchemaDescriptor* schema_, void* contents);
0503 // The implementation type is only forward-declared here, which is why the destructor is declared out of line.
0504 class RowGroupMetaDataBuilderImpl;
0505 std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
0506 };
0507
0508 // Aggregate holding two maps of index locations, one for column indexes and one for offset indexes.
0509 struct PageIndexLocation {
0510
0511 // One optional IndexLocation per entry; an empty optional means no location for that entry.
0512 // NOTE(review): presumably indexed by column ordinal within the row group -- confirm.
0513 using RowGroupIndexLocation = std::vector<std::optional<IndexLocation>>;
0514
0515 // Ordered map from a size_t key to the locations of one row group. NOTE(review): presumably the key is the row-group ordinal -- confirm.
0516 using FileIndexLocation = std::map<size_t, RowGroupIndexLocation>;
0517 // Locations of column indexes.
0518 FileIndexLocation column_index_location;
0519 // Locations of offset indexes.
0520 FileIndexLocation offset_index_location;
0521 };
0522 // Builder for file-level metadata. The constructor is private, so instances are created through Make().
0523 class PARQUET_EXPORT FileMetaDataBuilder {
0524 public:
0525 // Factory. `schema` is a non-owning pointer; `props` is taken by value as a shared_ptr.
0526 static std::unique_ptr<FileMetaDataBuilder> Make(
0527 const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props);
0528
0529 ~FileMetaDataBuilder();
0530
0531 // Returns a raw pointer to a row-group builder. NOTE(review): presumably owned by this builder and valid only for its lifetime -- confirm.
0532 RowGroupMetaDataBuilder* AppendRowGroup();
0533
0534 // Supplies page index locations. NOTE(review): presumably must be called before Finish() -- confirm.
0535 void SetPageIndexLocation(const PageIndexLocation& location);
0536
0537 // Produces the finished FileMetaData; the caller owns the result. key_value_metadata defaults to null.
0538 std::unique_ptr<FileMetaData> Finish(
0539 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
0540
0541 // Returns a FileCryptoMetaData owned by the caller. NOTE(review): presumably only meaningful for encrypted files -- confirm.
0542 std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
0543
0544 private:
0545 explicit FileMetaDataBuilder(const SchemaDescriptor* schema,
0546 std::shared_ptr<WriterProperties> props);
0547 // The implementation type is only forward-declared here, which is why the destructor is declared out of line.
0548 class FileMetaDataBuilderImpl;
0549 std::unique_ptr<FileMetaDataBuilderImpl> impl_;
0550 };
0551 // Returns a string for the given ParquetVersion value. The exact spelling of each result is defined by the implementation, not visible here.
0552 PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
0553
0554 }