File indexing completed on 2026-04-17 08:28:55
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <memory>
0021 #include <string>
0022 #include <unordered_map>
0023 #include <unordered_set>
0024 #include <utility>
0025
0026 #include "arrow/io/caching.h"
0027 #include "arrow/type_fwd.h"
0028 #include "arrow/util/compression.h"
0029 #include "arrow/util/type_fwd.h"
0030 #include "parquet/encryption/encryption.h"
0031 #include "parquet/exception.h"
0032 #include "parquet/parquet_version.h"
0033 #include "parquet/platform.h"
0034 #include "parquet/schema.h"
0035 #include "parquet/type_fwd.h"
0036 #include "parquet/types.h"
0037
0038 namespace parquet {
0039
0040
0041
0042
0043
0044
0045
0046
0047
/// \brief Data page format version a writer can be asked to produce.
///
/// Selected through WriterProperties::Builder::data_page_version(); a
/// default-constructed builder uses V1.
enum class ParquetDataPageVersion {
  V1,
  V2,
};
0049
0050
/// \brief How much size-statistics detail the writer is asked to record.
///
/// NOTE(review): the per-enumerator descriptions below are inferred from the
/// enumerator names only — confirm against the writer implementation.
enum class SizeStatisticsLevel : uint8_t {
  /// No size statistics.
  None = 0,
  /// Size statistics at column-chunk granularity.
  ColumnChunk = 1,
  /// Size statistics at both page and column-chunk granularity.
  PageAndColumnChunk = 2,
};
0059
0060
/// Initial value of ReaderProperties::buffer_size(): 16 KiB.
constexpr int64_t kDefaultBufferSize = 16 * 1024;

/// Initial value of ReaderProperties::thrift_string_size_limit(): 100,000,000.
constexpr int32_t kDefaultThriftStringSizeLimit = 100'000'000;

/// Initial value of ReaderProperties::thrift_container_size_limit(): 1,000,000.
constexpr int32_t kDefaultThriftContainerSizeLimit = 1'000'000;

/// Initial value of ReaderProperties::footer_read_size(): 64 KiB.
constexpr int64_t kDefaultFooterReadSize = int64_t{64} * 1024;
0071
0072 class PARQUET_EXPORT ReaderProperties {
0073 public:
0074 explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
0075 : pool_(pool) {}
0076
0077 MemoryPool* memory_pool() const { return pool_; }
0078
0079 std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
0080 int64_t start, int64_t num_bytes);
0081
0082
0083
0084
0085
0086
0087
0088
0089 bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
0090
0091 void enable_buffered_stream() { buffered_stream_enabled_ = true; }
0092
0093 void disable_buffered_stream() { buffered_stream_enabled_ = false; }
0094
0095 bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
0096 void enable_read_dense_for_nullable() { read_dense_for_nullable_ = true; }
0097 void disable_read_dense_for_nullable() { read_dense_for_nullable_ = false; }
0098
0099
0100 int64_t buffer_size() const { return buffer_size_; }
0101
0102 void set_buffer_size(int64_t size) { buffer_size_ = size; }
0103
0104
0105
0106
0107
0108 int32_t thrift_string_size_limit() const { return thrift_string_size_limit_; }
0109
0110 void set_thrift_string_size_limit(int32_t size) { thrift_string_size_limit_ = size; }
0111
0112
0113
0114
0115
0116 int32_t thrift_container_size_limit() const { return thrift_container_size_limit_; }
0117
0118 void set_thrift_container_size_limit(int32_t size) {
0119 thrift_container_size_limit_ = size;
0120 }
0121
0122
0123 void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
0124 file_decryption_properties_ = std::move(decryption);
0125 }
0126
0127 const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
0128 return file_decryption_properties_;
0129 }
0130
0131 bool page_checksum_verification() const { return page_checksum_verification_; }
0132 void set_page_checksum_verification(bool check_crc) {
0133 page_checksum_verification_ = check_crc;
0134 }
0135
0136
0137
0138
0139 void set_footer_read_size(size_t size) { footer_read_size_ = size; }
0140 size_t footer_read_size() const { return footer_read_size_; }
0141
0142 private:
0143 MemoryPool* pool_;
0144 int64_t buffer_size_ = kDefaultBufferSize;
0145 int32_t thrift_string_size_limit_ = kDefaultThriftStringSizeLimit;
0146 int32_t thrift_container_size_limit_ = kDefaultThriftContainerSizeLimit;
0147 bool buffered_stream_enabled_ = false;
0148 bool page_checksum_verification_ = false;
0149
0150 bool read_dense_for_nullable_ = false;
0151 size_t footer_read_size_ = kDefaultFooterReadSize;
0152 std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
0153 };
0154
0155 ReaderProperties PARQUET_EXPORT default_reader_properties();
0156
0157 static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
0158 static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
0159 static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
0160 static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
0161 static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
0162 static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
0163 static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
0164 static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
0165 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
0166 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
0167 static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
0168 static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
0169 SizeStatisticsLevel::PageAndColumnChunk;
0170
0171 class PARQUET_EXPORT ColumnProperties {
0172 public:
0173 ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
0174 Compression::type codec = DEFAULT_COMPRESSION_TYPE,
0175 bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
0176 bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
0177 size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE,
0178 bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED)
0179 : encoding_(encoding),
0180 codec_(codec),
0181 dictionary_enabled_(dictionary_enabled),
0182 statistics_enabled_(statistics_enabled),
0183 max_stats_size_(max_stats_size),
0184 page_index_enabled_(page_index_enabled) {}
0185
0186 void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
0187
0188 void set_compression(Compression::type codec) { codec_ = codec; }
0189
0190 void set_dictionary_enabled(bool dictionary_enabled) {
0191 dictionary_enabled_ = dictionary_enabled;
0192 }
0193
0194 void set_statistics_enabled(bool statistics_enabled) {
0195 statistics_enabled_ = statistics_enabled;
0196 }
0197
0198 void set_max_statistics_size(size_t max_stats_size) {
0199 max_stats_size_ = max_stats_size;
0200 }
0201
0202 void set_compression_level(int compression_level) {
0203 if (!codec_options_) {
0204 codec_options_ = std::make_shared<CodecOptions>();
0205 }
0206 codec_options_->compression_level = compression_level;
0207 }
0208
0209 void set_codec_options(const std::shared_ptr<CodecOptions>& codec_options) {
0210 codec_options_ = codec_options;
0211 }
0212
0213 void set_page_index_enabled(bool page_index_enabled) {
0214 page_index_enabled_ = page_index_enabled;
0215 }
0216
0217 Encoding::type encoding() const { return encoding_; }
0218
0219 Compression::type compression() const { return codec_; }
0220
0221 bool dictionary_enabled() const { return dictionary_enabled_; }
0222
0223 bool statistics_enabled() const { return statistics_enabled_; }
0224
0225 size_t max_statistics_size() const { return max_stats_size_; }
0226
0227 int compression_level() const {
0228 if (!codec_options_) {
0229 return ::arrow::util::kUseDefaultCompressionLevel;
0230 }
0231 return codec_options_->compression_level;
0232 }
0233
0234 const std::shared_ptr<CodecOptions>& codec_options() const { return codec_options_; }
0235
0236 bool page_index_enabled() const { return page_index_enabled_; }
0237
0238 private:
0239 Encoding::type encoding_;
0240 Compression::type codec_;
0241 bool dictionary_enabled_;
0242 bool statistics_enabled_;
0243 size_t max_stats_size_;
0244 std::shared_ptr<CodecOptions> codec_options_;
0245 bool page_index_enabled_;
0246 };
0247
0248
0249
0250
0251
0252
0253
0254
0255
0256
0257
0258
0259 struct PARQUET_EXPORT CdcOptions {
0260
0261
0262
0263
0264 int64_t min_chunk_size = 256 * 1024;
0265
0266
0267
0268
0269
0270
0271 int64_t max_chunk_size = 1024 * 1024;
0272
0273
0274
0275
0276
0277
0278
0279
0280
0281
0282
0283 int norm_level = 0;
0284 };
0285
0286 class PARQUET_EXPORT WriterProperties {
0287 public:
0288 class Builder {
0289 public:
0290 Builder()
0291 : pool_(::arrow::default_memory_pool()),
0292 dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
0293 write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
0294 max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
0295 pagesize_(kDefaultDataPageSize),
0296 version_(ParquetVersion::PARQUET_2_6),
0297 data_page_version_(ParquetDataPageVersion::V1),
0298 created_by_(DEFAULT_CREATED_BY),
0299 store_decimal_as_integer_(false),
0300 page_checksum_enabled_(false),
0301 size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL),
0302 content_defined_chunking_enabled_(false),
0303 content_defined_chunking_options_({}) {}
0304
0305 explicit Builder(const WriterProperties& properties)
0306 : pool_(properties.memory_pool()),
0307 dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
0308 write_batch_size_(properties.write_batch_size()),
0309 max_row_group_length_(properties.max_row_group_length()),
0310 pagesize_(properties.data_pagesize()),
0311 version_(properties.version()),
0312 data_page_version_(properties.data_page_version()),
0313 created_by_(properties.created_by()),
0314 store_decimal_as_integer_(properties.store_decimal_as_integer()),
0315 page_checksum_enabled_(properties.page_checksum_enabled()),
0316 size_statistics_level_(properties.size_statistics_level()),
0317 sorting_columns_(properties.sorting_columns()),
0318 default_column_properties_(properties.default_column_properties()),
0319 content_defined_chunking_enabled_(
0320 properties.content_defined_chunking_enabled()),
0321 content_defined_chunking_options_(
0322 properties.content_defined_chunking_options()) {}
0323
0324 virtual ~Builder() {}
0325
0326
0327
0328
0329
0330
0331
0332
0333
0334
0335 Builder* enable_content_defined_chunking() {
0336 content_defined_chunking_enabled_ = true;
0337 return this;
0338 }
0339
0340
0341 Builder* disable_content_defined_chunking() {
0342 content_defined_chunking_enabled_ = false;
0343 return this;
0344 }
0345
0346
0347 Builder* content_defined_chunking_options(const CdcOptions& options) {
0348 content_defined_chunking_options_ = options;
0349 return this;
0350 }
0351
0352
0353 Builder* memory_pool(MemoryPool* pool) {
0354 pool_ = pool;
0355 return this;
0356 }
0357
0358
0359
0360 Builder* enable_dictionary() {
0361 default_column_properties_.set_dictionary_enabled(true);
0362 return this;
0363 }
0364
0365
0366
0367 Builder* disable_dictionary() {
0368 default_column_properties_.set_dictionary_enabled(false);
0369 return this;
0370 }
0371
0372
0373
0374 Builder* enable_dictionary(const std::string& path) {
0375 dictionary_enabled_[path] = true;
0376 return this;
0377 }
0378
0379
0380
0381 Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
0382 return this->enable_dictionary(path->ToDotString());
0383 }
0384
0385
0386
0387 Builder* disable_dictionary(const std::string& path) {
0388 dictionary_enabled_[path] = false;
0389 return this;
0390 }
0391
0392
0393
0394 Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
0395 return this->disable_dictionary(path->ToDotString());
0396 }
0397
0398
0399 Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
0400 dictionary_pagesize_limit_ = dictionary_psize_limit;
0401 return this;
0402 }
0403
0404
0405
0406 Builder* write_batch_size(int64_t write_batch_size) {
0407 write_batch_size_ = write_batch_size;
0408 return this;
0409 }
0410
0411
0412
0413 Builder* max_row_group_length(int64_t max_row_group_length) {
0414 max_row_group_length_ = max_row_group_length;
0415 return this;
0416 }
0417
0418
0419
0420 Builder* data_pagesize(int64_t pg_size) {
0421 pagesize_ = pg_size;
0422 return this;
0423 }
0424
0425
0426
0427 Builder* data_page_version(ParquetDataPageVersion data_page_version) {
0428 data_page_version_ = data_page_version;
0429 return this;
0430 }
0431
0432
0433
0434 Builder* version(ParquetVersion::type version) {
0435 version_ = version;
0436 return this;
0437 }
0438
0439 Builder* created_by(const std::string& created_by) {
0440 created_by_ = created_by;
0441 return this;
0442 }
0443
0444 Builder* enable_page_checksum() {
0445 page_checksum_enabled_ = true;
0446 return this;
0447 }
0448
0449 Builder* disable_page_checksum() {
0450 page_checksum_enabled_ = false;
0451 return this;
0452 }
0453
0454
0455
0456
0457
0458 Builder* encoding(Encoding::type encoding_type) {
0459 if (encoding_type == Encoding::PLAIN_DICTIONARY ||
0460 encoding_type == Encoding::RLE_DICTIONARY) {
0461 throw ParquetException("Can't use dictionary encoding as fallback encoding");
0462 }
0463
0464 default_column_properties_.set_encoding(encoding_type);
0465 return this;
0466 }
0467
0468
0469
0470
0471
0472 Builder* encoding(const std::string& path, Encoding::type encoding_type) {
0473 if (encoding_type == Encoding::PLAIN_DICTIONARY ||
0474 encoding_type == Encoding::RLE_DICTIONARY) {
0475 throw ParquetException("Can't use dictionary encoding as fallback encoding");
0476 }
0477
0478 encodings_[path] = encoding_type;
0479 return this;
0480 }
0481
0482
0483
0484
0485
0486 Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
0487 Encoding::type encoding_type) {
0488 return this->encoding(path->ToDotString(), encoding_type);
0489 }
0490
0491
0492
0493 Builder* compression(Compression::type codec) {
0494 default_column_properties_.set_compression(codec);
0495 return this;
0496 }
0497
0498
0499
0500 Builder* max_statistics_size(size_t max_stats_sz) {
0501 default_column_properties_.set_max_statistics_size(max_stats_sz);
0502 return this;
0503 }
0504
0505
0506
0507 Builder* compression(const std::string& path, Compression::type codec) {
0508 codecs_[path] = codec;
0509 return this;
0510 }
0511
0512
0513
0514 Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
0515 Compression::type codec) {
0516 return this->compression(path->ToDotString(), codec);
0517 }
0518
0519
0520
0521
0522
0523
0524
0525
0526
0527
0528
0529
0530
0531
0532
0533
0534 Builder* compression_level(int compression_level) {
0535 default_column_properties_.set_compression_level(compression_level);
0536 return this;
0537 }
0538
0539
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550 Builder* compression_level(const std::string& path, int compression_level) {
0551 if (!codec_options_[path]) {
0552 codec_options_[path] = std::make_shared<CodecOptions>();
0553 }
0554 codec_options_[path]->compression_level = compression_level;
0555 return this;
0556 }
0557
0558
0559
0560
0561
0562
0563
0564
0565
0566
0567
0568
0569 Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
0570 int compression_level) {
0571 return this->compression_level(path->ToDotString(), compression_level);
0572 }
0573
0574
0575
0576
0577
0578
0579 Builder* codec_options(
0580 const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
0581 default_column_properties_.set_codec_options(codec_options);
0582 return this;
0583 }
0584
0585
0586
0587 Builder* codec_options(
0588 const std::string& path,
0589 const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
0590 codec_options_[path] = codec_options;
0591 return this;
0592 }
0593
0594
0595
0596 Builder* codec_options(
0597 const std::shared_ptr<schema::ColumnPath>& path,
0598 const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
0599 return this->codec_options(path->ToDotString(), codec_options);
0600 }
0601
0602
0603
0604 Builder* encryption(
0605 std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
0606 file_encryption_properties_ = std::move(file_encryption_properties);
0607 return this;
0608 }
0609
0610
0611
0612 Builder* enable_statistics() {
0613 default_column_properties_.set_statistics_enabled(true);
0614 return this;
0615 }
0616
0617
0618
0619 Builder* disable_statistics() {
0620 default_column_properties_.set_statistics_enabled(false);
0621 return this;
0622 }
0623
0624
0625
0626 Builder* enable_statistics(const std::string& path) {
0627 statistics_enabled_[path] = true;
0628 return this;
0629 }
0630
0631
0632
0633 Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
0634 return this->enable_statistics(path->ToDotString());
0635 }
0636
0637
0638
0639
0640
0641
0642
0643 Builder* set_sorting_columns(std::vector<SortingColumn> sorting_columns) {
0644 sorting_columns_ = std::move(sorting_columns);
0645 return this;
0646 }
0647
0648
0649
0650 Builder* disable_statistics(const std::string& path) {
0651 statistics_enabled_[path] = false;
0652 return this;
0653 }
0654
0655
0656
0657 Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
0658 return this->disable_statistics(path->ToDotString());
0659 }
0660
0661
0662
0663
0664
0665
0666
0667
0668
0669
0670
0671
0672
0673
0674
0675
0676
0677
0678
0679 Builder* enable_store_decimal_as_integer() {
0680 store_decimal_as_integer_ = true;
0681 return this;
0682 }
0683
0684
0685
0686
0687
0688 Builder* disable_store_decimal_as_integer() {
0689 store_decimal_as_integer_ = false;
0690 return this;
0691 }
0692
0693
0694
0695
0696
0697
0698
0699
0700
0701
0702
0703 Builder* enable_write_page_index() {
0704 default_column_properties_.set_page_index_enabled(true);
0705 return this;
0706 }
0707
0708
0709 Builder* disable_write_page_index() {
0710 default_column_properties_.set_page_index_enabled(false);
0711 return this;
0712 }
0713
0714
0715 Builder* enable_write_page_index(const std::string& path) {
0716 page_index_enabled_[path] = true;
0717 return this;
0718 }
0719
0720
0721 Builder* enable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
0722 return this->enable_write_page_index(path->ToDotString());
0723 }
0724
0725
0726 Builder* disable_write_page_index(const std::string& path) {
0727 page_index_enabled_[path] = false;
0728 return this;
0729 }
0730
0731
0732 Builder* disable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
0733 return this->disable_write_page_index(path->ToDotString());
0734 }
0735
0736
0737
0738
0739
0740
0741 Builder* set_size_statistics_level(SizeStatisticsLevel level) {
0742 size_statistics_level_ = level;
0743 return this;
0744 }
0745
0746
0747
0748 std::shared_ptr<WriterProperties> build() {
0749 std::unordered_map<std::string, ColumnProperties> column_properties;
0750 auto get = [&](const std::string& key) -> ColumnProperties& {
0751 auto it = column_properties.find(key);
0752 if (it == column_properties.end())
0753 return column_properties[key] = default_column_properties_;
0754 else
0755 return it->second;
0756 };
0757
0758 for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
0759 for (const auto& item : codecs_) get(item.first).set_compression(item.second);
0760 for (const auto& item : codec_options_)
0761 get(item.first).set_codec_options(item.second);
0762 for (const auto& item : dictionary_enabled_)
0763 get(item.first).set_dictionary_enabled(item.second);
0764 for (const auto& item : statistics_enabled_)
0765 get(item.first).set_statistics_enabled(item.second);
0766 for (const auto& item : page_index_enabled_)
0767 get(item.first).set_page_index_enabled(item.second);
0768
0769 return std::shared_ptr<WriterProperties>(new WriterProperties(
0770 pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
0771 pagesize_, version_, created_by_, page_checksum_enabled_,
0772 size_statistics_level_, std::move(file_encryption_properties_),
0773 default_column_properties_, column_properties, data_page_version_,
0774 store_decimal_as_integer_, std::move(sorting_columns_),
0775 content_defined_chunking_enabled_, content_defined_chunking_options_));
0776 }
0777
0778 private:
0779 MemoryPool* pool_;
0780 int64_t dictionary_pagesize_limit_;
0781 int64_t write_batch_size_;
0782 int64_t max_row_group_length_;
0783 int64_t pagesize_;
0784 ParquetVersion::type version_;
0785 ParquetDataPageVersion data_page_version_;
0786 std::string created_by_;
0787 bool store_decimal_as_integer_;
0788 bool page_checksum_enabled_;
0789 SizeStatisticsLevel size_statistics_level_;
0790
0791 std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
0792
0793
0794 std::vector<SortingColumn> sorting_columns_;
0795
0796
0797 ColumnProperties default_column_properties_;
0798 std::unordered_map<std::string, Encoding::type> encodings_;
0799 std::unordered_map<std::string, Compression::type> codecs_;
0800 std::unordered_map<std::string, std::shared_ptr<CodecOptions>> codec_options_;
0801 std::unordered_map<std::string, bool> dictionary_enabled_;
0802 std::unordered_map<std::string, bool> statistics_enabled_;
0803 std::unordered_map<std::string, bool> page_index_enabled_;
0804
0805 bool content_defined_chunking_enabled_;
0806 CdcOptions content_defined_chunking_options_;
0807 };
0808
0809 inline MemoryPool* memory_pool() const { return pool_; }
0810
0811 inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
0812
0813 inline int64_t write_batch_size() const { return write_batch_size_; }
0814
0815 inline int64_t max_row_group_length() const { return max_row_group_length_; }
0816
0817 inline int64_t data_pagesize() const { return pagesize_; }
0818
0819 inline ParquetDataPageVersion data_page_version() const {
0820 return parquet_data_page_version_;
0821 }
0822
0823 inline ParquetVersion::type version() const { return parquet_version_; }
0824
0825 inline std::string created_by() const { return parquet_created_by_; }
0826
0827 inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; }
0828
0829 inline bool page_checksum_enabled() const { return page_checksum_enabled_; }
0830
0831 inline bool content_defined_chunking_enabled() const {
0832 return content_defined_chunking_enabled_;
0833 }
0834 inline CdcOptions content_defined_chunking_options() const {
0835 return content_defined_chunking_options_;
0836 }
0837
0838 inline SizeStatisticsLevel size_statistics_level() const {
0839 return size_statistics_level_;
0840 }
0841
0842 inline Encoding::type dictionary_index_encoding() const {
0843 if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
0844 return Encoding::PLAIN_DICTIONARY;
0845 } else {
0846 return Encoding::RLE_DICTIONARY;
0847 }
0848 }
0849
0850 inline Encoding::type dictionary_page_encoding() const {
0851 if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
0852 return Encoding::PLAIN_DICTIONARY;
0853 } else {
0854 return Encoding::PLAIN;
0855 }
0856 }
0857
0858 const ColumnProperties& column_properties(
0859 const std::shared_ptr<schema::ColumnPath>& path) const {
0860 auto it = column_properties_.find(path->ToDotString());
0861 if (it != column_properties_.end()) return it->second;
0862 return default_column_properties_;
0863 }
0864
0865 Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
0866 return column_properties(path).encoding();
0867 }
0868
0869 Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
0870 return column_properties(path).compression();
0871 }
0872
0873 int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
0874 return column_properties(path).compression_level();
0875 }
0876
0877 const std::shared_ptr<CodecOptions> codec_options(
0878 const std::shared_ptr<schema::ColumnPath>& path) const {
0879 return column_properties(path).codec_options();
0880 }
0881
0882 bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
0883 return column_properties(path).dictionary_enabled();
0884 }
0885
0886 const std::vector<SortingColumn>& sorting_columns() const { return sorting_columns_; }
0887
0888 bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
0889 return column_properties(path).statistics_enabled();
0890 }
0891
0892 size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
0893 return column_properties(path).max_statistics_size();
0894 }
0895
0896 bool page_index_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
0897 return column_properties(path).page_index_enabled();
0898 }
0899
0900 bool page_index_enabled() const {
0901 if (default_column_properties_.page_index_enabled()) {
0902 return true;
0903 }
0904 for (const auto& item : column_properties_) {
0905 if (item.second.page_index_enabled()) {
0906 return true;
0907 }
0908 }
0909 return false;
0910 }
0911
0912 inline FileEncryptionProperties* file_encryption_properties() const {
0913 return file_encryption_properties_.get();
0914 }
0915
0916 std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
0917 const std::string& path) const {
0918 if (file_encryption_properties_) {
0919 return file_encryption_properties_->column_encryption_properties(path);
0920 } else {
0921 return NULLPTR;
0922 }
0923 }
0924
0925
0926 const ColumnProperties& default_column_properties() const {
0927 return default_column_properties_;
0928 }
0929
0930 private:
0931 explicit WriterProperties(
0932 MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
0933 int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
0934 const std::string& created_by, bool page_write_checksum_enabled,
0935 SizeStatisticsLevel size_statistics_level,
0936 std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
0937 const ColumnProperties& default_column_properties,
0938 const std::unordered_map<std::string, ColumnProperties>& column_properties,
0939 ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer,
0940 std::vector<SortingColumn> sorting_columns, bool content_defined_chunking_enabled,
0941 CdcOptions content_defined_chunking_options)
0942 : pool_(pool),
0943 dictionary_pagesize_limit_(dictionary_pagesize_limit),
0944 write_batch_size_(write_batch_size),
0945 max_row_group_length_(max_row_group_length),
0946 pagesize_(pagesize),
0947 parquet_data_page_version_(data_page_version),
0948 parquet_version_(version),
0949 parquet_created_by_(created_by),
0950 store_decimal_as_integer_(store_short_decimal_as_integer),
0951 page_checksum_enabled_(page_write_checksum_enabled),
0952 size_statistics_level_(size_statistics_level),
0953 file_encryption_properties_(file_encryption_properties),
0954 sorting_columns_(std::move(sorting_columns)),
0955 default_column_properties_(default_column_properties),
0956 column_properties_(column_properties),
0957 content_defined_chunking_enabled_(content_defined_chunking_enabled),
0958 content_defined_chunking_options_(content_defined_chunking_options) {}
0959
0960 MemoryPool* pool_;
0961 int64_t dictionary_pagesize_limit_;
0962 int64_t write_batch_size_;
0963 int64_t max_row_group_length_;
0964 int64_t pagesize_;
0965 ParquetDataPageVersion parquet_data_page_version_;
0966 ParquetVersion::type parquet_version_;
0967 std::string parquet_created_by_;
0968 bool store_decimal_as_integer_;
0969 bool page_checksum_enabled_;
0970 SizeStatisticsLevel size_statistics_level_;
0971
0972 std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
0973
0974 std::vector<SortingColumn> sorting_columns_;
0975
0976 ColumnProperties default_column_properties_;
0977 std::unordered_map<std::string, ColumnProperties> column_properties_;
0978
0979 bool content_defined_chunking_enabled_;
0980 CdcOptions content_defined_chunking_options_;
0981 };
0982
0983 PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
0984
0985
0986
0987
/// Default for the `use_threads` option of ArrowReaderProperties and
/// ArrowWriterProperties::Builder: threading is off.
static constexpr bool kArrowDefaultUseThreads = false;

/// Default ArrowReaderProperties::batch_size(): 64 * 1024.
static constexpr int64_t kArrowDefaultBatchSize = int64_t{64} * 1024;
0992
0993 constexpr inline ::arrow::Type::type kArrowDefaultBinaryType = ::arrow::Type::BINARY;
0994 constexpr inline ::arrow::Type::type kArrowDefaultListType = ::arrow::Type::LIST;
0995
0996
0997 class PARQUET_EXPORT ArrowReaderProperties {
0998 public:
0999 explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
1000 : use_threads_(use_threads),
1001 read_dict_indices_(),
1002 batch_size_(kArrowDefaultBatchSize),
1003 pre_buffer_(true),
1004 cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
1005 coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
1006 binary_type_(kArrowDefaultBinaryType),
1007 list_type_(kArrowDefaultListType),
1008 arrow_extensions_enabled_(false),
1009 should_load_statistics_(false),
1010 smallest_decimal_enabled_(false) {}
1011
1012
1013
1014
1015 void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
1016
1017 bool use_threads() const { return use_threads_; }
1018
1019
1020
1021
1022
1023
1024
1025 void set_read_dictionary(int column_index, bool read_dict) {
1026 if (read_dict) {
1027 read_dict_indices_.insert(column_index);
1028 } else {
1029 read_dict_indices_.erase(column_index);
1030 }
1031 }
1032
1033 bool read_dictionary(int column_index) const {
1034 if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
1035 return true;
1036 } else {
1037 return false;
1038 }
1039 }
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053 void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }
1054
1055 ::arrow::Type::type binary_type() const { return binary_type_; }
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065 void set_list_type(::arrow::Type::type value) { list_type_ = value; }
1066
1067 ::arrow::Type::type list_type() const { return list_type_; }
1068
1069
1070
1071
1072
1073 void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
1074
1075
1076
1077 int64_t batch_size() const { return batch_size_; }
1078
1079
1080
1081
1082
1083
1084 void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
1085
1086 bool pre_buffer() const { return pre_buffer_; }
1087
1088
1089
1090 void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
1091
1092 const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
1093
1094
1095 void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
1096
1097 const ::arrow::io::IOContext& io_context() const { return io_context_; }
1098
1099
1100
1101 void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
1102 coerce_int96_timestamp_unit_ = unit;
1103 }
1104
1105 ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
1106 return coerce_int96_timestamp_unit_;
1107 }
1108
1109
1110
1111
1112
1113
1114
1115
1116 void set_arrow_extensions_enabled(bool extensions_enabled) {
1117 arrow_extensions_enabled_ = extensions_enabled;
1118 }
1119 bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; }
1120
1121
1122
1123
1124 void set_should_load_statistics(bool should_load_statistics) {
1125 should_load_statistics_ = should_load_statistics;
1126 }
1127
1128 bool should_load_statistics() const { return should_load_statistics_; }
1129
1130
1131
1132
1133
1134 void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
1135 smallest_decimal_enabled_ = smallest_decimal_enable;
1136 }
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146 bool smallest_decimal_enabled() const { return smallest_decimal_enabled_; }
1147
1148 private:
1149 bool use_threads_;
1150 std::unordered_set<int> read_dict_indices_;
1151 int64_t batch_size_;
1152 bool pre_buffer_;
1153 ::arrow::io::IOContext io_context_;
1154 ::arrow::io::CacheOptions cache_options_;
1155 ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
1156 ::arrow::Type::type binary_type_;
1157 ::arrow::Type::type list_type_;
1158 bool arrow_extensions_enabled_;
1159 bool should_load_statistics_;
1160 bool smallest_decimal_enabled_;
1161 };
1162
1163
1164 PARQUET_EXPORT
1165 ArrowReaderProperties default_arrow_reader_properties();
1166
1167 class PARQUET_EXPORT ArrowWriterProperties {
1168 public:
1169 enum EngineVersion {
1170 V1,
1171 V2
1172 };
1173 class Builder {
1174 public:
1175 Builder()
1176 : write_timestamps_as_int96_(false),
1177 coerce_timestamps_enabled_(false),
1178 coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
1179 truncated_timestamps_allowed_(false),
1180 store_schema_(false),
1181 compliant_nested_types_(true),
1182 engine_version_(V2),
1183 use_threads_(kArrowDefaultUseThreads),
1184 executor_(NULLPTR),
1185 write_time_adjusted_to_utc_(false) {}
1186 virtual ~Builder() = default;
1187
1188
1189 Builder* disable_deprecated_int96_timestamps() {
1190 write_timestamps_as_int96_ = false;
1191 return this;
1192 }
1193
1194
1195
1196
1197
1198 Builder* enable_deprecated_int96_timestamps() {
1199 write_timestamps_as_int96_ = true;
1200 return this;
1201 }
1202
1203
1204
1205
1206 Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
1207 coerce_timestamps_enabled_ = true;
1208 coerce_timestamps_unit_ = unit;
1209 return this;
1210 }
1211
1212
1213
1214
1215 Builder* allow_truncated_timestamps() {
1216 truncated_timestamps_allowed_ = true;
1217 return this;
1218 }
1219
1220
1221 Builder* disallow_truncated_timestamps() {
1222 truncated_timestamps_allowed_ = false;
1223 return this;
1224 }
1225
1226
1227
1228
1229 Builder* store_schema() {
1230 store_schema_ = true;
1231 return this;
1232 }
1233
1234
1235
1236
1237
1238
1239
1240
1241 Builder* enable_compliant_nested_types() {
1242 compliant_nested_types_ = true;
1243 return this;
1244 }
1245
1246
1247 Builder* disable_compliant_nested_types() {
1248 compliant_nested_types_ = false;
1249 return this;
1250 }
1251
1252
1253 Builder* set_engine_version(EngineVersion version) {
1254 engine_version_ = version;
1255 return this;
1256 }
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266 Builder* set_use_threads(bool use_threads) {
1267 use_threads_ = use_threads;
1268 return this;
1269 }
1270
1271
1272
1273
1274
1275 Builder* set_executor(::arrow::internal::Executor* executor) {
1276 executor_ = executor;
1277 return this;
1278 }
1279
1280
1281
1282
1283
1284 Builder* set_time_adjusted_to_utc(bool adjusted) {
1285 write_time_adjusted_to_utc_ = adjusted;
1286 return this;
1287 }
1288
1289
1290 std::shared_ptr<ArrowWriterProperties> build() {
1291 return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
1292 write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
1293 truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
1294 engine_version_, use_threads_, executor_, write_time_adjusted_to_utc_));
1295 }
1296
1297 private:
1298 bool write_timestamps_as_int96_;
1299
1300 bool coerce_timestamps_enabled_;
1301 ::arrow::TimeUnit::type coerce_timestamps_unit_;
1302 bool truncated_timestamps_allowed_;
1303
1304 bool store_schema_;
1305 bool compliant_nested_types_;
1306 EngineVersion engine_version_;
1307
1308 bool use_threads_;
1309 ::arrow::internal::Executor* executor_;
1310
1311 bool write_time_adjusted_to_utc_;
1312 };
1313
1314 bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
1315
1316 bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
1317 ::arrow::TimeUnit::type coerce_timestamps_unit() const {
1318 return coerce_timestamps_unit_;
1319 }
1320
1321 bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
1322
1323 bool store_schema() const { return store_schema_; }
1324
1325
1326
1327
1328
1329
1330 bool compliant_nested_types() const { return compliant_nested_types_; }
1331
1332
1333
1334
1335
1336 EngineVersion engine_version() const { return engine_version_; }
1337
1338
1339
1340 bool use_threads() const { return use_threads_; }
1341
1342
1343 ::arrow::internal::Executor* executor() const;
1344
1345
1346
1347
1348 bool write_time_adjusted_to_utc() const { return write_time_adjusted_to_utc_; }
1349
1350 private:
1351 explicit ArrowWriterProperties(bool write_nanos_as_int96,
1352 bool coerce_timestamps_enabled,
1353 ::arrow::TimeUnit::type coerce_timestamps_unit,
1354 bool truncated_timestamps_allowed, bool store_schema,
1355 bool compliant_nested_types,
1356 EngineVersion engine_version, bool use_threads,
1357 ::arrow::internal::Executor* executor,
1358 bool write_time_adjusted_to_utc)
1359 : write_timestamps_as_int96_(write_nanos_as_int96),
1360 coerce_timestamps_enabled_(coerce_timestamps_enabled),
1361 coerce_timestamps_unit_(coerce_timestamps_unit),
1362 truncated_timestamps_allowed_(truncated_timestamps_allowed),
1363 store_schema_(store_schema),
1364 compliant_nested_types_(compliant_nested_types),
1365 engine_version_(engine_version),
1366 use_threads_(use_threads),
1367 executor_(executor),
1368 write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {}
1369
1370 const bool write_timestamps_as_int96_;
1371 const bool coerce_timestamps_enabled_;
1372 const ::arrow::TimeUnit::type coerce_timestamps_unit_;
1373 const bool truncated_timestamps_allowed_;
1374 const bool store_schema_;
1375 const bool compliant_nested_types_;
1376 const EngineVersion engine_version_;
1377 const bool use_threads_;
1378 ::arrow::internal::Executor* executor_;
1379 const bool write_time_adjusted_to_utc_;
1380 };
1381
1382
1383
1384 struct ArrowWriteContext {
1385 ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
1386 : memory_pool(memory_pool),
1387 properties(properties),
1388 data_buffer(AllocateBuffer(memory_pool)),
1389 def_levels_buffer(AllocateBuffer(memory_pool)) {}
1390
1391 template <typename T>
1392 ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
1393 ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
1394 *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
1395 return ::arrow::Status::OK();
1396 }
1397
1398 MemoryPool* memory_pool;
1399 const ArrowWriterProperties* properties;
1400
1401
1402
1403 std::shared_ptr<ResizableBuffer> data_buffer;
1404
1405
1406 std::shared_ptr<ResizableBuffer> def_levels_buffer;
1407 };
1408
1409 PARQUET_EXPORT
1410 std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
1411
1412 }