Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 #include <string>
0022 #include <unordered_map>
0023 #include <unordered_set>
0024 #include <utility>
0025 
0026 #include "arrow/io/caching.h"
0027 #include "arrow/type_fwd.h"
0028 #include "arrow/util/compression.h"
0029 #include "arrow/util/type_fwd.h"
0030 #include "parquet/encryption/encryption.h"
0031 #include "parquet/exception.h"
0032 #include "parquet/parquet_version.h"
0033 #include "parquet/platform.h"
0034 #include "parquet/schema.h"
0035 #include "parquet/type_fwd.h"
0036 #include "parquet/types.h"
0037 
0038 namespace parquet {
0039 
/// \brief Controls the serialization format of data pages.
///
/// parquet-format v2.0.0 introduced a new data page metadata type DataPageV2
/// and serialized page structure (for example, encoded levels are no longer
/// compressed). Prior to the completion of PARQUET-457 in 2020, this library
/// did not implement DataPageV2 correctly, so if you use the V2 data page
/// format, you may have forward compatibility issues (older versions of the
/// library will be unable to read the files). Note that some Parquet
/// implementations do not implement DataPageV2 at all.
enum class ParquetDataPageVersion { V1, V2 };
0049 
/// \brief Controls the level of size statistics that are written to the file.
enum class SizeStatisticsLevel : uint8_t {
  /// No size statistics are written.
  None = 0,
  /// Only column chunk size statistics are written.
  ColumnChunk,
  /// Both size statistics in the column chunk and page index are written.
  PageAndColumnChunk
};
0059 
/// Align the default buffer size to a small multiple of a page size.
constexpr int64_t kDefaultBufferSize = 4096 * 4;

/// Default limit on the size of a single thrift string (100 MB).
constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000;
// Structs in the thrift definition are relatively large (at least 300 bytes).
// This limits total memory to the same order of magnitude as
// kDefaultThriftStringSizeLimit.
constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000;

// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
0071 
/// \brief Properties controlling how a Parquet file is read: memory pool,
/// stream buffering, thrift deserialization limits, page checksum
/// verification and decryption.
class PARQUET_EXPORT ReaderProperties {
 public:
  explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
      : pool_(pool) {}

  /// Return the memory pool used for allocations during reading.
  MemoryPool* memory_pool() const { return pool_; }

  /// \brief Create an input stream over `num_bytes` of `source` starting at
  /// `start`.
  ///
  /// Defined out of line; presumably honors the buffered-stream settings
  /// below — confirm against the implementation.
  std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
                                              int64_t start, int64_t num_bytes);

  /// Buffered stream reading allows the user to control the memory usage of
  /// parquet readers. This ensure that all `RandomAccessFile::ReadAt` calls are
  /// wrapped in a buffered reader that uses a fix sized buffer (of size
  /// `buffer_size()`) instead of the full size of the ReadAt.
  ///
  /// The primary reason for this control knobs is for resource control and not
  /// performance.
  bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
  /// Enable buffered stream reading.
  void enable_buffered_stream() { buffered_stream_enabled_ = true; }
  /// Disable buffered stream reading.
  void disable_buffered_stream() { buffered_stream_enabled_ = false; }

  /// Whether dense reading of nullable columns is enabled.
  /// NOTE(review): exact semantics are defined by the RecordReader that
  /// consumes this flag (not visible in this header) — confirm there.
  bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
  /// Enable dense reading for nullable columns (used with a RecordReader).
  void enable_read_dense_for_nullable() { read_dense_for_nullable_ = true; }
  /// Disable dense reading for nullable columns (used with a RecordReader).
  void disable_read_dense_for_nullable() { read_dense_for_nullable_ = false; }

  /// Return the size of the buffered stream buffer.
  int64_t buffer_size() const { return buffer_size_; }
  /// Set the size of the buffered stream buffer in bytes.
  void set_buffer_size(int64_t size) { buffer_size_ = size; }

  /// \brief Return the size limit on thrift strings.
  ///
  /// This limit helps prevent space and time bombs in files, but may need to
  /// be increased in order to read files with especially large headers.
  int32_t thrift_string_size_limit() const { return thrift_string_size_limit_; }
  /// Set the size limit on thrift strings.
  void set_thrift_string_size_limit(int32_t size) { thrift_string_size_limit_ = size; }

  /// \brief Return the size limit on thrift containers.
  ///
  /// This limit helps prevent space and time bombs in files, but may need to
  /// be increased in order to read files with especially large headers.
  int32_t thrift_container_size_limit() const { return thrift_container_size_limit_; }
  /// Set the size limit on thrift containers.
  void set_thrift_container_size_limit(int32_t size) {
    thrift_container_size_limit_ = size;
  }

  /// Set the decryption properties.
  /// (Setter overload of the getter below, per this file's accessor style.)
  void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
    file_decryption_properties_ = std::move(decryption);
  }
  /// Return the decryption properties (may be null if none were set).
  const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
    return file_decryption_properties_;
  }

  /// Whether data page CRC checksum verification is enabled (default off).
  bool page_checksum_verification() const { return page_checksum_verification_; }
  /// Enable or disable data page CRC checksum verification.
  void set_page_checksum_verification(bool check_crc) {
    page_checksum_verification_ = check_crc;
  }

  // Set the default read size to read the footer from a file. For high latency
  // file systems and files with large metadata (>64KB) this can increase performance
  // by reducing the number of round-trips to retrieve the entire file metadata.
  void set_footer_read_size(size_t size) { footer_read_size_ = size; }
  /// Return the number of bytes read from the end of the file for the footer.
  size_t footer_read_size() const { return footer_read_size_; }

 private:
  MemoryPool* pool_;
  int64_t buffer_size_ = kDefaultBufferSize;
  int32_t thrift_string_size_limit_ = kDefaultThriftStringSizeLimit;
  int32_t thrift_container_size_limit_ = kDefaultThriftContainerSizeLimit;
  bool buffered_stream_enabled_ = false;
  bool page_checksum_verification_ = false;
  // Used with a RecordReader.
  bool read_dense_for_nullable_ = false;
  size_t footer_read_size_ = kDefaultFooterReadSize;
  std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
};
0154 
/// Return a ReaderProperties object with default settings.
ReaderProperties PARQUET_EXPORT default_reader_properties();

// Default values used by WriterProperties::Builder below.
static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
    SizeStatisticsLevel::PageAndColumnChunk;
0170 
0171 class PARQUET_EXPORT ColumnProperties {
0172  public:
0173   ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
0174                    Compression::type codec = DEFAULT_COMPRESSION_TYPE,
0175                    bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
0176                    bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
0177                    size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE,
0178                    bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED)
0179       : encoding_(encoding),
0180         codec_(codec),
0181         dictionary_enabled_(dictionary_enabled),
0182         statistics_enabled_(statistics_enabled),
0183         max_stats_size_(max_stats_size),
0184         page_index_enabled_(page_index_enabled) {}
0185 
0186   void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
0187 
0188   void set_compression(Compression::type codec) { codec_ = codec; }
0189 
0190   void set_dictionary_enabled(bool dictionary_enabled) {
0191     dictionary_enabled_ = dictionary_enabled;
0192   }
0193 
0194   void set_statistics_enabled(bool statistics_enabled) {
0195     statistics_enabled_ = statistics_enabled;
0196   }
0197 
0198   void set_max_statistics_size(size_t max_stats_size) {
0199     max_stats_size_ = max_stats_size;
0200   }
0201 
0202   void set_compression_level(int compression_level) {
0203     if (!codec_options_) {
0204       codec_options_ = std::make_shared<CodecOptions>();
0205     }
0206     codec_options_->compression_level = compression_level;
0207   }
0208 
0209   void set_codec_options(const std::shared_ptr<CodecOptions>& codec_options) {
0210     codec_options_ = codec_options;
0211   }
0212 
0213   void set_page_index_enabled(bool page_index_enabled) {
0214     page_index_enabled_ = page_index_enabled;
0215   }
0216 
0217   Encoding::type encoding() const { return encoding_; }
0218 
0219   Compression::type compression() const { return codec_; }
0220 
0221   bool dictionary_enabled() const { return dictionary_enabled_; }
0222 
0223   bool statistics_enabled() const { return statistics_enabled_; }
0224 
0225   size_t max_statistics_size() const { return max_stats_size_; }
0226 
0227   int compression_level() const {
0228     if (!codec_options_) {
0229       return ::arrow::util::kUseDefaultCompressionLevel;
0230     }
0231     return codec_options_->compression_level;
0232   }
0233 
0234   const std::shared_ptr<CodecOptions>& codec_options() const { return codec_options_; }
0235 
0236   bool page_index_enabled() const { return page_index_enabled_; }
0237 
0238  private:
0239   Encoding::type encoding_;
0240   Compression::type codec_;
0241   bool dictionary_enabled_;
0242   bool statistics_enabled_;
0243   size_t max_stats_size_;
0244   std::shared_ptr<CodecOptions> codec_options_;
0245   bool page_index_enabled_;
0246 };
0247 
/// EXPERIMENTAL: Options for content-defined chunking.
///
/// Content-defined chunking is an experimental feature that optimizes parquet
/// files for content addressable storage (CAS) systems by writing data pages
/// according to content-defined chunk boundaries. This allows for more
/// efficient deduplication of data across files, hence more efficient network
/// transfers and storage.
/// Each content-defined chunk is written as a separate parquet data page. The
/// following options control the chunks' size and the chunking process. Note
/// that the chunk size is calculated based on the logical value of the data,
/// before any encoding or compression is applied.
struct PARQUET_EXPORT CdcOptions {
  /// Minimum chunk size in bytes, default is 256 KiB
  /// The rolling hash will not be updated until this size is reached for each chunk.
  /// Note that all data sent through the hash function is counted towards the chunk
  /// size, including definition and repetition levels if present.
  int64_t min_chunk_size = 256 * 1024;
  /// Maximum chunk size in bytes, default is 1024 KiB
  /// The chunker will create a new chunk whenever the chunk size exceeds this value.
  /// Note that the parquet writer has a related `pagesize` property that controls
  /// the maximum size of a parquet data page after encoding. While setting
  /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the
  /// chunking effectiveness, it results in more small parquet data pages.
  int64_t max_chunk_size = 1024 * 1024;
  /// Number of bit adjustment to the gearhash mask in order to center the chunk size
  /// around the average size more aggressively, default is 0
  /// Increasing the normalization level increases the probability of finding a chunk,
  /// improving the deduplication ratio, but also increasing the number of small chunks
  /// resulting in many small parquet data pages. The default value provides a good
  /// balance between deduplication ratio and fragmentation.
  /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
  /// expense of fragmentation. Negative values can also be used to reduce the
  /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
  /// Note that values outside [-3, 3] are not recommended, prefer using the default
  /// value of 0 for most use cases.
  int norm_level = 0;
};
0285 
0286 class PARQUET_EXPORT WriterProperties {
0287  public:
0288   class Builder {
0289    public:
0290     Builder()
0291         : pool_(::arrow::default_memory_pool()),
0292           dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
0293           write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
0294           max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
0295           pagesize_(kDefaultDataPageSize),
0296           version_(ParquetVersion::PARQUET_2_6),
0297           data_page_version_(ParquetDataPageVersion::V1),
0298           created_by_(DEFAULT_CREATED_BY),
0299           store_decimal_as_integer_(false),
0300           page_checksum_enabled_(false),
0301           size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL),
0302           content_defined_chunking_enabled_(false),
0303           content_defined_chunking_options_({}) {}
0304 
0305     explicit Builder(const WriterProperties& properties)
0306         : pool_(properties.memory_pool()),
0307           dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
0308           write_batch_size_(properties.write_batch_size()),
0309           max_row_group_length_(properties.max_row_group_length()),
0310           pagesize_(properties.data_pagesize()),
0311           version_(properties.version()),
0312           data_page_version_(properties.data_page_version()),
0313           created_by_(properties.created_by()),
0314           store_decimal_as_integer_(properties.store_decimal_as_integer()),
0315           page_checksum_enabled_(properties.page_checksum_enabled()),
0316           size_statistics_level_(properties.size_statistics_level()),
0317           sorting_columns_(properties.sorting_columns()),
0318           default_column_properties_(properties.default_column_properties()),
0319           content_defined_chunking_enabled_(
0320               properties.content_defined_chunking_enabled()),
0321           content_defined_chunking_options_(
0322               properties.content_defined_chunking_options()) {}
0323 
0324     virtual ~Builder() {}
0325 
0326     /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
0327     ///
0328     /// Optimize parquet files for content addressable storage (CAS) systems by writing
0329     /// data pages according to content-defined chunk boundaries. This allows for more
0330     /// efficient deduplication of data across files, hence more efficient network
0331     /// transfers and storage. The chunking is based on a rolling hash algorithm that
0332     /// identifies chunk boundaries based on the actual content of the data.
0333     ///
0334     /// Note that only the WriteArrow() interface is supported at the moment.
0335     Builder* enable_content_defined_chunking() {
0336       content_defined_chunking_enabled_ = true;
0337       return this;
0338     }
0339 
0340     /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns.
0341     Builder* disable_content_defined_chunking() {
0342       content_defined_chunking_enabled_ = false;
0343       return this;
0344     }
0345 
0346     /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions.
0347     Builder* content_defined_chunking_options(const CdcOptions& options) {
0348       content_defined_chunking_options_ = options;
0349       return this;
0350     }
0351 
0352     /// Specify the memory pool for the writer. Default default_memory_pool.
0353     Builder* memory_pool(MemoryPool* pool) {
0354       pool_ = pool;
0355       return this;
0356     }
0357 
0358     /// Enable dictionary encoding in general for all columns. Default
0359     /// enabled.
0360     Builder* enable_dictionary() {
0361       default_column_properties_.set_dictionary_enabled(true);
0362       return this;
0363     }
0364 
0365     /// Disable dictionary encoding in general for all columns. Default
0366     /// enabled.
0367     Builder* disable_dictionary() {
0368       default_column_properties_.set_dictionary_enabled(false);
0369       return this;
0370     }
0371 
0372     /// Enable dictionary encoding for column specified by `path`. Default
0373     /// enabled.
0374     Builder* enable_dictionary(const std::string& path) {
0375       dictionary_enabled_[path] = true;
0376       return this;
0377     }
0378 
0379     /// Enable dictionary encoding for column specified by `path`. Default
0380     /// enabled.
0381     Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
0382       return this->enable_dictionary(path->ToDotString());
0383     }
0384 
0385     /// Disable dictionary encoding for column specified by `path`. Default
0386     /// enabled.
0387     Builder* disable_dictionary(const std::string& path) {
0388       dictionary_enabled_[path] = false;
0389       return this;
0390     }
0391 
0392     /// Disable dictionary encoding for column specified by `path`. Default
0393     /// enabled.
0394     Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
0395       return this->disable_dictionary(path->ToDotString());
0396     }
0397 
0398     /// Specify the dictionary page size limit per row group. Default 1MB.
0399     Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
0400       dictionary_pagesize_limit_ = dictionary_psize_limit;
0401       return this;
0402     }
0403 
0404     /// Specify the write batch size while writing batches of Arrow values
0405     /// into Parquet. Default 1024.
0406     Builder* write_batch_size(int64_t write_batch_size) {
0407       write_batch_size_ = write_batch_size;
0408       return this;
0409     }
0410 
0411     /// Specify the max number of rows to put in a single row group.
0412     /// Default 1Mi rows.
0413     Builder* max_row_group_length(int64_t max_row_group_length) {
0414       max_row_group_length_ = max_row_group_length;
0415       return this;
0416     }
0417 
0418     /// Specify the data page size.
0419     /// Default 1MB.
0420     Builder* data_pagesize(int64_t pg_size) {
0421       pagesize_ = pg_size;
0422       return this;
0423     }
0424 
0425     /// Specify the data page version.
0426     /// Default V1.
0427     Builder* data_page_version(ParquetDataPageVersion data_page_version) {
0428       data_page_version_ = data_page_version;
0429       return this;
0430     }
0431 
0432     /// Specify the Parquet file version.
0433     /// Default PARQUET_2_6.
0434     Builder* version(ParquetVersion::type version) {
0435       version_ = version;
0436       return this;
0437     }
0438 
0439     Builder* created_by(const std::string& created_by) {
0440       created_by_ = created_by;
0441       return this;
0442     }
0443 
0444     Builder* enable_page_checksum() {
0445       page_checksum_enabled_ = true;
0446       return this;
0447     }
0448 
0449     Builder* disable_page_checksum() {
0450       page_checksum_enabled_ = false;
0451       return this;
0452     }
0453 
0454     /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
0455     //
0456     /// This is only applied if dictionary encoding is disabled. If the dictionary grows
0457     /// too large we always fall back to the PLAIN encoding.
0458     Builder* encoding(Encoding::type encoding_type) {
0459       if (encoding_type == Encoding::PLAIN_DICTIONARY ||
0460           encoding_type == Encoding::RLE_DICTIONARY) {
0461         throw ParquetException("Can't use dictionary encoding as fallback encoding");
0462       }
0463 
0464       default_column_properties_.set_encoding(encoding_type);
0465       return this;
0466     }
0467 
0468     /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
0469     //
0470     /// This is only applied if dictionary encoding is disabled. If the dictionary grows
0471     /// too large we always fall back to the PLAIN encoding.
0472     Builder* encoding(const std::string& path, Encoding::type encoding_type) {
0473       if (encoding_type == Encoding::PLAIN_DICTIONARY ||
0474           encoding_type == Encoding::RLE_DICTIONARY) {
0475         throw ParquetException("Can't use dictionary encoding as fallback encoding");
0476       }
0477 
0478       encodings_[path] = encoding_type;
0479       return this;
0480     }
0481 
0482     /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
0483     //
0484     /// This is only applied if dictionary encoding is disabled. If the dictionary grows
0485     /// too large we always fall back to the PLAIN encoding.
0486     Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
0487                       Encoding::type encoding_type) {
0488       return this->encoding(path->ToDotString(), encoding_type);
0489     }
0490 
0491     /// Specify compression codec in general for all columns.
0492     /// Default UNCOMPRESSED.
0493     Builder* compression(Compression::type codec) {
0494       default_column_properties_.set_compression(codec);
0495       return this;
0496     }
0497 
0498     /// Specify max statistics size to store min max value.
0499     /// Default 4KB.
0500     Builder* max_statistics_size(size_t max_stats_sz) {
0501       default_column_properties_.set_max_statistics_size(max_stats_sz);
0502       return this;
0503     }
0504 
0505     /// Specify compression codec for the column specified by `path`.
0506     /// Default UNCOMPRESSED.
0507     Builder* compression(const std::string& path, Compression::type codec) {
0508       codecs_[path] = codec;
0509       return this;
0510     }
0511 
0512     /// Specify compression codec for the column specified by `path`.
0513     /// Default UNCOMPRESSED.
0514     Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
0515                          Compression::type codec) {
0516       return this->compression(path->ToDotString(), codec);
0517     }
0518 
0519     /// \brief Specify the default compression level for the compressor in
0520     /// every column.  In case a column does not have an explicitly specified
0521     /// compression level, the default one would be used.
0522     ///
0523     /// The provided compression level is compressor specific. The user would
0524     /// have to familiarize oneself with the available levels for the selected
0525     /// compressor.  If the compressor does not allow for selecting different
0526     /// compression levels, calling this function would not have any effect.
0527     /// Parquet and Arrow do not validate the passed compression level.  If no
0528     /// level is selected by the user or if the special
0529     /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
0530     /// compression level.
0531     ///
0532     /// If other compressor-specific options need to be set in addition to the compression
0533     /// level, use the codec_options method.
0534     Builder* compression_level(int compression_level) {
0535       default_column_properties_.set_compression_level(compression_level);
0536       return this;
0537     }
0538 
0539     /// \brief Specify a compression level for the compressor for the column
0540     /// described by path.
0541     ///
0542     /// The provided compression level is compressor specific. The user would
0543     /// have to familiarize oneself with the available levels for the selected
0544     /// compressor.  If the compressor does not allow for selecting different
0545     /// compression levels, calling this function would not have any effect.
0546     /// Parquet and Arrow do not validate the passed compression level.  If no
0547     /// level is selected by the user or if the special
0548     /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
0549     /// compression level.
0550     Builder* compression_level(const std::string& path, int compression_level) {
0551       if (!codec_options_[path]) {
0552         codec_options_[path] = std::make_shared<CodecOptions>();
0553       }
0554       codec_options_[path]->compression_level = compression_level;
0555       return this;
0556     }
0557 
0558     /// \brief Specify a compression level for the compressor for the column
0559     /// described by path.
0560     ///
0561     /// The provided compression level is compressor specific. The user would
0562     /// have to familiarize oneself with the available levels for the selected
0563     /// compressor.  If the compressor does not allow for selecting different
0564     /// compression levels, calling this function would not have any effect.
0565     /// Parquet and Arrow do not validate the passed compression level.  If no
0566     /// level is selected by the user or if the special
0567     /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
0568     /// compression level.
0569     Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
0570                                int compression_level) {
0571       return this->compression_level(path->ToDotString(), compression_level);
0572     }
0573 
0574     /// \brief Specify the default codec options for the compressor in
0575     /// every column.
0576     ///
0577     /// The codec options allow configuring the compression level as well
0578     /// as other codec-specific options.
0579     Builder* codec_options(
0580         const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
0581       default_column_properties_.set_codec_options(codec_options);
0582       return this;
0583     }
0584 
0585     /// \brief Specify the codec options for the compressor for the column
0586     /// described by path.
0587     Builder* codec_options(
0588         const std::string& path,
0589         const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
0590       codec_options_[path] = codec_options;
0591       return this;
0592     }
0593 
0594     /// \brief Specify the codec options for the compressor for the column
0595     /// described by path.
0596     Builder* codec_options(
0597         const std::shared_ptr<schema::ColumnPath>& path,
0598         const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
0599       return this->codec_options(path->ToDotString(), codec_options);
0600     }
0601 
0602     /// Define the file encryption properties.
0603     /// Default NULL.
0604     Builder* encryption(
0605         std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
0606       file_encryption_properties_ = std::move(file_encryption_properties);
0607       return this;
0608     }
0609 
0610     /// Enable statistics in general.
0611     /// Default enabled.
0612     Builder* enable_statistics() {
0613       default_column_properties_.set_statistics_enabled(true);
0614       return this;
0615     }
0616 
0617     /// Disable statistics in general.
0618     /// Default enabled.
0619     Builder* disable_statistics() {
0620       default_column_properties_.set_statistics_enabled(false);
0621       return this;
0622     }
0623 
0624     /// Enable statistics for the column specified by `path`.
0625     /// Default enabled.
0626     Builder* enable_statistics(const std::string& path) {
0627       statistics_enabled_[path] = true;
0628       return this;
0629     }
0630 
0631     /// Enable statistics for the column specified by `path`.
0632     /// Default enabled.
0633     Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
0634       return this->enable_statistics(path->ToDotString());
0635     }
0636 
0637     /// Define the sorting columns.
0638     /// Default empty.
0639     ///
0640     /// If sorting columns are set, user should ensure that records
0641     /// are sorted by sorting columns. Otherwise, the storing data
0642     /// will be inconsistent with sorting_columns metadata.
0643     Builder* set_sorting_columns(std::vector<SortingColumn> sorting_columns) {
0644       sorting_columns_ = std::move(sorting_columns);
0645       return this;
0646     }
0647 
0648     /// Disable statistics for the column specified by `path`.
0649     /// Default enabled.
0650     Builder* disable_statistics(const std::string& path) {
0651       statistics_enabled_[path] = false;
0652       return this;
0653     }
0654 
0655     /// Disable statistics for the column specified by `path`.
0656     /// Default enabled.
0657     Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
0658       return this->disable_statistics(path->ToDotString());
0659     }
0660 
0661     /// Allow decimals with 1 <= precision <= 18 to be stored as integers.
0662     ///
0663     /// In Parquet, DECIMAL can be stored in any of the following physical types:
0664     /// - int32: for 1 <= precision <= 9.
0665     /// - int64: for 10 <= precision <= 18.
0666     /// - fixed_len_byte_array: precision is limited by the array size.
0667     ///   Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits.
0668     /// - binary: precision is unlimited. The minimum number of bytes to store
0669     ///   the unscaled value is used.
0670     ///
0671     /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array.
0672     ///
0673     /// When enabled, the C++ writer will use following physical types to store decimals:
0674     /// - int32: for 1 <= precision <= 9.
0675     /// - int64: for 10 <= precision <= 18.
0676     /// - fixed_len_byte_array: for precision > 18.
0677     ///
0678     /// As a consequence, decimal columns stored in integer types are more compact.
0679     Builder* enable_store_decimal_as_integer() {
0680       store_decimal_as_integer_ = true;
0681       return this;
0682     }
0683 
    /// Disable storing decimal logical types with 1 <= precision <= 18 as
    /// integer physical types; they annotate fixed_len_byte_array instead.
    ///
    /// Default disabled.
    ///
    /// \see enable_store_decimal_as_integer for details on the integer mapping.
    Builder* disable_store_decimal_as_integer() {
      store_decimal_as_integer_ = false;
      return this;
    }
0692 
    /// Enable writing page index in general for all columns. Default disabled.
    ///
    /// Writing statistics to the page index disables the old method of writing
    /// statistics to each data page header.
    /// The page index makes filtering more efficient than the page header, as
    /// it gathers all the statistics for a Parquet file in a single place,
    /// avoiding scattered I/O.
    ///
    /// Per-column settings made via enable_write_page_index(path) /
    /// disable_write_page_index(path) still take precedence over this default.
    ///
    /// Please check the link below for more details:
    /// https://github.com/apache/parquet-format/blob/master/PageIndex.md
    Builder* enable_write_page_index() {
      default_column_properties_.set_page_index_enabled(true);
      return this;
    }
0707 
    /// Disable writing page index in general for all columns. Default disabled.
    ///
    /// Per-column settings made via enable_write_page_index(path) still take
    /// precedence over this default.
    Builder* disable_write_page_index() {
      default_column_properties_.set_page_index_enabled(false);
      return this;
    }
0713 
0714     /// Enable writing page index for column specified by `path`. Default disabled.
0715     Builder* enable_write_page_index(const std::string& path) {
0716       page_index_enabled_[path] = true;
0717       return this;
0718     }
0719 
0720     /// Enable writing page index for column specified by `path`. Default disabled.
0721     Builder* enable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
0722       return this->enable_write_page_index(path->ToDotString());
0723     }
0724 
0725     /// Disable writing page index for column specified by `path`. Default disabled.
0726     Builder* disable_write_page_index(const std::string& path) {
0727       page_index_enabled_[path] = false;
0728       return this;
0729     }
0730 
0731     /// Disable writing page index for column specified by `path`. Default disabled.
0732     Builder* disable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
0733       return this->disable_write_page_index(path->ToDotString());
0734     }
0735 
    /// \brief Set the level to write size statistics for all columns. Default is None.
    ///
    /// \param level The level to write size statistics. Note that if page index is not
    /// enabled, page level size statistics will not be written even if the level
    /// is set to PageAndColumnChunk.
    ///
    /// \see WriterProperties::size_statistics_level
    Builder* set_size_statistics_level(SizeStatisticsLevel level) {
      size_statistics_level_ = level;
      return this;
    }
0745 
0746     /// \brief Build the WriterProperties with the builder parameters.
0747     /// \return The WriterProperties defined by the builder.
0748     std::shared_ptr<WriterProperties> build() {
0749       std::unordered_map<std::string, ColumnProperties> column_properties;
0750       auto get = [&](const std::string& key) -> ColumnProperties& {
0751         auto it = column_properties.find(key);
0752         if (it == column_properties.end())
0753           return column_properties[key] = default_column_properties_;
0754         else
0755           return it->second;
0756       };
0757 
0758       for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
0759       for (const auto& item : codecs_) get(item.first).set_compression(item.second);
0760       for (const auto& item : codec_options_)
0761         get(item.first).set_codec_options(item.second);
0762       for (const auto& item : dictionary_enabled_)
0763         get(item.first).set_dictionary_enabled(item.second);
0764       for (const auto& item : statistics_enabled_)
0765         get(item.first).set_statistics_enabled(item.second);
0766       for (const auto& item : page_index_enabled_)
0767         get(item.first).set_page_index_enabled(item.second);
0768 
0769       return std::shared_ptr<WriterProperties>(new WriterProperties(
0770           pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
0771           pagesize_, version_, created_by_, page_checksum_enabled_,
0772           size_statistics_level_, std::move(file_encryption_properties_),
0773           default_column_properties_, column_properties, data_page_version_,
0774           store_decimal_as_integer_, std::move(sorting_columns_),
0775           content_defined_chunking_enabled_, content_defined_chunking_options_));
0776     }
0777 
   private:
    // Memory pool used by the writer (not owned).
    MemoryPool* pool_;
    int64_t dictionary_pagesize_limit_;
    int64_t write_batch_size_;
    int64_t max_row_group_length_;
    int64_t pagesize_;
    ParquetVersion::type version_;
    ParquetDataPageVersion data_page_version_;
    std::string created_by_;
    bool store_decimal_as_integer_;
    bool page_checksum_enabled_;
    SizeStatisticsLevel size_statistics_level_;

    std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;

    // If empty, there are no sorting columns.
    std::vector<SortingColumn> sorting_columns_;

    // Settings used for each column unless overridden in any of the maps below
    ColumnProperties default_column_properties_;
    // Per-column overrides, keyed by dotted column path; merged into
    // ColumnProperties instances in build().
    std::unordered_map<std::string, Encoding::type> encodings_;
    std::unordered_map<std::string, Compression::type> codecs_;
    std::unordered_map<std::string, std::shared_ptr<CodecOptions>> codec_options_;
    std::unordered_map<std::string, bool> dictionary_enabled_;
    std::unordered_map<std::string, bool> statistics_enabled_;
    std::unordered_map<std::string, bool> page_index_enabled_;

    bool content_defined_chunking_enabled_;
    CdcOptions content_defined_chunking_options_;
0807   };
0808 
  /// Return the memory pool used for allocations (not owned).
  inline MemoryPool* memory_pool() const { return pool_; }

  /// Return the dictionary page size limit.
  inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }

  /// Return the number of values written per batch.
  inline int64_t write_batch_size() const { return write_batch_size_; }

  /// Return the maximum number of rows per row group.
  inline int64_t max_row_group_length() const { return max_row_group_length_; }

  /// Return the target data page size.
  inline int64_t data_pagesize() const { return pagesize_; }

  /// Return the data page format version to write.
  inline ParquetDataPageVersion data_page_version() const {
    return parquet_data_page_version_;
  }

  /// Return the Parquet format version to write.
  inline ParquetVersion::type version() const { return parquet_version_; }

  /// Return the "created by" string embedded in the file metadata.
  inline std::string created_by() const { return parquet_created_by_; }

  /// Return whether short decimals are stored as integer physical types.
  inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; }

  /// Return whether page checksums are written.
  inline bool page_checksum_enabled() const { return page_checksum_enabled_; }

  /// Return whether content-defined chunking is enabled.
  inline bool content_defined_chunking_enabled() const {
    return content_defined_chunking_enabled_;
  }
  /// Return the options used for content-defined chunking.
  inline CdcOptions content_defined_chunking_options() const {
    return content_defined_chunking_options_;
  }

  /// Return the level at which size statistics are written.
  inline SizeStatisticsLevel size_statistics_level() const {
    return size_statistics_level_;
  }
0841 
0842   inline Encoding::type dictionary_index_encoding() const {
0843     if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
0844       return Encoding::PLAIN_DICTIONARY;
0845     } else {
0846       return Encoding::RLE_DICTIONARY;
0847     }
0848   }
0849 
0850   inline Encoding::type dictionary_page_encoding() const {
0851     if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
0852       return Encoding::PLAIN_DICTIONARY;
0853     } else {
0854       return Encoding::PLAIN;
0855     }
0856   }
0857 
0858   const ColumnProperties& column_properties(
0859       const std::shared_ptr<schema::ColumnPath>& path) const {
0860     auto it = column_properties_.find(path->ToDotString());
0861     if (it != column_properties_.end()) return it->second;
0862     return default_column_properties_;
0863   }
0864 
  /// Return the encoding configured for the column at `path`.
  Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).encoding();
  }

  /// Return the compression codec configured for the column at `path`.
  Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).compression();
  }

  /// Return the compression level configured for the column at `path`.
  int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).compression_level();
  }

  /// Return the codec options configured for the column at `path`.
  const std::shared_ptr<CodecOptions> codec_options(
      const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).codec_options();
  }

  /// Return whether dictionary encoding is enabled for the column at `path`.
  bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).dictionary_enabled();
  }

  /// Return the sorting columns; empty when no sort order was specified.
  const std::vector<SortingColumn>& sorting_columns() const { return sorting_columns_; }

  /// Return whether statistics are enabled for the column at `path`.
  bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).statistics_enabled();
  }

  /// Return the maximum statistics size for the column at `path`.
  size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).max_statistics_size();
  }

  /// Return whether the page index is enabled for the column at `path`.
  bool page_index_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
    return column_properties(path).page_index_enabled();
  }
0899 
0900   bool page_index_enabled() const {
0901     if (default_column_properties_.page_index_enabled()) {
0902       return true;
0903     }
0904     for (const auto& item : column_properties_) {
0905       if (item.second.page_index_enabled()) {
0906         return true;
0907       }
0908     }
0909     return false;
0910   }
0911 
  /// Return the file-level encryption properties, or nullptr when the file is
  /// written unencrypted. Ownership is retained by this WriterProperties.
  inline FileEncryptionProperties* file_encryption_properties() const {
    return file_encryption_properties_.get();
  }
0915 
0916   std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
0917       const std::string& path) const {
0918     if (file_encryption_properties_) {
0919       return file_encryption_properties_->column_encryption_properties(path);
0920     } else {
0921       return NULLPTR;
0922     }
0923   }
0924 
  /// \brief Return the default column properties, used for any column without
  /// a per-column override.
  const ColumnProperties& default_column_properties() const {
    return default_column_properties_;
  }
0929 
0930  private:
0931   explicit WriterProperties(
0932       MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
0933       int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
0934       const std::string& created_by, bool page_write_checksum_enabled,
0935       SizeStatisticsLevel size_statistics_level,
0936       std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
0937       const ColumnProperties& default_column_properties,
0938       const std::unordered_map<std::string, ColumnProperties>& column_properties,
0939       ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer,
0940       std::vector<SortingColumn> sorting_columns, bool content_defined_chunking_enabled,
0941       CdcOptions content_defined_chunking_options)
0942       : pool_(pool),
0943         dictionary_pagesize_limit_(dictionary_pagesize_limit),
0944         write_batch_size_(write_batch_size),
0945         max_row_group_length_(max_row_group_length),
0946         pagesize_(pagesize),
0947         parquet_data_page_version_(data_page_version),
0948         parquet_version_(version),
0949         parquet_created_by_(created_by),
0950         store_decimal_as_integer_(store_short_decimal_as_integer),
0951         page_checksum_enabled_(page_write_checksum_enabled),
0952         size_statistics_level_(size_statistics_level),
0953         file_encryption_properties_(file_encryption_properties),
0954         sorting_columns_(std::move(sorting_columns)),
0955         default_column_properties_(default_column_properties),
0956         column_properties_(column_properties),
0957         content_defined_chunking_enabled_(content_defined_chunking_enabled),
0958         content_defined_chunking_options_(content_defined_chunking_options) {}
0959 
  // Memory pool used by the writer (not owned).
  MemoryPool* pool_;
  int64_t dictionary_pagesize_limit_;
  int64_t write_batch_size_;
  int64_t max_row_group_length_;
  int64_t pagesize_;
  ParquetDataPageVersion parquet_data_page_version_;
  ParquetVersion::type parquet_version_;
  std::string parquet_created_by_;
  bool store_decimal_as_integer_;
  bool page_checksum_enabled_;
  SizeStatisticsLevel size_statistics_level_;

  std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;

  // If empty, there are no sorting columns.
  std::vector<SortingColumn> sorting_columns_;

  // Settings used for any column without an entry in column_properties_.
  ColumnProperties default_column_properties_;
  // Per-column overrides, keyed by dotted column path.
  std::unordered_map<std::string, ColumnProperties> column_properties_;

  bool content_defined_chunking_enabled_;
  CdcOptions content_defined_chunking_options_;
0981 };
0982 
0983 PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
0984 
0985 // ----------------------------------------------------------------------
0986 // Properties specific to Apache Arrow columnar read and write
0987 
// Default for whether the Arrow reader parses columns in parallel.
static constexpr bool kArrowDefaultUseThreads = false;

// Default number of rows to read when using ::arrow::RecordBatchReader
static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;

// Default Arrow types used when reading BYTE_ARRAY and list columns.
constexpr inline ::arrow::Type::type kArrowDefaultBinaryType = ::arrow::Type::BINARY;
constexpr inline ::arrow::Type::type kArrowDefaultListType = ::arrow::Type::LIST;
0995 
/// EXPERIMENTAL: Properties for configuring FileReader behavior.
class PARQUET_EXPORT ArrowReaderProperties {
 public:
  /// \brief Construct reader properties.
  ///
  /// \param use_threads whether to parse columns in parallel (default false).
  explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
      : use_threads_(use_threads),
        read_dict_indices_(),
        batch_size_(kArrowDefaultBatchSize),
        pre_buffer_(true),
        cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
        coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
        binary_type_(kArrowDefaultBinaryType),
        list_type_(kArrowDefaultListType),
        arrow_extensions_enabled_(false),
        should_load_statistics_(false),
        smallest_decimal_enabled_(false) {}

  /// \brief Set whether to use the IO thread pool to parse columns in parallel.
  ///
  /// Default is false.
  void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
  /// Return whether will use multiple threads.
  bool use_threads() const { return use_threads_; }

  /// \brief Set whether to read a particular column as dictionary encoded.
  ///
  /// If the file metadata contains a serialized Arrow schema, the Arrow
  /// schema takes precedence and this setting may be ignored
  /// (see ArrowWriterProperties::store_schema).
  ///
  /// This is only supported for columns with a Parquet physical type of
  /// BYTE_ARRAY, such as string or binary types.
  void set_read_dictionary(int column_index, bool read_dict) {
    if (read_dict) {
      read_dict_indices_.insert(column_index);
    } else {
      read_dict_indices_.erase(column_index);
    }
  }
  /// Return whether the column at the index will be read as dictionary.
  bool read_dictionary(int column_index) const {
    if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
      return true;
    } else {
      return false;
    }
  }

  /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as.
  ///
  /// Allowed values are Type::BINARY, Type::LARGE_BINARY and Type::BINARY_VIEW.
  /// Default is Type::BINARY.
  ///
  /// If a BYTE_ARRAY column has the STRING logical type, it is read as the
  /// Arrow string type corresponding to the configured binary type (for example
  /// Type::LARGE_STRING if the configured binary type is Type::LARGE_BINARY).
  ///
  /// However, if a serialized Arrow schema is found in the Parquet metadata,
  /// this setting is ignored and the Arrow schema takes precedence
  /// (see ArrowWriterProperties::store_schema).
  void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }
  /// Return the Arrow binary type to read BYTE_ARRAY columns as.
  ::arrow::Type::type binary_type() const { return binary_type_; }

  /// \brief Set the Arrow list type to read Parquet list columns as.
  ///
  /// Allowed values are Type::LIST and Type::LARGE_LIST.
  /// Default is Type::LIST.
  ///
  /// However, if a serialized Arrow schema is found in the Parquet metadata,
  /// this setting is ignored and the Arrow schema takes precedence
  /// (see ArrowWriterProperties::store_schema).
  void set_list_type(::arrow::Type::type value) { list_type_ = value; }
  /// Return the Arrow list type to read Parquet list columns as.
  ::arrow::Type::type list_type() const { return list_type_; }

  /// \brief Set the maximum number of rows to read into a record batch.
  ///
  /// Will only be fewer rows when there are no more rows in the file.
  /// Note that some APIs such as ReadTable may ignore this setting.
  void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
  /// Return the batch size in rows.
  ///
  /// Note that some APIs such as ReadTable may ignore this setting.
  int64_t batch_size() const { return batch_size_; }

  /// Enable read coalescing (default false).
  ///
  /// When enabled, the Arrow reader will pre-buffer necessary regions
  /// of the file in-memory. This is intended to improve performance on
  /// high-latency filesystems (e.g. Amazon S3).
  void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
  /// Return whether read coalescing is enabled.
  bool pre_buffer() const { return pre_buffer_; }

  /// Set options for read coalescing. This can be used to tune the
  /// implementation for characteristics of different filesystems.
  void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
  /// Return the options for read coalescing.
  const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }

  /// Set execution context for read coalescing.
  void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
  /// Return the execution context used for read coalescing.
  const ::arrow::io::IOContext& io_context() const { return io_context_; }

  /// Set timestamp unit to use for deprecated INT96-encoded timestamps
  /// (default is NANO).
  void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
    coerce_int96_timestamp_unit_ = unit;
  }

  /// Return the timestamp unit used for INT96-encoded timestamps.
  ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
    return coerce_int96_timestamp_unit_;
  }

  /// Enable Parquet-supported Arrow extension types.
  ///
  /// When enabled, Parquet logical types will be mapped to their corresponding Arrow
  /// extension types at read time, if such exist. Currently only arrow::extension::json()
  /// extension type is supported. Columns whose LogicalType is JSON will be interpreted
  /// as arrow::extension::json(), with storage type inferred from the serialized Arrow
  /// schema if present, or `utf8` by default.
  void set_arrow_extensions_enabled(bool extensions_enabled) {
    arrow_extensions_enabled_ = extensions_enabled;
  }
  /// Return whether Arrow extension types are enabled.
  bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; }

  /// \brief Set whether to load statistics as much as possible.
  ///
  /// Default is false.
  void set_should_load_statistics(bool should_load_statistics) {
    should_load_statistics_ = should_load_statistics;
  }
  /// Return whether loading statistics as much as possible.
  bool should_load_statistics() const { return should_load_statistics_; }

  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical types.
  ///
  /// Default is false for compatibility, meaning that only Decimal128 and Decimal256
  /// can be inferred.
  void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
    smallest_decimal_enabled_ = smallest_decimal_enable;
  }
  /// \brief Whether to infer Decimal32/64 from Parquet decimal logical types.
  ///
  /// When enabled, Parquet decimal columns will be inferred as the smallest possible
  /// Arrow Decimal type.
  /// When disabled, Parquet decimal columns will be inferred as either Decimal128 or
  /// Decimal256, but not Decimal32/64.
  ///
  /// Note: if an Arrow schema is found in the Parquet metadata, it will take priority and
  /// this setting will be ignored.
  bool smallest_decimal_enabled() const { return smallest_decimal_enabled_; }

 private:
  bool use_threads_;
  // Column indices to read as dictionary, see set_read_dictionary().
  std::unordered_set<int> read_dict_indices_;
  int64_t batch_size_;
  bool pre_buffer_;
  ::arrow::io::IOContext io_context_;
  ::arrow::io::CacheOptions cache_options_;
  ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
  ::arrow::Type::type binary_type_;
  ::arrow::Type::type list_type_;
  bool arrow_extensions_enabled_;
  bool should_load_statistics_;
  bool smallest_decimal_enabled_;
};
1162 
1163 /// EXPERIMENTAL: Constructs the default ArrowReaderProperties
1164 PARQUET_EXPORT
1165 ArrowReaderProperties default_arrow_reader_properties();
1166 
class PARQUET_EXPORT ArrowWriterProperties {
 public:
  /// \brief Version of the engine converting Arrow data to Parquet.
  enum EngineVersion {
    V1,  // Supports only nested lists.
    V2   // Full support for all nesting combinations
  };
  /// \brief Builder for ArrowWriterProperties.
  class Builder {
   public:
    /// \brief Construct a builder with default values.
    Builder()
        : write_timestamps_as_int96_(false),
          coerce_timestamps_enabled_(false),
          coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
          truncated_timestamps_allowed_(false),
          store_schema_(false),
          compliant_nested_types_(true),
          engine_version_(V2),
          use_threads_(kArrowDefaultUseThreads),
          executor_(NULLPTR),
          write_time_adjusted_to_utc_(false) {}
    virtual ~Builder() = default;

    /// \brief Disable writing legacy int96 timestamps (default disabled).
    Builder* disable_deprecated_int96_timestamps() {
      write_timestamps_as_int96_ = false;
      return this;
    }

    /// \brief Enable writing legacy int96 timestamps (default disabled).
    ///
    /// May be turned on to write timestamps compatible with older Parquet writers.
    /// This takes precedence over coerce_timestamps.
    Builder* enable_deprecated_int96_timestamps() {
      write_timestamps_as_int96_ = true;
      return this;
    }

    /// \brief Coerce all timestamps to the specified time unit.
    /// \param unit time unit to truncate to.
    /// For Parquet versions 1.0 and 2.4, nanoseconds are cast to microseconds.
    Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
      coerce_timestamps_enabled_ = true;
      coerce_timestamps_unit_ = unit;
      return this;
    }

    /// \brief Allow loss of data when truncating timestamps.
    ///
    /// This is disallowed by default and an error will be returned.
    Builder* allow_truncated_timestamps() {
      truncated_timestamps_allowed_ = true;
      return this;
    }

    /// \brief Disallow loss of data when truncating timestamps (default).
    Builder* disallow_truncated_timestamps() {
      truncated_timestamps_allowed_ = false;
      return this;
    }

    /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
    /// to enable certain read options (like "read_dictionary") to be set
    /// automatically
    Builder* store_schema() {
      store_schema_ = true;
      return this;
    }

    /// \brief When enabled, will not preserve Arrow field names for list types.
    ///
    /// Instead of using the field names Arrow uses for the values array of
    /// list types (default "item"), will use "element", as is specified in
    /// the Parquet spec.
    ///
    /// This is enabled by default.
    Builder* enable_compliant_nested_types() {
      compliant_nested_types_ = true;
      return this;
    }

    /// Preserve Arrow list field name.
    Builder* disable_compliant_nested_types() {
      compliant_nested_types_ = false;
      return this;
    }

    /// Set the version of the Parquet writer engine.
    Builder* set_engine_version(EngineVersion version) {
      engine_version_ = version;
      return this;
    }

    /// \brief Set whether to use multiple threads to write columns
    /// in parallel in the buffered row group mode.
    ///
    /// WARNING: If writing multiple files in parallel in the same
    /// executor, deadlock may occur if use_threads is true. Please
    /// disable it in this case.
    ///
    /// Default is false.
    Builder* set_use_threads(bool use_threads) {
      use_threads_ = use_threads;
      return this;
    }

    /// \brief Set the executor to write columns in parallel in the
    /// buffered row group mode.
    ///
    /// Default is nullptr and the default cpu executor will be used.
    Builder* set_executor(::arrow::internal::Executor* executor) {
      executor_ = executor;
      return this;
    }

    /// \brief Set the value of isAdjustedToUTC when writing a TIME column
    ///
    /// Default is false because Arrow TIME data is expressed in an unspecified timezone.
    /// Note this setting doesn't affect TIMESTAMP data.
    Builder* set_time_adjusted_to_utc(bool adjusted) {
      write_time_adjusted_to_utc_ = adjusted;
      return this;
    }

    /// Create the final properties.
    std::shared_ptr<ArrowWriterProperties> build() {
      return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
          write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
          truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
          engine_version_, use_threads_, executor_, write_time_adjusted_to_utc_));
    }

   private:
    bool write_timestamps_as_int96_;

    bool coerce_timestamps_enabled_;
    ::arrow::TimeUnit::type coerce_timestamps_unit_;
    bool truncated_timestamps_allowed_;

    bool store_schema_;
    bool compliant_nested_types_;
    EngineVersion engine_version_;

    bool use_threads_;
    // Executor used for parallel column writes (not owned; may be null).
    ::arrow::internal::Executor* executor_;

    bool write_time_adjusted_to_utc_;
  };

  /// Return whether legacy int96 timestamps are written.
  bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }

  /// Return whether timestamps are coerced to a single unit.
  bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
  /// Return the unit timestamps are coerced to (see coerce_timestamps_enabled).
  ::arrow::TimeUnit::type coerce_timestamps_unit() const {
    return coerce_timestamps_unit_;
  }

  /// Return whether lossy timestamp truncation is permitted.
  bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }

  /// Return whether the serialized Arrow schema is stored in the file.
  bool store_schema() const { return store_schema_; }

  /// \brief Enable nested type naming according to the parquet specification.
  ///
  /// Older versions of arrow wrote out field names for nested lists based on the name
  /// of the field.  According to the parquet specification they should always be
  /// "element".
  bool compliant_nested_types() const { return compliant_nested_types_; }

  /// \brief The underlying engine version to use when writing Arrow data.
  ///
  /// V2 is currently the latest; V1 is considered deprecated but left in
  /// place in case there are bugs detected in V2.
  EngineVersion engine_version() const { return engine_version_; }

  /// \brief Returns whether the writer will use multiple threads
  /// to write columns in parallel in the buffered row group mode.
  bool use_threads() const { return use_threads_; }

  /// \brief Returns the executor used to write columns in parallel.
  ::arrow::internal::Executor* executor() const;

  /// \brief The value of isAdjustedToUTC when writing a TIME column
  ///
  /// Note this setting doesn't affect TIMESTAMP data.
  bool write_time_adjusted_to_utc() const { return write_time_adjusted_to_utc_; }

 private:
  // Private constructor; instances are created via Builder::build().
  explicit ArrowWriterProperties(bool write_nanos_as_int96,
                                 bool coerce_timestamps_enabled,
                                 ::arrow::TimeUnit::type coerce_timestamps_unit,
                                 bool truncated_timestamps_allowed, bool store_schema,
                                 bool compliant_nested_types,
                                 EngineVersion engine_version, bool use_threads,
                                 ::arrow::internal::Executor* executor,
                                 bool write_time_adjusted_to_utc)
      : write_timestamps_as_int96_(write_nanos_as_int96),
        coerce_timestamps_enabled_(coerce_timestamps_enabled),
        coerce_timestamps_unit_(coerce_timestamps_unit),
        truncated_timestamps_allowed_(truncated_timestamps_allowed),
        store_schema_(store_schema),
        compliant_nested_types_(compliant_nested_types),
        engine_version_(engine_version),
        use_threads_(use_threads),
        executor_(executor),
        write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {}

  const bool write_timestamps_as_int96_;
  const bool coerce_timestamps_enabled_;
  const ::arrow::TimeUnit::type coerce_timestamps_unit_;
  const bool truncated_timestamps_allowed_;
  const bool store_schema_;
  const bool compliant_nested_types_;
  const EngineVersion engine_version_;
  const bool use_threads_;
  ::arrow::internal::Executor* executor_;
  const bool write_time_adjusted_to_utc_;
};
1381 
/// \brief State object used for writing Arrow data directly to a Parquet
/// column chunk. API possibly not stable
struct ArrowWriteContext {
  /// \param memory_pool pool backing the scratch buffers (not owned)
  /// \param properties writer properties (not owned; must outlive this context)
  ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
      : memory_pool(memory_pool),
        properties(properties),
        data_buffer(AllocateBuffer(memory_pool)),
        def_levels_buffer(AllocateBuffer(memory_pool)) {}

  /// \brief Resize the scratch buffer to hold `num_values` elements of T and
  /// return a pointer to its storage through `*out`.
  ///
  /// The `false` argument to Resize skips shrink-to-fit so capacity is kept
  /// across calls; the returned pointer may be invalidated by the next call.
  template <typename T>
  ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
    ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
    *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
    return ::arrow::Status::OK();
  }

  MemoryPool* memory_pool;
  const ArrowWriterProperties* properties;

  // Buffer used for storing the data of an array converted to the physical type
  // as expected by parquet-cpp.
  std::shared_ptr<ResizableBuffer> data_buffer;

  // We use the shared ownership of this buffer
  std::shared_ptr<ResizableBuffer> def_levels_buffer;
};
1408 
1409 PARQUET_EXPORT
1410 std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
1411 
1412 }  // namespace parquet