Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <algorithm>
0021 #include <cstdint>
0022 #include <cstring>
0023 #include <iterator>
0024 #include <memory>
0025 #include <sstream>
0026 #include <string>
0027 #include <string_view>
0028 
0029 #include "parquet/platform.h"
0030 #include "parquet/type_fwd.h"
0031 #include "parquet/windows_fixup.h"  // for OPTIONAL
0032 
0033 namespace arrow::util {
0034 
0035 class Codec;
0036 
0037 }  // namespace arrow::util
0038 
0039 namespace parquet {
0040 
0041 // ----------------------------------------------------------------------
0042 // Metadata enums to match Thrift metadata
0043 //
0044 // The reason we maintain our own enums is to avoid transitive dependency on
0045 // the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
0046 // public API. After building parquet-cpp, you should not need to include
0047 // Thrift headers in your application. This means some boilerplate to convert
0048 // between our types and Parquet's Thrift types.
0049 //
0050 // We can also add special values like NONE to distinguish between metadata
0051 // values being set and not set. As an example consider ConvertedType and
0052 // CompressionCodec
0053 
// Mirrors parquet::Type
//
// Tags for the physical types a column can be stored as. Every enumerator has
// an explicit value, so the numbering does not depend on declaration order.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7,
    // Should always be last element.
    UNDEFINED = 8
  };
};
0069 
// Mirrors parquet::ConvertedType
//
// Every enumerator carries its numeric value explicitly so that inserting or
// reordering entries cannot silently renumber the ones that follow. Note the
// gap between INTERVAL (22) and NA (25).
struct ConvertedType {
  enum type {
    // Not a real converted type, but means no converted type is specified
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    // DEPRECATED INVALID ConvertedType for all-null data.
    // Only useful for reading legacy files written out by interim Parquet C++ releases.
    // For writing, always emit LogicalType::Null instead.
    // See PARQUET-1990.
    NA = 25,
    // Not a real converted type; should always be last element
    UNDEFINED = 26
  };
};
0104 
0105 // forward declaration
0106 namespace format {
0107 
0108 class LogicalType;
0109 
0110 }
0111 
// Mirrors parquet::FieldRepetitionType
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2,
    // Should always be last element.
    UNDEFINED = 3
  };
};
0116 
// Sort order for page and column statistics.
//
// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
//                            format/converter/ParquetMetadataConverter.java
//
// Types are associated with sort orders (e.g., UTF8 columns should use
// UNSIGNED) and column stats are aggregated using a sort order. As of
// parquet-format version 2.3.1, the order used to aggregate stats is always
// SIGNED and is not stored in the Parquet file. These stats are discarded for
// types that need unsigned. See PARQUET-686.
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
0129 
namespace schema {

// Decimal parameters that accompany a legacy ConvertedType. Elsewhere in this
// header the "no decimal metadata" default is spelled {false, -1, -1}.
struct DecimalMetadata {
  // Whether scale/precision carry meaningful values.
  bool isset;
  int32_t scale;
  int32_t precision;
};

}  // namespace schema
0139 
0140 /// \brief Implementation of parquet.thrift LogicalType types.
0141 class PARQUET_EXPORT LogicalType {
0142  public:
0143   struct Type {
0144     enum type {
0145       UNDEFINED = 0,  // Not a real logical type
0146       STRING = 1,
0147       MAP,
0148       LIST,
0149       ENUM,
0150       DECIMAL,
0151       DATE,
0152       TIME,
0153       TIMESTAMP,
0154       INTERVAL,
0155       INT,
0156       NIL,  // Thrift NullType: annotates data that is always null
0157       JSON,
0158       BSON,
0159       UUID,
0160       FLOAT16,
0161       GEOMETRY,
0162       GEOGRAPHY,
0163       VARIANT,
0164       NONE  // Not a real logical type; should always be last element
0165     };
0166   };
0167 
0168   struct TimeUnit {
0169     enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
0170   };
0171 
0172   enum class EdgeInterpolationAlgorithm {
0173     UNKNOWN = 0,
0174     SPHERICAL = 1,
0175     VINCENTY = 2,
0176     THOMAS = 3,
0177     ANDOYER = 4,
0178     KARNEY = 5
0179   };
0180 
0181   /// \brief The latest supported Variant specification version by this library
0182   static constexpr int8_t kVariantSpecVersion = 1;
0183 
0184   /// \brief If possible, return a logical type equivalent to the given legacy
0185   /// converted type (and decimal metadata if applicable).
0186   static std::shared_ptr<const LogicalType> FromConvertedType(
0187       const parquet::ConvertedType::type converted_type,
0188       const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
0189                                                                            -1});
0190 
0191   /// \brief Return the logical type represented by the Thrift intermediary object.
0192   static std::shared_ptr<const LogicalType> FromThrift(
0193       const parquet::format::LogicalType& thrift_logical_type);
0194 
0195   /// \brief Return the explicitly requested logical type.
0196   static std::shared_ptr<const LogicalType> String();
0197   static std::shared_ptr<const LogicalType> Map();
0198   static std::shared_ptr<const LogicalType> List();
0199   static std::shared_ptr<const LogicalType> Enum();
0200   static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
0201   static std::shared_ptr<const LogicalType> Date();
0202   static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
0203                                                  LogicalType::TimeUnit::unit time_unit);
0204 
0205   /// \brief Create a Timestamp logical type
0206   /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
0207   /// \param[in] time_unit the resolution of the timestamp
0208   /// \param[in] is_from_converted_type if true, the timestamp was generated
0209   /// by translating a legacy converted type of TIMESTAMP_MILLIS or
0210   /// TIMESTAMP_MICROS. Default is false.
0211   /// \param[in] force_set_converted_type if true, always set the
0212   /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
0213   /// metadata. Default is false
0214   static std::shared_ptr<const LogicalType> Timestamp(
0215       bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
0216       bool is_from_converted_type = false, bool force_set_converted_type = false);
0217 
0218   static std::shared_ptr<const LogicalType> Interval();
0219   static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
0220 
0221   /// \brief Create a logical type for data that's always null
0222   ///
0223   /// Any physical type can be annotated with this logical type.
0224   static std::shared_ptr<const LogicalType> Null();
0225 
0226   static std::shared_ptr<const LogicalType> JSON();
0227   static std::shared_ptr<const LogicalType> BSON();
0228   static std::shared_ptr<const LogicalType> UUID();
0229   static std::shared_ptr<const LogicalType> Float16();
0230   static std::shared_ptr<const LogicalType> Variant(
0231       int8_t specVersion = kVariantSpecVersion);
0232 
0233   static std::shared_ptr<const LogicalType> Geometry(std::string crs = "");
0234 
0235   static std::shared_ptr<const LogicalType> Geography(
0236       std::string crs = "", LogicalType::EdgeInterpolationAlgorithm algorithm =
0237                                 EdgeInterpolationAlgorithm::SPHERICAL);
0238 
0239   /// \brief Create a placeholder for when no logical type is specified
0240   static std::shared_ptr<const LogicalType> None();
0241 
0242   /// \brief Return true if this logical type is consistent with the given underlying
0243   /// physical type.
0244   bool is_applicable(parquet::Type::type primitive_type,
0245                      int32_t primitive_length = -1) const;
0246 
0247   /// \brief Return true if this logical type is equivalent to the given legacy converted
0248   /// type (and decimal metadata if applicable).
0249   bool is_compatible(parquet::ConvertedType::type converted_type,
0250                      parquet::schema::DecimalMetadata converted_decimal_metadata = {
0251                          false, -1, -1}) const;
0252 
0253   /// \brief If possible, return the legacy converted type (and decimal metadata if
0254   /// applicable) equivalent to this logical type.
0255   parquet::ConvertedType::type ToConvertedType(
0256       parquet::schema::DecimalMetadata* out_decimal_metadata) const;
0257 
0258   /// \brief Return a printable representation of this logical type.
0259   std::string ToString() const;
0260 
0261   /// \brief Return a JSON representation of this logical type.
0262   std::string ToJSON() const;
0263 
0264   /// \brief Return a serializable Thrift object for this logical type.
0265   parquet::format::LogicalType ToThrift() const;
0266 
0267   /// \brief Return true if the given logical type is equivalent to this logical type.
0268   bool Equals(const LogicalType& other) const;
0269 
0270   /// \brief Return the enumerated type of this logical type.
0271   LogicalType::Type::type type() const;
0272 
0273   /// \brief Return the appropriate sort order for this logical type.
0274   SortOrder::type sort_order() const;
0275 
0276   // Type checks ...
0277   bool is_string() const;
0278   bool is_map() const;
0279   bool is_list() const;
0280   bool is_enum() const;
0281   bool is_decimal() const;
0282   bool is_date() const;
0283   bool is_time() const;
0284   bool is_timestamp() const;
0285   bool is_interval() const;
0286   bool is_int() const;
0287   bool is_null() const;
0288   bool is_JSON() const;
0289   bool is_BSON() const;
0290   bool is_UUID() const;
0291   bool is_float16() const;
0292   bool is_geometry() const;
0293   bool is_geography() const;
0294   bool is_variant() const;
0295   bool is_none() const;
0296   /// \brief Return true if this logical type is of a known type.
0297   bool is_valid() const;
0298   bool is_invalid() const;
0299   /// \brief Return true if this logical type is suitable for a schema GroupNode.
0300   bool is_nested() const;
0301   bool is_nonnested() const;
0302   /// \brief Return true if this logical type is included in the Thrift output for its
0303   /// node.
0304   bool is_serialized() const;
0305 
0306   LogicalType(const LogicalType&) = delete;
0307   LogicalType& operator=(const LogicalType&) = delete;
0308   virtual ~LogicalType() noexcept;
0309 
0310  protected:
0311   LogicalType();
0312 
0313   class Impl;
0314   std::unique_ptr<const Impl> impl_;
0315 };
0316 
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
class PARQUET_EXPORT StringLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  StringLogicalType() = default;
};
0325 
/// \brief Allowed for group nodes only.
class PARQUET_EXPORT MapLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  MapLogicalType() = default;
};
0334 
/// \brief Allowed for group nodes only.
class PARQUET_EXPORT ListLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  ListLogicalType() = default;
};
0343 
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
class PARQUET_EXPORT EnumLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  EnumLogicalType() = default;
};
0352 
/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
/// depending on the precision.
class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
 public:
  /// \brief Factory function taking the decimal precision and scale; scale
  /// defaults to 0.
  static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
  /// \brief Accessor for the precision parameter.
  int32_t precision() const;
  /// \brief Accessor for the scale parameter.
  int32_t scale() const;

 private:
  // Not publicly constructible; use Make().
  DecimalLogicalType() = default;
};
0364 
/// \brief Allowed for physical type INT32.
class PARQUET_EXPORT DateLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  DateLogicalType() = default;
};
0373 
/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
class PARQUET_EXPORT TimeLogicalType : public LogicalType {
 public:
  /// \brief Factory function taking the UTC-adjustment flag and the time
  /// resolution.
  static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
                                                 LogicalType::TimeUnit::unit time_unit);
  /// \brief Accessor for the UTC-adjustment flag.
  bool is_adjusted_to_utc() const;
  /// \brief Accessor for the time resolution.
  LogicalType::TimeUnit::unit time_unit() const;

 private:
  // Not publicly constructible; use Make().
  TimeLogicalType() = default;
};
0385 
/// \brief Allowed for physical type INT64.
class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
 public:
  /// \brief Factory function taking the UTC-adjustment flag, the time
  /// resolution and two converted-type flags, which both default to false
  /// (see the accessors below for what the flags control).
  static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
                                                 LogicalType::TimeUnit::unit time_unit,
                                                 bool is_from_converted_type = false,
                                                 bool force_set_converted_type = false);
  /// \brief Accessor for the UTC-adjustment flag.
  bool is_adjusted_to_utc() const;
  /// \brief Accessor for the time resolution.
  LogicalType::TimeUnit::unit time_unit() const;

  /// \brief If true, will not set LogicalType in Thrift metadata
  bool is_from_converted_type() const;

  /// \brief If true, will set ConvertedType for micros and millis
  /// resolution in legacy ConvertedType Thrift metadata
  bool force_set_converted_type() const;

 private:
  // Not publicly constructible; use Make().
  TimestampLogicalType() = default;
};
0406 
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  IntervalLogicalType() = default;
};
0415 
/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
/// (for bit width 64).
class PARQUET_EXPORT IntLogicalType : public LogicalType {
 public:
  /// \brief Factory function taking the integer bit width and signedness.
  static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
  /// \brief Accessor for the bit width.
  int bit_width() const;
  /// \brief Accessor for the signedness flag.
  bool is_signed() const;

 private:
  // Not publicly constructible; use Make().
  IntLogicalType() = default;
};
0427 
/// \brief Allowed for any physical type.
class PARQUET_EXPORT NullLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  NullLogicalType() = default;
};
0436 
/// \brief Allowed for physical type BYTE_ARRAY.
class PARQUET_EXPORT JSONLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  JSONLogicalType() = default;
};
0445 
/// \brief Allowed for physical type BYTE_ARRAY.
class PARQUET_EXPORT BSONLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  BSONLogicalType() = default;
};
0454 
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
/// must encode raw UUID bytes.
class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  UUIDLogicalType() = default;
};
0464 
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 2,
/// must encode raw FLOAT16 bytes.
class PARQUET_EXPORT Float16LogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  Float16LogicalType() = default;
};
0474 
/// \brief Logical type parameterized by a CRS string.
///
/// NOTE(review): "crs" presumably stands for coordinate reference system, and
/// the applicable physical type is not stated in this header -- confirm
/// against the implementation.
class PARQUET_EXPORT GeometryLogicalType : public LogicalType {
 public:
  /// \brief Factory function; crs defaults to the empty string.
  static std::shared_ptr<const LogicalType> Make(std::string crs = "");

  /// \brief Accessor for the CRS string, returned by const reference.
  const std::string& crs() const;

 private:
  // Not publicly constructible; use Make().
  GeometryLogicalType() = default;
};
0484 
/// \brief Logical type parameterized by a CRS string and an edge
/// interpolation algorithm.
///
/// NOTE(review): "crs" presumably stands for coordinate reference system, and
/// the applicable physical type is not stated in this header -- confirm
/// against the implementation.
class PARQUET_EXPORT GeographyLogicalType : public LogicalType {
 public:
  /// \brief Factory function; crs defaults to the empty string and the
  /// algorithm to EdgeInterpolationAlgorithm::SPHERICAL.
  static std::shared_ptr<const LogicalType> Make(
      std::string crs = "", LogicalType::EdgeInterpolationAlgorithm algorithm =
                                EdgeInterpolationAlgorithm::SPHERICAL);

  /// \brief Accessor for the CRS string, returned by const reference.
  const std::string& crs() const;
  /// \brief Accessor for the edge interpolation algorithm.
  LogicalType::EdgeInterpolationAlgorithm algorithm() const;
  /// \brief The algorithm as text. The result is a string_view, so its
  /// lifetime depends on storage owned elsewhere.
  std::string_view algorithm_name() const;

 private:
  // Not publicly constructible; use Make().
  GeographyLogicalType() = default;
};
0498 
/// \brief Allowed for group nodes only.
class PARQUET_EXPORT VariantLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the spec version defaults to
  /// LogicalType::kVariantSpecVersion.
  static std::shared_ptr<const LogicalType> Make(
      int8_t specVersion = kVariantSpecVersion);

  /// \brief Accessor for the Variant specification version.
  int8_t spec_version() const;

 private:
  // Not publicly constructible; use Make().
  VariantLogicalType() = default;
};
0510 
/// \brief Allowed for any physical type.
class PARQUET_EXPORT NoLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  NoLogicalType() = default;
};
0519 
// Internal API, for unrecognized logical types
class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
 public:
  /// \brief Factory function; the result is handed out through a shared
  /// pointer to the const LogicalType base.
  static std::shared_ptr<const LogicalType> Make();

 private:
  // Not publicly constructible; use Make().
  UndefinedLogicalType() = default;
};
0528 
// Data encodings. Mirrors parquet::Encoding
//
// All values are explicit. The value 1 is not assigned to any enumerator
// here, and UNKNOWN sits far outside the contiguous range.
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8,
    BYTE_STREAM_SPLIT = 9,
    // Should always be last element (except UNKNOWN)
    UNDEFINED = 10,
    UNKNOWN = 999
  };
};
0546 
// Exposed data encodings. It is the encoding of the data read from the file,
// rather than the encoding of the data in the file. E.g., the data encoded as
// RLE_DICTIONARY in the file can be read as dictionary indices by RLE
// decoding, in which case the data read from the file is DICTIONARY encoded.
enum class ExposedEncoding {
  NO_ENCODING = 0,  // data is not encoded, i.e. already decoded during reading
  DICTIONARY = 1    // data is exposed as dictionary indices (see example above)
};
0555 
// NOTE(review): Compression, Codec and CodecOptions are used unqualified here
// but are not declared in this header's parquet namespace; presumably they
// are brought into scope by one of the included parquet headers -- confirm.

/// \brief Return true if Parquet supports indicated compression type
PARQUET_EXPORT
bool IsCodecSupported(Compression::type codec);

/// \brief Codec factory for the given compression type; ownership of the
/// result passes to the caller through the unique_ptr.
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec);

/// \brief Overload that additionally takes codec options.
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec,
                                const CodecOptions& codec_options);

/// \brief Overload that additionally takes a compression level.
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
0569 
// Cipher identifiers; used as the `algorithm` field of EncryptionAlgorithm.
struct ParquetCipher {
  enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
};
0573 
// AAD settings bundled into EncryptionAlgorithm.
// NOTE(review): "AAD" presumably means additional authenticated data, and
// supply_aad_prefix presumably says whether the prefix must be provided by
// the reader rather than taken from aad_prefix -- confirm against the
// encryption code.
struct AadMetadata {
  std::string aad_prefix;
  std::string aad_file_unique;
  bool supply_aad_prefix;
};
0579 
// Pairs a cipher identifier with its AAD settings.
struct EncryptionAlgorithm {
  ParquetCipher::type algorithm;
  AadMetadata aad;
};
0584 
// Mirrors parquet::PageType. Values are written out explicitly so that the
// numbering does not depend on declaration order.
struct PageType {
  enum type {
    DATA_PAGE = 0,
    INDEX_PAGE = 1,
    DICTIONARY_PAGE = 2,
    DATA_PAGE_V2 = 3,
    // Should always be last element
    UNDEFINED = 4
  };
};
0596 
// Predicate on a page type; defined outside this header.
// NOTE(review): unlike the neighbouring free functions this declaration is
// not marked PARQUET_EXPORT -- confirm that is intentional.
bool PageCanUseChecksum(PageType::type pageType);
0598 
/// \brief Small value type wrapping a column-order tag.
///
/// A default-constructed ColumnOrder holds TYPE_DEFINED_ORDER.
class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };

  /// \brief Wrap the given order tag.
  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
  // Default to Type Defined Order
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}

  /// \brief Return the wrapped order tag.
  ///
  /// Declared const so it can be called on const objects and through const
  /// references; it only reads the stored tag.
  ColumnOrder::type get_order() const { return column_order_; }

  // Shared instances; only declared here, their definitions live outside
  // this header.
  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};
0613 
/// \brief BoundaryOrder is a proxy around format::BoundaryOrder.
///
/// All values are explicit, so the numbering does not depend on declaration
/// order.
struct BoundaryOrder {
  enum type {
    Unordered = 0,
    Ascending = 1,
    Descending = 2,
    // Should always be last element
    UNDEFINED = 3
  };
};
0624 
/// \brief SortingColumn is a proxy around format::SortingColumn.
///
/// Plain aggregate with three fields; equality operators are provided as
/// free functions right after the struct.
struct PARQUET_EXPORT SortingColumn {
  // The column index (in this row group)
  int32_t column_idx;

  // If true, indicates this column is sorted in descending order.
  bool descending;

  // If true, nulls will come before non-null values, otherwise, nulls go at the end.
  bool nulls_first;
};
0636 
0637 inline bool operator==(const SortingColumn& left, const SortingColumn& right) {
0638   return left.nulls_first == right.nulls_first && left.descending == right.descending &&
0639          left.column_idx == right.column_idx;
0640 }
0641 
0642 inline bool operator!=(const SortingColumn& left, const SortingColumn& right) {
0643   return !(left == right);
0644 }
0645 
0646 // ----------------------------------------------------------------------
0647 
0648 struct ByteArray {
0649   ByteArray() : len(0), ptr(NULLPTR) {}
0650   ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
0651 
0652   ByteArray(::std::string_view view)  // NOLINT implicit conversion
0653       : ByteArray(static_cast<uint32_t>(view.size()),
0654                   reinterpret_cast<const uint8_t*>(view.data())) {}
0655 
0656   explicit operator std::string_view() const {
0657     return std::string_view{reinterpret_cast<const char*>(ptr), len};
0658   }
0659 
0660   uint32_t len;
0661   const uint8_t* ptr;
0662 };
0663 
0664 inline bool operator==(const ByteArray& left, const ByteArray& right) {
0665   return left.len == right.len &&
0666          (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
0667 }
0668 
0669 inline bool operator!=(const ByteArray& left, const ByteArray& right) {
0670   return !(left == right);
0671 }
0672 
0673 struct FixedLenByteArray {
0674   FixedLenByteArray() : ptr(NULLPTR) {}
0675   explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
0676   const uint8_t* ptr;
0677 };
0678 
0679 using FLBA = FixedLenByteArray;
0680 
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = 2440588;

// Length of one day in progressively finer units; each constant is derived
// from the previous one by a factor of 1000.
constexpr int64_t kSecondsPerDay = int64_t{60} * 60 * 24;
constexpr int64_t kMillisecondsPerDay = int64_t{1000} * kSecondsPerDay;
constexpr int64_t kMicrosecondsPerDay = int64_t{1000} * kMillisecondsPerDay;
constexpr int64_t kNanosecondsPerDay = int64_t{1000} * kMicrosecondsPerDay;
0693 
// 96-bit value stored as three 32-bit words.
// NOTE(review): MANUALLY_ALIGNED_STRUCT and STRUCT_END are macros defined
// outside this header; presumably they request 1-byte alignment and check
// that sizeof(Int96) == 12 -- confirm against their definitions.
MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
STRUCT_END(Int96, 12);
0696 
0697 inline bool operator==(const Int96& left, const Int96& right) {
0698   return std::equal(left.value, left.value + 3, right.value);
0699 }
0700 
0701 inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
0702 
0703 static inline std::string ByteArrayToString(const ByteArray& a) {
0704   return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
0705 }
0706 
0707 static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
0708   std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
0709 }
0710 
// Result of DecodeInt96Timestamp. Both fields are unsigned;
// days_since_epoch is produced by an unsigned subtraction and so may hold a
// wrapped-around value.
struct DecodedInt96 {
  uint64_t days_since_epoch;
  uint64_t nanoseconds;
};
0715 
0716 static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
0717   // We do the computations in the unsigned domain to avoid unsigned behaviour
0718   // on overflow.
0719   DecodedInt96 result;
0720   result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
0721   result.nanoseconds = 0;
0722 
0723   memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
0724   return result;
0725 }
0726 
0727 static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
0728   const auto decoded = DecodeInt96Timestamp(i96);
0729   return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
0730                               decoded.nanoseconds);
0731 }
0732 
0733 static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
0734   const auto decoded = DecodeInt96Timestamp(i96);
0735   uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
0736   return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
0737                               microseconds);
0738 }
0739 
0740 static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
0741   const auto decoded = DecodeInt96Timestamp(i96);
0742   uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
0743   return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
0744                               milliseconds);
0745 }
0746 
0747 static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
0748   const auto decoded = DecodeInt96Timestamp(i96);
0749   uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
0750   return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
0751 }
0752 
0753 static inline std::string Int96ToString(const Int96& a) {
0754   std::ostringstream result;
0755   std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
0756   return result.str();
0757 }
0758 
0759 static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
0760   std::ostringstream result;
0761   std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
0762   return result.str();
0763 }
0764 
// Compile-time properties of each physical type. The primary template is
// intentionally empty; only the explicit specializations below provide
//   value_type      - the C++ type holding one value
//   value_byte_size - an integer byte size for that type
//   printf_code     - a printf conversion suffix (combined with a width by
//                     format_fwf)
template <Type::type TYPE>
struct type_traits {};

template <>
struct type_traits<Type::BOOLEAN> {
  using value_type = bool;

  static constexpr int value_byte_size = 1;
  static constexpr const char* printf_code = "d";
};

template <>
struct type_traits<Type::INT32> {
  using value_type = int32_t;

  static constexpr int value_byte_size = 4;
  static constexpr const char* printf_code = "d";
};
0783 
0784 template <>
0785 struct type_traits<Type::INT64> {
0786   using value_type = int64_t;
0787 
0788   static constexpr int value_byte_size = 8;
0789   static constexpr const char* printf_code =
0790       (sizeof(long) == 64) ? "ld" : "lld";  // NOLINT: runtime/int
0791 };
0792 
// Traits for Type::INT96: 12 bytes, i.e. the three 32-bit words of Int96.
template <>
struct type_traits<Type::INT96> {
  using value_type = Int96;

  static constexpr int value_byte_size = 12;
  static constexpr const char* printf_code = "s";
};

// Traits for Type::FLOAT.
template <>
struct type_traits<Type::FLOAT> {
  using value_type = float;

  static constexpr int value_byte_size = 4;
  static constexpr const char* printf_code = "f";
};

// Traits for Type::DOUBLE.
template <>
struct type_traits<Type::DOUBLE> {
  using value_type = double;

  static constexpr int value_byte_size = 8;
  static constexpr const char* printf_code = "lf";
};

// Traits for Type::BYTE_ARRAY. Note that value_byte_size is the size of the
// ByteArray struct itself, not of the bytes it points to.
template <>
struct type_traits<Type::BYTE_ARRAY> {
  using value_type = ByteArray;

  static constexpr int value_byte_size = sizeof(ByteArray);
  static constexpr const char* printf_code = "s";
};

// Traits for Type::FIXED_LEN_BYTE_ARRAY. Note that value_byte_size is the
// size of the FixedLenByteArray struct itself, not of the bytes it points to.
template <>
struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
  using value_type = FixedLenByteArray;

  static constexpr int value_byte_size = sizeof(FixedLenByteArray);
  static constexpr const char* printf_code = "s";
};
0832 
// Ties a Type::type tag to its C++ value type: c_type is looked up through
// type_traits, and type_num exposes the tag as a compile-time constant.
template <Type::type TYPE>
struct PhysicalType {
  using c_type = typename type_traits<TYPE>::value_type;
  static constexpr Type::type type_num = TYPE;
};
0838 
// One alias per physical type, each naming the matching PhysicalType
// instantiation.
using BooleanType = PhysicalType<Type::BOOLEAN>;
using Int32Type = PhysicalType<Type::INT32>;
using Int64Type = PhysicalType<Type::INT64>;
using Int96Type = PhysicalType<Type::INT96>;
using FloatType = PhysicalType<Type::FLOAT>;
using DoubleType = PhysicalType<Type::DOUBLE>;
using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
0847 
0848 template <typename Type>
0849 inline std::string format_fwf(int width) {
0850   std::stringstream ss;
0851   ss << "%-" << width << type_traits<Type::type_num>::printf_code;
0852   return ss.str();
0853 }
0854 
// The functions below are only declared in this header; their behaviour is
// defined by implementations that are not visible here.

/// \brief String form of an Encoding value.
PARQUET_EXPORT std::string EncodingToString(Encoding::type t);

/// \brief String form of a ConvertedType value.
PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);

/// \brief String form of a physical Type value.
PARQUET_EXPORT std::string TypeToString(Type::type t);

/// \brief Overload that additionally takes a type length.
PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length);

/// \brief Format a statistics value given as raw text/bytes in `val`; the
/// logical type is optional and defaults to a null pointer.
PARQUET_EXPORT std::string FormatStatValue(
    Type::type parquet_type, ::std::string_view val,
    const std::shared_ptr<const LogicalType>& logical_type = NULLPTR);

/// \brief Byte size associated with a physical type.
PARQUET_EXPORT int GetTypeByteSize(Type::type t);

/// \brief Sort order chosen from the physical type alone.
PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);

/// \brief Sort order chosen from a converted type and a physical type.
PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
                                            Type::type primitive);

/// \brief Sort order chosen from a logical type and a physical type.
PARQUET_EXPORT SortOrder::type GetSortOrder(
    const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
0876 
0877 // PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index
0878 // encoding.
0879 constexpr bool IsDictionaryIndexEncoding(Encoding::type e) {
0880   return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
0881 }
0882 
0883 }  // namespace parquet