include/parquet/encoding.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 #pragma once
0019
0020 #include <cstdint>
0021 #include <cstring>
0022 #include <memory>
0023 #include <vector>
0024
0025 #include "arrow/type_fwd.h"
0026
0027 #include "parquet/exception.h"
0028 #include "parquet/platform.h"
0029 #include "parquet/types.h"
0030
namespace arrow {

// Forward declaration only, so that the trait aliases in this header can name
// ::arrow::Dictionary32Builder without needing its definition.
template <typename T>
class Dictionary32Builder;

}  // namespace arrow
0035
0036 namespace parquet {
0037
0038 template <typename DType>
0039 class TypedEncoder;
0040
0041 using BooleanEncoder = TypedEncoder<BooleanType>;
0042 using Int32Encoder = TypedEncoder<Int32Type>;
0043 using Int64Encoder = TypedEncoder<Int64Type>;
0044 using Int96Encoder = TypedEncoder<Int96Type>;
0045 using FloatEncoder = TypedEncoder<FloatType>;
0046 using DoubleEncoder = TypedEncoder<DoubleType>;
0047 using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
0048 using FLBAEncoder = TypedEncoder<FLBAType>;
0049
0050 template <typename DType>
0051 class TypedDecoder;
0052
0053 class BooleanDecoder;
0054 using Int32Decoder = TypedDecoder<Int32Type>;
0055 using Int64Decoder = TypedDecoder<Int64Type>;
0056 using Int96Decoder = TypedDecoder<Int96Type>;
0057 using FloatDecoder = TypedDecoder<FloatType>;
0058 using DoubleDecoder = TypedDecoder<DoubleType>;
0059 using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
0060 class FLBADecoder;
0061
/// \brief Traits mapping a Parquet physical type to its encoder, decoder and
/// accumulator types
///
/// The primary template is only declared; the members come from the explicit
/// specializations that follow.
template <typename T>
struct EncodingTraits;
0064
0065 template <>
0066 struct EncodingTraits<BooleanType> {
0067   using Encoder = BooleanEncoder;
0068   using Decoder = BooleanDecoder;
0069
0070   using ArrowType = ::arrow::BooleanType;
0071   using Accumulator = ::arrow::BooleanBuilder;
0072   struct DictAccumulator {};
0073 };
0074
0075 template <>
0076 struct EncodingTraits<Int32Type> {
0077   using Encoder = Int32Encoder;
0078   using Decoder = Int32Decoder;
0079
0080   using ArrowType = ::arrow::Int32Type;
0081   using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
0082   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
0083 };
0084
0085 template <>
0086 struct EncodingTraits<Int64Type> {
0087   using Encoder = Int64Encoder;
0088   using Decoder = Int64Decoder;
0089
0090   using ArrowType = ::arrow::Int64Type;
0091   using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
0092   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
0093 };
0094
0095 template <>
0096 struct EncodingTraits<Int96Type> {
0097   using Encoder = Int96Encoder;
0098   using Decoder = Int96Decoder;
0099
0100   struct Accumulator {};
0101   struct DictAccumulator {};
0102 };
0103
0104 template <>
0105 struct EncodingTraits<FloatType> {
0106   using Encoder = FloatEncoder;
0107   using Decoder = FloatDecoder;
0108
0109   using ArrowType = ::arrow::FloatType;
0110   using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
0111   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
0112 };
0113
0114 template <>
0115 struct EncodingTraits<DoubleType> {
0116   using Encoder = DoubleEncoder;
0117   using Decoder = DoubleDecoder;
0118
0119   using ArrowType = ::arrow::DoubleType;
0120   using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
0121   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
0122 };
0123
0124 template <>
0125 struct EncodingTraits<ByteArrayType> {
0126   using Encoder = ByteArrayEncoder;
0127   using Decoder = ByteArrayDecoder;
0128
0129   /// \brief Internal helper class for decoding BYTE_ARRAY data
0130   ///
0131   /// This class allows the caller to choose the concrete Arrow data type
0132   /// by passing a corresponding `ArrayBuilder`.
0133   /// Supported `ArrayBuilder` classes are `BinaryBuilder`, `LargeBinaryBuilder`
0134   /// and `BinaryViewBuilder`.
0135   /// If the builder is a `BinaryBuilder`, `chunks` can accumulate several
0136   /// arrays as needed to work around the 32-bit offset limit.
0137   struct Accumulator {
0138     std::unique_ptr<::arrow::ArrayBuilder> builder;
0139     std::vector<std::shared_ptr<::arrow::Array>> chunks;
0140   };
0141   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
0142 };
0143
0144 template <>
0145 struct EncodingTraits<FLBAType> {
0146   using Encoder = FLBAEncoder;
0147   using Decoder = FLBADecoder;
0148
0149   using ArrowType = ::arrow::FixedSizeBinaryType;
0150   using Accumulator = ::arrow::FixedSizeBinaryBuilder;
0151   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
0152 };
0153
0154 class ColumnDescriptor;
0155
0156 // Untyped base for all encoders
0157 class Encoder {
0158  public:
0159   virtual ~Encoder() = default;
0160
0161   virtual int64_t EstimatedDataEncodedSize() = 0;
0162   virtual std::shared_ptr<Buffer> FlushValues() = 0;
0163   virtual Encoding::type encoding() const = 0;
0164
0165   virtual void Put(const ::arrow::Array& values) = 0;
0166
0167   // Report the number of bytes written to the encoder since the last report.
0168   // It only works for BYTE_ARRAY type and throw for other types.
0169   // This call is not idempotent since it resets the internal counter.
0170   virtual int64_t ReportUnencodedDataBytes() = 0;
0171
0172   virtual MemoryPool* memory_pool() const = 0;
0173 };
0174
0175 // Base class for value encoders. Since encoders may or not have state (e.g.,
0176 // dictionary encoding) we use a class instance to maintain any state.
0177 //
0178 // Encode interfaces are internal, subject to change without deprecation.
0179 template <typename DType>
0180 class TypedEncoder : virtual public Encoder {
0181  public:
0182   using T = typename DType::c_type;
0183
0184   using Encoder::Put;
0185
0186   virtual void Put(const T* src, int num_values) = 0;
0187
0188   virtual void Put(const std::vector<T>& src, int num_values = -1);
0189
0190   virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
0191                          int64_t valid_bits_offset) = 0;
0192 };
0193
0194 template <typename DType>
0195 void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
0196   if (num_values == -1) {
0197     num_values = static_cast<int>(src.size());
0198   }
0199   Put(src.data(), num_values);
0200 }
0201
0202 template <>
0203 inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
0204   // NOTE(wesm): This stub is here only to satisfy the compiler; it is
0205   // overridden later with the actual implementation
0206 }
0207
0208 // Base class for dictionary encoders
0209 template <typename DType>
0210 class DictEncoder : virtual public TypedEncoder<DType> {
0211  public:
0212   /// Writes out any buffered indices to buffer preceded by the bit width of this data.
0213   /// Returns the number of bytes written.
0214   /// If the supplied buffer is not big enough, returns -1.
0215   /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
0216   /// to size buffer.
0217   virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
0218
0219   virtual int dict_encoded_size() const = 0;
0220
0221   virtual int bit_width() const = 0;
0222
0223   /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
0224   /// dict_encoded_size() bytes.
0225   virtual void WriteDict(uint8_t* buffer) const = 0;
0226
0227   virtual int num_entries() const = 0;
0228
0229   /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
0230   /// assumed (without any boundschecking) that the indices reference
0231   /// preexisting dictionary values
0232   /// \param[in] indices the dictionary index values. Only Int32Array currently
0233   /// supported
0234   virtual void PutIndices(const ::arrow::Array& indices) = 0;
0235
0236   /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
0237   /// separately. Currently throws exception if the current dictionary memo is
0238   /// non-empty
0239   /// \param[in] values the dictionary values. Only valid for certain
0240   /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
0241   virtual void PutDictionary(const ::arrow::Array& values) = 0;
0242 };
0243
0244 // ----------------------------------------------------------------------
0245 // Value decoding
0246
0247 class Decoder {
0248  public:
0249   virtual ~Decoder() = default;
0250
0251   // Sets the data for a new page. This will be called multiple times on the same
0252   // decoder and should reset all internal state.
0253   //
0254   // `num_values` comes from the data page header, and may be greater than the number of
0255   // physical values in the data buffer if there are some omitted (null) values.
0256   // `len`, on the other hand, is the size in bytes of the data buffer and
0257   // directly relates to the number of physical values.
0258   virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
0259
0260   // Returns the number of values left (for the last call to SetData()). This is
0261   // the number of values left in this page.
0262   virtual int values_left() const = 0;
0263   virtual Encoding::type encoding() const = 0;
0264 };
0265
0266 template <typename DType>
0267 class TypedDecoder : virtual public Decoder {
0268  public:
0269   using T = typename DType::c_type;
0270
0271   /// \brief Decode values into a buffer
0272   ///
0273   /// Subclasses may override the more specialized Decode methods below.
0274   ///
0275   /// \param[in] buffer destination for decoded values
0276   /// \param[in] max_values maximum number of values to decode
0277   /// \return The number of values decoded. Should be identical to max_values except
0278   /// at the end of the current data page.
0279   virtual int Decode(T* buffer, int max_values) = 0;
0280
0281   /// \brief Decode the values in this data page but leave spaces for null entries.
0282   ///
0283   /// \param[in] buffer destination for decoded values
0284   /// \param[in] num_values size of the def_levels and buffer arrays including the number
0285   /// of null slots
0286   /// \param[in] null_count number of null slots
0287   /// \param[in] valid_bits bitmap data indicating position of valid slots
0288   /// \param[in] valid_bits_offset offset into valid_bits
0289   /// \return The number of values decoded, including nulls.
0290   virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
0291                            const uint8_t* valid_bits, int64_t valid_bits_offset) = 0;
0292
0293   /// \brief Decode into an ArrayBuilder or other accumulator
0294   ///
0295   /// This function assumes the definition levels were already decoded
0296   /// as a validity bitmap in the given `valid_bits`.  `null_count`
0297   /// is the number of 0s in `valid_bits`.
0298   /// As a space optimization, it is allowed for `valid_bits` to be null
0299   /// if `null_count` is zero.
0300   ///
0301   /// \return number of values decoded
0302   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
0303                           int64_t valid_bits_offset,
0304                           typename EncodingTraits<DType>::Accumulator* out) = 0;
0305
0306   /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
0307   ///
0308   /// \return number of values decoded
0309   int DecodeArrowNonNull(int num_values,
0310                          typename EncodingTraits<DType>::Accumulator* out) {
0311     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
0312   }
0313
0314   /// \brief Decode into a DictionaryBuilder
0315   ///
0316   /// This function assumes the definition levels were already decoded
0317   /// as a validity bitmap in the given `valid_bits`.  `null_count`
0318   /// is the number of 0s in `valid_bits`.
0319   /// As a space optimization, it is allowed for `valid_bits` to be null
0320   /// if `null_count` is zero.
0321   ///
0322   /// \return number of values decoded
0323   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
0324                           int64_t valid_bits_offset,
0325                           typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
0326
0327   /// \brief Decode into a DictionaryBuilder ignoring nulls
0328   ///
0329   /// \return number of values decoded
0330   int DecodeArrowNonNull(int num_values,
0331                          typename EncodingTraits<DType>::DictAccumulator* builder) {
0332     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
0333   }
0334 };
0335
0336 template <typename DType>
0337 class DictDecoder : virtual public TypedDecoder<DType> {
0338  public:
0339   using T = typename DType::c_type;
0340
0341   virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
0342
0343   /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
0344   /// but do not append any indices
0345   virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
0346
0347   /// \brief Decode only dictionary indices and append to dictionary
0348   /// builder. The builder must have had the dictionary from this decoder
0349   /// inserted already.
0350   ///
0351   /// \warning Remember to reset the builder each time the dict decoder is initialized
0352   /// with a new dictionary page
0353   virtual int DecodeIndicesSpaced(int num_values, int null_count,
0354                                   const uint8_t* valid_bits, int64_t valid_bits_offset,
0355                                   ::arrow::ArrayBuilder* builder) = 0;
0356
0357   /// \brief Decode only dictionary indices (no nulls)
0358   ///
0359   /// \warning Remember to reset the builder each time the dict decoder is initialized
0360   /// with a new dictionary page
0361   virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
0362
0363   /// \brief Decode only dictionary indices (no nulls). Same as above
0364   /// DecodeIndices but target is an array instead of a builder.
0365   ///
0366   /// \note API EXPERIMENTAL
0367   virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
0368
0369   /// \brief Get dictionary. The reader will call this API when it encounters a
0370   /// new dictionary.
0371   ///
0372   /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
0373   /// the decoder and is destroyed when the decoder is destroyed.
0374   /// @param[out] dictionary_length The dictionary length.
0375   ///
0376   /// \note API EXPERIMENTAL
0377   virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
0378 };
0379
0380 // ----------------------------------------------------------------------
0381 // TypedEncoder specializations, traits, and factory functions
0382
0383 class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
0384  public:
0385   using TypedDecoder<BooleanType>::Decode;
0386
0387   /// \brief Decode and bit-pack values into a buffer
0388   ///
0389   /// \param[in] buffer destination for decoded values
0390   /// This buffer will contain bit-packed values. If
0391   /// max_values is not a multiple of 8, the trailing bits
0392   /// of the last byte will be undefined.
0393   /// \param[in] max_values max values to decode.
0394   /// \return The number of values decoded. Should be identical to max_values except
0395   /// at the end of the current data page.
0396   virtual int Decode(uint8_t* buffer, int max_values) = 0;
0397 };
0398
0399 class FLBADecoder : virtual public TypedDecoder<FLBAType> {
0400  public:
0401   using TypedDecoder<FLBAType>::DecodeSpaced;
0402
0403   // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
0404   // there is value in adding specialized read methods for
0405   // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
0406   // then perhaps not
0407 };
0408
0409 PARQUET_EXPORT
0410 std::unique_ptr<Encoder> MakeEncoder(
0411     Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
0412     const ColumnDescriptor* descr = NULLPTR,
0413     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0414
0415 template <typename DType>
0416 std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
0417     Encoding::type encoding, bool use_dictionary = false,
0418     const ColumnDescriptor* descr = NULLPTR,
0419     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0420   using OutType = typename EncodingTraits<DType>::Encoder;
0421   std::unique_ptr<Encoder> base =
0422       MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
0423   return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
0424 }
0425
0426 PARQUET_EXPORT
0427 std::unique_ptr<Decoder> MakeDecoder(
0428     Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
0429     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0430
0431 namespace detail {
0432
0433 PARQUET_EXPORT
0434 std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
0435                                          const ColumnDescriptor* descr,
0436                                          ::arrow::MemoryPool* pool);
0437
0438 }  // namespace detail
0439
0440 template <typename DType>
0441 std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
0442     const ColumnDescriptor* descr = NULLPTR,
0443     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0444   using OutType = DictDecoder<DType>;
0445   auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
0446   return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
0447 }
0448
0449 template <typename DType>
0450 std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
0451     Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
0452     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0453   using OutType = typename EncodingTraits<DType>::Decoder;
0454   std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool);
0455   return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
0456 }
0457
0458 }  // namespace parquet