File indexing completed on 2026-04-17 08:28:54
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <cstdint>
0021 #include <cstring>
0022 #include <memory>
0023 #include <vector>
0024
0025 #include "arrow/type_fwd.h"
0026
0027 #include "parquet/exception.h"
0028 #include "parquet/platform.h"
0029 #include "parquet/types.h"
0030
// Forward declaration only: the EncodingTraits specializations in this header
// name ::arrow::Dictionary32Builder<...> as a type alias target, which does
// not require the complete class template.
namespace arrow {

template <typename T>
class Dictionary32Builder;

}  // namespace arrow
0035
0036 namespace parquet {
0037
0038 template <typename DType>
0039 class TypedEncoder;
0040
0041 using BooleanEncoder = TypedEncoder<BooleanType>;
0042 using Int32Encoder = TypedEncoder<Int32Type>;
0043 using Int64Encoder = TypedEncoder<Int64Type>;
0044 using Int96Encoder = TypedEncoder<Int96Type>;
0045 using FloatEncoder = TypedEncoder<FloatType>;
0046 using DoubleEncoder = TypedEncoder<DoubleType>;
0047 using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
0048 using FLBAEncoder = TypedEncoder<FLBAType>;
0049
0050 template <typename DType>
0051 class TypedDecoder;
0052
0053 class BooleanDecoder;
0054 using Int32Decoder = TypedDecoder<Int32Type>;
0055 using Int64Decoder = TypedDecoder<Int64Type>;
0056 using Int96Decoder = TypedDecoder<Int96Type>;
0057 using FloatDecoder = TypedDecoder<FloatType>;
0058 using DoubleDecoder = TypedDecoder<DoubleType>;
0059 using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
0060 class FLBADecoder;
0061
// Compile-time map from a DType tag to its Encoder / Decoder interface types
// and to the Accumulator / DictAccumulator types taken by
// TypedDecoder::DecodeArrow.  The primary template is declared but not
// defined here; only the explicit specializations below provide members.
template <typename T>
struct EncodingTraits;
0064
0065 template <>
0066 struct EncodingTraits<BooleanType> {
0067 using Encoder = BooleanEncoder;
0068 using Decoder = BooleanDecoder;
0069
0070 using ArrowType = ::arrow::BooleanType;
0071 using Accumulator = ::arrow::BooleanBuilder;
0072 struct DictAccumulator {};
0073 };
0074
0075 template <>
0076 struct EncodingTraits<Int32Type> {
0077 using Encoder = Int32Encoder;
0078 using Decoder = Int32Decoder;
0079
0080 using ArrowType = ::arrow::Int32Type;
0081 using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
0082 using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
0083 };
0084
0085 template <>
0086 struct EncodingTraits<Int64Type> {
0087 using Encoder = Int64Encoder;
0088 using Decoder = Int64Decoder;
0089
0090 using ArrowType = ::arrow::Int64Type;
0091 using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
0092 using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
0093 };
0094
0095 template <>
0096 struct EncodingTraits<Int96Type> {
0097 using Encoder = Int96Encoder;
0098 using Decoder = Int96Decoder;
0099
0100 struct Accumulator {};
0101 struct DictAccumulator {};
0102 };
0103
0104 template <>
0105 struct EncodingTraits<FloatType> {
0106 using Encoder = FloatEncoder;
0107 using Decoder = FloatDecoder;
0108
0109 using ArrowType = ::arrow::FloatType;
0110 using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
0111 using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
0112 };
0113
0114 template <>
0115 struct EncodingTraits<DoubleType> {
0116 using Encoder = DoubleEncoder;
0117 using Decoder = DoubleDecoder;
0118
0119 using ArrowType = ::arrow::DoubleType;
0120 using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
0121 using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
0122 };
0123
0124 template <>
0125 struct EncodingTraits<ByteArrayType> {
0126 using Encoder = ByteArrayEncoder;
0127 using Decoder = ByteArrayDecoder;
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137 struct Accumulator {
0138 std::unique_ptr<::arrow::ArrayBuilder> builder;
0139 std::vector<std::shared_ptr<::arrow::Array>> chunks;
0140 };
0141 using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
0142 };
0143
0144 template <>
0145 struct EncodingTraits<FLBAType> {
0146 using Encoder = FLBAEncoder;
0147 using Decoder = FLBADecoder;
0148
0149 using ArrowType = ::arrow::FixedSizeBinaryType;
0150 using Accumulator = ::arrow::FixedSizeBinaryBuilder;
0151 using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
0152 };
0153
0154 class ColumnDescriptor;
0155
0156
0157 class Encoder {
0158 public:
0159 virtual ~Encoder() = default;
0160
0161 virtual int64_t EstimatedDataEncodedSize() = 0;
0162 virtual std::shared_ptr<Buffer> FlushValues() = 0;
0163 virtual Encoding::type encoding() const = 0;
0164
0165 virtual void Put(const ::arrow::Array& values) = 0;
0166
0167
0168
0169
0170 virtual int64_t ReportUnencodedDataBytes() = 0;
0171
0172 virtual MemoryPool* memory_pool() const = 0;
0173 };
0174
0175
0176
0177
0178
0179 template <typename DType>
0180 class TypedEncoder : virtual public Encoder {
0181 public:
0182 using T = typename DType::c_type;
0183
0184 using Encoder::Put;
0185
0186 virtual void Put(const T* src, int num_values) = 0;
0187
0188 virtual void Put(const std::vector<T>& src, int num_values = -1);
0189
0190 virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
0191 int64_t valid_bits_offset) = 0;
0192 };
0193
0194 template <typename DType>
0195 void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
0196 if (num_values == -1) {
0197 num_values = static_cast<int>(src.size());
0198 }
0199 Put(src.data(), num_values);
0200 }
0201
0202 template <>
0203 inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
0204
0205
0206 }
0207
0208
0209 template <typename DType>
0210 class DictEncoder : virtual public TypedEncoder<DType> {
0211 public:
0212
0213
0214
0215
0216
0217 virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
0218
0219 virtual int dict_encoded_size() const = 0;
0220
0221 virtual int bit_width() const = 0;
0222
0223
0224
0225 virtual void WriteDict(uint8_t* buffer) const = 0;
0226
0227 virtual int num_entries() const = 0;
0228
0229
0230
0231
0232
0233
0234 virtual void PutIndices(const ::arrow::Array& indices) = 0;
0235
0236
0237
0238
0239
0240
0241 virtual void PutDictionary(const ::arrow::Array& values) = 0;
0242 };
0243
0244
0245
0246
0247 class Decoder {
0248 public:
0249 virtual ~Decoder() = default;
0250
0251
0252
0253
0254
0255
0256
0257
0258 virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
0259
0260
0261
0262 virtual int values_left() const = 0;
0263 virtual Encoding::type encoding() const = 0;
0264 };
0265
0266 template <typename DType>
0267 class TypedDecoder : virtual public Decoder {
0268 public:
0269 using T = typename DType::c_type;
0270
0271
0272
0273
0274
0275
0276
0277
0278
0279 virtual int Decode(T* buffer, int max_values) = 0;
0280
0281
0282
0283
0284
0285
0286
0287
0288
0289
0290 virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
0291 const uint8_t* valid_bits, int64_t valid_bits_offset) = 0;
0292
0293
0294
0295
0296
0297
0298
0299
0300
0301
0302 virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
0303 int64_t valid_bits_offset,
0304 typename EncodingTraits<DType>::Accumulator* out) = 0;
0305
0306
0307
0308
0309 int DecodeArrowNonNull(int num_values,
0310 typename EncodingTraits<DType>::Accumulator* out) {
0311 return DecodeArrow(num_values, 0, NULLPTR, 0, out);
0312 }
0313
0314
0315
0316
0317
0318
0319
0320
0321
0322
0323 virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
0324 int64_t valid_bits_offset,
0325 typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
0326
0327
0328
0329
0330 int DecodeArrowNonNull(int num_values,
0331 typename EncodingTraits<DType>::DictAccumulator* builder) {
0332 return DecodeArrow(num_values, 0, NULLPTR, 0, builder);
0333 }
0334 };
0335
0336 template <typename DType>
0337 class DictDecoder : virtual public TypedDecoder<DType> {
0338 public:
0339 using T = typename DType::c_type;
0340
0341 virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
0342
0343
0344
0345 virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
0346
0347
0348
0349
0350
0351
0352
0353 virtual int DecodeIndicesSpaced(int num_values, int null_count,
0354 const uint8_t* valid_bits, int64_t valid_bits_offset,
0355 ::arrow::ArrayBuilder* builder) = 0;
0356
0357
0358
0359
0360
0361 virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
0362
0363
0364
0365
0366
0367 virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
0368
0369
0370
0371
0372
0373
0374
0375
0376
0377 virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
0378 };
0379
0380
0381
0382
0383 class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
0384 public:
0385 using TypedDecoder<BooleanType>::Decode;
0386
0387
0388
0389
0390
0391
0392
0393
0394
0395
0396 virtual int Decode(uint8_t* buffer, int max_values) = 0;
0397 };
0398
0399 class FLBADecoder : virtual public TypedDecoder<FLBAType> {
0400 public:
0401 using TypedDecoder<FLBAType>::DecodeSpaced;
0402
0403
0404
0405
0406
0407 };
0408
0409 PARQUET_EXPORT
0410 std::unique_ptr<Encoder> MakeEncoder(
0411 Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
0412 const ColumnDescriptor* descr = NULLPTR,
0413 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0414
0415 template <typename DType>
0416 std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
0417 Encoding::type encoding, bool use_dictionary = false,
0418 const ColumnDescriptor* descr = NULLPTR,
0419 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0420 using OutType = typename EncodingTraits<DType>::Encoder;
0421 std::unique_ptr<Encoder> base =
0422 MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
0423 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
0424 }
0425
0426 PARQUET_EXPORT
0427 std::unique_ptr<Decoder> MakeDecoder(
0428 Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
0429 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0430
0431 namespace detail {
0432
0433 PARQUET_EXPORT
0434 std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
0435 const ColumnDescriptor* descr,
0436 ::arrow::MemoryPool* pool);
0437
0438 }
0439
0440 template <typename DType>
0441 std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
0442 const ColumnDescriptor* descr = NULLPTR,
0443 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0444 using OutType = DictDecoder<DType>;
0445 auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
0446 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
0447 }
0448
0449 template <typename DType>
0450 std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
0451 Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
0452 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0453 using OutType = typename EncodingTraits<DType>::Decoder;
0454 std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool);
0455 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
0456 }
0457
0458 }