File indexing completed on 2026-04-17 08:28:55
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <utility>

#include "parquet/platform.h"
#include "parquet/types.h"
0029
0030 namespace arrow {
0031
0032 class Array;
0033 class BinaryArray;
0034
0035 }
0036
0037 namespace parquet {
0038
0039 class ColumnDescriptor;
0040
0041
0042
0043
0044
0045
0046 class PARQUET_EXPORT Comparator {
0047 public:
0048 virtual ~Comparator() {}
0049
0050
0051
0052
0053
0054
0055
0056
0057 static std::shared_ptr<Comparator> Make(Type::type physical_type,
0058 SortOrder::type sort_order,
0059 int type_length = -1);
0060
0061
0062
0063
0064 static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
0065 };
0066
0067
0068
0069 template <typename DType>
0070 class TypedComparator : public Comparator {
0071 public:
0072 using T = typename DType::c_type;
0073
0074
0075
0076 virtual bool Compare(const T& a, const T& b) const = 0;
0077
0078
0079
0080 virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) const = 0;
0081
0082
0083
0084
0085 virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) const = 0;
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097 virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
0098 const uint8_t* valid_bits,
0099 int64_t valid_bits_offset) const = 0;
0100 };
0101
0102
0103 template <typename DType>
0104 std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
0105 SortOrder::type sort_order,
0106 int type_length = -1) {
0107 return std::static_pointer_cast<TypedComparator<DType>>(
0108 Comparator::Make(physical_type, sort_order, type_length));
0109 }
0110
0111
0112 template <typename DType>
0113 std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
0114 return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
0115 }
0116
0117
0118
0119
0120
/// \brief Plain holder for column statistics in encoded (string) form.
///
/// Min and max are kept as encoded strings behind accessors. Every statistic
/// has a companion `has_*` flag recording whether it was set; the setters
/// store the value and raise the flag in one step.
class PARQUET_EXPORT EncodedStatistics {
  std::string max_, min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics() = default;

  /// Encoded maximum; an empty string unless set_max() was called.
  const std::string& max() const { return max_; }
  /// Encoded minimum; an empty string unless set_min() was called.
  const std::string& min() const { return min_; }

  /// Whether the stored max / min is the exact extreme value. Unset by
  /// default, and reset to std::nullopt whenever the corresponding min/max is
  /// dropped by ApplyStatSizeLimits() or ClearMinMax().
  std::optional<bool> is_max_value_exact;
  std::optional<bool> is_min_value_exact;

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  /// Not read or written by any method of this class.
  /// NOTE(review): presumably set by callers when every value covered by the
  /// statistics is null -- confirm against the writer code.
  bool all_null_value = false;

  /// \brief Drop the min and/or max statistic when its encoded form is longer
  /// than \p length bytes.
  ///
  /// A dropped statistic has its string cleared, its `has_*` flag lowered and
  /// its exactness flag reset. Min and max are checked independently.
  void ApplyStatSizeLimits(size_t length) {
    if (max_.length() > length) {
      has_max = false;
      max_.clear();
      is_max_value_exact = std::nullopt;
    }
    if (min_.length() > length) {
      has_min = false;
      min_.clear();
      is_min_value_exact = std::nullopt;
    }
  }

  /// \brief Unconditionally drop both the min and the max statistic.
  ///
  /// The exactness flags are reset as well, consistent with
  /// ApplyStatSizeLimits(): an exactness flag describing a min/max that no
  /// longer exists would be stale.
  void ClearMinMax() {
    has_max = false;
    max_.clear();
    is_max_value_exact = std::nullopt;
    has_min = false;
    min_.clear();
    is_min_value_exact = std::nullopt;
  }

  /// True when at least one statistic (min, max, null count or distinct
  /// count) has been set.
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  /// Store the encoded maximum and mark it as present. Returns *this so that
  /// setters can be chained.
  EncodedStatistics& set_max(std::string value) {
    max_ = std::move(value);
    has_max = true;
    return *this;
  }

  /// Store the encoded minimum and mark it as present. Returns *this so that
  /// setters can be chained.
  EncodedStatistics& set_min(std::string value) {
    min_ = std::move(value);
    has_min = true;
    return *this;
  }

  /// Store the null count and mark it as present. Returns *this.
  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  /// Store the distinct count and mark it as present. Returns *this.
  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
0206
0207
0208 class PARQUET_EXPORT Statistics {
0209 public:
0210 virtual ~Statistics() {}
0211
0212
0213
0214
0215
0216 static std::shared_ptr<Statistics> Make(
0217 const ColumnDescriptor* descr,
0218 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0219
0220
0221
0222
0223
0224
0225
0226
0227
0228
0229
0230
0231
0232 static std::shared_ptr<Statistics> Make(
0233 const ColumnDescriptor* descr, const std::string& encoded_min,
0234 const std::string& encoded_max, int64_t num_values, int64_t null_count,
0235 int64_t distinct_count, bool has_min_max, bool has_null_count,
0236 bool has_distinct_count,
0237 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0238
0239
0240
0241
0242
0243
0244
0245
0246
0247
0248
0249
0250
0251
0252
0253 static std::shared_ptr<Statistics> Make(
0254 const ColumnDescriptor* descr, const std::string& encoded_min,
0255 const std::string& encoded_max, int64_t num_values, int64_t null_count,
0256 int64_t distinct_count, bool has_min_max, bool has_null_count,
0257 bool has_distinct_count, std::optional<bool> is_min_value_exact,
0258 std::optional<bool> is_max_value_exact,
0259 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0260
0261
0262
0263
0264 static std::shared_ptr<Statistics> Make(
0265 const ColumnDescriptor* descr, const EncodedStatistics* encoded_statistics,
0266 int64_t num_values = -1,
0267 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0268
0269
0270 virtual bool HasNullCount() const = 0;
0271
0272
0273 virtual int64_t null_count() const = 0;
0274
0275
0276 virtual bool HasDistinctCount() const = 0;
0277
0278
0279 virtual int64_t distinct_count() const = 0;
0280
0281
0282 virtual int64_t num_values() const = 0;
0283
0284
0285
0286 virtual bool HasMinMax() const = 0;
0287
0288
0289 virtual void Reset() = 0;
0290
0291
0292 virtual std::string EncodeMin() const = 0;
0293
0294
0295 virtual std::string EncodeMax() const = 0;
0296
0297
0298
0299 virtual std::optional<bool> is_min_value_exact() const = 0;
0300
0301
0302
0303 virtual std::optional<bool> is_max_value_exact() const = 0;
0304
0305
0306 virtual EncodedStatistics Encode() = 0;
0307
0308
0309 virtual Type::type physical_type() const = 0;
0310
0311
0312 virtual const ColumnDescriptor* descr() const = 0;
0313
0314
0315 virtual bool Equals(const Statistics& other) const = 0;
0316
0317 protected:
0318 static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
0319 const void* max, int64_t num_values,
0320 int64_t null_count, int64_t distinct_count);
0321 };
0322
0323
0324 template <typename DType>
0325 class TypedStatistics : public Statistics {
0326 public:
0327 using T = typename DType::c_type;
0328
0329
0330 virtual const T& min() const = 0;
0331
0332
0333 virtual const T& max() const = 0;
0334
0335
0336 virtual void Merge(const TypedStatistics<DType>& other) = 0;
0337
0338
0339 virtual void Update(const T* values, int64_t num_values, int64_t null_count) = 0;
0340
0341
0342
0343
0344
0345
0346
0347
0348
0349
0350
0351
0352 virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
0353 int64_t valid_bits_offset, int64_t num_spaced_values,
0354 int64_t num_values, int64_t null_count) = 0;
0355
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365 virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;
0366
0367
0368 virtual void SetMinMax(const T& min, const T& max) = 0;
0369
0370
0371
0372
0373
0374 virtual void IncrementNullCount(int64_t n) = 0;
0375
0376
0377
0378 virtual void IncrementNumValues(int64_t n) = 0;
0379 };
0380
// Convenience aliases: TypedStatistics instantiated for each data type tag.
// The tag types (BooleanType, Int32Type, ...) are not declared in this
// section of the file.
using BoolStatistics = TypedStatistics<BooleanType>;
using Int32Statistics = TypedStatistics<Int32Type>;
using Int64Statistics = TypedStatistics<Int64Type>;
using FloatStatistics = TypedStatistics<FloatType>;
using DoubleStatistics = TypedStatistics<DoubleType>;
using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
using FLBAStatistics = TypedStatistics<FLBAType>;
0388
0389
0390 template <typename DType>
0391 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
0392 const ColumnDescriptor* descr,
0393 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0394 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
0395 }
0396
0397
0398
0399
0400
0401
0402
0403 template <typename DType>
0404 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
0405 const typename DType::c_type& max,
0406 int64_t num_values,
0407 int64_t null_count,
0408 int64_t distinct_count) {
0409 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
0410 DType::type_num, &min, &max, num_values, null_count, distinct_count));
0411 }
0412
0413
0414 template <typename DType>
0415 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
0416 const ColumnDescriptor* descr, const std::string& encoded_min,
0417 const std::string& encoded_max, int64_t num_values, int64_t null_count,
0418 int64_t distinct_count, bool has_min_max, bool has_null_count,
0419 bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0420 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
0421 descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
0422 has_min_max, has_null_count, has_distinct_count,
0423 std::nullopt, std::nullopt, pool));
0424 }
0425
0426
0427 template <typename DType>
0428 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
0429 const ColumnDescriptor* descr, const std::string& encoded_min,
0430 const std::string& encoded_max, int64_t num_values, int64_t null_count,
0431 int64_t distinct_count, bool has_min_max, bool has_null_count,
0432 bool has_distinct_count, std::optional<bool> is_min_value_exact,
0433 std::optional<bool> is_max_value_exact,
0434 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0435 return std::static_pointer_cast<TypedStatistics<DType>>(
0436 Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
0437 distinct_count, has_min_max, has_null_count, has_distinct_count,
0438 is_min_value_exact, is_max_value_exact, pool));
0439 }
0440
0441 }