File indexing completed on 2026-04-17 08:28:55
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022 #pragma once
0023
0024 #include <algorithm>
0025 #include <limits>
0026 #include <memory>
0027 #include <random>
0028 #include <string>
0029 #include <vector>
0030
0031 #include <gtest/gtest.h>
0032
0033 #include "arrow/extension_type.h"
0034 #include "arrow/io/memory.h"
0035 #include "arrow/testing/util.h"
0036 #include "arrow/util/float16.h"
0037
0038 #include "parquet/column_page.h"
0039 #include "parquet/column_reader.h"
0040 #include "parquet/column_writer.h"
0041 #include "parquet/encoding.h"
0042 #include "parquet/platform.h"
0043
0044
0045
// Expects that invoking `callable` throws an exception of type `ex_type`.
// The exception is caught first so it can be checked against the matcher
// `property` with EXPECT_THAT, then rethrown so that the enclosing
// EXPECT_THROW still observes it.
#define EXPECT_THROW_THAT(callable, ex_type, property)   \
  EXPECT_THROW(                                          \
      try { (callable)(); } catch (const ex_type& err) { \
        EXPECT_THAT(err, (property));                    \
        throw;                                           \
      },                                                 \
      ex_type)
0053
0054 namespace parquet {
0055
0056 static constexpr int FLBA_LENGTH = 12;
0057
0058 inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
0059 return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
0060 }
0061
0062 namespace test {
0063
0064 typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
0065 DoubleType, ByteArrayType, FLBAType>
0066 ParquetTypes;
0067
0068 class ParquetTestException : public parquet::ParquetException {
0069 using ParquetException::ParquetException;
0070 };
0071
// Test-data location helpers; all three are defined outside this header.
const char* get_data_dir();
std::string get_bad_data_dir();

// Resolves `filename` to a data file location.
// NOTE(review): `is_good` presumably selects between the regular and the
// "bad" data directory -- confirm against the definition.
std::string get_data_file(const std::string& filename, bool is_good = true);
0076
/// Fatally asserts (gtest ASSERT_EQ) that `left` and `right` have the same
/// size and that every pair of corresponding elements compares equal. The
/// index of the element being compared is streamed into the failure message.
template <typename T>
static inline void assert_vector_equal(const std::vector<T>& left,
                                       const std::vector<T>& right) {
  ASSERT_EQ(left.size(), right.size());

  size_t idx = 0;
  while (idx < left.size()) {
    ASSERT_EQ(left[idx], right[idx]) << idx;
    ++idx;
  }
}
0086
/// Returns true when `left` and `right` have the same size and no pair of
/// corresponding elements compares unequal (via operator!=).
///
/// On the first differing element, its index and both values are written to
/// std::cerr before returning false. A size mismatch returns false silently.
template <typename T>
static inline bool vector_equal(const std::vector<T>& left, const std::vector<T>& right) {
  const size_t count = left.size();
  if (count != right.size()) {
    return false;
  }

  // Advance to the first position where the two vectors disagree.
  size_t idx = 0;
  while (idx < count && !(left[idx] != right[idx])) {
    ++idx;
  }
  if (idx == count) {
    return true;
  }

  std::cerr << "index " << idx << " left was " << left[idx] << " right was " << right[idx]
            << std::endl;
  return false;
}
0103
/// Returns a copy of the elements of `values` in the half-open index range
/// [start, end). An empty vector is returned when `end < start`.
///
/// No bounds checking is performed: the caller must ensure that both indices
/// lie within `values`.
template <typename T>
static std::vector<T> slice(const std::vector<T>& values, int start, int end) {
  if (end < start) {
    return {};
  }
  return std::vector<T>(values.begin() + start, values.begin() + end);
}
0116
// Random generators defined outside this header; results are delivered
// through `out`.
// NOTE(review): `n` is presumably the element count and `p` the probability
// of producing `true` -- confirm against the definitions.
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out);
void random_bools(int n, double p, uint32_t seed, bool* out);
0119
/// Fills out[0..n) with integers drawn uniformly from [min_value, max_value],
/// using a std::default_random_engine seeded with `seed`. The same seed
/// therefore yields the same sequence on a given standard library.
template <typename T>
inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) {
  std::default_random_engine engine(seed);
  std::uniform_int_distribution<T> dist(min_value, max_value);
  std::generate_n(out, n, [&engine, &dist]() { return dist(engine); });
}

/// float specialization: draws from a uniform real distribution instead.
template <>
inline void random_numbers(int n, uint32_t seed, float min_value, float max_value,
                           float* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<float> dist(min_value, max_value);
  std::generate_n(out, n, [&engine, &dist]() { return dist(engine); });
}

/// double specialization: draws from a uniform real distribution instead.
template <>
inline void random_numbers(int n, uint32_t seed, double min_value, double max_value,
                           double* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<double> dist(min_value, max_value);
  std::generate_n(out, n, [&engine, &dist]() { return dist(engine); });
}
0148
// Random value generators for the non-arithmetic physical types. All of them
// are only declared here; the definitions live outside this header.
// NOTE(review): `buf` is presumably the backing storage that the generated
// ByteArray / FLBA entries written to `out` point into, so it must outlive
// them -- confirm against the definitions.
void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
                          Int96* out);

void random_float16_numbers(int n, uint32_t seed, ::arrow::util::Float16 min_value,
                            ::arrow::util::Float16 max_value, uint16_t* out);

void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out);

void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
                       int max_size);

void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size);

// NOTE(review): `prefixed_probability` presumably controls how often a
// generated value shares a prefix with another one -- confirm against the
// definitions.
void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out,
                                int min_size, int max_size, double prefixed_probability);

void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out,
                                double prefixed_probability);
0167
0168 template <typename Type, typename Sequence>
0169 std::shared_ptr<Buffer> EncodeValues(Encoding::type encoding, bool use_dictionary,
0170 const Sequence& values, int length,
0171 const ColumnDescriptor* descr) {
0172 auto encoder = MakeTypedEncoder<Type>(encoding, use_dictionary, descr);
0173 encoder->Put(values, length);
0174 return encoder->FlushValues();
0175 }
0176
/// Fills the first `num_values` slots of `values` with random numbers that
/// span [numeric_limits<T>::min(), numeric_limits<T>::max()], generated from
/// `seed`. `values` must already hold at least `num_values` elements.
/// `buffer` is not used by this generic version.
template <typename T>
static void InitValues(int num_values, uint32_t seed, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  const T lower_bound = std::numeric_limits<T>::min();
  const T upper_bound = std::numeric_limits<T>::max();
  random_numbers(num_values, seed, lower_bound, upper_bound, values.data());
}

/// Convenience overload that forwards to the seeded version with seed 0.
template <typename T>
static void InitValues(int num_values, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  const uint32_t default_seed = 0;
  InitValues(num_values, default_seed, values, buffer);
}
0189
0190 template <typename T>
0191 static void InitDictValues(int num_values, int num_dicts, std::vector<T>& values,
0192 std::vector<uint8_t>& buffer) {
0193 int repeat_factor = num_values / num_dicts;
0194 InitValues<T>(num_dicts, values, buffer);
0195
0196 for (int j = 1; j < repeat_factor; ++j) {
0197 for (int i = 0; i < num_dicts; ++i) {
0198 std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T));
0199 }
0200 }
0201
0202
0203 for (int i = num_dicts * repeat_factor; i < num_values; ++i) {
0204 std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T));
0205 }
0206 }
0207
0208 template <>
0209 inline void InitDictValues<bool>(int num_values, int num_dicts, std::vector<bool>& values,
0210 std::vector<uint8_t>& buffer) {
0211
0212 }
0213
0214 class MockPageReader : public PageReader {
0215 public:
0216 explicit MockPageReader(const std::vector<std::shared_ptr<Page>>& pages)
0217 : pages_(pages), page_index_(0) {}
0218
0219 std::shared_ptr<Page> NextPage() override {
0220 if (page_index_ == static_cast<int>(pages_.size())) {
0221
0222 return std::shared_ptr<Page>(nullptr);
0223 }
0224 return pages_[page_index_++];
0225 }
0226
0227
0228 void set_max_page_header_size(uint32_t size) override {}
0229
0230 private:
0231 std::vector<std::shared_ptr<Page>> pages_;
0232 int page_index_;
0233 };
0234
0235
0236
0237 template <typename Type>
0238 class DataPageBuilder {
0239 public:
0240 using c_type = typename Type::c_type;
0241
0242
0243 explicit DataPageBuilder(ArrowOutputStream* sink)
0244 : sink_(sink),
0245 num_values_(0),
0246 encoding_(Encoding::PLAIN),
0247 definition_level_encoding_(Encoding::RLE),
0248 repetition_level_encoding_(Encoding::RLE),
0249 have_def_levels_(false),
0250 have_rep_levels_(false),
0251 have_values_(false) {}
0252
0253 void AppendDefLevels(const std::vector<int16_t>& levels, int16_t max_level,
0254 Encoding::type encoding = Encoding::RLE) {
0255 AppendLevels(levels, max_level, encoding);
0256
0257 num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
0258 definition_level_encoding_ = encoding;
0259 have_def_levels_ = true;
0260 }
0261
0262 void AppendRepLevels(const std::vector<int16_t>& levels, int16_t max_level,
0263 Encoding::type encoding = Encoding::RLE) {
0264 AppendLevels(levels, max_level, encoding);
0265
0266 num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
0267 repetition_level_encoding_ = encoding;
0268 have_rep_levels_ = true;
0269 }
0270
0271 void AppendValues(const ColumnDescriptor* d, const std::vector<c_type>& values,
0272 Encoding::type encoding = Encoding::PLAIN) {
0273 std::shared_ptr<Buffer> values_sink = EncodeValues<Type>(
0274 encoding, false, values.data(), static_cast<int>(values.size()), d);
0275 PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size()));
0276
0277 num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
0278 encoding_ = encoding;
0279 have_values_ = true;
0280 }
0281
0282 int32_t num_values() const { return num_values_; }
0283
0284 Encoding::type encoding() const { return encoding_; }
0285
0286 Encoding::type rep_level_encoding() const { return repetition_level_encoding_; }
0287
0288 Encoding::type def_level_encoding() const { return definition_level_encoding_; }
0289
0290 private:
0291 ArrowOutputStream* sink_;
0292
0293 int32_t num_values_;
0294 Encoding::type encoding_;
0295 Encoding::type definition_level_encoding_;
0296 Encoding::type repetition_level_encoding_;
0297
0298 bool have_def_levels_;
0299 bool have_rep_levels_;
0300 bool have_values_;
0301
0302
0303 void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
0304 Encoding::type encoding) {
0305 if (encoding != Encoding::RLE) {
0306 ParquetException::NYI("only rle encoding currently implemented");
0307 }
0308
0309 std::vector<uint8_t> encode_buffer(LevelEncoder::MaxBufferSize(
0310 Encoding::RLE, max_level, static_cast<int>(levels.size())));
0311
0312
0313
0314
0315 LevelEncoder encoder;
0316 encoder.Init(encoding, max_level, static_cast<int>(levels.size()),
0317 encode_buffer.data(), static_cast<int>(encode_buffer.size()));
0318
0319 encoder.Encode(static_cast<int>(levels.size()), levels.data());
0320
0321 int32_t rle_bytes = encoder.len();
0322 PARQUET_THROW_NOT_OK(
0323 sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes), sizeof(int32_t)));
0324 PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
0325 }
0326 };
0327
0328 template <>
0329 inline void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor* d,
0330 const std::vector<bool>& values,
0331 Encoding::type encoding) {
0332 if (encoding != Encoding::PLAIN) {
0333 ParquetException::NYI("only plain encoding currently implemented");
0334 }
0335
0336 auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN, false, d);
0337 dynamic_cast<BooleanEncoder*>(encoder.get())
0338 ->Put(values, static_cast<int>(values.size()));
0339 std::shared_ptr<Buffer> buffer = encoder->FlushValues();
0340 PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size()));
0341
0342 num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
0343 encoding_ = encoding;
0344 have_values_ = true;
0345 }
0346
0347 template <typename Type>
0348 static std::shared_ptr<DataPageV1> MakeDataPage(
0349 const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
0350 int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size,
0351 const std::vector<int16_t>& def_levels, int16_t max_def_level,
0352 const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
0353 int num_values = 0;
0354
0355 auto page_stream = CreateOutputStream();
0356 test::DataPageBuilder<Type> page_builder(page_stream.get());
0357
0358 if (!rep_levels.empty()) {
0359 page_builder.AppendRepLevels(rep_levels, max_rep_level);
0360 }
0361 if (!def_levels.empty()) {
0362 page_builder.AppendDefLevels(def_levels, max_def_level);
0363 }
0364
0365 if (encoding == Encoding::PLAIN) {
0366 page_builder.AppendValues(d, values, encoding);
0367 num_values = std::max(page_builder.num_values(), num_vals);
0368 } else {
0369 PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size));
0370 num_values = std::max(page_builder.num_values(), num_vals);
0371 }
0372
0373 PARQUET_ASSIGN_OR_THROW(auto buffer, page_stream->Finish());
0374
0375 return std::make_shared<DataPageV1>(buffer, num_values, encoding,
0376 page_builder.def_level_encoding(),
0377 page_builder.rep_level_encoding(), buffer->size());
0378 }
0379
0380 template <typename TYPE>
0381 class DictionaryPageBuilder {
0382 public:
0383 typedef typename TYPE::c_type TC;
0384 static constexpr int TN = TYPE::type_num;
0385 using SpecializedEncoder = typename EncodingTraits<TYPE>::Encoder;
0386
0387
0388 explicit DictionaryPageBuilder(const ColumnDescriptor* d)
0389 : num_dict_values_(0), have_values_(false) {
0390 auto encoder = MakeTypedEncoder<TYPE>(Encoding::PLAIN, true, d);
0391 dict_traits_ = dynamic_cast<DictEncoder<TYPE>*>(encoder.get());
0392 encoder_.reset(dynamic_cast<SpecializedEncoder*>(encoder.release()));
0393 }
0394
0395 ~DictionaryPageBuilder() {}
0396
0397 std::shared_ptr<Buffer> AppendValues(const std::vector<TC>& values) {
0398 int num_values = static_cast<int>(values.size());
0399
0400 encoder_->Put(values.data(), num_values);
0401 num_dict_values_ = dict_traits_->num_entries();
0402 have_values_ = true;
0403 return encoder_->FlushValues();
0404 }
0405
0406 std::shared_ptr<Buffer> WriteDict() {
0407 std::shared_ptr<Buffer> dict_buffer =
0408 AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size());
0409 dict_traits_->WriteDict(dict_buffer->mutable_data());
0410 return dict_buffer;
0411 }
0412
0413 int32_t num_values() const { return num_dict_values_; }
0414
0415 private:
0416 DictEncoder<TYPE>* dict_traits_;
0417 std::unique_ptr<SpecializedEncoder> encoder_;
0418 int32_t num_dict_values_;
0419 bool have_values_;
0420 };
0421
0422 template <>
0423 inline DictionaryPageBuilder<BooleanType>::DictionaryPageBuilder(
0424 const ColumnDescriptor* d) {
0425 ParquetException::NYI("only plain encoding currently implemented for boolean");
0426 }
0427
0428 template <>
0429 inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::WriteDict() {
0430 ParquetException::NYI("only plain encoding currently implemented for boolean");
0431 return nullptr;
0432 }
0433
0434 template <>
0435 inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::AppendValues(
0436 const std::vector<TC>& values) {
0437 ParquetException::NYI("only plain encoding currently implemented for boolean");
0438 return nullptr;
0439 }
0440
0441 template <typename Type>
0442 inline static std::shared_ptr<DictionaryPage> MakeDictPage(
0443 const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
0444 const std::vector<int>& values_per_page, Encoding::type encoding,
0445 std::vector<std::shared_ptr<Buffer>>& rle_indices) {
0446 test::DictionaryPageBuilder<Type> page_builder(d);
0447 int num_pages = static_cast<int>(values_per_page.size());
0448 int value_start = 0;
0449
0450 for (int i = 0; i < num_pages; i++) {
0451 rle_indices.push_back(page_builder.AppendValues(
0452 slice(values, value_start, value_start + values_per_page[i])));
0453 value_start += values_per_page[i];
0454 }
0455
0456 auto buffer = page_builder.WriteDict();
0457
0458 return std::make_shared<DictionaryPage>(buffer, page_builder.num_values(),
0459 Encoding::PLAIN);
0460 }
0461
0462
0463 template <typename Type>
0464 inline static void PaginateDict(const ColumnDescriptor* d,
0465 const std::vector<typename Type::c_type>& values,
0466 const std::vector<int16_t>& def_levels,
0467 int16_t max_def_level,
0468 const std::vector<int16_t>& rep_levels,
0469 int16_t max_rep_level, int num_levels_per_page,
0470 const std::vector<int>& values_per_page,
0471 std::vector<std::shared_ptr<Page>>& pages,
0472 Encoding::type encoding = Encoding::RLE_DICTIONARY) {
0473 int num_pages = static_cast<int>(values_per_page.size());
0474 std::vector<std::shared_ptr<Buffer>> rle_indices;
0475 std::shared_ptr<DictionaryPage> dict_page =
0476 MakeDictPage<Type>(d, values, values_per_page, encoding, rle_indices);
0477 pages.push_back(dict_page);
0478 int def_level_start = 0;
0479 int def_level_end = 0;
0480 int rep_level_start = 0;
0481 int rep_level_end = 0;
0482 for (int i = 0; i < num_pages; i++) {
0483 if (max_def_level > 0) {
0484 def_level_start = i * num_levels_per_page;
0485 def_level_end = (i + 1) * num_levels_per_page;
0486 }
0487 if (max_rep_level > 0) {
0488 rep_level_start = i * num_levels_per_page;
0489 rep_level_end = (i + 1) * num_levels_per_page;
0490 }
0491 std::shared_ptr<DataPageV1> data_page = MakeDataPage<Int32Type>(
0492 d, {}, values_per_page[i], encoding, rle_indices[i]->data(),
0493 static_cast<int>(rle_indices[i]->size()),
0494 slice(def_levels, def_level_start, def_level_end), max_def_level,
0495 slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
0496 pages.push_back(data_page);
0497 }
0498 }
0499
0500
0501 template <typename Type>
0502 static inline void PaginatePlain(const ColumnDescriptor* d,
0503 const std::vector<typename Type::c_type>& values,
0504 const std::vector<int16_t>& def_levels,
0505 int16_t max_def_level,
0506 const std::vector<int16_t>& rep_levels,
0507 int16_t max_rep_level, int num_levels_per_page,
0508 const std::vector<int>& values_per_page,
0509 std::vector<std::shared_ptr<Page>>& pages,
0510 Encoding::type encoding = Encoding::PLAIN) {
0511 int num_pages = static_cast<int>(values_per_page.size());
0512 int def_level_start = 0;
0513 int def_level_end = 0;
0514 int rep_level_start = 0;
0515 int rep_level_end = 0;
0516 int value_start = 0;
0517 for (int i = 0; i < num_pages; i++) {
0518 if (max_def_level > 0) {
0519 def_level_start = i * num_levels_per_page;
0520 def_level_end = (i + 1) * num_levels_per_page;
0521 }
0522 if (max_rep_level > 0) {
0523 rep_level_start = i * num_levels_per_page;
0524 rep_level_end = (i + 1) * num_levels_per_page;
0525 }
0526 std::shared_ptr<DataPage> page = MakeDataPage<Type>(
0527 d, slice(values, value_start, value_start + values_per_page[i]),
0528 values_per_page[i], encoding, nullptr, 0,
0529 slice(def_levels, def_level_start, def_level_end), max_def_level,
0530 slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
0531 pages.push_back(page);
0532 value_start += values_per_page[i];
0533 }
0534 }
0535
0536
0537 template <typename Type>
0538 static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page,
0539 std::vector<int16_t>& def_levels,
0540 std::vector<int16_t>& rep_levels,
0541 std::vector<typename Type::c_type>& values,
0542 std::vector<uint8_t>& buffer,
0543 std::vector<std::shared_ptr<Page>>& pages,
0544 Encoding::type encoding = Encoding::PLAIN,
0545 uint32_t seed = 0) {
0546 int num_levels = levels_per_page * num_pages;
0547 int num_values = 0;
0548 int16_t zero = 0;
0549 int16_t max_def_level = d->max_definition_level();
0550 int16_t max_rep_level = d->max_repetition_level();
0551 std::vector<int> values_per_page(num_pages, levels_per_page);
0552
0553 if (max_def_level > 0 && num_levels != 0) {
0554 def_levels.resize(num_levels);
0555 random_numbers(num_levels, seed, zero, max_def_level, def_levels.data());
0556 for (int p = 0; p < num_pages; p++) {
0557 int num_values_per_page = 0;
0558 for (int i = 0; i < levels_per_page; i++) {
0559 if (def_levels[i + p * levels_per_page] == max_def_level) {
0560 num_values_per_page++;
0561 num_values++;
0562 }
0563 }
0564 values_per_page[p] = num_values_per_page;
0565 }
0566 } else {
0567 num_values = num_levels;
0568 }
0569
0570 if (max_rep_level > 0 && num_levels != 0) {
0571 rep_levels.resize(num_levels);
0572
0573 random_numbers(num_levels, seed + 789, zero, max_rep_level, rep_levels.data());
0574
0575
0576 rep_levels[0] = 0;
0577
0578
0579
0580 for (int i = 0; i < num_levels - 1; ++i) {
0581 if (rep_levels[i] == 0 && def_levels[i] == 0) {
0582 rep_levels[i + 1] = 0;
0583 }
0584 }
0585 }
0586
0587 values.resize(num_values);
0588 if (encoding == Encoding::PLAIN) {
0589 InitValues<typename Type::c_type>(num_values, values, buffer);
0590 PaginatePlain<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
0591 levels_per_page, values_per_page, pages);
0592 } else if (encoding == Encoding::RLE_DICTIONARY ||
0593 encoding == Encoding::PLAIN_DICTIONARY) {
0594
0595 InitDictValues<typename Type::c_type>(num_values, levels_per_page, values, buffer);
0596 PaginateDict<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
0597 levels_per_page, values_per_page, pages);
0598 }
0599
0600 return num_values;
0601 }
0602
0603
0604
0605
0606 template <>
0607 void inline InitValues<bool>(int num_values, uint32_t seed, std::vector<bool>& values,
0608 std::vector<uint8_t>& buffer) {
0609 values = {};
0610 if (seed == 0) {
0611 seed = static_cast<uint32_t>(::arrow::random_seed());
0612 }
0613 ::arrow::random_is_valid(num_values, 0.5, &values, static_cast<int>(seed));
0614 }
0615
0616 template <>
0617 inline void InitValues<ByteArray>(int num_values, uint32_t seed,
0618 std::vector<ByteArray>& values,
0619 std::vector<uint8_t>& buffer) {
0620 int max_byte_array_len = 12;
0621 int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
0622 size_t nbytes = num_values * num_bytes;
0623 buffer.resize(nbytes);
0624 random_byte_array(num_values, seed, buffer.data(), values.data(), max_byte_array_len);
0625 }
0626
0627 inline void InitWideByteArrayValues(int num_values, std::vector<ByteArray>& values,
0628 std::vector<uint8_t>& buffer, int min_len,
0629 int max_len) {
0630 int num_bytes = static_cast<int>(max_len + sizeof(uint32_t));
0631 size_t nbytes = num_values * num_bytes;
0632 buffer.resize(nbytes);
0633 random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
0634 }
0635
0636 template <>
0637 inline void InitValues<FLBA>(int num_values, uint32_t seed, std::vector<FLBA>& values,
0638 std::vector<uint8_t>& buffer) {
0639 size_t nbytes = num_values * FLBA_LENGTH;
0640 buffer.resize(nbytes);
0641 random_fixed_byte_array(num_values, seed, buffer.data(), FLBA_LENGTH, values.data());
0642 }
0643
0644 template <>
0645 inline void InitValues<Int96>(int num_values, uint32_t seed, std::vector<Int96>& values,
0646 std::vector<uint8_t>& buffer) {
0647 random_Int96_numbers(num_values, seed, std::numeric_limits<int32_t>::min(),
0648 std::numeric_limits<int32_t>::max(), values.data());
0649 }
0650
/// Returns the name used for the i-th test column: "column_" followed by the
/// decimal representation of `i` (e.g. "column_0").
///
/// Built with std::to_string so that only <string>, which this header
/// includes, is required.
inline std::string TestColumnName(int i) { return "column_" + std::to_string(i); }
0656
0657
0658 template <typename TestType>
0659 class PrimitiveTypedTest : public ::testing::Test {
0660 public:
0661 using c_type = typename TestType::c_type;
0662
0663 virtual void SetUpSchema(Repetition::type repetition, int num_columns) {
0664 std::vector<schema::NodePtr> fields;
0665
0666 for (int i = 0; i < num_columns; ++i) {
0667 std::string name = TestColumnName(i);
0668 fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num,
0669 ConvertedType::NONE, FLBA_LENGTH));
0670 }
0671 node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
0672 schema_.Init(node_);
0673 }
0674
0675 void SetUpSchema(Repetition::type repetition) { this->SetUpSchema(repetition, 1); }
0676
0677 void GenerateData(int64_t num_values, uint32_t seed = 0);
0678 void SetupValuesOut(int64_t num_values);
0679 void SyncValuesOut();
0680
0681 protected:
0682 schema::NodePtr node_;
0683 SchemaDescriptor schema_;
0684
0685
0686 std::vector<c_type> values_;
0687
0688 std::vector<int16_t> def_levels_;
0689
0690 std::vector<uint8_t> buffer_;
0691
0692 c_type* values_ptr_;
0693 std::vector<uint8_t> bool_buffer_;
0694
0695
0696 std::vector<c_type> values_out_;
0697 std::vector<uint8_t> bool_buffer_out_;
0698 c_type* values_out_ptr_;
0699 };
0700
0701 template <typename TestType>
0702 inline void PrimitiveTypedTest<TestType>::SyncValuesOut() {}
0703
0704 template <>
0705 inline void PrimitiveTypedTest<BooleanType>::SyncValuesOut() {
0706 std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin();
0707 std::vector<c_type>::iterator destination_iterator = values_out_.begin();
0708 while (source_iterator != bool_buffer_out_.end()) {
0709 *destination_iterator++ = *source_iterator++ != 0;
0710 }
0711 }
0712
0713 template <typename TestType>
0714 inline void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) {
0715 values_out_.clear();
0716 values_out_.resize(num_values);
0717 values_out_ptr_ = values_out_.data();
0718 }
0719
0720 template <>
0721 inline void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) {
0722 values_out_.clear();
0723 values_out_.resize(num_values);
0724
0725 bool_buffer_out_.clear();
0726 bool_buffer_out_.resize(num_values);
0727
0728
0729 std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
0730 values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
0731 }
0732
0733 template <typename TestType>
0734 inline void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values,
0735 uint32_t seed) {
0736 def_levels_.resize(num_values);
0737 values_.resize(num_values);
0738
0739 InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
0740 values_ptr_ = values_.data();
0741
0742 std::fill(def_levels_.begin(), def_levels_.end(), 1);
0743 }
0744
0745 template <>
0746 inline void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values,
0747 uint32_t seed) {
0748 def_levels_.resize(num_values);
0749 values_.resize(num_values);
0750
0751 InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
0752 bool_buffer_.resize(num_values);
0753 std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
0754 values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());
0755
0756 std::fill(def_levels_.begin(), def_levels_.end(), 1);
0757 }
0758
0759
0760
0761
/// Fills out[0..num_values) with random numbers spanning
/// [numeric_limits<T>::min(), numeric_limits<T>::max()], using seed 0.
/// `heap` is unused by this generic version.
template <typename T>
inline void GenerateData(int num_values, T* out, std::vector<uint8_t>* heap) {
  // seed the prng so failure is deterministic
  const uint32_t fixed_seed = 0;
  const T lower_bound = std::numeric_limits<T>::min();
  const T upper_bound = std::numeric_limits<T>::max();
  random_numbers(num_values, fixed_seed, lower_bound, upper_bound, out);
}
0768
/// Fills out[0..num_values) with random numbers bounded by `min` and `max`,
/// using seed 0. `heap` is unused.
template <typename T>
inline void GenerateBoundData(int num_values, T* out, T min, T max,
                              std::vector<uint8_t>* heap) {
  // seed the prng so failure is deterministic
  const uint32_t fixed_seed = 0;
  random_numbers(num_values, fixed_seed, min, max, out);
}
0775
0776 template <>
0777 inline void GenerateData<bool>(int num_values, bool* out, std::vector<uint8_t>* heap) {
0778
0779 random_bools(num_values, 0.5, 0, out);
0780 }
0781
0782 template <>
0783 inline void GenerateData<Int96>(int num_values, Int96* out, std::vector<uint8_t>* heap) {
0784
0785 random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
0786 std::numeric_limits<int32_t>::max(), out);
0787 }
0788
0789 template <>
0790 inline void GenerateData<ByteArray>(int num_values, ByteArray* out,
0791 std::vector<uint8_t>* heap) {
0792 int max_byte_array_len = 12;
0793 heap->resize(num_values * max_byte_array_len);
0794
0795 random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
0796 }
0797
0798
0799
0800
0801 template <typename T>
0802 inline void GeneratePrefixedData(int num_values, T* out, std::vector<uint8_t>* heap,
0803 double prefixed_probability);
0804
0805 template <>
0806 inline void GeneratePrefixedData(int num_values, ByteArray* out,
0807 std::vector<uint8_t>* heap,
0808 double prefixed_probability) {
0809 int max_byte_array_len = 12;
0810 heap->resize(num_values * max_byte_array_len);
0811
0812 prefixed_random_byte_array(num_values, 0, heap->data(), out, 2,
0813 max_byte_array_len, prefixed_probability);
0814 }
0815
0816 static constexpr int kGenerateDataFLBALength = 8;
0817
0818 template <>
0819 inline void GeneratePrefixedData<FLBA>(int num_values, FLBA* out,
0820 std::vector<uint8_t>* heap,
0821 double prefixed_probability) {
0822 heap->resize(num_values * kGenerateDataFLBALength);
0823
0824 prefixed_random_byte_array(num_values, 0, heap->data(),
0825 kGenerateDataFLBALength, out, prefixed_probability);
0826 }
0827
0828 template <>
0829 inline void GenerateData<FLBA>(int num_values, FLBA* out, std::vector<uint8_t>* heap) {
0830 heap->resize(num_values * kGenerateDataFLBALength);
0831
0832 random_fixed_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out);
0833 }
0834
0835
0836
0837
// WKB byte-order marker matching the host: 0x01 when ARROW_LITTLE_ENDIAN is
// defined, 0x00 otherwise.
#if defined(ARROW_LITTLE_ENDIAN)
static constexpr uint8_t kWkbNativeEndianness = 0x01;
#else
static constexpr uint8_t kWkbNativeEndianness = 0x00;
#endif

// NOTE(review): presumably the byte size of a WKB XY point (1 byte-order
// byte + 4 type bytes + two 8-byte doubles = 21) -- confirm against the WKB
// helpers' definitions.
static constexpr int kWkbPointXYSize = 21;

// Declared here, defined outside this header.
// NOTE(review): `xyzm` presumably holds the coordinate values in x, y, z, m
// order, with `has_z` / `has_m` selecting the optional dimensions -- confirm
// against the definition.
std::string MakeWKBPoint(const std::vector<double>& xyzm, bool has_z, bool has_m);

// NOTE(review): presumably returns the (x, y) pair of a WKB point and an
// empty optional when `value` cannot be interpreted as one -- confirm
// against the definition.
std::optional<std::pair<double, double>> GetWKBPointCoordinateXY(const ByteArray& value);
0851
0852
0853 class GeoArrowWkbExtensionType : public ::arrow::ExtensionType {
0854 public:
0855 explicit GeoArrowWkbExtensionType(std::shared_ptr<::arrow::DataType> storage_type,
0856 std::string metadata)
0857 : ::arrow::ExtensionType(std::move(storage_type)), metadata_(std::move(metadata)) {}
0858
0859 std::string extension_name() const override { return "geoarrow.wkb"; }
0860
0861 std::string Serialize() const override { return metadata_; }
0862
0863 ::arrow::Result<std::shared_ptr<::arrow::DataType>> Deserialize(
0864 std::shared_ptr<::arrow::DataType> storage_type,
0865 const std::string& serialized_data) const override {
0866 return std::make_shared<GeoArrowWkbExtensionType>(std::move(storage_type),
0867 serialized_data);
0868 }
0869
0870 std::shared_ptr<::arrow::Array> MakeArray(
0871 std::shared_ptr<::arrow::ArrayData> data) const override {
0872 return std::make_shared<::arrow::ExtensionArray>(data);
0873 }
0874
0875 bool ExtensionEquals(const ExtensionType& other) const override {
0876 return other.extension_name() == extension_name() && other.Serialize() == Serialize();
0877 }
0878
0879 private:
0880 std::string metadata_;
0881 };
0882
// Factory functions declared here and defined outside this header. Both
// default the storage type to ::arrow::binary(); geoarrow_wkb additionally
// defaults the metadata string to "{}".
// NOTE(review): presumably these return a GeoArrowWkbExtensionType, with
// geoarrow_wkb_lonlat supplying lon/lat CRS metadata -- confirm against the
// definitions.
std::shared_ptr<::arrow::DataType> geoarrow_wkb(
    std::string metadata = "{}",
    const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());

std::shared_ptr<::arrow::DataType> geoarrow_wkb_lonlat(
    const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
0889
0890 }
0891 }