Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
// This module provides helpers shared by the Parquet unit tests: random test-data
// generators, builders for data and dictionary pages, a mock PageReader, and a
// typed gtest fixture for primitive column types.
0021 
0022 #pragma once
0023 
0024 #include <algorithm>
0025 #include <limits>
0026 #include <memory>
0027 #include <random>
0028 #include <string>
0029 #include <vector>
0030 
0031 #include <gtest/gtest.h>
0032 
0033 #include "arrow/extension_type.h"
0034 #include "arrow/io/memory.h"
0035 #include "arrow/testing/util.h"
0036 #include "arrow/util/float16.h"
0037 
0038 #include "parquet/column_page.h"
0039 #include "parquet/column_reader.h"
0040 #include "parquet/column_writer.h"
0041 #include "parquet/encoding.h"
0042 #include "parquet/platform.h"
0043 
// https://github.com/google/googletest/pull/2904 might not be available
// in our version of gtest/gmock
//
// Expects that invoking `callable` with no arguments throws `ex_type`, and also
// checks the caught exception against the matcher `property` via EXPECT_THAT.
// The handler rethrows so that the enclosing EXPECT_THROW still sees the exception.
#define EXPECT_THROW_THAT(callable, ex_type, property)   \
  EXPECT_THROW(                                          \
      try { (callable)(); } catch (const ex_type& err) { \
        EXPECT_THAT(err, (property));                    \
        throw;                                           \
      },                                                 \
      ex_type)
0053 
0054 namespace parquet {
0055 
// Length, in bytes, used for fixed-length byte array test values: it is the number
// of bytes compared by operator== on FixedLenByteArray and the length argument that
// PrimitiveTypedTest::SetUpSchema passes to schema::PrimitiveNode::Make.
static constexpr int FLBA_LENGTH = 12;
0057 
0058 inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
0059   return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
0060 }
0061 
0062 namespace test {
0063 
0064 typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
0065                          DoubleType, ByteArrayType, FLBAType>
0066     ParquetTypes;
0067 
// Exception type for test code. It adds nothing to parquet::ParquetException
// beyond inheriting its constructors.
class ParquetTestException : public parquet::ParquetException {
  using ParquetException::ParquetException;
};
0071 
0072 const char* get_data_dir();
0073 std::string get_bad_data_dir();
0074 
0075 std::string get_data_file(const std::string& filename, bool is_good = true);
0076 
// Asserts (gtest ASSERT_EQ) that `left` and `right` have the same size and equal
// elements. On an element mismatch the failing index is appended to the gtest
// failure message.
template <typename T>
static inline void assert_vector_equal(const std::vector<T>& left,
                                       const std::vector<T>& right) {
  ASSERT_EQ(left.size(), right.size());

  const size_t count = left.size();
  size_t pos = 0;
  while (pos < count) {
    ASSERT_EQ(left[pos], right[pos]) << pos;
    ++pos;
  }
}
0086 
// Returns true when `left` and `right` have the same size and no pair of elements
// at the same index compares unequal with operator!=. For the first differing
// index, both values are written to std::cerr before returning false. A size
// mismatch returns false without printing anything.
template <typename T>
static inline bool vector_equal(const std::vector<T>& left, const std::vector<T>& right) {
  const size_t count = left.size();
  if (count != right.size()) {
    return false;
  }

  // Advance to the first index whose elements differ (or to the end).
  size_t pos = 0;
  while (pos < count && !(left[pos] != right[pos])) {
    ++pos;
  }
  if (pos == count) {
    return true;
  }

  std::cerr << "index " << pos << " left was " << left[pos] << " right was " << right[pos]
            << std::endl;
  return false;
}
0103 
// Returns a copy of values[start, end). An inverted range (end < start) yields an
// empty vector. The indices are not validated against values.size(), so the caller
// must pass a range that lies inside `values`.
template <typename T>
static std::vector<T> slice(const std::vector<T>& values, int start, int end) {
  if (end < start) {
    return {};
  }
  return std::vector<T>(values.begin() + start, values.begin() + end);
}
0116 
0117 void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out);
0118 void random_bools(int n, double p, uint32_t seed, bool* out);
0119 
// Writes `n` pseudo-random values into out[0, n), drawn from a uniform distribution
// bounded by min_value and max_value. The generator is a std::default_random_engine
// constructed from `seed`, so equal arguments reproduce the same sequence.
//
// The primary template draws from std::uniform_int_distribution<T>; the float and
// double specializations below draw from std::uniform_real_distribution instead.
template <typename T>
inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) {
  std::default_random_engine engine(seed);
  std::uniform_int_distribution<T> dist(min_value, max_value);
  std::generate_n(out, n, [&] { return dist(engine); });
}

// float: same contract as the primary template, using a real-valued distribution.
template <>
inline void random_numbers(int n, uint32_t seed, float min_value, float max_value,
                           float* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<float> dist(min_value, max_value);
  std::generate_n(out, n, [&] { return dist(engine); });
}

// double: same contract as the primary template, using a real-valued distribution.
template <>
inline void random_numbers(int n, uint32_t seed, double min_value, double max_value,
                           double* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<double> dist(min_value, max_value);
  std::generate_n(out, n, [&] { return dist(engine); });
}
0148 
0149 void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
0150                           Int96* out);
0151 
0152 void random_float16_numbers(int n, uint32_t seed, ::arrow::util::Float16 min_value,
0153                             ::arrow::util::Float16 max_value, uint16_t* out);
0154 
0155 void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out);
0156 
0157 void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
0158                        int max_size);
0159 
0160 void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size);
0161 
0162 void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out,
0163                                 int min_size, int max_size, double prefixed_probability);
0164 
0165 void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out,
0166                                 double prefixed_probability);
0167 
0168 template <typename Type, typename Sequence>
0169 std::shared_ptr<Buffer> EncodeValues(Encoding::type encoding, bool use_dictionary,
0170                                      const Sequence& values, int length,
0171                                      const ColumnDescriptor* descr) {
0172   auto encoder = MakeTypedEncoder<Type>(encoding, use_dictionary, descr);
0173   encoder->Put(values, length);
0174   return encoder->FlushValues();
0175 }
0176 
// Fills the first `num_values` elements of `values` with seeded random numbers
// spanning [numeric_limits<T>::min(), numeric_limits<T>::max()]. `values` must
// already hold at least `num_values` elements; `buffer` is unused by this generic
// version.
// NOTE(review): for floating-point T, numeric_limits<T>::min() is the smallest
// positive normal value, so no negative values are generated through this path.
template <typename T>
static void InitValues(int num_values, uint32_t seed, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  const T lower_bound = std::numeric_limits<T>::min();
  const T upper_bound = std::numeric_limits<T>::max();
  random_numbers(num_values, seed, lower_bound, upper_bound, values.data());
}
0183 
// Convenience overload: same as the seeded InitValues with a seed of 0.
template <typename T>
static void InitValues(int num_values, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  constexpr uint32_t kDefaultSeed = 0;
  InitValues(num_values, kDefaultSeed, values, buffer);
}
0189 
0190 template <typename T>
0191 static void InitDictValues(int num_values, int num_dicts, std::vector<T>& values,
0192                            std::vector<uint8_t>& buffer) {
0193   int repeat_factor = num_values / num_dicts;
0194   InitValues<T>(num_dicts, values, buffer);
0195   // add some repeated values
0196   for (int j = 1; j < repeat_factor; ++j) {
0197     for (int i = 0; i < num_dicts; ++i) {
0198       std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T));
0199     }
0200   }
0201   // computed only dict_per_page * repeat_factor - 1 values < num_values
0202   // compute remaining
0203   for (int i = num_dicts * repeat_factor; i < num_values; ++i) {
0204     std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T));
0205   }
0206 }
0207 
// bool specialization: intentionally does nothing, so `values` and `buffer` are
// left exactly as the caller passed them.
template <>
inline void InitDictValues<bool>(int num_values, int num_dicts, std::vector<bool>& values,
                                 std::vector<uint8_t>& buffer) {
  // No op for bool
}
0213 
0214 class MockPageReader : public PageReader {
0215  public:
0216   explicit MockPageReader(const std::vector<std::shared_ptr<Page>>& pages)
0217       : pages_(pages), page_index_(0) {}
0218 
0219   std::shared_ptr<Page> NextPage() override {
0220     if (page_index_ == static_cast<int>(pages_.size())) {
0221       // EOS to consumer
0222       return std::shared_ptr<Page>(nullptr);
0223     }
0224     return pages_[page_index_++];
0225   }
0226 
0227   // No-op
0228   void set_max_page_header_size(uint32_t size) override {}
0229 
0230  private:
0231   std::vector<std::shared_ptr<Page>> pages_;
0232   int page_index_;
0233 };
0234 
0235 // TODO(wesm): this is only used for testing for now. Refactor to form part of
0236 // primary file write path
0237 template <typename Type>
0238 class DataPageBuilder {
0239  public:
0240   using c_type = typename Type::c_type;
0241 
0242   // This class writes data and metadata to the passed inputs
0243   explicit DataPageBuilder(ArrowOutputStream* sink)
0244       : sink_(sink),
0245         num_values_(0),
0246         encoding_(Encoding::PLAIN),
0247         definition_level_encoding_(Encoding::RLE),
0248         repetition_level_encoding_(Encoding::RLE),
0249         have_def_levels_(false),
0250         have_rep_levels_(false),
0251         have_values_(false) {}
0252 
0253   void AppendDefLevels(const std::vector<int16_t>& levels, int16_t max_level,
0254                        Encoding::type encoding = Encoding::RLE) {
0255     AppendLevels(levels, max_level, encoding);
0256 
0257     num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
0258     definition_level_encoding_ = encoding;
0259     have_def_levels_ = true;
0260   }
0261 
0262   void AppendRepLevels(const std::vector<int16_t>& levels, int16_t max_level,
0263                        Encoding::type encoding = Encoding::RLE) {
0264     AppendLevels(levels, max_level, encoding);
0265 
0266     num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
0267     repetition_level_encoding_ = encoding;
0268     have_rep_levels_ = true;
0269   }
0270 
0271   void AppendValues(const ColumnDescriptor* d, const std::vector<c_type>& values,
0272                     Encoding::type encoding = Encoding::PLAIN) {
0273     std::shared_ptr<Buffer> values_sink = EncodeValues<Type>(
0274         encoding, false, values.data(), static_cast<int>(values.size()), d);
0275     PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size()));
0276 
0277     num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
0278     encoding_ = encoding;
0279     have_values_ = true;
0280   }
0281 
0282   int32_t num_values() const { return num_values_; }
0283 
0284   Encoding::type encoding() const { return encoding_; }
0285 
0286   Encoding::type rep_level_encoding() const { return repetition_level_encoding_; }
0287 
0288   Encoding::type def_level_encoding() const { return definition_level_encoding_; }
0289 
0290  private:
0291   ArrowOutputStream* sink_;
0292 
0293   int32_t num_values_;
0294   Encoding::type encoding_;
0295   Encoding::type definition_level_encoding_;
0296   Encoding::type repetition_level_encoding_;
0297 
0298   bool have_def_levels_;
0299   bool have_rep_levels_;
0300   bool have_values_;
0301 
0302   // Used internally for both repetition and definition levels
0303   void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
0304                     Encoding::type encoding) {
0305     if (encoding != Encoding::RLE) {
0306       ParquetException::NYI("only rle encoding currently implemented");
0307     }
0308 
0309     std::vector<uint8_t> encode_buffer(LevelEncoder::MaxBufferSize(
0310         Encoding::RLE, max_level, static_cast<int>(levels.size())));
0311 
0312     // We encode into separate memory from the output stream because the
0313     // RLE-encoded bytes have to be preceded in the stream by their absolute
0314     // size.
0315     LevelEncoder encoder;
0316     encoder.Init(encoding, max_level, static_cast<int>(levels.size()),
0317                  encode_buffer.data(), static_cast<int>(encode_buffer.size()));
0318 
0319     encoder.Encode(static_cast<int>(levels.size()), levels.data());
0320 
0321     int32_t rle_bytes = encoder.len();
0322     PARQUET_THROW_NOT_OK(
0323         sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes), sizeof(int32_t)));
0324     PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
0325   }
0326 };
0327 
0328 template <>
0329 inline void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor* d,
0330                                                        const std::vector<bool>& values,
0331                                                        Encoding::type encoding) {
0332   if (encoding != Encoding::PLAIN) {
0333     ParquetException::NYI("only plain encoding currently implemented");
0334   }
0335 
0336   auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN, false, d);
0337   dynamic_cast<BooleanEncoder*>(encoder.get())
0338       ->Put(values, static_cast<int>(values.size()));
0339   std::shared_ptr<Buffer> buffer = encoder->FlushValues();
0340   PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size()));
0341 
0342   num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
0343   encoding_ = encoding;
0344   have_values_ = true;
0345 }
0346 
0347 template <typename Type>
0348 static std::shared_ptr<DataPageV1> MakeDataPage(
0349     const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
0350     int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size,
0351     const std::vector<int16_t>& def_levels, int16_t max_def_level,
0352     const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
0353   int num_values = 0;
0354 
0355   auto page_stream = CreateOutputStream();
0356   test::DataPageBuilder<Type> page_builder(page_stream.get());
0357 
0358   if (!rep_levels.empty()) {
0359     page_builder.AppendRepLevels(rep_levels, max_rep_level);
0360   }
0361   if (!def_levels.empty()) {
0362     page_builder.AppendDefLevels(def_levels, max_def_level);
0363   }
0364 
0365   if (encoding == Encoding::PLAIN) {
0366     page_builder.AppendValues(d, values, encoding);
0367     num_values = std::max(page_builder.num_values(), num_vals);
0368   } else {  // DICTIONARY PAGES
0369     PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size));
0370     num_values = std::max(page_builder.num_values(), num_vals);
0371   }
0372 
0373   PARQUET_ASSIGN_OR_THROW(auto buffer, page_stream->Finish());
0374 
0375   return std::make_shared<DataPageV1>(buffer, num_values, encoding,
0376                                       page_builder.def_level_encoding(),
0377                                       page_builder.rep_level_encoding(), buffer->size());
0378 }
0379 
0380 template <typename TYPE>
0381 class DictionaryPageBuilder {
0382  public:
0383   typedef typename TYPE::c_type TC;
0384   static constexpr int TN = TYPE::type_num;
0385   using SpecializedEncoder = typename EncodingTraits<TYPE>::Encoder;
0386 
0387   // This class writes data and metadata to the passed inputs
0388   explicit DictionaryPageBuilder(const ColumnDescriptor* d)
0389       : num_dict_values_(0), have_values_(false) {
0390     auto encoder = MakeTypedEncoder<TYPE>(Encoding::PLAIN, true, d);
0391     dict_traits_ = dynamic_cast<DictEncoder<TYPE>*>(encoder.get());
0392     encoder_.reset(dynamic_cast<SpecializedEncoder*>(encoder.release()));
0393   }
0394 
0395   ~DictionaryPageBuilder() {}
0396 
0397   std::shared_ptr<Buffer> AppendValues(const std::vector<TC>& values) {
0398     int num_values = static_cast<int>(values.size());
0399     // Dictionary encoding
0400     encoder_->Put(values.data(), num_values);
0401     num_dict_values_ = dict_traits_->num_entries();
0402     have_values_ = true;
0403     return encoder_->FlushValues();
0404   }
0405 
0406   std::shared_ptr<Buffer> WriteDict() {
0407     std::shared_ptr<Buffer> dict_buffer =
0408         AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size());
0409     dict_traits_->WriteDict(dict_buffer->mutable_data());
0410     return dict_buffer;
0411   }
0412 
0413   int32_t num_values() const { return num_dict_values_; }
0414 
0415  private:
0416   DictEncoder<TYPE>* dict_traits_;
0417   std::unique_ptr<SpecializedEncoder> encoder_;
0418   int32_t num_dict_values_;
0419   bool have_values_;
0420 };
0421 
// BooleanType specialization of the constructor: it only calls
// ParquetException::NYI with the message below and initializes no members.
template <>
inline DictionaryPageBuilder<BooleanType>::DictionaryPageBuilder(
    const ColumnDescriptor* d) {
  ParquetException::NYI("only plain encoding currently implemented for boolean");
}
0427 
// BooleanType specialization of WriteDict: calls ParquetException::NYI with the
// message below; the trailing return only satisfies the signature.
template <>
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::WriteDict() {
  ParquetException::NYI("only plain encoding currently implemented for boolean");
  return nullptr;
}
0433 
// BooleanType specialization of AppendValues: calls ParquetException::NYI with the
// message below; `values` is ignored and the trailing return only satisfies the
// signature.
template <>
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::AppendValues(
    const std::vector<TC>& values) {
  ParquetException::NYI("only plain encoding currently implemented for boolean");
  return nullptr;
}
0440 
0441 template <typename Type>
0442 inline static std::shared_ptr<DictionaryPage> MakeDictPage(
0443     const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
0444     const std::vector<int>& values_per_page, Encoding::type encoding,
0445     std::vector<std::shared_ptr<Buffer>>& rle_indices) {
0446   test::DictionaryPageBuilder<Type> page_builder(d);
0447   int num_pages = static_cast<int>(values_per_page.size());
0448   int value_start = 0;
0449 
0450   for (int i = 0; i < num_pages; i++) {
0451     rle_indices.push_back(page_builder.AppendValues(
0452         slice(values, value_start, value_start + values_per_page[i])));
0453     value_start += values_per_page[i];
0454   }
0455 
0456   auto buffer = page_builder.WriteDict();
0457 
0458   return std::make_shared<DictionaryPage>(buffer, page_builder.num_values(),
0459                                           Encoding::PLAIN);
0460 }
0461 
0462 // Given def/rep levels and values create multiple dict pages
0463 template <typename Type>
0464 inline static void PaginateDict(const ColumnDescriptor* d,
0465                                 const std::vector<typename Type::c_type>& values,
0466                                 const std::vector<int16_t>& def_levels,
0467                                 int16_t max_def_level,
0468                                 const std::vector<int16_t>& rep_levels,
0469                                 int16_t max_rep_level, int num_levels_per_page,
0470                                 const std::vector<int>& values_per_page,
0471                                 std::vector<std::shared_ptr<Page>>& pages,
0472                                 Encoding::type encoding = Encoding::RLE_DICTIONARY) {
0473   int num_pages = static_cast<int>(values_per_page.size());
0474   std::vector<std::shared_ptr<Buffer>> rle_indices;
0475   std::shared_ptr<DictionaryPage> dict_page =
0476       MakeDictPage<Type>(d, values, values_per_page, encoding, rle_indices);
0477   pages.push_back(dict_page);
0478   int def_level_start = 0;
0479   int def_level_end = 0;
0480   int rep_level_start = 0;
0481   int rep_level_end = 0;
0482   for (int i = 0; i < num_pages; i++) {
0483     if (max_def_level > 0) {
0484       def_level_start = i * num_levels_per_page;
0485       def_level_end = (i + 1) * num_levels_per_page;
0486     }
0487     if (max_rep_level > 0) {
0488       rep_level_start = i * num_levels_per_page;
0489       rep_level_end = (i + 1) * num_levels_per_page;
0490     }
0491     std::shared_ptr<DataPageV1> data_page = MakeDataPage<Int32Type>(
0492         d, {}, values_per_page[i], encoding, rle_indices[i]->data(),
0493         static_cast<int>(rle_indices[i]->size()),
0494         slice(def_levels, def_level_start, def_level_end), max_def_level,
0495         slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
0496     pages.push_back(data_page);
0497   }
0498 }
0499 
0500 // Given def/rep levels and values create multiple plain pages
0501 template <typename Type>
0502 static inline void PaginatePlain(const ColumnDescriptor* d,
0503                                  const std::vector<typename Type::c_type>& values,
0504                                  const std::vector<int16_t>& def_levels,
0505                                  int16_t max_def_level,
0506                                  const std::vector<int16_t>& rep_levels,
0507                                  int16_t max_rep_level, int num_levels_per_page,
0508                                  const std::vector<int>& values_per_page,
0509                                  std::vector<std::shared_ptr<Page>>& pages,
0510                                  Encoding::type encoding = Encoding::PLAIN) {
0511   int num_pages = static_cast<int>(values_per_page.size());
0512   int def_level_start = 0;
0513   int def_level_end = 0;
0514   int rep_level_start = 0;
0515   int rep_level_end = 0;
0516   int value_start = 0;
0517   for (int i = 0; i < num_pages; i++) {
0518     if (max_def_level > 0) {
0519       def_level_start = i * num_levels_per_page;
0520       def_level_end = (i + 1) * num_levels_per_page;
0521     }
0522     if (max_rep_level > 0) {
0523       rep_level_start = i * num_levels_per_page;
0524       rep_level_end = (i + 1) * num_levels_per_page;
0525     }
0526     std::shared_ptr<DataPage> page = MakeDataPage<Type>(
0527         d, slice(values, value_start, value_start + values_per_page[i]),
0528         values_per_page[i], encoding, nullptr, 0,
0529         slice(def_levels, def_level_start, def_level_end), max_def_level,
0530         slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
0531     pages.push_back(page);
0532     value_start += values_per_page[i];
0533   }
0534 }
0535 
0536 // Generates pages from randomly generated data
0537 template <typename Type>
0538 static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page,
0539                             std::vector<int16_t>& def_levels,
0540                             std::vector<int16_t>& rep_levels,
0541                             std::vector<typename Type::c_type>& values,
0542                             std::vector<uint8_t>& buffer,
0543                             std::vector<std::shared_ptr<Page>>& pages,
0544                             Encoding::type encoding = Encoding::PLAIN,
0545                             uint32_t seed = 0) {
0546   int num_levels = levels_per_page * num_pages;
0547   int num_values = 0;
0548   int16_t zero = 0;
0549   int16_t max_def_level = d->max_definition_level();
0550   int16_t max_rep_level = d->max_repetition_level();
0551   std::vector<int> values_per_page(num_pages, levels_per_page);
0552   // Create definition levels
0553   if (max_def_level > 0 && num_levels != 0) {
0554     def_levels.resize(num_levels);
0555     random_numbers(num_levels, seed, zero, max_def_level, def_levels.data());
0556     for (int p = 0; p < num_pages; p++) {
0557       int num_values_per_page = 0;
0558       for (int i = 0; i < levels_per_page; i++) {
0559         if (def_levels[i + p * levels_per_page] == max_def_level) {
0560           num_values_per_page++;
0561           num_values++;
0562         }
0563       }
0564       values_per_page[p] = num_values_per_page;
0565     }
0566   } else {
0567     num_values = num_levels;
0568   }
0569   // Create repetition levels
0570   if (max_rep_level > 0 && num_levels != 0) {
0571     rep_levels.resize(num_levels);
0572     // Using a different seed so that def_levels and rep_levels are different.
0573     random_numbers(num_levels, seed + 789, zero, max_rep_level, rep_levels.data());
0574     // The generated levels are random. Force the very first page to start with a new
0575     // record.
0576     rep_levels[0] = 0;
0577     // For a null value, rep_levels and def_levels are both 0.
0578     // If we have a repeated value right after this, it needs to start with
0579     // rep_level = 0 to indicate a new record.
0580     for (int i = 0; i < num_levels - 1; ++i) {
0581       if (rep_levels[i] == 0 && def_levels[i] == 0) {
0582         rep_levels[i + 1] = 0;
0583       }
0584     }
0585   }
0586   // Create values
0587   values.resize(num_values);
0588   if (encoding == Encoding::PLAIN) {
0589     InitValues<typename Type::c_type>(num_values, values, buffer);
0590     PaginatePlain<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
0591                         levels_per_page, values_per_page, pages);
0592   } else if (encoding == Encoding::RLE_DICTIONARY ||
0593              encoding == Encoding::PLAIN_DICTIONARY) {
0594     // Calls InitValues and repeats the data
0595     InitDictValues<typename Type::c_type>(num_values, levels_per_page, values, buffer);
0596     PaginateDict<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
0597                        levels_per_page, values_per_page, pages);
0598   }
0599 
0600   return num_values;
0601 }
0602 
0603 // ----------------------------------------------------------------------
0604 // Test data generation
0605 
0606 template <>
0607 void inline InitValues<bool>(int num_values, uint32_t seed, std::vector<bool>& values,
0608                              std::vector<uint8_t>& buffer) {
0609   values = {};
0610   if (seed == 0) {
0611     seed = static_cast<uint32_t>(::arrow::random_seed());
0612   }
0613   ::arrow::random_is_valid(num_values, 0.5, &values, static_cast<int>(seed));
0614 }
0615 
0616 template <>
0617 inline void InitValues<ByteArray>(int num_values, uint32_t seed,
0618                                   std::vector<ByteArray>& values,
0619                                   std::vector<uint8_t>& buffer) {
0620   int max_byte_array_len = 12;
0621   int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
0622   size_t nbytes = num_values * num_bytes;
0623   buffer.resize(nbytes);
0624   random_byte_array(num_values, seed, buffer.data(), values.data(), max_byte_array_len);
0625 }
0626 
0627 inline void InitWideByteArrayValues(int num_values, std::vector<ByteArray>& values,
0628                                     std::vector<uint8_t>& buffer, int min_len,
0629                                     int max_len) {
0630   int num_bytes = static_cast<int>(max_len + sizeof(uint32_t));
0631   size_t nbytes = num_values * num_bytes;
0632   buffer.resize(nbytes);
0633   random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
0634 }
0635 
0636 template <>
0637 inline void InitValues<FLBA>(int num_values, uint32_t seed, std::vector<FLBA>& values,
0638                              std::vector<uint8_t>& buffer) {
0639   size_t nbytes = num_values * FLBA_LENGTH;
0640   buffer.resize(nbytes);
0641   random_fixed_byte_array(num_values, seed, buffer.data(), FLBA_LENGTH, values.data());
0642 }
0643 
0644 template <>
0645 inline void InitValues<Int96>(int num_values, uint32_t seed, std::vector<Int96>& values,
0646                               std::vector<uint8_t>& buffer) {
0647   random_Int96_numbers(num_values, seed, std::numeric_limits<int32_t>::min(),
0648                        std::numeric_limits<int32_t>::max(), values.data());
0649 }
0650 
// Returns the name used for the i-th test column: "column_" followed by the
// decimal value of `i` (for example "column_0").
//
// Built with std::to_string so that only <string> is needed and the digits do not
// depend on the global C++ locale.
inline std::string TestColumnName(int i) { return "column_" + std::to_string(i); }
0656 
0657 // This class lives here because of its dependency on the InitValues specializations.
0658 template <typename TestType>
0659 class PrimitiveTypedTest : public ::testing::Test {
0660  public:
0661   using c_type = typename TestType::c_type;
0662 
0663   virtual void SetUpSchema(Repetition::type repetition, int num_columns) {
0664     std::vector<schema::NodePtr> fields;
0665 
0666     for (int i = 0; i < num_columns; ++i) {
0667       std::string name = TestColumnName(i);
0668       fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num,
0669                                                    ConvertedType::NONE, FLBA_LENGTH));
0670     }
0671     node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
0672     schema_.Init(node_);
0673   }
0674 
0675   void SetUpSchema(Repetition::type repetition) { this->SetUpSchema(repetition, 1); }
0676 
0677   void GenerateData(int64_t num_values, uint32_t seed = 0);
0678   void SetupValuesOut(int64_t num_values);
0679   void SyncValuesOut();
0680 
0681  protected:
0682   schema::NodePtr node_;
0683   SchemaDescriptor schema_;
0684 
0685   // Input buffers
0686   std::vector<c_type> values_;
0687 
0688   std::vector<int16_t> def_levels_;
0689 
0690   std::vector<uint8_t> buffer_;
0691   // Pointer to the values, needed as we cannot use std::vector<bool>::data()
0692   c_type* values_ptr_;
0693   std::vector<uint8_t> bool_buffer_;
0694 
0695   // Output buffers
0696   std::vector<c_type> values_out_;
0697   std::vector<uint8_t> bool_buffer_out_;
0698   c_type* values_out_ptr_;
0699 };
0700 
// Generic version: intentionally empty. Only the BooleanType specialization has
// work to do here.
template <typename TestType>
inline void PrimitiveTypedTest<TestType>::SyncValuesOut() {}
0703 
0704 template <>
0705 inline void PrimitiveTypedTest<BooleanType>::SyncValuesOut() {
0706   std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin();
0707   std::vector<c_type>::iterator destination_iterator = values_out_.begin();
0708   while (source_iterator != bool_buffer_out_.end()) {
0709     *destination_iterator++ = *source_iterator++ != 0;
0710   }
0711 }
0712 
0713 template <typename TestType>
0714 inline void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) {
0715   values_out_.clear();
0716   values_out_.resize(num_values);
0717   values_out_ptr_ = values_out_.data();
0718 }
0719 
0720 template <>
0721 inline void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) {
0722   values_out_.clear();
0723   values_out_.resize(num_values);
0724 
0725   bool_buffer_out_.clear();
0726   bool_buffer_out_.resize(num_values);
0727   // Write once to all values so we can copy it without getting Valgrind errors
0728   // about uninitialised values.
0729   std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
0730   values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
0731 }
0732 
0733 template <typename TestType>
0734 inline void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values,
0735                                                        uint32_t seed) {
0736   def_levels_.resize(num_values);
0737   values_.resize(num_values);
0738 
0739   InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
0740   values_ptr_ = values_.data();
0741 
0742   std::fill(def_levels_.begin(), def_levels_.end(), 1);
0743 }
0744 
0745 template <>
0746 inline void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values,
0747                                                           uint32_t seed) {
0748   def_levels_.resize(num_values);
0749   values_.resize(num_values);
0750 
0751   InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
0752   bool_buffer_.resize(num_values);
0753   std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
0754   values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());
0755 
0756   std::fill(def_levels_.begin(), def_levels_.end(), 1);
0757 }
0758 
0759 // ----------------------------------------------------------------------
0760 // test data generation
0761 
/// Fill `out` with `num_values` values produced by random_numbers(), using
/// std::numeric_limits<T>::min() and ::max() as the bounds. `heap` is unused
/// by this generic version.
///
/// NOTE(review): for floating-point T, numeric_limits<T>::min() is the
/// smallest positive normal value, not the most negative one -- confirm this
/// lower bound is intended.
template <typename T>
inline void GenerateData(int num_values, T* out, std::vector<uint8_t>* heap) {
  using Limits = std::numeric_limits<T>;
  // A constant seed keeps any failure reproducible.
  random_numbers(num_values, /*seed=*/0, Limits::min(), Limits::max(), out);
}
0768 
/// Fill `out` with `num_values` values produced by random_numbers(), passing
/// the caller-supplied `min` and `max` as bounds. `heap` is unused.
template <typename T>
inline void GenerateBoundData(int num_values, T* out, T min, T max,
                              std::vector<uint8_t>* heap) {
  // A constant seed keeps any failure reproducible.
  random_numbers(num_values, /*seed=*/0, min, max, out);
}
0775 
0776 template <>
0777 inline void GenerateData<bool>(int num_values, bool* out, std::vector<uint8_t>* heap) {
0778   // seed the prng so failure is deterministic
0779   random_bools(num_values, 0.5, 0, out);
0780 }
0781 
0782 template <>
0783 inline void GenerateData<Int96>(int num_values, Int96* out, std::vector<uint8_t>* heap) {
0784   // seed the prng so failure is deterministic
0785   random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
0786                        std::numeric_limits<int32_t>::max(), out);
0787 }
0788 
0789 template <>
0790 inline void GenerateData<ByteArray>(int num_values, ByteArray* out,
0791                                     std::vector<uint8_t>* heap) {
0792   int max_byte_array_len = 12;
0793   heap->resize(num_values * max_byte_array_len);
0794   // seed the prng so failure is deterministic
0795   random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
0796 }
0797 
// Produce ByteArray or FLBA test values in which every value has the given
// probability of sharing a common prefix with the value before it.
// Handy for exercising prefix-based encodings such as DELTA_BYTE_ARRAY.
// The primary template is only declared here.
template <typename T>
inline void GeneratePrefixedData(int num_values,
                                 T* out,
                                 std::vector<uint8_t>* heap,
                                 double prefixed_probability);
0804 
0805 template <>
0806 inline void GeneratePrefixedData(int num_values, ByteArray* out,
0807                                  std::vector<uint8_t>* heap,
0808                                  double prefixed_probability) {
0809   int max_byte_array_len = 12;
0810   heap->resize(num_values * max_byte_array_len);
0811   // seed the prng so failure is deterministic
0812   prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), out, /*min_size=*/2,
0813                              /*max_size=*/max_byte_array_len, prefixed_probability);
0814 }
0815 
// Per-value length used by the FLBA data generators when sizing their heap
// and when calling the underlying random generators.
static constexpr int kGenerateDataFLBALength{8};
0817 
0818 template <>
0819 inline void GeneratePrefixedData<FLBA>(int num_values, FLBA* out,
0820                                        std::vector<uint8_t>* heap,
0821                                        double prefixed_probability) {
0822   heap->resize(num_values * kGenerateDataFLBALength);
0823   // seed the prng so failure is deterministic
0824   prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(),
0825                              kGenerateDataFLBALength, out, prefixed_probability);
0826 }
0827 
0828 template <>
0829 inline void GenerateData<FLBA>(int num_values, FLBA* out, std::vector<uint8_t>* heap) {
0830   heap->resize(num_values * kGenerateDataFLBALength);
0831   // seed the prng so failure is deterministic
0832   random_fixed_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out);
0833 }
0834 
0835 // ----------------------------------------------------------------------
0836 // Test utility functions for geometry
0837 
// WKB byte-order marker for the host: 0x01 when ARROW_LITTLE_ENDIAN is
// defined, 0x00 otherwise.
#ifdef ARROW_LITTLE_ENDIAN
static constexpr uint8_t kWkbNativeEndianness = 0x01;
#else
static constexpr uint8_t kWkbNativeEndianness = 0x00;
#endif
0843 
/// \brief Size in bytes of a WKB Point that carries only X and Y: one uint8_t
/// endian marker, one uint32_t geometry type and two double coordinates.
static constexpr int kWkbPointXYSize = 1 + 4 + 2 * 8;
0847 
/// \brief Build a WKB point and return its bytes as a std::string.
///
/// \param xyzm coordinate values (presumably in X, Y, Z, M order -- confirm
///   against the definition)
/// \param has_z whether the point has a Z dimension
/// \param has_m whether the point has an M dimension
std::string MakeWKBPoint(const std::vector<double>& xyzm,
                         bool has_z,
                         bool has_m);
0849 
0850 std::optional<std::pair<double, double>> GetWKBPointCoordinateXY(const ByteArray& value);
0851 
0852 // A minimal version of a geoarrow.wkb extension type to test interoperability
0853 class GeoArrowWkbExtensionType : public ::arrow::ExtensionType {
0854  public:
0855   explicit GeoArrowWkbExtensionType(std::shared_ptr<::arrow::DataType> storage_type,
0856                                     std::string metadata)
0857       : ::arrow::ExtensionType(std::move(storage_type)), metadata_(std::move(metadata)) {}
0858 
0859   std::string extension_name() const override { return "geoarrow.wkb"; }
0860 
0861   std::string Serialize() const override { return metadata_; }
0862 
0863   ::arrow::Result<std::shared_ptr<::arrow::DataType>> Deserialize(
0864       std::shared_ptr<::arrow::DataType> storage_type,
0865       const std::string& serialized_data) const override {
0866     return std::make_shared<GeoArrowWkbExtensionType>(std::move(storage_type),
0867                                                       serialized_data);
0868   }
0869 
0870   std::shared_ptr<::arrow::Array> MakeArray(
0871       std::shared_ptr<::arrow::ArrayData> data) const override {
0872     return std::make_shared<::arrow::ExtensionArray>(data);
0873   }
0874 
0875   bool ExtensionEquals(const ExtensionType& other) const override {
0876     return other.extension_name() == extension_name() && other.Serialize() == Serialize();
0877   }
0878 
0879  private:
0880   std::string metadata_;
0881 };
0882 
0883 std::shared_ptr<::arrow::DataType> geoarrow_wkb(
0884     std::string metadata = "{}",
0885     const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
0886 
0887 std::shared_ptr<::arrow::DataType> geoarrow_wkb_lonlat(
0888     const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
0889 
0890 }  // namespace test
0891 }  // namespace parquet