Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <limits>
0021 #include <memory>
0022 #include <random>
0023 #include <string>
0024 #include <utility>
0025 #include <vector>
0026 
0027 #include "arrow/array.h"
0028 #include "arrow/array/builder_binary.h"
0029 #include "arrow/array/builder_decimal.h"
0030 #include "arrow/array/builder_primitive.h"
0031 #include "arrow/testing/gtest_util.h"
0032 #include "arrow/testing/random.h"
0033 #include "arrow/type_fwd.h"
0034 #include "arrow/type_traits.h"
0035 #include "arrow/util/decimal.h"
0036 #include "arrow/util/float16.h"
0037 #include "parquet/column_reader.h"
0038 #include "parquet/test_util.h"
0039 
0040 namespace parquet {
0041 
0042 using internal::RecordReader;
0043 
0044 namespace arrow {
0045 
0046 using ::arrow::Array;
0047 using ::arrow::ChunkedArray;
0048 using ::arrow::Status;
0049 
0050 template <typename T, int32_t PRECISION, typename = ::arrow::enable_if_decimal<T>>
0051 struct DecimalWithPrecisionAndScale {
0052   using type = T;
0053   static_assert(PRECISION >= T::kMinPrecision && PRECISION <= T::kMaxPrecision,
0054                 "Invalid precision value");
0055   static constexpr ::arrow::Type::type type_id = T::type_id;
0056   static constexpr int32_t precision = PRECISION;
0057   static constexpr int32_t scale = PRECISION - 1;
0058 };
// Convenience aliases that fix the decimal type of DecimalWithPrecisionAndScale
// and leave only the precision to be chosen.

// DecimalWithPrecisionAndScale bound to ::arrow::Decimal32Type.
template <int32_t PRECISION>
using Decimal32WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal32Type, PRECISION>;
// DecimalWithPrecisionAndScale bound to ::arrow::Decimal64Type.
template <int32_t PRECISION>
using Decimal64WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal64Type, PRECISION>;
// DecimalWithPrecisionAndScale bound to ::arrow::Decimal128Type.
template <int32_t PRECISION>
using Decimal128WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal128Type, PRECISION>;
// DecimalWithPrecisionAndScale bound to ::arrow::Decimal256Type.
template <int32_t PRECISION>
using Decimal256WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal256Type, PRECISION>;
0071 
0072 template <class ArrowType>
0073 ::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
0074     size_t size, std::shared_ptr<Array>* out) {
0075   using c_type = typename ArrowType::c_type;
0076   std::vector<c_type> values;
0077   if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
0078     values.resize(size);
0079     test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(0.0f),
0080                                  ::arrow::util::Float16(1.0f), values.data());
0081   } else {
0082     ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1),
0083                          &values);
0084   }
0085   ::arrow::NumericBuilder<ArrowType> builder;
0086   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
0087   return builder.Finish(out);
0088 }
0089 
0090 template <class ArrowType>
0091 ::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
0092                                                            std::shared_ptr<Array>* out) {
0093   std::vector<typename ArrowType::c_type> values;
0094   ::arrow::randint(size, 0, 64, &values);
0095 
0096   // Passing data type so this will work with TimestampType too
0097   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
0098                                              ::arrow::default_memory_pool());
0099   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
0100   return builder.Finish(out);
0101 }
0102 
0103 template <class ArrowType>
0104 ::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
0105                                                         std::shared_ptr<Array>* out) {
0106   std::vector<typename ArrowType::c_type> values;
0107   ::arrow::randint(size, 0, 24, &values);
0108   for (size_t i = 0; i < size; i++) {
0109     values[i] *= 86400000;
0110   }
0111 
0112   // Passing data type so this will work with TimestampType too
0113   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
0114                                              ::arrow::default_memory_pool());
0115   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
0116   return builder.Finish(out);
0117 }
0118 
0119 template <class ArrowType>
0120 ::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
0121     size_t size, std::shared_ptr<Array>* out) {
0122   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
0123   BuilderType builder;
0124   for (size_t i = 0; i < size; i++) {
0125     RETURN_NOT_OK(builder.Append("test-string"));
0126   }
0127   return builder.Finish(out);
0128 }
0129 
0130 template <typename ArrowType>
0131 ::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
0132     size_t size, std::shared_ptr<Array>* out) {
0133   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
0134   // set byte_width to the length of "fixed": 5
0135   // todo: find a way to generate test data with more diversity.
0136   BuilderType builder(::arrow::fixed_size_binary(5));
0137   for (size_t i = 0; i < size; i++) {
0138     RETURN_NOT_OK(builder.Append("fixed"));
0139   }
0140   return builder.Finish(out);
0141 }
0142 
0143 template <int32_t byte_width>
0144 static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
0145   auto gen = ::arrow::random::RandomArrayGenerator(seed);
0146   std::shared_ptr<Array> decimals;
0147   if constexpr (byte_width == 4) {
0148     decimals = gen.Decimal32(::arrow::decimal32(precision, 0), n);
0149   } else if constexpr (byte_width == 8) {
0150     decimals = gen.Decimal64(::arrow::decimal64(precision, 0), n);
0151   } else if constexpr (byte_width == 16) {
0152     decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
0153   } else {
0154     decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
0155   }
0156   std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
0157 }
0158 
0159 template <typename ArrowType, int32_t precision = ArrowType::precision>
0160 ::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
0161                                                    typename ArrowType::type, precision>>,
0162                      Status>
0163 NonNullArray(size_t size, std::shared_ptr<Array>* out) {
0164   constexpr int32_t kDecimalPrecision = precision;
0165   constexpr int32_t kDecimalScale = ArrowType::scale;
0166 
0167   const auto type =
0168       std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
0169   const int32_t byte_width = type->byte_width();
0170 
0171   constexpr int32_t seed = 0;
0172 
0173   ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
0174   random_decimals<ArrowType::type::kByteWidth>(size, seed, kDecimalPrecision,
0175                                                out_buf->mutable_data());
0176 
0177   using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
0178   Builder builder(type);
0179   RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
0180   return builder.Finish(out);
0181 }
0182 
0183 template <class ArrowType>
0184 ::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
0185                                                            std::shared_ptr<Array>* out) {
0186   std::vector<uint8_t> values;
0187   ::arrow::randint(size, 0, 1, &values);
0188   ::arrow::BooleanBuilder builder;
0189   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
0190   return builder.Finish(out);
0191 }
0192 
0193 // This helper function only supports (size/2) nulls.
0194 template <typename ArrowType>
0195 ::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
0196     size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
0197   using c_type = typename ArrowType::c_type;
0198   std::vector<c_type> values;
0199   if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
0200     values.resize(size);
0201     test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(-1e4f),
0202                                  ::arrow::util::Float16(1e4f), values.data());
0203   } else {
0204     ::arrow::random_real(size, seed, static_cast<c_type>(-1e10),
0205                          static_cast<c_type>(1e10), &values);
0206   }
0207   std::vector<uint8_t> valid_bytes(size, 1);
0208 
0209   for (size_t i = 0; i < num_nulls; i++) {
0210     valid_bytes[i * 2] = 0;
0211   }
0212 
0213   ::arrow::NumericBuilder<ArrowType> builder;
0214   if (values.size() > 0) {
0215     RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
0216   }
0217   return builder.Finish(out);
0218 }
0219 
0220 // This helper function only supports (size/2) nulls.
0221 template <typename ArrowType>
0222 ::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
0223                                                             uint32_t seed,
0224                                                             std::shared_ptr<Array>* out) {
0225   std::vector<typename ArrowType::c_type> values;
0226 
0227   // Seed is random in Arrow right now
0228   (void)seed;
0229   ::arrow::randint(size, 0, 64, &values);
0230   std::vector<uint8_t> valid_bytes(size, 1);
0231 
0232   for (size_t i = 0; i < num_nulls; i++) {
0233     valid_bytes[i * 2] = 0;
0234   }
0235 
0236   // Passing data type so this will work with TimestampType too
0237   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
0238                                              ::arrow::default_memory_pool());
0239   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
0240   return builder.Finish(out);
0241 }
0242 
0243 template <typename ArrowType>
0244 ::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
0245                                                          uint32_t seed,
0246                                                          std::shared_ptr<Array>* out) {
0247   std::vector<typename ArrowType::c_type> values;
0248 
0249   // Seed is random in Arrow right now
0250   (void)seed;
0251   ::arrow::randint(size, 0, 24, &values);
0252   for (size_t i = 0; i < size; i++) {
0253     values[i] *= 86400000;
0254   }
0255   std::vector<uint8_t> valid_bytes(size, 1);
0256 
0257   for (size_t i = 0; i < num_nulls; i++) {
0258     valid_bytes[i * 2] = 0;
0259   }
0260 
0261   // Passing data type so this will work with TimestampType too
0262   ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
0263                                              ::arrow::default_memory_pool());
0264   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
0265   return builder.Finish(out);
0266 }
0267 
0268 // This helper function only supports (size/2) nulls yet.
0269 template <typename ArrowType>
0270 ::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
0271     size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
0272   std::vector<uint8_t> valid_bytes(size, 1);
0273 
0274   for (size_t i = 0; i < num_nulls; i++) {
0275     valid_bytes[i * 2] = 0;
0276   }
0277 
0278   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
0279   BuilderType builder;
0280 
0281   const int kBufferSize = 10;
0282   uint8_t buffer[kBufferSize];
0283   for (size_t i = 0; i < size; i++) {
0284     if (!valid_bytes[i]) {
0285       RETURN_NOT_OK(builder.AppendNull());
0286     } else {
0287       ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
0288       if (ArrowType::is_utf8) {
0289         // Trivially force data to be valid UTF8 by making it all ASCII
0290         for (auto& byte : buffer) {
0291           byte &= 0x7f;
0292         }
0293       }
0294       RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
0295     }
0296   }
0297   return builder.Finish(out);
0298 }
0299 
0300 // This helper function only supports (size/2) nulls yet,
0301 // same as NullableArray<String|Binary>(..)
0302 template <typename ArrowType>
0303 ::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
0304     size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
0305   std::vector<uint8_t> valid_bytes(size, 1);
0306 
0307   for (size_t i = 0; i < num_nulls; i++) {
0308     valid_bytes[i * 2] = 0;
0309   }
0310 
0311   using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
0312   const int byte_width = 10;
0313   BuilderType builder(::arrow::fixed_size_binary(byte_width));
0314 
0315   const int kBufferSize = byte_width;
0316   uint8_t buffer[kBufferSize];
0317   for (size_t i = 0; i < size; i++) {
0318     if (!valid_bytes[i]) {
0319       RETURN_NOT_OK(builder.AppendNull());
0320     } else {
0321       ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
0322       RETURN_NOT_OK(builder.Append(buffer));
0323     }
0324   }
0325   return builder.Finish(out);
0326 }
0327 
0328 template <typename ArrowType, int32_t precision = ArrowType::precision>
0329 ::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
0330                                                    typename ArrowType::type, precision>>,
0331                      Status>
0332 NullableArray(size_t size, size_t num_nulls, uint32_t seed,
0333               std::shared_ptr<::arrow::Array>* out) {
0334   std::vector<uint8_t> valid_bytes(size, '\1');
0335 
0336   for (size_t i = 0; i < num_nulls; ++i) {
0337     valid_bytes[i * 2] = '\0';
0338   }
0339 
0340   constexpr int32_t kDecimalPrecision = precision;
0341   constexpr int32_t kDecimalScale = ArrowType::scale;
0342 
0343   const auto type =
0344       std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
0345   const int32_t byte_width = type->byte_width();
0346 
0347   ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
0348   random_decimals<ArrowType::type::kByteWidth>(size, seed, precision,
0349                                                out_buf->mutable_data());
0350 
0351   using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
0352   Builder builder(type);
0353   RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
0354   return builder.Finish(out);
0355 }
0356 
0357 // This helper function only supports (size/2) nulls yet.
0358 template <class ArrowType>
0359 ::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
0360                                                             uint32_t seed,
0361                                                             std::shared_ptr<Array>* out) {
0362   std::vector<uint8_t> values;
0363 
0364   // Seed is random in Arrow right now
0365   (void)seed;
0366 
0367   ::arrow::randint(size, 0, 1, &values);
0368   std::vector<uint8_t> valid_bytes(size, 1);
0369 
0370   for (size_t i = 0; i < num_nulls; i++) {
0371     valid_bytes[i * 2] = 0;
0372   }
0373 
0374   ::arrow::BooleanBuilder builder;
0375   RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
0376   return builder.Finish(out);
0377 }
0378 
0379 /// Wrap an Array into a ListArray by splitting it up into size lists.
0380 ///
0381 /// This helper function only supports (size/2) nulls.
0382 Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
0383                      int64_t null_count, const std::string& item_name,
0384                      bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
0385   // We always include an empty list
0386   int64_t non_null_entries = size - null_count - 1;
0387   int64_t length_per_entry = values->length() / non_null_entries;
0388 
0389   auto offsets = AllocateBuffer();
0390   RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
0391   int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
0392 
0393   auto null_bitmap = AllocateBuffer();
0394   int64_t bitmap_size = ::arrow::bit_util::BytesForBits(size);
0395   RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
0396   uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
0397   memset(null_bitmap_ptr, 0, bitmap_size);
0398 
0399   int32_t current_offset = 0;
0400   for (int64_t i = 0; i < size; i++) {
0401     offsets_ptr[i] = current_offset;
0402     if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
0403       // Non-null list (list with index 1 is always empty).
0404       ::arrow::bit_util::SetBit(null_bitmap_ptr, i);
0405       if (i != 1) {
0406         current_offset += static_cast<int32_t>(length_per_entry);
0407       }
0408     }
0409   }
0410   offsets_ptr[size] = static_cast<int32_t>(values->length());
0411 
0412   auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
0413   *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
0414                                               values, null_bitmap, null_count);
0415 
0416   return Status::OK();
0417 }
0418 
0419 // Make an array containing only empty lists, with a null values array
0420 Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
0421   // Allocate an offsets buffer containing only zeroes
0422   const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
0423   ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
0424   memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
0425 
0426   auto value_field =
0427       ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
0428   auto list_type = ::arrow::list(value_field);
0429 
0430   std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
0431                                                         nullptr /* values */};
0432   auto child_data =
0433       ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
0434 
0435   std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
0436                                                   std::move(offsets_buffer)};
0437   auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
0438   array_data->child_data.push_back(child_data);
0439 
0440   *out_array = ::arrow::MakeArray(array_data);
0441   return Status::OK();
0442 }
0443 
0444 std::shared_ptr<::arrow::Table> MakeSimpleTable(
0445     const std::shared_ptr<ChunkedArray>& values, bool nullable) {
0446   auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
0447   return ::arrow::Table::Make(schema, {values});
0448 }
0449 
0450 std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
0451                                                 bool nullable) {
0452   auto carr = std::make_shared<::arrow::ChunkedArray>(values);
0453   return MakeSimpleTable(carr, nullable);
0454 }
0455 
0456 template <typename T>
0457 void ExpectArray(T* expected, Array* result) {
0458   auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
0459   for (int i = 0; i < result->length(); i++) {
0460     EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
0461   }
0462 }
0463 
0464 template <typename ArrowType>
0465 void ExpectArrayT(void* expected, Array* result) {
0466   ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
0467   for (int64_t i = 0; i < result->length(); i++) {
0468     EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
0469               reinterpret_cast<const typename ArrowType::c_type*>(
0470                   p_array->values()->data())[i]);
0471   }
0472 }
0473 
0474 template <>
0475 void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
0476   ::arrow::BooleanBuilder builder;
0477   ARROW_EXPECT_OK(
0478       builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));
0479 
0480   std::shared_ptr<Array> expected_array;
0481   ARROW_EXPECT_OK(builder.Finish(&expected_array));
0482   EXPECT_TRUE(result->Equals(*expected_array));
0483 }
0484 
0485 }  // namespace arrow
0486 
0487 }  // namespace parquet