Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:27:00

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 #include <random>
0022 #include <sstream>
0023 #include <string>
0024 #include <string_view>
0025 #include <utility>
0026 #include <vector>
0027 
0028 #include "arrow/array.h"
0029 #include "arrow/array/builder_binary.h"
0030 #include "arrow/io/memory.h"
0031 #include "arrow/json/converter.h"
0032 #include "arrow/json/options.h"
0033 #include "arrow/json/parser.h"
0034 #include "arrow/json/rapidjson_defs.h"
0035 #include "arrow/testing/gtest_util.h"
0036 #include "arrow/type.h"
0037 #include "arrow/util/checked_cast.h"
0038 #include "arrow/visit_type_inline.h"
0039 
0040 #include "rapidjson/document.h"
0041 #include "rapidjson/prettywriter.h"
0042 #include "rapidjson/reader.h"
0043 #include "rapidjson/writer.h"
0044 
0045 namespace arrow {
0046 
0047 using internal::checked_cast;
0048 
0049 namespace json {
0050 
0051 namespace rj = arrow::rapidjson;
0052 
0053 using rj::StringBuffer;
0054 using std::string_view;
0055 using Writer = rj::Writer<StringBuffer>;
0056 
// Settings that steer the random JSON generators below.
struct GenerateOptions {
  // Chance that a given field is emitted at all
  double field_probability{1.0};
  // Chance that a generated value is a JSON null
  double null_probability{0.2};
  // When true, emitted fields are written in shuffled order
  bool randomize_field_order{false};

  // Returns an options object with every member at its default value.
  static constexpr GenerateOptions Defaults() { return {}; }
};
0067 
0068 inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); }
0069 
// Forward declaration (defined further down in this header): writes one random
// JSON value for `type` to `writer`, using `e` as the source of randomness.
// With probability `options.null_probability` a JSON null is written instead.
template <typename Engine>
inline static Status Generate(
    const std::shared_ptr<DataType>& type, Engine& e, Writer* writer,
    const GenerateOptions& options = GenerateOptions::Defaults());
0074 
// Forward declaration (defined further down in this header): writes one JSON
// object to `writer` whose members are drawn from `fields`. Each field is
// included with probability `options.field_probability`, optionally in
// shuffled order when `options.randomize_field_order` is set.
template <typename Engine>
inline static Status Generate(
    const std::vector<std::shared_ptr<Field>>& fields, Engine& e, Writer* writer,
    const GenerateOptions& options = GenerateOptions::Defaults());
0079 
0080 template <typename Engine>
0081 inline static Status Generate(
0082     const std::shared_ptr<Schema>& schm, Engine& e, Writer* writer,
0083     const GenerateOptions& options = GenerateOptions::Defaults()) {
0084   return Generate(schm->fields(), e, writer, options);
0085 }
0086 
0087 template <typename Engine>
0088 struct GenerateImpl {
0089   Status Visit(const NullType&) { return OK(writer.Null()); }
0090 
0091   Status Visit(const BooleanType&) {
0092     return OK(writer.Bool(std::uniform_int_distribution<uint16_t>{}(e)&1));
0093   }
0094 
0095   template <typename T>
0096   enable_if_physical_unsigned_integer<T, Status> Visit(const T&) {
0097     auto val = std::uniform_int_distribution<>{}(e);
0098     return OK(writer.Uint64(static_cast<typename T::c_type>(val)));
0099   }
0100 
0101   template <typename T>
0102   enable_if_physical_signed_integer<T, Status> Visit(const T&) {
0103     auto val = std::uniform_int_distribution<>{}(e);
0104     return OK(writer.Int64(static_cast<typename T::c_type>(val)));
0105   }
0106 
0107   template <typename T>
0108   enable_if_physical_floating_point<T, Status> Visit(const T&) {
0109     auto val = std::normal_distribution<typename T::c_type>{0, 1 << 10}(e);
0110     return OK(writer.Double(val));
0111   }
0112 
0113   Status GenerateAscii(const DataType&) {
0114     auto size = std::poisson_distribution<>{4}(e);
0115     std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
0116     std::string s(size, '\0');
0117     for (char& ch : s) ch = static_cast<char>(gen_char(e));
0118     return OK(writer.String(s.c_str()));
0119   }
0120 
0121   template <typename T>
0122   enable_if_base_binary<T, Status> Visit(const T& t) {
0123     return GenerateAscii(t);
0124   }
0125 
0126   Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
0127 
0128   template <typename T>
0129   enable_if_list_like<T, Status> Visit(const T& t) {
0130     auto size = std::poisson_distribution<>{4}(e);
0131     writer.StartArray();
0132     for (int i = 0; i < size; ++i) {
0133       RETURN_NOT_OK(Generate(t.value_type(), e, &writer, options));
0134     }
0135     return OK(writer.EndArray(size));
0136   }
0137 
0138   Status Visit(const ListViewType& t) { return NotImplemented(t); }
0139 
0140   Status Visit(const LargeListViewType& t) { return NotImplemented(t); }
0141 
0142   Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); }
0143 
0144   Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); }
0145 
0146   Status Visit(const MonthDayNanoIntervalType& t) { return NotImplemented(t); }
0147 
0148   Status Visit(const DictionaryType& t) { return NotImplemented(t); }
0149 
0150   Status Visit(const ExtensionType& t) { return NotImplemented(t); }
0151 
0152   Status Visit(const Decimal128Type& t) { return NotImplemented(t); }
0153 
0154   Status Visit(const FixedSizeBinaryType& t) { return NotImplemented(t); }
0155 
0156   Status Visit(const UnionType& t) { return NotImplemented(t); }
0157 
0158   Status Visit(const RunEndEncodedType& t) { return NotImplemented(t); }
0159 
0160   Status NotImplemented(const DataType& t) {
0161     return Status::NotImplemented("random generation of arrays of type ", t);
0162   }
0163 
0164   Engine& e;
0165   rj::Writer<rj::StringBuffer>& writer;
0166   const GenerateOptions& options;
0167 };
0168 
0169 template <typename Engine>
0170 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
0171                               Writer* writer, const GenerateOptions& options) {
0172   if (std::bernoulli_distribution(options.null_probability)(e)) {
0173     writer->Null();
0174     return Status::OK();
0175   }
0176   GenerateImpl<Engine> visitor = {e, *writer, options};
0177   return VisitTypeInline(*type, &visitor);
0178 }
0179 
0180 template <typename Engine>
0181 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
0182                               Engine& e, Writer* writer, const GenerateOptions& options) {
0183   RETURN_NOT_OK(OK(writer->StartObject()));
0184 
0185   int num_fields = 0;
0186   auto write_field = [&](const Field& f) {
0187     ++num_fields;
0188     writer->Key(f.name().c_str());
0189     return Generate(f.type(), e, writer, options);
0190   };
0191 
0192   std::bernoulli_distribution bool_dist(options.field_probability);
0193   if (options.randomize_field_order) {
0194     std::vector<size_t> indices;
0195     indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
0196     for (size_t i = 0; i < fields.size(); ++i) {
0197       if (bool_dist(e)) {
0198         indices.push_back(i);
0199       }
0200     }
0201     std::shuffle(indices.begin(), indices.end(), e);
0202     for (auto i : indices) {
0203       RETURN_NOT_OK(write_field(*fields[i]));
0204     }
0205   } else {
0206     for (const auto& f : fields) {
0207       if (bool_dist(e)) {
0208         RETURN_NOT_OK(write_field(*f));
0209       }
0210     }
0211   }
0212 
0213   return OK(writer->EndObject(num_fields));
0214 }
0215 
0216 inline static Status MakeStream(string_view src_str,
0217                                 std::shared_ptr<io::InputStream>* out) {
0218   auto src = std::make_shared<Buffer>(src_str);
0219   *out = std::make_shared<io::BufferReader>(src);
0220   return Status::OK();
0221 }
0222 
0223 // scalar values (numbers and strings) are parsed into a
0224 // dictionary<index:int32, value:string>. This can be decoded for ease of comparison
0225 inline static Status DecodeStringDictionary(const DictionaryArray& dict_array,
0226                                             std::shared_ptr<Array>* decoded) {
0227   const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary());
0228   const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices());
0229   StringBuilder builder;
0230   RETURN_NOT_OK(builder.Resize(indices.length()));
0231   for (int64_t i = 0; i < indices.length(); ++i) {
0232     if (indices.IsNull(i)) {
0233       builder.UnsafeAppendNull();
0234       continue;
0235     }
0236     auto value = dict.GetView(indices.GetView(i));
0237     RETURN_NOT_OK(builder.ReserveData(value.size()));
0238     builder.UnsafeAppend(value);
0239   }
0240   return builder.Finish(decoded);
0241 }
0242 
0243 inline static Status ParseFromString(ParseOptions options, string_view src_str,
0244                                      std::shared_ptr<Array>* parsed) {
0245   auto src = std::make_shared<Buffer>(src_str);
0246   std::unique_ptr<BlockParser> parser;
0247   RETURN_NOT_OK(BlockParser::Make(options, &parser));
0248   RETURN_NOT_OK(parser->Parse(src));
0249   return parser->Finish(parsed);
0250 }
0251 
0252 inline static Status ParseFromString(ParseOptions options, string_view src_str,
0253                                      std::shared_ptr<StructArray>* parsed) {
0254   std::shared_ptr<Array> parsed_non_struct;
0255   RETURN_NOT_OK(ParseFromString(options, src_str, &parsed_non_struct));
0256   *parsed = internal::checked_pointer_cast<StructArray>(parsed_non_struct);
0257   return Status::OK();
0258 }
0259 
0260 static inline std::string PrettyPrint(string_view one_line) {
0261   rj::Document document;
0262 
0263   // Must pass size to avoid ASAN issues.
0264   document.Parse(one_line.data(), one_line.size());
0265   rj::StringBuffer sb;
0266   rj::PrettyWriter<rj::StringBuffer> writer(sb);
0267   document.Accept(writer);
0268   return sb.GetString();
0269 }
0270 
// Build newline-delimited JSON with one object per value, each of the form
// {"<name>":<std::to_string(value)>}. Only participates in overload resolution
// for element types accepted by std::to_string.
template <typename T>
std::string RowsOfOneColumn(std::string_view name, std::initializer_list<T> values,
                            decltype(std::to_string(*values.begin()))* = nullptr) {
  std::stringstream rows;
  for (auto it = values.begin(); it != values.end(); ++it) {
    rows << R"({")" << name << R"(":)";
    rows << std::to_string(*it);
    rows << "}\n";
  }
  return rows.str();
}
0280 
// Build newline-delimited JSON with one object per value, each of the form
// {"<name>":<value>}. Values are inserted verbatim (no quoting or escaping is
// applied), so callers pass already-serialized JSON fragments.
inline std::string RowsOfOneColumn(std::string_view name,
                                   std::initializer_list<std::string> values) {
  std::stringstream ss;
  // Iterate by const reference: copying each std::string is unnecessary.
  for (const auto& value : values) {
    ss << R"({")" << name << R"(":)" << value << "}\n";
  }
  return ss.str();
}
0289 
// Sample input: four rows of scalar-only fields ("hello", "world", "yo"), with
// some values null and "yo" absent from one row.
inline static std::string scalars_only_src() {
  const std::string text = R"(
    { "hello": 3.5, "world": false, "yo": "thing" }
    { "hello": 3.25, "world": null }
    { "hello": 3.125, "world": null, "yo": "\u5fcd" }
    { "hello": 0.0, "world": true, "yo": null }
  )";
  return text;
}
0298 
// Sample input: four rows mixing scalar fields with a list field ("arr") and a
// struct field ("nuf"), including null and empty nested values.
inline static std::string nested_src() {
  const std::string text = R"(
    { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
    { "hello": 3.25, "world": null, "arr": [2], "nuf": null }
    { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
    { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
  )";
  return text;
}
0307 
// Sample input: two rows whose fields hold only nulls, empty lists, a list
// containing a null, and structs that are empty or contain a null.
inline static std::string null_src() {
  const std::string text = R"(
    { "plain": null, "list1": [], "list2": [], "struct": { "plain": null } }
    { "plain": null, "list1": [], "list2": [null], "struct": {} }
  )";
  return text;
}
0314 
// Sample input: two rows whose "price" and "cost" values are written as bare
// (unquoted) JSON numbers.
inline static std::string unquoted_decimal_src() {
  const std::string text = R"(
    { "price": 30.04, "cost":30.001 }
    { "price": 1.23, "cost":1.229 }
  )";
  return text;
}
0321 
// Sample input: two rows where "price" and "cost" appear once as bare JSON
// numbers and once as quoted strings.
inline static std::string mixed_decimal_src() {
  const std::string text = R"(
    { "price": 30.04, "cost": 30.001 }
    { "price": "1.23", "cost": "1.229" }
  )";
  return text;
}
0328 
0329 }  // namespace json
0330 }  // namespace arrow