File indexing completed on 2025-08-28 08:27:00
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <memory>
0021 #include <random>
0022 #include <sstream>
0023 #include <string>
0024 #include <string_view>
0025 #include <utility>
0026 #include <vector>
0027
0028 #include "arrow/array.h"
0029 #include "arrow/array/builder_binary.h"
0030 #include "arrow/io/memory.h"
0031 #include "arrow/json/converter.h"
0032 #include "arrow/json/options.h"
0033 #include "arrow/json/parser.h"
0034 #include "arrow/json/rapidjson_defs.h"
0035 #include "arrow/testing/gtest_util.h"
0036 #include "arrow/type.h"
0037 #include "arrow/util/checked_cast.h"
0038 #include "arrow/visit_type_inline.h"
0039
0040 #include "rapidjson/document.h"
0041 #include "rapidjson/prettywriter.h"
0042 #include "rapidjson/reader.h"
0043 #include "rapidjson/writer.h"
0044
0045 namespace arrow {
0046
0047 using internal::checked_cast;
0048
0049 namespace json {
0050
/// Short alias for the rapidjson namespace nested under arrow.
0051 namespace rj = arrow::rapidjson;
0052
0053 using rj::StringBuffer;
0054 using std::string_view;
/// JSON writer that emits into an in-memory rapidjson StringBuffer.
0055 using Writer = rj::Writer<StringBuffer>;
0056
/// Knobs controlling the random JSON generated by the Generate() helpers.
struct GenerateOptions {
  /// Probability that any given field is written into a generated object.
  double field_probability = 1.0;

  /// Probability that a generated value is written as JSON null.
  double null_probability = 0.2;

  /// When true, the fields selected for an object are written in shuffled order.
  bool randomize_field_order = false;

  /// Returns an options object holding the default values declared above.
  static constexpr GenerateOptions Defaults() { return {}; }
};
0067
0068 inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); }
0069
/// Forward declaration: writes one randomly generated JSON value of `type` to
/// `writer`, using `e` as the source of randomness. Declared ahead of
/// GenerateImpl, which calls it for nested value types.
0070 template <typename Engine>
0071 inline static Status Generate(
0072 const std::shared_ptr<DataType>& type, Engine& e, Writer* writer,
0073 const GenerateOptions& options = GenerateOptions::Defaults());
0074
/// Forward declaration: writes one randomly generated JSON object whose keys
/// are drawn from `fields`, using `e` as the source of randomness.
0075 template <typename Engine>
0076 inline static Status Generate(
0077 const std::vector<std::shared_ptr<Field>>& fields, Engine& e, Writer* writer,
0078 const GenerateOptions& options = GenerateOptions::Defaults());
0079
0080 template <typename Engine>
0081 inline static Status Generate(
0082 const std::shared_ptr<Schema>& schm, Engine& e, Writer* writer,
0083 const GenerateOptions& options = GenerateOptions::Defaults()) {
0084 return Generate(schm->fields(), e, writer, options);
0085 }
0086
0087 template <typename Engine>
0088 struct GenerateImpl {
0089 Status Visit(const NullType&) { return OK(writer.Null()); }
0090
0091 Status Visit(const BooleanType&) {
0092 return OK(writer.Bool(std::uniform_int_distribution<uint16_t>{}(e)&1));
0093 }
0094
0095 template <typename T>
0096 enable_if_physical_unsigned_integer<T, Status> Visit(const T&) {
0097 auto val = std::uniform_int_distribution<>{}(e);
0098 return OK(writer.Uint64(static_cast<typename T::c_type>(val)));
0099 }
0100
0101 template <typename T>
0102 enable_if_physical_signed_integer<T, Status> Visit(const T&) {
0103 auto val = std::uniform_int_distribution<>{}(e);
0104 return OK(writer.Int64(static_cast<typename T::c_type>(val)));
0105 }
0106
0107 template <typename T>
0108 enable_if_physical_floating_point<T, Status> Visit(const T&) {
0109 auto val = std::normal_distribution<typename T::c_type>{0, 1 << 10}(e);
0110 return OK(writer.Double(val));
0111 }
0112
0113 Status GenerateAscii(const DataType&) {
0114 auto size = std::poisson_distribution<>{4}(e);
0115 std::uniform_int_distribution<uint16_t> gen_char(32, 126);
0116 std::string s(size, '\0');
0117 for (char& ch : s) ch = static_cast<char>(gen_char(e));
0118 return OK(writer.String(s.c_str()));
0119 }
0120
0121 template <typename T>
0122 enable_if_base_binary<T, Status> Visit(const T& t) {
0123 return GenerateAscii(t);
0124 }
0125
0126 Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
0127
0128 template <typename T>
0129 enable_if_list_like<T, Status> Visit(const T& t) {
0130 auto size = std::poisson_distribution<>{4}(e);
0131 writer.StartArray();
0132 for (int i = 0; i < size; ++i) {
0133 RETURN_NOT_OK(Generate(t.value_type(), e, &writer, options));
0134 }
0135 return OK(writer.EndArray(size));
0136 }
0137
0138 Status Visit(const ListViewType& t) { return NotImplemented(t); }
0139
0140 Status Visit(const LargeListViewType& t) { return NotImplemented(t); }
0141
0142 Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); }
0143
0144 Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); }
0145
0146 Status Visit(const MonthDayNanoIntervalType& t) { return NotImplemented(t); }
0147
0148 Status Visit(const DictionaryType& t) { return NotImplemented(t); }
0149
0150 Status Visit(const ExtensionType& t) { return NotImplemented(t); }
0151
0152 Status Visit(const Decimal128Type& t) { return NotImplemented(t); }
0153
0154 Status Visit(const FixedSizeBinaryType& t) { return NotImplemented(t); }
0155
0156 Status Visit(const UnionType& t) { return NotImplemented(t); }
0157
0158 Status Visit(const RunEndEncodedType& t) { return NotImplemented(t); }
0159
0160 Status NotImplemented(const DataType& t) {
0161 return Status::NotImplemented("random generation of arrays of type ", t);
0162 }
0163
0164 Engine& e;
0165 rj::Writer<rj::StringBuffer>& writer;
0166 const GenerateOptions& options;
0167 };
0168
0169 template <typename Engine>
0170 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
0171 Writer* writer, const GenerateOptions& options) {
0172 if (std::bernoulli_distribution(options.null_probability)(e)) {
0173 writer->Null();
0174 return Status::OK();
0175 }
0176 GenerateImpl<Engine> visitor = {e, *writer, options};
0177 return VisitTypeInline(*type, &visitor);
0178 }
0179
0180 template <typename Engine>
0181 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
0182 Engine& e, Writer* writer, const GenerateOptions& options) {
0183 RETURN_NOT_OK(OK(writer->StartObject()));
0184
0185 int num_fields = 0;
0186 auto write_field = [&](const Field& f) {
0187 ++num_fields;
0188 writer->Key(f.name().c_str());
0189 return Generate(f.type(), e, writer, options);
0190 };
0191
0192 std::bernoulli_distribution bool_dist(options.field_probability);
0193 if (options.randomize_field_order) {
0194 std::vector<size_t> indices;
0195 indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
0196 for (size_t i = 0; i < fields.size(); ++i) {
0197 if (bool_dist(e)) {
0198 indices.push_back(i);
0199 }
0200 }
0201 std::shuffle(indices.begin(), indices.end(), e);
0202 for (auto i : indices) {
0203 RETURN_NOT_OK(write_field(*fields[i]));
0204 }
0205 } else {
0206 for (const auto& f : fields) {
0207 if (bool_dist(e)) {
0208 RETURN_NOT_OK(write_field(*f));
0209 }
0210 }
0211 }
0212
0213 return OK(writer->EndObject(num_fields));
0214 }
0215
0216 inline static Status MakeStream(string_view src_str,
0217 std::shared_ptr<io::InputStream>* out) {
0218 auto src = std::make_shared<Buffer>(src_str);
0219 *out = std::make_shared<io::BufferReader>(src);
0220 return Status::OK();
0221 }
0222
0223
0224
0225 inline static Status DecodeStringDictionary(const DictionaryArray& dict_array,
0226 std::shared_ptr<Array>* decoded) {
0227 const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary());
0228 const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices());
0229 StringBuilder builder;
0230 RETURN_NOT_OK(builder.Resize(indices.length()));
0231 for (int64_t i = 0; i < indices.length(); ++i) {
0232 if (indices.IsNull(i)) {
0233 builder.UnsafeAppendNull();
0234 continue;
0235 }
0236 auto value = dict.GetView(indices.GetView(i));
0237 RETURN_NOT_OK(builder.ReserveData(value.size()));
0238 builder.UnsafeAppend(value);
0239 }
0240 return builder.Finish(decoded);
0241 }
0242
0243 inline static Status ParseFromString(ParseOptions options, string_view src_str,
0244 std::shared_ptr<Array>* parsed) {
0245 auto src = std::make_shared<Buffer>(src_str);
0246 std::unique_ptr<BlockParser> parser;
0247 RETURN_NOT_OK(BlockParser::Make(options, &parser));
0248 RETURN_NOT_OK(parser->Parse(src));
0249 return parser->Finish(parsed);
0250 }
0251
0252 inline static Status ParseFromString(ParseOptions options, string_view src_str,
0253 std::shared_ptr<StructArray>* parsed) {
0254 std::shared_ptr<Array> parsed_non_struct;
0255 RETURN_NOT_OK(ParseFromString(options, src_str, &parsed_non_struct));
0256 *parsed = internal::checked_pointer_cast<StructArray>(parsed_non_struct);
0257 return Status::OK();
0258 }
0259
0260 static inline std::string PrettyPrint(string_view one_line) {
0261 rj::Document document;
0262
0263
0264 document.Parse(one_line.data(), one_line.size());
0265 rj::StringBuffer sb;
0266 rj::PrettyWriter<rj::StringBuffer> writer(sb);
0267 document.Accept(writer);
0268 return sb.GetString();
0269 }
0270
/// Builds newline-delimited JSON with one object per value, each of the form
/// {"<name>":<value>}, where the value is rendered with std::to_string.
///
/// The unnamed third parameter only restricts this overload to types T that
/// std::to_string accepts.
template <typename T>
std::string RowsOfOneColumn(std::string_view name, std::initializer_list<T> values,
                            decltype(std::to_string(*values.begin()))* = nullptr) {
  std::stringstream rows;
  for (const T& item : values) {
    const std::string rendered = std::to_string(item);
    rows << R"({")" << name << R"(":)" << rendered << "}\n";
  }
  return rows.str();
}
0280
/// Builds newline-delimited JSON with one object per value, each of the form
/// {"<name>":<value>}.
///
/// Each value is inserted verbatim, so it must already be valid JSON text
/// (for example, a string value has to include its own quotes).
inline std::string RowsOfOneColumn(std::string_view name,
                                   std::initializer_list<std::string> values) {
  std::stringstream ss;
  // Iterate by const reference; iterating by value would copy every string.
  for (const auto& value : values) {
    ss << R"({")" << name << R"(":)" << value << "}\n";
  }
  return ss.str();
}
0289
/// Returns four JSON rows containing only scalar fields: "hello" (numbers),
/// "world" (booleans or null) and "yo" (strings or null, absent from the
/// second row).
0290 inline static std::string scalars_only_src() {
0291 return R"(
0292 { "hello": 3.5, "world": false, "yo": "thing" }
0293 { "hello": 3.25, "world": null }
0294 { "hello": 3.125, "world": null, "yo": "\u5fcd" }
0295 { "hello": 0.0, "world": true, "yo": null }
0296 )";
0297 }
0298
/// Returns four JSON rows with the scalar fields "hello", "world" and "yo"
/// plus two nested fields: "arr" (a list of integers, empty list, or null) and
/// "nuf" (an object that is empty, null, or holds a single "ps" number).
0299 inline static std::string nested_src() {
0300 return R"(
0301 { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
0302 { "hello": 3.25, "world": null, "arr": [2], "nuf": null }
0303 { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
0304 { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
0305 )";
0306 }
0307
/// Returns two JSON rows in which every leaf value is null or absent: a null
/// "plain" field, empty or null-only lists, and a "struct" object that is
/// either empty or holds a null "plain" member.
0308 inline static std::string null_src() {
0309 return R"(
0310 { "plain": null, "list1": [], "list2": [], "struct": { "plain": null } }
0311 { "plain": null, "list1": [], "list2": [null], "struct": {} }
0312 )";
0313 }
0314
/// Returns two JSON rows whose "price" and "cost" values are written as
/// unquoted JSON numbers.
0315 inline static std::string unquoted_decimal_src() {
0316 return R"(
0317 { "price": 30.04, "cost":30.001 }
0318 { "price": 1.23, "cost":1.229 }
0319 )";
0320 }
0321
/// Returns two JSON rows for "price" and "cost": the first row uses unquoted
/// JSON numbers, the second uses the same kind of values as quoted strings.
0322 inline static std::string mixed_decimal_src() {
0323 return R"(
0324 { "price": 30.04, "cost": 30.001 }
0325 { "price": "1.23", "cost": "1.229" }
0326 )";
0327 }
0328
0329 }
0330 }