![]() |
|
|||
File indexing completed on 2025-08-28 08:26:56
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 // Eager evaluation convenience APIs for invoking common functions, including 0019 // necessary memory allocations 0020 0021 #pragma once 0022 0023 #include <vector> 0024 0025 #include "arrow/compute/function_options.h" 0026 #include "arrow/datum.h" 0027 #include "arrow/result.h" 0028 #include "arrow/util/macros.h" 0029 #include "arrow/util/visibility.h" 0030 0031 namespace arrow { 0032 0033 class Array; 0034 0035 namespace compute { 0036 0037 class ExecContext; 0038 0039 // ---------------------------------------------------------------------- 0040 // Aggregate functions 0041 0042 /// \addtogroup compute-concrete-options 0043 /// @{ 0044 0045 /// \brief Control general scalar aggregate kernel behavior 0046 /// 0047 /// By default, null values are ignored (skip_nulls = true). 0048 class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { 0049 public: 0050 explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1); 0051 static constexpr char const kTypeName[] = "ScalarAggregateOptions"; 0052 static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } 0053 0054 /// If true (the default), null values are ignored. Otherwise, if any value is null, 0055 /// emit null. 0056 bool skip_nulls; 0057 /// If less than this many non-null values are observed, emit null. 0058 uint32_t min_count; 0059 }; 0060 0061 /// \brief Control count aggregate kernel behavior. 0062 /// 0063 /// By default, only non-null values are counted. 0064 class ARROW_EXPORT CountOptions : public FunctionOptions { 0065 public: 0066 enum CountMode { 0067 /// Count only non-null values. 0068 ONLY_VALID = 0, 0069 /// Count only null values. 0070 ONLY_NULL, 0071 /// Count both non-null and null values. 0072 ALL, 0073 }; 0074 explicit CountOptions(CountMode mode = CountMode::ONLY_VALID); 0075 static constexpr char const kTypeName[] = "CountOptions"; 0076 static CountOptions Defaults() { return CountOptions{}; } 0077 0078 CountMode mode; 0079 }; 0080 0081 /// \brief Control Mode kernel behavior 0082 /// 0083 /// Returns top-n common values and counts. 0084 /// By default, returns the most common value and count. 0085 class ARROW_EXPORT ModeOptions : public FunctionOptions { 0086 public: 0087 explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0); 0088 static constexpr char const kTypeName[] = "ModeOptions"; 0089 static ModeOptions Defaults() { return ModeOptions{}; } 0090 0091 int64_t n = 1; 0092 /// If true (the default), null values are ignored. Otherwise, if any value is null, 0093 /// emit null. 0094 bool skip_nulls; 0095 /// If less than this many non-null values are observed, emit null. 0096 uint32_t min_count; 0097 }; 0098 0099 /// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel 0100 /// 0101 /// The divisor used in calculations is N - ddof, where N is the number of elements. 0102 /// By default, ddof is zero, and population variance or stddev is returned. 0103 class ARROW_EXPORT VarianceOptions : public FunctionOptions { 0104 public: 0105 explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0); 0106 static constexpr char const kTypeName[] = "VarianceOptions"; 0107 static VarianceOptions Defaults() { return VarianceOptions{}; } 0108 0109 int ddof = 0; 0110 /// If true (the default), null values are ignored. Otherwise, if any value is null, 0111 /// emit null. 0112 bool skip_nulls; 0113 /// If less than this many non-null values are observed, emit null. 0114 uint32_t min_count; 0115 }; 0116 0117 /// \brief Control Quantile kernel behavior 0118 /// 0119 /// By default, returns the median value. 0120 class ARROW_EXPORT QuantileOptions : public FunctionOptions { 0121 public: 0122 /// Interpolation method to use when quantile lies between two data points 0123 enum Interpolation { 0124 LINEAR = 0, 0125 LOWER, 0126 HIGHER, 0127 NEAREST, 0128 MIDPOINT, 0129 }; 0130 0131 explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR, 0132 bool skip_nulls = true, uint32_t min_count = 0); 0133 0134 explicit QuantileOptions(std::vector<double> q, 0135 enum Interpolation interpolation = LINEAR, 0136 bool skip_nulls = true, uint32_t min_count = 0); 0137 0138 static constexpr char const kTypeName[] = "QuantileOptions"; 0139 static QuantileOptions Defaults() { return QuantileOptions{}; } 0140 0141 /// probability level of quantile must be between 0 and 1 inclusive 0142 std::vector<double> q; 0143 enum Interpolation interpolation; 0144 /// If true (the default), null values are ignored. Otherwise, if any value is null, 0145 /// emit null. 0146 bool skip_nulls; 0147 /// If less than this many non-null values are observed, emit null. 0148 uint32_t min_count; 0149 }; 0150 0151 /// \brief Control TDigest approximate quantile kernel behavior 0152 /// 0153 /// By default, returns the median value. 0154 class ARROW_EXPORT TDigestOptions : public FunctionOptions { 0155 public: 0156 explicit TDigestOptions(double q = 0.5, uint32_t delta = 100, 0157 uint32_t buffer_size = 500, bool skip_nulls = true, 0158 uint32_t min_count = 0); 0159 explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100, 0160 uint32_t buffer_size = 500, bool skip_nulls = true, 0161 uint32_t min_count = 0); 0162 static constexpr char const kTypeName[] = "TDigestOptions"; 0163 static TDigestOptions Defaults() { return TDigestOptions{}; } 0164 0165 /// probability level of quantile must be between 0 and 1 inclusive 0166 std::vector<double> q; 0167 /// compression parameter, default 100 0168 uint32_t delta; 0169 /// input buffer size, default 500 0170 uint32_t buffer_size; 0171 /// If true (the default), null values are ignored. Otherwise, if any value is null, 0172 /// emit null. 0173 bool skip_nulls; 0174 /// If less than this many non-null values are observed, emit null. 0175 uint32_t min_count; 0176 }; 0177 0178 /// \brief Control Index kernel behavior 0179 class ARROW_EXPORT IndexOptions : public FunctionOptions { 0180 public: 0181 explicit IndexOptions(std::shared_ptr<Scalar> value); 0182 // Default constructor for serialization 0183 IndexOptions(); 0184 static constexpr char const kTypeName[] = "IndexOptions"; 0185 0186 std::shared_ptr<Scalar> value; 0187 }; 0188 0189 /// \brief Configure a grouped aggregation 0190 struct ARROW_EXPORT Aggregate { 0191 Aggregate() = default; 0192 0193 Aggregate(std::string function, std::shared_ptr<FunctionOptions> options, 0194 std::vector<FieldRef> target, std::string name = "") 0195 : function(std::move(function)), 0196 options(std::move(options)), 0197 target(std::move(target)), 0198 name(std::move(name)) {} 0199 0200 Aggregate(std::string function, std::shared_ptr<FunctionOptions> options, 0201 FieldRef target, std::string name = "") 0202 : Aggregate(std::move(function), std::move(options), 0203 std::vector<FieldRef>{std::move(target)}, std::move(name)) {} 0204 0205 Aggregate(std::string function, FieldRef target, std::string name) 0206 : Aggregate(std::move(function), /*options=*/NULLPTR, 0207 std::vector<FieldRef>{std::move(target)}, std::move(name)) {} 0208 0209 Aggregate(std::string function, std::string name) 0210 : Aggregate(std::move(function), /*options=*/NULLPTR, 0211 /*target=*/std::vector<FieldRef>{}, std::move(name)) {} 0212 0213 /// the name of the aggregation function 0214 std::string function; 0215 0216 /// options for the aggregation function 0217 std::shared_ptr<FunctionOptions> options; 0218 0219 /// zero or more fields to which aggregations will be applied 0220 std::vector<FieldRef> target; 0221 0222 /// optional output field name for aggregations 0223 std::string name; 0224 }; 0225 0226 /// @} 0227 0228 /// \brief Count values in an array. 0229 /// 0230 /// \param[in] options counting options, see CountOptions for more information 0231 /// \param[in] datum to count 0232 /// \param[in] ctx the function execution context, optional 0233 /// \return out resulting datum 0234 /// 0235 /// \since 1.0.0 0236 /// \note API not yet finalized 0237 ARROW_EXPORT 0238 Result<Datum> Count(const Datum& datum, 0239 const CountOptions& options = CountOptions::Defaults(), 0240 ExecContext* ctx = NULLPTR); 0241 0242 /// \brief Compute the mean of a numeric array. 0243 /// 0244 /// \param[in] value datum to compute the mean, expecting Array 0245 /// \param[in] options see ScalarAggregateOptions for more information 0246 /// \param[in] ctx the function execution context, optional 0247 /// \return datum of the computed mean as a DoubleScalar 0248 /// 0249 /// \since 1.0.0 0250 /// \note API not yet finalized 0251 ARROW_EXPORT 0252 Result<Datum> Mean( 0253 const Datum& value, 0254 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0255 ExecContext* ctx = NULLPTR); 0256 0257 /// \brief Compute the product of values of a numeric array. 0258 /// 0259 /// \param[in] value datum to compute product of, expecting Array or ChunkedArray 0260 /// \param[in] options see ScalarAggregateOptions for more information 0261 /// \param[in] ctx the function execution context, optional 0262 /// \return datum of the computed sum as a Scalar 0263 /// 0264 /// \since 6.0.0 0265 /// \note API not yet finalized 0266 ARROW_EXPORT 0267 Result<Datum> Product( 0268 const Datum& value, 0269 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0270 ExecContext* ctx = NULLPTR); 0271 0272 /// \brief Sum values of a numeric array. 0273 /// 0274 /// \param[in] value datum to sum, expecting Array or ChunkedArray 0275 /// \param[in] options see ScalarAggregateOptions for more information 0276 /// \param[in] ctx the function execution context, optional 0277 /// \return datum of the computed sum as a Scalar 0278 /// 0279 /// \since 1.0.0 0280 /// \note API not yet finalized 0281 ARROW_EXPORT 0282 Result<Datum> Sum( 0283 const Datum& value, 0284 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0285 ExecContext* ctx = NULLPTR); 0286 0287 /// \brief Calculate the first value of an array 0288 /// 0289 /// \param[in] value input datum, expecting Array or ChunkedArray 0290 /// \param[in] options see ScalarAggregateOptions for more information 0291 /// \param[in] ctx the function execution context, optional 0292 /// \return datum of the computed first as Scalar 0293 /// 0294 /// \since 13.0.0 0295 /// \note API not yet finalized 0296 ARROW_EXPORT 0297 Result<Datum> First( 0298 const Datum& value, 0299 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0300 ExecContext* ctx = NULLPTR); 0301 0302 /// \brief Calculate the last value of an array 0303 /// 0304 /// \param[in] value input datum, expecting Array or ChunkedArray 0305 /// \param[in] options see ScalarAggregateOptions for more information 0306 /// \param[in] ctx the function execution context, optional 0307 /// \return datum of the computed last as a Scalar 0308 /// 0309 /// \since 13.0.0 0310 /// \note API not yet finalized 0311 ARROW_EXPORT 0312 Result<Datum> Last( 0313 const Datum& value, 0314 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0315 ExecContext* ctx = NULLPTR); 0316 0317 /// \brief Calculate the min / max of a numeric array 0318 /// 0319 /// This function returns both the min and max as a struct scalar, with type 0320 /// struct<min: T, max: T>, where T is the input type 0321 /// 0322 /// \param[in] value input datum, expecting Array or ChunkedArray 0323 /// \param[in] options see ScalarAggregateOptions for more information 0324 /// \param[in] ctx the function execution context, optional 0325 /// \return resulting datum as a struct<min: T, max: T> scalar 0326 /// 0327 /// \since 1.0.0 0328 /// \note API not yet finalized 0329 ARROW_EXPORT 0330 Result<Datum> MinMax( 0331 const Datum& value, 0332 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0333 ExecContext* ctx = NULLPTR); 0334 0335 /// \brief Test whether any element in a boolean array evaluates to true. 0336 /// 0337 /// This function returns true if any of the elements in the array evaluates 0338 /// to true and false otherwise. Null values are ignored by default. 0339 /// If null values are taken into account by setting ScalarAggregateOptions 0340 /// parameter skip_nulls = false then Kleene logic is used. 0341 /// See KleeneOr for more details on Kleene logic. 0342 /// 0343 /// \param[in] value input datum, expecting a boolean array 0344 /// \param[in] options see ScalarAggregateOptions for more information 0345 /// \param[in] ctx the function execution context, optional 0346 /// \return resulting datum as a BooleanScalar 0347 /// 0348 /// \since 3.0.0 0349 /// \note API not yet finalized 0350 ARROW_EXPORT 0351 Result<Datum> Any( 0352 const Datum& value, 0353 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0354 ExecContext* ctx = NULLPTR); 0355 0356 /// \brief Test whether all elements in a boolean array evaluate to true. 0357 /// 0358 /// This function returns true if all of the elements in the array evaluate 0359 /// to true and false otherwise. Null values are ignored by default. 0360 /// If null values are taken into account by setting ScalarAggregateOptions 0361 /// parameter skip_nulls = false then Kleene logic is used. 0362 /// See KleeneAnd for more details on Kleene logic. 0363 /// 0364 /// \param[in] value input datum, expecting a boolean array 0365 /// \param[in] options see ScalarAggregateOptions for more information 0366 /// \param[in] ctx the function execution context, optional 0367 /// \return resulting datum as a BooleanScalar 0368 0369 /// \since 3.0.0 0370 /// \note API not yet finalized 0371 ARROW_EXPORT 0372 Result<Datum> All( 0373 const Datum& value, 0374 const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), 0375 ExecContext* ctx = NULLPTR); 0376 0377 /// \brief Calculate the modal (most common) value of a numeric array 0378 /// 0379 /// This function returns top-n most common values and number of times they occur as 0380 /// an array of `struct<mode: T, count: int64>`, where T is the input type. 0381 /// Values with larger counts are returned before smaller ones. 0382 /// If there are more than one values with same count, smaller value is returned first. 0383 /// 0384 /// \param[in] value input datum, expecting Array or ChunkedArray 0385 /// \param[in] options see ModeOptions for more information 0386 /// \param[in] ctx the function execution context, optional 0387 /// \return resulting datum as an array of struct<mode: T, count: int64> 0388 /// 0389 /// \since 2.0.0 0390 /// \note API not yet finalized 0391 ARROW_EXPORT 0392 Result<Datum> Mode(const Datum& value, 0393 const ModeOptions& options = ModeOptions::Defaults(), 0394 ExecContext* ctx = NULLPTR); 0395 0396 /// \brief Calculate the standard deviation of a numeric array 0397 /// 0398 /// \param[in] value input datum, expecting Array or ChunkedArray 0399 /// \param[in] options see VarianceOptions for more information 0400 /// \param[in] ctx the function execution context, optional 0401 /// \return datum of the computed standard deviation as a DoubleScalar 0402 /// 0403 /// \since 2.0.0 0404 /// \note API not yet finalized 0405 ARROW_EXPORT 0406 Result<Datum> Stddev(const Datum& value, 0407 const VarianceOptions& options = VarianceOptions::Defaults(), 0408 ExecContext* ctx = NULLPTR); 0409 0410 /// \brief Calculate the variance of a numeric array 0411 /// 0412 /// \param[in] value input datum, expecting Array or ChunkedArray 0413 /// \param[in] options see VarianceOptions for more information 0414 /// \param[in] ctx the function execution context, optional 0415 /// \return datum of the computed variance as a DoubleScalar 0416 /// 0417 /// \since 2.0.0 0418 /// \note API not yet finalized 0419 ARROW_EXPORT 0420 Result<Datum> Variance(const Datum& value, 0421 const VarianceOptions& options = VarianceOptions::Defaults(), 0422 ExecContext* ctx = NULLPTR); 0423 0424 /// \brief Calculate the quantiles of a numeric array 0425 /// 0426 /// \param[in] value input datum, expecting Array or ChunkedArray 0427 /// \param[in] options see QuantileOptions for more information 0428 /// \param[in] ctx the function execution context, optional 0429 /// \return resulting datum as an array 0430 /// 0431 /// \since 4.0.0 0432 /// \note API not yet finalized 0433 ARROW_EXPORT 0434 Result<Datum> Quantile(const Datum& value, 0435 const QuantileOptions& options = QuantileOptions::Defaults(), 0436 ExecContext* ctx = NULLPTR); 0437 0438 /// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm 0439 /// 0440 /// \param[in] value input datum, expecting Array or ChunkedArray 0441 /// \param[in] options see TDigestOptions for more information 0442 /// \param[in] ctx the function execution context, optional 0443 /// \return resulting datum as an array 0444 /// 0445 /// \since 4.0.0 0446 /// \note API not yet finalized 0447 ARROW_EXPORT 0448 Result<Datum> TDigest(const Datum& value, 0449 const TDigestOptions& options = TDigestOptions::Defaults(), 0450 ExecContext* ctx = NULLPTR); 0451 0452 /// \brief Find the first index of a value in an array. 0453 /// 0454 /// \param[in] value The array to search. 0455 /// \param[in] options The array to search for. See IndexOptions. 0456 /// \param[in] ctx the function execution context, optional 0457 /// \return out a Scalar containing the index (or -1 if not found). 0458 /// 0459 /// \since 5.0.0 0460 /// \note API not yet finalized 0461 ARROW_EXPORT 0462 Result<Datum> Index(const Datum& value, const IndexOptions& options, 0463 ExecContext* ctx = NULLPTR); 0464 0465 } // namespace compute 0466 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |