Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:56

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 // Eager evaluation convenience APIs for invoking common functions, including
0019 // necessary memory allocations
0020 
0021 #pragma once
0022 
0023 #include <vector>
0024 
0025 #include "arrow/compute/function_options.h"
0026 #include "arrow/datum.h"
0027 #include "arrow/result.h"
0028 #include "arrow/util/macros.h"
0029 #include "arrow/util/visibility.h"
0030 
0031 namespace arrow {
0032 
0033 class Array;
0034 
0035 namespace compute {
0036 
0037 class ExecContext;
0038 
0039 // ----------------------------------------------------------------------
0040 // Aggregate functions
0041 
0042 /// \addtogroup compute-concrete-options
0043 /// @{
0044 
0045 /// \brief Control general scalar aggregate kernel behavior
0046 ///
0047 /// The constructor defaults are skip_nulls = true (nulls ignored) and min_count = 1.
0048 class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
0049  public:
0050   explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
0051   static constexpr char const kTypeName[] = "ScalarAggregateOptions";
0052   static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
0053 
0054   /// If true (the default), null values are ignored. Otherwise, if any value is null,
0055   /// emit null.
0056   bool skip_nulls;
0057   /// If fewer than this many non-null values are observed, emit null.
0058   uint32_t min_count;
0059 };
0060 
0061 /// \brief Control count aggregate kernel behavior.
0062 ///
0063 /// By default, only non-null values are counted (mode = ONLY_VALID).
0064 class ARROW_EXPORT CountOptions : public FunctionOptions {
0065  public:
0066   enum CountMode {
0067     /// Count only non-null values.
0068     ONLY_VALID = 0,
0069     /// Count only null values.
0070     ONLY_NULL,
0071     /// Count both non-null and null values.
0072     ALL,
0073   };
0074   explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
0075   static constexpr char const kTypeName[] = "CountOptions";
0076   static CountOptions Defaults() { return CountOptions{}; }
0077 
0078   CountMode mode;  ///< which values are counted, see CountMode
0079 };
0080 
0081 /// \brief Control Mode kernel behavior
0082 ///
0083 /// Returns the top-n most common values and their counts.
0084 /// By default (n = 1), returns the most common value and its count.
0085 class ARROW_EXPORT ModeOptions : public FunctionOptions {
0086  public:
0087   explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
0088   static constexpr char const kTypeName[] = "ModeOptions";
0089   static ModeOptions Defaults() { return ModeOptions{}; }
0090 
0091   int64_t n = 1;  ///< number of most common values to return (top-n)
0092   /// If true (the default), null values are ignored. Otherwise, if any value is null,
0093   /// emit null.
0094   bool skip_nulls;
0095   /// If fewer than this many non-null values are observed, emit null.
0096   uint32_t min_count;
0097 };
0098 
0099 /// \brief Control Delta Degrees of Freedom (ddof) of the Variance and Stddev kernels
0100 ///
0101 /// The divisor used in calculations is N - ddof, where N is the number of elements.
0102 /// By default, ddof is zero, and population variance or stddev is returned.
0103 class ARROW_EXPORT VarianceOptions : public FunctionOptions {
0104  public:
0105   explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
0106   static constexpr char const kTypeName[] = "VarianceOptions";
0107   static VarianceOptions Defaults() { return VarianceOptions{}; }
0108 
0109   int ddof = 0;  ///< delta degrees of freedom; the divisor is N - ddof
0110   /// If true (the default), null values are ignored. Otherwise, if any value is null,
0111   /// emit null.
0112   bool skip_nulls;
0113   /// If fewer than this many non-null values are observed, emit null.
0114   uint32_t min_count;
0115 };
0116 
0117 /// \brief Control Quantile kernel behavior
0118 ///
0119 /// By default (q = 0.5), returns the median value.
0120 class ARROW_EXPORT QuantileOptions : public FunctionOptions {
0121  public:
0122   /// Interpolation method to use when a quantile lies between two data points
0123   enum Interpolation {
0124     LINEAR = 0,
0125     LOWER,
0126     HIGHER,
0127     NEAREST,
0128     MIDPOINT,
0129   };
0130 
0131   explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
0132                            bool skip_nulls = true, uint32_t min_count = 0);
0133 
0134   explicit QuantileOptions(std::vector<double> q,
0135                            enum Interpolation interpolation = LINEAR,
0136                            bool skip_nulls = true, uint32_t min_count = 0);
0137 
0138   static constexpr char const kTypeName[] = "QuantileOptions";
0139   static QuantileOptions Defaults() { return QuantileOptions{}; }
0140 
0141   /// Probability level(s) of the quantile(s); each must be between 0 and 1 inclusive.
0142   std::vector<double> q;
0143   enum Interpolation interpolation;  ///< see Interpolation
0144   /// If true (the default), null values are ignored. Otherwise, if any value is null,
0145   /// emit null.
0146   bool skip_nulls;
0147   /// If fewer than this many non-null values are observed, emit null.
0148   uint32_t min_count;
0149 };
0150 
0151 /// \brief Control TDigest approximate quantile kernel behavior
0152 ///
0153 /// By default (q = 0.5), returns the median value.
0154 class ARROW_EXPORT TDigestOptions : public FunctionOptions {
0155  public:
0156   explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
0157                           uint32_t buffer_size = 500, bool skip_nulls = true,
0158                           uint32_t min_count = 0);
0159   explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
0160                           uint32_t buffer_size = 500, bool skip_nulls = true,
0161                           uint32_t min_count = 0);
0162   static constexpr char const kTypeName[] = "TDigestOptions";
0163   static TDigestOptions Defaults() { return TDigestOptions{}; }
0164 
0165   /// Probability level(s) of the quantile(s); each must be between 0 and 1 inclusive.
0166   std::vector<double> q;
0167   /// Compression parameter, default 100.
0168   uint32_t delta;
0169   /// Input buffer size, default 500.
0170   uint32_t buffer_size;
0171   /// If true (the default), null values are ignored. Otherwise, if any value is null,
0172   /// emit null.
0173   bool skip_nulls;
0174   /// If fewer than this many non-null values are observed, emit null.
0175   uint32_t min_count;
0176 };
0177 
0178 /// \brief Control Index kernel behavior
0179 class ARROW_EXPORT IndexOptions : public FunctionOptions {
0180  public:
0181   explicit IndexOptions(std::shared_ptr<Scalar> value);
0182   // Default constructor for serialization
0183   IndexOptions();
0184   static constexpr char const kTypeName[] = "IndexOptions";
0185 
0186   std::shared_ptr<Scalar> value;  ///< the value to search for
0187 };
0188 
0189 /// \brief Configure a grouped aggregation
0190 struct ARROW_EXPORT Aggregate {
0191   Aggregate() = default;
0192   /// Construct from a function name, its options, the target fields and an output name.
0193   Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
0194             std::vector<FieldRef> target, std::string name = "")
0195       : function(std::move(function)),
0196         options(std::move(options)),
0197         target(std::move(target)),
0198         name(std::move(name)) {}
0199   /// Single-target overload: wraps target in a one-element vector and delegates.
0200   Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
0201             FieldRef target, std::string name = "")
0202       : Aggregate(std::move(function), std::move(options),
0203                   std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
0204   /// Single target without options (options is NULLPTR); name has no default here.
0205   Aggregate(std::string function, FieldRef target, std::string name)
0206       : Aggregate(std::move(function), /*options=*/NULLPTR,
0207                   std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
0208   /// No target fields and no options (options is NULLPTR).
0209   Aggregate(std::string function, std::string name)
0210       : Aggregate(std::move(function), /*options=*/NULLPTR,
0211                   /*target=*/std::vector<FieldRef>{}, std::move(name)) {}
0212 
0213   /// the name of the aggregation function
0214   std::string function;
0215 
0216   /// options for the aggregation function
0217   std::shared_ptr<FunctionOptions> options;
0218 
0219   /// zero or more fields to which aggregations will be applied
0220   std::vector<FieldRef> target;
0221 
0222   /// optional output field name for aggregations
0223   std::string name;
0224 };
0225 
0226 /// @}
0227 
0228 /// \brief Count values in an array.
0229 ///
0230 /// \param[in] datum the datum whose values are counted
0231 /// \param[in] options counting options, see CountOptions for more information
0232 /// \param[in] ctx the function execution context, optional
0233 /// \return the resulting datum
0234 ///
0235 /// \since 1.0.0
0236 /// \note API not yet finalized
0237 ARROW_EXPORT
0238 Result<Datum> Count(const Datum& datum,
0239                     const CountOptions& options = CountOptions::Defaults(),
0240                     ExecContext* ctx = NULLPTR);
0241 
0242 /// \brief Compute the mean of a numeric array.
0243 ///
0244 /// \param[in] value datum to compute the mean of, expecting Array
0245 /// \param[in] options see ScalarAggregateOptions for more information, optional
0246 /// \param[in] ctx the function execution context, optional
0247 /// \return datum of the computed mean as a DoubleScalar
0248 ///
0249 /// \since 1.0.0
0250 /// \note API not yet finalized
0251 ARROW_EXPORT
0252 Result<Datum> Mean(
0253     const Datum& value,
0254     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0255     ExecContext* ctx = NULLPTR);
0256 
0257 /// \brief Compute the product of values of a numeric array.
0258 ///
0259 /// \param[in] value datum to compute product of, expecting Array or ChunkedArray
0260 /// \param[in] options see ScalarAggregateOptions for more information, optional
0261 /// \param[in] ctx the function execution context, optional
0262 /// \return datum of the computed product as a Scalar
0263 ///
0264 /// \since 6.0.0
0265 /// \note API not yet finalized
0266 ARROW_EXPORT
0267 Result<Datum> Product(
0268     const Datum& value,
0269     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0270     ExecContext* ctx = NULLPTR);
0271 
0272 /// \brief Sum values of a numeric array.
0273 ///
0274 /// \param[in] value datum to sum, expecting Array or ChunkedArray
0275 /// \param[in] options see ScalarAggregateOptions for more information, optional
0276 /// \param[in] ctx the function execution context, optional
0277 /// \return datum of the computed sum as a Scalar
0278 ///
0279 /// \since 1.0.0
0280 /// \note API not yet finalized
0281 ARROW_EXPORT
0282 Result<Datum> Sum(
0283     const Datum& value,
0284     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0285     ExecContext* ctx = NULLPTR);
0286 
0287 /// \brief Calculate the first value of an array
0288 ///
0289 /// \param[in] value input datum, expecting Array or ChunkedArray
0290 /// \param[in] options see ScalarAggregateOptions for more information, optional
0291 /// \param[in] ctx the function execution context, optional
0292 /// \return datum of the computed first as a Scalar
0293 ///
0294 /// \since 13.0.0
0295 /// \note API not yet finalized
0296 ARROW_EXPORT
0297 Result<Datum> First(
0298     const Datum& value,
0299     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0300     ExecContext* ctx = NULLPTR);
0301 
0302 /// \brief Calculate the last value of an array
0303 ///
0304 /// \param[in] value input datum, expecting Array or ChunkedArray
0305 /// \param[in] options see ScalarAggregateOptions for more information, optional
0306 /// \param[in] ctx the function execution context, optional
0307 /// \return datum of the computed last as a Scalar
0308 ///
0309 /// \since 13.0.0
0310 /// \note API not yet finalized
0311 ARROW_EXPORT
0312 Result<Datum> Last(
0313     const Datum& value,
0314     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0315     ExecContext* ctx = NULLPTR);
0316 
0317 /// \brief Calculate the min / max of a numeric array
0318 ///
0319 /// This function returns both the min and max as a struct scalar, with type
0320 /// struct<min: T, max: T>, where T is the input type.
0321 ///
0322 /// \param[in] value input datum, expecting Array or ChunkedArray
0323 /// \param[in] options see ScalarAggregateOptions for more information, optional
0324 /// \param[in] ctx the function execution context, optional
0325 /// \return resulting datum as a struct<min: T, max: T> scalar
0326 ///
0327 /// \since 1.0.0
0328 /// \note API not yet finalized
0329 ARROW_EXPORT
0330 Result<Datum> MinMax(
0331     const Datum& value,
0332     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0333     ExecContext* ctx = NULLPTR);
0334 
0335 /// \brief Test whether any element in a boolean array evaluates to true.
0336 ///
0337 /// This function returns true if any of the elements in the array evaluates
0338 /// to true and false otherwise. Null values are ignored by default.
0339 /// If null values are taken into account by setting the ScalarAggregateOptions
0340 /// parameter skip_nulls = false, then Kleene logic is used.
0341 /// See KleeneOr for more details on Kleene logic.
0342 ///
0343 /// \param[in] value input datum, expecting a boolean array
0344 /// \param[in] options see ScalarAggregateOptions for more information, optional
0345 /// \param[in] ctx the function execution context, optional
0346 /// \return resulting datum as a BooleanScalar
0347 ///
0348 /// \since 3.0.0
0349 /// \note API not yet finalized
0350 ARROW_EXPORT
0351 Result<Datum> Any(
0352     const Datum& value,
0353     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0354     ExecContext* ctx = NULLPTR);
0355 
0356 /// \brief Test whether all elements in a boolean array evaluate to true.
0357 ///
0358 /// This function returns true if all of the elements in the array evaluate
0359 /// to true and false otherwise. Null values are ignored by default.
0360 /// If null values are taken into account by setting the ScalarAggregateOptions
0361 /// parameter skip_nulls = false, then Kleene logic is used.
0362 /// See KleeneAnd for more details on Kleene logic.
0363 ///
0364 /// \param[in] value input datum, expecting a boolean array
0365 /// \param[in] options see ScalarAggregateOptions for more information, optional
0366 /// \param[in] ctx the function execution context, optional
0367 /// \return resulting datum as a BooleanScalar
0368 ///
0369 /// \since 3.0.0
0370 /// \note API not yet finalized
0371 ARROW_EXPORT
0372 Result<Datum> All(
0373     const Datum& value,
0374     const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
0375     ExecContext* ctx = NULLPTR);
0376 
0377 /// \brief Calculate the modal (most common) value of a numeric array
0378 ///
0379 /// This function returns the top-n most common values and the number of times they
0380 /// occur as an array of `struct<mode: T, count: int64>`, where T is the input type.
0381 /// Values with larger counts are returned before smaller ones.
0382 /// If more than one value has the same count, the smaller value is returned first.
0383 ///
0384 /// \param[in] value input datum, expecting Array or ChunkedArray
0385 /// \param[in] options see ModeOptions for more information, optional
0386 /// \param[in] ctx the function execution context, optional
0387 /// \return resulting datum as an array of struct<mode: T, count: int64>
0388 ///
0389 /// \since 2.0.0
0390 /// \note API not yet finalized
0391 ARROW_EXPORT
0392 Result<Datum> Mode(const Datum& value,
0393                    const ModeOptions& options = ModeOptions::Defaults(),
0394                    ExecContext* ctx = NULLPTR);
0395 
0396 /// \brief Calculate the standard deviation of a numeric array
0397 ///
0398 /// \param[in] value input datum, expecting Array or ChunkedArray
0399 /// \param[in] options see VarianceOptions for more information, optional
0400 /// \param[in] ctx the function execution context, optional
0401 /// \return datum of the computed standard deviation as a DoubleScalar
0402 ///
0403 /// \since 2.0.0
0404 /// \note API not yet finalized
0405 ARROW_EXPORT
0406 Result<Datum> Stddev(const Datum& value,
0407                      const VarianceOptions& options = VarianceOptions::Defaults(),
0408                      ExecContext* ctx = NULLPTR);
0409 
0410 /// \brief Calculate the variance of a numeric array
0411 ///
0412 /// \param[in] value input datum, expecting Array or ChunkedArray
0413 /// \param[in] options see VarianceOptions for more information, optional
0414 /// \param[in] ctx the function execution context, optional
0415 /// \return datum of the computed variance as a DoubleScalar
0416 ///
0417 /// \since 2.0.0
0418 /// \note API not yet finalized
0419 ARROW_EXPORT
0420 Result<Datum> Variance(const Datum& value,
0421                        const VarianceOptions& options = VarianceOptions::Defaults(),
0422                        ExecContext* ctx = NULLPTR);
0423 
0424 /// \brief Calculate the quantiles of a numeric array
0425 ///
0426 /// \param[in] value input datum, expecting Array or ChunkedArray
0427 /// \param[in] options see QuantileOptions for more information, optional
0428 /// \param[in] ctx the function execution context, optional
0429 /// \return resulting datum as an array
0430 ///
0431 /// \since 4.0.0
0432 /// \note API not yet finalized
0433 ARROW_EXPORT
0434 Result<Datum> Quantile(const Datum& value,
0435                        const QuantileOptions& options = QuantileOptions::Defaults(),
0436                        ExecContext* ctx = NULLPTR);
0437 
0438 /// \brief Calculate approximate quantiles of a numeric array with the T-Digest algorithm
0439 ///
0440 /// \param[in] value input datum, expecting Array or ChunkedArray
0441 /// \param[in] options see TDigestOptions for more information, optional
0442 /// \param[in] ctx the function execution context, optional
0443 /// \return resulting datum as an array
0444 ///
0445 /// \since 4.0.0
0446 /// \note API not yet finalized
0447 ARROW_EXPORT
0448 Result<Datum> TDigest(const Datum& value,
0449                       const TDigestOptions& options = TDigestOptions::Defaults(),
0450                       ExecContext* ctx = NULLPTR);
0451 
0452 /// \brief Find the first index of a value in an array.
0453 ///
0454 /// \param[in] value The array to search.
0455 /// \param[in] options The value to search for. See IndexOptions.
0456 /// \param[in] ctx the function execution context, optional
0457 /// \return a Scalar containing the index (or -1 if not found).
0458 ///
0459 /// \since 5.0.0
0460 /// \note API not yet finalized
0461 ARROW_EXPORT
0462 Result<Datum> Index(const Datum& value, const IndexOptions& options,
0463                     ExecContext* ctx = NULLPTR);
0464 
0465 }  // namespace compute
0466 }  // namespace arrow