Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 #include <vector>
0022 
0023 #include "arrow/compute/kernel.h"
0024 #include "arrow/datum.h"
0025 #include "arrow/result.h"
0026 #include "arrow/util/visibility.h"
0027 
0028 namespace arrow {
0029 namespace compute {
0030 
0031 /// \brief A run of contiguous rows within a single batch that share one segment key
0032 ///
0033 /// A segment group is a chunk of contiguous rows sharing the same segment key; it may
0034 /// span multiple exec batches. (For example, in ordered time series processing the
0035 /// segment key can be "date", and a segment group is then all rows of the same date.)
0036 /// A segment is the part of a segment group that falls within a given batch, so a group
0037 /// spanning several batches has several segments. A segment never spans batches, and the
0038 /// Segment structure only makes sense when used along with an exec batch.
0039 struct ARROW_EXPORT Segment {
0040   /// \brief the offset into the batch at which the segment starts
0041   int64_t offset;
0042   /// \brief the length of the segment
0043   int64_t length;
0044   /// \brief whether the segment may be extended by a following one
0045   bool is_open;
0046   /// \brief whether the segment extends a preceding one
0047   bool extends;
0048 };
0049 
0050 inline bool operator==(const Segment& segment1, const Segment& segment2) {
0051   if (segment1.offset != segment2.offset || segment1.length != segment2.length) return false;
0052   return segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
0053 }
0054 inline bool operator!=(const Segment& segment1, const Segment& segment2) {
0055   return !operator==(segment1, segment2);  // strict negation of Segment equality
0056 }
0057 
0058 /// \brief a helper class to divide a batch into segments of equal values
0059 ///
0060 /// For example, given a batch with two columns specified as segment keys:
0061 ///
0062 /// A A [other columns]...
0063 /// A A ...
0064 /// A B ...
0065 /// A B ...
0066 /// A A ...
0067 ///
0068 /// Then the batch could be divided into 3 segments.  The first would be rows 0 & 1,
0069 /// the second would be rows 2 & 3, and the third would be row 4.
0070 ///
0071 /// Further, a segmenter keeps track of the last value seen.  This allows it to calculate
0072 /// segment groups that span batches.  In the example above, the last segment emitted
0073 /// would set the "open" flag, indicating that it may extend into the next batch.
0074 ///
0075 /// If the next call to the segmenter starts with `A A` then that segment would set the
0076 /// "extends" flag, which indicates whether the segment continues the last open segment.
0077 class ARROW_EXPORT RowSegmenter {
0078  public:
0079   virtual ~RowSegmenter() = default;
0080 
0081   /// \brief Construct a RowSegmenter which segments on the specified key types
0082   ///
0083   /// \param[in] key_types the types of the segment keys
0084   /// \param[in] nullable_keys whether values of the specified keys may be null
0085   /// \param[in] ctx the execution context to use
0086   static Result<std::unique_ptr<RowSegmenter>> Make(
0087       const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
0088 
0089   /// \brief Return the key types of this segmenter
0090   virtual const std::vector<TypeHolder>& key_types() const = 0;
0091 
0092   /// \brief Reset this segmenter
0093   ///
0094   /// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
0095   /// If segment extension is undesirable, for example when each batch is processed
0096   /// independently, then `Reset` should be invoked before processing the next batch.
0097   virtual Status Reset() = 0;
0098 
0099   /// \brief Get the next segment for the given batch starting from the given offset
0100   /// \deprecated Due to its inefficiency, use GetSegments instead.
0101   ARROW_DEPRECATED("Deprecated in 18.0.0. Use GetSegments instead.")
0102   virtual Result<Segment> GetNextSegment(const ExecSpan& batch, int64_t offset) = 0;
0103 
0104   /// \brief Get all the segments of the given batch
0105   virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
0106 };
0107 
0108 /// \brief Consumes batches of keys and yields batches of the corresponding group ids.
0109 class ARROW_EXPORT Grouper {
0110  public:
0111   virtual ~Grouper() = default;
0112 
0113   /// Construct a Grouper which receives keys of the specified types
0114   static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
0115                                                ExecContext* ctx = default_exec_context());
0116 
0117   /// Reset all intermediate state, making the grouper logically equivalent to a freshly
0118   /// `Make`d one. The underlying buffers, if any, may or may not be released though.
0119   virtual Status Reset() = 0;
0120 
0121   /// Consume a batch of keys, producing the corresponding group ids as an integer array,
0122   /// over a slice given by an offset and length; the length defaults to the batch length.
0123   /// Currently only uint32 indices will be produced; eventually the bit width will only
0124   /// be as wide as necessary.
0125   virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
0126                                 int64_t length = -1) = 0;
0127 
0128   /// Get the current unique keys. May be called multiple times.
0129   virtual Result<ExecBatch> GetUniques() = 0;
0130 
0131   /// Get the current number of groups.
0132   virtual uint32_t num_groups() const = 0;
0133 
0134   /// \brief Assemble lists of indices of identical elements.
0135   ///
0136   /// \param[in] ids An unsigned, all-valid integral array which will be
0137   ///                used as grouping criteria.
0138   /// \param[in] num_groups An upper bound for the elements of ids
0139   /// \param[in] ctx Execution context to use during the operation
0140   /// \return A num_groups-long ListArray where the slot at i contains a
0141   ///         list of the indices at which i appears in ids.
0142   /// For example,
0143   ///   MakeGroupings([
0144   ///       2,
0145   ///       2,
0146   ///       5,
0147   ///       5,
0148   ///       2,
0149   ///       3
0150   ///   ], 8) == [
0151   ///       [],
0152   ///       [],
0153   ///       [0, 1, 4],
0154   ///       [5],
0155   ///       [],
0156   ///       [2, 3],
0157   ///       [],
0158   ///       []
0159   ///   ]
0160   static Result<std::shared_ptr<ListArray>> MakeGroupings(
0161       const UInt32Array& ids, uint32_t num_groups,
0162       ExecContext* ctx = default_exec_context());
0163 
0164   /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
0165   /// the provided groupings.
0166   ///
0167   /// For example,
0168   ///   ApplyGroupings([
0169   ///       [],
0170   ///       [],
0171   ///       [0, 1, 4],
0172   ///       [5],
0173   ///       [],
0174   ///       [2, 3],
0175   ///       [],
0176   ///       []
0177   ///   ], [2, 2, 5, 5, 2, 3]) == [
0178   ///       [],
0179   ///       [],
0180   ///       [2, 2, 2],
0181   ///       [3],
0182   ///       [],
0183   ///       [5, 5],
0184   ///       [],
0185   ///       []
0186   ///   ]
0187   static Result<std::shared_ptr<ListArray>> ApplyGroupings(
0188       const ListArray& groupings, const Array& array,
0189       ExecContext* ctx = default_exec_context());
0190 };
0191 
0192 }  // namespace compute
0193 }  // namespace arrow