![]() |
|
|||
File indexing completed on 2025-08-28 08:26:55
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <memory> 0021 #include <vector> 0022 0023 #include "arrow/compute/kernel.h" 0024 #include "arrow/datum.h" 0025 #include "arrow/result.h" 0026 #include "arrow/util/visibility.h" 0027 0028 namespace arrow { 0029 namespace compute { 0030 0031 /// \brief A segment 0032 /// A segment group is a chunk of continuous rows that have the same segment key. (For 0033 /// example, in ordered time series processing, segment key can be "date", and a segment 0034 /// group can be all the rows that belong to the same date.) A segment group can span 0035 /// across multiple exec batches. A segment is a chunk of continuous rows that has the 0036 /// same segment key within a given batch. When a segment group spans across batches, it 0037 /// will have multiple segments. A segment never spans across batches. The segment data 0038 /// structure only makes sense when used along with an exec batch. 
struct ARROW_EXPORT Segment {
  /// \brief the offset into the batch where the segment starts
  int64_t offset;
  /// \brief the length of the segment
  int64_t length;
  /// \brief whether the segment may be extended by a next one
  bool is_open;
  /// \brief whether the segment extends a preceding one
  bool extends;
};

/// \brief Field-wise equality: two segments are equal when offset, length, is_open and
/// extends all match.
inline bool operator==(const Segment& segment1, const Segment& segment2) {
  return segment1.offset == segment2.offset && segment1.length == segment2.length &&
         segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
}

/// \brief Inequality, defined as the negation of operator==.
inline bool operator!=(const Segment& segment1, const Segment& segment2) {
  return !(segment1 == segment2);
}

/// \brief a helper class to divide a batch into segments of equal values
///
/// For example, given a batch with two columns specified as segment keys:
///
/// A A [other columns]...
/// A A ...
/// A B ...
/// A B ...
/// A A ...
///
/// Then the batch could be divided into 3 segments.  The first would be rows 0 & 1,
/// the second would be rows 2 & 3, and the third would be row 4.
///
/// Further, a segmenter keeps track of the last value seen.  This allows it to calculate
/// segments which span batches.  In our above example the last segment we emit would set
/// the "open" flag, which indicates whether the segment may extend into the next batch.
///
/// If the next call to the segmenter starts with `A A` then that segment would set the
/// "extends" flag, which indicates whether the segment continues the last open segment.
0077 class ARROW_EXPORT RowSegmenter { 0078 public: 0079 virtual ~RowSegmenter() = default; 0080 0081 /// \brief Construct a Segmenter which segments on the specified key types 0082 /// 0083 /// \param[in] key_types the specified key types 0084 /// \param[in] nullable_keys whether values of the specified keys may be null 0085 /// \param[in] ctx the execution context to use 0086 static Result<std::unique_ptr<RowSegmenter>> Make( 0087 const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx); 0088 0089 /// \brief Return the key types of this segmenter 0090 virtual const std::vector<TypeHolder>& key_types() const = 0; 0091 0092 /// \brief Reset this segmenter 0093 /// 0094 /// A segmenter normally extends (see `Segment`) a segment from one batch to the next. 0095 /// If segment-extension is undesirable, for example when each batch is processed 0096 /// independently, then `Reset` should be invoked before processing the next batch. 0097 virtual Status Reset() = 0; 0098 0099 /// \brief Get the next segment for the given batch starting from the given offset 0100 /// DEPRECATED: Due to its inefficiency, use GetSegments instead. 0101 ARROW_DEPRECATED("Deprecated in 18.0.0. Use GetSegments instead.") 0102 virtual Result<Segment> GetNextSegment(const ExecSpan& batch, int64_t offset) = 0; 0103 0104 /// \brief Get all segments for the given batch 0105 virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0; 0106 }; 0107 0108 /// Consumes batches of keys and yields batches of the group ids. 0109 class ARROW_EXPORT Grouper { 0110 public: 0111 virtual ~Grouper() = default; 0112 0113 /// Construct a Grouper which receives the specified key types 0114 static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types, 0115 ExecContext* ctx = default_exec_context()); 0116 0117 /// Reset all intermediate state, make the grouper logically as just `Make`ed. 
0118 /// The underlying buffers, if any, may or may not be released though. 0119 virtual Status Reset() = 0; 0120 0121 /// Consume a batch of keys, producing the corresponding group ids as an integer array, 0122 /// over a slice defined by an offset and length, which defaults to the batch length. 0123 /// Currently only uint32 indices will be produced, eventually the bit width will only 0124 /// be as wide as necessary. 0125 virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0, 0126 int64_t length = -1) = 0; 0127 0128 /// Get current unique keys. May be called multiple times. 0129 virtual Result<ExecBatch> GetUniques() = 0; 0130 0131 /// Get the current number of groups. 0132 virtual uint32_t num_groups() const = 0; 0133 0134 /// \brief Assemble lists of indices of identical elements. 0135 /// 0136 /// \param[in] ids An unsigned, all-valid integral array which will be 0137 /// used as grouping criteria. 0138 /// \param[in] num_groups An upper bound for the elements of ids 0139 /// \param[in] ctx Execution context to use during the operation 0140 /// \return A num_groups-long ListArray where the slot at i contains a 0141 /// list of indices where i appears in ids. 0142 /// 0143 /// MakeGroupings([ 0144 /// 2, 0145 /// 2, 0146 /// 5, 0147 /// 5, 0148 /// 2, 0149 /// 3 0150 /// ], 8) == [ 0151 /// [], 0152 /// [], 0153 /// [0, 1, 4], 0154 /// [5], 0155 /// [], 0156 /// [2, 3], 0157 /// [], 0158 /// [] 0159 /// ] 0160 static Result<std::shared_ptr<ListArray>> MakeGroupings( 0161 const UInt32Array& ids, uint32_t num_groups, 0162 ExecContext* ctx = default_exec_context()); 0163 0164 /// \brief Produce a ListArray whose slots are selections of `array` which correspond to 0165 /// the provided groupings. 
0166 /// 0167 /// For example, 0168 /// ApplyGroupings([ 0169 /// [], 0170 /// [], 0171 /// [0, 1, 4], 0172 /// [5], 0173 /// [], 0174 /// [2, 3], 0175 /// [], 0176 /// [] 0177 /// ], [2, 2, 5, 5, 2, 3]) == [ 0178 /// [], 0179 /// [], 0180 /// [2, 2, 2], 0181 /// [3], 0182 /// [], 0183 /// [5, 5], 0184 /// [], 0185 /// [] 0186 /// ] 0187 static Result<std::shared_ptr<ListArray>> ApplyGroupings( 0188 const ListArray& groupings, const Array& array, 0189 ExecContext* ctx = default_exec_context()); 0190 }; 0191 0192 } // namespace compute 0193 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |