![]() |
|
|||
File indexing completed on 2025-08-28 08:26:55
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <limits> 0022 #include <memory> 0023 #include <utility> 0024 #include <vector> 0025 0026 #include "arrow/array.h" 0027 #include "arrow/array/builder_base.h" 0028 0029 namespace arrow { 0030 0031 /// \addtogroup run-end-encoded-builders 0032 /// 0033 /// @{ 0034 0035 namespace internal { 0036 0037 /// \brief An ArrayBuilder that deduplicates repeated values as they are 0038 /// appended to the inner-ArrayBuilder and reports the length of the current run 0039 /// of identical values. 0040 /// 0041 /// The following sequence of calls 0042 /// 0043 /// Append(2) 0044 /// Append(2) 0045 /// Append(2) 0046 /// Append(7) 0047 /// Append(7) 0048 /// Append(2) 0049 /// FinishInternal() 0050 /// 0051 /// will cause the inner-builder to receive only 3 Append calls 0052 /// 0053 /// Append(2) 0054 /// Append(7) 0055 /// Append(2) 0056 /// FinishInternal() 0057 /// 0058 /// Note that values returned by length(), null_count() and capacity() are 0059 /// related to the compressed array built by the inner-ArrayBuilder. 0060 class RunCompressorBuilder : public ArrayBuilder { 0061 public: 0062 RunCompressorBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> inner_builder, 0063 std::shared_ptr<DataType> type); 0064 0065 ~RunCompressorBuilder() override; 0066 0067 ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder); 0068 0069 /// \brief Called right before a run is being closed 0070 /// 0071 /// Subclasses can override this function to perform an additional action when 0072 /// a run is closed (i.e. run-length is known and value is appended to the 0073 /// inner builder). 0074 /// 0075 /// \param value can be NULLPTR if closing a run of NULLs 0076 /// \param length the greater than 0 length of the value run being closed 0077 virtual Status WillCloseRun(const std::shared_ptr<const Scalar>& value, 0078 int64_t length) { 0079 return Status::OK(); 0080 } 0081 0082 /// \brief Called right before a run of empty values is being closed 0083 /// 0084 /// Subclasses can override this function to perform an additional action when 0085 /// a run of empty values is appended (i.e. run-length is known and a single 0086 /// empty value is appended to the inner builder). 0087 /// 0088 /// \param length the greater than 0 length of the value run being closed 0089 virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); } 0090 0091 /// \brief Allocate enough memory for a given number of array elements. 0092 /// 0093 /// NOTE: Conservatively resizing a run-length compressed array for a given 0094 /// number of logical elements is not possible, since the physical length will 0095 /// vary depending on the values to be appended in the future. But we can 0096 /// pessimistically assume that each run will contain a single value and 0097 /// allocate that number of runs. 0098 Status Resize(int64_t capacity) override { return ResizePhysical(capacity); } 0099 0100 /// \brief Allocate enough memory for a given number of runs. 0101 /// 0102 /// Like Resize on non-encoded builders, it does not account for variable size 0103 /// data. 0104 Status ResizePhysical(int64_t capacity); 0105 0106 Status ReservePhysical(int64_t additional_capacity) { 0107 return Reserve(additional_capacity); 0108 } 0109 0110 void Reset() override; 0111 0112 Status AppendNull() final { return AppendNulls(1); } 0113 Status AppendNulls(int64_t length) override; 0114 0115 Status AppendEmptyValue() final { return AppendEmptyValues(1); } 0116 Status AppendEmptyValues(int64_t length) override; 0117 0118 Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override; 0119 Status AppendScalars(const ScalarVector& scalars) override; 0120 0121 // AppendArraySlice() is not implemented. 0122 0123 /// \brief Append a slice of an array containing values from already 0124 /// compressed runs. 0125 /// 0126 /// NOTE: WillCloseRun() is not called as the length of each run cannot be 0127 /// determined at this point. Caller should ensure that !has_open_run() by 0128 /// calling FinishCurrentRun() before calling this. 0129 /// 0130 /// Pre-condition: !has_open_run() 0131 Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset, 0132 int64_t length); 0133 0134 /// \brief Forces the closing of the current run if one is currently open. 0135 /// 0136 /// This can be called when one wants to ensure the current run will not be 0137 /// extended. This may cause identical values to appear close to each other in 0138 /// the underlying array (i.e. two runs that could be a single run) if more 0139 /// values are appended after this is called. 0140 /// 0141 /// Finish() and FinishInternal() call this automatically. 0142 virtual Status FinishCurrentRun(); 0143 0144 Status FinishInternal(std::shared_ptr<ArrayData>* out) override; 0145 0146 ArrayBuilder& inner_builder() const { return *inner_builder_; } 0147 0148 std::shared_ptr<DataType> type() const override { return inner_builder_->type(); } 0149 0150 bool has_open_run() const { return current_run_length_ > 0; } 0151 int64_t open_run_length() const { return current_run_length_; } 0152 0153 private: 0154 inline void UpdateDimensions() { 0155 capacity_ = inner_builder_->capacity(); 0156 length_ = inner_builder_->length(); 0157 null_count_ = inner_builder_->null_count(); 0158 } 0159 0160 private: 0161 std::shared_ptr<ArrayBuilder> inner_builder_; 0162 std::shared_ptr<const Scalar> current_value_ = NULLPTR; 0163 int64_t current_run_length_ = 0; 0164 }; 0165 0166 } // namespace internal 0167 0168 // ---------------------------------------------------------------------- 0169 // RunEndEncoded builder 0170 0171 /// \brief Run-end encoded array builder. 0172 /// 0173 /// NOTE: the value returned by and capacity() is related to the 0174 /// compressed array (physical) and not the decoded array (logical) that is 0175 /// run-end encoded. null_count() always returns 0. length(), on the other hand, 0176 /// returns the logical length of the run-end encoded array. 0177 class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder { 0178 private: 0179 // An internal::RunCompressorBuilder that produces a run-end in the 0180 // RunEndEncodedBuilder every time a value-run is closed. 0181 class ValueRunBuilder : public internal::RunCompressorBuilder { 0182 public: 0183 ValueRunBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder, 0184 const std::shared_ptr<DataType>& value_type, 0185 RunEndEncodedBuilder& ree_builder); 0186 0187 ~ValueRunBuilder() override = default; 0188 0189 Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override { 0190 return ree_builder_.CloseRun(length); 0191 } 0192 0193 Status WillCloseRunOfEmptyValues(int64_t length) override { 0194 return ree_builder_.CloseRun(length); 0195 } 0196 0197 private: 0198 RunEndEncodedBuilder& ree_builder_; 0199 }; 0200 0201 public: 0202 RunEndEncodedBuilder(MemoryPool* pool, 0203 const std::shared_ptr<ArrayBuilder>& run_end_builder, 0204 const std::shared_ptr<ArrayBuilder>& value_builder, 0205 std::shared_ptr<DataType> type); 0206 0207 /// \brief Allocate enough memory for a given number of array elements. 0208 /// 0209 /// NOTE: Conservatively resizing an REE for a given number of logical 0210 /// elements is not possible, since the physical length will vary depending on 0211 /// the values to be appended in the future. But we can pessimistically assume 0212 /// that each run will contain a single value and allocate that number of 0213 /// runs. 0214 Status Resize(int64_t capacity) override { return ResizePhysical(capacity); } 0215 0216 /// \brief Allocate enough memory for a given number of runs. 0217 Status ResizePhysical(int64_t capacity); 0218 0219 /// \brief Ensure that there is enough space allocated to append the indicated 0220 /// number of run without any further reallocation. Overallocation is 0221 /// used in order to minimize the impact of incremental ReservePhysical() calls. 0222 /// Note that additional_capacity is relative to the current number of elements 0223 /// rather than to the current capacity, so calls to Reserve() which are not 0224 /// interspersed with addition of new elements may not increase the capacity. 0225 /// 0226 /// \param[in] additional_capacity the number of additional runs 0227 /// \return Status 0228 Status ReservePhysical(int64_t additional_capacity) { 0229 return Reserve(additional_capacity); 0230 } 0231 0232 void Reset() override; 0233 0234 Status AppendNull() final { return AppendNulls(1); } 0235 Status AppendNulls(int64_t length) override; 0236 0237 Status AppendEmptyValue() final { return AppendEmptyValues(1); } 0238 Status AppendEmptyValues(int64_t length) override; 0239 Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override; 0240 Status AppendScalars(const ScalarVector& scalars) override; 0241 Status AppendArraySlice(const ArraySpan& array, int64_t offset, 0242 int64_t length) override; 0243 Status FinishInternal(std::shared_ptr<ArrayData>* out) override; 0244 0245 /// \cond FALSE 0246 using ArrayBuilder::Finish; 0247 /// \endcond 0248 0249 Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return FinishTyped(out); } 0250 0251 /// \brief Forces the closing of the current run if one is currently open. 0252 /// 0253 /// This can be called when one wants to ensure the current run will not be 0254 /// extended. This may cause identical values to appear close to each other in 0255 /// the values array (i.e. two runs that could be a single run) if more 0256 /// values are appended after this is called. 0257 Status FinishCurrentRun(); 0258 0259 std::shared_ptr<DataType> type() const override; 0260 0261 private: 0262 /// \brief Update physical capacity and logical length 0263 /// 0264 /// \param committed_logical_length number of logical values that have been 0265 /// committed to the values array 0266 /// \param open_run_length number of logical values in the currently open run if any 0267 inline void UpdateDimensions(int64_t committed_logical_length, 0268 int64_t open_run_length) { 0269 capacity_ = run_end_builder().capacity(); 0270 length_ = committed_logical_length + open_run_length; 0271 committed_logical_length_ = committed_logical_length; 0272 } 0273 0274 // Pre-condition: !value_run_builder_.has_open_run() 0275 template <typename RunEndCType> 0276 Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length); 0277 0278 template <typename RunEndCType> 0279 Status DoAppendRunEnd(int64_t run_end); 0280 0281 /// \brief Cast run_end to the appropriate type and appends it to the run_ends 0282 /// array. 0283 Status AppendRunEnd(int64_t run_end); 0284 0285 /// \brief Close a run by appending a value to the run_ends array and updating 0286 /// length_ to reflect the new run. 0287 /// 0288 /// Pre-condition: run_length > 0. 0289 [[nodiscard]] Status CloseRun(int64_t run_length); 0290 0291 ArrayBuilder& run_end_builder(); 0292 ArrayBuilder& value_builder(); 0293 0294 private: 0295 std::shared_ptr<RunEndEncodedType> type_; 0296 ValueRunBuilder* value_run_builder_; 0297 // The length not counting the current open run in the value_run_builder_ 0298 int64_t committed_logical_length_ = 0; 0299 }; 0300 0301 /// @} 0302 0303 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |