Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <limits>
0022 #include <memory>
0023 #include <utility>
0024 #include <vector>
0025 
0026 #include "arrow/array.h"
0027 #include "arrow/array/builder_base.h"
0028 
0029 namespace arrow {
0030 
0031 /// \addtogroup run-end-encoded-builders
0032 ///
0033 /// @{
0034 
0035 namespace internal {
0036 
0037 /// \brief An ArrayBuilder that deduplicates repeated values as they are
0038 /// appended to the inner-ArrayBuilder and reports the length of the current run
0039 /// of identical values.
0040 ///
0041 /// The following sequence of calls
0042 ///
0043 ///     Append(2)
0044 ///     Append(2)
0045 ///     Append(2)
0046 ///     Append(7)
0047 ///     Append(7)
0048 ///     Append(2)
0049 ///     FinishInternal()
0050 ///
0051 /// will cause the inner-builder to receive only 3 Append calls
0052 ///
0053 ///     Append(2)
0054 ///     Append(7)
0055 ///     Append(2)
0056 ///     FinishInternal()
0057 ///
0058 /// Note that values returned by length(), null_count() and capacity() are
0059 /// related to the compressed array built by the inner-ArrayBuilder.
0060 class RunCompressorBuilder : public ArrayBuilder {
0061  public:
0062   RunCompressorBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> inner_builder,
0063                        std::shared_ptr<DataType> type);
0064 
0065   ~RunCompressorBuilder() override;
0066 
0067   ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder);
0068 
0069   /// \brief Called right before a run is being closed
0070   ///
0071   /// Subclasses can override this function to perform an additional action when
0072   /// a run is closed (i.e. run-length is known and value is appended to the
0073   /// inner builder).
0074   ///
0075   /// \param value can be NULLPTR if closing a run of NULLs
0076   /// \param length the greater than 0 length of the value run being closed
0077   virtual Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
0078                               int64_t length) {
0079     return Status::OK();
0080   }
0081 
0082   /// \brief Called right before a run of empty values is being closed
0083   ///
0084   /// Subclasses can override this function to perform an additional action when
0085   /// a run of empty values is appended (i.e. run-length is known and a single
0086   /// empty value is appended to the inner builder).
0087   ///
0088   /// \param length the greater than 0 length of the value run being closed
0089   virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }
0090 
0091   /// \brief Allocate enough memory for a given number of array elements.
0092   ///
0093   /// NOTE: Conservatively resizing a run-length compressed array for a given
0094   /// number of logical elements is not possible, since the physical length will
0095   /// vary depending on the values to be appended in the future. But we can
0096   /// pessimistically assume that each run will contain a single value and
0097   /// allocate that number of runs.
0098   Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
0099 
0100   /// \brief Allocate enough memory for a given number of runs.
0101   ///
0102   /// Like Resize on non-encoded builders, it does not account for variable size
0103   /// data.
0104   Status ResizePhysical(int64_t capacity);
0105 
0106   Status ReservePhysical(int64_t additional_capacity) {
0107     return Reserve(additional_capacity);
0108   }
0109 
0110   void Reset() override;
0111 
0112   Status AppendNull() final { return AppendNulls(1); }
0113   Status AppendNulls(int64_t length) override;
0114 
0115   Status AppendEmptyValue() final { return AppendEmptyValues(1); }
0116   Status AppendEmptyValues(int64_t length) override;
0117 
0118   Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
0119   Status AppendScalars(const ScalarVector& scalars) override;
0120 
0121   // AppendArraySlice() is not implemented.
0122 
0123   /// \brief Append a slice of an array containing values from already
0124   /// compressed runs.
0125   ///
0126   /// NOTE: WillCloseRun() is not called as the length of each run cannot be
0127   /// determined at this point. Caller should ensure that !has_open_run() by
0128   /// calling FinishCurrentRun() before calling this.
0129   ///
0130   /// Pre-condition: !has_open_run()
0131   Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset,
0132                                        int64_t length);
0133 
0134   /// \brief Forces the closing of the current run if one is currently open.
0135   ///
0136   /// This can be called when one wants to ensure the current run will not be
0137   /// extended. This may cause identical values to appear close to each other in
0138   /// the underlying array (i.e. two runs that could be a single run) if more
0139   /// values are appended after this is called.
0140   ///
0141   /// Finish() and FinishInternal() call this automatically.
0142   virtual Status FinishCurrentRun();
0143 
0144   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
0145 
0146   ArrayBuilder& inner_builder() const { return *inner_builder_; }
0147 
0148   std::shared_ptr<DataType> type() const override { return inner_builder_->type(); }
0149 
0150   bool has_open_run() const { return current_run_length_ > 0; }
0151   int64_t open_run_length() const { return current_run_length_; }
0152 
0153  private:
0154   inline void UpdateDimensions() {
0155     capacity_ = inner_builder_->capacity();
0156     length_ = inner_builder_->length();
0157     null_count_ = inner_builder_->null_count();
0158   }
0159 
0160  private:
0161   std::shared_ptr<ArrayBuilder> inner_builder_;
0162   std::shared_ptr<const Scalar> current_value_ = NULLPTR;
0163   int64_t current_run_length_ = 0;
0164 };
0165 
0166 }  // namespace internal
0167 
0168 // ----------------------------------------------------------------------
0169 // RunEndEncoded builder
0170 
0171 /// \brief Run-end encoded array builder.
0172 ///
0173 /// NOTE: the value returned by and capacity() is related to the
0174 /// compressed array (physical) and not the decoded array (logical) that is
0175 /// run-end encoded. null_count() always returns 0. length(), on the other hand,
0176 /// returns the logical length of the run-end encoded array.
0177 class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
0178  private:
0179   // An internal::RunCompressorBuilder that produces a run-end in the
0180   // RunEndEncodedBuilder every time a value-run is closed.
0181   class ValueRunBuilder : public internal::RunCompressorBuilder {
0182    public:
0183     ValueRunBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
0184                     const std::shared_ptr<DataType>& value_type,
0185                     RunEndEncodedBuilder& ree_builder);
0186 
0187     ~ValueRunBuilder() override = default;
0188 
0189     Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
0190       return ree_builder_.CloseRun(length);
0191     }
0192 
0193     Status WillCloseRunOfEmptyValues(int64_t length) override {
0194       return ree_builder_.CloseRun(length);
0195     }
0196 
0197    private:
0198     RunEndEncodedBuilder& ree_builder_;
0199   };
0200 
0201  public:
0202   RunEndEncodedBuilder(MemoryPool* pool,
0203                        const std::shared_ptr<ArrayBuilder>& run_end_builder,
0204                        const std::shared_ptr<ArrayBuilder>& value_builder,
0205                        std::shared_ptr<DataType> type);
0206 
0207   /// \brief Allocate enough memory for a given number of array elements.
0208   ///
0209   /// NOTE: Conservatively resizing an REE for a given number of logical
0210   /// elements is not possible, since the physical length will vary depending on
0211   /// the values to be appended in the future. But we can pessimistically assume
0212   /// that each run will contain a single value and allocate that number of
0213   /// runs.
0214   Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
0215 
0216   /// \brief Allocate enough memory for a given number of runs.
0217   Status ResizePhysical(int64_t capacity);
0218 
0219   /// \brief Ensure that there is enough space allocated to append the indicated
0220   /// number of run without any further reallocation. Overallocation is
0221   /// used in order to minimize the impact of incremental ReservePhysical() calls.
0222   /// Note that additional_capacity is relative to the current number of elements
0223   /// rather than to the current capacity, so calls to Reserve() which are not
0224   /// interspersed with addition of new elements may not increase the capacity.
0225   ///
0226   /// \param[in] additional_capacity the number of additional runs
0227   /// \return Status
0228   Status ReservePhysical(int64_t additional_capacity) {
0229     return Reserve(additional_capacity);
0230   }
0231 
0232   void Reset() override;
0233 
0234   Status AppendNull() final { return AppendNulls(1); }
0235   Status AppendNulls(int64_t length) override;
0236 
0237   Status AppendEmptyValue() final { return AppendEmptyValues(1); }
0238   Status AppendEmptyValues(int64_t length) override;
0239   Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
0240   Status AppendScalars(const ScalarVector& scalars) override;
0241   Status AppendArraySlice(const ArraySpan& array, int64_t offset,
0242                           int64_t length) override;
0243   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
0244 
0245   /// \cond FALSE
0246   using ArrayBuilder::Finish;
0247   /// \endcond
0248 
0249   Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return FinishTyped(out); }
0250 
0251   /// \brief Forces the closing of the current run if one is currently open.
0252   ///
0253   /// This can be called when one wants to ensure the current run will not be
0254   /// extended. This may cause identical values to appear close to each other in
0255   /// the values array (i.e. two runs that could be a single run) if more
0256   /// values are appended after this is called.
0257   Status FinishCurrentRun();
0258 
0259   std::shared_ptr<DataType> type() const override;
0260 
0261  private:
0262   /// \brief Update physical capacity and logical length
0263   ///
0264   /// \param committed_logical_length number of logical values that have been
0265   ///                                 committed to the values array
0266   /// \param open_run_length number of logical values in the currently open run if any
0267   inline void UpdateDimensions(int64_t committed_logical_length,
0268                                int64_t open_run_length) {
0269     capacity_ = run_end_builder().capacity();
0270     length_ = committed_logical_length + open_run_length;
0271     committed_logical_length_ = committed_logical_length;
0272   }
0273 
0274   // Pre-condition: !value_run_builder_.has_open_run()
0275   template <typename RunEndCType>
0276   Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length);
0277 
0278   template <typename RunEndCType>
0279   Status DoAppendRunEnd(int64_t run_end);
0280 
0281   /// \brief Cast run_end to the appropriate type and appends it to the run_ends
0282   /// array.
0283   Status AppendRunEnd(int64_t run_end);
0284 
0285   /// \brief Close a run by appending a value to the run_ends array and updating
0286   /// length_ to reflect the new run.
0287   ///
0288   /// Pre-condition: run_length > 0.
0289   [[nodiscard]] Status CloseRun(int64_t run_length);
0290 
0291   ArrayBuilder& run_end_builder();
0292   ArrayBuilder& value_builder();
0293 
0294  private:
0295   std::shared_ptr<RunEndEncodedType> type_;
0296   ValueRunBuilder* value_run_builder_;
0297   // The length not counting the current open run in the value_run_builder_
0298   int64_t committed_logical_length_ = 0;
0299 };
0300 
0301 /// @}
0302 
0303 }  // namespace arrow