arrow/util/tdigest.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 // approximate quantiles from arbitrary length dataset with O(1) space
0019 // based on 'Computing Extremely Accurate Quantiles Using t-Digests' from Dunning & Ertl
0020 // - https://arxiv.org/abs/1902.04023
0021 // - https://github.com/tdunning/t-digest
0022
0023 #pragma once
0024
0025 #include <cmath>
0026 #include <memory>
0027 #include <vector>
0028
0029 #include "arrow/util/logging.h"
0030 #include "arrow/util/macros.h"
0031 #include "arrow/util/visibility.h"
0032
0033 namespace arrow {
0034
0035 class Status;
0036
0037 namespace internal {
0038
0039 class ARROW_EXPORT TDigest {
0040  public:
0041   explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500);
0042   ~TDigest();
0043   TDigest(TDigest&&);
0044   TDigest& operator=(TDigest&&);
0045
0046   // reset and re-use this tdigest
0047   void Reset();
0048
0049   // validate data integrity
0050   Status Validate() const;
0051
0052   // dump internal data, only for debug
0053   void Dump() const;
0054
0055   // buffer a single data point, consume internal buffer if full
0056   // this function is intensively called and performance critical
0057   // call it only if you are sure no NAN exists in input data
0058   void Add(double value) {
0059     ARROW_DCHECK(!std::isnan(value)) << "cannot add NAN";
0060     if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) {
0061       MergeInput();
0062     }
0063     input_.push_back(value);
0064   }
0065
0066   // skip NAN on adding
0067   template <typename T>
0068   typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) {
0069     if (!std::isnan(value)) Add(value);
0070   }
0071
0072   template <typename T>
0073   typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) {
0074     Add(static_cast<double>(value));
0075   }
0076
0077   // merge with other t-digests, called infrequently
0078   void Merge(const std::vector<TDigest>& others);
0079   void Merge(const TDigest& other);
0080
0081   // calculate quantile
0082   double Quantile(double q) const;
0083
0084   double Min() const { return Quantile(0); }
0085   double Max() const { return Quantile(1); }
0086   double Mean() const;
0087
0088   // check if this tdigest contains no valid data points
0089   bool is_empty() const;
0090
0091  private:
0092   // merge input data with current tdigest
0093   void MergeInput() const;
0094
0095   // input buffer, size = buffer_size * sizeof(double)
0096   mutable std::vector<double> input_;
0097
0098   // hide other members with pimpl
0099   class TDigestImpl;
0100   std::unique_ptr<TDigestImpl> impl_;
0101 };
0102
0103 }  // namespace internal
0104 }  // namespace arrow