root/ROOT/RHistStats.hxx

0001 /// \file
0002 /// \warning This is part of the %ROOT 7 prototype! It will change without notice. It might trigger earthquakes.
0003 /// Feedback is welcome!
0004
0005 #ifndef ROOT_RHistStats
0006 #define ROOT_RHistStats
0007
0008 #include "RHistUtils.hxx"
0009 #include "RLinearizedIndex.hxx"
0010 #include "RWeight.hxx"
0011
0012 #include <cmath>
0013 #include <cstdint>
0014 #include <stdexcept>
0015 #include <tuple>
0016 #include <vector>
0017
0018 class TBuffer;
0019
0020 namespace ROOT {
0021 namespace Experimental {
0022
0023 /**
0024 Histogram statistics of unbinned values.
0025
0026 Every call to \ref Fill(const A &... args) "Fill" updates sums to compute the number of effective entries as well as the
0027 arithmetic mean and other statistical quantities per dimension:
0028 \code
0029 ROOT::Experimental::RHistStats stats(1);
0030 stats.Fill(8.5);
0031 stats.Fill(1.5);
0032 // stats.ComputeMean() will return 5.0
0033 \endcode
0034
0035 \warning This is part of the %ROOT 7 prototype! It will change without notice. It might trigger earthquakes.
0036 Feedback is welcome!
0037 */
0038 class RHistStats final {
0039 public:
0040    /// Statistics for one dimension.
0041    struct RDimensionStats final {
0042       double fSumWX = 0.0;
0043       double fSumWX2 = 0.0;
0044       double fSumWX3 = 0.0;
0045       double fSumWX4 = 0.0;
0046
0047       void Add(double x)
0048       {
0049          fSumWX += x;
0050          fSumWX2 += x * x;
0051          fSumWX3 += x * x * x;
0052          fSumWX4 += x * x * x * x;
0053       }
0054
0055       void Add(double x, double w)
0056       {
0057          fSumWX += w * x;
0058          fSumWX2 += w * x * x;
0059          fSumWX3 += w * x * x * x;
0060          fSumWX4 += w * x * x * x * x;
0061       }
0062
0063       void Add(const RDimensionStats &other)
0064       {
0065          fSumWX += other.fSumWX;
0066          fSumWX2 += other.fSumWX2;
0067          fSumWX3 += other.fSumWX3;
0068          fSumWX4 += other.fSumWX4;
0069       }
0070
0071       void Clear()
0072       {
0073          fSumWX = 0.0;
0074          fSumWX2 = 0.0;
0075          fSumWX3 = 0.0;
0076          fSumWX4 = 0.0;
0077       }
0078    };
0079
0080 private:
0081    /// The number of entries
0082    std::uint64_t fNEntries = 0;
0083    /// The sum of weights
0084    double fSumW = 0.0;
0085    /// The sum of weights squared
0086    double fSumW2 = 0.0;
0087    /// The sums per dimension
0088    std::vector<RDimensionStats> fDimensionStats;
0089
0090 public:
0091    /// Construct a statistics object.
0092    ///
0093    /// \param[in] nDimensions the number of dimensions, must be > 0
0094    explicit RHistStats(std::size_t nDimensions)
0095    {
0096       if (nDimensions == 0) {
0097          throw std::invalid_argument("nDimensions must be > 0");
0098       }
0099       fDimensionStats.resize(nDimensions);
0100    }
0101
0102    std::size_t GetNDimensions() const { return fDimensionStats.size(); }
0103
0104    std::uint64_t GetNEntries() const { return fNEntries; }
0105    double GetSumW() const { return fSumW; }
0106    double GetSumW2() const { return fSumW2; }
0107
0108    const RDimensionStats &GetDimensionStats(std::size_t dim = 0) const { return fDimensionStats.at(dim); }
0109
0110    /// Add all entries from another statistics object.
0111    ///
0112    /// Throws an exception if the number of dimensions are not identical.
0113    ///
0114    /// \param[in] other another statistics object
0115    void Add(const RHistStats &other)
0116    {
0117       if (fDimensionStats.size() != other.fDimensionStats.size()) {
0118          throw std::invalid_argument("number of dimensions not identical in Add");
0119       }
0120       fNEntries += other.fNEntries;
0121       fSumW += other.fSumW;
0122       fSumW2 += other.fSumW2;
0123       for (std::size_t i = 0; i < fDimensionStats.size(); i++) {
0124          fDimensionStats[i].Add(other.fDimensionStats[i]);
0125       }
0126    }
0127
0128    /// Clear this statistics object.
0129    void Clear()
0130    {
0131       fNEntries = 0;
0132       fSumW = 0;
0133       fSumW2 = 0;
0134       for (std::size_t i = 0; i < fDimensionStats.size(); i++) {
0135          fDimensionStats[i].Clear();
0136       }
0137    }
0138
0139    /// Compute the number of effective entries.
0140    ///
0141    /// \f[
0142    /// \frac{(\sum w_i)^2}{\sum w_i^2}
0143    /// \f]
0144    ///
0145    /// \return the number of effective entries
0146    double ComputeNEffectiveEntries() const
0147    {
0148       if (fSumW2 == 0) {
0149          return 0;
0150       }
0151       return fSumW * fSumW / fSumW2;
0152    }
0153
0154    /// Compute the arithmetic mean of unbinned values.
0155    ///
0156    /// \f[
0157    /// \mu = \frac{\sum w_i \cdot x_i}{\sum w_i}
0158    /// \f]
0159    ///
0160    /// \param[in] dim the dimension index, starting at 0
0161    /// \return the arithmetic mean of unbinned values
0162    double ComputeMean(std::size_t dim = 0) const
0163    {
0164       // First get the statistics, which includes checking the argument.
0165       auto &stats = fDimensionStats.at(dim);
0166       if (fSumW == 0) {
0167          return 0;
0168       }
0169       return stats.fSumWX / fSumW;
0170    }
0171
0172    /// Compute the variance of unbinned values.
0173    ///
0174    /// This function computes the uncorrected sample variance:
0175    /// \f[
0176    /// \sigma^2 = \frac{1}{\sum w_i} \sum(w_i \cdot x_i - \mu)^2
0177    /// \f]
0178    /// With some rewriting, this is equivalent to:
0179    /// \f[
0180    /// \sigma^2 = \frac{\sum w_i \cdot x_i^2}{\sum w_i} - \mu^2
0181    /// \f]
0182    ///
0183    /// This function does not include Bessel's correction needed for an unbiased estimator of population variance.
0184    /// In other words, the return value is a biased estimation lower than the actual population variance.
0185    ///
0186    /// \param[in] dim the dimension index, starting at 0
0187    /// \return the variance of unbinned values
0188    double ComputeVariance(std::size_t dim = 0) const
0189    {
0190       // First get the statistics, which includes checking the argument.
0191       auto &stats = fDimensionStats.at(dim);
0192       if (fSumW == 0) {
0193          return 0;
0194       }
0195       double mean = ComputeMean(dim);
0196       return stats.fSumWX2 / fSumW - mean * mean;
0197    }
0198
0199    /// Compute the standard deviation of unbinned values.
0200    ///
0201    /// This function computes the uncorrected sample standard deviation:
0202    /// \f[
0203    /// \sigma = \sqrt{\frac{1}{\sum w_i} \sum(w_i \cdot x_i - \mu)^2}
0204    /// \f]
0205    /// With some rewriting, this is equivalent to:
0206    /// \f[
0207    /// \sigma = \sqrt{\frac{\sum w_i \cdot x_i^2}{\sum w_i} - \frac{(\sum w_i \cdot x_i)^2}{(\sum w_i)^2}}
0208    /// \f]
0209    ///
0210    /// This function does not include Bessel's correction needed for an unbiased estimator of population variance.
0211    /// In other words, the return value is a biased estimation lower than the actual population standard deviation.
0212    ///
0213    /// \param[in] dim the dimension index, starting at 0
0214    /// \return the standard deviation of unbinned values
0215    double ComputeStdDev(std::size_t dim = 0) const { return std::sqrt(ComputeVariance(dim)); }
0216
0217    // clang-format off
0218    /// Compute the skewness of unbinned values.
0219    ///
0220    /// The skewness is the third standardized moment:
0221    /// \f[
0222    /// E\left[\left(\frac{X - \mu}{\sigma}\right)^3\right]
0223    /// \f]
0224    /// With support for weighted filling and after some rewriting, it is computed as:
0225    /// \f[
0226    /// \frac{\frac{\sum w_i \cdot x_i^3}{\sum w_i} - 3 \cdot \frac{\sum w_i \cdot x_i^2}{\sum w_i} \cdot \mu + 2 \cdot \mu^3}{\sigma^3}
0227    /// \f]
0228    ///
0229    /// \param[in] dim the dimension index, starting at 0
0230    /// \return the skewness of unbinned values
0231    // clang-format on
0232    double ComputeSkewness(std::size_t dim = 0) const
0233    {
0234       // First get the statistics, which includes checking the argument.
0235       auto &stats = fDimensionStats.at(dim);
0236       if (fSumW == 0) {
0237          return 0;
0238       }
0239       double mean = ComputeMean(dim);
0240       double var = ComputeVariance(dim);
0241       if (var == 0) {
0242          return 0;
0243       }
0244       double EWX3 = stats.fSumWX3 / fSumW;
0245       double EWX2 = stats.fSumWX2 / fSumW;
0246       return (EWX3 - 3 * EWX2 * mean + 2 * mean * mean * mean) / std::pow(var, 1.5);
0247    }
0248
0249    // clang-format off
0250    /// Compute the (excess) kurtosis of unbinned values.
0251    ///
0252    /// The kurtosis is based on the fourth standardized moment:
0253    /// \f[
0254    /// E\left[\left(\frac{X - \mu}{\sigma}\right)^4\right]
0255    /// \f]
0256    /// The excess kurtosis subtracts 3 from the standardized moment to have a value of 0 for a normal distribution:
0257    /// \f[
0258    /// E\left[\left(\frac{X - \mu}{\sigma}\right)^4\right] - 3
0259    /// \f]
0260    ///
0261    /// With support for weighted filling and after some rewriting, the (excess kurtosis) is computed as:
0262    /// \f[
0263    /// \frac{\frac{\sum w_i \cdot x_i^4}{\sum w_i} - 4 \cdot \frac{\sum w_i \cdot x_i^3}{\sum w_i} \cdot \mu + 6 \cdot \frac{\sum w_i \cdot x_i^2}{\sum w_i} \cdot \mu^2 - 3 \cdot \mu^4}{\sigma^4} - 3
0264    /// \f]
0265    ///
0266    /// \param[in] dim the dimension index, starting at 0
0267    /// \return the (excess) kurtosis of unbinned values
0268    // clang-format on
0269    double ComputeKurtosis(std::size_t dim = 0) const
0270    {
0271       // First get the statistics, which includes checking the argument.
0272       auto &stats = fDimensionStats.at(dim);
0273       if (fSumW == 0) {
0274          return 0;
0275       }
0276       double mean = ComputeMean(dim);
0277       double var = ComputeVariance(dim);
0278       if (var == 0) {
0279          return 0;
0280       }
0281       double EWX4 = stats.fSumWX4 / fSumW;
0282       double EWX3 = stats.fSumWX3 / fSumW;
0283       double EWX2 = stats.fSumWX2 / fSumW;
0284       return (EWX4 - 4 * EWX3 * mean + 6 * EWX2 * mean * mean - 3 * mean * mean * mean * mean) / (var * var) - 3;
0285    }
0286
0287 private:
0288    template <std::size_t I, typename... A>
0289    void FillImpl(const std::tuple<A...> &args)
0290    {
0291       fDimensionStats[I].Add(std::get<I>(args));
0292       if constexpr (I + 1 < sizeof...(A)) {
0293          FillImpl<I + 1>(args);
0294       }
0295    }
0296
0297    template <std::size_t I, std::size_t N, typename... A>
0298    void FillImpl(const std::tuple<A...> &args, double w)
0299    {
0300       fDimensionStats[I].Add(std::get<I>(args), w);
0301       if constexpr (I + 1 < N) {
0302          FillImpl<I + 1, N>(args, w);
0303       }
0304    }
0305
0306 public:
0307    /// Fill an entry into this statistics object.
0308    ///
0309    /// \code
0310    /// ROOT::Experimental::RHistStats stats(2);
0311    /// auto args = std::make_tuple(8.5, 10.5);
0312    /// stats.Fill(args);
0313    /// \endcode
0314    ///
0315    /// Throws an exception if the number of arguments does not match the number of dimensions.
0316    ///
0317    /// \param[in] args the arguments for each dimension
0318    /// \par See also
0319    /// the \ref Fill(const A &... args) "variadic function template overload" accepting arguments directly and the
0320    /// \ref Fill(const std::tuple<A...> &args, RWeight weight) "overload for weighted filling"
0321    template <typename... A>
0322    void Fill(const std::tuple<A...> &args)
0323    {
0324       if (sizeof...(A) != fDimensionStats.size()) {
0325          throw std::invalid_argument("invalid number of arguments to Fill");
0326       }
0327       fNEntries++;
0328       fSumW++;
0329       fSumW2++;
0330       FillImpl<0>(args);
0331    }
0332
0333    /// Fill an entry into this statistics object with a weight.
0334    ///
0335    /// \code
0336    /// ROOT::Experimental::RHistStats stats(2);
0337    /// auto args = std::make_tuple(8.5, 10.5);
0338    /// stats.Fill(args, ROOT::Experimental::RWeight(0.8));
0339    /// \endcode
0340    ///
0341    /// \param[in] args the arguments for each dimension
0342    /// \param[in] weight the weight for this entry
0343    /// \par See also
0344    /// the \ref Fill(const A &... args) "variadic function template overload" accepting arguments directly and the
0345    /// \ref Fill(const std::tuple<A...> &args) "overload for unweighted filling"
0346    template <typename... A>
0347    void Fill(const std::tuple<A...> &args, RWeight weight)
0348    {
0349       if (sizeof...(A) != fDimensionStats.size()) {
0350          throw std::invalid_argument("invalid number of arguments to Fill");
0351       }
0352       fNEntries++;
0353       double w = weight.fValue;
0354       fSumW += w;
0355       fSumW2 += w * w;
0356       FillImpl<0, sizeof...(A)>(args, w);
0357    }
0358
0359    /// Fill an entry into this statistics object.
0360    ///
0361    /// \code
0362    /// ROOT::Experimental::RHistStats stats(2);
0363    /// stats.Fill(8.5, 10.5);
0364    /// \endcode
0365    /// For weighted filling, pass an RWeight as the last argument:
0366    /// \code
0367    /// ROOT::Experimental::RHistStats stats(2);
0368    /// stats.Fill(8.5, 10.5, ROOT::Experimental::RWeight(0.8));
0369    /// \endcode
0370    ///
0371    /// Throws an exception if the number of arguments does not match the number of dimensions.
0372    ///
0373    /// \param[in] args the arguments for each dimension
0374    /// \par See also
0375    /// the function overloads accepting `std::tuple` \ref Fill(const std::tuple<A...> &args) "for unweighted filling"
0376    /// and \ref Fill(const std::tuple<A...> &args, RWeight) "for weighted filling"
0377    template <typename... A>
0378    void Fill(const A &...args)
0379    {
0380       auto t = std::forward_as_tuple(args...);
0381       if constexpr (std::is_same_v<typename Internal::LastType<A...>::type, RWeight>) {
0382          static constexpr std::size_t N = sizeof...(A) - 1;
0383          if (N != fDimensionStats.size()) {
0384             throw std::invalid_argument("invalid number of arguments to Fill");
0385          }
0386          fNEntries++;
0387          double w = std::get<N>(t).fValue;
0388          fSumW += w;
0389          fSumW2 += w * w;
0390          FillImpl<0, N>(t, w);
0391       } else {
0392          Fill(t);
0393       }
0394    }
0395
0396    /// %ROOT Streamer function to throw when trying to store an object of this class.
0397    void Streamer(TBuffer &) { throw std::runtime_error("unable to store RHistStats"); }
0398 };
0399
0400 } // namespace Experimental
0401 } // namespace ROOT
0402
0403 #endif