Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-16 08:31:31

0001 //
0002 // This file is part of YODA -- Yet more Objects for Data Analysis
0003 // Copyright (C) 2008-2025 The YODA collaboration (see AUTHORS for details)
0004 //
0005 #ifndef YODA_StatsUtils_H
0006 #define YODA_StatsUtils_H
0007 
0008 #include "YODA/Exceptions.h"
0009 #include "YODA/Config/BuildConfig.h"
0010 #include "YODA/Utils/MathUtils.h"
0011 
#include <algorithm>
#include <cmath>
#include <limits>
#include <map>
#include <numeric>
#include <string>
#include <vector>
0016 
0017 namespace YODA {
0018 
0019   /// @name Statistics functions
0020   /// @{
0021 
0022   /// @brief Calculate the effective number of entries of a sample.
0023   inline double effNumEntries(const double sumW, const double sumW2) {
0024     if (isZero(sumW2))  return 0;
0025     return sqr(sumW) / sumW2;
0026   }
0027 
0028   /// @brief Calculate the effective number of entries of a sample.
0029   inline double effNumEntries(const std::vector<double>& weights) {
0030     double sumW = 0.0, sumW2 = 0.0;
0031     for (size_t i = 0; i < weights.size(); ++i) {
0032       sumW += weights[i];
0033       sumW2 += sqr(weights[i]);
0034     }
0035     return effNumEntries(sumW, sumW2);
0036   }
0037 
0038   /// @brief Calculate the mean of a sample.
0039   template<typename T>
0040   inline double mean(const std::vector<T>& sample) {
0041     double mean = 0.0;
0042     for (size_t i=0; i<sample.size(); ++i) {
0043       mean += (double)sample[i];
0044     }
0045     return mean/(double)sample.size();
0046   }
0047 
0048   /// @brief Calculate the weighted mean of a sample.
0049   inline double mean(const double sumWX, const double sumW) {
0050     return sumW? sumWX / sumW : std::numeric_limits<double>::quiet_NaN();
0051   }
0052 
0053   /// @brief Calculate the weighted mean of a sample.
0054   inline double mean(const std::vector<double>& sample,
0055                      const std::vector<double>& weights) {
0056     if (sample.size() != weights.size())  throw RangeError("Inputs should have equal length!");
0057     double sumWX = 0., sumW = 0.;
0058     for (size_t i = 0; i < sample.size(); ++i) {
0059       sumW  += weights[i];
0060       sumWX += weights[i]*sample[i];
0061     }
0062     return mean(sumWX, sumW);
0063   }
0064 
0065   /// @brief Calculate the weighted variance of a sample.
0066   ///
0067   /// Weighted variance defined as
0068   /// sig2 = ( sum(wx**2) * sum(w) - sum(wx)**2 ) / ( sum(w)**2 - sum(w**2) )
0069   /// see http://en.wikipedia.org/wiki/Weighted_mean
0070   inline double variance(const double sumWX, const double sumW,
0071                          const double sumWX2, const double sumW2) {
0072     const double num = subtract(sumWX2*sumW, sqr(sumWX));
0073     const double den = subtract(sqr(sumW), sumW2);
0074     /// @todo Isn't this sensitive to the overall scale of the weights?
0075     /// Shouldn't it check if den is bigger then num by a set number of
0076     /// orders of magnitude and vice versa?
0077     // if (fabs(num) < 1e-10 && fabs(den) < 1e-10) {
0078     //   return std::numeric_limits<double>::quiet_NaN();
0079     // }
0080     /// We take the modulus of the weighted variance
0081     /// since the ratio can be negative with weighted means
0082     /// @todo Is this the correct approach? There is no information
0083     /// online other than "weights are non-negative"...
0084     return den? fabs(num/den): std::numeric_limits<double>::quiet_NaN();
0085   }
0086 
0087   /// @brief Calculate the weighted variance of a sample.
0088   inline double variance(const std::vector<double>& sample,
0089                          const std::vector<double>& weights) {
0090     if (sample.size() != weights.size())  throw RangeError("Inputs should have equal length!");
0091     if (fuzzyLessEquals(effNumEntries(weights), 1.0)) {
0092        //throw LowStatsError("Requested variance of a distribution with only one effective entry");
0093        return std::numeric_limits<double>::quiet_NaN();
0094     }
0095     double sumWX = 0., sumW = 0.;
0096     double sumWX2 = 0., sumW2 = 0.;
0097     for (size_t i = 0; i < sample.size(); ++i) {
0098       sumW   += weights[i];
0099       sumWX  += weights[i]*sample[i];
0100       sumW2  += sqr(weights[i]);
0101       sumWX2 += weights[i]*sqr(sample[i]);
0102     }
0103     return variance(sumWX, sumW, sumWX2, sumW2);
0104   }
0105 
0106   /// @brief Calculate the weighted standard deviation of a sample.
0107   inline double stdDev(const double sumWX, const double sumW,
0108                        const double sumWX2, const double sumW2) {
0109     return std::sqrt(variance(sumWX, sumW, sumWX2, sumW2));
0110   }
0111 
0112   /// @brief Calculate the weighted variance of a sample.
0113   inline double stdDev(const std::vector<double>& sample,
0114                        const std::vector<double>& weights) {
0115     return std::sqrt(variance(sample, weights));
0116   }
0117 
0118   /// @brief Calculate the weighted standard error of a sample.
0119   inline double stdErr(const double sumWX, const double sumW,
0120                        const double sumWX2, const double sumW2) {
0121     const double effN = effNumEntries(sumW, sumW2);
0122     if (effN == 0)  return std::numeric_limits<double>::quiet_NaN();
0123     const double var = variance(sumWX, sumW, sumWX2, sumW2);
0124     return std::sqrt(var / effN);
0125   }
0126 
0127   /// @brief Calculate the weighted variance of a sample.
0128   inline double stdErr(const std::vector<double>& sample,
0129                        const std::vector<double>& weights) {
0130     if (sample.size() != weights.size())  throw RangeError("Inputs should have equal length!");
0131     const double effN = effNumEntries(weights);
0132     if (effN == 0)  return std::numeric_limits<double>::quiet_NaN();
0133     const double var = variance(sample, weights);
0134     return std::sqrt(var / effN);
0135   }
0136 
0137   /// @brief Calculate the weighted RMS of a sample.
0138   inline double rms(const double sumWX2, const double sumW, const double sumW2) {
0139     // Weighted RMS defined as
0140     // rms = sqrt(sum{w x^2} / sum{w})
0141     const double effN = effNumEntries(sumW, sumW2);
0142     if (effN == 0)  return std::numeric_limits<double>::quiet_NaN();
0143     const double meanSq = sumWX2 / sumW;
0144     return std::sqrt(meanSq);
0145   }
0146 
0147   /// @brief Calculate the weighted RMS of a sample.
0148   inline double rms(const std::vector<double>& sample,
0149                     const std::vector<double>& weights) {
0150     if (sample.size() != weights.size())  throw RangeError("Inputs should have equal length!");
0151     double sumWX2 = 0., sumW = 0., sumW2 = 0.;
0152     for (size_t i = 0; i < sample.size(); ++i) {
0153       sumW   += weights[i];
0154       sumW2  += sqr(weights[i]);
0155       sumWX2 += weights[i]*sqr(sample[i]);
0156     }
0157     return rms(sumWX2, sumW, sumW2);
0158   }
0159 
0160   /// @brief Alias for rms
0161   ///
0162   /// @deprecated Just use rms()!
0163   inline double RMS(const double sumWX2, const double sumW, const double sumW2) {
0164     return rms(sumWX2, sumW, sumW2);
0165   }
0166 
0167   /// @brief Alias for rms
0168   ///
0169   /// @deprecated Just use rms()!
0170   inline double RMS(const std::vector<double>& sample,
0171                     const std::vector<double>& weights) {
0172     return rms(sample, weights);
0173   }
0174 
0175   /// @brief Calculate the covariance (variance) between two samples.
0176   template<typename T>
0177   inline double covariance(const std::vector<T>& sample1, const std::vector<T>& sample2) {
0178     const double mean1 = mean(sample1);
0179     const double mean2 = mean(sample2);
0180     const size_t N = sample1.size();
0181     double cov = 0.0;
0182     for (size_t i = 0; i < N; i++) {
0183       const double cov_i = (sample1[i] - mean1)*(sample2[i] - mean2);
0184       cov += cov_i;
0185     }
0186     if (N > 1) return cov/(N-1);
0187     else return 0.0;
0188   }
0189 
0190 
0191   /// @brief Calculate the correlation strength between two samples.
0192   template<typename T>
0193   inline double correlation(const std::vector<T>& sample1, const std::vector<T>& sample2) {
0194     const double cov = covariance(sample1, sample2);
0195     const double var1 = covariance(sample1, sample1);
0196     const double var2 = covariance(sample2, sample2);
0197     const double correlation = cov/sqrt(var1*var2);
0198     const double corr_strength = correlation*sqrt(var2/var1);
0199     return corr_strength;
0200   }
0201 
0202 
0203   /// @brief Calculate the cumulative distribution function for a sample.
0204   inline std::vector<double> cdf(std::vector<double> sample) {
0205     if (sample.empty())  return sample;
0206 
0207     const double total = std::accumulate(sample.begin(), sample.end(), 0.0);
0208     if (total == 0.0)  return sample;
0209 
0210     sample[0] /= total;
0211     for (size_t i = 1; i < sample.size(); ++i) {
0212         sample[i] = sample[i-1] + sample[i] / total;
0213     }
0214     return sample;
0215   }
0216 
0217   /// @brief Calculate the weighted average of two sets of values @a values1
0218   /// and @a values2 with associated uncertainties @a errors1 and @a errors2,
0219   /// respectively, taking the inverse squared uncertainty as the weight.
0220   inline std::vector<std::vector<double>> weightedAverage(const std::vector<double>& sample1,
0221                                                           const std::vector<double>& s1errors,
0222                                                           const std::vector<double>& sample2,
0223                                                           const std::vector<double>& s2errors) {
0224     if (sample1.size() !=  sample2.size())  throw RangeError("Inputs should have equal length!");
0225     if (sample1.size() != s1errors.size())  throw RangeError("Inputs should have equal length!");
0226     if (sample1.size() != s2errors.size())  throw RangeError("Inputs should have equal length!");
0227     std::vector<std::vector<double>> rtn; rtn.resize(2);
0228     rtn[0].reserve(sample1.size());
0229     rtn[1].reserve(sample1.size());
0230     for (size_t i=0; i<sample1.size(); ++i) {
0231       const double w1 = s1errors[i]? 1.0 / sqr(s1errors[i]) : 0.;
0232       const double w2 = s2errors[i]? 1.0 / sqr(s2errors[i]) : 0.;
0233       const double wsum = w1 + w2;
0234       const double wtot = wsum? 1.0 / wsum : 0.0;
0235       rtn[0].push_back( wtot * (w1*sample1[i] + w2*sample2[i]));
0236       rtn[1].push_back( wtot );
0237     }
0238     return rtn;
0239   }
0240 
0241   /// @brief Calculate the Kolmogorov-Smirnov test statistic between two samples.
0242   ///
0243   /// @note This implementation assumes that the bin widths are small compared
0244   /// with any physical phenomena of interest.
0245   inline double ksTest(const std::vector<double>& sample1, const std::vector<double>& sample2) {
0246     if (sample1.size() != sample2.size())  throw RangeError("Inputs should have equal length!");
0247     const std::vector<double> cdf1 = cdf(sample1);
0248     const std::vector<double> cdf2 = cdf(sample2);
0249     double D = 0.0;
0250     for (size_t i = 0; i < cdf1.size(); ++i) {
0251       D = std::max(D, std::abs(cdf1[i] - cdf2[i]));
0252     }
0253     return D;
0254   }
0255 
0256   /// @brief Approximate asymptotic p-value for a KS test statistics @a D,
0257   /// given two effective sample sizes @a n1 and @a n2.
0258   inline double pValFromKS(const double D, const double n1, const double n2,
0259                            const double tolerance = 1e-5) {
0260     if (isZero(D) || isZero(n1) || isZero(n2))  return 1.0;
0261     const double neff = (n1 * n2) / (n1 + n2);
0262     const double x = D * std::sqrt(neff);
0263     // According to https://en.m.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
0264     // the cumulative Kolmogorov distribution is given by
0265     // Pr(K <= x) = 1 - 2* sum_{k=1}^inf (-1)^(k-1) exp(-2 * k^2 * x^2)
0266     // and so one-sided p-value = 1 - Pr(K <= x):
0267     size_t k = 0;
0268     double p = 0.0;
0269     while (++k) {
0270       const double term = std::exp(-2. * k * k * x * x);
0271       p += ((k % 2) ? 1 : -1) * term;
0272       if (term < tolerance) break;
0273     }
0274     return std::clamp(2*p, 0.0, 1.0);
0275   }
0276 
0277   /// @brief Calculate the error-weighted chi2 statistic between two samples
0278   ///
0279   /// @note This calculation is rather naive as it neglects the error breakdowns that
0280   /// may be available for the curves being compared. More sophisticated comparisons
0281   /// will require a separate analysis based on the Estimate objects utilising the full
0282   /// covariance information.
0283   inline double naiveChi2(const std::vector<double>& sample1, const std::vector<double>& sample2,
0284                           const std::vector<double>& s1errors = std::vector<double>{},
0285                           const std::vector<double>& s2errors = std::vector<double>{}) {
0286     if (sample1.size() != sample2.size()) {
0287       throw RangeError("Inputs should have equal length!");
0288     }
0289     if (s1errors.size() && sample1.size() != s1errors.size()) {
0290       throw RangeError("Inputs should have equal length!");
0291     }
0292     if (s2errors.size() && sample2.size() != s2errors.size()) {
0293       throw RangeError("Inputs should have equal length!");
0294     }
0295     const size_t N = sample1.size();
0296     double chi2 = 0.0;
0297     for (size_t i = 0; i < N; ++i) {
0298       double temp = sqr(sample1[i] - sample2[i]);
0299       if (s1errors.size()) {
0300         temp /= sqr(s1errors[i]) + sqr(s2errors[i]);
0301       }
0302       chi2 += temp;
0303     }
0304     return chi2;
0305   }
0306 
0307   /// @brief Calculate the error-weighted reduced chi2 statistic between two samples
0308   ///
0309   /// @note This calculation is rather naive as it neglects the error breakdowns that
0310   /// may be available for the curves being compared. More sophisticated comparisons
0311   /// will require a separate analysis based on the Estimate objects utilising the full
0312   /// covariance information.
0313   inline double naiveChi2reduced(const std::vector<double>& sample1, const std::vector<double>& sample2,
0314                                  const std::vector<double>& s1errors = std::vector<double>{},
0315                                  const std::vector<double>& s2errors = std::vector<double>{}) {
0316     if (sample1.empty()) throw RangeError("Inputs should not have 0 length!");
0317     return naiveChi2(sample1, sample2, s1errors, s2errors)/sample1.size();
0318   }
0319 
0320   /// @}
0321 
0322 }
0323 
0324 #endif