File indexing completed on 2026-05-16 08:31:31
0001
0002
0003
0004
0005 #ifndef YODA_StatsUtils_H
0006 #define YODA_StatsUtils_H
0007
0008 #include "YODA/Exceptions.h"
0009 #include "YODA/Config/BuildConfig.h"
0010 #include "YODA/Utils/MathUtils.h"
0011
0012 #include <algorithm>
0013 #include <map>
0014 #include <string>
0015 #include <vector>
0016
0017 namespace YODA {
0018
0019
0020
0021
0022
0023 inline double effNumEntries(const double sumW, const double sumW2) {
0024 if (isZero(sumW2)) return 0;
0025 return sqr(sumW) / sumW2;
0026 }
0027
0028
0029 inline double effNumEntries(const std::vector<double>& weights) {
0030 double sumW = 0.0, sumW2 = 0.0;
0031 for (size_t i = 0; i < weights.size(); ++i) {
0032 sumW += weights[i];
0033 sumW2 += sqr(weights[i]);
0034 }
0035 return effNumEntries(sumW, sumW2);
0036 }
0037
0038
0039 template<typename T>
0040 inline double mean(const std::vector<T>& sample) {
0041 double mean = 0.0;
0042 for (size_t i=0; i<sample.size(); ++i) {
0043 mean += (double)sample[i];
0044 }
0045 return mean/(double)sample.size();
0046 }
0047
0048
0049 inline double mean(const double sumWX, const double sumW) {
0050 return sumW? sumWX / sumW : std::numeric_limits<double>::quiet_NaN();
0051 }
0052
0053
0054 inline double mean(const std::vector<double>& sample,
0055 const std::vector<double>& weights) {
0056 if (sample.size() != weights.size()) throw RangeError("Inputs should have equal length!");
0057 double sumWX = 0., sumW = 0.;
0058 for (size_t i = 0; i < sample.size(); ++i) {
0059 sumW += weights[i];
0060 sumWX += weights[i]*sample[i];
0061 }
0062 return mean(sumWX, sumW);
0063 }
0064
0065
0066
0067
0068
0069
0070 inline double variance(const double sumWX, const double sumW,
0071 const double sumWX2, const double sumW2) {
0072 const double num = subtract(sumWX2*sumW, sqr(sumWX));
0073 const double den = subtract(sqr(sumW), sumW2);
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084 return den? fabs(num/den): std::numeric_limits<double>::quiet_NaN();
0085 }
0086
0087
0088 inline double variance(const std::vector<double>& sample,
0089 const std::vector<double>& weights) {
0090 if (sample.size() != weights.size()) throw RangeError("Inputs should have equal length!");
0091 if (fuzzyLessEquals(effNumEntries(weights), 1.0)) {
0092
0093 return std::numeric_limits<double>::quiet_NaN();
0094 }
0095 double sumWX = 0., sumW = 0.;
0096 double sumWX2 = 0., sumW2 = 0.;
0097 for (size_t i = 0; i < sample.size(); ++i) {
0098 sumW += weights[i];
0099 sumWX += weights[i]*sample[i];
0100 sumW2 += sqr(weights[i]);
0101 sumWX2 += weights[i]*sqr(sample[i]);
0102 }
0103 return variance(sumWX, sumW, sumWX2, sumW2);
0104 }
0105
0106
0107 inline double stdDev(const double sumWX, const double sumW,
0108 const double sumWX2, const double sumW2) {
0109 return std::sqrt(variance(sumWX, sumW, sumWX2, sumW2));
0110 }
0111
0112
0113 inline double stdDev(const std::vector<double>& sample,
0114 const std::vector<double>& weights) {
0115 return std::sqrt(variance(sample, weights));
0116 }
0117
0118
0119 inline double stdErr(const double sumWX, const double sumW,
0120 const double sumWX2, const double sumW2) {
0121 const double effN = effNumEntries(sumW, sumW2);
0122 if (effN == 0) return std::numeric_limits<double>::quiet_NaN();
0123 const double var = variance(sumWX, sumW, sumWX2, sumW2);
0124 return std::sqrt(var / effN);
0125 }
0126
0127
0128 inline double stdErr(const std::vector<double>& sample,
0129 const std::vector<double>& weights) {
0130 if (sample.size() != weights.size()) throw RangeError("Inputs should have equal length!");
0131 const double effN = effNumEntries(weights);
0132 if (effN == 0) return std::numeric_limits<double>::quiet_NaN();
0133 const double var = variance(sample, weights);
0134 return std::sqrt(var / effN);
0135 }
0136
0137
0138 inline double rms(const double sumWX2, const double sumW, const double sumW2) {
0139
0140
0141 const double effN = effNumEntries(sumW, sumW2);
0142 if (effN == 0) return std::numeric_limits<double>::quiet_NaN();
0143 const double meanSq = sumWX2 / sumW;
0144 return std::sqrt(meanSq);
0145 }
0146
0147
0148 inline double rms(const std::vector<double>& sample,
0149 const std::vector<double>& weights) {
0150 if (sample.size() != weights.size()) throw RangeError("Inputs should have equal length!");
0151 double sumWX2 = 0., sumW = 0., sumW2 = 0.;
0152 for (size_t i = 0; i < sample.size(); ++i) {
0153 sumW += weights[i];
0154 sumW2 += sqr(weights[i]);
0155 sumWX2 += weights[i]*sqr(sample[i]);
0156 }
0157 return rms(sumWX2, sumW, sumW2);
0158 }
0159
0160
0161
0162
0163 inline double RMS(const double sumWX2, const double sumW, const double sumW2) {
0164 return rms(sumWX2, sumW, sumW2);
0165 }
0166
0167
0168
0169
0170 inline double RMS(const std::vector<double>& sample,
0171 const std::vector<double>& weights) {
0172 return rms(sample, weights);
0173 }
0174
0175
0176 template<typename T>
0177 inline double covariance(const std::vector<T>& sample1, const std::vector<T>& sample2) {
0178 const double mean1 = mean(sample1);
0179 const double mean2 = mean(sample2);
0180 const size_t N = sample1.size();
0181 double cov = 0.0;
0182 for (size_t i = 0; i < N; i++) {
0183 const double cov_i = (sample1[i] - mean1)*(sample2[i] - mean2);
0184 cov += cov_i;
0185 }
0186 if (N > 1) return cov/(N-1);
0187 else return 0.0;
0188 }
0189
0190
0191
0192 template<typename T>
0193 inline double correlation(const std::vector<T>& sample1, const std::vector<T>& sample2) {
0194 const double cov = covariance(sample1, sample2);
0195 const double var1 = covariance(sample1, sample1);
0196 const double var2 = covariance(sample2, sample2);
0197 const double correlation = cov/sqrt(var1*var2);
0198 const double corr_strength = correlation*sqrt(var2/var1);
0199 return corr_strength;
0200 }
0201
0202
0203
0204 inline std::vector<double> cdf(std::vector<double> sample) {
0205 if (sample.empty()) return sample;
0206
0207 const double total = std::accumulate(sample.begin(), sample.end(), 0.0);
0208 if (total == 0.0) return sample;
0209
0210 sample[0] /= total;
0211 for (size_t i = 1; i < sample.size(); ++i) {
0212 sample[i] = sample[i-1] + sample[i] / total;
0213 }
0214 return sample;
0215 }
0216
0217
0218
0219
0220 inline std::vector<std::vector<double>> weightedAverage(const std::vector<double>& sample1,
0221 const std::vector<double>& s1errors,
0222 const std::vector<double>& sample2,
0223 const std::vector<double>& s2errors) {
0224 if (sample1.size() != sample2.size()) throw RangeError("Inputs should have equal length!");
0225 if (sample1.size() != s1errors.size()) throw RangeError("Inputs should have equal length!");
0226 if (sample1.size() != s2errors.size()) throw RangeError("Inputs should have equal length!");
0227 std::vector<std::vector<double>> rtn; rtn.resize(2);
0228 rtn[0].reserve(sample1.size());
0229 rtn[1].reserve(sample1.size());
0230 for (size_t i=0; i<sample1.size(); ++i) {
0231 const double w1 = s1errors[i]? 1.0 / sqr(s1errors[i]) : 0.;
0232 const double w2 = s2errors[i]? 1.0 / sqr(s2errors[i]) : 0.;
0233 const double wsum = w1 + w2;
0234 const double wtot = wsum? 1.0 / wsum : 0.0;
0235 rtn[0].push_back( wtot * (w1*sample1[i] + w2*sample2[i]));
0236 rtn[1].push_back( wtot );
0237 }
0238 return rtn;
0239 }
0240
0241
0242
0243
0244
0245 inline double ksTest(const std::vector<double>& sample1, const std::vector<double>& sample2) {
0246 if (sample1.size() != sample2.size()) throw RangeError("Inputs should have equal length!");
0247 const std::vector<double> cdf1 = cdf(sample1);
0248 const std::vector<double> cdf2 = cdf(sample2);
0249 double D = 0.0;
0250 for (size_t i = 0; i < cdf1.size(); ++i) {
0251 D = std::max(D, std::abs(cdf1[i] - cdf2[i]));
0252 }
0253 return D;
0254 }
0255
0256
0257
0258 inline double pValFromKS(const double D, const double n1, const double n2,
0259 const double tolerance = 1e-5) {
0260 if (isZero(D) || isZero(n1) || isZero(n2)) return 1.0;
0261 const double neff = (n1 * n2) / (n1 + n2);
0262 const double x = D * std::sqrt(neff);
0263
0264
0265
0266
0267 size_t k = 0;
0268 double p = 0.0;
0269 while (++k) {
0270 const double term = std::exp(-2. * k * k * x * x);
0271 p += ((k % 2) ? 1 : -1) * term;
0272 if (term < tolerance) break;
0273 }
0274 return std::clamp(2*p, 0.0, 1.0);
0275 }
0276
0277
0278
0279
0280
0281
0282
0283 inline double naiveChi2(const std::vector<double>& sample1, const std::vector<double>& sample2,
0284 const std::vector<double>& s1errors = std::vector<double>{},
0285 const std::vector<double>& s2errors = std::vector<double>{}) {
0286 if (sample1.size() != sample2.size()) {
0287 throw RangeError("Inputs should have equal length!");
0288 }
0289 if (s1errors.size() && sample1.size() != s1errors.size()) {
0290 throw RangeError("Inputs should have equal length!");
0291 }
0292 if (s2errors.size() && sample2.size() != s2errors.size()) {
0293 throw RangeError("Inputs should have equal length!");
0294 }
0295 const size_t N = sample1.size();
0296 double chi2 = 0.0;
0297 for (size_t i = 0; i < N; ++i) {
0298 double temp = sqr(sample1[i] - sample2[i]);
0299 if (s1errors.size()) {
0300 temp /= sqr(s1errors[i]) + sqr(s2errors[i]);
0301 }
0302 chi2 += temp;
0303 }
0304 return chi2;
0305 }
0306
0307
0308
0309
0310
0311
0312
0313 inline double naiveChi2reduced(const std::vector<double>& sample1, const std::vector<double>& sample2,
0314 const std::vector<double>& s1errors = std::vector<double>{},
0315 const std::vector<double>& s2errors = std::vector<double>{}) {
0316 if (sample1.empty()) throw RangeError("Inputs should not have 0 length!");
0317 return naiveChi2(sample1, sample2, s1errors, s2errors)/sample1.size();
0318 }
0319
0320
0321
0322 }
0323
0324 #endif