|
||||
File indexing completed on 2025-01-18 10:10:16
0001 // @(#)root/mathcore:$Id$ 0002 // Authors: Bartolomeu Rabacal 05/2010 0003 /********************************************************************** 0004 * * 0005 * Copyright (c) 2006 , LCG ROOT MathLib Team * 0006 * * 0007 * * 0008 **********************************************************************/ 0009 // Header file for GoFTest 0010 0011 #ifndef ROOT_Math_GoFTest 0012 #define ROOT_Math_GoFTest 0013 0014 #include "Math/WrappedFunction.h" 0015 #include "TMath.h" 0016 0017 #include <memory> 0018 #include <vector> 0019 0020 /* 0021 */ 0022 0023 namespace ROOT { 0024 0025 namespace Fit { 0026 class BinData; 0027 } 0028 namespace Math { 0029 0030 0031 /** 0032 @defgroup GoFClasses Goodness of Fit Tests 0033 Classical one-dimensional goodness of git tests for unbinned data. 0034 ROOT provides 1 sample goodness of fit test (comparison of data with a theoretical distribution) and 0035 2-sample test (comparison of two data sets) through the class ROOT::Math::GoFTest 0036 The algorithms provided are the Kolmogorov-Smirnov and Anderson-Darling. 0037 These tests could be applied approximately also to binned data, assuming the bin size is much smaller than the intrinsic 0038 data variations. It is assumed than a bin is like many data at the same bin center value. 0039 For these binned version tests look at `TH1::KolmogorovTest` and `TH1::AndersonDarlingTest` 0040 @ingroup MathCore 0041 */ 0042 0043 /** 0044 * GoFTest class implementing the 1 sample and 2 sample goodness of fit tests 0045 * for uni-variate distributions and data. 0046 * The class implements the AndersonDarling and the KolmogorovSmirnov tests 0047 * 0048 * In the case of the 1-sample test the user needs to provide: 0049 * - input data 0050 * - theoretical distribution. The distribution can be provided as a function object (functor) or an object implementing 0051 * the `ROOT::Math::IGenFunction` interface. One can provide either the PDF (default) of the CDF (cumulative distribution) 0052 * One can also provide a pre-defined function. In that case one needs to give also the distribution parameters otherwise the default values will be used. 0053 * The pre-defined distributions are: 0054 * - kGaussian with default parameter mean=0, sigma=1 0055 * - kExponential with default parameter rate=1 0056 * - kLogNormal with default parameter meanlog=0, sigmalog=1 0057 * 0058 * Note that one should not use data computed distribution parameters, otherwise the test will be biased. 0059 * The 1-sample KS test using data computed quantities is called Lilliefors test (see https://en.wikipedia.org/wiki/Lilliefors_test) 0060 * 0061 * @ingroup GoFClasses 0062 */ 0063 0064 0065 class GoFTest { 0066 public: 0067 0068 /// H0 distributions for using only with 1-sample tests. 0069 /// One should provide the distribution parameters otherwise the default values will be used 0070 enum EDistribution { 0071 kUndefined, /// Default value for non templated 1-sample test. Set with SetDistribution 0072 kUserDefined, /// For internal use only within the class's template constructor 0073 kGaussian, /// Gaussian distribution with default mean=0, sigma=1 0074 kLogNormal, /// Lognormal distribution with default meanlog=0, sigmalog=1 0075 kExponential /// Exponential distribution with default rate=1 0076 }; 0077 0078 /// User input distribution option 0079 enum EUserDistribution { 0080 kCDF, /// Input distribution is a CDF : cumulative distribution function 0081 kPDF /// Input distribution is a PDF (Default value) 0082 }; 0083 0084 /// Goodness of Fit test types for using with the class's unary functions as a shorthand for the in-built methods 0085 enum ETestType { 0086 kAD, /// Anderson-Darling Test. Default value 0087 kAD2s, /// Anderson-Darling 2-Samples Test 0088 kKS, /// Kolmogorov-Smirnov Test 0089 kKS2s /// Kolmogorov-Smirnov 2-Samples Test 0090 }; 0091 0092 /// Constructor for 2-samples tests 0093 GoFTest(size_t sample1Size, const Double_t* sample1, size_t sample2Size, const Double_t* sample2); 0094 0095 /// Constructor for 1-sample tests with a specified distribution. 0096 /// If a specific distribution is not specified it can be set later using SetDistribution. 0097 GoFTest(size_t sampleSize, const Double_t* sample, EDistribution dist = kUndefined, const std::vector<double> & distParams = {}); 0098 0099 /// Templated constructor for 1-sample tests with a user specified distribution as a functor object implementing `double operator()(double x)`. 0100 template<class Dist> 0101 GoFTest(size_t sampleSize, const Double_t* sample, Dist& dist, EUserDistribution userDist = kPDF, 0102 Double_t xmin = 1, Double_t xmax = 0) 0103 { 0104 Instantiate(sample, sampleSize); 0105 SetUserDistribution<Dist>(dist, userDist, xmin, xmax); 0106 } 0107 0108 /// Constructor for 1-sample tests with a user specified distribution implementing the ROOT::Math::IGenFunction interface. 0109 GoFTest(size_t sampleSize, const Double_t* sample, const IGenFunction& dist, EUserDistribution userDist = kPDF, 0110 Double_t xmin = 1, Double_t xmax = 0) 0111 { 0112 Instantiate(sample, sampleSize); 0113 SetUserDistribution(dist, userDist, xmin, xmax); 0114 } 0115 0116 /// Sets the user input distribution function for 1-sample test as a generic functor object. 0117 template<class Dist> 0118 void SetUserDistribution(Dist& dist, EUserDistribution userDist = kPDF, Double_t xmin = 1, Double_t xmax = 0) { 0119 WrappedFunction<Dist&> wdist(dist); 0120 SetDistributionFunction(wdist, userDist, xmin, xmax); 0121 } 0122 0123 /// Sets the user input distribution function for 1-sample test using the ROOT::Math::IGenFunction interface. 0124 void SetUserDistribution(const IGenFunction& dist, GoFTest::EUserDistribution userDist = kPDF, Double_t xmin = 1, Double_t xmax = 0) { 0125 SetDistributionFunction(dist, userDist, xmin, xmax); 0126 } 0127 0128 /// Sets the user input distribution as a probability density function for 1-sample tests. 0129 template<class Dist> 0130 void SetUserPDF(Dist& pdf, Double_t xmin = 1, Double_t xmax = 0) { 0131 SetUserDistribution<Dist>(pdf, kPDF, xmin, xmax); 0132 } 0133 0134 /// Specialization to set the user input distribution as a probability density function for 1-sample tests using the ROOT::Math::IGenFunction interface. 0135 void SetUserPDF(const IGenFunction& pdf, Double_t xmin = 1, Double_t xmax = 0) { 0136 SetUserDistribution(pdf, kPDF, xmin, xmax); 0137 } 0138 0139 /// Sets the user input distribution as a cumulative distribution function for 1-sample tests. 0140 /// The CDF must return zero for x=xmin and 1 for x=xmax. 0141 template<class Dist> 0142 void SetUserCDF(Dist& cdf, Double_t xmin = 1, Double_t xmax = 0) { 0143 SetUserDistribution<Dist>(cdf, kCDF, xmin, xmax); 0144 } 0145 0146 /// Specialization to set the user input distribution as a cumulative distribution function for 1-sample tests. 0147 void SetUserCDF(const IGenFunction& cdf, Double_t xmin = 1, Double_t xmax = 0) { 0148 SetUserDistribution(cdf, kCDF, xmin, xmax); 0149 } 0150 0151 0152 /// Sets the distribution for the predefined distribution types and optionally its parameters for 1-sample tests. 0153 void SetDistribution(EDistribution dist, const std::vector<double> & distParams = {}); 0154 0155 0156 virtual ~GoFTest(); 0157 0158 /// Performs the Anderson-Darling 2-Sample Test. 0159 /// The Anderson-Darling K-Sample Test algorithm is described and taken from 0160 /// http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/andeksam.htm 0161 /// and from 0162 /// (1) Scholz F.W., Stephens M.A. (1987), K-sample Anderson-Darling Tests, Journal of the American Statistical Association, 82, 918–924. 0163 /// (2-samples variant implemented). 0164 void AndersonDarling2SamplesTest(Double_t& pvalue, Double_t& testStat) const; 0165 0166 /// Anderson-Darling 2-Sample Test. 0167 /// Returns by default the p-value; when using option "t" returns the test statistic value "A2". 0168 Double_t AndersonDarling2SamplesTest(const Char_t* option = "p") const; 0169 0170 /** 0171 Performs the Anderson-Darling 1-Sample Test. 0172 The Anderson-Darling 1-Sample Test algorithm for a specific distribution is described at 0173 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/andedarl.htm 0174 and described and taken from (2) 0175 Marsaglia J.C.W., Marsaglia G. (2004), Evaluating the Anderson-Darling Distribution, Journal of Statistical Software, Volume 09, Issue i02. 0176 and described and taken from (3) 0177 Lewis P.A.W. (1961), The Annals of Mathematical Statistics, Distribution of the Anderson-Darling Statistic, Volume 32, Number 4, 1118-1124. 0178 */ 0179 void AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const; 0180 0181 /// Anderson-Darling 2-Sample Test. 0182 /// Returns default p-value; option "t" returns the test statistic value "A2" 0183 Double_t AndersonDarlingTest(const Char_t* option = "p") const; 0184 0185 /** 0186 * @brief Kolmogorov-Smirnov 2-Samples Test. 0187 The Kolmogorov-Smirnov 2-Samples Test algorithm is described at 0188 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ks2samp.htm 0189 and described and taken from 0190 http://root.cern.ch/root/html/TMath.html#TMath:KolmogorovTest 0191 */ 0192 void KolmogorovSmirnov2SamplesTest(Double_t& pvalue, Double_t& testStat) const; 0193 0194 /// Kolmogorov-Smirnov 2-Samples Test. 0195 /// Returns by default the p-value; option "t" returns the test statistic value "Dn". 0196 Double_t KolmogorovSmirnov2SamplesTest(const Char_t* option = "p") const; 0197 0198 /** 0199 * @brief Kolmogorov-Smirnov 1-Sample Test. 0200 * 0201 The Kolmogorov-Smirnov 1-Sample Test algorithm for a specific distribution is described at 0202 http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/kstest.htm 0203 and described and taken from (4) 0204 Press W. H., Teukolsky S.A., Vetterling W.T., Flannery B.P. (2007), Numerical Recipes - 0205 The Art of Scientific Computing (Third Edition), Cambridge University Press 0206 */ 0207 void KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const; 0208 0209 /// Kolmogorov-Smirnov 1-Sample Test. 0210 /// Returns default p-value; option "t" returns the test statistic value "Dn". 0211 Double_t KolmogorovSmirnovTest(const Char_t* option = "p") const; 0212 0213 /// The class's unary functions performing the gif test according to the ETestType provided. 0214 void operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const; 0215 0216 /// Returns default Anderson Darling 1-Sample Test and default p-value; option "t" returns the test statistic value 0217 /// specific to the test type. 0218 Double_t operator()(ETestType test = kAD, const Char_t* option = "p") const; 0219 0220 /// Computation of the K-Sample Anderson-Darling Test's p-value as described in (1) 0221 // given a normalized test statistic. The first variant described in the paper is used. 0222 static Double_t PValueADKSamples(size_t nsamples, Double_t A2 ); 0223 0224 /// Compute the 2-Sample Anderson Darling test for binned data 0225 /// assuming equal data are present at the bin center values. 0226 /// Used by `TH1::AndersonDarling` 0227 static void AndersonDarling2SamplesTest(const ROOT::Fit::BinData & data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat); 0228 0229 private: 0230 0231 GoFTest(); ///< Disallowed default constructor 0232 GoFTest(GoFTest& gof); ///< Disallowed copy constructor 0233 GoFTest operator=(GoFTest& gof); ///< Disallowed assign operator 0234 0235 std::unique_ptr<IGenFunction> fCDF; ///< Pointer to CDF used in 1-sample test 0236 0237 0238 EDistribution fDist; ///< Type of distribution 0239 std::vector<Double_t> fParams; ///< The distribution parameters (e.g. fParams[0] = mean, fParams[1] = sigma for a Gaussian) 0240 0241 std::vector<Double_t> fCombinedSamples; ///< The combined data 0242 0243 std::vector<std::vector<Double_t> > fSamples; ///< The input data 0244 0245 Bool_t fTestSampleFromH0; 0246 0247 void SetCDF(); 0248 void SetDistributionFunction(const IGenFunction& cdf, Bool_t isPDF, Double_t xmin, Double_t xmax); 0249 0250 void Instantiate(const Double_t* sample, size_t sampleSize); 0251 0252 0253 Double_t LogNormalCDF(Double_t x) const; 0254 Double_t GaussianCDF(Double_t x) const; 0255 Double_t ExponentialCDF(Double_t x) const; 0256 0257 /// Computation of sigma_N as described in (1) 0258 static Double_t GetSigmaN(const std::vector<size_t> & ns, size_t N); 0259 0260 /// Linear interpolation used in GoFTest::PValueAD2Samples 0261 static Double_t InterpolatePValues(int nsamples,Double_t A2); 0262 0263 /// Computation of the 1-Sample Anderson-Darling Test's p-value 0264 Double_t PValueAD1Sample(Double_t A2) const; 0265 0266 /// Applies the logarithm to the sample when the specified distribution to test is LogNormal 0267 void LogSample(); 0268 0269 /// set a vector of samples 0270 void SetSamples(std::vector<const Double_t*> samples, const std::vector<size_t> samplesSizes); 0271 0272 /// Sets the distribution parameters 0273 void SetParameters(const std::vector<double> & params); 0274 0275 }; // end GoFTest class 0276 0277 0278 } // ROOT namespace 0279 } // Math namespace 0280 #endif
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |