root/Math/GoFTest.h

0001 // @(#)root/mathcore:$Id$
0002 // Authors: Bartolomeu Rabacal    05/2010
0003 /**********************************************************************
0004  *                                                                    *
0005  * Copyright (c) 2006 , LCG ROOT MathLib Team                         *
0006  *                                                                    *
0007  *                                                                    *
0008  **********************************************************************/
0009 // Header file for GoFTest
0010
0011 #ifndef ROOT_Math_GoFTest
0012 #define ROOT_Math_GoFTest
0013
0014 #include "Math/WrappedFunction.h"
0015 #include "TMath.h"
0016
0017 #include <memory>
0018 #include <vector>
0019
0020 /*
0021 */
0022
0023 namespace ROOT {
0024
0025    namespace Fit {
0026       class BinData;
0027    }
0028 namespace Math {
0029
0030
0031 /**
0032   @defgroup GoFClasses Goodness of Fit Tests
0033   Classical one-dimensional goodness of fit tests for unbinned data.
0034   ROOT provides 1 sample goodness of fit test (comparison of data with a theoretical distribution) and
0035   2-sample test (comparison of two data sets) through the class ROOT::Math::GoFTest
0036   The algorithms provided are the Kolmogorov-Smirnov and Anderson-Darling.
0037   These tests could be applied approximately also to binned data, assuming the bin size is much smaller than the intrinsic
0038   data variations. It is assumed that a bin is like many data at the same bin center value.
0039   For these binned version tests look at `TH1::KolmogorovTest` and `TH1::AndersonDarlingTest`
0040   @ingroup MathCore
0041  */
0042
0043 /**
0044  * GoFTest class implementing the 1 sample and 2 sample goodness of fit tests
0045  * for uni-variate distributions and data.
0046  * The class implements the AndersonDarling and the KolmogorovSmirnov tests
0047  *
0048  * In the case of the 1-sample test the user needs to  provide:
0049  *   - input data
0050  *   - theoretical distribution. The distribution can be provided as a function object (functor) or an object implementing
0051  *     the `ROOT::Math::IGenFunction` interface. One can provide either the PDF (default) or the CDF (cumulative distribution).
0052  *     One can also provide a pre-defined function. In that case one needs to give also the distribution parameters otherwise the default values will be used.
0053  *     The pre-defined distributions are:
0054  *     - kGaussian  with default parameter mean=0, sigma=1
0055  *     - kExponential with default parameter rate=1
0056  *     - kLogNormal with default parameter meanlog=0, sigmalog=1
0057  *
0058  *     Note that one should not use data computed distribution parameters, otherwise the test will be biased.
0059  *     The 1-sample KS test using data computed quantities is called Lilliefors test (see https://en.wikipedia.org/wiki/Lilliefors_test)
0060  *
0061  *  @ingroup GoFClasses
0062  */
0063
0064
0065 class GoFTest {
0066 public:
0067
0068    /// H0 distributions for using only with 1-sample tests.
0069    /// One should provide the distribution parameters otherwise the default values will be used
0070    enum EDistribution {
0071       kUndefined,       ///< Default value for non templated 1-sample test. Set with SetDistribution
0072       kUserDefined,     ///< For internal use only within the class's template constructor
0073       kGaussian,        ///< Gaussian distribution with default  mean=0, sigma=1
0074       kLogNormal,       ///< Lognormal distribution with default  meanlog=0, sigmalog=1
0075       kExponential      ///< Exponential distribution with default rate=1
0076    };
0077
0078    /// User input distribution option
0079    enum EUserDistribution {
0080       kCDF,             ///< Input distribution is a CDF : cumulative distribution function
0081       kPDF              ///< Input distribution is a PDF (Default value)
0082    };
0083
0084    /// Goodness of Fit test types for using with the class's function-call operators as a shorthand for the in-built methods
0085    enum ETestType {
0086       kAD,   ///< Anderson-Darling Test. Default value
0087       kAD2s, ///< Anderson-Darling 2-Samples Test
0088       kKS,   ///< Kolmogorov-Smirnov Test
0089       kKS2s  ///< Kolmogorov-Smirnov 2-Samples Test
0090    };
0091
0092    /// Constructor for  2-samples tests
0093    GoFTest(size_t sample1Size, const Double_t* sample1, size_t sample2Size, const Double_t* sample2);
0094
0095    /// Constructor for 1-sample tests with a specified distribution.
0096    /// If a specific distribution is not specified it can be set later using SetDistribution.
0097    GoFTest(size_t sampleSize, const Double_t* sample, EDistribution dist = kUndefined, const std::vector<double>  & distParams = {});
0098
0099    /// Templated constructor for 1-sample tests with a user specified distribution as a functor object implementing `double operator()(double x)`.
0100    template<class Dist>
0101    GoFTest(size_t sampleSize, const Double_t* sample, Dist& dist, EUserDistribution userDist = kPDF,
0102            Double_t xmin = 1, Double_t xmax = 0)
0103    {
0104       Instantiate(sample, sampleSize);
0105       SetUserDistribution<Dist>(dist, userDist, xmin, xmax);
0106    }
0107
0108    /// Constructor for 1-sample tests with a user specified distribution implementing the ROOT::Math::IGenFunction interface.
0109    GoFTest(size_t sampleSize, const Double_t* sample, const IGenFunction& dist, EUserDistribution userDist = kPDF,
0110            Double_t xmin = 1, Double_t xmax = 0)
0111    {
0112       Instantiate(sample, sampleSize);
0113       SetUserDistribution(dist, userDist, xmin, xmax);
0114    }
0115
0116    /// Sets the user input distribution function for 1-sample test as a generic functor object.
0117    template<class Dist>
0118    void SetUserDistribution(Dist& dist, EUserDistribution userDist = kPDF, Double_t xmin = 1, Double_t xmax = 0) {
0119       WrappedFunction<Dist&> wdist(dist);
0120       SetDistributionFunction(wdist, userDist, xmin, xmax);
0121    }
0122
0123    /// Sets the user input distribution function for 1-sample test using the ROOT::Math::IGenFunction interface.
0124    void SetUserDistribution(const IGenFunction& dist, GoFTest::EUserDistribution userDist = kPDF, Double_t xmin = 1, Double_t xmax = 0) {
0125       SetDistributionFunction(dist, userDist, xmin, xmax);
0126    }
0127
0128    /// Sets the user input distribution as a probability density function for 1-sample tests.
0129    template<class Dist>
0130    void SetUserPDF(Dist& pdf, Double_t xmin = 1, Double_t xmax = 0) {
0131       SetUserDistribution<Dist>(pdf, kPDF, xmin, xmax);
0132    }
0133
0134    /// Overload to set the user input distribution as a probability density function for 1-sample tests using the ROOT::Math::IGenFunction interface.
0135    void SetUserPDF(const IGenFunction& pdf, Double_t xmin = 1, Double_t xmax = 0) {
0136       SetUserDistribution(pdf, kPDF, xmin, xmax);
0137    }
0138
0139    /// Sets the user input distribution as a cumulative distribution function for 1-sample tests.
0140    /// The CDF must return zero for x=xmin and 1 for x=xmax.
0141    template<class Dist>
0142    void SetUserCDF(Dist& cdf, Double_t xmin = 1, Double_t xmax = 0) {
0143       SetUserDistribution<Dist>(cdf, kCDF, xmin, xmax);
0144    }
0145
0146    /// Overload to set the user input distribution as a cumulative distribution function for 1-sample tests using the ROOT::Math::IGenFunction interface.
0147    void SetUserCDF(const IGenFunction& cdf, Double_t xmin = 1, Double_t xmax = 0)  {
0148       SetUserDistribution(cdf, kCDF, xmin, xmax);
0149    }
0150
0151
0152    /// Sets the distribution for the predefined distribution types and optionally its parameters for 1-sample tests.
0153    void SetDistribution(EDistribution dist, const std::vector<double>  & distParams = {});
0154
0155
0156    virtual ~GoFTest();
0157
0158    /// Performs the Anderson-Darling 2-Sample Test.
0159    ///  The Anderson-Darling K-Sample Test algorithm is described and taken from
0160    ///  http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/andeksam.htm
0161    ///  and from
0162    ///   (1) Scholz F.W., Stephens M.A. (1987), K-sample Anderson-Darling Tests, Journal of the American Statistical Association, 82, 918–924.
0163    ///   (2-samples variant implemented).
0164    void AndersonDarling2SamplesTest(Double_t& pvalue, Double_t& testStat) const;
0165
0166    ///  Anderson-Darling 2-Sample Test.
0167    ///  Returns by default the p-value; when using option "t" returns the test statistic value "A2".
0168    Double_t AndersonDarling2SamplesTest(const Char_t* option = "p") const;
0169
0170    /**
0171    Performs the Anderson-Darling 1-Sample Test.
0172    The Anderson-Darling 1-Sample Test algorithm for a specific distribution is described at
0173    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/andedarl.htm
0174    and described and taken from (2)
0175    Marsaglia J.C.W., Marsaglia G. (2004), Evaluating the Anderson-Darling Distribution, Journal of Statistical Software, Volume 09, Issue i02.
0176    and described and taken from (3)
0177    Lewis P.A.W. (1961), The Annals of Mathematical Statistics, Distribution of the Anderson-Darling Statistic, Volume 32, Number 4, 1118-1124.
0178    */
0179    void AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const;
0180
0181    /// Anderson-Darling 1-Sample Test.
0182    /// Returns by default the p-value; option "t" returns the test statistic value "A2"
0183    Double_t AndersonDarlingTest(const Char_t* option = "p") const;
0184
0185    /**
0186    * @brief Kolmogorov-Smirnov 2-Samples Test.
0187    The Kolmogorov-Smirnov 2-Samples Test algorithm is described at
0188    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ks2samp.htm
0189    and described and taken from
0190    https://root.cern/doc/master/namespaceTMath.html
0191    */
0192    void KolmogorovSmirnov2SamplesTest(Double_t& pvalue, Double_t& testStat) const;
0193
0194    /// Kolmogorov-Smirnov 2-Samples Test.
0195    /// Returns by default the p-value; option "t" returns the test statistic value "Dn".
0196    Double_t KolmogorovSmirnov2SamplesTest(const Char_t* option = "p") const;
0197
0198   /**
0199    * @brief  Kolmogorov-Smirnov 1-Sample Test.
0200    *
0201      The Kolmogorov-Smirnov 1-Sample Test algorithm for a specific distribution is described at
0202      http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/kstest.htm
0203      and described and taken from (4)
0204      Press W. H., Teukolsky S.A., Vetterling W.T., Flannery B.P. (2007), Numerical Recipes -
0205      The Art of Scientific Computing (Third Edition), Cambridge University Press
0206    */
0207    void KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const;
0208
0209    /// Kolmogorov-Smirnov 1-Sample Test.
0210    /// Returns by default the p-value; option "t" returns the test statistic value "Dn".
0211    Double_t KolmogorovSmirnovTest(const Char_t* option = "p") const;
0212
0213    /// The class's function-call operator performing the goodness-of-fit test selected by the ETestType provided.
0214    void operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const;
0215
0216    /// Function-call operator defaulting to the Anderson-Darling 1-Sample Test (kAD) and to the p-value (option "p");
0217    /// option "t" returns the test statistic value specific to the test type.
0218    Double_t operator()(ETestType test = kAD, const Char_t* option = "p") const;
0219
0220    /// Computation of the K-Sample Anderson-Darling Test's p-value as described in (1)
0221    /// given a normalized test statistic. The first variant described in the paper is used.
0222    static Double_t PValueADKSamples(size_t nsamples, Double_t A2 );
0223
0224    /// Compute the 2-Sample Anderson Darling test for binned data
0225    /// assuming equal data are present at the bin center values.
0226    /// Used by `TH1::AndersonDarlingTest`
0227    static void  AndersonDarling2SamplesTest(const ROOT::Fit::BinData & data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat);
0228
0229 private:
0230
0231    GoFTest();                       ///< Disallowed default constructor
0232    GoFTest(GoFTest& gof);           ///< Disallowed copy constructor
0233    GoFTest operator=(GoFTest& gof); ///< Disallowed assignment operator
0234
0235    std::unique_ptr<IGenFunction> fCDF;  ///< Pointer to CDF used in 1-sample test
0236
0237
0238    EDistribution fDist;                ///< Type of distribution
0239    std::vector<Double_t> fParams;      ///< The distribution parameters (e.g. fParams[0] = mean, fParams[1] = sigma for a Gaussian)
0240
0241    std::vector<Double_t> fCombinedSamples;       ///< The combined data
0242
0243    std::vector<std::vector<Double_t> > fSamples;  ///< The input data
0244
0245    Bool_t fTestSampleFromH0; ///< NOTE(review): presumably flags a 1-sample test against the H0 distribution - confirm in the implementation
0246
0247    void SetCDF(); ///< NOTE(review): presumably builds fCDF from fDist and fParams - confirm in the implementation
0248    void SetDistributionFunction(const IGenFunction& cdf, Bool_t isPDF, Double_t xmin, Double_t xmax); ///< Inline callers pass an EUserDistribution as isPDF: kPDF (=1) converts to true, kCDF (=0) to false
0249
0250    void Instantiate(const Double_t* sample, size_t sampleSize); ///< Called by the inline 1-sample constructors before the user distribution is set
0251
0252
0253    Double_t LogNormalCDF(Double_t x) const;
0254    Double_t GaussianCDF(Double_t x) const;
0255    Double_t ExponentialCDF(Double_t x) const;
0256
0257    /// Computation of sigma_N as described in (1)
0258    static Double_t GetSigmaN(const std::vector<size_t> & ns, size_t N);
0259
0260    /// Linear interpolation used in GoFTest::PValueAD2Samples
0261    static Double_t InterpolatePValues(int nsamples,Double_t A2);
0262
0263    /// Computation of the 1-Sample Anderson-Darling Test's p-value
0264    Double_t PValueAD1Sample(Double_t A2) const;
0265
0266    /// Applies the logarithm to the sample when the specified distribution to test is LogNormal
0267    void LogSample();
0268
0269    /// Sets a vector of samples
0270    void SetSamples(std::vector<const Double_t*> samples, const std::vector<size_t> samplesSizes);
0271
0272    /// Sets the distribution parameters
0273    void SetParameters(const std::vector<double> & params);
0274
0275 }; // end GoFTest class
0276
0277
0278 } // Math namespace
0279 } // ROOT namespace
0280 #endif