root/TMVA/RuleFitParams.h

0001 // @(#)root/tmva $Id$
0002 // Author: Andreas Hoecker, Joerg Stelzer, Fredrik Tegenfeldt, Helge Voss
0003
0004 /**********************************************************************************
0005  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
0006  * Package: TMVA                                                                  *
0007  * Class  : RuleFitParams                                                         *
0008  *                                             *
0009  *                                                                                *
0010  * Description:                                                                   *
0011  *      A class doing the actual fitting of a linear model using rules as         *
0012  *      base functions.                                                           *
0013  *      Reference paper: 1.Gradient Directed Regularization                       *
0014  *                         Friedman, Popescu, 2004                                *
0015  *                       2.Predictive Learning with Rule Ensembles                *
0016  *                         Friedman, Popescu, 2005                                *
0017  *                                                                                *
0018  *                                                                                *
0019  * Authors (alphabetical):                                                        *
0020  *      Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA      *
0021  *      Helge Voss         <Helge.Voss@cern.ch>         - MPI-KP Heidelberg, Ger. *
0022  *                                                                                *
0023  * Copyright (c) 2005:                                                            *
0024  *      CERN, Switzerland                                                         *
0025  *      Iowa State U.                                                             *
0026  *      MPI-K Heidelberg, Germany                                                 *
0027  *                                                                                *
0028  * Redistribution and use in source and binary forms, with or without             *
0029  * modification, are permitted according to the terms listed in LICENSE           *
0030  * (see tmva/doc/LICENSE)                                          *
0031  **********************************************************************************/
0032
0033 #ifndef ROOT_TMVA_RuleFitParams
0034 #define ROOT_TMVA_RuleFitParams
0035
0036 #include "TMathBase.h"
0037
0038 #include "TMVA/Event.h"
0039
0040 #include <vector>
0041
0042 class TTree;
0043
0044 namespace TMVA {
0045
0046    class RuleEnsemble;
0047    class MsgLogger;
0048    class RuleFit;
0049    class RuleFitParams {
0050
0051    public:
0052
0053       RuleFitParams();
0054       virtual ~RuleFitParams();
0055
0056       void Init();
0057
0058       // set message type
0059       void SetMsgType( EMsgType t );
0060
0061       // set RuleFit ptr
0062       void SetRuleFit( RuleFit *rf )    { fRuleFit = rf; }
0063       //
0064       // GD path: set N(path steps)
0065       void SetGDNPathSteps( Int_t np )  { fGDNPathSteps = np; }
0066
0067       // GD path: set path step size
0068       void SetGDPathStep( Double_t s )  { fGDPathStep = s; }
0069
0070       // GD path: set tau search range
0071       void SetGDTauRange( Double_t t0, Double_t t1 )
0072       {
0073          fGDTauMin = (t0>1.0 ? 1.0:(t0<0.0 ? 0.0:t0));
0074          fGDTauMax = (t1>1.0 ? 1.0:(t1<0.0 ? 0.0:t1));
0075          if (fGDTauMax<fGDTauMin) fGDTauMax = fGDTauMin;
0076       }
0077
0078       // GD path: set number of steps in tau search range
0079       void SetGDTauScan( UInt_t n )        { fGDTauScan = n; }
0080
0081       // GD path: set tau
0082       void SetGDTau( Double_t t ) { fGDTau = t; }
0083
0084
0085       void SetGDErrScale( Double_t s ) { fGDErrScale = s; }
0086       void SetGDTauPrec( Double_t p )  { fGDTauPrec=p; CalcGDNTau(); fGDTauVec.resize(fGDNTau); }
0087
0088       // return type such that +1 = signal and -1 = background
0089       Int_t Type( const Event * e ) const; // return (fRuleFit->GetMethodRuleFit()->DataInfo().IsSignal(e) ? 1:-1); }
0090       //
0091       UInt_t                            GetPathIdx1() const { return fPathIdx1; }
0092       UInt_t                            GetPathIdx2() const { return fPathIdx2; }
0093       UInt_t                            GetPerfIdx1() const { return fPerfIdx1; }
0094       UInt_t                            GetPerfIdx2() const { return fPerfIdx2; }
0095
0096       // Loss function; Huber loss eq 33
0097       Double_t LossFunction( const Event& e ) const;
0098
0099       // same but using evt idx (faster)
0100       Double_t LossFunction( UInt_t evtidx ) const;
0101       Double_t LossFunction( UInt_t evtidx, UInt_t itau ) const;
0102
0103       // Empirical risk
0104       Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const;
0105       Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff, UInt_t itau) const;
0106
0107       // Risk evaluation for fPathIdx and fPerfInd
0108       Double_t RiskPath() const { return Risk(fPathIdx1,fPathIdx2,fNEveEffPath); }
0109       Double_t RiskPerf() const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf); }
0110       Double_t RiskPerf( UInt_t itau ) const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf,itau); }
0111
0112       // Risk evaluation for all tau
0113       UInt_t RiskPerfTst();
0114
0115       // Penalty function; Lasso function (eq 8)
0116       Double_t Penalty() const;
0117
0118       // initialize GD path
0119       void InitGD();
0120
0121       // find best tau and return the number of scan steps used
0122       Int_t FindGDTau();
0123
0124       // make path for binary classification (squared-error ramp, sect 6 in ref 1)
0125       void MakeGDPath();
0126
0127    protected:
0128
0129       // typedef of an Event const iterator
0130       typedef std::vector<const TMVA::Event *>::const_iterator  EventItr;
0131
0132       // init ntuple
0133       void InitNtuple();
0134
0135       // calculate N(tau) in scan - limit to 100000.
0136       void CalcGDNTau()  { fGDNTau = static_cast<UInt_t>(1.0/fGDTauPrec)+1; if (fGDNTau>100000) fGDNTau=100000; }
0137
0138       // fill ntuple with coefficient info
0139       void FillCoefficients();
0140
0141       // estimate the optimum scoring function
0142       void CalcFStar();
0143
0144       // estimate of binary error rate
0145       Double_t ErrorRateBin();
0146
0147       // estimate of scale average error rate
0148       Double_t ErrorRateReg();
0149
0150       // estimate 1-area under ROC
0151       Double_t ErrorRateRocRaw( std::vector<Double_t> & sFsig, std::vector<Double_t> & sFbkg );
0152       Double_t ErrorRateRoc();
0153       void     ErrorRateRocTst();
0154
0155       // estimate optimism
0156       Double_t Optimism();
0157
0158       // make gradient vector (eq 44 in ref 1)
0159       void MakeGradientVector();
0160
0161       // Calculate the direction in parameter space (eq 25, ref 1) and update coeffs (eq 22, ref 1)
0162       void UpdateCoefficients();
0163
0164       // calculate average of responses of F
0165       Double_t CalcAverageResponse();
0166       Double_t CalcAverageResponseOLD();
0167
0168       // calculate average of true response (initial estimate of a0)
0169       Double_t CalcAverageTruth();
0170
0171       // calculate the average of each variable over the range
0172       void EvaluateAverage(UInt_t ind1, UInt_t ind2,
0173                            std::vector<Double_t> &avsel,
0174                            std::vector<Double_t> &avrul);
0175
0176       // evaluate using fPathIdx1,2
0177       void EvaluateAveragePath() { EvaluateAverage( fPathIdx1, fPathIdx2, fAverageSelectorPath, fAverageRulePath ); }
0178
0179       // evaluate using fPerfIdx1,2
0180       void EvaluateAveragePerf() { EvaluateAverage( fPerfIdx1, fPerfIdx2, fAverageSelectorPerf, fAverageRulePerf ); }
0181
0182       // the same as above but for the various tau
0183       void MakeTstGradientVector();
0184       void UpdateTstCoefficients();
0185       void CalcTstAverageResponse();
0186
0187
0188       RuleFit             * fRuleFit;      ///< rule fit
0189       RuleEnsemble        * fRuleEnsemble; ///< rule ensemble
0190       //
0191       UInt_t                fNRules;       ///< number of rules
0192       UInt_t                fNLinear;      ///< number of linear terms
0193       //
0194       // Event indices for path/validation - TODO: should let the user decide
0195       // Now it is just a simple one-fold cross validation.
0196       //
0197       UInt_t                fPathIdx1;       ///< first event index for path search
0198       UInt_t                fPathIdx2;       ///< last event index for path search
0199       UInt_t                fPerfIdx1;       ///< first event index for performance evaluation
0200       UInt_t                fPerfIdx2;       ///< last event index for performance evaluation
0201       Double_t              fNEveEffPath;    ///< sum of weights for Path events
0202       Double_t              fNEveEffPerf;    ///< idem for Perf events
0203
0204       std::vector<Double_t> fAverageSelectorPath; ///< average of each variable over the range fPathIdx1,2
0205       std::vector<Double_t> fAverageRulePath;     ///< average of each rule, same range
0206       std::vector<Double_t> fAverageSelectorPerf; ///< average of each variable over the range fPerfIdx1,2
0207       std::vector<Double_t> fAverageRulePerf;     ///< average of each rule, same range
0208
0209       std::vector<Double_t> fGradVec;        ///< gradient vector - dimension = number of rules in ensemble
0210       std::vector<Double_t> fGradVecLin;     ///< gradient vector - dimension = number of variables
0211
0212       std::vector< std::vector<Double_t> > fGradVecTst;    ///< gradient vector - one per tau
0213       std::vector< std::vector<Double_t> > fGradVecLinTst; ///< gradient vector, linear terms - one per tau
0214       //
0215       std::vector<Double_t> fGDErrTst;     ///< error rates per tau
0216       std::vector<Char_t>   fGDErrTstOK;   ///< error rate is sufficiently low <--- stores boolean
0217       std::vector< std::vector<Double_t> > fGDCoefTst;    ///< rule coeffs - one per tau
0218       std::vector< std::vector<Double_t> > fGDCoefLinTst; ///< linear coeffs - one per tau
0219       std::vector<Double_t> fGDOfsTst;       ///< offset per tau
0220       std::vector< Double_t > fGDTauVec;     ///< the tau's
0221       UInt_t                fGDNTauTstOK;    ///< number of tau in the test-phase that are ok
0222       UInt_t                fGDNTau;         ///< number of tau-paths - calculated in SetGDTauPrec
0223       Double_t              fGDTauPrec;      ///< precision in tau
0224       UInt_t                fGDTauScan;      ///< number scan for tau-paths
0225       Double_t              fGDTauMin;       ///< min threshold parameter (tau in eq 26, ref 1)
0226       Double_t              fGDTauMax;       ///< max threshold parameter (tau in eq 26, ref 1)
0227       Double_t              fGDTau;          ///< selected threshold parameter (tau in eq 26, ref 1)
0228       Double_t              fGDPathStep;     ///< step size along path (delta nu in eq 22, ref 1)
0229       Int_t                 fGDNPathSteps;   ///< number of path steps
0230       Double_t              fGDErrScale;     ///< stop scan at error = scale*errmin
0231       //
0232       Double_t              fAverageTruth;   ///< average truth, ie sum(y)/N, y=+-1
0233       //
0234       std::vector<Double_t> fFstar;          ///< vector of F*() - filled in CalcFStar()
0235       Double_t              fFstarMedian;    ///< median value of F*() using
0236       //
0237       TTree                *fGDNtuple;       ///< Gradient path ntuple, contains params for each step along the path
0238       Double_t              fNTRisk;         ///< GD path: risk
0239       Double_t              fNTErrorRate;    ///< GD path: error rate (or performance)
0240       Double_t              fNTNuval;        ///< GD path: value of nu
0241       Double_t              fNTCoefRad;      ///< GD path: 'radius' of all rulecoeffs
0242       Double_t              fNTOffset;       ///< GD path: model offset
0243       Double_t             *fNTCoeff;        ///< GD path: rule coefficients
0244       Double_t             *fNTLinCoeff;     ///< GD path: linear coefficients
0245
0246       Double_t              fsigave;         ///< Sigma of current signal score function F(sig)
0247       Double_t              fsigrms;         ///< Rms of F(sig)
0248       Double_t              fbkgave;         ///< Average of F(bkg)
0249       Double_t              fbkgrms;         ///< Rms of F(bkg)
0250
0251    private:
0252
0253       mutable MsgLogger*    fLogger;         ///<! message logger
0254       MsgLogger& Log() const { return *fLogger; }
0255
0256    };
0257
0258    // --------------------------------------------------------
0259
0260    class AbsValue {
0261
0262    public:
0263
0264       Bool_t operator()( Double_t first, Double_t second ) const { return TMath::Abs(first) < TMath::Abs(second); }
0265    };
0266 }
0267
0268
0269 #endif