Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-30 10:22:53

0001 // @(#)root/tmva $Id$
0002 // Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Jan Therhaag
0003 
0004 /**********************************************************************************
0005  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
0006  * Package: TMVA                                                                  *
0007  * Class  : MethodBDT  (Boosted Decision Trees)                                   *
0008  *                                                                                *
0009  *                                                                                *
0010  * Description:                                                                   *
0011  *      Analysis of Boosted Decision Trees                                        *
0012  *                                                                                *
0013  * Authors (alphabetical):                                                        *
0014  *      Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland              *
0015  *      Helge Voss      <Helge.Voss@cern.ch>     - MPI-K Heidelberg, Germany      *
0016  *      Kai Voss        <Kai.Voss@cern.ch>       - U. of Victoria, Canada         *
0017  *      Doug Schouten   <dschoute@sfu.ca>        - Simon Fraser U., Canada        *
0018  *      Jan Therhaag    <jan.therhaag@cern.ch>   - U. of Bonn, Germany            *
0019  *                                                                                *
0020  * Copyright (c) 2005-2011:                                                       *
0021  *      CERN, Switzerland                                                         *
0022  *      U. of Victoria, Canada                                                    *
0023  *      MPI-K Heidelberg, Germany                                                 *
0024  *      U. of Bonn, Germany                                                       *
0025  *                                                                                *
0026  * Redistribution and use in source and binary forms, with or without             *
0027  * modification, are permitted according to the terms listed in LICENSE           *
0028  * (see tmva/doc/LICENSE)                                                         *
0029  **********************************************************************************/
0030 
0031 #ifndef ROOT_TMVA_MethodBDT
0032 #define ROOT_TMVA_MethodBDT
0033 
0034 //////////////////////////////////////////////////////////////////////////
0035 //                                                                      //
0036 // MethodBDT                                                            //
0037 //                                                                      //
0038 // Analysis of Boosted Decision Trees                                   //
0039 //                                                                      //
0040 //////////////////////////////////////////////////////////////////////////
0041 
0042 #include <vector>
0043 #include <memory>
0044 #include <map>
0045 
0046 #include "TH2.h"
0047 #include "TTree.h"
0048 #include "TMVA/MethodBase.h"
0049 #include "TMVA/DecisionTree.h"
0050 #include "TMVA/Event.h"
0051 #include "TMVA/LossFunction.h"
0052 
0053 // Multithreading only if the compilation flag is turned on
0054 #ifdef R__USE_IMT
0055 #include <ROOT/TThreadExecutor.hxx>
0056 #include "TSystem.h"
0057 #endif
0058 
0059 namespace TMVA {
0060 
0061    class SeparationBase;
0062 
0063    class MethodBDT : public MethodBase {

0065    public:

0067       // constructor for training and reading
0068       MethodBDT( const TString& jobName,
0069                  const TString& methodTitle,
0070                  DataSetInfo& theData,
0071                  const TString& theOption = "");

0073       // constructor for calculating BDT-MVA using previously generated decision trees
0074       MethodBDT( DataSetInfo& theData,
0075                  const TString& theWeightFile);

0077       virtual ~MethodBDT( void );

0079       virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets ); ///< check whether the method supports the given analysis type with the given number of classes/targets


0082       // write all Events from the Tree into a vector of Events, that are
0083       // more easily manipulated
0084       void InitEventSample();

0086       // optimize tuning parameters
0087       virtual std::map<TString,Double_t> OptimizeTuningParameters(TString fomType="ROCIntegral", TString fitType="FitGA");
0088       virtual void SetTuneParameters(std::map<TString,Double_t> tuneParameters);

0090       // training method
0091       void Train( void );

0093       // revoke training
0094       void Reset( void );

0096       using MethodBase::ReadWeightsFromStream;

0098       // write weights to file
0099       void AddWeightsXMLTo( void* parent ) const;

0101       // read weights from file
0102       void ReadWeightsFromStream( std::istream& istr );
0103       void ReadWeightsFromXML(void* parent);

0105       // write method specific histos to target file
0106       void WriteMonitoringHistosToFile( void ) const;

0108       // calculate the MVA value
0109       Double_t GetMvaValue( Double_t* err = nullptr, Double_t* errUpper = nullptr);

0111       // get the actual forest size (might be less than fNTrees, the requested one, if boosting is stopped early)
0112       UInt_t   GetNTrees() const {return fForest.size();}
0113    private:

0115       Double_t GetMvaValue( Double_t* err, Double_t* errUpper, UInt_t useNTrees );                              ///< MVA value using only the first useNTrees trees of the forest
0116       Double_t PrivateGetMvaValue( const TMVA::Event *ev, Double_t* err=nullptr, Double_t* errUpper=nullptr, UInt_t useNTrees=0 ); ///< worker computing the MVA value for a given event
0117       void     BoostMonitor(Int_t iTree);                                                                       ///< fill boost-monitoring information for tree number iTree

0119    public:
0120       const std::vector<Float_t>& GetMulticlassValues();          ///< multiclass MVA response (one value per class)

0122       // regression response
0123       const std::vector<Float_t>& GetRegressionValues();

0125       // apply the boost algorithm to a tree in the collection
0126       Double_t Boost( std::vector<const TMVA::Event*>&, DecisionTree *dt, UInt_t cls = 0);

0128       // ranking of input variables
0129       const Ranking* CreateRanking();

0131       // the option handling methods
0132       void DeclareOptions();
0133       void ProcessOptions();
0134       void SetMaxDepth(Int_t d){fMaxDepth = d;}
0135       void SetMinNodeSize(Double_t sizeInPercent);
0136       void SetMinNodeSize(TString sizeInPercent);

0138       void SetNTrees(Int_t d){fNTrees = d;}
0139       void SetAdaBoostBeta(Double_t b){fAdaBoostBeta = b;}
0140       void SetNodePurityLimit(Double_t l){fNodePurityLimit = l;}
0141       void SetShrinkage(Double_t s){fShrinkage = s;}
0142       void SetUseNvars(Int_t n){fUseNvars = n;}
0143       void SetBaggedSampleFraction(Double_t f){fBaggedSampleFraction = f;}


0146       // get the forest
0147       inline const std::vector<TMVA::DecisionTree*> & GetForest() const;

0149       // get the training event sample
0150       inline const std::vector<const TMVA::Event*> & GetTrainingEvents() const;

0152       inline const std::vector<double> & GetBoostWeights() const;     ///< get the boost weights applied to the individual trees

0154       // return the individual relative variable importance
0155       std::vector<Double_t> GetVariableImportance();
0156       Double_t GetVariableImportance(UInt_t ivar);

0158       Double_t TestTreeQuality( DecisionTree *dt );                   ///< quality measure of a single tree, evaluated on the validation sample

0160       // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
0161       void MakeClassSpecific( std::ostream&, const TString& ) const;

0163       // header and auxiliary classes
0164       void MakeClassSpecificHeader( std::ostream&, const TString& ) const;

0166       void MakeClassInstantiateNode( DecisionTreeNode *n, std::ostream& fout,
0167                                      const TString& className ) const;

0169       void GetHelpMessage() const;

0171    protected:
0172       void DeclareCompatibilityOptions();

0174    private:
0175       // Init used in the various constructors
0176       void Init( void );

0178       void PreProcessNegativeEventWeights();

0180       // boosting algorithm (adaptive boosting)
0181       Double_t AdaBoost( std::vector<const TMVA::Event*>&, DecisionTree *dt );

0183       // boosting algorithm (adaptive boosting with cost matrix)
0184       Double_t AdaCost( std::vector<const TMVA::Event*>&, DecisionTree *dt );

0186       // boosting as a random re-weighting
0187       Double_t Bagging( );

0189       // boosting special for regression
0190       Double_t RegBoost( std::vector<const TMVA::Event*>&, DecisionTree *dt );

0192       // adaboost adapted to regression
0193       Double_t AdaBoostR2( std::vector<const TMVA::Event*>&, DecisionTree *dt );

0195       // binomial likelihood gradient boost for classification
0196       // (see Friedman: "Greedy Function Approximation: a Gradient Boosting Machine"
0197       // Technical report, Dept. of Statistics, Stanford University)
0198       Double_t GradBoost( std::vector<const TMVA::Event*>&, DecisionTree *dt, UInt_t cls = 0);
0199       Double_t GradBoostRegression(std::vector<const TMVA::Event*>&, DecisionTree *dt );
0200       void InitGradBoost( std::vector<const TMVA::Event*>&);
0201       void UpdateTargets( std::vector<const TMVA::Event*>&, UInt_t cls = 0);
0202       void UpdateTargetsRegression( std::vector<const TMVA::Event*>&,Bool_t first=kFALSE);
0203       Double_t GetGradBoostMVA(const TMVA::Event *e, UInt_t nTrees);
0204       void     GetBaggedSubSample(std::vector<const TMVA::Event*>&);

0206       std::vector<const TMVA::Event*>       fEventSample;      ///< the training events
0207       std::vector<const TMVA::Event*>       fValidationSample; ///< the Validation events
0208       std::vector<const TMVA::Event*>       fSubSample;        ///< subsample for bagged grad boost
0209       std::vector<const TMVA::Event*>      *fTrainSample;      ///< pointer to sample actually used in training (fEventSample or fSubSample) for example

0211       Int_t                           fNTrees;            ///< number of decision trees requested
0212       std::vector<DecisionTree*>      fForest;            ///< the collection of decision trees
0213       std::vector<double>             fBoostWeights;      ///< the weights applied in the individual boosts
0214       Double_t                        fSigToBkgFraction;  ///< Signal to Background fraction assumed during training
0215       TString                         fBoostType;         ///< string specifying the boost type
0216       Double_t                        fAdaBoostBeta;      ///< beta parameter for AdaBoost algorithm
0217       TString                         fAdaBoostR2Loss;    ///< loss type used in AdaBoostR2 (Linear,Quadratic or Exponential)
0218       //Double_t                        fTransitionPoint; ///< break-down point for gradient regression
0219       Double_t                        fShrinkage;         ///< learning rate for gradient boost;
0220       Bool_t                          fBaggedBoost;       ///< turn bagging in combination with boost on/off
0221       Bool_t                          fBaggedGradBoost;   ///< turn bagging in combination with grad boost on/off
0222       //Double_t                        fSumOfWeights;    ///< sum of all event weights
0223       //std::map< const TMVA::Event*, std::pair<Double_t, Double_t> >       fWeightedResiduals;   ///< weighted regression residuals
0224       std::map< const TMVA::Event*, LossFunctionEventInfo>                fLossFunctionEventInfo; ///< map event to true value, predicted value, and weight
0225                                                                                                   /// used by different loss functions for BDT regression
0226       std::map< const TMVA::Event*,std::vector<double> > fResiduals; ///< individual event residuals for gradient boost

0228       //options for the decision Tree
0229       SeparationBase                 *fSepType;         ///< the separation used in node splitting
0230       TString                         fSepTypeS;        ///< the separation (option string) used in node splitting
0231       Int_t                           fMinNodeEvents;   ///< min number of events in node
0232       Float_t                         fMinNodeSize;     ///< min percentage of training events in node
0233       TString                         fMinNodeSizeS;    ///< string containing min percentage of training events in node

0235       Int_t                           fNCuts;               ///< grid used in cut applied in node splitting
0236       Bool_t                          fUseFisherCuts;       ///< use multivariate splits using the Fisher criterium
0237       Double_t                        fMinLinCorrForFisher; ///< the minimum linear correlation between two variables demanded for use in fisher criterium in node splitting
0238       Bool_t                          fUseExclusiveVars;    ///< individual variables already used in fisher criterium are not anymore analysed individually for node splitting
0239       Bool_t                          fUseYesNoLeaf;        ///< use sig or bkg classification in leave nodes or sig/bkg
0240       Double_t                        fNodePurityLimit;     ///< purity limit for sig/bkg nodes
0241       UInt_t                          fNNodesMax;           ///< max # of nodes
0242       UInt_t                          fMaxDepth;            ///< max depth

0244       DecisionTree::EPruneMethod       fPruneMethod;       ///< method used for pruning
0245       TString                          fPruneMethodS;      ///< prune method option String
0246       Double_t                         fPruneStrength;     ///< a parameter to set the "amount" of pruning..needs to be adjusted
0247       Double_t                         fFValidationEvents; ///< fraction of events to use for pruning
0248       Bool_t                           fAutomatic;         ///< use user given prune strength or automatically determined one using a validation sample
0249       Bool_t                           fRandomisedTrees;   ///< choose a random subset of possible cut variables at each node during training
0250       UInt_t                           fUseNvars;          ///< the number of variables used in the randomised tree splitting
0251       Bool_t                           fUsePoissonNvars;   ///< use "fUseNvars" not as fixed number but as mean of a poisson distr. in each split
0252       UInt_t                           fUseNTrainEvents;   ///< number of randomly picked training events used in randomised (and bagged) trees

0254       Double_t                         fBaggedSampleFraction;   ///< relative size of bagged event sample to original sample size
0255       TString                          fNegWeightTreatment;     ///< variable that holds the option of how to treat negative event weights in training
0256       Bool_t                           fNoNegWeightsInTraining; ///< ignore negative event weights in the training
0257       Bool_t                           fInverseBoostNegWeights; ///< boost ev. with neg. weights with 1/boostweight rather than boostweight
0258       Bool_t                           fPairNegWeightsGlobal;   ///< pair ev. with neg. and pos. weights in training sample and "annihilate" them
0259       Bool_t                           fTrainWithNegWeights;    ///< yes there are negative event weights and we don't ignore them
0260       Bool_t                           fDoBoostMonitor;         ///< create control plot with ROC integral vs tree number


0263       //some histograms for monitoring
0264       TTree*                           fMonitorNtuple;   ///< monitoring ntuple
0265       Int_t                            fITree;           ///< ntuple var: ith tree
0266       Double_t                         fBoostWeight;     ///< ntuple var: boost weight
0267       Double_t                         fErrorFraction;   ///< ntuple var: misclassification error fraction

0269       Double_t                         fCss;             ///< Cost factor
0270       Double_t                         fCts_sb;          ///< Cost factor
0271       Double_t                         fCtb_ss;          ///< Cost factor
0272       Double_t                         fCbb;             ///< Cost factor

0274       Bool_t                           fDoPreselection;  ///< do or do not perform automatic pre-selection of 100% eff. cuts

0276       Bool_t                           fSkipNormalization; ///< true for skipping normalization at initialization of trees

0278       std::vector<Double_t>            fVariableImportance; ///< the relative importance of the different variables


0281       void                             DeterminePreselectionCuts(const std::vector<const TMVA::Event*>& eventSample);
0282       Double_t                         ApplyPreselectionCuts(const Event* ev);

0284       std::vector<Double_t> fLowSigCut;    ///< per-variable preselection cut values (filled by DeterminePreselectionCuts)
0285       std::vector<Double_t> fLowBkgCut;    ///< per-variable preselection cut values (filled by DeterminePreselectionCuts)
0286       std::vector<Double_t> fHighSigCut;   ///< per-variable preselection cut values (filled by DeterminePreselectionCuts)
0287       std::vector<Double_t> fHighBkgCut;   ///< per-variable preselection cut values (filled by DeterminePreselectionCuts)

0289       std::vector<Bool_t>  fIsLowSigCut;   ///< per-variable flag: corresponding preselection cut is active
0290       std::vector<Bool_t>  fIsLowBkgCut;   ///< per-variable flag: corresponding preselection cut is active
0291       std::vector<Bool_t>  fIsHighSigCut;  ///< per-variable flag: corresponding preselection cut is active
0292       std::vector<Bool_t>  fIsHighBkgCut;  ///< per-variable flag: corresponding preselection cut is active

0294       Bool_t fHistoricBool; // historic variable, only needed for "CompatibilityOptions"

0296       TString                         fRegressionLossFunctionBDTGS;       ///< the option string determining the loss function for BDT regression
0297       Double_t                        fHuberQuantile;                     ///< the quantile for the Huber Loss Function
0298                                                                           ///< in BDT regression.
0299       LossFunctionBDT* fRegressionLossFunctionBDTG;                       ///< loss function object used for BDT regression (selected via fRegressionLossFunctionBDTGS)

0301       // debugging flags
0302       static const Int_t               fgDebugLevel;     ///< debug level determining some printout/control plots etc.

0304       // for backward compatibility
0305       ClassDef(MethodBDT,0);  // Analysis of Boosted Decision Trees
0306    };
0307 
0308 } // namespace TMVA
0309 
0310 const std::vector<TMVA::DecisionTree*>& TMVA::MethodBDT::GetForest() const { return this->fForest; } // read-only view of the trained forest
0311 const std::vector<const TMVA::Event*>& TMVA::MethodBDT::GetTrainingEvents() const { return this->fEventSample; } // read-only view of the training event sample
0312 const std::vector<double>& TMVA::MethodBDT::GetBoostWeights() const { return this->fBoostWeights; } // read-only view of the per-tree boost weights
0313 
0314 #endif