// Author: Vladimir Ilievski

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TBatchNormLayer                                                       *
 *                                                                                *
 *                                                                                *
 * Description:                                                                   *
 *      Batch Normalization Layer Class                                           *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Vladimir Ilievski      <ilievski.vladimir@live.com>  - CERN, Switzerland  *
 *                                                                                *
 * Copyright (c) 2005-2015:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_BatchNormLayer
#define TMVA_DNN_BatchNormLayer

#include "TMVA/DNN/GeneralLayer.h"
#include "TMVA/DNN/Functions.h"

#include "TMVA/DNN/Architectures/Reference.h"

#include "TMVA/DNN/CNN/ContextHandles.h"

#include <iostream>
#include <iomanip>
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TBatchNormLayer

      Layer implementing Batch Normalization

     The inputs from each batch are normalized during training to have zero mean and unit variance,
     and they are then scaled by two parameters, different for each input variable:
      - a scale factor gamma
      - an offset beta

   In addition, a running batch mean and variance are computed and stored in the class.
   During inference the inputs are not normalized using the batch mean, but with the previously
   computed running mean and variance.
   If momentum is in [0,1) the running mean and variance are exponential averages using the momentum value
     running_mean = momentum * running_mean + (1-momentum) * batch_mean
   If instead momentum < 0 the cumulative average over the nb batches processed so far is computed
     running_mean = (nb/(nb+1)) * running_mean + (1/(nb+1)) * batch_mean

   See more at [https://arxiv.org/pdf/1502.03167v3.pdf]
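
   As an illustration only (this helper is not part of TMVA), the two update rules
   above can be sketched as:

   \code
   // hypothetical helper mirroring the documented update rules;
   // nb is the number of batches processed so far
   double UpdateRunningMean(double runningMean, double batchMean, double momentum, int nb)
   {
      if (momentum >= 0.)  // exponential moving average
         return momentum * runningMean + (1. - momentum) * batchMean;
      // momentum < 0 : cumulative average over all processed batches
      return (double(nb) / (nb + 1)) * runningMean + batchMean / (nb + 1);
   }
   \endcode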
*/
template <typename Architecture_t>
class TBatchNormLayer : public VGeneralLayer<Architecture_t> {
public:

   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Tensor_t = typename Architecture_t::Tensor_t;

   using HelperDescriptor_t  = typename Architecture_t::TensorDescriptor_t;
   using BNormDescriptors_t = typename Architecture_t::BNormDescriptors_t;


private:

   Tensor_t fDerivatives; ///< First derivatives of the activations of this layer.

   int      fNormAxis; ///< Normalization axis. For each element of this axis we will compute mean and stddev

   Scalar_t fMomentum; ///< Momentum for the running mean/variance; a negative value selects the cumulative average.
   Scalar_t fEpsilon;  ///< Small constant added to the variance for numerical stability.

   Matrix_t fMu;   ///< Mean of the current batch.
   Matrix_t fVar;  ///< Variance of the current batch.
   Matrix_t fIVar; ///< Cached inverse variance term of the current batch, used in the backward pass.

   Matrix_t fMu_Training;  ///< Running mean used at inference time.
   Matrix_t fVar_Training; ///< Running variance used at inference time.

   // cached tensor used for Cudnn to get correct shape
   Tensor_t fReshapedData;  // cached reshaped data tensor

   // counter of processed batches, used when computing the cumulative running mean and variance
   int fTrainedBatches = 0;

   TDescriptors * fDescriptors = nullptr;

public:
   /*! Constructor */
   TBatchNormLayer(size_t batchSize, size_t inputDepth, size_t inputHeight, size_t inputWidth,
                   const std::vector<size_t> & shape, int axis = -1, Scalar_t momentum = -1., Scalar_t epsilon = 0.0001);

   /*! Copy the batch normalization layer provided as a pointer */
   TBatchNormLayer(TBatchNormLayer<Architecture_t> *layer);

   /*! Copy Constructor */
   TBatchNormLayer(const TBatchNormLayer &);

   /*! Destructor */
   ~TBatchNormLayer();

   /*! Compute the output of the layer for the given input. The input
    * must be in 3D tensor form with the different matrices corresponding to
    * different events in the batch. During training the batch mean and variance
    * are computed and used for the normalization, and the running averages are
    * updated; during inference the stored running mean and variance are used. */
   void Forward(Tensor_t &input, bool inTraining = true);

   /*! Compute the weight (gamma, beta) gradients and the input activation gradients,
    *  using the batch mean, variance and inverse variance cached during forward
    *  propagation. Must only be called directly after the corresponding call to
    *  Forward(...). */
   void Backward(Tensor_t &gradients_backward, const Tensor_t &activations_backward);
   //              Tensor_t &inp1, Tensor_t &inp2);


   /* Reset the batch counter at the end of training */
   void ResetTraining() { fTrainedBatches = 0; }

   /*! Printing the layer info. */
   void Print() const;

   /*! Writes the information and the weights about the layer in an XML node. */
   virtual void AddWeightsXMLTo(void *parent);

   /*! Read the information and the weights about the layer from XML node. */
   virtual void ReadWeightsFromXML(void *parent);

   /* initialize weights */
   virtual void Initialize();

   /*  get number of trained batches */
   const int & GetNTrainedBatches() const { return fTrainedBatches;}
   int & GetNTrainedBatches() { return fTrainedBatches;}

   /*  get batch means for the training phase */
   const Matrix_t & GetBatchMean() const { return fMu;}
   Matrix_t & GetBatchMean() { return fMu;}

   /*  Get the normalized batch examples */
   //const Matrix_t & GetNormedBatch() const { return fXhat;}
   //Matrix_t & GetNormedBatch() { return fXhat;}

   /*  Get the batch variances for the training phase */
   const Matrix_t & GetVariance() const { return fVar;}
   Matrix_t & GetVariance() { return fVar;}

   /*  Get the cached inverse variance term of the batch, used in the backward pass */
   const Matrix_t & GetIVariance() const { return fIVar;}
   Matrix_t & GetIVariance() { return fIVar;}

   /*  get vector of running means accumulated during the training phase */
   const Matrix_t & GetMuVector() const { return fMu_Training;}
   Matrix_t & GetMuVector() { return fMu_Training;}

   /*  get vector of running variances accumulated during the training phase */
   const Matrix_t & GetVarVector() const { return fVar_Training;}
   Matrix_t & GetVarVector()  { return fVar_Training;}

   // Scalar_t GetWeightDecay() const { return fWeightDecay; }

   /*  Get the momentum of the running mean/variance */
   Scalar_t GetMomentum() const { return fMomentum;}

   /*  Get epsilon */
   Scalar_t GetEpsilon() const { return fEpsilon;}

   /*  Get normalization axis (the one which will have each element normalized) */
   Scalar_t GetNormAxis() const { return fNormAxis;}

   const Matrix_t &GetReshapedData() const { return fReshapedData; }
   Matrix_t &GetReshapedData() { return fReshapedData; }

   std::vector<Matrix_t> GetExtraLayerParameters() const {
      std::vector<Matrix_t> params(2);
      params[0] = this->GetMuVector();
      params[1] = this->GetVarVector();
      return params;
   }

   void SetExtraLayerParameters(const std::vector<Matrix_t> & params)
   {
      this->GetMuVector() = params[0];
      this->GetVarVector() = params[1];
   }

protected:
   static size_t CalculateNormDim(int axis, size_t c, size_t h, size_t w)
   {
      if (axis == -1)
         return c * h * w;
      else if (axis == 1)
         return c;
      else if (axis == 2)
         return h;
      else if (axis == 3)
         return w;
      return 0;
   }
};
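
// A minimal usage sketch (illustrative only, not part of this header). It assumes
// Architecture_t = TMVA::DNN::TCpu<Double_t>, placeholder values bsize, depth, h, w,
// an outputShape vector and an input tensor matching the constructor arguments;
// within TMVA such layers are typically created and owned by the enclosing network class.
//
//    TBatchNormLayer<TCpu<Double_t>> bnorm(bsize, depth, h, w, outputShape, /*axis*/ -1);
//    bnorm.Initialize();
//    bnorm.Forward(input, /*inTraining*/ true);   // normalize with the batch statistics
//    bnorm.Forward(input, /*inTraining*/ false);  // normalize with the running mean/variance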


//
//
//  The BatchNorm Layer Class - Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TBatchNormLayer<Architecture_t>::TBatchNormLayer(size_t batchSize, size_t inputDepth, size_t inputHeight,
                                                 size_t inputWidth, const std::vector<size_t> &shape, int axis,
                                                 Scalar_t momentum, Scalar_t epsilon)
   : VGeneralLayer<Architecture_t>(batchSize, inputDepth, inputHeight, inputWidth, // bs + input shape
                                   inputDepth, inputHeight, inputWidth,            // output shape
                                   2, 1,                                           // 2 weight matrices (gamma and beta), each with 1 row
                                   CalculateNormDim(axis, inputDepth, inputHeight, inputWidth), // weight tensor dim.
                                   1, 1, 1,                                                      // bias
                                   shape[2], shape[0], shape[1],                                 // output tensor shape as bsize, depth, hw
                                   EInitialization::kZero),
     fNormAxis(axis), fMomentum(momentum), fEpsilon(epsilon),
     fMu(1, VGeneralLayer<Architecture_t>::GetWeightsAt(0).GetNcols()), // dimension is same as weights
     fVar(1, VGeneralLayer<Architecture_t>::GetWeightsAt(0).GetNcols()),
     fIVar(1, VGeneralLayer<Architecture_t>::GetWeightsAt(0).GetNcols()),
     fMu_Training(1, VGeneralLayer<Architecture_t>::GetWeightsAt(0).GetNcols()),
     fVar_Training(1, VGeneralLayer<Architecture_t>::GetWeightsAt(0).GetNcols()),
     fReshapedData(1,1,1)  // use a dummy single element tensor

{

}
//______________________________________________________________________________
template <typename Architecture_t>
TBatchNormLayer<Architecture_t>::TBatchNormLayer(TBatchNormLayer<Architecture_t> *layer)
   : VGeneralLayer<Architecture_t>(layer)
{
   // to be implemented
   printf("Error - copy ctor not implemented\n");
}

//______________________________________________________________________________
template <typename Architecture_t>
TBatchNormLayer<Architecture_t>::TBatchNormLayer(const TBatchNormLayer &layer) : VGeneralLayer<Architecture_t>(layer)
{
   // to be implemented
   printf("Error - copy ctor not implemented\n");
}

//______________________________________________________________________________
template <typename Architecture_t>
TBatchNormLayer<Architecture_t>::~TBatchNormLayer()
{
   // release descriptors
   if (fDescriptors) {
      Architecture_t::ReleaseBNormDescriptors(fDescriptors);
      delete fDescriptors;
   }
}

template <typename Architecture_t>
auto TBatchNormLayer<Architecture_t>::Initialize() -> void
{
   Matrix_t &gamma = this->GetWeightsAt(0);
   Matrix_t &beta = this->GetWeightsAt(1);
   size_t bndim = gamma.GetNcols();

   initialize<Architecture_t>(beta, EInitialization::kZero);
   for (size_t i = 0; i < bndim; ++i) {
      gamma(0, i) = 1.;
      // assign default values for the other parameters
      fMu_Training(0,i) = 0;
      fVar_Training(0,i) = 1;
   }

   Matrix_t &dgamma = this->GetWeightGradientsAt(0);
   Matrix_t &dbeta = this->GetWeightGradientsAt(1);
   initialize<Architecture_t>(dgamma, EInitialization::kZero);
   initialize<Architecture_t>(dbeta, EInitialization::kZero);

   fTrainedBatches = 0;

   Architecture_t::InitializeBNormDescriptors(fDescriptors, this);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBatchNormLayer<Architecture_t>::Forward(Tensor_t &x, bool inTraining) -> void
{
   Tensor_t x2;
   Tensor_t y2;
   if (x.GetLayout() != fReshapedData.GetLayout()) {
      x2 = Tensor_t(x.GetDeviceBuffer(), fReshapedData.GetShape(), fReshapedData.GetLayout());
      y2 = Tensor_t(this->GetOutput().GetDeviceBuffer(), fReshapedData.GetShape(), fReshapedData.GetLayout());
   }
   else {
      x2 = x;
      y2 = this->GetOutput();
   }

   auto descr = static_cast<BNormDescriptors_t *> (fDescriptors);
   if (inTraining) {
      Architecture_t::BatchNormLayerForwardTraining(fNormAxis, x2, y2,
                                                    this->GetWeightsAt(0), this->GetWeightsAt(1),
                                                    this->GetBatchMean(), this->GetVariance(), this->GetIVariance(),
                                                    this->GetMuVector(),
                                                    this->GetVarVector(), this->GetNTrainedBatches(),
                                                    this->GetMomentum(), this->GetEpsilon(),
                                                    descr->HelperDescriptor);
      fTrainedBatches++;
   }

   else {
      // if (fTrainedBatches > 0) {
      //    Architecture_t::PrintTensor(Tensor_t(this->GetWeightsAt(0)), "bnorm gamma");
      //    Architecture_t::PrintTensor(Tensor_t(this->GetWeightsAt(1)), "bnorm beta");
      //    Architecture_t::PrintTensor(Tensor_t(this->GetMuVector()), "bnorm mu");
      //    Architecture_t::PrintTensor(Tensor_t(this->GetVarVector()), "bnorm var");
      // }
      Architecture_t::BatchNormLayerForwardInference(fNormAxis, x2, this->GetWeightsAt(0), this->GetWeightsAt(1),
                                                     y2, this->GetMuVector(), this->GetVarVector(),
                                                     this->GetEpsilon(), descr->HelperDescriptor);
      fTrainedBatches = 0;
   }

}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBatchNormLayer<Architecture_t>::Backward(Tensor_t &gradients_backward,
                                               const Tensor_t & activations_backward ) -> void
//                                               Tensor_t &, Tensor_t &) -> void
{
   auto descr = static_cast<BNormDescriptors_t *> (fDescriptors);


   if (activations_backward.GetLayout() != fReshapedData.GetLayout()) {
      Tensor_t x = Tensor_t(activations_backward.GetDeviceBuffer(), fReshapedData.GetShape(), fReshapedData.GetLayout());
      Tensor_t dx = Tensor_t(gradients_backward.GetDeviceBuffer(), fReshapedData.GetShape(), fReshapedData.GetLayout());
      Tensor_t dy = Tensor_t(this->GetActivationGradients().GetDeviceBuffer(), fReshapedData.GetShape(), fReshapedData.GetLayout());

      Architecture_t::BatchNormLayerBackward(fNormAxis, x, dy, dx,
                                             this->GetWeightsAt(0),           // gamma (beta is not needed)
                                             this->GetWeightGradientsAt(0), this->GetWeightGradientsAt(1),
                                             this->GetBatchMean(), this->GetVariance(), this->GetIVariance(),
                                             this->GetEpsilon(), descr->HelperDescriptor);

   } else {

      Architecture_t::BatchNormLayerBackward(fNormAxis, activations_backward, // x
                                          this->GetActivationGradients(), // dy
                                          gradients_backward,             // dx
                                          this->GetWeightsAt(0),          // gamma (beta is not needed)
                                          this->GetWeightGradientsAt(0), this->GetWeightGradientsAt(1),
                                          this->GetBatchMean(), this->GetVariance(), this->GetIVariance(),
                                          this->GetEpsilon(), descr->HelperDescriptor);
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
void TBatchNormLayer<Architecture_t>::Print() const
{
   std::cout << " BATCH NORM Layer: \t";
   std::cout << " Input/Output = ( " ;
   auto &shape = this->GetOutput().GetShape();
   for (size_t i = 0; i < shape.size(); ++i) {
      if (i > 0) std::cout << " , ";
      std::cout << shape[i];
   }
   std::cout  << " ) ";
   std::cout << "\t Norm dim =" << std::setw(6) << this->GetWeightsAt(0).GetNcols();
   std::cout << "\t axis = " << fNormAxis << std::endl;
   std::cout << std::endl;
}

//______________________________________________________________________________

template <typename Architecture_t>
void TBatchNormLayer<Architecture_t>::AddWeightsXMLTo(void *parent)
{

   // write momentum and epsilon, the stored running mean and variance, and the weight matrices (gamma and beta)

   auto layerxml = gTools().xmlengine().NewChild(parent, nullptr, "BatchNormLayer");


   gTools().AddAttr(layerxml, "Momentum", fMomentum);
   gTools().AddAttr(layerxml, "Epsilon", fEpsilon);

   // write stored mean and variances
   //using Scalar_t = typename Architecture_t::Scalar_t;

   this->WriteMatrixToXML(layerxml, "Training-mu", this->GetMuVector());
   this->WriteMatrixToXML(layerxml, "Training-variance", this->GetVarVector());

   // write weights (gamma and beta)
   this->WriteMatrixToXML(layerxml, "Gamma", this->GetWeightsAt(0));
   this->WriteMatrixToXML(layerxml, "Beta", this->GetWeightsAt(1));

}

//______________________________________________________________________________
template <typename Architecture_t>
void TBatchNormLayer<Architecture_t>::ReadWeightsFromXML(void *parent)
{
   // momentum and epsilon can be set after constructing the class
   gTools().ReadAttr(parent, "Momentum", fMomentum);
   gTools().ReadAttr(parent, "Epsilon", fEpsilon);
   // Read layer weights and biases from XML

   this->ReadMatrixXML(parent, "Training-mu", this->GetMuVector());
   this->ReadMatrixXML(parent, "Training-variance", this->GetVarVector());

   this->ReadMatrixXML(parent, "Gamma", this->GetWeightsAt(0));
   this->ReadMatrixXML(parent, "Beta", this->GetWeightsAt(1));
}

} // namespace DNN
} // namespace TMVA

#endif