// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TAdadelta                                                             *
 *                                                                                *
 * Description:                                                                   *
 *      Adadelta Optimizer Class                                                  *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_ADADELTA
#define TMVA_DNN_ADADELTA

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>
#include <cassert>

namespace TMVA {
namespace DNN {
/** \class TAdadelta
 *  Adadelta Optimizer class
 *
 *  This class represents the Adadelta Optimizer.
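 *
 *  For every layer it keeps exponentially decaying averages of the squared
 *  gradients and of the squared parameter updates. Writing \f$\rho\f$ for the
 *  decay constant, \f$\epsilon\f$ for the smoothing term and \f$\eta\f$ for the
 *  learning rate (1 by default), UpdateWeights() and UpdateBiases() below
 *  implement
 *  \f[
 *     V_t = \rho\, V_{t-1} + (1-\rho)\, g_t^2, \qquad
 *     \Delta\theta_t = \frac{\sqrt{W_{t-1} + \epsilon}}{\sqrt{V_t + \epsilon}}\, g_t,
 *  \f]
 *  \f[
 *     \theta_t = \theta_{t-1} - \eta\, \Delta\theta_t, \qquad
 *     W_t = \rho\, W_{t-1} + (1-\rho)\, \Delta\theta_t^2 .
 *  \f]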
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fRho;     ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.
   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< The accumulation of the square of the past
                                                                   /// weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< The accumulation of the square of the past bias
                                                                   /// gradients associated with the deep net.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates; ///< The accumulation of the square of the past weight
                                                                 /// updates associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;   ///< The accumulation of the square of the past bias
                                                                 /// updates associated with the deep net.
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor1; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;   ///< working tensor used to keep a temporary copy of biases or bias gradients
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor2; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;   ///< working tensor used to keep a temporary copy of biases or bias gradients

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdadelta() = default;

   /*! Getters */
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
   std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
   std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
};
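
// Usage sketch (illustrative only, not part of this header): the optimizer is
// constructed from a fully built deep net and then driven through the
// VOptimizer base-class interface, which is assumed here to provide a Step()
// method that calls UpdateWeights()/UpdateBiases() for every layer once the
// gradients have been filled by the backward pass:
//
//    TAdadelta<Architecture_t> optimizer(deepNet);   // learningRate = 1, rho = 0.95, epsilon = 1e-8
//    // ... forward and backward pass computing the gradients ...
//    optimizer.Step();                               // apply one Adadelta update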

//
//
//  The Adadelta Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
                                                         Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fPastSquaredWeightUpdates.resize(layersNSlices);
   fPastSquaredBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      // create the accumulator tensors with the same shapes as the layer's
      // weights and biases and initialize them to zero
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fPastSquaredWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fPastSquaredBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                  const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkWeightTensor1[layerIndex][i];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];

      // Vt = rho * Vt-1 + (1-rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);

      // updating the weights.
      // currentWeightUpdates = sqrt(Wt-1 + epsilon) * currentGradients / sqrt(Vt + epsilon)

      // dummy1 = sqrt(Wt-1 + epsilon)
      auto &dummy1 = fWorkWeightTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse the work tensor for the weight updates now

      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::Hadamard(currentWeightUpdates, dummy1);

      // theta = theta - learningRate * currentWeightUpdates
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1-rho) * currentSquaredWeightUpdates
      // re-use accumulation matrix used for temporary storing of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
      Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
   }
}
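
// Illustrative numerical check (a single scalar element, hypothetical values):
// with rho = 0.95, epsilon = 1e-8, learningRate = 1, zero accumulators and a
// gradient g = 0.1, the update above gives
//    Vt     = 0.05 * 0.1^2                            = 5.0e-4
//    update = sqrt(0 + 1e-8) / sqrt(Vt + 1e-8) * 0.1  ~ 4.5e-4
//    Wt     = 0.05 * update^2                         ~ 1.0e-8
// i.e. the step size is governed by the ratio of the two running RMS values
// rather than by the raw gradient magnitude.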

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                 const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
   for (size_t i = 0; i < biasesNSlices; i++) {

      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkBiasTensor1[layerIndex][i];

      // Vt = rho * Vt-1 + (1-rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);

      // updating the biases.

      // currentBiasUpdates = sqrt(Wt-1 + epsilon) * currentGradients / sqrt(Vt + epsilon)
      // dummy1 = sqrt(Wt-1 + epsilon)
      auto &dummy1 = fWorkBiasTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentBiasUpdates = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::Hadamard(currentBiasUpdates, dummy1);

      // theta = theta - learningRate * currentBiasUpdates
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1-rho) * currentSquaredBiasUpdates
      // re-use accumulation matrix used for temporary storing of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
      Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
   }
}

} // namespace DNN
} // namespace TMVA

#endif