// @(#)root/tmva/tmva/cnn:$Id$
// Author: Vladimir Ilievski

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TDLGradientDescent                                                    *
 *                                                                                *
 *                                                                                *
 * Description:                                                                   *
 *      Deep Learning Minimizers                                                  *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Vladimir Ilievski      <ilievski.vladimir@live.com>  - CERN, Switzerland  *
 *                                                                                *
 * Copyright (c) 2005-2015:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_DLMINIMIZERS
#define TMVA_DNN_DLMINIMIZERS

#include "TMVA/DNN/TensorDataLoader.h"
#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/DeepNet.h"

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TDLGradientDescent
 *
 *   Generic implementation of gradient descent minimization for
 *   deep learning neural nets.
 *
 *   The TDLGradientDescent class provides an implementation of the gradient
 *   descent minimization algorithm that is independent of the architecture,
 *   the input data and the type of deep learning neural network.
 *
 *   This is provided by the Step(...), StepMomentum(...) and
 *   StepNesterov(...) functions that perform a single minimization step.
 *
 *   The main training characteristics are defined by the provided learning rate,
 *   the test interval, and the number of convergence steps required for
 *   convergence. The test interval defines how often the error on the validation
 *   set is computed, and the value by which the step counter is increased each
 *   time the HasConverged() member function is called. A convergence step is
 *   defined as a step in which the test error is NOT less than 0.999 times the
 *   current minimal test error that has been reached. If between two subsequent
 *   calls to HasConverged(Double_t) the test error has not been sufficiently
 *   reduced, it is assumed that a number of convergence steps equal to the test
 *   interval has been performed.
 */
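
/* Illustrative usage sketch (not part of the original header): it shows how the
 * Step(...)/HasConverged(...) interface is meant to be combined in a training loop.
 * The TCpu architecture and the deepNet, trainingBatches and validationError
 * objects are assumptions made for this example only, following the pattern used
 * elsewhere in TMVA, and are not taken verbatim from this file.
 *
 *    using Architecture_t = TMVA::DNN::TCpu<Double_t>;
 *    using Minimizer_t    = TMVA::DNN::TDLGradientDescent<Architecture_t>;
 *
 *    Minimizer_t minimizer(0.001,  // learning rate
 *                          10,     // convergence steps
 *                          5);     // test interval
 *
 *    bool converged = false;
 *    while (!converged) {
 *       // One epoch: forward pass, backward pass and weight update per batch.
 *       for (auto &batch : trainingBatches) {
 *          minimizer.Step(deepNet, batch.GetInput(), batch.GetOutput(), batch.GetWeights());
 *       }
 *       // Evaluate the validation error and let the minimizer decide on convergence.
 *       converged = minimizer.HasConverged(validationError);
 *    }
 */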

template <typename Architecture_t>
class TDLGradientDescent {
public:
   using DeepNet_t = TDeepNet<Architecture_t>;
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t fBatchSize;        ///< Batch size to use for the training.
   size_t fStepCount;        ///< Number of steps performed in the current training session.
   size_t fConvergenceSteps; ///< Number of training epochs without considerable
                             ///< decrease in the test error for convergence.
   size_t fConvergenceCount; ///< Current number of training epochs without
                             ///< considerable decrease in the test error.
   size_t fTestInterval;    ///< Interval for the computation of the test error.
   Scalar_t fTrainingError; ///< Holds the most recently computed training loss.
   Scalar_t fTestError;     ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;  ///< Learning rate \f$\alpha\f$
   Scalar_t fMinimumError;  ///< The minimum test loss achieved during the current training session.

public:
   TDLGradientDescent();
   TDLGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   }

   /** Perform a single optimization step on a given batch. Propagates the input
       matrix forward through the net, evaluates the loss and propagates the gradients
       backward through the net. The computed gradients are scaled by the learning
       rate \f$\alpha\f$ and subtracted from the weights and bias values of each
       layer. */
   void Step(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output, const Matrix_t &weights);
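
   // Written out, the update performed by Step(...) amounts to, for every layer l,
   //    W_l <- W_l - alpha * dL/dW_l,      b_l <- b_l - alpha * dL/db_l,
   // with alpha the learning rate fLearningRate and L the loss evaluated on the
   // current batch (a schematic restatement of the documentation above).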

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. Updates the weights of each layer, but only
    *  the bias terms of the first layer, for compatibility with the previous
    *  implementation. */
   void StepReducedWeights(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
                           const Matrix_t &weights);

   /** Same as Step(...) but also evaluates the loss on the given training data.
    *  Note that this requires synchronization between host and device. */
   Scalar_t StepLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output, const Matrix_t &weights);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
    *  synchronization with the device. */
   Scalar_t StepReducedWeightsLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
                                   const Matrix_t &weights);

   /** Perform multiple optimization steps simultaneously. Performs the
    *  backprop algorithm on the input batches given in \p batches on
    *  the neural networks given in \p nets. The forward and backward propagation
    *  steps are executed in an interleaving manner in order to exploit potential
    *  batch-level parallelism for asynchronous device calls.
    */
   void Step(DeepNet_t &master, std::vector<DeepNet_t> &nets, std::vector<TTensorBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   void StepMomentum(DeepNet_t &master, std::vector<DeepNet_t> &nets,
                     std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   void StepNesterov(DeepNet_t &master, std::vector<DeepNet_t> &nets,
                     std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t momentum);
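
   // For reference, in its classical formulation (which these methods are expected
   // to follow; the actual update is implemented by the ParallelBackward* methods
   // of the deep net) momentum gradient descent keeps a velocity term v per weight,
   //    v <- mu * v - alpha * dL/dW,      W <- W + v,
   // while the Nesterov variant evaluates the gradient at the look-ahead point
   // W + mu * v before applying the same update.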

   /** Use the current internal value of the test error to determine whether the
    *  minimization has converged; the convergence counter is increased whenever
    *  the test error has not decreased sufficiently with respect to the minimum
    *  reached so far. */
   bool HasConverged();

   /** Store the provided value as the current test error and determine whether the
    *  minimization has converged; the convergence counter is increased by the test
    *  interval whenever the error has not decreased sufficiently. */
   bool HasConverged(Scalar_t testError);
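
   // Worked example with illustrative numbers (not taken from the source): with a
   // test interval of 5 and 50 convergence steps, HasConverged(testError) adds 5 to
   // the convergence counter for every evaluation in which
   //    testError >= 0.999 * fMinimumError,
   // so training is considered converged after 10 consecutive evaluations (i.e.
   // 50 epochs) without a sufficient improvement of the test error.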

   /** Getters */
   size_t GetConvergenceCount() const { return fConvergenceCount; }
   size_t GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const { return fTrainingError; }
   Scalar_t GetTestError() const { return fTestError; }
   size_t GetTestInterval() const { return fTestInterval; }

   /** Setters */
   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval) { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate) { fLearningRate = rate; }
   void SetBatchSize(Scalar_t batchSize) { fBatchSize = static_cast<size_t>(batchSize); }
};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TDLGradientDescent<Architecture_t>::TDLGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TDLGradientDescent<Architecture_t>::TDLGradientDescent(Scalar_t learningRate, size_t convergenceSteps,
                                                       size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
void TDLGradientDescent<Architecture_t>::Step(DeepNet_t &deepNet, std::vector<Matrix_t> &input, const Matrix_t &output,
                                              const Matrix_t &weights)
{
   // Make forward and backward pass and update the net afterwards
   deepNet.Forward(input, true);
   deepNet.Backward(input, output, weights);
   deepNet.Update(fLearningRate);
}

//______________________________________________________________________________
template <typename Architecture_t>
void TDLGradientDescent<Architecture_t>::StepReducedWeights(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
                                                            const Matrix_t &output, const Matrix_t &weights)
{
   // Make forward and backward pass, then update the weights of every layer but
   // only the bias terms of the first layer.
   deepNet.Forward(input, true);
   deepNet.Backward(input, output, weights);

   for (size_t i = 0; i < deepNet.GetDepth(); i++) {
      auto *layer = deepNet.GetLayerAt(i);

      layer->UpdateWeights(layer->GetWeightGradients(), fLearningRate);
      if (i == 0) {
         layer->UpdateBiases(layer->GetBiasGradients(), fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TDLGradientDescent<Architecture_t>::StepLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
                                                  const Matrix_t &output, const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = deepNet.Loss(input, output);
   deepNet.Backward(input, output, weights);
   deepNet.Update(fLearningRate);

   return loss;
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TDLGradientDescent<Architecture_t>::StepReducedWeightsLoss(DeepNet_t &deepNet, std::vector<Matrix_t> &input,
                                                                const Matrix_t &output, const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = deepNet.Loss(input, output);
   fTrainingError = loss;
   deepNet.Backward(input, output, weights);

   for (size_t i = 0; i < deepNet.GetDepth(); i++) {
      auto *layer = deepNet.GetLayerAt(i);

      layer->UpdateWeights(layer->GetWeightGradients(), fLearningRate);
      if (i == 0) {
         layer->UpdateBiases(layer->GetBiasGradients(), fLearningRate);
      }
   }

   return loss;
}

//______________________________________________________________________________
template <typename Architecture_t>
void TDLGradientDescent<Architecture_t>::Step(DeepNet_t &master, std::vector<DeepNet_t> &nets,
                                              std::vector<TTensorBatch<Architecture_t>> &batches)
{
   master.ParallelForward(nets, batches);
   master.ParallelBackward(nets, batches, fLearningRate);
}

//______________________________________________________________________________
template <typename Architecture_t>
void TDLGradientDescent<Architecture_t>::StepMomentum(DeepNet_t &master, std::vector<DeepNet_t> &nets,
                                                      std::vector<TTensorBatch<Architecture_t>> &batches,
                                                      Scalar_t momentum)
{
   master.ParallelForward(nets, batches);
   master.ParallelBackwardMomentum(nets, batches, fLearningRate, momentum);
}

//______________________________________________________________________________
template <typename Architecture_t>
void TDLGradientDescent<Architecture_t>::StepNesterov(DeepNet_t &master, std::vector<DeepNet_t> &nets,
                                                      std::vector<TTensorBatch<Architecture_t>> &batches,
                                                      Scalar_t momentum)
{
   master.ParallelForward(nets, batches);
   master.ParallelBackwardNestorov(nets, batches, fLearningRate, momentum);
}

//______________________________________________________________________________
template <typename Architecture_t>
bool TDLGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template <typename Architecture_t>
bool TDLGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}

} // namespace DNN
} // namespace TMVA

#endif