// @(#)root/tmva/tmva/dnn:$Id$
// Author: Simon Pfreundschuh 20/06/16

/*************************************************************************
 * Copyright (C) 2016, Simon Pfreundschuh                                *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////////
// Contains the Layer and SharedLayer classes, which represent      //
// layers in neural networks.                                       //
//////////////////////////////////////////////////////////////////////

#ifndef TMVA_DNN_LAYER
#define TMVA_DNN_LAYER

#include <iostream>

#include "TMatrix.h"
#include "Functions.h"

namespace TMVA
{
namespace DNN
{

//______________________________________________________________________________
//
//  The Layer Class
//______________________________________________________________________________

/** \class TLayer

    Generic layer class.

    This generic layer class represents a layer of a neural network with
    a given width n and activation function f. The layer computes its
    output by applying the activation function elementwise to the affine
    transformation \f$\mathbf{u} = \mathbf{W}\mathbf{x} +
    \boldsymbol{\theta}\f$ of its input \f$\mathbf{x}\f$.

    In addition to the weight and bias matrices, each layer allocates memory
    for its activations and the corresponding first partial derivatives of
    the activation function, as well as for the gradients of the weights
    and biases.

    The layer provides member functions for the forward and backward
    propagation of activations through the layer.
*/
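
// A minimal usage sketch (hypothetical values; assumes the reference
// architecture TReference<double> from TMVA/DNN/Architectures/Reference.h,
// whose Matrix_t is TMatrixT<double>):
//
//    TLayer<TReference<double>> layer(/*batchSize*/ 32, /*inputWidth*/ 16,
//                                     /*width*/ 8, EActivationFunction::kRelu,
//                                     /*dropoutProbability*/ 1.0);
//    layer.Initialize(EInitialization::kGauss);
//    TMatrixT<double> input(32, 16);   // one event per row
//    layer.Forward(input);             // activations now in layer.GetOutput()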
template<typename Architecture_t>
   class TLayer
{

public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Tensor_t = typename Architecture_t::Tensor_t;


private:

   size_t fBatchSize;  ///< Batch size used for training and evaluation.
   size_t fInputWidth; ///< Number of neurons of the previous layer.
   size_t fWidth;      ///< Number of neurons of this layer.

   Scalar_t fDropoutProbability;  ///< Probability that an input is active.

   Matrix_t fWeights;             ///< The weights of this layer.
   Matrix_t fBiases;              ///< The bias values of this layer.
   Matrix_t fOutput;              ///< Activations of this layer.
   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.

   EActivationFunction fF; ///< Activation function of the layer.

public:

   TLayer(size_t              batchSize,
          size_t              inputWidth,
          size_t              width,
          EActivationFunction f,
          Scalar_t            dropoutProbability);
   TLayer(const TLayer &);

   /*! Initialize the weights according to the given initialization
    *  method. */
   void Initialize(EInitialization m);
   /*! Compute the activations of the layer for the given input. The input
    * must be in matrix form with the different rows corresponding to
    * different events in the batch. Computes the activations as well as
    * the first partial derivatives of the activation function, evaluated
    * at the pre-activations. */
   void inline Forward(Matrix_t & input, bool applyDropout = false);
   /*! Compute weight, bias and activation gradients. Uses the precomputed
    *  first partial derivatives of the activation function computed during
    *  forward propagation and modifies them. Must only be called directly
    *  after the corresponding call to Forward(...). */
   void inline Backward(Matrix_t & gradients_backward,
                        const Matrix_t & activations_backward,
                        ERegularization r,
                        Scalar_t weightDecay);

   void Print() const;

   size_t   GetBatchSize()          const {return fBatchSize;}
   size_t   GetInputWidth()         const {return fInputWidth;}
   size_t   GetWidth()              const {return fWidth;}
   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}

   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}

   EActivationFunction GetActivationFunction() const {return fF;}

   Matrix_t       & GetOutput()        {return fOutput;}
   const Matrix_t & GetOutput() const  {return fOutput;}
   Matrix_t       & GetWeights()       {return fWeights;}
   const Matrix_t & GetWeights() const {return fWeights;}
   Matrix_t       & GetBiases()        {return fBiases;}
   const Matrix_t & GetBiases() const  {return fBiases;}
   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}

};

//______________________________________________________________________________
//
//  The Shared Layer Class
//______________________________________________________________________________

/** \class TSharedLayer

    Layer class with shared weight and bias matrices.

    Like the TLayer class, except that the weight and bias matrices are
    shared between different instances of the net, which can be used to
    implement multithreaded, 'Hogwild'-style training.
*/
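
// A sketch of the intended sharing pattern (hypothetical values; assumes the
// reference architecture TReference<double>, as in the TLayer example above):
// each worker wraps the same master layer, so all workers read and update a
// single weight matrix while keeping their own activations and gradients.
//
//    TLayer<TReference<double>> master(32, 16, 8, EActivationFunction::kRelu, 1.0);
//    master.Initialize(EInitialization::kGauss);
//    TSharedLayer<TReference<double>> worker1(32, master); // references master's weights
//    TSharedLayer<TReference<double>> worker2(32, master); // lock-free 'Hogwild' updates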

template<typename Architecture_t>
class TSharedLayer
{

public:

   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Tensor_t = typename Architecture_t::Tensor_t;


private:

   size_t fBatchSize;  ///< Batch size used for training and evaluation.
   size_t fInputWidth; ///< Number of neurons of the previous layer.
   size_t fWidth;      ///< Number of neurons of this layer.

   Scalar_t fDropoutProbability;  ///< Probability that an input is active.

   Matrix_t & fWeights;           ///< Reference to the weight matrix of this layer.
   Matrix_t & fBiases;            ///< Reference to the bias vectors of this layer.
   Matrix_t fOutput;              ///< Activations of this layer.
   Matrix_t fDerivatives;         ///< First derivatives of the activations of this layer.
   Matrix_t fWeightGradients;     ///< Gradients w.r.t. the weights of this layer.
   Matrix_t fBiasGradients;       ///< Gradients w.r.t. the bias values of this layer.
   Matrix_t fActivationGradients; ///< Gradients w.r.t. the activations of this layer.

   EActivationFunction fF; ///< Activation function of the layer.

public:

   TSharedLayer(size_t batchSize,
                TLayer<Architecture_t> & layer);
   TSharedLayer(const TSharedLayer & layer);

   /*! Compute the activations of the layer for the given input. The input
    * must be in matrix form with the different rows corresponding to
    * different events in the batch. Computes the activations as well as
    * the first partial derivatives of the activation function, evaluated
    * at the pre-activations. */
   void inline Forward(Matrix_t & input, bool applyDropout = false);
   /*! Compute weight, bias and activation gradients. Uses the precomputed
    *  first partial derivatives of the activation function computed during
    *  forward propagation and modifies them. Must only be called directly
    *  after the corresponding call to Forward(...). */
   void inline Backward(Matrix_t & gradients_backward,
                        const Matrix_t & activations_backward,
                        ERegularization r,
                        Scalar_t weightDecay);

   void Print() const;

   size_t   GetBatchSize()          const {return fBatchSize;}
   size_t   GetInputWidth()         const {return fInputWidth;}
   size_t   GetWidth()              const {return fWidth;}
   Scalar_t GetDropoutProbability() const {return fDropoutProbability;}

   void SetDropoutProbability(Scalar_t p) {fDropoutProbability = p;}

   EActivationFunction GetActivationFunction() const {return fF;}

   Matrix_t       & GetOutput()        {return fOutput;}
   const Matrix_t & GetOutput() const  {return fOutput;}
   Matrix_t       & GetWeights() const {return fWeights;}
   Matrix_t       & GetBiases()        {return fBiases;}
   const Matrix_t & GetBiases() const  {return fBiases;}
   Matrix_t       & GetActivationGradients()       {return fActivationGradients;}
   const Matrix_t & GetActivationGradients() const {return fActivationGradients;}
   Matrix_t       & GetBiasGradients()       {return fBiasGradients;}
   const Matrix_t & GetBiasGradients() const {return fBiasGradients;}
   Matrix_t       & GetWeightGradients()       {return fWeightGradients;}
   const Matrix_t & GetWeightGradients() const {return fWeightGradients;}

};

//______________________________________________________________________________
//
//  The Layer Class - Implementation
//______________________________________________________________________________

template<typename Architecture_t>
   TLayer<Architecture_t>::TLayer(size_t batchSize,
                                  size_t inputWidth,
                                  size_t width,
                                  EActivationFunction f,
                                  Scalar_t dropoutProbability)
   : fBatchSize(batchSize), fInputWidth(inputWidth), fWidth(width),
     fDropoutProbability(dropoutProbability), fWeights(width, fInputWidth),
     fBiases(width, 1), fOutput(fBatchSize, width), fDerivatives(fBatchSize, width),
     fWeightGradients(width, fInputWidth), fBiasGradients(width, 1),
     fActivationGradients(fBatchSize, width), fF(f)
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
TLayer<Architecture_t>::TLayer(const TLayer &layer)
    : fBatchSize(layer.fBatchSize), fInputWidth(layer.fInputWidth),
    fWidth(layer.fWidth), fDropoutProbability(layer.fDropoutProbability),
    fWeights(layer.fWidth, layer.fInputWidth), fBiases(layer.fWidth, 1),
    fOutput(layer.fBatchSize, layer.fWidth),
    fDerivatives(layer.fBatchSize, layer.fWidth),
    fWeightGradients(layer.fWidth, layer.fInputWidth),
    fBiasGradients(layer.fWidth, 1),
    fActivationGradients(layer.fBatchSize, layer.fWidth),
    fF(layer.fF)
{
   Architecture_t::Copy(fWeights, layer.GetWeights());
   Architecture_t::Copy(fBiases,  layer.GetBiases());
}

//______________________________________________________________________________
template<typename Architecture_t>
auto TLayer<Architecture_t>::Initialize(EInitialization m)
-> void
{
   initialize<Architecture_t>(fWeights, m);
   initialize<Architecture_t>(fBiases,  EInitialization::kZero);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto inline TLayer<Architecture_t>::Forward(Matrix_t & input,
                                            bool applyDropout)
-> void
{
   if (applyDropout && (fDropoutProbability != 1.0)) {
      Architecture_t::DropoutForward(input, fDropoutProbability);
   }
   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
   Architecture_t::AddRowWise(fOutput, fBiases);
   Tensor_t tOutput(fOutput);
   Tensor_t tDerivatives(fDerivatives);
   // Evaluate f' at the pre-activations before f is applied to fOutput in place.
   evaluateDerivative<Architecture_t>(tDerivatives, fF, tOutput);
   evaluate<Architecture_t>(tOutput, fF);
}
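
// Shapes in Forward, following the dimensions set in the constructor:
//    input:    (batchSize, inputWidth)  one event per row
//    fWeights: (width, inputWidth)      so MultiplyTranspose computes input * W^T
//    fOutput:  (batchSize, width)       pre-activations u, then f(u) in place
//    fBiases:  (width, 1)               added row-wise to every event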

//______________________________________________________________________________
template<typename Architecture_t>
auto TLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
                                      const Matrix_t & activations_backward,
                                      ERegularization r,
                                      Scalar_t weightDecay)
-> void
{
   Tensor_t tGradBw(gradients_backward);
   Tensor_t tActBw(activations_backward);
   Tensor_t tActGrad(fActivationGradients);
   Tensor_t tDeriv(fDerivatives);

   // Combine the incoming activation gradients with the precomputed
   // derivatives of the activation function.
   Architecture_t::Hadamard(tDeriv, tActGrad);
   Architecture_t::Backward(tGradBw,
                            fWeightGradients,
                            fBiasGradients,
                            tDeriv,
                            tActGrad,
                            fWeights,
                            tActBw);
   addRegularizationGradients<Architecture_t>(fWeightGradients,
                                              fWeights,
                                              weightDecay, r);
}
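
// In equation form (a sketch, assuming the usual dense-layer backpropagation
// convention with one event per row): with pre-activations u = x W^T + biases,
// D = f'(u) stored in fDerivatives, and dL/da the incoming fActivationGradients,
//
//    dL/du              = D (Hadamard product) dL/da
//    fWeightGradients   = (dL/du)^T x           -> shape (width, inputWidth)
//    fBiasGradients     = column sums of dL/du  -> shape (width, 1)
//    gradients_backward = (dL/du) W             -> shape (batchSize, inputWidth)
//
// plus the regularization term added to fWeightGradients by
// addRegularizationGradients.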

//______________________________________________________________________________
template<typename Architecture_t>
   void TLayer<Architecture_t>::Print() const
{
   std::cout << "Width = " << fWeights.GetNrows();
   std::cout << ", Activation Function = ";
   std::cout << static_cast<int>(fF) << std::endl;
}

//______________________________________________________________________________
//
//  The Shared Layer Class - Implementation
//______________________________________________________________________________

//______________________________________________________________________________
template<typename Architecture_t>
TSharedLayer<Architecture_t>::TSharedLayer(size_t batchSize,
                                           TLayer<Architecture_t> &layer)
: fBatchSize(batchSize),
  fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
  fDropoutProbability(layer.GetDropoutProbability()),
  fWeights(layer.GetWeights()), fBiases(layer.GetBiases()),
  fOutput(fBatchSize, fWidth), fDerivatives(fBatchSize, fWidth),
  fWeightGradients(fWidth, fInputWidth), fBiasGradients(fWidth, 1),
  fActivationGradients(fBatchSize, fWidth), fF(layer.GetActivationFunction())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
TSharedLayer<Architecture_t>::TSharedLayer(const TSharedLayer &layer)
    : fBatchSize(layer.fBatchSize),
    fInputWidth(layer.GetInputWidth()), fWidth(layer.GetWidth()),
    fDropoutProbability(layer.fDropoutProbability), fWeights(layer.fWeights),
    fBiases(layer.fBiases), fOutput(layer.fBatchSize, fWidth),
    fDerivatives(layer.fBatchSize, fWidth), fWeightGradients(fWidth, fInputWidth),
    fBiasGradients(fWidth, 1), fActivationGradients(layer.fBatchSize, fWidth),
    fF(layer.fF)
{
}

//______________________________________________________________________________
template<typename Architecture_t>
auto inline TSharedLayer<Architecture_t>::Forward(Matrix_t & input,
                                                  bool applyDropout)
-> void
{
   if (applyDropout && (fDropoutProbability != 1.0)) {
      Architecture_t::DropoutForward(input, fDropoutProbability);
   }
   Architecture_t::MultiplyTranspose(fOutput, input, fWeights);
   Architecture_t::AddRowWise(fOutput, fBiases);
   Tensor_t tOutput(fOutput);
   Tensor_t tDerivatives(fDerivatives);
   evaluateDerivative<Architecture_t>(tDerivatives, fF, tOutput);
   evaluate<Architecture_t>(tOutput, fF);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto inline TSharedLayer<Architecture_t>::Backward(Matrix_t & gradients_backward,
                                                   const Matrix_t & activations_backward,
                                                   ERegularization r,
                                                   Scalar_t weightDecay)
-> void
{
   // Same computation as TLayer::Backward, operating on the shared weights.
   Tensor_t tGradBw(gradients_backward);
   Tensor_t tActBw(activations_backward);
   Tensor_t tActGrad(fActivationGradients);
   Tensor_t tDeriv(fDerivatives);

   Architecture_t::Hadamard(tDeriv, tActGrad);
   Architecture_t::Backward(tGradBw,
                            fWeightGradients,
                            fBiasGradients,
                            tDeriv,
                            tActGrad,
                            fWeights,
                            tActBw);
   addRegularizationGradients<Architecture_t>(fWeightGradients,
                                              fWeights,
                                              weightDecay, r);
}

//______________________________________________________________________________
template<typename Architecture_t>
void TSharedLayer<Architecture_t>::Print() const
{
   std::cout << "Width = " << fWeights.GetNrows();
   std::cout << ", Activation Function = ";
   std::cout << static_cast<int>(fF) << std::endl;
}

} // namespace DNN
} // namespace TMVA

#endif