#ifndef TMVA_DNN_ADAM
#define TMVA_DNN_ADAM

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <cmath>
#include <vector>

namespace TMVA {
namespace DNN {

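/** \class TAdam
 *  Adam optimizer for deep-network training.
 *
 *  Keeps per-layer first- and second-moment estimates of the weight and bias
 *  gradients and applies the bias-corrected Adam update rule at every step
 *  (D. Kingma, J. Ba, "Adam: A Method for Stochastic Optimization",
 *  https://arxiv.org/abs/1412.6980).
 */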
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdam : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fBeta1;   ///< Exponential decay rate for the first moment estimates.
   Scalar_t fBeta2;   ///< Exponential decay rate for the second moment estimates.
   Scalar_t fEpsilon; ///< Small constant added to the denominator for numerical stability.

   std::vector<std::vector<Matrix_t>> fFirstMomentWeights;  ///< First moment (mean) of the past weight gradients, per layer.
   std::vector<std::vector<Matrix_t>> fFirstMomentBiases;   ///< First moment (mean) of the past bias gradients, per layer.

   std::vector<std::vector<Matrix_t>> fSecondMomentWeights; ///< Second moment (uncentered variance) of the past weight gradients, per layer.
   std::vector<std::vector<Matrix_t>> fSecondMomentBiases;  ///< Second moment (uncentered variance) of the past bias gradients, per layer.

   /*! Update the weights of the given layer, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases of the given layer, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdam(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t beta1 = 0.9, Scalar_t beta2 = 0.999,
         Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TAdam() = default;

   /*! Getters for the optimizer hyperparameters. */
   Scalar_t GetBeta1() const { return fBeta1; }
   Scalar_t GetBeta2() const { return fBeta2; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   /*! Accessors for the per-layer moment tensors. */
   std::vector<std::vector<Matrix_t>> &GetFirstMomentWeights() { return fFirstMomentWeights; }
   std::vector<Matrix_t> &GetFirstMomentWeightsAt(size_t i) { return fFirstMomentWeights[i]; }

   std::vector<std::vector<Matrix_t>> &GetFirstMomentBiases() { return fFirstMomentBiases; }
   std::vector<Matrix_t> &GetFirstMomentBiasesAt(size_t i) { return fFirstMomentBiases[i]; }

   std::vector<std::vector<Matrix_t>> &GetSecondMomentWeights() { return fSecondMomentWeights; }
   std::vector<Matrix_t> &GetSecondMomentWeightsAt(size_t i) { return fSecondMomentWeights[i]; }

   std::vector<std::vector<Matrix_t>> &GetSecondMomentBiases() { return fSecondMomentBiases; }
   std::vector<Matrix_t> &GetSecondMomentBiasesAt(size_t i) { return fSecondMomentBiases[i]; }
};
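// Usage sketch (illustrative only, not part of this header): assuming a fully
// constructed DeepNet_t `net`, the optimizer is typically driven through the
// Step() interface provided by the VOptimizer base class, e.g.
//
//    TAdam<Architecture_t> adam(net, /*learningRate=*/0.001, /*beta1=*/0.9,
//                               /*beta2=*/0.999, /*epsilon=*/1e-7);
//    adam.Step(); // one bias-corrected Adam update of all layer weights and biases
//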

template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdam<Architecture_t, Layer_t, DeepNet_t>::TAdam(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t beta1,
                                                 Scalar_t beta2, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fBeta1(beta1), fBeta2(beta2),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fFirstMomentWeights.resize(layersNSlices);
   fFirstMomentBiases.resize(layersNSlices);
   fSecondMomentWeights.resize(layersNSlices);
   fSecondMomentBiases.resize(layersNSlices);

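   // For every layer, create first- and second-moment tensors with the same shape
   // as the layer's weight and bias matrices and initialize them to zero.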
   for (size_t i = 0; i < layersNSlices; i++) {

      Architecture_t::CreateWeightTensors(fFirstMomentWeights[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fSecondMomentWeights[i], layers[i]->GetWeights());

      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fFirstMomentWeights[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fSecondMomentWeights[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fFirstMomentBiases[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fSecondMomentBiases[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fFirstMomentBiases[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fSecondMomentBiases[i][j], EInitialization::kZero);
      }
   }
}


template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                              const std::vector<Matrix_t> &weightGradients) -> void
{
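   // Adam update (bias-corrected form, see Kingma & Ba, https://arxiv.org/abs/1412.6980):
   //   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
   //   v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
   //   theta_t = theta_{t-1} - alpha_t * m_t / (sqrt(v_t) + epsilon)
   // where alpha_t = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t) folds the
   // bias correction of both moments into the step size.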
   std::vector<Matrix_t> &currentLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);

   // Bias-corrected step size for the current global step t.
   Scalar_t alpha = (this->GetLearningRate()) * (std::sqrt(1 - std::pow(this->GetBeta2(), this->GetGlobalStep()))) /
                    (1 - std::pow(this->GetBeta1(), this->GetGlobalStep()));

   for (size_t i = 0; i < weights.size(); i++) {
      // m_t update
      Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1());
      // v_t update
      Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2());
      // theta_t update
      Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
                                 alpha, this->GetEpsilon());
   }
}


template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                             const std::vector<Matrix_t> &biasGradients) -> void
{
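   // Same bias-corrected Adam update as in UpdateWeights, applied to the bias matrices.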
   std::vector<Matrix_t> &currentLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
   std::vector<Matrix_t> &currentLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);

   // Bias-corrected step size for the current global step t.
   Scalar_t alpha = (this->GetLearningRate()) * (std::sqrt(1 - std::pow(this->GetBeta2(), this->GetGlobalStep()))) /
                    (1 - std::pow(this->GetBeta1(), this->GetGlobalStep()));

   for (size_t i = 0; i < biases.size(); i++) {
      // m_t update
      Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1());
      // v_t update
      Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2());
      // theta_t update
      Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
                                 alpha, this->GetEpsilon());
   }
}

} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_ADAM