0001 #ifndef TMVA_NEURAL_NET_I
0002 #define TMVA_NEURAL_NET_I
0003 
0004 #ifndef TMVA_NEURAL_NET
0005 #error "Do not use NeuralNet.icc directly. #include \"NeuralNet.h\" instead."
0006 #endif // TMVA_NEURAL_NET
0007 #pragma once
0008 #ifndef _MSC_VER
0009 #pragma GCC diagnostic ignored "-Wunused-variable"
0010 #endif
0011 
0012 #include "Math/Util.h"
0013 
0014 #include "TMVA/Pattern.h"
0015 #include "TMVA/MethodBase.h"
0016 
0017 #include <tuple>
0018 #include <future>
0019 #include <random>
0020 
0021 namespace TMVA
0022 {
0023     namespace DNN
0024     {
0025 
0026 
0027 
0028 
0029 
0030 
0031 
0032 
0033         template <typename T>
0034             T uniformFromTo (T from, T to)
0035         {
0036             return from + (rand ()* (to - from)/RAND_MAX);
0037         }
0038 
0039 
0040 
0041         template <typename Container, typename T>
0042             void uniformDouble (Container& container, T maxValue)
0043         {
0044             for (auto it = begin (container), itEnd = end (container); it != itEnd; ++it)
0045             {
0046 //        (*it) = uniformFromTo (-1.0*maxValue, 1.0*maxValue);
0047                 (*it) = TMVA::DNN::uniformFromTo (-1.0*maxValue, 1.0*maxValue);
0048             }
0049         }
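// The helper above draws from rand(), which has limited resolution and shared global
// state. A minimal sketch of an equivalent helper built on the already-included <random>
// header (illustrative only; uniformFromToStd is not part of this file):
//
//     template <typename T>
//         T uniformFromToStd (T from, T to)
//     {
//         static thread_local std::mt19937 engine (std::random_device{}());
//         std::uniform_real_distribution<T> dist (from, to);
//         return dist (engine);
//     }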
0050 
0051 
0052         extern std::shared_ptr<std::function<double(double)>> ZeroFnc;
0053 
0054 
0055         extern std::shared_ptr<std::function<double(double)>> Sigmoid;
0056         extern std::shared_ptr<std::function<double(double)>> InvSigmoid;
0057 
0058         extern std::shared_ptr<std::function<double(double)>> Tanh;
0059         extern std::shared_ptr<std::function<double(double)>> InvTanh;
0060 
0061         extern std::shared_ptr<std::function<double(double)>> Linear;
0062         extern std::shared_ptr<std::function<double(double)>> InvLinear;
0063 
0064         extern std::shared_ptr<std::function<double(double)>> SymmReLU;
0065         extern std::shared_ptr<std::function<double(double)>> InvSymmReLU;
0066 
0067         extern std::shared_ptr<std::function<double(double)>> ReLU;
0068         extern std::shared_ptr<std::function<double(double)>> InvReLU;
0069 
0070         extern std::shared_ptr<std::function<double(double)>> SoftPlus;
0071         extern std::shared_ptr<std::function<double(double)>> InvSoftPlus;
0072 
0073         extern std::shared_ptr<std::function<double(double)>> TanhShift;
0074         extern std::shared_ptr<std::function<double(double)>> InvTanhShift;
0075 
0076         extern std::shared_ptr<std::function<double(double)>> SoftSign;
0077         extern std::shared_ptr<std::function<double(double)>> InvSoftSign;
0078 
0079         extern std::shared_ptr<std::function<double(double)>> Gauss;
0080         extern std::shared_ptr<std::function<double(double)>> InvGauss;
0081 
0082         extern std::shared_ptr<std::function<double(double)>> GaussComplement;
0083         extern std::shared_ptr<std::function<double(double)>> InvGaussComplement;
0084 
0085 
0086 /*! \brief apply the weights, optionally with drop-out; when drop-out is not used, pass a pointer to a bool set to true as itDrop so that *itDrop evaluates to "true"
0087  *
0088  * itDrop advances in step with itSourceBegin
0089  */
0090 template <bool HasDropOut, typename ItSource, typename ItWeight, typename ItTarget, typename ItDrop>
0091             void applyWeights (ItSource itSourceBegin, ItSource itSourceEnd,
0092                                ItWeight itWeight,
0093                                ItTarget itTargetBegin, ItTarget itTargetEnd,
0094                                ItDrop itDrop)
0095         {
0096             for (auto itSource = itSourceBegin; itSource != itSourceEnd; ++itSource)
0097             {
0098                 for (auto itTarget = itTargetBegin; itTarget != itTargetEnd; ++itTarget)
0099                 {
0100                     if (!HasDropOut || *itDrop)
0101                         (*itTarget) += (*itSource) * (*itWeight);
0102                     ++itWeight;
0103                 }
0104                 if (HasDropOut) ++itDrop;
0105             }
0106         }
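// Usage sketch (illustrative; plain std::vector containers assumed). The weights are laid
// out with the target index running fastest, i.e. w[s*numTargets + t] connects source s
// to target t:
//
//     std::vector<double> source  {1.0, 2.0};               // two source nodes
//     std::vector<double> weights {0.1, 0.2, 0.3,            // weights leaving source 0
//                                  0.4, 0.5, 0.6};           // weights leaving source 1
//     std::vector<double> target  (3, 0.0);                  // three target nodes
//     bool noDrop = true;                                    // dummy: all nodes active
//     applyWeights<false> (begin (source), end (source), begin (weights),
//                          begin (target), end (target), &noDrop);
//     // target == {0.9, 1.2, 1.5}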
0107 
0108 
0109 
0110 
0111 
0112 
0113 /*! \brief apply the weights backwards (for backpropagation); when drop-out is not used, pass a pointer to a bool set to true as itDrop so that *itDrop evaluates to "true"
0114  *
0115  * itDrop advances in step with itPrev (consistent with "applyWeights", where it advances with itSource; both refer to the same previous-layer nodes)
0116  */
0117 template <bool HasDropOut, typename ItSource, typename ItWeight, typename ItPrev, typename ItDrop>
0118             void applyWeightsBackwards (ItSource itCurrBegin, ItSource itCurrEnd,
0119                                         ItWeight itWeight,
0120                             ItPrev itPrevBegin, ItPrev itPrevEnd,
0121                             ItDrop itDrop)
0122         {
0123             for (auto itPrev = itPrevBegin; itPrev != itPrevEnd; ++itPrev)
0124             {
0125                 for (auto itCurr = itCurrBegin; itCurr != itCurrEnd; ++itCurr)
0126                 {
0127                     if (!HasDropOut || *itDrop)
0128                         (*itPrev) += (*itCurr) * (*itWeight);
0129                     ++itWeight;
0130                 }
0131                 if (HasDropOut) ++itDrop;
0132             }
0133         }
0134 
0135 
0136 
0137 
0138 
0139 
0140 
0141 /*! \brief apply the activation functions
0142  *
0143  *
0144  */
0145 
0146         template <typename ItValue, typename Fnc>
0147             void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc)
0148         {
0149             while (itValue != itValueEnd)
0150             {
0151                 auto& value = (*itValue);
0152                 value = (*fnc.get ()) (value);
0153 
0154                 ++itValue;
0155             }
0156         }
0157 
0158 
0159 /*! \brief apply the activation functions and compute the gradient
0160  *
0161  *
0162  */
0163         template <typename ItValue, typename Fnc, typename InvFnc, typename ItGradient>
0164             void applyFunctions (ItValue itValue, ItValue itValueEnd, Fnc fnc, InvFnc invFnc, ItGradient itGradient)
0165         {
0166             while (itValue != itValueEnd)
0167             {
0168                 auto& value = (*itValue);
0169                 value = (*fnc.get ()) (value);
0170                 (*itGradient) = (*invFnc.get ()) (value);
0171 
0172                 ++itValue; ++itGradient;
0173             }
0174         }
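// Usage sketch (illustrative): activations are passed around as
// std::shared_ptr<std::function<double(double)>>, so a custom function can be applied
// exactly like the predefined ones (Sigmoid, Tanh, ReLU, ...):
//
//     std::vector<double> values {-1.0, 0.0, 2.0};
//     auto myRelu = std::make_shared<std::function<double(double)>> (
//         [](double x) { return x > 0.0 ? x : 0.0; });
//     applyFunctions (begin (values), end (values), myRelu);
//     // values == {0.0, 0.0, 2.0}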
0175 
0176 
0177 
0178 /*! \brief update the gradients
0179  *
0180  *
0181  */
0182         template <typename ItSource, typename ItDelta, typename ItTargetGradient, typename ItGradient>
0183             void update (ItSource itSource, ItSource itSourceEnd,
0184                          ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd,
0185                          ItTargetGradient itTargetGradientBegin,
0186                          ItGradient itGradient)
0187         {
0188             while (itSource != itSourceEnd)
0189             {
0190                 auto itTargetDelta = itTargetDeltaBegin;
0191                 auto itTargetGradient = itTargetGradientBegin;
0192                 while (itTargetDelta != itTargetDeltaEnd)
0193                 {
0194                     (*itGradient) -= (*itTargetDelta) * (*itSource) * (*itTargetGradient);
0195                     ++itTargetDelta; ++itTargetGradient; ++itGradient;
0196                 }
0197                 ++itSource;
0198             }
0199         }
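// In formula form: for each source value v_s, target delta d_t and activation gradient a_t,
// the corresponding weight-gradient entry is updated as  g_{s,t} -= d_t * v_s * a_t .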
0200 
0201 
0202 
0203 
0204 /*! \brief compute the regularization (L1, L2)
0205  *
0206  *
0207  */
0208         template <EnumRegularization Regularization>
0209             inline double computeRegularization (double weight, const double& factorWeightDecay)
0210         {
0211            MATH_UNUSED(weight);
0212            MATH_UNUSED(factorWeightDecay);
0213 
0214             return 0;
0215         }
0216 
0217 // L1 regularization
0218         template <>
0219             inline double computeRegularization<EnumRegularization::L1> (double weight, const double& factorWeightDecay)
0220         {
0221             return weight == 0.0 ? 0.0 : std::copysign (factorWeightDecay, weight);
0222         }
0223 
0224 // L2 regularization
0225         template <>
0226             inline double computeRegularization<EnumRegularization::L2> (double weight, const double& factorWeightDecay)
0227         {
0228             return factorWeightDecay * weight;
0229         }
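// Worked example (following the definitions above), with weight = -0.5 and
// factorWeightDecay = 0.01:
//   unspecialised template (no regularization): 0
//   L1: std::copysign (0.01, -0.5) = -0.01   (constant magnitude, sign of the weight)
//   L2: 0.01 * (-0.5)              = -0.005  (proportional to the weight)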
0230 
0231 
0232 /*! \brief update the gradients, using regularization
0233  *
0234  *
0235  */
0236         template <EnumRegularization Regularization, typename ItSource, typename ItDelta, typename ItTargetGradient, typename ItGradient, typename ItWeight>
0237             void update (ItSource itSource, ItSource itSourceEnd,
0238                          ItDelta itTargetDeltaBegin, ItDelta itTargetDeltaEnd,
0239                          ItTargetGradient itTargetGradientBegin,
0240                          ItGradient itGradient,
0241                          ItWeight itWeight, double weightDecay)
0242         {
0243             // NOTE: the factor weightDecay must already be scaled by 1/n, where n is the number of weights
0244             while (itSource != itSourceEnd)
0245             {
0246                 auto itTargetDelta = itTargetDeltaBegin;
0247                 auto itTargetGradient = itTargetGradientBegin;
0248                 while (itTargetDelta != itTargetDeltaEnd)
0249                 {
0250                     (*itGradient) -= (*itTargetDelta) * (*itSource) * (*itTargetGradient) + computeRegularization<Regularization>(*itWeight,weightDecay);
0251                     ++itTargetDelta; ++itTargetGradient; ++itGradient; ++itWeight;
0252                 }
0253                 ++itSource;
0254             }
0255         }
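// Same update as above, but with the regularization term added per weight:
//   g_{s,t} -= d_t * v_s * a_t + R(w_{s,t}, weightDecay)
// where R is computeRegularization<Regularization>.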
0256 
0257 
0258 
0259 
0260 
0261 
0262 #define USELOCALWEIGHTS 1
0263 
0264 
0265 
0266 /*! \brief implementation of the steepest gradient descent algorithm
0267  *
0268  * Can be used with multithreading (i.e. "HogWild!" style); see call in trainCycle
0269  */
0270         template <typename Function, typename Weights, typename PassThrough>
0271             double Steepest::operator() (Function& fitnessFunction, Weights& weights, PassThrough& passThrough)
0272         {
0273             size_t numWeights = weights.size ();
0274             // std::vector<double> gradients (numWeights, 0.0);
0275             m_localGradients.assign (numWeights, 0.0);
0276             // std::vector<double> localWeights (begin (weights), end (weights));
0277             // m_localWeights.reserve (numWeights);
0278             m_localWeights.assign (begin (weights), end (weights));
0279 
0280             double E = 1e10;
0281             if (m_prevGradients.size () != numWeights)
0282             {
0283                 m_prevGradients.clear ();
0284                 m_prevGradients.assign (weights.size (), 0);
0285             }
0286 
0287             bool success = true;
0288             size_t currentRepetition = 0;
0289             while (success)
0290             {
0291                 if (currentRepetition >= m_repetitions)
0292                     break;
0293 
0294                 m_localGradients.assign (numWeights, 0.0);
0295 
0296                 // --- nesterov momentum ---
0297                 // apply momentum before computing the new gradient
0298                 auto itPrevG = begin (m_prevGradients);
0299                 auto itPrevGEnd = end (m_prevGradients);
0300                 auto itLocWeight = begin (m_localWeights);
0301                 for (; itPrevG != itPrevGEnd; ++itPrevG, ++itLocWeight)
0302                 {
0303                     (*itPrevG) *= m_beta;
0304                     (*itLocWeight) += (*itPrevG);
0305                 }
0306 
0307                 E = fitnessFunction (passThrough, m_localWeights, m_localGradients);
0308 //            plotGradients (gradients);
0309 //            plotWeights (localWeights);
0310 
0311                 double alpha = gaussDouble (m_alpha, m_alpha/2.0);
0312 //                double alpha = m_alpha;
0313 
0314                 auto itG = begin (m_localGradients);
0315                 auto itGEnd = end (m_localGradients);
0316                 itPrevG = begin (m_prevGradients);
0317                 double maxGrad = 0.0;
0318                 for (; itG != itGEnd; ++itG, ++itPrevG)
0319                 {
0320                     double currGrad = (*itG);
0321                     double prevGrad = (*itPrevG);
0322                     currGrad *= alpha;
0323 
0324                     //(*itPrevG) = m_beta * (prevGrad + currGrad);
0325                     currGrad += prevGrad;
0326                     (*itG) = currGrad;
0327                     (*itPrevG) = currGrad;
0328 
0329                     if (std::fabs (currGrad) > maxGrad)
0330                         maxGrad = currGrad;
0331                 }
0332 
0333                 if (maxGrad > 1)
0334                 {
0335                     m_alpha /= 2;
0336                     std::cout << "\nlearning rate reduced to " << m_alpha << std::endl;
0337                     std::for_each (weights.begin (), weights.end (), [maxGrad](double& w)
0338                                    {
0339                                        w /= maxGrad;
0340                                    });
0341                     m_prevGradients.clear ();
0342                 }
0343                 else
0344                 {
0345                     auto itW = std::begin (weights);
0346                     std::for_each (std::begin (m_localGradients), std::end (m_localGradients), [&itW](double& g)
0347                                    {
0348                                        *itW += g;
0349                                        ++itW;
0350                                    });
0351                 }
0352 
0353                 ++currentRepetition;
0354             }
0355             return E;
0356         }
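// Summary of the update performed per repetition and per weight (a Nesterov-flavoured
// momentum scheme; p is the entry of m_prevGradients, g the freshly accumulated descent
// direction returned via m_localGradients):
//   p <- beta * p          (decayed momentum, applied to the local weights before the
//                           fitness function is evaluated)
//   g <- alpha' * g + p    (alpha' drawn from a Gaussian around m_alpha)
//   p <- g ;  w <- w + g   (unless the largest step exceeds 1, in which case the learning
//                           rate is halved, the weights are rescaled and the momentum
//                           history is cleared)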
0357 
0358 
0359 
0360 
0361 
0362 
0363 
0364 
0365 
0366 
0367 
0368 
0369 
0370 
0371 
0372 
0373 
0374 
0375 
0376 
0377 /*! \brief sum of squares error function
0378  *
0379  *
0380  */
0381         template <typename ItOutput, typename ItTruth, typename ItDelta, typename InvFnc>
0382             double sumOfSquares (ItOutput itOutputBegin, ItOutput itOutputEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, InvFnc invFnc, double patternWeight)
0383         {
0384             double errorSum = 0.0;
0385 
0386             // output - truth
0387             ItTruth itTruth = itTruthBegin;
0388             bool hasDeltas = (itDelta != itDeltaEnd);
0389             for (ItOutput itOutput = itOutputBegin; itOutput != itOutputEnd; ++itOutput, ++itTruth)
0390             {
0391 //  assert (itTruth != itTruthEnd);
0392                 double output = (*itOutput);
0393                 double error = output - (*itTruth);
0394                 if (hasDeltas)
0395                 {
0396                     (*itDelta) = (*invFnc.get ()) (output) * error * patternWeight;
0397                     ++itDelta;
0398                 }
0399                 errorSum += error*error  * patternWeight;
0400             }
0401 
0402             return 0.5*errorSum;
0403         }
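// In formula form, with pattern weight w_p and invFnc the derivative of the activation:
//   E       = 1/2 * sum_i w_p * (output_i - truth_i)^2
//   delta_i = invFnc (output_i) * (output_i - truth_i) * w_p   (only if deltas are requested)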
0404 
0405 
0406 
0407 /*! \brief cross entropy error function
0408  *
0409  *
0410  */
0411         template <typename ItProbability, typename ItTruth, typename ItDelta, typename ItInvActFnc>
0412             double crossEntropy (ItProbability itProbabilityBegin, ItProbability itProbabilityEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc /*itInvActFnc*/, double patternWeight)
0413         {
0414             bool hasDeltas = (itDelta != itDeltaEnd);
0415 
0416             double errorSum = 0.0;
0417             for (ItProbability itProbability = itProbabilityBegin; itProbability != itProbabilityEnd; ++itProbability)
0418             {
0419                 double probability = *itProbability;
0420                 double truth = *itTruthBegin;
0421                 /* truth = truth < 0.1 ? 0.1 : truth; */
0422                 /* truth = truth > 0.9 ? 0.9 : truth; */
0423                 truth = truth < 0.5 ? 0.1 : 0.9;
0424                 if (hasDeltas)
0425                 {
0426                     double delta = probability - truth;
0427                     (*itDelta) = delta*patternWeight;
0428 //      (*itDelta) = (*itInvActFnc)(probability) * delta * patternWeight;
0429                     ++itDelta;
0430                 }
0431                 double error (0);
0432                 if (probability == 0) // protection against log (0)
0433                 {
0434                     if (truth >= 0.5)
0435                         error += 1.0;
0436                 }
0437                 else if (probability == 1)
0438                 {
0439                     if (truth < 0.5)
0440                         error += 1.0;
0441                 }
0442                 else
0443                     error += - (truth * log (probability) + (1.0-truth) * log (1.0-probability)); // cross entropy function
0444                 errorSum += error * patternWeight;
0445 
0446             }
0447             return errorSum;
0448         }
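// In formula form, with p the predicted probability and t the (clamped) truth:
//   E       = -sum_i w_p * ( t_i * log (p_i) + (1 - t_i) * log (1 - p_i) )
//   delta_i = (p_i - t_i) * w_p
// p_i == 0 and p_i == 1 are treated separately above to protect against log (0).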
0449 
0450 
0451 
0452 
0453 /*! \brief softmax cross-entropy error function (for mutually exclusive classes)
0454  *
0455  *
0456  */
0457         template <typename ItOutput, typename ItTruth, typename ItDelta, typename ItInvActFnc>
0458             double softMaxCrossEntropy (ItOutput itProbabilityBegin, ItOutput itProbabilityEnd, ItTruth itTruthBegin, ItTruth /*itTruthEnd*/, ItDelta itDelta, ItDelta itDeltaEnd, ItInvActFnc /*itInvActFnc*/, double patternWeight)
0459         {
0460             double errorSum = 0.0;
0461 
0462             bool hasDeltas = (itDelta != itDeltaEnd);
0463             // output - truth
0464             ItTruth itTruth = itTruthBegin;
0465             for (auto itProbability = itProbabilityBegin; itProbability != itProbabilityEnd; ++itProbability, ++itTruth)
0466             {
0467 //  assert (itTruth != itTruthEnd);
0468                 double probability = (*itProbability);
0469                 double truth = (*itTruth);
0470                 if (hasDeltas)
0471                 {
0472                     (*itDelta) = probability - truth;
0473 //      (*itDelta) = (*itInvActFnc)(sm) * delta * patternWeight;
0474                     ++itDelta; //++itInvActFnc;
0475                 }
0476                 double error (0);
0477 
0478                 error += truth * log (probability);
0479                 errorSum += error;
0480             }
0481 
0482             return -errorSum * patternWeight;
0483         }
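// In formula form (standard multi-class cross entropy on softmax outputs p_k):
//   E       = -w_p * sum_k t_k * log (p_k)
//   delta_k = p_k - t_k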
0484 
0485 
0486 
0487 
0488 
0489 
0490 
0491 
0492 
0493 /*! \brief compute the weight decay for regularization (L1 or L2)
0494  *
0495  *
0496  */
0497         template <typename ItWeight>
0498             double weightDecay (double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
0499         {
0500             if (eRegularization == EnumRegularization::L1)
0501             {
0502                 // weight decay (regularization)
0503                 double w = 0;
0504                 size_t n = 0;
0505                 for (; itWeight != itWeightEnd; ++itWeight, ++n)
0506                 {
0507                     double weight = (*itWeight);
0508                     w += std::fabs (weight);
0509                 }
0510                 return error + 0.5 * w * factorWeightDecay / n;
0511             }
0512             else if (eRegularization == EnumRegularization::L2)
0513             {
0514                 // weight decay (regularization)
0515                 double w = 0;
0516                 size_t n = 0;
0517                 for (; itWeight != itWeightEnd; ++itWeight, ++n)
0518                 {
0519                     double weight = (*itWeight);
0520                     w += weight*weight;
0521                 }
0522                 return error + 0.5 * w * factorWeightDecay / n;
0523             }
0524             else
0525                 return error;
0526         }
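// Worked example (L2 branch above): for weights {0.5, -1.0, 2.0}, factorWeightDecay = 0.01
// and an incoming error of 1.0:
//   sum of squared weights = 0.25 + 1.0 + 4.0 = 5.25,  n = 3
//   returned error         = 1.0 + 0.5 * 5.25 * 0.01 / 3 = 1.00875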
0527 
0528 
0529 
0530 
0531 
0532 
0533 
0534 
0535 
0536 
0537 
0538 
0539 
0540 
0541 /*! \brief apply the weights (and functions) in forward direction of the DNN
0542  *
0543  *
0544  */
0545         template <typename LAYERDATA>
0546             void forward (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData)
0547         {
0548             if (prevLayerData.hasDropOut ())
0549             {
0550                 applyWeights<true> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
0551                                     currLayerData.weightsBegin (),
0552                                     currLayerData.valuesBegin (), currLayerData.valuesEnd (),
0553                                     prevLayerData.dropOut ());
0554             }
0555             else
0556             {
0557                 bool dummy = true;
0558                 applyWeights<false> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
0559                                      currLayerData.weightsBegin (),
0560                                      currLayerData.valuesBegin (), currLayerData.valuesEnd (),
0561                                      &dummy); // dummy to turn on all nodes (no drop out)
0562             }
0563         }
0564 
0565 
0566 
0567 /*! \brief backward application of the weights (back-propagation of the error)
0568  *
0569  *
0570  */
0571 template <typename LAYERDATA>
0572     void backward (LAYERDATA& prevLayerData, LAYERDATA& currLayerData)
0573 {
0574     if (prevLayerData.hasDropOut ())
0575     {
0576         applyWeightsBackwards<true> (currLayerData.deltasBegin (), currLayerData.deltasEnd (),
0577                                      currLayerData.weightsBegin (),
0578                                      prevLayerData.deltasBegin (), prevLayerData.deltasEnd (),
0579                                      prevLayerData.dropOut ());
0580     }
0581     else
0582     {
0583         bool dummy = true;
0584         applyWeightsBackwards<false> (currLayerData.deltasBegin (), currLayerData.deltasEnd (),
0585                                       currLayerData.weightsBegin (),
0586                                       prevLayerData.deltasBegin (), prevLayerData.deltasEnd (),
0587                                       &dummy); // dummy to use all nodes (no drop out)
0588     }
0589 }
0590 
0591 
0592 
0593 
0594 
0595 /*! \brief update the node values
0596  *
0597  *
0598  */
0599         template <typename LAYERDATA>
0600             void update (const LAYERDATA& prevLayerData, LAYERDATA& currLayerData, double factorWeightDecay, EnumRegularization regularization)
0601         {
0602             // NOTE: the "factorWeightDecay" must already be scaled by 1/n, where n is the number of weights
0603             if (factorWeightDecay != 0.0) // has weight regularization
0604                 if (regularization == EnumRegularization::L1)  // L1 regularization ( sum(|w|) )
0605                 {
0606                     update<EnumRegularization::L1> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
0607                                                     currLayerData.deltasBegin (), currLayerData.deltasEnd (),
0608                                                     currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin (),
0609                                                     currLayerData.weightsBegin (), factorWeightDecay);
0610                 }
0611                 else if (regularization == EnumRegularization::L2) // L2 regularization ( sum(w^2) )
0612                 {
0613                     update<EnumRegularization::L2> (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
0614                                                     currLayerData.deltasBegin (), currLayerData.deltasEnd (),
0615                                                     currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin (),
0616                                                     currLayerData.weightsBegin (), factorWeightDecay);
0617                 }
0618                 else
0619                 {
0620                     update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
0621                             currLayerData.deltasBegin (), currLayerData.deltasEnd (),
0622                             currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin ());
0623                 }
0624 
0625             else
0626             { // no weight regularization
0627                 update (prevLayerData.valuesBegin (), prevLayerData.valuesEnd (),
0628                         currLayerData.deltasBegin (), currLayerData.deltasEnd (),
0629                         currLayerData.valueGradientsBegin (), currLayerData.gradientsBegin ());
0630             }
0631         }
0632 
0633 
0634 
0635 
0636 
0637 
0638 
0639 
0640 
0641 
0642 
0643 
0644 /*! \brief compute the drop-out weight factor
0645  *
0646  * When drop-out is used, a fraction of the nodes is turned off in each computation cycle.
0647  * Once all nodes are turned on again (for instance when the test samples are evaluated),
0648  * the weights have to be adjusted to account for the different number of active nodes.
0649  * This function computes that factor and applies it to the weights.
0650  */
0651         template <typename WeightsType, typename DropProbabilities>
0652             void Net::dropOutWeightFactor (WeightsType& weights,
0653                                            const DropProbabilities& drops,
0654                                            bool inverse)
0655         {
0656             if (drops.empty () || weights.empty ())
0657                 return;
0658 
0659             auto itWeight = std::begin (weights);
0660             auto itWeightEnd = std::end (weights);
0661             auto itDrop = std::begin (drops);
0662             auto itDropEnd = std::end (drops);
0663             size_t numNodesPrev = inputSize ();
0664             double dropFractionPrev = *itDrop;
0665             ++itDrop;
0666 
0667             for (auto& layer : layers ())
0668             {
0669                 if (itDrop == itDropEnd)
0670                     break;
0671 
0672                 size_t _numNodes = layer.numNodes ();
0673 
0674                 double dropFraction = *itDrop;
0675                 double pPrev = 1.0 - dropFractionPrev;
0676                 double p = 1.0 - dropFraction;
0677                 p *= pPrev;
0678 
0679                 if (inverse)
0680                 {
0681                     p = 1.0/p;
0682                 }
0683                 size_t _numWeights = layer.numWeights (numNodesPrev);
0684                 for (size_t iWeight = 0; iWeight < _numWeights; ++iWeight)
0685                 {
0686                     if (itWeight == itWeightEnd)
0687                         break;
0688 
0689                     *itWeight *= p;
0690                     ++itWeight;
0691                 }
0692                 numNodesPrev = _numNodes;
0693                 dropFractionPrev = dropFraction;
0694                 ++itDrop;
0695             }
0696         }
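// Worked example (following the loop above): with drop fractions {0.2, 0.5} for the input
// layer and the first hidden layer, the weights between them are scaled by
//   p = (1 - 0.2) * (1 - 0.5) = 0.4
// when all nodes are switched on for evaluation, and by 1/p = 2.5 when switching back to
// training with drop-out (inverse == true).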
0697 
0698 
0699 
0700 
0701 
0702 
0703 /*! \brief execute the training until convergence is reached
0704  *
0705  * \param weights the container with the weights (synapses)
0706  * \param trainPattern the patterns used for training
0707  * \param testPattern the patterns used for testing
0708  * \param minimizer the minimizer (e.g. steepest gradient descent) to be used
0709  * \param settings the settings for the training (e.g. multithreading or not, regularization, etc.)
0710  */
0711         template <typename Minimizer>
0712             double Net::train (std::vector<double>& weights,
0713                                std::vector<Pattern>& trainPattern,
0714                                const std::vector<Pattern>& testPattern,
0715                            Minimizer& minimizer,
0716                            Settings& settings)
0717         {
0718 //        std::cout << "START TRAINING" << std::endl;
0719             settings.startTrainCycle ();
0720 
0721             // JsMVA progress bar maximum (100%)
0722             if (fIPyMaxIter) *fIPyMaxIter = 100;
0723 
0724             settings.pads (4);
0725             settings.create ("trainErrors", 100, 0, 100, 100, 0,1);
0726             settings.create ("testErrors", 100, 0, 100, 100, 0,1);
0727 
0728             size_t cycleCount = 0;
0729             size_t testCycleCount = 0;
0730             double testError = 1e20;
0731             double trainError = 1e20;
0732             size_t dropOutChangeCount = 0;
0733 
0734             DropContainer dropContainer;
0735             DropContainer dropContainerTest;
0736             const std::vector<double>& dropFractions = settings.dropFractions ();
0737             bool isWeightsForDrop = false;
0738 
0739 
0740             // until convergence
0741             do
0742             {
0743                 ++cycleCount;
0744 
0745                 // if dropOut enabled
0746                 size_t dropIndex = 0;
0747                 if (!dropFractions.empty () && dropOutChangeCount % settings.dropRepetitions () == 0)
0748                 {
0749                     // fill the dropOut-container
0750                     dropContainer.clear ();
0751                     size_t _numNodes = inputSize ();
0752                     double dropFraction = 0.0;
0753                     dropFraction = dropFractions.at (dropIndex);
0754                     ++dropIndex;
0755                     fillDropContainer (dropContainer, dropFraction, _numNodes);
0756                     for (auto itLayer = begin (m_layers), itLayerEnd = end (m_layers); itLayer != itLayerEnd; ++itLayer, ++dropIndex)
0757                     {
0758                         auto& layer = *itLayer;
0759                         _numNodes = layer.numNodes ();
0760                         // how many nodes have to be dropped
0761                         dropFraction = 0.0;
0762                         if (dropFractions.size () > dropIndex)
0763                             dropFraction = dropFractions.at (dropIndex);
0764 
0765                         fillDropContainer (dropContainer, dropFraction, _numNodes);
0766                     }
0767                     isWeightsForDrop = true;
0768                 }
0769 
0770                 // execute training cycle
0771                 trainError = trainCycle (minimizer, weights, begin (trainPattern), end (trainPattern), settings, dropContainer);
0772 
0773 
0774                 // ------ check if we have to execute a test ------------------
0775                 bool hasConverged = false;
0776                 if (testCycleCount % settings.testRepetitions () == 0) // we test only every "testRepetitions" repetitions
0777                 {
0778                     if (isWeightsForDrop)
0779                     {
0780                         dropOutWeightFactor (weights, dropFractions);
0781                         isWeightsForDrop = false;
0782                     }
0783 
0784 
0785                     testError = 0;
0786                     //double weightSum = 0;
0787                     settings.startTestCycle ();
0788                     if (settings.useMultithreading ())
0789                     {
0790                         size_t numThreads = std::thread::hardware_concurrency ();
0791                         size_t patternPerThread = testPattern.size () / numThreads;
0792                         std::vector<Batch> batches;
0793                         auto itPat = testPattern.begin ();
0794                         // auto itPatEnd = testPattern.end ();
0795                         for (size_t idxThread = 0; idxThread < numThreads-1; ++idxThread)
0796                         {
0797                             batches.push_back (Batch (itPat, itPat + patternPerThread));
0798                             itPat += patternPerThread;
0799                         }
0800                         if (itPat != testPattern.end ())
0801                             batches.push_back (Batch (itPat, testPattern.end ()));
0802 
0803                         std::vector<std::future<std::tuple<double,std::vector<double>>>> futures;
0804                         for (auto& batch : batches)
0805                         {
0806                             // -------------------- execute each of the batch ranges on a different thread -------------------------------
0807                             futures.push_back (
0808                                 std::async (std::launch::async, [&]()
0809                                             {
0810                                                 std::vector<double> localOutput;
0811                                                 pass_through_type passThrough (settings, batch, dropContainerTest);
0812                                                 double testBatchError = (*this) (passThrough, weights, ModeOutput::FETCH, localOutput);
0813                                                 return std::make_tuple (testBatchError, localOutput);
0814                                             })
0815                                 );
0816                         }
0817 
0818                         auto itBatch = batches.begin  ();
0819                         for (auto& f : futures)
0820                         {
0821                             std::tuple<double,std::vector<double>> result = f.get ();
0822                             testError += std::get<0>(result) / batches.size ();
0823                             std::vector<double> output = std::get<1>(result);
0824                             if (output.size() == (outputSize() - 1) * itBatch->size())
0825                             {
0826                                 auto output_iterator = output.begin();
0827                                 for (auto pattern_it = itBatch->begin(); pattern_it != itBatch->end(); ++pattern_it)
0828                                 {
0829                                     for (size_t output_index = 1; output_index < outputSize(); ++output_index)
0830                                     {
0831                                         settings.testSample (0, *output_iterator, (*pattern_it).output ().at (0),
0832                                                                                   (*pattern_it).weight ());
0833                                         ++output_iterator;
0834                                     }
0835                                 }
0836                             }
0837                             ++itBatch;
0838                         }
0839 
0840                     }
0841                     else
0842                     {
0843                         std::vector<double> output;
0844                         //for (auto it = begin (testPattern), itEnd = end (testPattern); it != itEnd; ++it)
0845                         {
0846                             //const Pattern& p = (*it);
0847                             //double weight = p.weight ();
0848                             //Batch batch (it, it+1);
0849                             Batch batch (begin (testPattern), end (testPattern));
0850                             output.clear ();
0851                             pass_through_type passThrough (settings, batch, dropContainerTest);
0852                             double testPatternError = (*this) (passThrough, weights, ModeOutput::FETCH, output);
0853                             if (output.size() == (outputSize() - 1) * batch.size())
0854                             {
0855                                 auto output_iterator = output.begin();
0856                                 for (auto pattern_it = batch.begin(); pattern_it != batch.end(); ++pattern_it)
0857                                 {
0858                                     for (size_t output_index = 1; output_index < outputSize(); ++output_index)
0859                                     {
0860                                         settings.testSample (0, *output_iterator, (*pattern_it).output ().at (0),
0861                                                              (*pattern_it).weight ());
0862                                         ++output_iterator;
0863                                     }
0864                                 }
0865                             }
0866                             testError += testPatternError; /// batch.size ();
0867                         }
0868                         // testError /= testPattern.size ();
0869                     }
0870                     settings.endTestCycle ();
0871 //                    testError /= weightSum;
0872 
0873                     settings.computeResult (*this, weights);
0874 
0875                     hasConverged = settings.hasConverged (testError);
0876                     if (!hasConverged && !isWeightsForDrop)
0877                     {
0878                         dropOutWeightFactor (weights, dropFractions, true); // inverse
0879                         isWeightsForDrop = true;
0880                     }
0881                 }
0882                 ++testCycleCount;
0883                 ++dropOutChangeCount;
0884 
0885 
0886 //            settings.resetPlot ("errors");
0887                 settings.addPoint ("trainErrors", cycleCount, trainError);
0888                 settings.addPoint ("testErrors", cycleCount, testError);
0889                 settings.plot ("trainErrors", "C", 1, kBlue);
0890                 settings.plot ("testErrors", "C", 1, kMagenta);
0891 
0892 
0893                 // setup error plots and progress bar variables for JsMVA
0894                 if (fInteractive){
0895                   fInteractive->AddPoint(cycleCount, trainError, testError);
0896                   if (*fExitFromTraining) break;
0897                   *fIPyCurrentIter = 100*(double)settings.maxConvergenceCount () /(double)settings.convergenceSteps ();
0898                 }
0899 
0900                 if (hasConverged)
0901                     break;
0902 
0903                 if ((int)cycleCount % 10 == 0) {
0904 
0905                    TString convText = TString::Format( "(train/test/epo/conv/maxco): %.3g/%.3g/%d/%d/%d",
0906                                             trainError,
0907                                             testError,
0908                                             (int)cycleCount,
0909                                             (int)settings.convergenceCount (),
0910                                             (int)settings.maxConvergenceCount ());
0911                    double progress = 100*(double)settings.maxConvergenceCount () /(double)settings.convergenceSteps ();
0912                    settings.cycle (progress, convText);
0913                 }
0914             }
0915             while (true);
0916             settings.endTrainCycle (trainError);
0917 
0918             TString convText = TString::Format( "(train/test/epoch): %.4g/%.4g/%d", trainError, testError, (int)cycleCount);
0919             double progress = 100*(double)settings.maxConvergenceCount() /(double)settings.convergenceSteps ();
0920             settings.cycle (progress, convText);
0921 
0922             return testError;
0923         }
0924 
0925 
0926 
0927 /*! \brief execute a single training cycle
0928  *
0929  * uses multithreading if turned on
0930  *
0931  * \param minimizer the minimizer to be used (e.g. SGD)
0932  * \param weights the weight container with all the synapse weights
0933  * \param itPatternBegin begin of the pattern container
0934  * \param itPatternEnd the end of the pattern container
0935  * \param settings the settings for this training (e.g. multithreading or not, regularization, etc.)
0936  * \param dropContainer the data for dropping-out nodes (regularization technique)
0937  */
0938         template <typename Iterator, typename Minimizer>
0939             inline double Net::trainCycle (Minimizer& minimizer, std::vector<double>& weights,
0940                                            Iterator itPatternBegin, Iterator itPatternEnd, Settings& settings, DropContainer& dropContainer)
0941         {
0942             double error = 0.0;
0943             size_t numPattern = std::distance (itPatternBegin, itPatternEnd);
0944             size_t numBatches = numPattern/settings.batchSize ();
0945             size_t numBatches_stored = numBatches;
0946 
0947             std::shuffle(itPatternBegin, itPatternEnd, std::default_random_engine{});
0948             Iterator itPatternBatchBegin = itPatternBegin;
0949             Iterator itPatternBatchEnd = itPatternBatchBegin;
0950 
0951             // create batches
0952             std::vector<Batch> batches;
0953             while (numBatches > 0)
0954             {
0955                 std::advance (itPatternBatchEnd, settings.batchSize ());
0956                 batches.push_back (Batch (itPatternBatchBegin, itPatternBatchEnd));
0957                 itPatternBatchBegin = itPatternBatchEnd;
0958                 --numBatches;
0959             }
0960 
0961             // add the last pattern to the last batch
0962             if (itPatternBatchEnd != itPatternEnd)
0963                 batches.push_back (Batch (itPatternBatchEnd, itPatternEnd));
0964 
0965 
0966             // use multithreading if requested
0967             if (settings.useMultithreading ())
0968             {
0969                 // -------------------- divide the batches into bunches for each thread --------------
0970                 size_t numThreads = std::thread::hardware_concurrency ();
0971                 size_t batchesPerThread = batches.size () / numThreads;
0972                 typedef std::vector<Batch>::iterator batch_iterator;
0973                 std::vector<std::pair<batch_iterator,batch_iterator>> batchVec;
0974                 batch_iterator itBatchBegin = std::begin (batches);
0975                 batch_iterator itBatchCurrEnd = std::begin (batches);
0976                 batch_iterator itBatchEnd = std::end (batches);
0977                 for (size_t iT = 0; iT < numThreads; ++iT)
0978                 {
0979                     if (iT == numThreads-1)
0980                         itBatchCurrEnd = itBatchEnd;
0981                     else
0982                         std::advance (itBatchCurrEnd, batchesPerThread);
0983                     batchVec.push_back (std::make_pair (itBatchBegin, itBatchCurrEnd));
0984                     itBatchBegin = itBatchCurrEnd;
0985                 }
0986 
0987                 // -------------------- loop  over batches -------------------------------------------
0988                 std::vector<std::future<double>> futures;
0989                 for (auto& batchRange : batchVec)
0990                 {
0991                     // -------------------- execute each of the batch ranges on a different thread -------------------------------
0992                     futures.push_back (
0993                         std::async (std::launch::async, [&]()
0994                                     {
0995                                         double localError = 0.0;
0996                                         for (auto it = batchRange.first, itEnd = batchRange.second; it != itEnd; ++it)
0997                                         {
0998                                             Batch& batch = *it;
0999                                             pass_through_type settingsAndBatch (settings, batch, dropContainer);
1000                                             Minimizer minimizerClone (minimizer);
1001                                             localError += minimizerClone ((*this), weights, settingsAndBatch); /// call the minimizer
1002                                         }
1003                                         return localError;
1004                                     })
1005                         );
1006                 }
1007 
1008                 for (auto& f : futures)
1009                     error += f.get ();
1010             }
1011             else
1012             {
1013                 for (auto& batch : batches)
1014                 {
1015                     std::tuple<Settings&, Batch&, DropContainer&> settingsAndBatch (settings, batch, dropContainer);
1016                     error += minimizer ((*this), weights, settingsAndBatch);
1017                 }
1018             }
1019 
1020             numBatches_stored = std::max (numBatches_stored, size_t(1)); /// normalize the error
1021             error /= numBatches_stored;
1022             settings.testIteration ();
1023 
1024             return error;
1025         }
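// Batching example (following the logic above): with 1000 training patterns and
// settings.batchSize () == 30, 33 full batches of 30 patterns are built and the remaining
// 10 patterns form a 34th, smaller batch. With multithreading and 4 hardware threads,
// batchesPerThread = 34/4 = 8, so the first three threads process 8 batches each and the
// last thread takes the remaining 10.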
1026 
1027 
1028 
1029 
1030 
1031 /*! \brief compute the neural net
1032  *
1033  * \param input the input data
1034  * \param weights the weight data
1035  */
1036         template <typename Weights>
1037             std::vector<double> Net::compute (const std::vector<double>& input, const Weights& weights) const
1038         {
1039             std::vector<LayerData> layerData;
1040             layerData.reserve (m_layers.size ()+1);
1041             auto itWeight = begin (weights);
1042             auto itInputBegin = begin (input);
1043             auto itInputEnd = end (input);
1044             layerData.push_back (LayerData (itInputBegin, itInputEnd));
1045             size_t numNodesPrev = input.size ();
1046 
1047         // -------------------- prepare layer data with one pattern -------------------------------
1048             for (auto& layer: m_layers)
1049             {
1050                 layerData.push_back (LayerData (layer.numNodes (), itWeight,
1051                                                 layer.activationFunction (),
1052                                                 layer.modeOutputValues ()));
1053                 size_t _numWeights = layer.numWeights (numNodesPrev);
1054                 itWeight += _numWeights;
1055                 numNodesPrev = layer.numNodes ();
1056             }
1057 
1058 
1059             // --------- forward -------------
1060             forwardPattern (m_layers, layerData);
1061 
1062             // ------------- fetch output ------------------
1063             std::vector<double> output;
1064             fetchOutput (layerData.back (), output);
1065             return output;
1066         }
1067 
1068 
1069         template <typename Weights, typename PassThrough>
1070             double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights) const
1071         {
1072             std::vector<double> nothing; // empty gradients; no backpropagation is done, just forward
1073             assert (numWeights () == weights.size ());
1074     double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (nothing), std::end (nothing), 10000, nothing, false);
1075             return error;
1076         }
1077 
1078         template <typename Weights, typename PassThrough, typename OutContainer>
1079             double Net::operator() (PassThrough& settingsAndBatch, const Weights& weights, ModeOutput /*eFetch*/, OutContainer& outputContainer) const
1080         {
1081             std::vector<double> nothing; // empty gradients; no backpropagation is done, just forward
1082             assert (numWeights () == weights.size ());
1083     double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (nothing), std::end (nothing), 10000, outputContainer, true);
1084             return error;
1085         }
1086 
1087 
1088         template <typename Weights, typename Gradients, typename PassThrough>
1089         double Net::operator() (PassThrough& settingsAndBatch, Weights& weights, Gradients& gradients) const
1090         {
1091             std::vector<double> nothing;
1092             assert (numWeights () == weights.size ());
1093             assert (weights.size () == gradients.size ());
1094     double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (gradients), std::end (gradients), 0, nothing, false);
1095             return error;
1096         }
1097 
1098         template <typename Weights, typename Gradients, typename PassThrough, typename OutContainer>
1099         double Net::operator() (PassThrough& settingsAndBatch, Weights& weights, Gradients& gradients, ModeOutput eFetch, OutContainer& outputContainer) const
1100         {
1101             MATH_UNUSED(eFetch);
1102             assert (numWeights () == weights.size ());
1103             assert (weights.size () == gradients.size ());
1104     double error = forward_backward(m_layers, settingsAndBatch, std::begin (weights), std::end (weights), std::begin (gradients), std::end (gradients), 0, outputContainer, true);
1105             return error;
1106         }
1107 
1108 
1109 
1110     template <typename LayerContainer, typename DropContainer, typename ItWeight, typename ItGradient>
1111         std::vector<std::vector<LayerData>> Net::prepareLayerData (LayerContainer& _layers,
1112                                                                    Batch& batch,
1113                                                                    const DropContainer& dropContainer,
1114                                                                    ItWeight itWeightBegin,
1115                                                                    ItWeight /*itWeightEnd*/,
1116                                                                    ItGradient itGradientBegin,
1117                                                                    ItGradient itGradientEnd,
1118                                                                    size_t& totalNumWeights) const
1119     {
1120         LayerData::const_dropout_iterator itDropOut;
1121         bool usesDropOut = !dropContainer.empty ();
1122         if (usesDropOut)
1123             itDropOut = std::begin (dropContainer);
1124 
1125     if (_layers.empty ())
1126         throw std::string ("no layers in this net");
1127 
1128 
1129         // ----------- create layer data -------------------------------------------------------
1130         //LM- This assert not needed anymore (outputsize is actually numNodes+1)
1131         //assert (_layers.back ().numNodes () == outputSize ());
1132         totalNumWeights = 0;
1133         std::vector<std::vector<LayerData>> layerPatternData;
1134         layerPatternData.reserve (_layers.size ()+1);
1135         ItWeight itWeight = itWeightBegin;
1136         ItGradient itGradient = itGradientBegin;
1137         size_t numNodesPrev = inputSize ();
1138         typename Pattern::const_iterator itInputBegin;
1139         typename Pattern::const_iterator itInputEnd;
1140 
1141         // ItWeight itGammaBegin = itWeightBegin + numWeights ();
1142         // ItWeight itBetaBegin = itWeightBegin + numWeights () + numNodes ();
1143         // ItGradient itGradGammaBegin = itGradientBegin + numWeights ();
1144         // ItGradient itGradBetaBegin = itGradientBegin + numWeights () + numNodes ();
1145 
1146 
1147         // --------------------- prepare layer data for input layer ----------------------------
1148         layerPatternData.push_back (std::vector<LayerData>());
1149     for (const Pattern& _pattern : batch)
1150         {
1151             std::vector<LayerData>& layerData = layerPatternData.back ();
1152             layerData.push_back (LayerData (numNodesPrev));
1153 
1154             itInputBegin = _pattern.beginInput ();
1155             itInputEnd = _pattern.endInput ();
1156             layerData.back ().setInput (itInputBegin, itInputEnd);
1157 
1158             if (usesDropOut)
1159                 layerData.back ().setDropOut (itDropOut);
1160 
1161         }
1162 
1163 
1164         if (usesDropOut)
1165             itDropOut += _layers.back ().numNodes ();
1166 
1167         // ---------------- prepare subsequent layers ---------------------------------------------
1168         // for each of the layers
1169         for (auto itLayer = begin (_layers), itLayerEnd = end (_layers); itLayer != itLayerEnd; ++itLayer)
1170         {
1171             bool isOutputLayer = (itLayer+1 == itLayerEnd);
1172             bool isFirstHiddenLayer = (itLayer == begin (_layers));
1173 
1174             auto& layer = *itLayer;
1175             layerPatternData.push_back (std::vector<LayerData>());
1176             // for each pattern, prepare a layerData
1177             for (const Pattern& _pattern : batch)
1178             {
1179                 std::vector<LayerData>& layerData = layerPatternData.back ();
1180                 //layerData.push_back (LayerData (numNodesPrev));
1181 
1182                 if (itGradientBegin == itGradientEnd)
1183                 {
1184                     layerData.push_back (LayerData (layer.numNodes (), itWeight,
1185                                                     layer.activationFunction (),
1186                                                     layer.modeOutputValues ()));
1187                 }
1188                 else
1189                 {
1190                     layerData.push_back (LayerData (layer.numNodes (), itWeight, itGradient,
1191                                                     layer.activationFunction (),
1192                                                     layer.inverseActivationFunction (),
1193                                                     layer.modeOutputValues ()));
1194                 }
1195 
1196                 if (usesDropOut)
1197                 {
1198                     layerData.back ().setDropOut (itDropOut);
1199                 }
1200 
1201             }
1202 
1203             if (usesDropOut)
1204             {
1205                 itDropOut += layer.numNodes ();
1206             }
1207             size_t _numWeights = layer.numWeights (numNodesPrev);
1208             totalNumWeights += _numWeights;
1209             itWeight += _numWeights;
1210             itGradient += _numWeights;
1211             numNodesPrev = layer.numNodes ();
1212 
1213         }
1214     assert (totalNumWeights > 0);
1215         return layerPatternData;
1216 }
1217 
1218 
1219 
1220     template <typename LayerContainer>
1221         void Net::forwardPattern (const LayerContainer& _layers,
1222                                   std::vector<LayerData>& layerData) const
1223     {
1224     size_t idxLayer = 0, idxLayerEnd = _layers.size ();
1225     for (; idxLayer < idxLayerEnd; ++idxLayer)
1226     {
1227         LayerData& prevLayerData = layerData.at (idxLayer);
1228         LayerData& currLayerData = layerData.at (idxLayer+1);
1229 
1230         forward (prevLayerData, currLayerData);
1231 
1232             applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ());
1233     }
1234     }
1235 
1236 
1237 
1238 
1239     template <typename LayerContainer, typename LayerPatternContainer>
1240         void Net::forwardBatch (const LayerContainer& _layers,
1241                                 LayerPatternContainer& layerPatternData,
1242                                 std::vector<double>& valuesMean,
1243                                 std::vector<double>& valuesStdDev,
1244                                 size_t trainFromLayer) const
1245     {
1246         valuesMean.clear ();
1247         valuesStdDev.clear ();
1248 
1249         // ---------------------------------- loop over layers and pattern -------------------------------------------------------
1250     for (size_t idxLayer = 0, idxLayerEnd = layerPatternData.size (); idxLayer < idxLayerEnd-1; ++idxLayer)
1251     {
1252         bool doTraining = idxLayer >= trainFromLayer;
1253 
1254             // get layer-pattern data for this and the corresponding one from the next layer
1255             std::vector<LayerData>& prevLayerPatternData = layerPatternData.at (idxLayer);
1256             std::vector<LayerData>& currLayerPatternData = layerPatternData.at (idxLayer+1);
1257 
1258             size_t numPattern = prevLayerPatternData.size ();
1259             size_t numNodesLayer = _layers.at (idxLayer).numNodes ();
1260 
1261             std::vector<MeanVariance> means (numNodesLayer);
1262             // ---------------- loop over layerDatas of pattern compute forward ----------------------------
1263             for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern)
1264             {
1265         const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern);
1266         LayerData& currLayerData = currLayerPatternData.at (idxPattern);
1267 
1268 
1269                 forward (prevLayerData, currLayerData); // feed forward
1270             }
1271 
1272             // ---------------- loop over layerDatas of pattern apply non-linearities ----------------------------
1273             for (size_t idxPattern = 0; idxPattern < numPattern; ++idxPattern)
1274             {
1275                 //const LayerData& prevLayerData = prevLayerPatternData.at (idxPattern);
1276                 LayerData& currLayerData = currLayerPatternData.at (idxPattern);
1277 
1278                 if (doTraining)
1279                     applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction (),
1280                                     currLayerData.inverseActivationFunction (), currLayerData.valueGradientsBegin ());
1281                 else
1282                     applyFunctions (currLayerData.valuesBegin (), currLayerData.valuesEnd (), currLayerData.activationFunction ());
1283             }
1284         }
1285     }
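    // Note: for layers with idxLayer >= trainFromLayer, applyFunctions is also given the
    // inverse activation function and the value-gradient iterator, so the value gradients
    // used later by backPropagate are filled along with the activations; below
    // trainFromLayer only the activations are evaluated.  valuesMean and valuesStdDev are
    // cleared here but not otherwise filled by this implementation.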
1286 
1287 
1288 
1289 
1290     template <typename OutputContainer>
1291         void Net::fetchOutput (const LayerData& lastLayerData, OutputContainer& outputContainer) const
1292     {
1293         ModeOutputValues eModeOutput = lastLayerData.outputMode ();
1294         if (isFlagSet (ModeOutputValues::DIRECT, eModeOutput))
1295         {
1296             outputContainer.insert (outputContainer.end (), lastLayerData.valuesBegin (), lastLayerData.valuesEnd ());
1297         }
1298         else if (isFlagSet (ModeOutputValues::SIGMOID, eModeOutput) ||
1299                  isFlagSet (ModeOutputValues::SOFTMAX, eModeOutput))
1300         {
1301             const auto& prob = lastLayerData.probabilities ();
1302             outputContainer.insert (outputContainer.end (), prob.begin (), prob.end ()) ;
1303         }
1304         else
1305             assert (false);
1306     }
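    // Example: if the output layer was configured with ModeOutputValues::SIGMOID or
    // ModeOutputValues::SOFTMAX, the values appended to outputContainer are the precomputed
    // probabilities of the last layer; with ModeOutputValues::DIRECT the raw node values
    // are appended instead.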
1307 
1308 
1309 
1310 
1311     template <typename OutputContainer>
1312         void Net::fetchOutput (const std::vector<LayerData>& lastLayerPatternData, OutputContainer& outputContainer) const
1313     {
1314         for (const LayerData& lastLayerData : lastLayerPatternData)
1315             fetchOutput (lastLayerData, outputContainer);
1316     }
1317 
1318 
1319 
1320     template <typename ItWeight>
1321         std::tuple</*sumError*/double,/*sumWeights*/double> Net::computeError (const Settings& settings,
1322                                                                                std::vector<LayerData>& lastLayerData,
1323                                                                                Batch& batch,
1324                                                                                ItWeight itWeightBegin,
1325                                                                                ItWeight itWeightEnd) const
1326     {
1327         typename std::vector<LayerData>::iterator itLayerData    = lastLayerData.begin ();
1328 //        typename std::vector<LayerData>::iterator itLayerDataEnd = lastLayerData.end ();
1329 
1330         typename std::vector<Pattern>::const_iterator itPattern = batch.begin ();
1331         typename std::vector<Pattern>::const_iterator itPatternEnd = batch.end ();
1332 
1333         double sumWeights (0.0);
1334         double sumError (0.0);
1335 
1336 // FIXME: check that iteration doesn't go beyond itLayerDataEnd!
1337         for ( ; itPattern != itPatternEnd; ++itPattern, ++itLayerData)
1338         {
1339             // compute E and the deltas between the computed output and the true output
1340             LayerData& layerData = (*itLayerData);
1341             const Pattern& _pattern = (*itPattern);
1342             double error = errorFunction (layerData, _pattern.output (),
1343                                           itWeightBegin, itWeightEnd,
1344                                           _pattern.weight (), settings.factorWeightDecay (),
1345                                           settings.regularization ());
1346             sumWeights += fabs (_pattern.weight ());
1347             sumError += error;
1348         }
1349         return std::make_tuple (sumError, sumWeights);
1350     }
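    // Note for callers: sumError is the plain sum of the per-pattern errors returned by
    // errorFunction (which already receives the pattern weight), while sumWeights sums
    // |pattern weight|; forward_backward below divides sumError by sumWeights to obtain
    // the weighted mean error of the batch.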
1351 
1352 
1353 
1354     template <typename Settings>
1355         void Net::backPropagate (std::vector<std::vector<LayerData>>& layerPatternData,
1356                                  const Settings& settings,
1357                                  size_t trainFromLayer,
1358                                  size_t totalNumWeights) const
1359     {
1360         bool doTraining = layerPatternData.size () > trainFromLayer;
1361         if (doTraining) // training
1362         {
1363             // ------------- backpropagation -------------
1364             size_t idxLayer = layerPatternData.size ();
1365             for (auto itLayerPatternData = layerPatternData.rbegin (), itLayerPatternDataBegin = layerPatternData.rend ();
1366                  itLayerPatternData != itLayerPatternDataBegin; ++itLayerPatternData)
1367             {
1368                 --idxLayer;
1369                 if (idxLayer <= trainFromLayer) // no training
1370                     break;
1371 
1372                 std::vector<LayerData>& currLayerDataColl = *(itLayerPatternData);
1373                 std::vector<LayerData>& prevLayerDataColl = *(itLayerPatternData+1);
1374 
1375 // FIXME: check that itPrevLayerData doesn't go beyond itPrevLayerDataEnd!
1376                 for (typename std::vector<LayerData>::iterator itCurrLayerData = begin (currLayerDataColl), itCurrLayerDataEnd = end (currLayerDataColl),
1377                      itPrevLayerData = begin (prevLayerDataColl) /*, itPrevLayerDataEnd = end (prevLayerDataColl)*/;
1378                      itCurrLayerData != itCurrLayerDataEnd; ++itCurrLayerData, ++itPrevLayerData)
1379                 {
1380                     LayerData& currLayerData = (*itCurrLayerData);
1381                     LayerData& prevLayerData = *(itPrevLayerData);
1382 
1383                     backward (prevLayerData, currLayerData);
1384 
1385                     // the factorWeightDecay has to be scaled by 1/n where n is the number of weights (synapses)
1386                     // because the L1 and L2 regularization terms are summed over all weights
1387                     //
1388                     //  http://neuralnetworksanddeeplearning.com/chap3.html#overfitting_and_regularization
1389                     //
1390                     // L1 : -factorWeightDecay*sgn(w)/numWeights
1391                     // L2 : -factorWeightDecay*w/numWeights
1392                     update (prevLayerData, currLayerData, settings.factorWeightDecay ()/totalNumWeights, settings.regularization ());
1393                 }
1394             }
1395         }
1396     }
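    // Rough worked example of the scaling used above (illustrative numbers only): with
    // factorWeightDecay = 1e-4 and totalNumWeights = 1000, update() receives 1e-7 as the
    // per-weight decay factor, which enters the gradient as roughly 1e-7 * sgn(w) for L1
    // and 1e-7 * w for L2 regularization.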
1397 
1398 
1399 
1400 /*! \brief forward propagation and backward propagation
1401  * prepares the layer data for the batch, propagates the patterns forward, optionally fetches the output and computes the (pattern-weighted) error;
1402  * during training the gradients are backpropagated; the return value is the summed error normalized by the sum of the pattern weights
1403  */
1404         template <typename LayerContainer, typename PassThrough, typename ItWeight, typename ItGradient, typename OutContainer>
1405             double Net::forward_backward (LayerContainer& _layers, PassThrough& settingsAndBatch,
1406                                       ItWeight itWeightBegin, ItWeight itWeightEnd,
1407                                           ItGradient itGradientBegin, ItGradient itGradientEnd,
1408                                           size_t trainFromLayer,
1409                                       OutContainer& outputContainer, bool doFetchOutput) const
1410         {
1411             Settings& settings = std::get<0>(settingsAndBatch);
1412             Batch& batch = std::get<1>(settingsAndBatch);
1413             DropContainer& dropContainer = std::get<2>(settingsAndBatch);
1414 
1415             double sumError = 0.0;
1416             double sumWeights = 0.0;    // -------------
1417 
1418 
1419         // ----------------------------- prepare layer data -------------------------------------
1420         size_t totalNumWeights (0);
1421         std::vector<std::vector<LayerData>> layerPatternData = prepareLayerData (_layers,
1422                                                                                  batch,
1423                                                                                  dropContainer,
1424                                                                                  itWeightBegin,
1425                                                                                  itWeightEnd,
1426                                                                                  itGradientBegin,
1427                                                                                  itGradientEnd,
1428                                                                                  totalNumWeights);
1429 
1430 
1431 
1432         // ---------------------------------- propagate forward ------------------------------------------------------------------
1433         std::vector<double> valuesMean;
1434         std::vector<double> valuesStdDev;
1435         forwardBatch (_layers, layerPatternData, valuesMean, valuesStdDev, trainFromLayer);
1436 
1437 
1438             // ------------- fetch output ------------------
1439         if (doFetchOutput)
1440             {
1441             fetchOutput (layerPatternData.back (), outputContainer);
1442             }
1443 
1444 
1445             // ------------- error computation -------------
1446         std::tie (sumError, sumWeights) = computeError (settings, layerPatternData.back (), batch, itWeightBegin, itWeightBegin + totalNumWeights);
1447 
1448 
1449             // ------------- backpropagation -------------
1450         backPropagate (layerPatternData, settings, trainFromLayer, totalNumWeights);
1451 
1452 
1453         // --- compile the measures
1454             double batchSize = std::distance (std::begin (batch), std::end (batch));
1455             for (auto it = itGradientBegin; it != itGradientEnd; ++it)
1456                 (*it) /= batchSize;
1457 
1458 
1459             sumError /= sumWeights;
1460             return sumError;
1461         }
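        // Sketch of the pass-through argument as unpacked at the top of this function:
        // settingsAndBatch is expected to be a tuple-like object with
        //   std::get<0> -> Settings&,  std::get<1> -> Batch&,  std::get<2> -> DropContainer&,
        // i.e. roughly a std::tuple<Settings&, Batch&, DropContainer&> built from the current
        // settings, batch and drop-out container; the exact type used by the callers is
        // defined elsewhere in NeuralNet.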
1462 
1463 
1464 
1465 /*! \brief initialization of the weights
1466  * draws the weights layer by layer according to the chosen WeightInitializationStrategy:
1467  * XAVIER (gaussian, stdDev = sqrt(2/nIn)), XAVIERUNIFORM (uniform in [-sqrt(2/nIn), +sqrt(2/nIn)]), TEST (gaussian, stdDev = 0.1) or LAYERSIZE
1468  */
1469         template <typename OutIterator>
1470             void Net::initializeWeights (WeightInitializationStrategy eInitStrategy, OutIterator itWeight)
1471         {
1472             if (eInitStrategy == WeightInitializationStrategy::XAVIER)
1473             {
1474                 // input and output properties
1475                 int numInput = inputSize ();
1476 
1477                 // compute variance and mean of input and output
1478                 //...
1479 
1480 
1481                 // compute the weights
1482                 for (auto& layer: layers ())
1483                 {
1484                     double nIn = numInput;
1485                     double stdDev = sqrt (2.0/nIn);
1486                     for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1487                     {
1488                         (*itWeight) = DNN::gaussDouble (0.0, stdDev); // factor 2.0 for ReLU
1489                         ++itWeight;
1490                     }
1491                     numInput = layer.numNodes ();
1492                 }
1493                 return;
1494             }
1495 
1496             if (eInitStrategy == WeightInitializationStrategy::XAVIERUNIFORM)
1497             {
1498                 // input and output properties
1499                 int numInput = inputSize ();
1500 
1501                 // compute variance and mean of input and output
1502                 //...
1503 
1504 
1505                 // compute the weights
1506                 for (auto& layer: layers ())
1507                 {
1508                     double nIn = numInput;
1509                     double minVal = -sqrt(2.0/nIn);
1510                     double maxVal = sqrt (2.0/nIn);
1511                     for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1512                     {
1513 
1514                         (*itWeight) = DNN::uniformDouble (minVal, maxVal); // factor 2.0 for ReLU
1515                         ++itWeight;
1516                     }
1517                     numInput = layer.numNodes ();
1518                 }
1519                 return;
1520             }
1521 
1522             if (eInitStrategy == WeightInitializationStrategy::TEST)
1523             {
1524                 // input and output properties
1525                 int numInput = inputSize ();
1526 
1527                 // compute variance and mean of input and output
1528                 //...
1529 
1530 
1531                 // compute the weights
1532                 for (auto& layer: layers ())
1533                 {
1534 //                double nIn = numInput;
1535                     for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1536                     {
1537                         (*itWeight) = DNN::gaussDouble (0.0, 0.1);
1538                         ++itWeight;
1539                     }
1540                     numInput = layer.numNodes ();
1541                 }
1542                 return;
1543             }
1544 
1545             if (eInitStrategy == WeightInitializationStrategy::LAYERSIZE)
1546             {
1547                 // input and output properties
1548                 int numInput = inputSize ();
1549 
1550                 // compute variance and mean of input and output
1551                 //...
1552 
1553 
1554                 // compute the weights
1555                 for (auto& layer: layers ())
1556                 {
1557                     double nIn = numInput;
1558                     for (size_t iWeight = 0, iWeightEnd = layer.numWeights (numInput); iWeight < iWeightEnd; ++iWeight)
1559                     {
1560                         (*itWeight) = DNN::gaussDouble (0.0, sqrt (layer.numWeights (nIn))); // standard deviation scales with the number of weights of the layer
1561                         ++itWeight;
1562                     }
1563                     numInput = layer.numNodes ();
1564                 }
1565                 return;
1566             }
1567 
1568         }
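        // Worked example with illustrative numbers: for a layer with nIn = 100 incoming
        // nodes, XAVIER draws each weight from a gaussian with stdDev = sqrt(2.0/100) ~= 0.14,
        // while XAVIERUNIFORM draws uniformly from about [-0.14, +0.14]; the factor 2.0 is
        // the ReLU correction mentioned in the comments above.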
1569 
1570 
1571 
1572 
1573 
1574 /*! \brief compute the error function
1575  * evaluates sum of squares, cross entropy or mutually exclusive (softmax) cross entropy, depending on m_eErrorFunction,
1576  * filling the deltas of the output layer on the way, and adds the weight-decay term if regularization is enabled
1577  */
1578         template <typename Container, typename ItWeight>
1579             double Net::errorFunction (LayerData& layerData,
1580                                        Container truth,
1581                                        ItWeight itWeight,
1582                                        ItWeight itWeightEnd,
1583                                        double patternWeight,
1584                                        double factorWeightDecay,
1585                                        EnumRegularization eRegularization) const
1586         {
1587             double error (0);
1588             switch (m_eErrorFunction)
1589             {
1590             case ModeErrorFunction::SUMOFSQUARES:
1591             {
1592                 error = sumOfSquares (layerData.valuesBegin (), layerData.valuesEnd (), begin (truth), end (truth),
1593                                       layerData.deltasBegin (), layerData.deltasEnd (),
1594                                       layerData.inverseActivationFunction (),
1595                                       patternWeight);
1596                 break;
1597             }
1598             case ModeErrorFunction::CROSSENTROPY:
1599             {
1600                 assert (!TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.outputMode ()));
1601                 std::vector<double> probabilities = layerData.probabilities ();
1602                 error = crossEntropy (begin (probabilities), end (probabilities),
1603                                       begin (truth), end (truth),
1604                                       layerData.deltasBegin (), layerData.deltasEnd (),
1605                                       layerData.inverseActivationFunction (),
1606                                       patternWeight);
1607                 break;
1608             }
1609             case ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE:
1610             {
1611                 std::cout << "softmax." << std::endl;
1612                 assert (!TMVA::DNN::isFlagSet (ModeOutputValues::DIRECT, layerData.outputMode ()));
1613                 std::vector<double> probabilities = layerData.probabilities ();
1614                 error = softMaxCrossEntropy (begin (probabilities), end (probabilities),
1615                                              begin (truth), end (truth),
1616                                              layerData.deltasBegin (), layerData.deltasEnd (),
1617                                              layerData.inverseActivationFunction (),
1618                                              patternWeight);
1619                 break;
1620             }
1621             }
1622             if (factorWeightDecay != 0 && eRegularization != EnumRegularization::NONE)
1623             {
1624                 error = weightDecay (error, itWeight, itWeightEnd, factorWeightDecay, eRegularization);
1625             }
1626             return error;
1627         }
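        // For orientation: sumOfSquares, crossEntropy and softMaxCrossEntropy are defined
        // elsewhere in NeuralNet; in the usual conventions they correspond roughly to
        //   E = 1/2 * sum_i (y_i - t_i)^2                       (SUMOFSQUARES)
        //   E = -sum_i [ t_i*ln(p_i) + (1-t_i)*ln(1-p_i) ]      (CROSSENTROPY)
        //   E = -sum_i t_i*ln(p_i)                              (CROSSENTROPY_MUTUALEXCLUSIVE)
        // each scaled by patternWeight, with the weight-decay term added on top when
        // regularization is enabled.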
1628 
1629 
1630 
1631 
1632 
1633 
1634 
1635 // /*! \brief pre-training
1636 //  *
1637 //  * in development
1638 //  */
1639 //         template <typename Minimizer>
1640 //             void Net::preTrain (std::vector<double>& weights,
1641 //                                 std::vector<Pattern>& trainPattern,
1642 //                                 const std::vector<Pattern>& testPattern,
1643 //                                 Minimizer& minimizer, Settings& settings)
1644 //         {
1645 //             auto itWeightGeneral = std::begin (weights);
1646 //             std::vector<Pattern> prePatternTrain (trainPattern.size ());
1647 //             std::vector<Pattern> prePatternTest (testPattern.size ());
1648 
1649 //             size_t _inputSize = inputSize ();
1650 
1651 //             // transform pattern using the created preNet
1652 //             auto initializePrePattern = [&](const std::vector<Pattern>& pttrnInput, std::vector<Pattern>& pttrnOutput)
1653 //                 {
1654 //                     pttrnOutput.clear ();
1655 //                     std::transform (std::begin (pttrnInput), std::end (pttrnInput),
1656 //                                     std::back_inserter (pttrnOutput),
1657 //                                     [](const Pattern& p)
1658 //             {
1659 //                 Pattern pat (p.input (), p.input (), p.weight ());
1660 //                 return pat;
1661 //             });
1662 //                 };
1663 
1664 //             initializePrePattern (trainPattern, prePatternTrain);
1665 //             initializePrePattern (testPattern, prePatternTest);
1666 
1667 //             std::vector<double> originalDropFractions = settings.dropFractions ();
1668 
1669 //             for (auto& _layer : layers ())
1670 //             {
1671 //                 // compute number of weights (as a function of the number of incoming nodes)
1672 //                 // fetch number of nodes
1673 //                 size_t numNodes = _layer.numNodes ();
1674 //                 size_t _numWeights = _layer.numWeights (_inputSize);
1675 
1676 //                 // ------------------
1677 //                 DNN::Net preNet;
1678 //                 if (!originalDropFractions.empty ())
1679 //                 {
1680 //                     originalDropFractions.erase (originalDropFractions.begin ());
1681 //                     settings.setDropOut (originalDropFractions.begin (), originalDropFractions.end (), settings.dropRepetitions ());
1682 //                 }
1683 //                 std::vector<double> preWeights;
1684 
1685 //                 // define the preNet (pretraining-net) for this layer
1686 //                 // outputSize == inputSize, because this is an autoencoder;
1687 //                 preNet.setInputSize (_inputSize);
1688 //                 preNet.addLayer (DNN::Layer (numNodes, _layer.activationFunctionType ()));
1689 //                 preNet.addLayer (DNN::Layer (_inputSize, DNN::EnumFunction::LINEAR, DNN::ModeOutputValues::DIRECT));
1690 //                 preNet.setErrorFunction (DNN::ModeErrorFunction::SUMOFSQUARES);
1691 //                 preNet.setOutputSize (_inputSize); // outputSize is the inputSize (autoencoder)
1692 
1693 //                 // initialize weights
1694 //                 preNet.initializeWeights (DNN::WeightInitializationStrategy::XAVIERUNIFORM,
1695 //                                           std::back_inserter (preWeights));
1696 
1697 //                 // overwrite already existing weights from the "general" weights
1698 //                 std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ());
1699 //                 std::copy (itWeightGeneral, itWeightGeneral+_numWeights, preWeights.begin ()+_numWeights); // set identical weights for the temporary output layer
1700 
1701 
1702 //                 // train the "preNet"
1703 //                 preNet.train (preWeights, prePatternTrain, prePatternTest, minimizer, settings);
1704 
1705 //                 // fetch the pre-trained weights (without the output part of the autoencoder)
1706 //                 std::copy (std::begin (preWeights), std::begin (preWeights) + _numWeights, itWeightGeneral);
1707 
1708 //                 // advance the iterator on the incoming weights
1709 //                 itWeightGeneral += _numWeights;
1710 
1711 //                 // remove the weights of the output layer of the preNet
1712 //                 preWeights.erase (preWeights.begin () + _numWeights, preWeights.end ());
1713 
1714 //                 // remove the outputLayer of the preNet
1715 //                 preNet.removeLayer ();
1716 
1717 //                 // set the output size to the number of nodes in the new output layer (== last hidden layer)
1718 //                 preNet.setOutputSize (numNodes);
1719 
1720 //                 // transform pattern using the created preNet
1721 //                 auto proceedPattern = [&](std::vector<Pattern>& pttrn)
1722 //                     {
1723 //                         std::vector<Pattern> newPttrn;
1724 //                         std::for_each (std::begin (pttrn), std::end (pttrn),
1725 //                                        [&preNet,&preWeights,&newPttrn](Pattern& p)
1726 //                 {
1727 //                     std::vector<double> output = preNet.compute (p.input (), preWeights);
1728 //                     Pattern pat (output, output, p.weight ());
1729 //                     newPttrn.push_back (pat);
1730 // //                    p = pat;
1731 //                 });
1732 //                         return newPttrn;
1733 //                     };
1734 
1735 
1736 //                 prePatternTrain = proceedPattern (prePatternTrain);
1737 //                 prePatternTest = proceedPattern (prePatternTest);
1738 
1739 
1740 //                 // the new input size is the output size of the already reduced preNet
1741 //                 _inputSize = preNet.layers ().back ().numNodes ();
1742 //             }
1743 //         }
1744 
1745 
1746 
1747 
1748 
1749 
1750 
1751 
1752 
1753 
1754 
1755 
1756 
1757 
1758 
1759 
1760     } // namespace DNN
1761 } // namespace TMVA
1762 
1763 #endif