root/ROOT/TThreadExecutor.hxx

0001 // @(#)root/thread:$Id$
0002 // Author: Xavier Valls March 2016
0003
0004 /*************************************************************************
0005  * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers.               *
0006  * All rights reserved.                                                  *
0007  *                                                                       *
0008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0010  *************************************************************************/
0011
0012 #ifndef ROOT_TThreadExecutor
0013 #define ROOT_TThreadExecutor
0014
0015 #include "RConfigure.h"
0016
0017 // exclude in case ROOT does not have IMT support
0018 #ifndef R__USE_IMT
0019 // No need to error out for dictionaries.
0020 # if !defined(__ROOTCLING__) && !defined(G__DICTIONARY)
0021 #  error "Cannot use ROOT::TThreadExecutor without defining R__USE_IMT."
0022 # endif
0023 #else
0024
0025 #include "ROOT/TExecutorCRTP.hxx"
0026 #include "ROOT/TSeq.hxx"
0027 #include "ROOT/TypeTraits.hxx" // InvokeResult
0028 #include "RTaskArena.hxx"
0029 #include "TError.h"
0030
0031 #include <functional> //std::function
0032 #include <initializer_list>
0033 #include <memory>
0034 #include <numeric> //std::accumulate
0035 #include <type_traits> //std::enable_if
0036 #include <utility> //std::move
0037 #include <vector>
0038
0039 namespace ROOT {
0040
0041    class TThreadExecutor: public TExecutorCRTP<TThreadExecutor> {
0042       friend TExecutorCRTP;
0043
0044    public:
0045
0046       explicit TThreadExecutor(UInt_t nThreads = 0u);
0047
0048       TThreadExecutor(const TThreadExecutor &) = delete;
0049       TThreadExecutor &operator=(const TThreadExecutor &) = delete;
0050
0051       // ForEach
0052       //
0053       template<class F>
0054       void Foreach(F func, unsigned nTimes, unsigned nChunks = 0);
0055       template<class F, class INTEGER>
0056       void Foreach(F func, ROOT::TSeq<INTEGER> args, unsigned nChunks = 0);
0057       template<class F, class T>
0058       void Foreach(F func, std::initializer_list<T> args, unsigned nChunks = 0);
0059       template<class F, class T>
0060       void Foreach(F func, std::vector<T> &args, unsigned nChunks = 0);
0061       template<class F, class T>
0062       void Foreach(F func, const std::vector<T> &args, unsigned nChunks = 0);
0063
0064       // Map
0065       //
0066       using TExecutorCRTP<TThreadExecutor>::Map;
0067
0068       // Extension of the Map interfaces with chunking, specific to this class
0069       template <class F, class R, class Cond = validMapReturnCond<F>>
0070       auto Map(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F>>;
0071       template <class F, class INTEGER, class R, class Cond = validMapReturnCond<F, INTEGER>>
0072       auto Map(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks)
0073          -> std::vector<InvokeResult_t<F, INTEGER>>;
0074       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0075       auto Map(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F, T>>;
0076       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0077       auto Map(F func, std::vector<T> &args, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F, T>>;
0078       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0079       auto Map(F func, const std::vector<T> &args, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F, T>>;
0080
0081       // MapReduce
0082       //
0083       // We need to reimplement the MapReduce interfaces to allow for parallel reduction, defined in
0084       // this class but not in the base class.
0085       //
0086       // the late return types also check at compile-time whether redfunc is compatible with func,
0087       // other than checking that func is compatible with the type of arguments.
0088       // a static_assert check in TThreadExecutor::Reduce is used to check that redfunc is compatible with the type returned by func
0089       using TExecutorCRTP<TThreadExecutor>::MapReduce;
0090       template <class F, class R, class Cond = validMapReturnCond<F>>
0091       auto MapReduce(F func, unsigned nTimes, R redfunc) -> InvokeResult_t<F>;
0092       template <class F, class R, class Cond = validMapReturnCond<F>>
0093       auto MapReduce(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> InvokeResult_t<F>;
0094       template <class F, class INTEGER, class R, class Cond = validMapReturnCond<F, INTEGER>>
0095       auto MapReduce(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, INTEGER>;
0096       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0097       auto MapReduce(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>;
0098       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0099       auto MapReduce(F func, std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>;
0100       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0101       auto MapReduce(F func, const std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>;
0102       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0103       auto MapReduce(F func, std::vector<T> &args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>;
0104       template <class F, class T, class R, class Cond = validMapReturnCond<F, T>>
0105       auto MapReduce(F func, const std::vector<T> &args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>;
0106
0107       using TExecutorCRTP<TThreadExecutor>::Reduce;
0108       template<class T, class R> auto Reduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs));
0109       template<class T, class BINARYOP> auto Reduce(const std::vector<T> &objs, BINARYOP redfunc) -> decltype(redfunc(objs.front(), objs.front()));
0110
0111       unsigned GetPoolSize() const;
0112
0113    private:
0114       // Implementation of the Map functions declared in the parent class (TExecutorCRTP)
0115       //
0116       template <class F, class Cond = validMapReturnCond<F>>
0117       auto MapImpl(F func, unsigned nTimes) -> std::vector<InvokeResult_t<F>>;
0118       template <class F, class INTEGER, class Cond = validMapReturnCond<F, INTEGER>>
0119       auto MapImpl(F func, ROOT::TSeq<INTEGER> args) -> std::vector<InvokeResult_t<F, INTEGER>>;
0120       template <class F, class T, class Cond = validMapReturnCond<F, T>>
0121       auto MapImpl(F func, std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>;
0122       template <class F, class T, class Cond = validMapReturnCond<F, T>>
0123       auto MapImpl(F func, const std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>;
0124
0125       // Functions that interface with the parallel library used as a backend
0126       void   ParallelFor(unsigned start, unsigned end, unsigned step, const std::function<void(unsigned int i)> &f);
0127       double ParallelReduce(const std::vector<double> &objs, const std::function<double(double a, double b)> &redfunc);
0128       float  ParallelReduce(const std::vector<float> &objs, const std::function<float(float a, float b)> &redfunc);
0129       template<class T, class R>
0130       auto SeqReduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs));
0131
0132       /// Pointer to the TBB task arena wrapper
0133       std::shared_ptr<ROOT::Internal::RTaskArenaWrapper> fTaskArenaW = nullptr;
0134    };
0135
0136    /************ TEMPLATE METHODS IMPLEMENTATION ******************/
0137
0138    //////////////////////////////////////////////////////////////////////////
0139    /// \brief Execute a function without arguments several times in parallel, dividing the execution in nChunks.
0140    ///
0141    /// \param func Function to be executed.
0142    /// \param nTimes Number of times function should be called.
0143    /// \param nChunks Number of chunks to split the input data for processing.
0144    template<class F>
0145    void TThreadExecutor::Foreach(F func, unsigned nTimes, unsigned nChunks) {
0146       if (nChunks == 0) {
0147          ParallelFor(0U, nTimes, 1, [&](unsigned int){func();});
0148          return;
0149       }
0150
0151       unsigned step = (nTimes + nChunks - 1) / nChunks;
0152       auto lambda = [&](unsigned int i)
0153       {
0154          for (unsigned j = 0; j < step && (i + j) < nTimes; j++) {
0155             func();
0156          }
0157       };
0158       ParallelFor(0U, nTimes, step, lambda);
0159    }
0160
0161    //////////////////////////////////////////////////////////////////////////
0162    /// \brief Execute a function in parallel over a sequence of indexes, dividing the execution in nChunks.
0163    ///
0164    /// \param func Function to be executed. Must take an element of the sequence passed assecond argument as a parameter.
0165    /// \param args Sequence of indexes to execute `func` on.
0166    /// \param nChunks Number of chunks to split the input data for processing.
0167    template<class F, class INTEGER>
0168    void TThreadExecutor::Foreach(F func, ROOT::TSeq<INTEGER> args, unsigned nChunks) {
0169       if (nChunks == 0) {
0170          ParallelFor(*args.begin(), *args.end(), args.step(), [&](unsigned int i){func(i);});
0171          return;
0172       }
0173       unsigned start = *args.begin();
0174       unsigned end = *args.end();
0175       unsigned seqStep = args.step();
0176       unsigned step = (end - start + nChunks - 1) / nChunks; //ceiling the division
0177
0178       auto lambda = [&](unsigned int i)
0179       {
0180          for (unsigned j = 0; j < step && (i + j) < end; j+=seqStep) {
0181             func(i + j);
0182          }
0183       };
0184       ParallelFor(start, end, step, lambda);
0185    }
0186
0187    //////////////////////////////////////////////////////////////////////////
0188    /// \brief Execute a function in parallel over the elements of an initializer_list, dividing the execution in nChunks.
0189    ///
0190    /// \param func Function to be executed on the elements of the initializer_list passed as second parameter.
0191    /// \param args initializer_list for a vector to apply `func` on.
0192    /// \param nChunks Number of chunks to split the input data for processing.
0193    template<class F, class T>
0194    void TThreadExecutor::Foreach(F func, std::initializer_list<T> args, unsigned nChunks) {
0195       std::vector<T> vargs(std::move(args));
0196       Foreach(func, vargs, nChunks);
0197    }
0198
0199    //////////////////////////////////////////////////////////////////////////
0200    /// \brief Execute a function in parallel over the elements of a vector, dividing the execution in nChunks.
0201    ///
0202    /// \param func Function to be executed on the elements of the vector passed as second parameter.
0203    /// \param args Vector of elements passed as an argument to `func`.
0204    /// \param nChunks Number of chunks to split the input data for processing.
0205    template<class F, class T>
0206    void TThreadExecutor::Foreach(F func, std::vector<T> &args, unsigned nChunks) {
0207       unsigned int nToProcess = args.size();
0208       if (nChunks == 0) {
0209          ParallelFor(0U, nToProcess, 1, [&](unsigned int i){func(args[i]);});
0210          return;
0211       }
0212
0213       unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
0214       auto lambda = [&](unsigned int i)
0215       {
0216          for (unsigned j = 0; j < step && (i + j) < nToProcess; j++) {
0217             func(args[i + j]);
0218          }
0219       };
0220       ParallelFor(0U, nToProcess, step, lambda);
0221    }
0222
0223    //////////////////////////////////////////////////////////////////////////
0224    /// \brief Execute a function in parallel over the elements of a immutable vector, dividing the execution in nChunks.
0225    ///
0226    /// \param func Function to be executed on the elements of the vector passed as second parameter.
0227    /// \param args Immutable vector of elements passed as an argument to `func`.
0228    /// \param nChunks Number of chunks to split the input data for processing.
0229    template<class F, class T>
0230    void TThreadExecutor::Foreach(F func, const std::vector<T> &args, unsigned nChunks) {
0231       unsigned int nToProcess = args.size();
0232       if (nChunks == 0) {
0233          ParallelFor(0U, nToProcess, 1, [&](unsigned int i){func(args[i]);});
0234          return;
0235       }
0236
0237       unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
0238       auto lambda = [&](unsigned int i)
0239       {
0240          for (unsigned j = 0; j < step && (i + j) < nToProcess; j++) {
0241             func(args[i + j]);
0242          }
0243       };
0244       ParallelFor(0U, nToProcess, step, lambda);
0245    }
0246
0247    //////////////////////////////////////////////////////////////////////////
0248    /// \brief Execute a function without arguments several times in parallel.
0249    /// Implementation of the Map method.
0250    ///
0251    /// \copydetails TExecutorCRTP::Map(F func,unsigned nTimes)
0252    template <class F, class Cond>
0253    auto TThreadExecutor::MapImpl(F func, unsigned nTimes) -> std::vector<InvokeResult_t<F>>
0254    {
0255       using retType = decltype(func());
0256       std::vector<retType> reslist(nTimes);
0257       auto lambda = [&](unsigned int i)
0258       {
0259          reslist[i] = func();
0260       };
0261       ParallelFor(0U, nTimes, 1, lambda);
0262
0263       return reslist;
0264    }
0265
0266    //////////////////////////////////////////////////////////////////////////
0267    /// \brief Execute a function over a sequence of indexes in parallel.
0268    /// Implementation of the Map method.
0269    ///
0270    /// \copydetails TExecutorCRTP::Map(F func,ROOT::TSeq<INTEGER> args)
0271    template <class F, class INTEGER, class Cond>
0272    auto TThreadExecutor::MapImpl(F func, ROOT::TSeq<INTEGER> args) -> std::vector<InvokeResult_t<F, INTEGER>>
0273    {
0274       using retType = decltype(func(*args.begin()));
0275       std::vector<retType> reslist(args.size());
0276       auto lambda = [&](unsigned int i) { reslist[i] = func(args[i]); };
0277       ParallelFor(0U, args.size(), 1, lambda);
0278
0279       return reslist;
0280    }
0281
0282    //////////////////////////////////////////////////////////////////////////
0283    /// \brief Execute a function `nTimes` in parallel, dividing the execution in nChunks and
0284    /// providing a result per chunk.
0285    ///
0286    /// \copydetails ROOT::Internal::TExecutor::Map(F func,unsigned nTimes,R redfunc,unsigned nChunks)
0287    template <class F, class R, class Cond>
0288    auto TThreadExecutor::Map(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> std::vector<InvokeResult_t<F>>
0289    {
0290       if (nChunks == 0)
0291       {
0292          return Map(func, nTimes);
0293       }
0294
0295       unsigned step = (nTimes + nChunks - 1) / nChunks;
0296       // Avoid empty chunks
0297       unsigned actualChunks = (nTimes + step - 1) / step;
0298       using retType = decltype(func());
0299       std::vector<retType> reslist(actualChunks);
0300       auto lambda = [&](unsigned int i)
0301       {
0302          std::vector<retType> partialResults(std::min(nTimes-i, step));
0303          for (unsigned j = 0; j < step && (i + j) < nTimes; j++) {
0304             partialResults[j] = func();
0305          }
0306          reslist[i / step] = Reduce(partialResults, redfunc);
0307       };
0308       ParallelFor(0U, nTimes, step, lambda);
0309
0310       return reslist;
0311    }
0312
0313    //////////////////////////////////////////////////////////////////////////
0314    /// \brief Execute a function over the elements of a vector in parallel.
0315    /// Implementation of the Map method.
0316    ///
0317    /// \copydetails TExecutorCRTP::Map(F func,std::vector<T> &args)
0318    template <class F, class T, class Cond>
0319    auto TThreadExecutor::MapImpl(F func, std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>
0320    {
0321       // //check whether func is callable
0322       using retType = decltype(func(args.front()));
0323
0324       unsigned int nToProcess = args.size();
0325       std::vector<retType> reslist(nToProcess);
0326
0327       auto lambda = [&](unsigned int i)
0328       {
0329          reslist[i] = func(args[i]);
0330       };
0331
0332       ParallelFor(0U, nToProcess, 1, lambda);
0333
0334       return reslist;
0335    }
0336
0337    //////////////////////////////////////////////////////////////////////////
0338    /// \brief Execute a function over the elements of a vector in parallel.
0339    /// Implementation of the Map method.
0340    ///
0341    /// \copydetails TExecutorCRTP::Map(F func,const std::vector<T> &args)
0342    template <class F, class T, class Cond>
0343    auto TThreadExecutor::MapImpl(F func, const std::vector<T> &args) -> std::vector<InvokeResult_t<F, T>>
0344    {
0345       // //check whether func is callable
0346       using retType = decltype(func(args.front()));
0347
0348       unsigned int nToProcess = args.size();
0349       std::vector<retType> reslist(nToProcess);
0350
0351       auto lambda = [&](unsigned int i)
0352       {
0353          reslist[i] = func(args[i]);
0354       };
0355
0356       ParallelFor(0U, nToProcess, 1, lambda);
0357
0358       return reslist;
0359    }
0360
0361    //////////////////////////////////////////////////////////////////////////
0362    /// \brief Execute a function in parallel over the elements of a sequence, dividing the execution in nChunks and
0363    /// providing a result per chunk.
0364    ///
0365    /// \copydetails ROOT::Internal::TExecutor::Map(F func,ROOT::TSeq<INTEGER> args,R redfunc,unsigned nChunks)
0366    template <class F, class INTEGER, class R, class Cond>
0367    auto TThreadExecutor::Map(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks)
0368       -> std::vector<InvokeResult_t<F, INTEGER>>
0369    {
0370       if (nChunks == 0)
0371       {
0372          return Map(func, args);
0373       }
0374
0375       unsigned nToProcess = args.size();
0376       unsigned step = (nToProcess + nChunks - 1) / nChunks; // ceiling the division
0377       // Avoid empty chunks
0378       unsigned actualChunks = (nToProcess + step - 1) / step;
0379
0380       using retType = decltype(func(*args.begin()));
0381       std::vector<retType> reslist(actualChunks);
0382       auto lambda = [&](unsigned int i) {
0383          std::vector<retType> partialResults(std::min(step, nToProcess - i)); // last chunk might be smaller
0384          for (unsigned j = 0; j < partialResults.size(); j++) {
0385             partialResults[j] = func(args[i + j]);
0386          }
0387          reslist[i / step] = Reduce(partialResults, redfunc);
0388       };
0389
0390       ParallelFor(0U, nToProcess, step, lambda);
0391
0392       return reslist;
0393    }
0394
0395    //////////////////////////////////////////////////////////////////////////
0396    /// \brief Execute a function in parallel over the elements of a vector, dividing the execution in nChunks and
0397    /// providing a result per chunk.
0398    ///
0399    /// \copydetails ROOT::Internal::TExecutor::Map(F func,std::vector<T> &args,R redfunc,unsigned nChunks)
0400    template <class F, class T, class R, class Cond>
0401    auto TThreadExecutor::Map(F func, std::vector<T> &args, R redfunc, unsigned nChunks)
0402       -> std::vector<InvokeResult_t<F, T>>
0403    {
0404       if (nChunks == 0)
0405       {
0406          return Map(func, args);
0407       }
0408
0409       unsigned int nToProcess = args.size();
0410       unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
0411       // Avoid empty chunks
0412       unsigned actualChunks = (nToProcess + step - 1) / step;
0413
0414       using retType = decltype(func(args.front()));
0415       std::vector<retType> reslist(actualChunks);
0416       auto lambda = [&](unsigned int i) {
0417          std::vector<retType> partialResults(std::min(step, nToProcess - i));
0418          for (unsigned j = 0; j < partialResults.size(); j++) {
0419             partialResults[j] = func(args[i + j]);
0420          }
0421          reslist[i / step] = Reduce(partialResults, redfunc);
0422       };
0423
0424       ParallelFor(0U, nToProcess, step, lambda);
0425
0426       return reslist;
0427    }
0428
0429    //////////////////////////////////////////////////////////////////////////
0430    /// \brief Execute a function in parallel over the elements of an immutable vector, dividing the execution in nChunks and
0431    /// providing a result per chunk.
0432    ///
0433    /// \copydetails ROOT::Internal::TExecutor::Map(F func,const std::vector<T> &args,R redfunc,unsigned nChunks)
0434    template <class F, class T, class R, class Cond>
0435    auto TThreadExecutor::Map(F func, const std::vector<T> &args, R redfunc, unsigned nChunks)
0436       -> std::vector<InvokeResult_t<F, T>>
0437    {
0438       if (nChunks == 0)
0439       {
0440          return Map(func, args);
0441       }
0442
0443       unsigned int nToProcess = args.size();
0444       unsigned step = (nToProcess + nChunks - 1) / nChunks; //ceiling the division
0445       // Avoid empty chunks
0446       unsigned actualChunks = (nToProcess + step - 1) / step;
0447
0448       using retType = decltype(func(args.front()));
0449       std::vector<retType> reslist(actualChunks);
0450       auto lambda = [&](unsigned int i) {
0451          std::vector<retType> partialResults(std::min(step, nToProcess - i));
0452          for (unsigned j = 0; j < partialResults.size(); j++) {
0453             partialResults[j] = func(args[i + j]);
0454          }
0455          reslist[i / step] = Reduce(partialResults, redfunc);
0456       };
0457
0458       ParallelFor(0U, nToProcess, step, lambda);
0459
0460       return reslist;
0461    }
0462
0463    //////////////////////////////////////////////////////////////////////////
0464    /// \brief Execute a function in parallel over the elements of an initializer_list, dividing the execution in nChunks and
0465    /// providing a result per chunk.
0466    ///
0467    /// \copydetails ROOT::Internal::TExecutor::Map(F func,std::initializer_list<T> args,R redfunc,unsigned nChunks)
0468    template <class F, class T, class R, class Cond>
0469    auto TThreadExecutor::Map(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks)
0470       -> std::vector<InvokeResult_t<F, T>>
0471    {
0472       std::vector<T> vargs(std::move(args));
0473       const auto &reslist = Map(func, vargs, redfunc, nChunks);
0474       return reslist;
0475    }
0476
0477    //////////////////////////////////////////////////////////////////////////
0478    /// \brief Execute a function `nTimes` in parallel (Map) and accumulate the results into a single value (Reduce).
0479    /// \copydetails  ROOT::Internal::TExecutor::MapReduce(F func,unsigned nTimes,R redfunc)
0480    template <class F, class R, class Cond>
0481    auto TThreadExecutor::MapReduce(F func, unsigned nTimes, R redfunc) -> InvokeResult_t<F>
0482    {
0483       return Reduce(Map(func, nTimes), redfunc);
0484    }
0485
0486    //////////////////////////////////////////////////////////////////////////
0487    /// \brief Execute a function in parallel over the elements of a vector (Map) and accumulate the results into a single value (Reduce).
0488    /// Benefits from partial reduction into `nChunks` intermediate results.
0489    ///
0490    /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,unsigned nTimes,R redfunc,unsigned nChunks)
0491    template <class F, class R, class Cond>
0492    auto TThreadExecutor::MapReduce(F func, unsigned nTimes, R redfunc, unsigned nChunks) -> InvokeResult_t<F>
0493    {
0494       return Reduce(Map(func, nTimes, redfunc, nChunks), redfunc);
0495    }
0496
0497    //////////////////////////////////////////////////////////////////////////
0498    /// \brief Execute a function in parallel over the elements of a vector (Map) and accumulate the results into a single value (Reduce).
0499    /// Benefits from partial reduction into `nChunks` intermediate results.
0500    ///
0501    /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,ROOT::TSeq<INTEGER> args,R redfunc,unsigned nChunks)
0502    template <class F, class INTEGER, class R, class Cond>
0503    auto TThreadExecutor::MapReduce(F func, ROOT::TSeq<INTEGER> args, R redfunc, unsigned nChunks)
0504       -> InvokeResult_t<F, INTEGER>
0505    {
0506       return Reduce(Map(func, args, redfunc, nChunks), redfunc);
0507    }
0508
0509    //////////////////////////////////////////////////////////////////////////
0510    /// \brief Execute a function in parallel over the elements of an initializer_list (Map) and accumulate the results into a single value (Reduce).
0511    /// Benefits from partial reduction into `nChunks` intermediate results.
0512    ///
0513    /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,std::initializer_list<T> args,R redfunc,unsigned nChunks)
0514    template <class F, class T, class R, class Cond>
0515    auto TThreadExecutor::MapReduce(F func, std::initializer_list<T> args, R redfunc, unsigned nChunks)
0516       -> InvokeResult_t<F, T>
0517    {
0518       return Reduce(Map(func, args, redfunc, nChunks), redfunc);
0519    }
0520
0521    //////////////////////////////////////////////////////////////////////////
0522    /// \brief Execute a function over the elements of a vector in parallel (Map) and accumulate the results into a single value (Reduce).
0523    /// \copydetails  ROOT::Internal::TExecutor::MapReduce(F func,std::vector<T> &args,R redfunc)
0524    template <class F, class T, class R, class Cond>
0525    auto TThreadExecutor::MapReduce(F func, std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>
0526    {
0527       return Reduce(Map(func, args), redfunc);
0528    }
0529
0530    //////////////////////////////////////////////////////////////////////////
0531    /// \brief Execute a function over the elements of an immutable vector in parallel (Map) and accumulate the results into a single value (Reduce).
0532    /// \copydetails  ROOT::Internal::TExecutor::MapReduce(F func,const std::vector<T> &args,R redfunc)
0533    template <class F, class T, class R, class Cond>
0534    auto TThreadExecutor::MapReduce(F func, const std::vector<T> &args, R redfunc) -> InvokeResult_t<F, T>
0535    {
0536       return Reduce(Map(func, args), redfunc);
0537    }
0538
0539    //////////////////////////////////////////////////////////////////////////
0540    /// \brief Execute a function in parallel over the elements of a vector (Map) and accumulate the results into a single value (Reduce).
0541    /// Benefits from partial reduction into `nChunks` intermediate results.
0542    ///
0543    /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,std::vector<T> &args,R redfunc,unsigned nChunks)
0544    template <class F, class T, class R, class Cond>
0545    auto TThreadExecutor::MapReduce(F func, std::vector<T> &args, R redfunc, unsigned nChunks) -> InvokeResult_t<F, T>
0546    {
0547       return Reduce(Map(func, args, redfunc, nChunks), redfunc);
0548    }
0549
0550    //////////////////////////////////////////////////////////////////////////
0551    /// \brief Execute a function in parallel over the elements of an immutable vector (Map) and accumulate the results into a single value (Reduce).
0552    /// Benefits from partial reduction into `nChunks` intermediate results.
0553    ///
0554    /// \copydetails ROOT::Internal::TExecutor::MapReduce(F func,const std::vector<T> &args,R redfunc,unsigned nChunks)
0555    template <class F, class T, class R, class Cond>
0556    auto TThreadExecutor::MapReduce(F func, const std::vector<T> &args, R redfunc, unsigned nChunks)
0557       -> InvokeResult_t<F, T>
0558    {
0559       return Reduce(Map(func, args, redfunc, nChunks), redfunc);
0560    }
0561
0562    //////////////////////////////////////////////////////////////////////////
0563    /// \copydoc ROOT::Internal::TExecutor::Reduce(const std::vector<T> &objs,R redfunc)
0564    template<class T, class R>
0565    auto TThreadExecutor::Reduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs))
0566    {
0567       // check we can apply reduce to objs
0568       static_assert(std::is_same<decltype(redfunc(objs)), T>::value, "redfunc does not have the correct signature");
0569       return SeqReduce(objs, redfunc);
0570    }
0571
0572    //////////////////////////////////////////////////////////////////////////
0573    /// \brief "Reduce" an std::vector into a single object in parallel by passing a
0574    /// binary function as the second argument defining the reduction operation.
0575    ///
0576    /// \param objs A vector of elements to combine.
0577    /// \param redfunc Binary reduction function to combine the elements of the vector `objs`.
0578    /// \return A value result of combining the vector elements into a single object of the same type.
0579    template<class T, class BINARYOP>
0580    auto TThreadExecutor::Reduce(const std::vector<T> &objs, BINARYOP redfunc) -> decltype(redfunc(objs.front(), objs.front()))
0581    {
0582       // check we can apply reduce to objs
0583       static_assert(std::is_same<decltype(redfunc(objs.front(), objs.front())), T>::value, "redfunc does not have the correct signature");
0584       return ParallelReduce(objs, redfunc);
0585    }
0586
0587    //////////////////////////////////////////////////////////////////////////
0588    /// \brief "Reduce", sequentially, an std::vector into a single object
0589    ///
0590    /// \param objs A vector of elements to combine.
0591    /// \param redfunc Reduction function to combine the elements of the vector `objs`.
0592    /// \return A value result of combining the vector elements into a single object of the same type.
0593    template<class T, class R>
0594    auto TThreadExecutor::SeqReduce(const std::vector<T> &objs, R redfunc) -> decltype(redfunc(objs))
0595    {
0596       return redfunc(objs);
0597    }
0598
0599 } // namespace ROOT
0600
0601 #endif   // R__USE_IMT
0602 #endif