// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_REDUX_H
#define EIGEN_REDUX_H

namespace Eigen {

namespace internal {

// TODO
//  * implement other kinds of vectorization
//  * factorize code

/***************************************************************************
* Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/

template<typename Func, typename Evaluator>
struct redux_traits
{
public:
  typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
  enum {
    PacketSize = unpacket_traits<PacketType>::size,
    InnerMaxSize = int(Evaluator::IsRowMajor)
                 ? Evaluator::MaxColsAtCompileTime
                 : Evaluator::MaxRowsAtCompileTime,
    OuterMaxSize = int(Evaluator::IsRowMajor)
                 ? Evaluator::MaxRowsAtCompileTime
                 : Evaluator::MaxColsAtCompileTime,
    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
  };

  enum {
    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                  && (functor_traits<Func>::PacketAccess),
    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
    MaySliceVectorize  = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
  };

public:
  enum {
    Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
                                        : int(DefaultTraversal)
  };

public:
  enum {
    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
         : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
  };

public:
  enum {
    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
  };

#ifdef EIGEN_DEBUG_ASSIGN
  static void debug()
  {
    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
    std::cerr.setf(std::ios::hex, std::ios::basefield);
    EIGEN_DEBUG_VAR(Evaluator::Flags)
    std::cerr.unsetf(std::ios::hex);
    EIGEN_DEBUG_VAR(InnerMaxSize)
    EIGEN_DEBUG_VAR(OuterMaxSize)
    EIGEN_DEBUG_VAR(SliceVectorizedWork)
    EIGEN_DEBUG_VAR(PacketSize)
    EIGEN_DEBUG_VAR(MightVectorize)
    EIGEN_DEBUG_VAR(MayLinearVectorize)
    EIGEN_DEBUG_VAR(MaySliceVectorize)
    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
    EIGEN_DEBUG_VAR(UnrollingLimit)
    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
    std::cerr << std::endl;
  }
#endif
};
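
// Worked example (illustrative, not part of Eigen): consider summing a
// Vector4f, and assume PacketSize == 4 (e.g. SSE) with CoeffReadCost and the
// functor Cost both equal to 1; the actual values depend on the scalar type,
// the SIMD target and EIGEN_UNROLLING_LIMIT.
//   MightVectorize     : true, packet access on both the evaluator and functor
//   MayLinearVectorize : true, a Vector4f has LinearAccessBit
//                        => Traversal == LinearVectorizedTraversal
//   Cost               = 4*1 + (4-1)*1 = 7
//   UnrollingLimit     = EIGEN_UNROLLING_LIMIT * 4, so Cost <= UnrollingLimit
//                        => Unrolling == CompleteUnrolling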

/***************************************************************************
* Part 2 : unrollers
***************************************************************************/

/*** no vectorization ***/

template<typename Func, typename Evaluator, int Start, int Length>
struct redux_novec_unroller
{
  enum {
    HalfLength = Length/2
  };

  typedef typename Evaluator::Scalar Scalar;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
  {
    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
  }
};

template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Evaluator, Start, 1>
{
  enum {
    outer = Start / Evaluator::InnerSizeAtCompileTime,
    inner = Start % Evaluator::InnerSizeAtCompileTime
  };

  typedef typename Evaluator::Scalar Scalar;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
  {
    return eval.coeffByOuterInner(outer, inner);
  }
};

// This specialization is dead code: for a 0-length reduction, run() is never
// called at all. It exists only to prevent false compiler warnings about
// failed inlining.
template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Evaluator, Start, 0>
{
  typedef typename Evaluator::Scalar Scalar;
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
};
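
// Illustrative expansion (not part of Eigen): for Start == 0 and Length == 4,
// redux_novec_unroller produces the balanced tree
//   func(func(coeff(0), coeff(1)), func(coeff(2), coeff(3)))
// instead of the sequential fold
//   func(func(func(coeff(0), coeff(1)), coeff(2)), coeff(3)),
// shortening the dependency chain between operations from Length-1 deep to
// roughly log2(Length) deep.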

/*** vectorization ***/

template<typename Func, typename Evaluator, int Start, int Length>
struct redux_vec_unroller
{
  template<typename PacketType>
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func)
  {
    enum {
      PacketSize = unpacket_traits<PacketType>::size,
      HalfLength = Length/2
    };

    return func.packetOp(
            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func),
            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) );
  }
};

template<typename Func, typename Evaluator, int Start>
struct redux_vec_unroller<Func, Evaluator, Start, 1>
{
  template<typename PacketType>
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&)
  {
    enum {
      PacketSize = unpacket_traits<PacketType>::size,
      index = Start * PacketSize,
      outer = index / int(Evaluator::InnerSizeAtCompileTime),
      inner = index % int(Evaluator::InnerSizeAtCompileTime),
      alignment = Evaluator::Alignment
    };
    return eval.template packetByOuterInner<alignment,PacketType>(outer, inner);
  }
};
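
// Illustrative expansion (not part of Eigen): with PacketSize == 4, the
// Length == 1 specialization above maps Start == 1 to the linear index 4,
// i.e. the packet covering coefficients 4..7; outer/inner then translate
// that linear index into the evaluator's storage order.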

/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/

template<typename Func, typename Evaluator,
         int Traversal = redux_traits<Func, Evaluator>::Traversal,
         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
>
struct redux_impl;

template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
{
  typedef typename Evaluator::Scalar Scalar;

  template<typename XprType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    Scalar res = eval.coeffByOuterInner(0, 0);
    for(Index i = 1; i < xpr.innerSize(); ++i)
      res = func(res, eval.coeffByOuterInner(0, i));
    for(Index i = 1; i < xpr.outerSize(); ++i)
      for(Index j = 0; j < xpr.innerSize(); ++j)
        res = func(res, eval.coeffByOuterInner(i, j));
    return res;
  }
};
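
// Illustrative trace (not part of Eigen): for a 2x3 column-major matrix m,
// outer indexes the 3 columns and inner the 2 rows, so run() folds the
// coefficients in storage order:
//   res = m(0,0);
//   res = func(res, m(1,0));                           // rest of column 0
//   res = func(res, m(0,1)); res = func(res, m(1,1));  // column 1, then column 2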

template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, DefaultTraversal, CompleteUnrolling>
  : redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime>
{
  typedef redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
  typedef typename Evaluator::Scalar Scalar;
  template<typename XprType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
  {
    return Base::run(eval,func);
  }
};

template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
{
  typedef typename Evaluator::Scalar Scalar;
  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;

  template<typename XprType>
  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
    const Index size = xpr.size();

    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
    enum {
      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
    };
    const Index alignedStart = internal::first_default_aligned(xpr);
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
    const Index alignedEnd2 = alignedStart + alignedSize2;
    const Index alignedEnd  = alignedStart + alignedSize;
    Scalar res;
    if(alignedSize)
    {
      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
      {
        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
        {
          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
        }

        packet_res0 = func.packetOp(packet_res0,packet_res1);
        if(alignedEnd>alignedEnd2)
          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
      }
      res = func.predux(packet_res0);

      for(Index index = 0; index < alignedStart; ++index)
        res = func(res,eval.coeff(index));

      for(Index index = alignedEnd; index < size; ++index)
        res = func(res,eval.coeff(index));
    }
    else // too small to vectorize anything; such a small dynamic size is
         // inefficient anyway, so don't bother optimizing this fallback.
    {
      res = eval.coeff(0);
      for(Index index = 1; index < size; ++index)
        res = func(res,eval.coeff(index));
    }

    return res;
  }
};
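
// Worked example (illustrative, not part of Eigen): size == 23, packetSize == 4
// and a first aligned coefficient at alignedStart == 1 give
//   alignedSize2 = ((23-1)/8)*8 = 16, alignedEnd2 = 17
//   alignedSize  = ((23-1)/4)*4 = 20, alignedEnd  = 21
// so coefficients [1,17) are folded two packets at a time, [17,21) by the
// single tail packet, and the scalar loops mop up [0,1) and [21,23).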

// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
template<typename Func, typename Evaluator, int Unrolling>
struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
{
  typedef typename Evaluator::Scalar Scalar;
  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;

  template<typename XprType>
  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    const Index innerSize = xpr.innerSize();
    const Index outerSize = xpr.outerSize();
    enum {
      packetSize = redux_traits<Func, Evaluator>::PacketSize
    };
    const Index packetedInnerSize = (innerSize/packetSize)*packetSize;
    Scalar res;
    if(packetedInnerSize)
    {
      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));

      res = func.predux(packet_res);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=packetedInnerSize; i<innerSize; ++i)
          res = func(res, eval.coeffByOuterInner(j,i));
    }
    else // too small to vectorize anything; fall back to the plain scalar path.
    {
      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
    }

    return res;
  }
};
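
// Worked example (illustrative, not part of Eigen): innerSize == 10 and
// packetSize == 4 give packetedInnerSize == 8, so every one of the outerSize
// slices is reduced with two packets covering coefficients 0..7, and the
// trailing scalar loop folds in coefficients 8 and 9 of each slice.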

template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
{
  typedef typename Evaluator::Scalar Scalar;

  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
  enum {
    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
    Size = Evaluator::SizeAtCompileTime,
    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
  };

  template<typename XprType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
  {
    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    if (VectorizedSize > 0) {
      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
      if (VectorizedSize != Size)
        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
      return res;
    }
    else {
      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
    }
  }
};
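
// Worked example (illustrative, not part of Eigen): Size == 7 with
// PacketSize == 4 gives VectorizedSize == 4: redux_vec_unroller reduces the
// single packet covering coefficients 0..3, predux folds it to a scalar, and
// redux_novec_unroller<Func, Evaluator, 4, 3> folds in coefficients 4..6.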

// evaluator adaptor
template<typename _XprType>
class redux_evaluator : public internal::evaluator<_XprType>
{
  typedef internal::evaluator<_XprType> Base;
public:
  typedef _XprType XprType;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketScalar PacketScalar;

  enum {
    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
    // TODO: instead of removing DirectAccessBit, find an elegant way to query
    // the alignment offset at runtime from the evaluator.
    Flags = Base::Flags & ~DirectAccessBit,
    IsRowMajor = XprType::IsRowMajor,
    SizeAtCompileTime = XprType::SizeAtCompileTime,
    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }

  template<int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  PacketType packetByOuterInner(Index outer, Index inner) const
  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
};
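
// Illustrative mapping (not part of Eigen): for a column-major matrix
// (Eigen's default), IsRowMajor is false and coeffByOuterInner(outer, inner)
// reads Base::coeff(inner, outer), i.e. outer indexes columns and inner
// indexes rows; for a row-major matrix the two roles are swapped.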

} // end namespace internal

/***************************************************************************
* Part 4 : public API
***************************************************************************/

/** \returns the result of a full redux operation on the whole matrix or vector using \a func
  *
  * The template parameter \a BinaryOp is the type of the functor \a func, which must be
  * an associative operator. Both C++98 and C++11 functor styles are handled.
  *
  * \warning the matrix must not be empty, otherwise an assertion is triggered.
  *
  * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
  */
template<typename Derived>
template<typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const
{
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");

  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
  ThisEvaluator thisEval(derived());

  // The initial expression is passed to the reducer as an additional argument
  // instead of being stored as a member of redux_evaluator.
  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
}
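
// Usage sketch (illustrative, not part of Eigen's sources): any associative
// binary functor works, including a C++11 lambda, which picks up the generic
// functor_traits (no packet access, hence scalar traversal):
//
//   #include <Eigen/Dense>
//   #include <algorithm>
//
//   Eigen::Vector4f v(1.f, 2.f, 3.f, 4.f);
//   float sum = v.redux([](float a, float b) { return a + b; });          // 10
//   float mx  = v.redux([](float a, float b) { return std::max(a, b); }); // 4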

/** \returns the minimum of all coefficients of \c *this.
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is the minimum of the elements that are not NaN
  * \warning the matrix must not be empty, otherwise an assertion is triggered.
  */
template<typename Derived>
template<int NaNPropagation>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff() const
{
  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
}

/** \returns the maximum of all coefficients of \c *this.
  * In case \c *this contains NaN, NaNPropagation determines the behavior:
  *   NaNPropagation == PropagateFast : undefined
  *   NaNPropagation == PropagateNaN : result is NaN
  *   NaNPropagation == PropagateNumbers : result is the maximum of the elements that are not NaN
  * \warning the matrix must not be empty, otherwise an assertion is triggered.
  */
template<typename Derived>
template<int NaNPropagation>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff() const
{
  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
}
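
// Usage sketch (illustrative, not part of Eigen's sources), assuming
// PropagateFast is the default NaNPropagation as in Eigen 3.4:
//
//   #include <Eigen/Dense>
//   #include <limits>
//
//   Eigen::Vector3f v(1.f, std::numeric_limits<float>::quiet_NaN(), 3.f);
//   float a = v.minCoeff<Eigen::PropagateNumbers>(); // 1.f, the NaN is skipped
//   float b = v.minCoeff<Eigen::PropagateNaN>();     // NaN
//   float c = v.minCoeff();                          // undefined when NaN is present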

/** \returns the sum of all coefficients of \c *this
  *
  * If \c *this is empty, then the value 0 is returned.
  *
  * \sa trace(), prod(), mean()
  */
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::sum() const
{
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
    return Scalar(0);
  return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
}

/** \returns the mean of all coefficients of \c *this
  *
  * \sa trace(), prod(), sum()
  */
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::mean() const
{
#ifdef __INTEL_COMPILER
  #pragma warning push
  #pragma warning ( disable : 2259 )
#endif
  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>())) / Scalar(this->size());
#ifdef __INTEL_COMPILER
  #pragma warning pop
#endif
}
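
// Illustrative note (not part of Eigen's sources): mean() is sum()/size()
// computed in the Scalar type, so for integer matrices the division truncates:
//
//   Eigen::Vector3i v(1, 2, 2);
//   int m = v.mean(); // 5 / 3 == 1, not 5/3 rounded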

/** \returns the product of all coefficients of \c *this
  *
  * Example: \include MatrixBase_prod.cpp
  * Output: \verbinclude MatrixBase_prod.out
  *
  * \sa sum(), mean(), trace()
  */
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::prod() const
{
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
    return Scalar(1);
  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
}

/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
  *
  * \c *this can be any matrix, not necessarily square.
  *
  * \sa diagonal(), sum()
  */
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
MatrixBase<Derived>::trace() const
{
  return derived().diagonal().sum();
}

} // end namespace Eigen

#endif // EIGEN_REDUX_H