EIC code displayed by LXR

File: /include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {

/** \class TensorOpCost
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for
  * evaluating tensor expressions.
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
  }
  template <typename SrcType, typename TargetType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }
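
  // Example (illustrative; the values come from Eigen's functor_traits and
  // depend on the scalar type): the modeled per-scalar cost of a
  // multiply-add over floats can be queried as
  //   int madd_cost = TensorOpCost::MulCost<float>() +
  //                   TensorOpCost::AddCost<float>();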

  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
  }
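
  // Example (hypothetical numbers): for a 4-wide float packet with an
  // estimated 8 compute cycles per packet,
  //   TensorOpCost c(16, 4, 8, /*vectorized=*/true, /*packet_size=*/4);
  // amortizes the compute estimate over the packet, so c.compute_cycles()
  // returns 8 / 4 = 2, while the byte counts are kept as given.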

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }
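
  // Example (illustrative weights): with load_cost = store_cost = 0.5 and
  // compute_cost = 1.0, a cost of {bytes_loaded = 8, bytes_stored = 4,
  // compute_cycles = 2} gives total_cost = 0.5*8 + 0.5*4 + 1.0*2 = 8.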

  // Drop the memory access component. Intended for cases when memory accesses
  // are sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};
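
// Example (hypothetical sketch, not how Eigen's evaluators literally build
// costs): the per-coefficient cost of an expression like a * b + c over
// floats could be assembled with the arithmetic operators above:
//   TensorOpCost cost =
//       TensorOpCost(2 * sizeof(float), 0, TensorOpCost::MulCost<float>()) +
//       TensorOpCost(sizeof(float), sizeof(float),
//                    TensorOpCost::AddCost<float>());
//   std::cout << cost;  // prints the three components via operator<<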

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    // Make sure we don't invoke undefined behavior when we convert to an int.
    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
    return numext::mini(max_threads,
                        numext::maxi<int>(1, static_cast<int>(threads)));
  }
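
  // Worked example (illustrative numbers): for a total cost of 2e6 device
  // cycles, threads = (2000000 - kStartupCycles) / kPerThreadCycles + 0.9
  // = (2000000 - 100000) / 100000 + 0.9 = 19.9, which truncates to 19 and
  // is then clamped to [1:max_threads].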

  // taskSize assesses parallel task size.
  // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }
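
  // Example (illustrative): a work unit with totalCost = 20000 device cycles
  // yields taskSize = 20000 / kTaskSize = 0.5 < 1.0, signaling that tasks
  // should be coarsened before being handed to the thread pool.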

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from the L2 cache. 64 is the typical cache line
    // size; 11 is the L2 cache latency on Haswell.
    // We don't know whether data is in L1, L2 or L3. But we are most
    // interested in single-threaded computation times around 100us-10ms
    // (smaller times are too small for parallelization, and larger times are
    // not interesting either, because we are probably using all available
    // threads already). For that target time range, L2 seems to be what
    // matters: a data set fitting into L1 is too small to take noticeable
    // time, and a data set fitting only into L3 will presumably take more
    // than 10ms to load and process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
        cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                  kDeviceCyclesPerComputeCycle);
  }
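
  // Worked example (illustrative): kLoadCycles = 11/64 ~ 0.17 cycles per
  // byte, so loading one 4-byte float is modeled as ~0.69 device cycles.
  // For 1e6 coefficients each costing {bytes_loaded = 4, bytes_stored = 4,
  // compute_cycles = 1}, totalCost ~ 1e6 * (0.17*4 + 0.17*4 + 1) ~ 2.4e6.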
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H