Warning, file /include/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h was not indexed
or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
0011 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
0012
0013 namespace Eigen {
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025 class TensorOpCost {
0026 public:
0027
0028
0029
0030 template <typename ArgType>
0031 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
0032 return internal::functor_traits<
0033 internal::scalar_product_op<ArgType, ArgType> >::Cost;
0034 }
0035 template <typename ArgType>
0036 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
0037 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
0038 }
0039 template <typename ArgType>
0040 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
0041 return internal::functor_traits<
0042 internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
0043 }
0044 template <typename ArgType>
0045 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
0046 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
0047 }
0048 template <typename SrcType, typename TargetType>
0049 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
0050 return internal::functor_traits<
0051 internal::scalar_cast_op<SrcType, TargetType> >::Cost;
0052 }
0053
0054 EIGEN_DEVICE_FUNC
0055 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
0056 EIGEN_DEVICE_FUNC
0057 TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
0058 : bytes_loaded_(bytes_loaded),
0059 bytes_stored_(bytes_stored),
0060 compute_cycles_(compute_cycles) {}
0061
0062 EIGEN_DEVICE_FUNC
0063 TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
0064 bool vectorized, double packet_size)
0065 : bytes_loaded_(bytes_loaded),
0066 bytes_stored_(bytes_stored),
0067 compute_cycles_(vectorized ? compute_cycles / packet_size
0068 : compute_cycles) {
0069 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
0070 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
0071 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
0072 }
0073
0074 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
0075 return bytes_loaded_;
0076 }
0077 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
0078 return bytes_stored_;
0079 }
0080 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
0081 return compute_cycles_;
0082 }
0083 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
0084 double load_cost, double store_cost, double compute_cost) const {
0085 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
0086 compute_cost * compute_cycles_;
0087 }
0088
0089
0090
0091 EIGEN_DEVICE_FUNC void dropMemoryCost() {
0092 bytes_loaded_ = 0;
0093 bytes_stored_ = 0;
0094 }
0095
0096
0097 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
0098 const TensorOpCost& rhs) const {
0099 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
0100 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
0101 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
0102 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
0103 }
0104
0105
0106 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
0107 const TensorOpCost& rhs) const {
0108 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
0109 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
0110 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
0111 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
0112 }
0113
0114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
0115 const TensorOpCost& rhs) {
0116 bytes_loaded_ += rhs.bytes_loaded();
0117 bytes_stored_ += rhs.bytes_stored();
0118 compute_cycles_ += rhs.compute_cycles();
0119 return *this;
0120 }
0121
0122 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
0123 bytes_loaded_ *= rhs;
0124 bytes_stored_ *= rhs;
0125 compute_cycles_ *= rhs;
0126 return *this;
0127 }
0128
0129 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
0130 TensorOpCost lhs, const TensorOpCost& rhs) {
0131 lhs += rhs;
0132 return lhs;
0133 }
0134 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
0135 TensorOpCost lhs, double rhs) {
0136 lhs *= rhs;
0137 return lhs;
0138 }
0139 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
0140 double lhs, TensorOpCost rhs) {
0141 rhs *= lhs;
0142 return rhs;
0143 }
0144
0145 friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
0146 return os << "[bytes_loaded = " << tc.bytes_loaded()
0147 << ", bytes_stored = " << tc.bytes_stored()
0148 << ", compute_cycles = " << tc.compute_cycles() << "]";
0149 }
0150
0151 private:
0152 double bytes_loaded_;
0153 double bytes_stored_;
0154 double compute_cycles_;
0155 };
0156
0157
0158
0159
0160 template <typename Device>
0161 class TensorCostModel {
0162 public:
0163
0164 static const int kDeviceCyclesPerComputeCycle = 1;
0165
0166
0167 static const int kStartupCycles = 100000;
0168 static const int kPerThreadCycles = 100000;
0169 static const int kTaskSize = 40000;
0170
0171
0172
0173
0174 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
0175 double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
0176 double cost = totalCost(output_size, cost_per_coeff);
0177 double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
0178
0179 threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
0180 return numext::mini(max_threads,
0181 numext::maxi<int>(1, static_cast<int>(threads)));
0182 }
0183
0184
0185
0186
0187 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
0188 double output_size, const TensorOpCost& cost_per_coeff) {
0189 return totalCost(output_size, cost_per_coeff) / kTaskSize;
0190 }
0191
0192 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
0193 double output_size, const TensorOpCost& cost_per_coeff) {
0194
0195
0196
0197
0198
0199
0200
0201
0202
0203 const double kLoadCycles = 1.0 / 64 * 11;
0204 const double kStoreCycles = 1.0 / 64 * 11;
0205
0206 return output_size *
0207 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
0208 kDeviceCyclesPerComputeCycle);
0209 }
0210 };
0211
0212 }
0213
0214 #endif