arch/SSE/MathFunctions.h

0001 // This file is part of Eigen, a lightweight C++ template library
0002 // for linear algebra.
0003 //
0004 // Copyright (C) 2007 Julien Pommier
0005 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
0006 //
0007 // This Source Code Form is subject to the terms of the Mozilla
0008 // Public License v. 2.0. If a copy of the MPL was not distributed
0009 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
0010
0011 /* The sin and cos and functions of this file come from
0012  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
0013  */
0014
0015 #ifndef EIGEN_MATH_FUNCTIONS_SSE_H
0016 #define EIGEN_MATH_FUNCTIONS_SSE_H
0017
0018 namespace Eigen {
0019
0020 namespace internal {
0021
0022 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0023 Packet4f plog<Packet4f>(const Packet4f& _x) {
0024   return plog_float(_x);
0025 }
0026
0027 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0028 Packet2d plog<Packet2d>(const Packet2d& _x) {
0029   return plog_double(_x);
0030 }
0031
0032 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0033 Packet4f plog2<Packet4f>(const Packet4f& _x) {
0034   return plog2_float(_x);
0035 }
0036
0037 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0038 Packet2d plog2<Packet2d>(const Packet2d& _x) {
0039   return plog2_double(_x);
0040 }
0041
0042 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0043 Packet4f plog1p<Packet4f>(const Packet4f& _x) {
0044   return generic_plog1p(_x);
0045 }
0046
0047 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0048 Packet4f pexpm1<Packet4f>(const Packet4f& _x) {
0049   return generic_expm1(_x);
0050 }
0051
0052 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0053 Packet4f pexp<Packet4f>(const Packet4f& _x)
0054 {
0055   return pexp_float(_x);
0056 }
0057
0058 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0059 Packet2d pexp<Packet2d>(const Packet2d& x)
0060 {
0061   return pexp_double(x);
0062 }
0063
0064 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0065 Packet4f psin<Packet4f>(const Packet4f& _x)
0066 {
0067   return psin_float(_x);
0068 }
0069
0070 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0071 Packet4f pcos<Packet4f>(const Packet4f& _x)
0072 {
0073   return pcos_float(_x);
0074 }
0075
0076 #if EIGEN_FAST_MATH
0077
0078 // Functions for sqrt.
0079 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
0080 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
0081 // exact solution. It does not handle +inf, or denormalized numbers correctly.
0082 // The main advantage of this approach is not just speed, but also the fact that
0083 // it can be inlined and pipelined with other computations, further reducing its
0084 // effective latency. This is similar to Quake3's fast inverse square root.
0085 // For detail see here: http://www.beyond3d.com/content/articles/8/
0086 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0087 Packet4f psqrt<Packet4f>(const Packet4f& _x)
0088 {
0089   Packet4f minus_half_x = pmul(_x, pset1<Packet4f>(-0.5f));
0090   Packet4f denormal_mask = pandnot(
0091       pcmp_lt(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())),
0092       pcmp_lt(_x, pzero(_x)));
0093
0094   // Compute approximate reciprocal sqrt.
0095   Packet4f x = _mm_rsqrt_ps(_x);
0096   // Do a single step of Newton's iteration.
0097   x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet4f>(1.5f)));
0098   // Flush results for denormals to zero.
0099   return pandnot(pmul(_x,x), denormal_mask);
0100 }
0101
0102 #else
0103
0104 template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0105 Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
0106
0107 #endif
0108
0109 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0110 Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
0111
0112 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0113 Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
0114
0115 #if EIGEN_FAST_MATH
0116
0117 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0118 Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
0119   _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
0120   _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
0121   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
0122   _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
0123
0124   Packet4f neg_half = pmul(_x, p4f_minus_half);
0125
0126   // Identity infinite, zero, negative and denormal arguments.
0127   Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
0128   Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
0129   Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
0130
0131   // Compute an approximate result using the rsqrt intrinsic.
0132   Packet4f y_approx = _mm_rsqrt_ps(_x);
0133
0134   // Do a single step of Newton-Raphson iteration to improve the approximation.
0135   // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
0136   // It is essential to evaluate the inner term like this because forming
0137   // y_n^2 may over- or underflow.
0138   Packet4f y_newton = pmul(
0139       y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
0140
0141   // Select the result of the Newton-Raphson step for positive normal arguments.
0142   // For other arguments, choose the output of the intrinsic. This will
0143   // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
0144   // x is zero or a positive denormalized float (equivalent to flushing positive
0145   // denormalized inputs to zero).
0146   return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
0147 }
0148
0149 #else
0150
0151 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0152 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
0153   // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
0154   return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
0155 }
0156
0157 #endif
0158
0159 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
0160 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
0161   return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
0162 }
0163
0164 // Hyperbolic Tangent function.
0165 template <>
0166 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
0167 ptanh<Packet4f>(const Packet4f& x) {
0168   return internal::generic_fast_tanh_float(x);
0169 }
0170
0171 } // end namespace internal
0172
0173 namespace numext {
0174
0175 template<>
0176 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
0177 float sqrt(const float &x)
0178 {
0179   return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
0180 }
0181
0182 template<>
0183 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
0184 double sqrt(const double &x)
0185 {
0186 #if EIGEN_COMP_GNUC_STRICT
0187   // This works around a GCC bug generating poor code for _mm_sqrt_pd
0188   // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
0189   return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
0190 #else
0191   return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
0192 #endif
0193 }
0194
0195 } // end namespace numex
0196
0197 } // end namespace Eigen
0198
0199 #endif // EIGEN_MATH_FUNCTIONS_SSE_H