Vc/sse/math.h

0001 /*  This file is part of the Vc library. {{{
0002 Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
0003
0004 Redistribution and use in source and binary forms, with or without
0005 modification, are permitted provided that the following conditions are met:
0006     * Redistributions of source code must retain the above copyright
0007       notice, this list of conditions and the following disclaimer.
0008     * Redistributions in binary form must reproduce the above copyright
0009       notice, this list of conditions and the following disclaimer in the
0010       documentation and/or other materials provided with the distribution.
0011     * Neither the names of contributing organizations nor the
0012       names of its contributors may be used to endorse or promote products
0013       derived from this software without specific prior written permission.
0014
0015 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
0016 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
0017 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0018 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
0019 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0020 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0021 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0022 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0023 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0024 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0025
0026 }}}*/
0027
0028 #ifndef VC_SSE_MATH_H_
0029 #define VC_SSE_MATH_H_
0030
0031 #include "const.h"
0032 #include "macros.h"
0033
0034 namespace Vc_VERSIONED_NAMESPACE
0035 {
0036 // copysign {{{1
0037 Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
0038 {
0039     return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
0040                      _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
0041 }
0042 Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
0043 {
0044     return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
0045                      _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
0046 }
0047
0048 //}}}1
0049
0050 /**
0051  * splits \p v into exponent and mantissa, the sign is kept with the mantissa
0052  *
0053  * The return value will be in the range [0.5, 1.0[
0054  * The \p e value will be an integer defining the power-of-two exponent
0055  */
0056 inline SSE::double_v frexp(const SSE::double_v &v,
0057                            SimdArray<int, 2, Scalar::int_v, 1> *e)
0058 {
0059     const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
0060     const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
0061     SSE::int_v exponent =
0062         _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
0063     const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
0064     SSE::double_v ret = _mm_and_pd(
0065         exponentMaximized,
0066         _mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
0067     SSE::double_m zeroMask = v == SSE::double_v::Zero();
0068     ret(isnan(v) || !isfinite(v) || zeroMask) = v;
0069     exponent.setZero(zeroMask.data());
0070     (*e)[0] = exponent[0];
0071     (*e)[1] = exponent[2];
0072     return ret;
0073 }
0074 inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
0075 {
0076     const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
0077     const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
0078     internal_data(*e) =
0079         _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
0080     const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
0081     SSE::float_v ret =
0082         _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
0083     ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
0084     e->setZero(v == SSE::float_v::Zero());
0085     return ret;
0086 }
0087
0088 /*             -> x * 2^e
0089  * x == NaN    -> NaN
0090  * x == (-)inf -> (-)inf
0091  */
0092 inline SSE::double_v ldexp(SSE::double_v::AsArg v,
0093                            const SimdArray<int, 2, Scalar::int_v, 1> &_e)
0094 {
0095     SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
0096     e.setZero((v == SSE::double_v::Zero()).dataI());
0097     const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
0098     return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
0099 }
0100 inline SSE::float_v ldexp(SSE::float_v::AsArg v,
0101                           const SimdArray<int, 4, SSE::int_v, 4> &_e)
0102 {
0103     SSE::int_v e = internal_data(_e);
0104     e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
0105     return reinterpret_components_cast<SSE::float_v>(
0106         reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
0107 }
0108
0109 #ifdef Vc_IMPL_SSE4_1
0110 inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
0111 inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }
0112
0113 inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
0114 inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }
0115
0116 inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
0117 inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
0118 #else
0119 inline SSE::Vector<float> trunc(SSE::Vector<float> x)
0120 {
0121     const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
0122     const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
0123         _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
0124         _mm_set1_epi32(0x4b000000)));  // the exponent is so large that no mantissa bits
0125                                        // signify fractional values (0x3f8 + 23*8 = 0x4b0)
0126     return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
0127                      _mm_and_ps(no_fractional_values, truncated));
0128 }
0129
0130 inline SSE::Vector<double> trunc(SSE::Vector<double> x)
0131 {
0132     const auto abs_x = Vc::abs(x).data();
0133     const auto min_no_fractional_bits =
0134         _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));  // 0x3ff + 52 = 0x433
0135     __m128d truncated =
0136         _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
0137     // due to rounding, the result can be too large. In this case `truncated >
0138     // abs(x)` holds, so subtract 1 to truncated if `abs(x) < truncated`
0139     truncated = _mm_sub_pd(truncated,
0140                            _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
0141     // finally, fix the sign bit:
0142     return _mm_or_pd(
0143         _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
0144         truncated);
0145 }
0146
0147 template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
0148 {
0149     auto y = trunc(x);
0150     y(!(y == x) && x < 0) -= 1;
0151     return y;
0152 }
0153
0154 template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
0155 {
0156     auto y = trunc(x);
0157     y(!(y == x || x < 0)) += 1;
0158     return y;
0159 }
0160 #endif
0161 // fma {{{1
0162 template <typename T>
0163 Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
0164                                                Vector<T, VectorAbi::Sse> b,
0165                                                Vector<T, VectorAbi::Sse> c)
0166 {
0167     SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
0168     return a;
0169 }
0170 // }}}1
0171 }  // namespace Vc
0172
0173 #endif // VC_SSE_MATH_H_
0174
0175 // vim: foldmethod=marker