/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "../common/x86_prefetches.h"
#include "limits.h"
#include "../common/bitscanintrinsics.h"
#include "../common/set.h"
#include "../common/gatherimplementation.h"
#include "../common/scatterimplementation.h"
#include "../common/transpose.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator==(SSE::   int_v a, SSE::   int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator==(SSE::  uint_v a, SSE::  uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }

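// SSE provides compare-not-equal only for floats and doubles; for the integer types,
// operator!= is implemented as the bitwise complement (not_) of the equality mask.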
Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator!=(SSE::   int_v a, SSE::   int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE::  uint_m operator!=(SSE::  uint_v a, SSE::  uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }

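// SSE integer compares are signed. SSE::cmpgt_epu32/cmplt_epu32 (and the epu16
// variants) emulate the unsigned compares, typically by offsetting both operands
// before the signed compare. Defining USE_INCORRECT_UNSIGNED_COMPARE falls back to
// the raw signed compare, which is cheaper but wrong for operands with the most
// significant bit set.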
Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator> (SSE::   int_v a, SSE::   int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator> (SSE::  uint_v a, SSE::  uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu32(a.data(), b.data());
#else
    return _mm_cmpgt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu16(a.data(), b.data());
#else
    return _mm_cmpgt_epi16(a.data(), b.data());
#endif
}

Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator< (SSE::   int_v a, SSE::   int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator< (SSE::  uint_v a, SSE::  uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu32(a.data(), b.data());
#else
    return _mm_cmplt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu16(a.data(), b.data());
#else
    return _mm_cmplt_epi16(a.data(), b.data());
#endif
}

Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator>=(SSE::   int_v a, SSE::   int_v b) { return !(a < b); }
Vc_INTRINSIC SSE::  uint_m operator>=(SSE::  uint_v a, SSE::  uint_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }

Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator<=(SSE::   int_v a, SSE::   int_v b) { return !(a > b); }
Vc_INTRINSIC SSE::  uint_m operator<=(SSE::  uint_v a, SSE::  uint_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }

// bitwise operators {{{1
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return or_(a.data(), b.data());
}
// arithmetic operators {{{1
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC
    enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
    operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return SSE::Vector<T>::generate([&](int i) { return a[i] / b[i]; });
}
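// There is no 16-bit integer division instruction in SSE. The specialization below
// widens each half of the vector to 32-bit, converts to float, divides, and
// truncates back; float's 24-bit mantissa represents all 16-bit operands exactly,
// so the truncated quotient matches integer division.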
template <typename T>
Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
                       SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    using HT = SSE::VectorHelper<T>;
    __m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data()));
    __m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data())));
    return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi));
}
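// Integer modulo via the identity a % b == a - (a / b) * b; since operator/ above
// truncates, this matches C++ semantics for both signed and unsigned types.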
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    return a - a / b * b;
}
// }}}1
} // namespace Detail
// constants {{{1
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
    : d(HV::zero())
{
}

template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
    : d(HT::one())
{
}

template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
{
#if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
    // GCC 4.9.2 (at least) miscompiles SSE::short_v::IndexesFromZero() when it is used
    // implicitly from a SimdArray<short, 9> compiled for AVX2: the load becomes
    // vpmovsxwd (a sign-extending load from an 8x 16-bit constant into an 8x 32-bit
    // register). The empty asm below forces the value into a register first.
    if (std::is_same<T, short>::value) {
        asm("" ::"x"(d.v()));
    }
#endif
}

template <>
Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
{
}

template <>
Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
{
}

// load member functions {{{1
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
#ifndef Vc_MSVC
    template
#endif
    load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}

// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
{
    data() = HV::zero();
}

template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
{
    data() = Detail::andnot_(k.data(), data());
}

template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(k.data(), data());
}

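// An all-ones bit pattern is a quiet NaN in IEEE 754 (exponent all ones, mantissa
// non-zero with the quiet bit set), so setQnan() simply sets every bit.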
template<> Vc_INTRINSIC void SSE::double_v::setQnan()
{
    data() = SSE::_mm_setallone_pd();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
{
    data() = _mm_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void SSE::float_v::setQnan()
{
    data() = SSE::_mm_setallone_ps();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
{
    data() = _mm_or_ps(data(), k.dataF());
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}

template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_XOP
template <> Vc_ALWAYS_INLINE SSE::   int_v SSE::   int_v::operator<<(const SSE::   int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::  uint_v SSE::  uint_v::operator<<(const SSE::  uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE:: short_v SSE:: short_v::operator<<(const SSE:: short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::   int_v SSE::   int_v::operator>>(const SSE::   int_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::  uint_v SSE::  uint_v::operator>>(const SSE::  uint_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE:: short_v SSE:: short_v::operator>>(const SSE:: short_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
#elif defined Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator<<(const SSE::Vector< int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<uint> Vector<uint, VectorAbi::Sse>::operator<<(const SSE::Vector<uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator>>(const SSE::Vector< int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<uint> Vector<uint, VectorAbi::Sse>::operator>>(const SSE::Vector<uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
#endif

template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift) {
    d.v() = HT::shiftRight(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const {
    return HT::shiftRight(d.v(), shift);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift) {
    d.v() = HT::shiftLeft(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const {
    return HT::shiftLeft(d.v(), shift);
}

///////////////////////////////////////////////////////////////////////////////////////////
// isnegative {{{1
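// isnegative returns a mask of the sign bits: the sign bit is isolated with a mask
// and broadcast across the element by an arithmetic right shift. For double the
// shift only fills the upper 32-bit half of each element, so that half is then
// copied over both halves by the permute.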
Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
{
    return sse_cast<__m128>(_mm_srai_epi32(
        sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31));
}
Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
{
    return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(_mm_srai_epi32(
        sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31)));
}

// gathers {{{1
#define Vc_GATHER_IMPL(V_)                                                               \
    template <>                                                                          \
    template <class MT, class IT, int Scale>                                             \
    inline void SSE::V_::gatherImplementation(                                           \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); }
Vc_GATHER_IMPL(float_v) { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(int_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(uint_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(short_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(ushort_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
#undef Vc_M
#undef Vc_GATHER_IMPL

template <typename T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Sse>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    const auto *mem = args.address;
    const auto indexes = Scale * args.indexes;
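    // The masked gather strategy is selected at compile time via configuration
    // macros; without any of them the SimpleLoop implementation is used.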
    using Selector = std::integral_constant<Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
        Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
        Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
        Common::GatherScatterImplementation::PopcntSwitch
#else
        Common::GatherScatterImplementation::SimpleLoop
#endif
        >;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}

// scatters {{{1
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
{
    Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant<Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
        Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
        Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
        Common::GatherScatterImplementation::PopcntSwitch
#else
        Common::GatherScatterImplementation::SimpleLoop
#endif
        >;
    Common::executeScatter(Selector(), *this, mem, indexes, mask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
{
    //   a b c d e f g h
    // + a b c d e f g      -> a ab bc cd de ef fg gh
    // + a ab bc cd de ef   -> a ab abc abcd bcde cdef defg efgh
    // + a ab abc abcd      -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    Vector<T, VectorAbi::Sse> tmp = *this;
    if (Size > 1) tmp += tmp.shifted(-1);
    if (Size > 2) tmp += tmp.shifted(-2);
    if (Size > 4) tmp += tmp.shifted(-4);
    if (Size > 8) tmp += tmp.shifted(-8);
    if (Size > 16) tmp += tmp.shifted(-16);
    return tmp;
}
#ifndef Vc_IMPL_SSE4_1
// Without SSE4.1, vectorized 32-bit multiplication is slow; multiplying the scalars
// individually is faster.
template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
#endif
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::min(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::product(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp(Vc::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp(Vc::Zero);
    tmp(m) = *this;
    return tmp.sum();
}

///////////////////////////////////////////////////////////////////////////////////////////
// exponent {{{1
namespace Detail
{
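// exponent() extracts the unbiased binary exponent: shift the IEEE 754 bit pattern
// right by the mantissa width (23 for float, 52 for double) and subtract the bias
// (0x7f = 127 resp. 0x3ff = 1023). This is only valid for finite, normalized,
// non-zero inputs; the public wrappers below assert x >= 0.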
Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
{
    __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
    return _mm_cvtepi32_ps(tmp);
}
Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
{
    __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
    return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
}
} // namespace Detail

Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// }}}1
// Random {{{1
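// The vectorized RNG below advances two 4-wide states with a linear congruential
// step; 0xdeece66d and 11 are the low 32 bits of the drand48() constants
// (multiplier 0x5deece66d, increment 11).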
static void _doRandomStep(SSE::uint_v &state0,
                          SSE::uint_v &state1)
{
    using SSE::uint_v;
    using Detail::operator+;
    using Detail::operator*;
    state0.load(&Common::RandomState[0]);
    state1.load(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                         _mm_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
}

template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
{
    SSE::uint_v state0, state1;
    _doRandomStep(state0, state1);
    return state0.data();
}

template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
{
    SSE::uint_v state0, state1;
    _doRandomStep(state0, state1);
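    // Shifting right by 2 clears the sign and the top exponent bit; ORing with 1.0f
    // forces the exponent of 1.0f, giving a float in [1, 2). Subtracting 1.0f then
    // maps the result to [0, 1).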
    return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}

template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
    uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
    const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
    *reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
    *reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
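    // The low 52 state bits become the mantissa of a double in [1, 2); subtracting
    // 1.0 maps the result to [0, 1).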
    return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
}
// shifted / rotated {{{1
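// shifted(amount) moves the elements within the vector, shifting in zeros.
// _mm_srli_si128/_mm_slli_si128 require an immediate byte count, so the runtime
// amount is dispatched through a switch.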
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case  0: return *this;
    case  1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case  2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case  3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case  4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case  5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case  6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case  7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case  8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    }
    return Zero();
}
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
{
    if (amount >= -int(size())) {
        constexpr int VectorWidth = int(size());
        constexpr int EntryTypeSizeof = sizeof(EntryType);
        const __m128i v0 = sse_cast<__m128i>(d.v());
        const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
        auto &&fixup = sse_cast<VectorType, __m128i>;
        switch (amount) {
        case   0: return *this;
        // alignr_epi8: [arg1 arg0] << n
        case  -1: return fixup(SSE::alignr_epi8<(VectorWidth -  1) * EntryTypeSizeof>(v0, v1));
        case  -2: return fixup(SSE::alignr_epi8<(VectorWidth -  2) * EntryTypeSizeof>(v0, v1));
        case  -3: return fixup(SSE::alignr_epi8<(VectorWidth -  3) * EntryTypeSizeof>(v0, v1));
        case  -4: return fixup(SSE::alignr_epi8<(VectorWidth -  4) * EntryTypeSizeof>(v0, v1));
        case  -5: return fixup(SSE::alignr_epi8<(VectorWidth -  5) * EntryTypeSizeof>(v0, v1));
        case  -6: return fixup(SSE::alignr_epi8<(VectorWidth -  6) * EntryTypeSizeof>(v0, v1));
        case  -7: return fixup(SSE::alignr_epi8<(VectorWidth -  7) * EntryTypeSizeof>(v0, v1));
        case  -8: return fixup(SSE::alignr_epi8<(VectorWidth -  8) * EntryTypeSizeof>(v0, v1));
        case  -9: return fixup(SSE::alignr_epi8<(VectorWidth -  9) * EntryTypeSizeof>(v0, v1));
        case -10: return fixup(SSE::alignr_epi8<(VectorWidth - 10) * EntryTypeSizeof>(v0, v1));
        case -11: return fixup(SSE::alignr_epi8<(VectorWidth - 11) * EntryTypeSizeof>(v0, v1));
        case -12: return fixup(SSE::alignr_epi8<(VectorWidth - 12) * EntryTypeSizeof>(v0, v1));
        case -13: return fixup(SSE::alignr_epi8<(VectorWidth - 13) * EntryTypeSizeof>(v0, v1));
        case -14: return fixup(SSE::alignr_epi8<(VectorWidth - 14) * EntryTypeSizeof>(v0, v1));
        case -15: return fixup(SSE::alignr_epi8<(VectorWidth - 15) * EntryTypeSizeof>(v0, v1));
        case   1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
        case   2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
        case   3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
        case   4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
        case   5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
        case   6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
        case   7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
        case   8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
        case   9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
        case  10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
        case  11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
        case  12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
        case  13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
        case  14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
        case  15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
        }
    }
    return shiftIn.shifted(int(size()) + amount);
}
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v = SSE::sse_cast<__m128i>(d.v());
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
    case 2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
    case 3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
    // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
    // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
    // disables the following four calls unless sizeof(EntryType) == 2.
    case 4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
    case 5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
    case 6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
    case 7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
    }
    return Zero();
}
// sorted {{{1
namespace Detail
{
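// Sorting a 2-element double_v needs one compare pair: swap the two lanes, then
// pack the minimum into lane 0 and the maximum into lane 1.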
inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
{
    const __m128d x = x_.data();
    const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1));
    return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y));
}
} // namespace Detail
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
    const
{
    return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE:: float_v SSE:: float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE:: float_v SSE:: float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::   int_v SSE::   int_v::interleaveLow (   SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::   int_v SSE::   int_v::interleaveHigh(   SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::  uint_v SSE::  uint_v::interleaveLow (  SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::  uint_v SSE::  uint_v::interleaveHigh(  SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE:: short_v SSE:: short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE:: short_v SSE:: short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
// }}}1
// generate {{{1
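// The generator is invoked through named temporaries so the calls are sequenced in
// element order; the evaluation order of function arguments is unspecified in C++.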
template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    return _mm_setr_pd(tmp0, tmp1);
}
template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
// }}}1
// reversed {{{1
template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
{
    return Mem::permute<X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
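// For the 16-bit types there is no single permute over all eight elements: reverse
// each 64-bit half with permuteLo/permuteHi, then swap the two halves.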
template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
{
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
{
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
// }}}1
// permutation via operator[] {{{1
template <>
Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
#ifdef Vc_IMPL_AVX
                                                   perm
#endif
                                                   ) const
{
    /*
    const int_m cross128 = concat(_mm_cmpgt_epi32(lo128(perm.data()), _mm_set1_epi32(3)),
                                  _mm_cmplt_epi32(hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
        SSE::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#ifdef Vc_IMPL_AVX
    return _mm_permutevar_ps(d.v(), perm.data());
#else
    return *this; // TODO
#endif
}
// broadcast from constexpr index {{{1
template <> template <int Index> Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    return Mem::permute<Inner, Inner, Inner, Inner>(d.v());
}
template <> template <int Index> Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    return Mem::permute<Inner, Inner>(d.v());
}
// }}}1

namespace Common
{
// transpose_impl {{{1
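// Standard 4x4 float transpose: the unpacklo/unpackhi pairs interleave rows (0,2)
// and (1,3); a second round of interleaving produces the four transposed rows.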
Vc_ALWAYS_INLINE void transpose_impl(
    TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
    const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
{
    const auto in0 = std::get<0>(proxy.in).data();
    const auto in1 = std::get<1>(proxy.in).data();
    const auto in2 = std::get<2>(proxy.in).data();
    const auto in3 = std::get<3>(proxy.in).data();
    const auto tmp0 = _mm_unpacklo_ps(in0, in2);
    const auto tmp1 = _mm_unpacklo_ps(in1, in3);
    const auto tmp2 = _mm_unpackhi_ps(in0, in2);
    const auto tmp3 = _mm_unpackhi_ps(in1, in3);
    *r[0] = _mm_unpacklo_ps(tmp0, tmp1);
    *r[1] = _mm_unpackhi_ps(tmp0, tmp1);
    *r[2] = _mm_unpacklo_ps(tmp2, tmp3);
    *r[3] = _mm_unpackhi_ps(tmp2, tmp3);
}
// }}}1
} // namespace Common
} // namespace Vc_VERSIONED_NAMESPACE

// vim: foldmethod=marker