/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "../common/x86_prefetches.h"
#include "../common/gatherimplementation.h"
#include "../common/scatterimplementation.h"
#include "limits.h"
#include "const.h"
#include "../common/set.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }

#ifdef Vc_IMPL_AVX2
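// AVX2 natively provides only integer equality and signed greater-than; the
// remaining comparisons are derived by negating ==, <, or >. The cmp*_epu*
// helpers wrap unsigned comparisons (presumably implemented in the intrinsics
// layer, e.g. by biasing both operands into the signed range).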
Vc_INTRINSIC AVX2::   int_m operator==(AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator==(AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator!=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator!=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator>=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator>=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator<=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator<=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator> (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator> (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator< (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator< (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif  // Vc_IMPL_AVX2

// bitwise operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return or_(a.data(), b.data());
}
// }}}1
// arithmetic operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
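// There is no 16-bit integer-division instruction, so unsigned short division
// goes through single precision: convert each 128-bit half to 8 floats,
// divide, and convert back. Quotients above 32767 would be mangled by the
// float->short conversion, so the (unlikely) large-quotient case takes the
// float->ushort conversion path instead.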
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
                                            AVX2::Vector<ushort> b)
{
    using namespace AVX;
    const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
                                    convert<ushort, float>(lo128(b.data())));
    const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
                                    convert<ushort, float>(hi128(b.data())));
    const float_v threshold = 32767.f;
    using Detail::operator>;
    const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
                                ? convert<float, ushort>(lo)
                                : convert<float, short>(lo);
    const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
                                ? convert<float, ushort>(hi)
                                : convert<float, short>(hi);
    return concat(loShort, hiShort);
}
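// Integer modulo via the identity a % b == a - (a / b) * b, reusing the
// vectorized division above.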
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
    AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return a - a / b * b;
}
// }}}1
} // namespace Detail
///////////////////////////////////////////////////////////////////////////////////////////
// generate {{{1
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
#ifdef Vc_IMPL_AVX2
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif

// constants {{{1
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}

template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vector<   int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector<  uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif

template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
    VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}

template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
// general load, implemented via LoadHelper {{{2
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
    template
#endif
    load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}

///////////////////////////////////////////////////////////////////////////////////////////
// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
    data() = Detail::zero<VectorType>();
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
    data() = Detail::andnot_(k.data(), data());
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(k.data(), data());
}

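// A float or double with all bits set is a quiet NaN (with sign bit set), so
// setQnan simply fills the register with ones; the masked variants OR the
// mask bits into the existing data.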
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_ps(data(), k.dataF());
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}

template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}

///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_AVX2
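// AVX2 provides per-element variable shifts only for 32-/64-bit lanes
// (vpsllvd/vpsrlvd/vpsravd); the 16-bit variants therefore fall back to an
// element-wise generate() loop.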
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this >> x;
}
#endif

template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
    d.v() = Detail::shiftRight(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
    return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
    d.v() = Detail::shiftLeft(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
    return Detail::shiftLeft(d.v(), shift, T());
}

// isnegative {{{1
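// isnegative: isolate the IEEE sign bit and smear it across the element with
// an arithmetic right shift. For double the 32-bit shift only fills the high
// half of each 64-bit element, so the result is duplicated into the low half
// with a permute.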
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
    return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
        AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
}
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
    return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
        AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// gathers {{{1
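// The unmasked gathers are implemented element-wise: Vc_M(i) reads
// args.address[Scale * args.indexes[i]] and the loaded scalars are packed
// with a setr intrinsic.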
#define Vc_GATHER_IMPL(V_) \
    template <> \
    template <class MT, class IT, int Scale> \
    inline void AVX2::V_::gatherImplementation( \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }

Vc_GATHER_IMPL(float_v)
{
    d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
                           Vc_M(7));
}

#ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)
{
    d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7));
}

Vc_GATHER_IMPL(uint_v)
{
    d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7));
}

Vc_GATHER_IMPL(short_v)
{
    d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
                              Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}

Vc_GATHER_IMPL(ushort_v)
{
    d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
                              Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL

template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    const auto *mem = args.address;
    const auto indexes = Scale * args.indexes;
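    // Choose the gather strategy at compile time: optionally SetIndexZero for
    // SIMD index types, otherwise a bit-scan loop, a popcnt-based switch, or
    // the plain per-element loop, depending on the configuration macros.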
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
          Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
          Common::GatherScatterImplementation::PopcntSwitch
#else
          Common::GatherScatterImplementation::SimpleLoop
#endif
          > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
    Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
          Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
          Common::GatherScatterImplementation::PopcntSwitch
#else
          Common::GatherScatterImplementation::SimpleLoop
#endif
          > ;
    Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
    AVX2::Vector<T> x = min();
    return std::make_pair(x, (*this == x).firstOne());
}
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
    AVX2::Vector<T> x = max();
    return std::make_pair(x, (*this == x).firstOne());
}
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
    /*
    // 28 cycles latency:
    __m256 x = _mm256_min_ps(Mem::permute128<X1, X0>(d.v()), d.v());
    x = _mm256_min_ps(x, Reg::permute<X2, X3, X0, X1>(x));
    AVX2::float_v xx = _mm256_min_ps(x, Reg::permute<X1, X0, X3, X2>(x));
    AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero();
    idx = _mm256_castps_si256(
        _mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data())));
    return std::make_pair(xx, (*this == xx).firstOne());

    __m128 loData = AVX::lo128(d.v());
    __m128 hiData = AVX::hi128(d.v());
    const __m128 less2 = _mm_cmplt_ps(hiData, loData);
    loData = _mm_min_ps(loData, hiData);
    hiData = Mem::permute<X2, X3, X0, X1>(loData);
    const __m128 less1 = _mm_cmplt_ps(hiData, loData);
    loData = _mm_min_ps(loData, hiData);
    hiData = Mem::permute<X1, X0, X3, X2>(loData);
    const __m128 less0 = _mm_cmplt_ps(hiData, loData);
    unsigned bits = _mm_movemask_ps(less0) & 0x1;
    bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2;
    bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4;
    loData = _mm_min_ps(loData, hiData);
    return std::make_pair(AVX::concat(loData, loData), bits);
    */

    // 28 cycles latency:
    __m256 x = d.v();
    __m256 idx = Vector<float>::IndexesFromZero().data();
    __m256 y = Mem::permute128<X1, X0>(x);
    __m256 idy = Mem::permute128<X1, X0>(idx);
    __m256 less = AVX::cmplt_ps(x, y);

    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    y = Reg::permute<X2, X3, X0, X1>(x);
    idy = Reg::permute<X2, X3, X0, X1>(idx);
    less = AVX::cmplt_ps(x, y);

    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    y = Reg::permute<X1, X0, X3, X2>(x);
    idy = Reg::permute<X1, X0, X3, X2>(idx);
    less = AVX::cmplt_ps(x, y);

    idx = _mm256_blendv_ps(idy, idx, less);

    const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
#ifdef Vc_GNU_ASM
    __asm__ __volatile__(""); // help GCC to order the instructions better
#endif
    x = _mm256_blendv_ps(y, x, less);
    return std::make_pair(x, index);
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
{
    //   a b c d e f g h
    // + a b c d e f g       -> a ab bc cd de ef fg gh
    // + a ab bc cd de ef    -> a ab abc abcd bcde cdef defg efgh
    // + a ab abc abcd       -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> tmp = *this;
    if (Size > 1) tmp += tmp.shifted(-1);
    if (Size > 2) tmp += tmp.shifted(-2);
    if (Size > 4) tmp += tmp.shifted(-4);
    if (Size > 8) tmp += tmp.shifted(-8);
    if (Size > 16) tmp += tmp.shifted(-16);
    return tmp;
}

/* This function requires correct masking because the neutral element of \p op is not necessarily 0
 *
template<typename T> template<typename BinaryOperation> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum(BinaryOperation op) const
{
    //   a b c d e f g h
    // + a b c d e f g       -> a ab bc cd de ef fg gh
    // + a ab bc cd de ef    -> a ab abc abcd bcde cdef defg efgh
    // + a ab abc abcd       -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> tmp = *this;
    Mask mask(true);
    if (Size > 1) tmp(mask) = op(tmp, tmp.shifted(-1));
    if (Size > 2) tmp(mask) = op(tmp, tmp.shifted(-2));
    if (Size > 4) tmp(mask) = op(tmp, tmp.shifted(-4));
    if (Size > 8) tmp(mask) = op(tmp, tmp.shifted(-8));
    if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16));
    return tmp;
}
*/

template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
    AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
    AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
    AVX2::Vector<T> tmp(Vc::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
    AVX2::Vector<T> tmp(Vc::Zero);
    tmp(m) = *this;
    return tmp.sum();
}//}}}
// exponent {{{1
namespace Detail
{
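// exponent: shift the biased IEEE exponent field down (23 bits for float, 52
// for double) and subtract the bias (0x7f resp. 0x3ff). The work happens on
// the two 128-bit halves because AVX(1) lacks 256-bit integer shifts; for
// double, the shuffle compacts the low 32 bits of each 64-bit result before
// the conversion to double.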
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
    __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
    __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
    return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
} // namespace Detail

Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// }}}1
// Random {{{1
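// _doRandomStep advances the global RandomState with a linear congruential
// generator (x * 0xdeece66d + 11, i.e. the low 32 bits of the rand48
// multiplier) and decorrelates two streams by XORing one state with the high
// bits of the other. Without AVX2 the 256-bit result is assembled from two
// SSE halves.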
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
    using Detail::operator*;
    using Detail::operator+;
#ifdef Vc_IMPL_AVX2
    using AVX2::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm256_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
    return state0.data();
#else
    using SSE::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    uint_v state2(&Common::RandomState[2 * uint_v::Size]);
    uint_v state3(&Common::RandomState[3 * uint_v::Size]);
    (state2 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[2 * uint_v::Size]);
    (state3 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[3 * uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state2.data(), 16)))
        .store(&Common::RandomState[0]);
    uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state3.data(), 16)))
        .store(&Common::RandomState[uint_v::Size]);
    return AVX::concat(state0.data(), state1.data());
#endif
}

#ifdef Vc_IMPL_AVX2
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
    return {_doRandomStep()};
}
#endif

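// float Random(): shift the random bits right by 2 so that OR-ing them over
// the bit pattern of 1.0f cannot set the sign bit or the one exponent bit
// that is zero in 1.0f; the remaining exponent bits are already set, so the
// result lies in [1, 2), and subtracting 1 yields [0, 1).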
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
    return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
                   HT::one());
}

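// double Random(): reinterpret the 8 x 32-bit state as 4 x 64-bit LCG states
// (advanced with the full rand48 constants 0x5deece66d and 11) and apply the
// same [1, 2) - 1 trick; the shift by 12 keeps exactly the 52 mantissa bits.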
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
    const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
                                       Detail::LoadTag<__m256i, int>());
    for (size_t k = 0; k < 8; k += 2) {
        typedef unsigned long long uint64 Vc_MAY_ALIAS;
        const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
        *aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    }
    return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
// }}}1
// shifted / rotated {{{1
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
    return Detail::shifted<EntryType>(d.v(), amount);
}

template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
    return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
    return Mem::shuffle128<X1, Y0>(left, right);
}

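// shifted with shift-in: when the shift amount is a compile-time constant,
// the concatenation of *this and shiftIn can be realized with (v)palignr byte
// alignment; shifting by half the vector width degenerates to a 128-bit
// shuffle. The generic fallback ORs two zero-padded shifts together.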
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
    if (__builtin_constant_p(amount)) {
        const __m256i a = AVX::avx_cast<__m256i>(d.v());
        const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
        if (amount * 2 == int(Size)) {
            return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
        }
        if (amount * 2 == -int(Size)) {
            return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
        }
        switch (amount) {
        case 1:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                );
        case 2:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   2 * sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                );
        case 3:
            if (6u < Size) {
                return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                    _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                       3 * sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                    AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
                                                3 * sizeof(EntryType)),
                                _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
                                                3 * sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                    );
                // TODO: } else {
            }
        }
    }
#endif
    using Detail::operator|;
    return shifted(amount) | (amount > 0 ?
                              shiftIn.shifted(amount - Size) :
                              shiftIn.shifted(Size + amount));
}
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
    return Detail::rotated<EntryType, size()>(d.v(), amount);
}
// sorted {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
    const
{
    return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
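// interleaveLow/-High: the 256-bit unpack instructions interleave within each
// 128-bit lane only, so a shuffle128 afterwards selects the low (X0/Y0) or
// high (X1/Y1) lane halves in the correct order.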
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
                                   _mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
                                   _mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
                                   _mm256_unpackhi_ps(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
                                   _mm256_unpackhi_ps(data(), x.data()));
}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
#endif
// permutation via operator[] {{{1
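// Reversal swaps the two 128-bit lanes and reverses the elements within each
// lane; for 16-bit elements the in-lane reversal itself needs two 64-bit half
// permutes combined with a shuffle.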
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &/*perm*/) const
{
    // TODO
    return *this;
#ifdef Vc_IMPL_AVX2
#else
    /*
    const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)),
                                       _mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
        AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#endif
}

// reversed {{{1
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
    return (*this)[Permutation::Reversed];
}

// broadcast from constexpr index {{{1
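// broadcast<Index>: split the element index into the 128-bit lane (Outer) and
// the position within that lane (Inner), then duplicate first the lane and
// then the element with two constant permutes.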
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
    return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
    return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
// }}}1
} // namespace Vc_VERSIONED_NAMESPACE

// vim: foldmethod=marker