/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/*mask_count{{{*/
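// mask_count<N>: number of true entries in an N-element mask. Every true entry is a lane
// with all bits set, so the count equals the number of set sign bits: with POPCNT this is
// popcnt(movemask), and the SSE2 fallback shifts each lane down to 0/1 and sums the lanes
// horizontally. The 16-bit-lane case divides by two because _mm_movemask_epi8 contributes
// two bits per entry. E.g. for a 4-element mask (-1, 0, -1, 0), _mm_movemask_ps yields
// 0b0101 and the count is 2.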
template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
{
    int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
    return (mask & 1) + (mask >> 1);
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
#else
    auto x = _mm_srli_epi32(k, 31);
    x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(x);
#endif
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
#else
    auto x = _mm_srli_epi16(k, 15);
    x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_extract_epi16(x, 0);
#endif
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
    return Detail::popcnt16(_mm_movemask_epi8(k));
}
/*}}}*/
// mask_to_int/*{{{*/
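// mask_to_int<N>: compress the mask into an N-bit integer with one bit per entry (bit i
// set iff entry i is true). For N == 8 the 16-bit lanes are first packed down to bytes so
// that _mm_movemask_epi8 produces exactly one bit per entry.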
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
    return _mm_movemask_pd(_mm_castsi128_pd(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
    return _mm_movemask_ps(_mm_castsi128_ps(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
    return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
    return _mm_movemask_epi8(k);
}
/*}}}*/
// mask_store/*{{{*/
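// mask_store<N>: convert the all-ones/all-zeros lanes into bool bytes (1/0) and write N
// contiguous bytes to mem. The narrower cases shift and pack the lanes down to bytes
// first; aliasing_cast is used for the type-punned partial stores.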
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
    _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
    k = _mm_srli_epi16(k, 15);
    const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
    *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
#else
    _mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
    *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
        _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
                        _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
    mem[0] = -SseIntrinsics::extract_epi32<1>(k);
    mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
/*}}}*/
// mask_load/*{{{*/
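// mask_load<N>: the inverse of mask_store. Read N bool bytes and widen each 0/1 byte back
// into an all-zeros/all-ones lane: unpacking duplicates the bytes into wider lanes and the
// compare against zero saturates them to full masks.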
template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
    return sse_cast<__m128>(_mm_cmpgt_epi8(
        _mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
#ifdef __x86_64__
    __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const int64_t *>(mem));
#else
    __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(mem)));
#endif
    return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
    __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem));
    k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
    return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
    return sse_cast<__m128>(
        _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
/*}}}*/
// is_equal{{{
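// is_equal<N> / is_not_equal<N>: two masks are (un)equal iff their per-entry truth values
// agree, so comparing the movemask bit patterns of both operands is sufficient.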
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

// }}}
} // namespace Detail

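// double_m stores its two entries as 64-bit lanes, so _mm_movemask_epi8 yields eight
// identical bits per entry. Masking with 0x0101 keeps one bit per entry, and the single
// 16-bit store writes both bools as 0x00/0x01 bytes at once (little-endian x86).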
template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
    *aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
}
template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
    Detail::mask_store<Size>(dataI(), mem);
}
template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
{
    d.set(0, MaskBool(mem[0]));
    d.set(1, MaskBool(mem[1]));
}
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
    d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
}

// get / operator[] {{{1
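// For the 16-bit element masks, shiftMask() produces one bit per byte (the byte-wise
// movemask), so each entry occupies two bits and entry `index` is tested via bit 2*index.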
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}

// firstOne {{{1
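// firstOne: index of the lowest true entry, found via bit-scan-forward on the compressed
// mask bits. BSF leaves the result undefined when the input is zero, so callers presumably
// guarantee at least one true entry.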
template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
    const int mask = toInt();
#ifdef _MSC_VER
    unsigned long bit;
    _BitScanForward(&bit, mask);
#else
    int bit;
    __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
    return bit;
}

// generate {{{1
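// generate: build a mask from a callable `gen`, setting entry i to all-ones iff gen(i) is
// true. The helper overloads dispatch on the entry count via integral_constant; e.g.
// Mask<T, VectorAbi::Sse>::generate([](int i) { return i % 2 == 0; }) sets every even entry.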
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
    return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
                          gen(0) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
    return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
                          gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
    return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
                          gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
                          gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
                          gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
    return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
                                                  std::integral_constant<int, Size>());
}
// shifted {{{1
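// shifted: shift the mask entries by `amount` positions, shifting in false entries. The
// entry shift is converted to a byte offset for Detail::shifted<N>, which appears to be a
// whole-register byte shift; amounts beyond +/-16 bytes fall through to Zero().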
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case   0: return *this;
    case   1: return Detail::shifted<  1>(dataI());
    case   2: return Detail::shifted<  2>(dataI());
    case   3: return Detail::shifted<  3>(dataI());
    case   4: return Detail::shifted<  4>(dataI());
    case   5: return Detail::shifted<  5>(dataI());
    case   6: return Detail::shifted<  6>(dataI());
    case   7: return Detail::shifted<  7>(dataI());
    case   8: return Detail::shifted<  8>(dataI());
    case   9: return Detail::shifted<  9>(dataI());
    case  10: return Detail::shifted< 10>(dataI());
    case  11: return Detail::shifted< 11>(dataI());
    case  12: return Detail::shifted< 12>(dataI());
    case  13: return Detail::shifted< 13>(dataI());
    case  14: return Detail::shifted< 14>(dataI());
    case  15: return Detail::shifted< 15>(dataI());
    case  16: return Detail::shifted< 16>(dataI());
    case  -1: return Detail::shifted< -1>(dataI());
    case  -2: return Detail::shifted< -2>(dataI());
    case  -3: return Detail::shifted< -3>(dataI());
    case  -4: return Detail::shifted< -4>(dataI());
    case  -5: return Detail::shifted< -5>(dataI());
    case  -6: return Detail::shifted< -6>(dataI());
    case  -7: return Detail::shifted< -7>(dataI());
    case  -8: return Detail::shifted< -8>(dataI());
    case  -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    }
    return Zero();
}
// }}}1

}  // namespace Vc_VERSIONED_NAMESPACE

// vim: foldmethod=marker