/include/Vc/sse/mask.tcc

/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
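// The helpers in Detail operate on SSE mask registers: __m128i values in which
// every lane is either all ones (true) or all zeros (false). Specializations
// are provided for masks with 2, 4, 8, and 16 lanes.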
/*mask_count{{{*/
template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
{
    int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
    return (mask & 1) + (mask >> 1);
}

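// For 4 lanes: with POPCNT, count the bits of the movemask directly; the
// fallback shifts each 32-bit lane down to its sign bit (0 or 1) and sums the
// lanes horizontally.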
template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
#else
    auto x = _mm_srli_epi32(k, 31);
    x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(x);
#endif
}

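// For 8 lanes: _mm_movemask_epi8 yields two bits per 16-bit lane, so the
// POPCNT result is halved; the fallback again sums the per-lane sign bits.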
template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
#else
    auto x = _mm_srli_epi16(k, 15);
    x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_extract_epi16(x, 0);
#endif
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
    return Detail::popcnt16(_mm_movemask_epi8(k));
}
/*}}}*/
// mask_to_int/*{{{*/
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
    return _mm_movemask_pd(_mm_castsi128_pd(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
    return _mm_movemask_ps(_mm_castsi128_ps(k));
}
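// Pack the 16-bit lanes down to bytes first so that each lane contributes
// exactly one bit to the movemask.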
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
    return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
    return _mm_movemask_epi8(k);
}
/*}}}*/
// mask_store/*{{{*/
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
    _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
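// 8 lanes: reduce each 16-bit lane to 0/1, pack to bytes, and store the low
// eight bytes of the register.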
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
    k = _mm_srli_epi16(k, 15);
    const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
    *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
#else
    _mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
    *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
        _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
                        _mm_setzero_si128()));
}
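// 2 lanes: each 64-bit lane is entirely ones or zeros, so negating one of its
// 32-bit halves (0 or -1) yields the bool value 0 or 1.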
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
    mem[0] = -SseIntrinsics::extract_epi32<1>(k);
    mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
/*}}}*/
// mask_load/*{{{*/
template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
    return sse_cast<__m128>(_mm_cmpgt_epi8(
        _mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
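// 8 lanes: load the eight bool bytes, widen them to 16-bit lanes by unpacking
// the register with itself, and compare against zero to get all-ones lanes.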
template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
#ifdef __x86_64__
    __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const int64_t *>(mem));
#else
    __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(mem)));
#endif
    return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
    __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem));
    k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
    return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
    return sse_cast<__m128>(
        _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
/*}}}*/
// is_equal{{{
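// Two masks compare equal iff their movemask bit patterns are identical.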
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

// }}}
}  // namespace Detail

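// double_m stores both bool bytes at once: _mm_movemask_epi8 collects one bit
// per byte, and masking with 0x0101 keeps bit 0 (lane 0) and bit 8 (lane 1),
// which are exactly the two bytes of the uint16_t written to mem.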
template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
    *aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
}
template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
    Detail::mask_store<Size>(dataI(), mem);
}
template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
{
    d.set(0, MaskBool(mem[0]));
    d.set(1, MaskBool(mem[1]));
}
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
    d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
}

// get / operator[] {{{1
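// short/ushort masks live in 16-bit lanes, so shiftMask() carries two bits per
// entry; bit 2*index therefore selects the entry at 'index'.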
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}

// firstOne {{{1
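// Index of the lowest set entry, found via bit-scan-forward on the bit mask.
// The result is undefined if the mask is empty (BSF with a zero input).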
template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
    const int mask = toInt();
#ifdef _MSC_VER
    unsigned long bit;
    _BitScanForward(&bit, mask);
#else
    int bit;
    __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
    return bit;
}

// generate {{{1
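// generate_impl builds a mask by evaluating the generator functor once per
// lane, tag-dispatched on the entry count. For example,
// SSE::float_m::generate([](int i) { return i % 2 == 1; }) sets the odd lanes.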
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
    return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
                          gen(0) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
    return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
                          gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
    return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
                          gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
                          gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
                          gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
    return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
                                  std::integral_constant<int, Size>());
}
// shifted {{{1
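// Shift the mask by 'amount' entries (positive or negative). The entry count
// is translated into a byte shift of the underlying register; any shift beyond
// the 16-byte register width yields an empty mask.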
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case   0: return *this;
    case   1: return Detail::shifted<  1>(dataI());
    case   2: return Detail::shifted<  2>(dataI());
    case   3: return Detail::shifted<  3>(dataI());
    case   4: return Detail::shifted<  4>(dataI());
    case   5: return Detail::shifted<  5>(dataI());
    case   6: return Detail::shifted<  6>(dataI());
    case   7: return Detail::shifted<  7>(dataI());
    case   8: return Detail::shifted<  8>(dataI());
    case   9: return Detail::shifted<  9>(dataI());
    case  10: return Detail::shifted< 10>(dataI());
    case  11: return Detail::shifted< 11>(dataI());
    case  12: return Detail::shifted< 12>(dataI());
    case  13: return Detail::shifted< 13>(dataI());
    case  14: return Detail::shifted< 14>(dataI());
    case  15: return Detail::shifted< 15>(dataI());
    case  16: return Detail::shifted< 16>(dataI());
    case  -1: return Detail::shifted< -1>(dataI());
    case  -2: return Detail::shifted< -2>(dataI());
    case  -3: return Detail::shifted< -3>(dataI());
    case  -4: return Detail::shifted< -4>(dataI());
    case  -5: return Detail::shifted< -5>(dataI());
    case  -6: return Detail::shifted< -6>(dataI());
    case  -7: return Detail::shifted< -7>(dataI());
    case  -8: return Detail::shifted< -8>(dataI());
    case  -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    }
    return Zero();
}
// }}}1

}  // namespace Vc_VERSIONED_NAMESPACE

// vim: foldmethod=marker