Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:53:31

/*
 *          Copyright Andrey Semashev 2013, 2022.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          https://www.boost.org/LICENSE_1_0.txt)
 */
/*!
 * \file   uuid/detail/uuid_x86.ipp
 *
 * \brief  This header contains optimized SSE implementation of \c boost::uuid operations.
 */

#ifndef BOOST_UUID_DETAIL_UUID_X86_IPP_INCLUDED_
#define BOOST_UUID_DETAIL_UUID_X86_IPP_INCLUDED_

// MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set
#if defined(BOOST_UUID_USE_SSE41)
#include <smmintrin.h>
#elif defined(BOOST_UUID_USE_SSE3)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif
0024 
#if defined(BOOST_MSVC) && defined(_M_X64) && (BOOST_MSVC < 1900 /* Fixed in Visual Studio 2015 */ )
// At least MSVC 9 (VS2008) and 12 (VS2013) have an optimizer bug that sometimes results in incorrect SIMD code
// generated in Release x64 mode. In particular, it affects operator==, where the compiler sometimes generates
// pcmpeqd with a memory operand instead of movdqu followed by pcmpeqd. The problem is that uuid can be
// not aligned to 16 bytes and pcmpeqd causes alignment violation in this case. We cannot be sure that other
// MSVC versions are not affected so we apply the workaround for all versions, except VS2015 and up where
// the bug has been fixed.
//
// https://svn.boost.org/trac/boost/ticket/8509#comment:3
// https://connect.microsoft.com/VisualStudio/feedbackdetail/view/981648#tabs
#define BOOST_UUID_DETAIL_MSVC_BUG981648
#if BOOST_MSVC >= 1600
extern "C" void _ReadWriteBarrier(void);
#pragma intrinsic(_ReadWriteBarrier)
#endif
#endif

namespace boost {
namespace uuids {
namespace detail {
0046 BOOST_FORCEINLINE __m128i load_unaligned_si128(const uint8_t* p) BOOST_NOEXCEPT
0047 {
0048 #if !defined(BOOST_UUID_DETAIL_MSVC_BUG981648) || defined(BOOST_UUID_USE_AVX)
0049     return _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
0050 #elif defined(BOOST_MSVC) && BOOST_MSVC >= 1600
0051     __m128i mm = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
0052     // Make sure this load doesn't get merged with the subsequent instructions
0053     _ReadWriteBarrier();
0054     return mm;
0055 #else
0056     // VS2008 x64 doesn't respect _ReadWriteBarrier above, so we have to generate this crippled code to load unaligned data
0057     return _mm_unpacklo_epi64(_mm_loadl_epi64(reinterpret_cast< const __m128i* >(p)), _mm_loadl_epi64(reinterpret_cast< const __m128i* >(p + 8)));
0058 #endif
0059 }

} // namespace detail

0063 inline bool uuid::is_nil() const BOOST_NOEXCEPT
0064 {
0065     __m128i mm = uuids::detail::load_unaligned_si128(data);
0066 #if defined(BOOST_UUID_USE_SSE41)
0067     return _mm_test_all_zeros(mm, mm) != 0;
0068 #else
0069     mm = _mm_cmpeq_epi32(mm, _mm_setzero_si128());
0070     return _mm_movemask_epi8(mm) == 0xFFFF;
0071 #endif
0072 }
0074 inline void uuid::swap(uuid& rhs) BOOST_NOEXCEPT
0075 {
0076     __m128i mm_this = uuids::detail::load_unaligned_si128(data);
0077     __m128i mm_rhs = uuids::detail::load_unaligned_si128(rhs.data);
0078     _mm_storeu_si128(reinterpret_cast< __m128i* >(rhs.data), mm_this);
0079     _mm_storeu_si128(reinterpret_cast< __m128i* >(data), mm_rhs);
0080 }
0082 inline bool operator== (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
0083 {
0084     __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
0085     __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);
0086 
0087 #if defined(BOOST_UUID_USE_SSE41)
0088     __m128i mm = _mm_xor_si128(mm_left, mm_right);
0089     return _mm_test_all_zeros(mm, mm) != 0;
0090 #else
0091     __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right);
0092     return _mm_movemask_epi8(mm_cmp) == 0xFFFF;
0093 #endif
0094 }
0096 inline bool operator< (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
0097 {
0098     __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
0099     __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);
0100 
0101     // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and reverse one.
0102     // Then we know which bytes are equivalent and which ones are different, and for those different the comparison results
0103     // will be opposite. Then we'll be able to find the first differing comparison result (for both forward and reverse ways),
0104     // and depending on which way it is for, this will be the result of the operation. There are a few notes to consider:
0105     //
0106     // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers,
0107     //    so the comparison results in the least significant bits will actually be the most signigicant for the final operation result.
0108     //    This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with
0109     //    the "(x - 1) ^ x" trick.
0110     // 2. Because there is only signed comparison in SSE/AVX, we have to invert byte comparison results whenever signs of the corresponding
0111     //    bytes are different. I.e. in signed comparison it's -1 < 1, but in unsigned it is the opposite (255 > 1). To do that we XOR left and right,
0112     //    making the most significant bit of each byte 1 if the signs are different, and later apply this mask with another XOR to the comparison results.
0113     // 3. pcmpgtw compares for "greater" relation, so we swap the arguments to get what we need.
0114 
0115     const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right);
0116 
0117     __m128i mm_cmp = _mm_cmpgt_epi8(mm_right, mm_left), mm_rcmp = _mm_cmpgt_epi8(mm_left, mm_right);
0118 
0119     mm_cmp = _mm_xor_si128(mm_signs_mask, mm_cmp);
0120     mm_rcmp = _mm_xor_si128(mm_signs_mask, mm_rcmp);
0121 
0122     uint32_t cmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_cmp)), rcmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_rcmp));
0123 
0124     cmp = (cmp - 1u) ^ cmp;
0125     rcmp = (rcmp - 1u) ^ rcmp;
0126 
0127     return cmp < rcmp;
0128 }

} // namespace uuids
} // namespace boost

#endif // BOOST_UUID_DETAIL_UUID_X86_IPP_INCLUDED_