/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_detail__machine_H
#define __TBB_detail__machine_H

#include "_config.h"
#include "_assert.h"

#include <atomic>
#include <climits>
#include <cstdint>
#include <cstddef>

#ifdef _WIN32
#include <intrin.h>
#ifdef __TBBMALLOC_BUILD
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h> // SwitchToThread()
#endif
#ifdef _MSC_VER
#if __TBB_x86_64 || __TBB_x86_32
#pragma intrinsic(__rdtsc)
#endif
#endif
#endif
#if __TBB_x86_64 || __TBB_x86_32
#include <immintrin.h> // _mm_pause
#endif
#if (_WIN32)
#include <float.h> // _control87
#endif

#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
#include <sched.h> // sched_yield
#else
#include <thread> // std::this_thread::yield()
#endif

namespace tbb {
namespace detail {
inline namespace d0 {

//--------------------------------------------------------------------------------------------------
// Yield implementation
//--------------------------------------------------------------------------------------------------

#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
static inline void yield() {
    int err = sched_yield();
    __TBB_ASSERT_EX(err == 0, "sched_yield has failed");
}
#elif __TBBMALLOC_BUILD && _WIN32
// Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations.
static inline void yield() {
    SwitchToThread();
}
#else
using std::this_thread::yield;
#endif

//--------------------------------------------------------------------------------------------------
// atomic_fence_seq_cst implementation
//--------------------------------------------------------------------------------------------------

static inline void atomic_fence_seq_cst() {
#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
    unsigned char dummy = 0u;
    __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
#else
    std::atomic_thread_fence(std::memory_order_seq_cst);
#endif
}
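
// Note (illustrative comment, not part of the original header): on the affected GCC versions for
// x86, the fence above is implemented as a LOCK-prefixed read-modify-write on a dummy byte, which
// acts as a full memory barrier on x86, instead of relying on the compiler-generated
// std::atomic_thread_fence. A minimal usage sketch, assuming flag_a and flag_b are std::atomic<int>:
//
//     flag_a.store(1, std::memory_order_relaxed);
//     atomic_fence_seq_cst();                        // full barrier between the store and the load
//     int observed = flag_b.load(std::memory_order_relaxed);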

//--------------------------------------------------------------------------------------------------
// Pause implementation
//--------------------------------------------------------------------------------------------------

static inline void machine_pause(int32_t delay) {
#if __TBB_x86_64 || __TBB_x86_32
    while (delay-- > 0) { _mm_pause(); }
#elif __ARM_ARCH_7A__ || __aarch64__
    while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
#else /* Generic */
    (void)delay; // suppress without including _template_helpers.h
    yield();
#endif
}
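
// Usage sketch (illustrative comment, not part of the original header): machine_pause() is meant
// for spin-wait loops, typically with exponential backoff before falling back to yield(). The loop
// below is a hypothetical caller, assuming 'ready' is a std::atomic<bool>; it is not TBB's actual
// backoff helper.
//
//     int32_t delay = 1;
//     while (!ready.load(std::memory_order_acquire)) {
//         if (delay <= 16) { machine_pause(delay); delay *= 2; }
//         else             { yield(); }
//     }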

////////////////////////////////////////////////////////////////////////////////////////////////////
// tbb::detail::log2() implementation
////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO: Use log2p1() function that will be available in C++20 standard

#if defined(__GNUC__) || defined(__clang__)
namespace gnu_builtins {
    inline uintptr_t clz(unsigned int x) { return static_cast<uintptr_t>(__builtin_clz(x)); }
    inline uintptr_t clz(unsigned long int x) { return static_cast<uintptr_t>(__builtin_clzl(x)); }
    inline uintptr_t clz(unsigned long long int x) { return static_cast<uintptr_t>(__builtin_clzll(x)); }
}
#elif defined(_MSC_VER)
#pragma intrinsic(__TBB_W(_BitScanReverse))
namespace msvc_intrinsics {
    static inline uintptr_t bit_scan_reverse(uintptr_t i) {
        unsigned long j;
        __TBB_W(_BitScanReverse)( &j, i );
        return j;
    }
}
#endif

template <typename T>
constexpr std::uintptr_t number_of_bits() {
    return sizeof(T) * CHAR_BIT;
}

// logarithm is the index of the most significant non-zero bit
static inline uintptr_t machine_log2(uintptr_t x) {
#if defined(__GNUC__) || defined(__clang__)
    // If P is a power of 2 and x<P, then (P-1)-x == (P-1) XOR x
    return (number_of_bits<decltype(x)>() - 1) ^ gnu_builtins::clz(x);
#elif defined(_MSC_VER)
    return msvc_intrinsics::bit_scan_reverse(x);
#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__
    uintptr_t j, i = x;
    __asm__("bsr %1,%0" : "=r"(j) : "r"(i));
    return j;
#elif __powerpc__ || __POWERPC__
    #if __TBB_WORDSIZE==8
    __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
    return 63 - static_cast<intptr_t>(x);
    #else
    __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
    return 31 - static_cast<intptr_t>(x);
    #endif /*__TBB_WORDSIZE*/
#elif __sparc
    uint64_t count;
    // one hot encode
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    x |= (x >> 32);
    // count 1's
    __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) );
    return count - 1;
#else
    intptr_t result = 0;

    // Nested ifs: a declaration cannot appear as an operand of &&, and the split shift keeps the count valid on 32-bit targets.
    if( sizeof(x) > 4 ) { if( uintptr_t tmp = x >> 16 >> 16 ) { x = tmp; result += 32; } }
    if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; }
    if( uintptr_t tmp = x >> 8 )  { x = tmp; result += 8; }
    if( uintptr_t tmp = x >> 4 )  { x = tmp; result += 4; }
    if( uintptr_t tmp = x >> 2 )  { x = tmp; result += 2; }

    return (x & 2) ? result + 1 : result;
#endif
}
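
// Worked example (illustrative comment, not part of the original header): for the GNU/Clang path,
// if the most significant set bit of x is at index k, then clz(x) == number_of_bits - 1 - k, and
// XOR-ing with (number_of_bits - 1) is the same as subtracting from it (the (P-1)-x == (P-1) XOR x
// identity noted above), which recovers k. For instance:
//
//     machine_log2(1)  == 0      // 0b1
//     machine_log2(2)  == 1      // 0b10
//     machine_log2(40) == 5      // 0b101000, most significant set bit at index 5
//
// The result is unspecified for x == 0, since the clz builtins are undefined for a zero argument.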

////////////////////////////////////////////////////////////////////////////////////////////////////
// tbb::detail::reverse_bits() implementation
////////////////////////////////////////////////////////////////////////////////////////////////////
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
namespace llvm_builtins {
    inline uint8_t  builtin_bitreverse(uint8_t  x) { return __builtin_bitreverse8 (x); }
    inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); }
    inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); }
    inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); }
}
#else // generic
template<typename T>
struct reverse {
    static const T byte_table[256];
};

template<typename T>
const T reverse<T>::byte_table[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

inline unsigned char reverse_byte(unsigned char src) {
    return reverse<unsigned char>::byte_table[src];
}
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS

template<typename T>
T machine_reverse_bits(T src) {
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
    return builtin_bitreverse(fixed_width_cast(src));
#else /* Generic */
    T dst;
    unsigned char *original = reinterpret_cast<unsigned char *>(&src);
    unsigned char *reversed = reinterpret_cast<unsigned char *>(&dst);

    for ( int i = sizeof(T) - 1; i >= 0; i-- ) {
        reversed[i] = reverse_byte( original[sizeof(T) - i - 1] );
    }

    return dst;
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
}
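
// Worked example (illustrative comment, not part of the original header): the generic path reverses
// the byte order of the value and the bit order within each byte via the lookup table, which
// together reverse the whole bit pattern. For instance:
//
//     reverse_byte(0x01) == 0x80
//     reverse_byte(0x0F) == 0xF0
//     machine_reverse_bits(uint16_t(0x0001)) == 0x8000   // bit 0 moves to bit 15
//     machine_reverse_bits(uint16_t(0x00F0)) == 0x0F00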

} // inline namespace d0

namespace d1 {

#if (_WIN32)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
struct cpu_ctl_env {
    unsigned int x87cw{};
#if (__TBB_x86_64)
    // Changing the infinity mode or the floating-point precision is not supported on x64.
    // The attempt causes an assertion. See
    // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2
    static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC;
#else
    static constexpr unsigned int X87CW_CONTROL_MASK = ~0U;
#endif
#if (__TBB_x86_32 || __TBB_x86_64)
    unsigned int mxcsr{};
    static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */
#endif

    bool operator!=( const cpu_ctl_env& ctl ) const {
        return
#if (__TBB_x86_32 || __TBB_x86_64)
            mxcsr != ctl.mxcsr ||
#endif
            x87cw != ctl.x87cw;
    }
    void get_env() {
        x87cw = _control87(0, 0);
#if (__TBB_x86_32 || __TBB_x86_64)
        mxcsr = _mm_getcsr();
#endif
    }
    void set_env() const {
        _control87(x87cw, X87CW_CONTROL_MASK);
#if (__TBB_x86_32 || __TBB_x86_64)
        _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK);
#endif
    }
};
#elif (__TBB_x86_32 || __TBB_x86_64)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
struct cpu_ctl_env {
    int     mxcsr{};
    short   x87cw{};
    static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */

    bool operator!=(const cpu_ctl_env& ctl) const {
        return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw;
    }
    void get_env() {
        __asm__ __volatile__(
            "stmxcsr %0\n\t"
            "fstcw %1"
            : "=m"(mxcsr), "=m"(x87cw)
        );
        mxcsr &= MXCSR_CONTROL_MASK;
    }
    void set_env() const {
        __asm__ __volatile__(
            "ldmxcsr %0\n\t"
            "fldcw %1"
            : : "m"(mxcsr), "m"(x87cw)
        );
    }
};
#endif
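
// Usage sketch (illustrative comment, not part of the original header): both cpu_ctl_env variants
// expose the same capture/compare/restore interface, which TBB uses to propagate floating-point
// control settings. A hypothetical caller:
//
//     cpu_ctl_env captured;
//     captured.get_env();                 // snapshot the current x87/MXCSR control state
//     /* ... code that may change the rounding mode or exception masks ... */
//     cpu_ctl_env current;
//     current.get_env();
//     if (current != captured) captured.set_env();   // restore the snapshot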

} // namespace d1

} // namespace detail
} // namespace tbb

#if !__TBB_CPU_CTL_ENV_PRESENT
#include <fenv.h>

#include <cstring>

namespace tbb {
namespace detail {

namespace r1 {
void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
} // namespace r1

namespace d1 {

class cpu_ctl_env {
    fenv_t *my_fenv_ptr;
public:
    cpu_ctl_env() : my_fenv_ptr(nullptr) {}
    ~cpu_ctl_env() {
        if ( my_fenv_ptr )
            r1::cache_aligned_deallocate( (void*)my_fenv_ptr );
    }
    // It would be possible to copy just the pointer instead of the memory, but the following issues would need
    // to be addressed:
    //   1. The arena lifetime and the context lifetime are independent;
    //   2. The user is allowed to recapture different FPU settings into a context, so the 'current FPU settings'
    //      inside the dispatch loop may become invalid.
    // However, rather than improving this fenv-based implementation, it seems better to replace it with a
    // platform-specific one.
    cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) {
        *this = src;
    }
    cpu_ctl_env& operator=( const cpu_ctl_env &src ) {
        __TBB_ASSERT( src.my_fenv_ptr, nullptr);
        if ( !my_fenv_ptr )
            my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
        *my_fenv_ptr = *src.my_fenv_ptr;
        return *this;
    }
    bool operator!=( const cpu_ctl_env &ctl ) const {
        __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
        __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." );
        return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) );
    }
    void get_env () {
        if ( !my_fenv_ptr )
            my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
        fegetenv( my_fenv_ptr );
    }
    const cpu_ctl_env& set_env () const {
        __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
        fesetenv( my_fenv_ptr );
        return *this;
    }
};

} // namespace d1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_CPU_CTL_ENV_PRESENT */

#endif // __TBB_detail__machine_H