Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2024-05-18 08:30:12

0001 #pragma once
0002 #ifndef PSIMD_H
0003 #define PSIMD_H
0004 
/*
 * PSIMD_INTRINSIC: declaration specifiers applied to every psimd function.
 * The first matching compiler check wins; checks whose expansion is the same
 * and which are adjacent in priority are folded into one condition.
 */
#if defined(__CUDA_ARCH__)
    /* CUDA device compilation */
    #define PSIMD_INTRINSIC __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
    /* OpenCL compiler */
    #define PSIMD_INTRINSIC inline static
#elif defined(__INTEL_COMPILER) || defined(__GNUC__)
    /* Intel compiler (even on Windows) or GCC-compatible compiler (gcc/clang/icc) */
    #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(_MSC_VER)
    /* MSVC-compatible compiler (cl/icl/clang-cl) */
    #define PSIMD_INTRINSIC __forceinline static
#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))
    /* Generic C++ compiler or generic C99 compiler */
    #define PSIMD_INTRINSIC inline static
#else
    /* Generic pre-C99 compiler: no inline keyword available */
    #define PSIMD_INTRINSIC static
#endif
0030 
0031 #if defined(__GNUC__) || defined(__clang__)
0032     #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0033         #include <arm_neon.h>
0034     #endif
0035 
0036     #if defined(__SSE2__)
0037         #include <emmintrin.h>
0038     #endif
0039 
0040     #if defined(__SSE3__)
0041         #include <pmmintrin.h>
0042     #endif
0043 
0044     #if defined(__SSSE3__)
0045         #include <tmmintrin.h>
0046     #endif
0047 
0048     #if defined(__SSE4_1__)
0049         #include <smmintrin.h>
0050     #endif
0051 
0052     #if defined(__SSE4_2__)
0053         #include <nmmintrin.h>
0054     #endif
0055 
0056     #if defined(__AVX__)
0057         #include <immintrin.h>
0058     #endif
0059 #elif defined(_MSC_VER)
0060     #include <intrin.h>
0061 #endif
0062 
/* Exactly one of PSIMD_{CXX,C11,C99,C89}_SYNTAX is defined, based on the language mode. */
#if defined(__cplusplus)
    #define PSIMD_CXX_SYNTAX
#elif !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L)
    #define PSIMD_C89_SYNTAX
#elif __STDC_VERSION__ >= 201112L
    #define PSIMD_C11_SYNTAX
#else
    #define PSIMD_C99_SYNTAX
#endif
0072 
0073 #if defined(__cplusplus) && (__cplusplus >= 201103L)
0074     #include <cstddef>
0075     #include <cstdint>
0076 #elif !defined(__OPENCL_VERSION__)
0077     #include <stddef.h>
0078     #include <stdint.h>
0079 #endif
0080 
0081 #if defined(__GNUC__) || defined(__clang__)
/* Feature flags: 1 when the corresponding element type has a psimd vector type, 0 otherwise. */
#define PSIMD_HAVE_F64 0
#define PSIMD_HAVE_F32 1
#define PSIMD_HAVE_U8  1
#define PSIMD_HAVE_S8  1
#define PSIMD_HAVE_U16 1
#define PSIMD_HAVE_S16 1
#define PSIMD_HAVE_U32 1
#define PSIMD_HAVE_S32 1
#define PSIMD_HAVE_U64 0
#define PSIMD_HAVE_S64 0
0092 
/*
 * 16-byte vector types built on the GCC/Clang vector extension.
 * Each type's alignment is lowered to the size of one element.
 */
typedef int8_t   psimd_s8  __attribute__((__vector_size__(16), __aligned__(1)));
typedef uint8_t  psimd_u8  __attribute__((__vector_size__(16), __aligned__(1)));
typedef int16_t  psimd_s16 __attribute__((__vector_size__(16), __aligned__(2)));
typedef uint16_t psimd_u16 __attribute__((__vector_size__(16), __aligned__(2)));
typedef int32_t  psimd_s32 __attribute__((__vector_size__(16), __aligned__(4)));
typedef uint32_t psimd_u32 __attribute__((__vector_size__(16), __aligned__(4)));
typedef float    psimd_f32 __attribute__((__vector_size__(16), __aligned__(4)));
0100 
0101     typedef struct {
0102         psimd_s8 lo;
0103         psimd_s8 hi;
0104     } psimd_s8x2;
0105 
0106     typedef struct {
0107         psimd_u8 lo;
0108         psimd_u8 hi;
0109     } psimd_u8x2;
0110 
0111     typedef struct {
0112         psimd_s16 lo;
0113         psimd_s16 hi;
0114     } psimd_s16x2;
0115 
0116     typedef struct {
0117         psimd_u16 lo;
0118         psimd_u16 hi;
0119     } psimd_u16x2;
0120 
0121     typedef struct {
0122         psimd_s32 lo;
0123         psimd_s32 hi;
0124     } psimd_s32x2;
0125 
0126     typedef struct {
0127         psimd_u32 lo;
0128         psimd_u32 hi;
0129     } psimd_u32x2;
0130 
0131     typedef struct {
0132         psimd_f32 lo;
0133         psimd_f32 hi;
0134     } psimd_f32x2;
0135 
0136     /* Bit casts */
0137     PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
0138         return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
0139     }
0140 
0141     PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
0142         return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
0143     }
0144 
0145     PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
0146         return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
0147     }
0148 
0149     PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
0150         return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
0151     }
0152 
0153     PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
0154         return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
0155     }
0156 
0157     PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
0158         return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
0159     }
0160 
0161     /* Swap */
0162     PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
0163         const psimd_s8 new_a = *b;
0164         const psimd_s8 new_b = *a;
0165         *a = new_a;
0166         *b = new_b;
0167     }
0168 
0169     PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
0170         const psimd_u8 new_a = *b;
0171         const psimd_u8 new_b = *a;
0172         *a = new_a;
0173         *b = new_b;
0174     }
0175 
0176     PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
0177         const psimd_s16 new_a = *b;
0178         const psimd_s16 new_b = *a;
0179         *a = new_a;
0180         *b = new_b;
0181     }
0182 
0183     PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
0184         const psimd_u16 new_a = *b;
0185         const psimd_u16 new_b = *a;
0186         *a = new_a;
0187         *b = new_b;
0188     }
0189 
0190     PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
0191         const psimd_s32 new_a = *b;
0192         const psimd_s32 new_b = *a;
0193         *a = new_a;
0194         *b = new_b;
0195     }
0196 
0197     PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
0198         const psimd_u32 new_a = *b;
0199         const psimd_u32 new_b = *a;
0200         *a = new_a;
0201         *b = new_b;
0202     }
0203 
0204     PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
0205         const psimd_f32 new_a = *b;
0206         const psimd_f32 new_b = *a;
0207         *a = new_a;
0208         *b = new_b;
0209     }
0210 
0211     /* Zero-initialization */
0212     PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
0213         return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
0214     }
0215 
0216     PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
0217         return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
0218     }
0219 
0220     PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
0221         return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
0222     }
0223 
0224     PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
0225         return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
0226     }
0227 
0228     PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
0229         return (psimd_s32) { 0, 0, 0, 0 };
0230     }
0231 
0232     PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
0233         return (psimd_u32) { 0, 0, 0, 0 };
0234     }
0235 
0236     PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
0237         return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
0238     }
0239 
0240     /* Initialization to the same constant */
0241     PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
0242         return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
0243     }
0244 
0245     PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
0246         return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
0247     }
0248 
0249     PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
0250         return (psimd_s16) { c, c, c, c, c, c, c, c };
0251     }
0252 
0253     PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
0254         return (psimd_u16) { c, c, c, c, c, c, c, c };
0255     }
0256 
0257     PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
0258         return (psimd_s32) { c, c, c, c };
0259     }
0260 
0261     PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
0262         return (psimd_u32) { c, c, c, c };
0263     }
0264 
0265     PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
0266         return (psimd_f32) { c, c, c, c };
0267     }
0268 
0269     /* Load vector */
0270     PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
0271         return *((const psimd_s8*) address);
0272     }
0273 
0274     PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
0275         return *((const psimd_u8*) address);
0276     }
0277 
0278     PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
0279         return *((const psimd_s16*) address);
0280     }
0281 
0282     PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
0283         return *((const psimd_u16*) address);
0284     }
0285 
0286     PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
0287         return *((const psimd_s32*) address);
0288     }
0289 
0290     PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
0291         return *((const psimd_u32*) address);
0292     }
0293 
0294     PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
0295         return *((const psimd_f32*) address);
0296     }
0297 
0298     PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
0299         return psimd_splat_s8(*((const int8_t*) address));
0300     }
0301 
0302     PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
0303         return psimd_splat_u8(*((const uint8_t*) address));
0304     }
0305 
0306     PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
0307         return psimd_splat_s16(*((const int16_t*) address));
0308     }
0309 
0310     PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
0311         return psimd_splat_u16(*((const uint16_t*) address));
0312     }
0313 
0314     PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
0315         return psimd_splat_s32(*((const int32_t*) address));
0316     }
0317 
0318     PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
0319         return psimd_splat_u32(*((const uint32_t*) address));
0320     }
0321 
0322     PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
0323         return psimd_splat_f32(*((const float*) address));
0324     }
0325 
0326     PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
0327         return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
0328     }
0329 
0330     PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
0331         return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
0332     }
0333 
0334     PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
0335         return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
0336     }
0337 
0338     PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
0339         const int32_t* address_s32 = (const int32_t*) address;
0340         return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
0341     }
0342 
0343     PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
0344         const uint32_t* address_u32 = (const uint32_t*) address;
0345         return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
0346     }
0347 
0348     PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
0349         const float* address_f32 = (const float*) address;
0350         return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
0351     }
0352 
0353     PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
0354         const int32_t* address_s32 = (const int32_t*) address;
0355         return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
0356     }
0357 
0358     PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
0359         const uint32_t* address_u32 = (const uint32_t*) address;
0360         return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
0361     }
0362 
0363     PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
0364         const float* address_f32 = (const float*) address;
0365         return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
0366     }
0367 
0368     PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
0369         return psimd_load_s32(address);
0370     }
0371 
0372     PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
0373         return psimd_load_u32(address);
0374     }
0375 
0376     PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
0377         return psimd_load_f32(address);
0378     }
0379 
0380     PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
0381         const psimd_f32 v0x1x = psimd_load_f32(address);
0382         const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
0383         #if defined(__clang__)
0384             return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
0385         #else
0386             return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
0387         #endif
0388     }
0389 
0390     PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
0391         return psimd_load_f32(address);
0392     }
0393 
0394     PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
0395         const float* address_f32 = (const float*) address;
0396         return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
0397     }
0398 
0399     PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
0400         const psimd_f32 v0x1x = psimd_load_f32(address);
0401         const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
0402         #if defined(__clang__)
0403             return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
0404         #else
0405             return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
0406         #endif
0407     }
0408 
0409     PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
0410         return psimd_load_stride2_f32(address);
0411     }
0412 
0413     PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
0414         const float* address0_f32 = (const float*) address;
0415         const float* address1_f32 = address0_f32 + stride;
0416         const float* address2_f32 = address1_f32 + stride;
0417         const float* address3_f32 = address2_f32 + stride;
0418         return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
0419     }
0420 
0421     PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
0422         return psimd_load1_f32(address);
0423     }
0424 
0425     PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
0426         const float* address_f32 = (const float*) address;
0427         return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
0428     }
0429 
0430     PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
0431         const float* address0_f32 = (const float*) address;
0432         const float* address1_f32 = address0_f32 + stride;
0433         const float* address2_f32 = address1_f32 + stride;
0434         return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
0435     }
0436 
0437     PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
0438         return psimd_load_stride_f32(address, stride);
0439     }
0440 
0441     /* Store vector */
0442     PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
0443         *((psimd_s8*) address) = value;
0444     }
0445 
0446     PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
0447         *((psimd_u8*) address) = value;
0448     }
0449 
0450     PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
0451         *((psimd_s16*) address) = value;
0452     }
0453 
0454     PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
0455         *((psimd_u16*) address) = value;
0456     }
0457 
0458     PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
0459         *((psimd_s32*) address) = value;
0460     }
0461 
0462     PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
0463         *((psimd_u32*) address) = value;
0464     }
0465 
0466     PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
0467         *((psimd_f32*) address) = value;
0468     }
0469 
0470     PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
0471         *((int32_t*) address) = value[0];
0472     }
0473 
0474     PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
0475         *((uint32_t*) address) = value[0];
0476     }
0477 
0478     PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
0479         *((float*) address) = value[0];
0480     }
0481 
0482     PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
0483         int32_t* address_s32 = (int32_t*) address;
0484         address_s32[0] = value[0];
0485         address_s32[1] = value[1];
0486     }
0487 
0488     PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
0489         uint32_t* address_u32 = (uint32_t*) address;
0490         address_u32[0] = value[0];
0491         address_u32[1] = value[1];
0492     }
0493 
0494     PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
0495         float* address_f32 = (float*) address;
0496         address_f32[0] = value[0];
0497         address_f32[1] = value[1];
0498     }
0499 
0500     PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
0501         int32_t* address_s32 = (int32_t*) address;
0502         address_s32[0] = value[0];
0503         address_s32[1] = value[1];
0504         address_s32[2] = value[2];
0505     }
0506 
0507     PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
0508         uint32_t* address_u32 = (uint32_t*) address;
0509         address_u32[0] = value[0];
0510         address_u32[1] = value[1];
0511         address_u32[2] = value[2];
0512     }
0513 
0514     PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
0515         float* address_f32 = (float*) address;
0516         address_f32[0] = value[0];
0517         address_f32[1] = value[1];
0518         address_f32[2] = value[2];
0519     }
0520 
0521     PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
0522         psimd_store_s32(address, value);
0523     }
0524 
0525     PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
0526         psimd_store_u32(address, value);
0527     }
0528 
0529     PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
0530         psimd_store_f32(address, value);
0531     }
0532 
0533     PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
0534         float* address0_f32 = (float*) address;
0535         float* address1_f32 = address0_f32 + stride;
0536         float* address2_f32 = address1_f32 + stride;
0537         float* address3_f32 = address2_f32 + stride;
0538         *address0_f32 = value[0];
0539         *address1_f32 = value[1];
0540         *address2_f32 = value[2];
0541         *address3_f32 = value[3];
0542     }
0543 
0544     PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
0545         psimd_store1_f32(address, value);
0546     }
0547 
0548     PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
0549         float* address_f32 = (float*) address;
0550         address_f32[0]      = value[0];
0551         address_f32[stride] = value[1];
0552     }
0553 
0554     PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
0555         float* address0_f32 = (float*) address;
0556         float* address1_f32 = address0_f32 + stride;
0557         float* address2_f32 = address1_f32 + stride;
0558         *address0_f32 = value[0];
0559         *address1_f32 = value[1];
0560         *address2_f32 = value[2];
0561     }
0562 
0563     /* Vector addition */
0564     PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
0565         return a + b;
0566     }
0567 
0568     PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
0569         return a + b;
0570     }
0571 
0572     PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
0573         return a + b;
0574     }
0575 
0576     PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
0577         return a + b;
0578     }
0579 
0580     PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
0581         return a + b;
0582     }
0583 
0584     PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
0585         return a + b;
0586     }
0587 
0588     PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
0589         #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
0590             return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
0591         #else
0592             return a + b;
0593         #endif
0594     }
0595 
0596     /* Vector subtraction */
0597     PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
0598         return a - b;
0599     }
0600 
0601     PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
0602         return a - b;
0603     }
0604 
0605     PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
0606         return a - b;
0607     }
0608 
0609     PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
0610         return a - b;
0611     }
0612 
0613     PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
0614         return a - b;
0615     }
0616 
0617     PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
0618         return a - b;
0619     }
0620 
0621     PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
0622         #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
0623             return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
0624         #else
0625             return a - b;
0626         #endif
0627     }
0628 
0629     /* Vector multiplication */
0630     PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
0631         return a * b;
0632     }
0633 
0634     PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
0635         return a * b;
0636     }
0637 
0638     PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
0639         return a * b;
0640     }
0641 
0642     PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
0643         return a * b;
0644     }
0645 
0646     PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
0647         return a * b;
0648     }
0649 
0650     PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
0651         return a * b;
0652     }
0653 
0654     PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
0655         #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
0656             return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
0657         #else
0658             return a * b;
0659         #endif
0660     }
0661 
0662     /* Quasi-Fused Multiply-Add */
0663     PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
0664         #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
0665             return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
0666         #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
0667             return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a);
0668         #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
0669             return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a);
0670         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA
0671             return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
0672         #else
0673             return a + b * c;
0674         #endif
0675     }
0676 
0677     PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
0678         return a / b;
0679     }
0680 
0681     /* Vector and */
0682     PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
0683         return (psimd_f32) (mask & (psimd_s32) v);
0684     }
0685 
0686     /* Vector and-not */
0687     PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
0688         return (psimd_f32) (~mask & (psimd_s32) v);
0689     }
0690 
0691     /* Vector blend */
0692     PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
0693         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0694             return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
0695         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0696             return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
0697         #else
0698             return (mask & a) | (~mask & b);
0699         #endif
0700     }
0701 
0702     PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
0703         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0704             return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
0705         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0706             return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
0707         #else
0708             return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
0709         #endif
0710     }
0711     
0712     PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
0713         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0714             return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
0715         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0716             return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
0717         #else
0718             return (mask & a) | (~mask & b);
0719         #endif
0720     }
0721 
0722     PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
0723         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0724             return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
0725         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0726             return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
0727         #else
0728             return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
0729         #endif
0730     }
0731     
0732     PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
0733         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0734             return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
0735         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0736             return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
0737         #else
0738             return (mask & a) | (~mask & b);
0739         #endif
0740     }
0741 
0742     PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
0743         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0744             return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
0745         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0746             return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
0747         #else
0748             return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
0749         #endif
0750     }
0751     
0752     PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
0753         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0754             return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
0755         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0756             return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
0757         #else
0758             return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
0759         #endif
0760     }
0761 
0762     /* Vector blend on sign */
0763     PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
0764         return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
0765     }
0766 
0767     PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
0768         return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
0769     }
0770 
0771     PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
0772         return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
0773     }
0774 
0775     PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
0776         return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
0777     }
0778 
0779     PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
0780         return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
0781     }
0782 
0783     PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
0784         return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
0785     }
0786 
0787     PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
0788         const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
0789         return psimd_blend_f32(mask, a, b);
0790     }
0791 
0792     /* Vector absolute value */
0793     PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
0794         const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
0795         return (psimd_f32) ((psimd_s32) v & ~mask);
0796     }
0797 
0798     /* Vector negation */
0799     PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
0800         const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
0801         return (psimd_f32) ((psimd_s32) v ^ mask);
0802     }
0803 
0804     /* Vector maximum */
0805     PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
0806         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0807             return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
0808         #else
0809             return psimd_blend_s8(a > b, a, b);
0810         #endif
0811     }
0812 
0813     PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
0814         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0815             return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
0816         #else
0817             return psimd_blend_u8(a > b, a, b);
0818         #endif
0819     }
0820 
0821     PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
0822         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0823             return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
0824         #else
0825             return psimd_blend_s16(a > b, a, b);
0826         #endif
0827     }
0828 
0829     PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
0830         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0831             return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
0832         #else
0833             return psimd_blend_u16(a > b, a, b);
0834         #endif
0835     }
0836 
0837     PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
0838         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0839             return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
0840         #else
0841             return psimd_blend_s32(a > b, a, b);
0842         #endif
0843     }
0844 
0845     PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
0846         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0847             return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
0848         #else
0849             return psimd_blend_u32(a > b, a, b);
0850         #endif
0851     }
0852 
0853     PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
0854         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0855             return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
0856         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0857             return __builtin_wasm_max_f32x4(a, b);
0858         #else
0859             return psimd_blend_f32(a > b, a, b);
0860         #endif
0861     }
0862 
0863     /* Vector minimum */
0864     PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
0865         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0866             return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
0867         #else
0868             return psimd_blend_s8(a < b, a, b);
0869         #endif
0870     }
0871 
0872     PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
0873         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0874             return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
0875         #else
0876             return psimd_blend_u8(a < b, a, b);
0877         #endif
0878     }
0879 
0880     PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
0881         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0882             return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
0883         #else
0884             return psimd_blend_s16(a < b, a, b);
0885         #endif
0886     }
0887 
0888     PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
0889         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0890             return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
0891         #else
0892             return psimd_blend_u16(a < b, a, b);
0893         #endif
0894     }
0895 
0896     PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
0897         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0898             return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
0899         #else
0900             return psimd_blend_s32(a < b, a, b);
0901         #endif
0902     }
0903 
0904     PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
0905         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0906             return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
0907         #else
0908             return psimd_blend_u32(a < b, a, b);
0909         #endif
0910     }
0911 
0912     PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
0913         #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0914             return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
0915         #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0916             return __builtin_wasm_min_f32x4(a, b);
0917         #else
0918             return psimd_blend_f32(a < b, a, b);
0919         #endif
0920     }
0921 
0922     PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
0923         #if defined(__clang__)
0924             return __builtin_convertvector(v, psimd_f32);
0925         #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
0926             return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
0927         #elif defined(__SSE2__)
0928             return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
0929         #else
0930             return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
0931         #endif
0932     }
0933 
0934     /* Broadcast vector element */
0935     #if defined(__clang__)
0936         PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
0937             return __builtin_shufflevector(v, v, 0, 0, 0, 0);
0938         }
0939 
0940         PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
0941             return __builtin_shufflevector(v, v, 1, 1, 1, 1);
0942         }
0943 
0944         PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
0945             return __builtin_shufflevector(v, v, 2, 2, 2, 2);
0946         }
0947 
0948         PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
0949             return __builtin_shufflevector(v, v, 3, 3, 3, 3);
0950         }
0951     #else
0952         PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
0953             return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
0954         }
0955 
0956         PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
0957             return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
0958         }
0959 
0960         PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
0961             return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
0962         }
0963 
0964         PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
0965             return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
0966         }
0967     #endif
0968 
0969     /* Reversal of vector elements */
0970     #if defined(__clang__)
0971         PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
0972             return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
0973         }
0974 
0975         PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
0976             return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
0977         }
0978 
0979         PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
0980             return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
0981         }
0982 
0983         PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
0984             return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
0985         }
0986 
0987         PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
0988             return __builtin_shufflevector(v, v, 3, 2, 1, 0);
0989         }
0990 
0991         PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
0992             return __builtin_shufflevector(v, v, 3, 2, 1, 0);
0993         }
0994 
0995         PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
0996             return __builtin_shufflevector(v, v, 3, 2, 1, 0);
0997         }
0998     #else
0999         PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
1000             return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
1001         }
1002 
1003         PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
1004             return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
1005         }
1006 
1007         PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
1008             return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
1009         }
1010 
1011         PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
1012             return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
1013         }
1014 
1015         PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
1016             return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1017         }
1018 
1019         PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
1020             return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1021         }
1022 
1023         PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
1024             return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1025         }
1026     #endif
1027 
1028     /* Interleaving of vector elements */
1029     #if defined(__clang__)
1030         PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
1031             return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1032         }
1033 
1034         PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
1035             return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1036         }
1037 
1038         PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
1039             return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1040         }
1041 
1042         PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
1043             return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1044         }
1045 
1046         PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
1047             return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1048         }
1049 
1050         PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
1051             return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1052         }
1053 
1054         PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
1055             return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1056         }
1057 
1058         PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
1059             return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1060         }
1061 
1062         PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
1063             return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1064         }
1065 
1066         PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
1067             return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1068         }
1069     #else
1070         PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
1071             return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
1072         }
1073 
1074         PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
1075             return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
1076         }
1077 
1078         PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
1079             return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
1080         }
1081 
1082         PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
1083             return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
1084         }
1085 
1086         PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
1087             return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1088         }
1089 
1090         PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
1091             return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1092         }
1093 
1094         PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
1095             return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1096         }
1097 
1098         PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
1099             return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1100         }
1101 
1102         PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
1103             return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1104         }
1105 
1106         PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
1107             return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1108         }
1109     #endif
1110 
1111     /* Concatenation of low/high vector elements */
1112     #if defined(__clang__)
1113         PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1114             return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
1115         }
1116 
1117         PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1118             return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
1119         }
1120 
1121         PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1122             return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
1123         }
1124 
1125         PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1126             return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
1127         }
1128 
1129         PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1130             return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1131         }
1132 
1133         PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1134             return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1135         }
1136 
1137         PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1138             return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1139         }
1140 
1141         PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1142             return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1143         }
1144 
1145         PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1146             return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1147         }
1148 
1149         PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1150             return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1151         }
1152     #else
1153         PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1154             return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1155         }
1156 
1157         PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1158             return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1159         }
1160 
1161         PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1162             return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1163         }
1164 
1165         PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1166             return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1167         }
1168 
1169         PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1170             return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1171         }
1172 
1173         PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1174             return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1175         }
1176 
1177         PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1178             return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1179         }
1180 
1181         PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1182             return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1183         }
1184 
1185         PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1186             return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1187         }
1188 
1189         PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1190             return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1191         }
1192     #endif
1193 
1194     /* Concatenation of even/odd vector elements */
1195     #if defined(__clang__)
1196         PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1197             return __builtin_shufflevector(a, b,
1198                 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1199         }
1200 
1201         PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1202             return __builtin_shufflevector(a, b,
1203                 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1204         }
1205 
1206         PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1207             return __builtin_shufflevector(a, b,
1208                 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1209         }
1210 
1211         PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1212             return __builtin_shufflevector(a, b,
1213                 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1214         }
1215 
1216         PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1217             return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1218         }
1219 
1220         PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1221             return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1222         }
1223 
1224         PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1225             return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1226         }
1227 
1228         PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1229             return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1230         }
1231 
1232         PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1233             return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1234         }
1235 
1236         PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1237             return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1238         }
1239 
1240         PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1241             return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1242         }
1243 
1244         PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1245             return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1246         }
1247 
1248         PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1249             return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1250         }
1251 
1252         PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1253             return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1254         }
1255     #else
1256         PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1257             return __builtin_shuffle(a, b,
1258                 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1259         }
1260 
1261         PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1262             return __builtin_shuffle(a, b,
1263                 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1264         }
1265 
1266         PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1267             return __builtin_shuffle(a, b,
1268                 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1269         }
1270 
1271         PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1272             return __builtin_shuffle(a, b,
1273                 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1274         }
1275 
1276         PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1277             return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1278         }
1279 
1280         PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1281             return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1282         }
1283 
1284         PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1285             return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1286         }
1287 
1288         PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1289             return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1290         }
1291 
1292         PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1293             return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1294         }
1295 
1296         PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1297             return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1298         }
1299 
1300         PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1301             return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1302         }
1303 
1304         PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1305             return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1306         }
1307 
1308         PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1309             return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1310         }
1311 
1312         PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1313             return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1314         }
1315     #endif
1316 
1317     /* Vector reduce */
1318     #if defined(__clang__)
1319         PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1320             const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
1321             return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
1322         }
1323 
1324         PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1325             const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1326             return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1327         }
1328 
1329         PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1330             const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1331             return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1332         }
1333 
1334         PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1335             const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
1336             const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
1337             return result[0];
1338         }
1339 
1340         PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1341             const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1342             const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1343             return result[0];
1344         }
1345 
1346         PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1347             const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1348             const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1349             return result[0];
1350         }
1351     #else
1352         PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1353             const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
1354             return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
1355         }
1356 
1357         PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1358             const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1359             return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1360         }
1361 
1362         PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1363             const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1364             return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1365         }
1366 
1367         PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1368             const psimd_f32 result = psimd_allreduce_sum_f32(v);
1369             return result[0];
1370         }
1371 
1372         PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1373             const psimd_f32 result = psimd_allreduce_max_f32(v);
1374             return result[0];
1375         }
1376 
1377         PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1378             const psimd_f32 result = psimd_allreduce_min_f32(v);
1379             return result[0];
1380         }
1381     #endif
1382 #endif
1383 
1384 #endif /* PSIMD_H */