File indexing completed on 2024-05-18 08:30:12
0001 #pragma once
0002 #ifndef PSIMD_H
0003 #define PSIMD_H
0004
0005 #if defined(__CUDA_ARCH__)
0006
0007 #define PSIMD_INTRINSIC __forceinline__ __device__
0008 #elif defined(__OPENCL_VERSION__)
0009
0010 #define PSIMD_INTRINSIC inline static
0011 #elif defined(__INTEL_COMPILER)
0012
0013 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
0014 #elif defined(__GNUC__)
0015
0016 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
0017 #elif defined(_MSC_VER)
0018
0019 #define PSIMD_INTRINSIC __forceinline static
0020 #elif defined(__cplusplus)
0021
0022 #define PSIMD_INTRINSIC inline static
0023 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
0024
0025 #define PSIMD_INTRINSIC inline static
0026 #else
0027
0028 #define PSIMD_INTRINSIC static
0029 #endif
0030
0031 #if defined(__GNUC__) || defined(__clang__)
0032 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0033 #include <arm_neon.h>
0034 #endif
0035
0036 #if defined(__SSE2__)
0037 #include <emmintrin.h>
0038 #endif
0039
0040 #if defined(__SSE3__)
0041 #include <pmmintrin.h>
0042 #endif
0043
0044 #if defined(__SSSE3__)
0045 #include <tmmintrin.h>
0046 #endif
0047
0048 #if defined(__SSE4_1__)
0049 #include <smmintrin.h>
0050 #endif
0051
0052 #if defined(__SSE4_2__)
0053 #include <nmmintrin.h>
0054 #endif
0055
0056 #if defined(__AVX__)
0057 #include <immintrin.h>
0058 #endif
0059 #elif defined(_MSC_VER)
0060 #include <intrin.h>
0061 #endif
0062
0063 #if defined(__cplusplus)
0064 #define PSIMD_CXX_SYNTAX
0065 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
0066 #define PSIMD_C11_SYNTAX
0067 #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
0068 #define PSIMD_C99_SYNTAX
0069 #else
0070 #define PSIMD_C89_SYNTAX
0071 #endif
0072
0073 #if defined(__cplusplus) && (__cplusplus >= 201103L)
0074 #include <cstddef>
0075 #include <cstdint>
0076 #elif !defined(__OPENCL_VERSION__)
0077 #include <stddef.h>
0078 #include <stdint.h>
0079 #endif
0080
0081 #if defined(__GNUC__) || defined(__clang__)
0082 #define PSIMD_HAVE_F64 0
0083 #define PSIMD_HAVE_F32 1
0084 #define PSIMD_HAVE_U8 1
0085 #define PSIMD_HAVE_S8 1
0086 #define PSIMD_HAVE_U16 1
0087 #define PSIMD_HAVE_S16 1
0088 #define PSIMD_HAVE_U32 1
0089 #define PSIMD_HAVE_S32 1
0090 #define PSIMD_HAVE_U64 0
0091 #define PSIMD_HAVE_S64 0
0092
0093 typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
0094 typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
0095 typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
0096 typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
0097 typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
0098 typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
0099 typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));
0100
0101 typedef struct {
0102 psimd_s8 lo;
0103 psimd_s8 hi;
0104 } psimd_s8x2;
0105
0106 typedef struct {
0107 psimd_u8 lo;
0108 psimd_u8 hi;
0109 } psimd_u8x2;
0110
0111 typedef struct {
0112 psimd_s16 lo;
0113 psimd_s16 hi;
0114 } psimd_s16x2;
0115
0116 typedef struct {
0117 psimd_u16 lo;
0118 psimd_u16 hi;
0119 } psimd_u16x2;
0120
0121 typedef struct {
0122 psimd_s32 lo;
0123 psimd_s32 hi;
0124 } psimd_s32x2;
0125
0126 typedef struct {
0127 psimd_u32 lo;
0128 psimd_u32 hi;
0129 } psimd_u32x2;
0130
0131 typedef struct {
0132 psimd_f32 lo;
0133 psimd_f32 hi;
0134 } psimd_f32x2;
0135
0136
0137 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
0138 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
0139 }
0140
0141 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
0142 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
0143 }
0144
0145 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
0146 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
0147 }
0148
0149 PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
0150 return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
0151 }
0152
0153 PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
0154 return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
0155 }
0156
0157 PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
0158 return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
0159 }
0160
0161
0162 PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
0163 const psimd_s8 new_a = *b;
0164 const psimd_s8 new_b = *a;
0165 *a = new_a;
0166 *b = new_b;
0167 }
0168
0169 PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
0170 const psimd_u8 new_a = *b;
0171 const psimd_u8 new_b = *a;
0172 *a = new_a;
0173 *b = new_b;
0174 }
0175
0176 PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
0177 const psimd_s16 new_a = *b;
0178 const psimd_s16 new_b = *a;
0179 *a = new_a;
0180 *b = new_b;
0181 }
0182
0183 PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
0184 const psimd_u16 new_a = *b;
0185 const psimd_u16 new_b = *a;
0186 *a = new_a;
0187 *b = new_b;
0188 }
0189
0190 PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
0191 const psimd_s32 new_a = *b;
0192 const psimd_s32 new_b = *a;
0193 *a = new_a;
0194 *b = new_b;
0195 }
0196
0197 PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
0198 const psimd_u32 new_a = *b;
0199 const psimd_u32 new_b = *a;
0200 *a = new_a;
0201 *b = new_b;
0202 }
0203
0204 PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
0205 const psimd_f32 new_a = *b;
0206 const psimd_f32 new_b = *a;
0207 *a = new_a;
0208 *b = new_b;
0209 }
0210
0211
0212 PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
0213 return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
0214 }
0215
0216 PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
0217 return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
0218 }
0219
0220 PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
0221 return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
0222 }
0223
0224 PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
0225 return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
0226 }
0227
0228 PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
0229 return (psimd_s32) { 0, 0, 0, 0 };
0230 }
0231
0232 PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
0233 return (psimd_u32) { 0, 0, 0, 0 };
0234 }
0235
0236 PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
0237 return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
0238 }
0239
0240
0241 PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
0242 return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
0243 }
0244
0245 PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
0246 return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
0247 }
0248
0249 PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
0250 return (psimd_s16) { c, c, c, c, c, c, c, c };
0251 }
0252
0253 PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
0254 return (psimd_u16) { c, c, c, c, c, c, c, c };
0255 }
0256
0257 PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
0258 return (psimd_s32) { c, c, c, c };
0259 }
0260
0261 PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
0262 return (psimd_u32) { c, c, c, c };
0263 }
0264
0265 PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
0266 return (psimd_f32) { c, c, c, c };
0267 }
0268
0269
0270 PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
0271 return *((const psimd_s8*) address);
0272 }
0273
0274 PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
0275 return *((const psimd_u8*) address);
0276 }
0277
0278 PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
0279 return *((const psimd_s16*) address);
0280 }
0281
0282 PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
0283 return *((const psimd_u16*) address);
0284 }
0285
0286 PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
0287 return *((const psimd_s32*) address);
0288 }
0289
0290 PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
0291 return *((const psimd_u32*) address);
0292 }
0293
0294 PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
0295 return *((const psimd_f32*) address);
0296 }
0297
0298 PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
0299 return psimd_splat_s8(*((const int8_t*) address));
0300 }
0301
0302 PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
0303 return psimd_splat_u8(*((const uint8_t*) address));
0304 }
0305
0306 PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
0307 return psimd_splat_s16(*((const int16_t*) address));
0308 }
0309
0310 PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
0311 return psimd_splat_u16(*((const uint16_t*) address));
0312 }
0313
0314 PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
0315 return psimd_splat_s32(*((const int32_t*) address));
0316 }
0317
0318 PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
0319 return psimd_splat_u32(*((const uint32_t*) address));
0320 }
0321
0322 PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
0323 return psimd_splat_f32(*((const float*) address));
0324 }
0325
0326 PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
0327 return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
0328 }
0329
0330 PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
0331 return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
0332 }
0333
0334 PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
0335 return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
0336 }
0337
0338 PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
0339 const int32_t* address_s32 = (const int32_t*) address;
0340 return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
0341 }
0342
0343 PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
0344 const uint32_t* address_u32 = (const uint32_t*) address;
0345 return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
0346 }
0347
0348 PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
0349 const float* address_f32 = (const float*) address;
0350 return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
0351 }
0352
0353 PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
0354 const int32_t* address_s32 = (const int32_t*) address;
0355 return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
0356 }
0357
0358 PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
0359 const uint32_t* address_u32 = (const uint32_t*) address;
0360 return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
0361 }
0362
0363 PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
0364 const float* address_f32 = (const float*) address;
0365 return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
0366 }
0367
0368 PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
0369 return psimd_load_s32(address);
0370 }
0371
0372 PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
0373 return psimd_load_u32(address);
0374 }
0375
0376 PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
0377 return psimd_load_f32(address);
0378 }
0379
0380 PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
0381 const psimd_f32 v0x1x = psimd_load_f32(address);
0382 const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
0383 #if defined(__clang__)
0384 return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
0385 #else
0386 return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
0387 #endif
0388 }
0389
0390 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
0391 return psimd_load_f32(address);
0392 }
0393
0394 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
0395 const float* address_f32 = (const float*) address;
0396 return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
0397 }
0398
0399 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
0400 const psimd_f32 v0x1x = psimd_load_f32(address);
0401 const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
0402 #if defined(__clang__)
0403 return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
0404 #else
0405 return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
0406 #endif
0407 }
0408
0409 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
0410 return psimd_load_stride2_f32(address);
0411 }
0412
0413 PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
0414 const float* address0_f32 = (const float*) address;
0415 const float* address1_f32 = address0_f32 + stride;
0416 const float* address2_f32 = address1_f32 + stride;
0417 const float* address3_f32 = address2_f32 + stride;
0418 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
0419 }
0420
0421 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
0422 return psimd_load1_f32(address);
0423 }
0424
0425 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
0426 const float* address_f32 = (const float*) address;
0427 return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
0428 }
0429
0430 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
0431 const float* address0_f32 = (const float*) address;
0432 const float* address1_f32 = address0_f32 + stride;
0433 const float* address2_f32 = address1_f32 + stride;
0434 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
0435 }
0436
0437 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
0438 return psimd_load_stride_f32(address, stride);
0439 }
0440
0441
0442 PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
0443 *((psimd_s8*) address) = value;
0444 }
0445
0446 PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
0447 *((psimd_u8*) address) = value;
0448 }
0449
0450 PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
0451 *((psimd_s16*) address) = value;
0452 }
0453
0454 PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
0455 *((psimd_u16*) address) = value;
0456 }
0457
0458 PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
0459 *((psimd_s32*) address) = value;
0460 }
0461
0462 PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
0463 *((psimd_u32*) address) = value;
0464 }
0465
0466 PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
0467 *((psimd_f32*) address) = value;
0468 }
0469
0470 PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
0471 *((int32_t*) address) = value[0];
0472 }
0473
0474 PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
0475 *((uint32_t*) address) = value[0];
0476 }
0477
0478 PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
0479 *((float*) address) = value[0];
0480 }
0481
0482 PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
0483 int32_t* address_s32 = (int32_t*) address;
0484 address_s32[0] = value[0];
0485 address_s32[1] = value[1];
0486 }
0487
0488 PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
0489 uint32_t* address_u32 = (uint32_t*) address;
0490 address_u32[0] = value[0];
0491 address_u32[1] = value[1];
0492 }
0493
0494 PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
0495 float* address_f32 = (float*) address;
0496 address_f32[0] = value[0];
0497 address_f32[1] = value[1];
0498 }
0499
0500 PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
0501 int32_t* address_s32 = (int32_t*) address;
0502 address_s32[0] = value[0];
0503 address_s32[1] = value[1];
0504 address_s32[2] = value[2];
0505 }
0506
0507 PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
0508 uint32_t* address_u32 = (uint32_t*) address;
0509 address_u32[0] = value[0];
0510 address_u32[1] = value[1];
0511 address_u32[2] = value[2];
0512 }
0513
0514 PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
0515 float* address_f32 = (float*) address;
0516 address_f32[0] = value[0];
0517 address_f32[1] = value[1];
0518 address_f32[2] = value[2];
0519 }
0520
0521 PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
0522 psimd_store_s32(address, value);
0523 }
0524
0525 PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
0526 psimd_store_u32(address, value);
0527 }
0528
0529 PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
0530 psimd_store_f32(address, value);
0531 }
0532
0533 PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
0534 float* address0_f32 = (float*) address;
0535 float* address1_f32 = address0_f32 + stride;
0536 float* address2_f32 = address1_f32 + stride;
0537 float* address3_f32 = address2_f32 + stride;
0538 *address0_f32 = value[0];
0539 *address1_f32 = value[1];
0540 *address2_f32 = value[2];
0541 *address3_f32 = value[3];
0542 }
0543
0544 PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
0545 psimd_store1_f32(address, value);
0546 }
0547
0548 PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
0549 float* address_f32 = (float*) address;
0550 address_f32[0] = value[0];
0551 address_f32[stride] = value[1];
0552 }
0553
0554 PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
0555 float* address0_f32 = (float*) address;
0556 float* address1_f32 = address0_f32 + stride;
0557 float* address2_f32 = address1_f32 + stride;
0558 *address0_f32 = value[0];
0559 *address1_f32 = value[1];
0560 *address2_f32 = value[2];
0561 }
0562
0563
0564 PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
0565 return a + b;
0566 }
0567
0568 PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
0569 return a + b;
0570 }
0571
0572 PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
0573 return a + b;
0574 }
0575
0576 PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
0577 return a + b;
0578 }
0579
0580 PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
0581 return a + b;
0582 }
0583
0584 PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
0585 return a + b;
0586 }
0587
0588 PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
0589 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
0590 return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
0591 #else
0592 return a + b;
0593 #endif
0594 }
0595
0596
0597 PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
0598 return a - b;
0599 }
0600
0601 PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
0602 return a - b;
0603 }
0604
0605 PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
0606 return a - b;
0607 }
0608
0609 PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
0610 return a - b;
0611 }
0612
0613 PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
0614 return a - b;
0615 }
0616
0617 PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
0618 return a - b;
0619 }
0620
0621 PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
0622 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
0623 return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
0624 #else
0625 return a - b;
0626 #endif
0627 }
0628
0629
0630 PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
0631 return a * b;
0632 }
0633
0634 PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
0635 return a * b;
0636 }
0637
0638 PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
0639 return a * b;
0640 }
0641
0642 PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
0643 return a * b;
0644 }
0645
0646 PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
0647 return a * b;
0648 }
0649
0650 PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
0651 return a * b;
0652 }
0653
0654 PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
0655 #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
0656 return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
0657 #else
0658 return a * b;
0659 #endif
0660 }
0661
0662
0663 PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
0664 #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
0665 return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
0666 #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
0667 return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a);
0668 #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
0669 return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a);
0670 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA
0671 return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
0672 #else
0673 return a + b * c;
0674 #endif
0675 }
0676
0677 PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
0678 return a / b;
0679 }
0680
0681
0682 PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
0683 return (psimd_f32) (mask & (psimd_s32) v);
0684 }
0685
0686
0687 PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
0688 return (psimd_f32) (~mask & (psimd_s32) v);
0689 }
0690
0691
0692 PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
0693 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0694 return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
0695 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0696 return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
0697 #else
0698 return (mask & a) | (~mask & b);
0699 #endif
0700 }
0701
0702 PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
0703 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0704 return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
0705 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0706 return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
0707 #else
0708 return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
0709 #endif
0710 }
0711
0712 PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
0713 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0714 return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
0715 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0716 return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
0717 #else
0718 return (mask & a) | (~mask & b);
0719 #endif
0720 }
0721
0722 PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
0723 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0724 return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
0725 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0726 return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
0727 #else
0728 return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
0729 #endif
0730 }
0731
0732 PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
0733 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0734 return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
0735 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0736 return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
0737 #else
0738 return (mask & a) | (~mask & b);
0739 #endif
0740 }
0741
0742 PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
0743 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0744 return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
0745 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0746 return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
0747 #else
0748 return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
0749 #endif
0750 }
0751
0752 PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
0753 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0754 return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
0755 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0756 return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
0757 #else
0758 return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
0759 #endif
0760 }
0761
0762
0763 PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
0764 return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
0765 }
0766
0767 PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
0768 return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
0769 }
0770
0771 PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
0772 return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
0773 }
0774
0775 PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
0776 return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
0777 }
0778
0779 PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
0780 return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
0781 }
0782
0783 PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
0784 return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
0785 }
0786
0787 PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
0788 const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
0789 return psimd_blend_f32(mask, a, b);
0790 }
0791
0792
0793 PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
0794 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
0795 return (psimd_f32) ((psimd_s32) v & ~mask);
0796 }
0797
0798
0799 PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
0800 const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
0801 return (psimd_f32) ((psimd_s32) v ^ mask);
0802 }
0803
0804
0805 PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
0806 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0807 return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
0808 #else
0809 return psimd_blend_s8(a > b, a, b);
0810 #endif
0811 }
0812
0813 PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
0814 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0815 return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
0816 #else
0817 return psimd_blend_u8(a > b, a, b);
0818 #endif
0819 }
0820
0821 PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
0822 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0823 return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
0824 #else
0825 return psimd_blend_s16(a > b, a, b);
0826 #endif
0827 }
0828
0829 PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
0830 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0831 return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
0832 #else
0833 return psimd_blend_u16(a > b, a, b);
0834 #endif
0835 }
0836
0837 PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
0838 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0839 return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
0840 #else
0841 return psimd_blend_s32(a > b, a, b);
0842 #endif
0843 }
0844
0845 PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
0846 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0847 return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
0848 #else
0849 return psimd_blend_u32(a > b, a, b);
0850 #endif
0851 }
0852
0853 PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
0854 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0855 return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
0856 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0857 return __builtin_wasm_max_f32x4(a, b);
0858 #else
0859 return psimd_blend_f32(a > b, a, b);
0860 #endif
0861 }
0862
0863
0864 PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
0865 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0866 return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
0867 #else
0868 return psimd_blend_s8(a < b, a, b);
0869 #endif
0870 }
0871
0872 PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
0873 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0874 return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
0875 #else
0876 return psimd_blend_u8(a < b, a, b);
0877 #endif
0878 }
0879
0880 PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
0881 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0882 return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
0883 #else
0884 return psimd_blend_s16(a < b, a, b);
0885 #endif
0886 }
0887
0888 PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
0889 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0890 return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
0891 #else
0892 return psimd_blend_u16(a < b, a, b);
0893 #endif
0894 }
0895
0896 PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
0897 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0898 return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
0899 #else
0900 return psimd_blend_s32(a < b, a, b);
0901 #endif
0902 }
0903
0904 PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
0905 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0906 return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
0907 #else
0908 return psimd_blend_u32(a < b, a, b);
0909 #endif
0910 }
0911
0912 PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
0913 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
0914 return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
0915 #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
0916 return __builtin_wasm_min_f32x4(a, b);
0917 #else
0918 return psimd_blend_f32(a < b, a, b);
0919 #endif
0920 }
0921
0922 PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
0923 #if defined(__clang__)
0924 return __builtin_convertvector(v, psimd_f32);
0925 #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
0926 return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
0927 #elif defined(__SSE2__)
0928 return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
0929 #else
0930 return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
0931 #endif
0932 }
0933
0934
0935 #if defined(__clang__)
0936 PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
0937 return __builtin_shufflevector(v, v, 0, 0, 0, 0);
0938 }
0939
0940 PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
0941 return __builtin_shufflevector(v, v, 1, 1, 1, 1);
0942 }
0943
0944 PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
0945 return __builtin_shufflevector(v, v, 2, 2, 2, 2);
0946 }
0947
0948 PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
0949 return __builtin_shufflevector(v, v, 3, 3, 3, 3);
0950 }
0951 #else
0952 PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
0953 return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
0954 }
0955
0956 PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
0957 return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
0958 }
0959
0960 PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
0961 return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
0962 }
0963
0964 PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
0965 return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
0966 }
0967 #endif
0968
0969
0970 #if defined(__clang__)
0971 PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
0972 return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
0973 }
0974
0975 PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
0976 return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
0977 }
0978
0979 PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
0980 return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
0981 }
0982
0983 PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
0984 return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
0985 }
0986
0987 PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
0988 return __builtin_shufflevector(v, v, 3, 2, 1, 0);
0989 }
0990
0991 PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
0992 return __builtin_shufflevector(v, v, 3, 2, 1, 0);
0993 }
0994
0995 PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
0996 return __builtin_shufflevector(v, v, 3, 2, 1, 0);
0997 }
0998 #else
0999 PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
1000 return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
1001 }
1002
1003 PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
1004 return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
1005 }
1006
1007 PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
1008 return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
1009 }
1010
1011 PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
1012 return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
1013 }
1014
1015 PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
1016 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1017 }
1018
1019 PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
1020 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1021 }
1022
1023 PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
1024 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1025 }
1026 #endif
1027
1028
1029 #if defined(__clang__)
1030 PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
1031 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1032 }
1033
1034 PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
1035 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1036 }
1037
1038 PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
1039 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1040 }
1041
1042 PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
1043 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1044 }
1045
1046 PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
1047 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1048 }
1049
1050 PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
1051 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1052 }
1053
1054 PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
1055 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1056 }
1057
1058 PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
1059 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1060 }
1061
1062 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
1063 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1064 }
1065
1066 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
1067 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1068 }
1069 #else
1070 PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
1071 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
1072 }
1073
1074 PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
1075 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
1076 }
1077
1078 PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
1079 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
1080 }
1081
1082 PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
1083 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
1084 }
1085
1086 PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
1087 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1088 }
1089
1090 PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
1091 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1092 }
1093
1094 PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
1095 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1096 }
1097
1098 PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
1099 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1100 }
1101
1102 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
1103 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1104 }
1105
1106 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
1107 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1108 }
1109 #endif
1110
1111
1112 #if defined(__clang__)
1113 PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1114 return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
1115 }
1116
1117 PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1118 return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
1119 }
1120
1121 PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1122 return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
1123 }
1124
1125 PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1126 return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
1127 }
1128
1129 PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1130 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1131 }
1132
1133 PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1134 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1135 }
1136
1137 PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1138 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1139 }
1140
1141 PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1142 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1143 }
1144
1145 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1146 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1147 }
1148
1149 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1150 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1151 }
1152 #else
1153 PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1154 return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1155 }
1156
1157 PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1158 return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1159 }
1160
1161 PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1162 return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1163 }
1164
1165 PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1166 return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1167 }
1168
1169 PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1170 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1171 }
1172
1173 PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1174 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1175 }
1176
1177 PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1178 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1179 }
1180
1181 PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1182 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1183 }
1184
1185 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1186 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1187 }
1188
1189 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1190 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1191 }
1192 #endif
1193
1194
1195 #if defined(__clang__)
1196 PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1197 return __builtin_shufflevector(a, b,
1198 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1199 }
1200
1201 PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1202 return __builtin_shufflevector(a, b,
1203 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1204 }
1205
1206 PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1207 return __builtin_shufflevector(a, b,
1208 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1209 }
1210
1211 PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1212 return __builtin_shufflevector(a, b,
1213 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1214 }
1215
1216 PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1217 return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1218 }
1219
1220 PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1221 return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1222 }
1223
1224 PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1225 return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1226 }
1227
1228 PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1229 return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1230 }
1231
1232 PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1233 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1234 }
1235
1236 PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1237 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1238 }
1239
1240 PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1241 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1242 }
1243
1244 PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1245 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1246 }
1247
1248 PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1249 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1250 }
1251
1252 PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1253 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1254 }
1255 #else
1256 PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1257 return __builtin_shuffle(a, b,
1258 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1259 }
1260
1261 PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1262 return __builtin_shuffle(a, b,
1263 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1264 }
1265
1266 PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1267 return __builtin_shuffle(a, b,
1268 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1269 }
1270
1271 PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1272 return __builtin_shuffle(a, b,
1273 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1274 }
1275
1276 PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1277 return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1278 }
1279
1280 PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1281 return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1282 }
1283
1284 PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1285 return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1286 }
1287
1288 PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1289 return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1290 }
1291
1292 PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1293 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1294 }
1295
1296 PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1297 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1298 }
1299
1300 PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1301 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1302 }
1303
1304 PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1305 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1306 }
1307
1308 PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1309 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1310 }
1311
1312 PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1313 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1314 }
1315 #endif
1316
1317
1318 #if defined(__clang__)
1319 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1320 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
1321 return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
1322 }
1323
1324 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1325 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1326 return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1327 }
1328
1329 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1330 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1331 return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1332 }
1333
1334 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1335 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
1336 const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
1337 return result[0];
1338 }
1339
1340 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1341 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1342 const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1343 return result[0];
1344 }
1345
1346 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1347 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1348 const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1349 return result[0];
1350 }
1351 #else
1352 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1353 const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
1354 return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
1355 }
1356
1357 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1358 const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1359 return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1360 }
1361
1362 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1363 const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1364 return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1365 }
1366
1367 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1368 const psimd_f32 result = psimd_allreduce_sum_f32(v);
1369 return result[0];
1370 }
1371
1372 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1373 const psimd_f32 result = psimd_allreduce_max_f32(v);
1374 return result[0];
1375 }
1376
1377 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1378 const psimd_f32 result = psimd_allreduce_min_f32(v);
1379 return result[0];
1380 }
1381 #endif
1382 #endif
1383
1384 #endif