Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-16 10:14:57

0001 #pragma once
0002 #ifndef FP16_PSIMD_H
0003 #define FP16_PSIMD_H
0004 
0005 #if defined(__cplusplus) && (__cplusplus >= 201103L)
0006     #include <cstdint>
0007 #elif !defined(__OPENCL_VERSION__)
0008     #include <stdint.h>
0009 #endif
0010 
0011 #include <psimd.h>
0012 
0013 
0014 PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) {
0015     const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
0016 
0017     const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000));
0018     const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);
0019 
0020     const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
0021 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
0022     const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
0023 #else
0024     const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000)));
0025 #endif
0026     const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale);
0027 
0028     const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
0029     const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
0030     const psimd_f32 denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias);
0031 
0032     const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
0033     const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff;
0034     return (psimd_f32) (sign | (psimd_s32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign));
0035 }
0036 
0037 PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) {
0038     const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
0039     const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);
0040 
0041     const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
0042     const psimd_u32 sign_lo = word_lo & sign_mask;
0043     const psimd_u32 sign_hi = word_hi & sign_mask;
0044     const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
0045     const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);
0046 
0047     const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
0048 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
0049     const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
0050 #else
0051     const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000)));
0052 #endif
0053     const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale);
0054     const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale);
0055 
0056     const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
0057     const psimd_u16 shl1_half = half + half;
0058     const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
0059     const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias);
0060     const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias);
0061 
0062     const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
0063     const psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff;
0064     const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff;
0065 
0066     psimd_f32x2 result;
0067     result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo));
0068     result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi));
0069     return result;
0070 }
0071 
0072 PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) {
0073     const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
0074 
0075     const psimd_u32 sign = word & psimd_splat_u32(INT32_C(0x80000000));
0076     const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);
0077 
0078 #if 0
0079     const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
0080     const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset;
0081     const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
0082     const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset);
0083     const psimd_s32 exp113_offset = exp112_offset | exp1_offset;
0084     return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset)));
0085 #else
0086     const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
0087     const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset);
0088 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
0089     const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
0090 #else
0091     const psimd_f32 denorm_bias = psimd_splat_f32(fp32_from_bits(UINT32_C(0x38800000)));
0092 #endif
0093     return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias)));
0094 #endif
0095 }
0096 
0097 PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) {
0098     const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
0099     const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);
0100 
0101     const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
0102     const psimd_u32 sign_lo = word_lo & sign_mask;
0103     const psimd_u32 sign_hi = word_hi & sign_mask;
0104     const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
0105     const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);
0106 
0107 #if 1
0108     const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
0109     const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset;
0110     const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset;
0111     const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
0112     const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset);
0113     const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset);
0114     const psimd_s32 exp113_offset = exp1_offset | exp112_offset;
0115     psimd_f32x2 result;
0116     result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset)));
0117     result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset)));
0118     return result;
0119 #else
0120     const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
0121     const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset);
0122     const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset);
0123     const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
0124     psimd_f32x2 result;
0125     result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias)));
0126     result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias)));
0127     return result;
0128 #endif
0129 }
0130 
0131 #endif /* FP16_PSIMD_H */