File indexing completed on 2025-01-30 10:25:46
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
0029 #define VC_COMMON_GATHERIMPLEMENTATION_H_
0030
0031 #include "macros.h"
0032
0033 namespace Vc_VERSIONED_NAMESPACE
0034 {
0035 namespace Common
0036 {
0037
/**
 * Selects which strategy a masked gather uses.
 *
 * Each enumerator corresponds to one tag type (see the `...T` aliases) that
 * picks the matching `executeGather` overload in this header.
 */
enum class GatherScatterImplementation : int {
    SimpleLoop = 0,    ///< visit every lane and load the ones whose mask entry is set
    SetIndexZero = 1,  ///< adjust the indexes via setZeroInverted, gather all lanes, blend
    BitScanLoop = 2,   ///< iterate over the set bits of the mask using bit scans
    PopcntSwitch = 3   ///< switch on the number of set mask bits, unrolled per count
};
0044
0045 using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
0046 using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
0047 using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
0048 using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
0049
0050 template <typename V, typename MT, typename IT>
0051 Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
0052 V &v,
0053 const MT *mem,
0054 IT &&indexes_,
0055 typename V::MaskArgument mask)
0056 {
0057 auto indexes = std::forward<IT>(indexes_);
0058 indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
0059 const V tmp(mem, indexes);
0060 where(mask) | v = tmp;
0061 }
0062
0063 template <typename V, typename MT, typename IT>
0064 Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
0065 const typename V::MaskArgument mask)
0066 {
0067 if (Vc_IS_UNLIKELY(mask.isEmpty())) {
0068 return;
0069 }
0070 #if defined Vc_GCC && Vc_GCC >= 0x40900
0071
0072 constexpr std::size_t Sizeof = sizeof(V);
0073 using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
0074 Builtin tmp = reinterpret_cast<Builtin>(v.data());
0075 Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
0076 if (mask[i]) {
0077 tmp[i] = mem[indexes[i]];
0078 }
0079 });
0080 v.data() = reinterpret_cast<typename V::VectorType>(tmp);
0081 #else
0082 Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
0083 if (mask[i])
0084 v[i] = mem[indexes[i]];
0085 });
0086 #endif
0087 }
0088
0089 template <typename V, typename MT, typename IT>
0090 Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
0091 V &v,
0092 const MT *mem,
0093 const IT &indexes,
0094 typename V::MaskArgument mask)
0095 {
0096 #ifdef Vc_GNU_ASM
0097 size_t bits = mask.toInt();
0098 while (Vc_IS_LIKELY(bits > 0)) {
0099 size_t i, j;
0100 asm("bsf %[bits],%[i]\n\t"
0101 "bsr %[bits],%[j]\n\t"
0102 "btr %[i],%[bits]\n\t"
0103 "btr %[j],%[bits]\n\t"
0104 : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
0105 v[i] = mem[indexes[i]];
0106 v[j] = mem[indexes[j]];
0107 }
0108 #else
0109
0110 int bits = mask.toInt();
0111 while (bits) {
0112 const int i = _bit_scan_forward(bits);
0113 bits &= bits - 1;
0114 v[i] = mem[indexes[i]];
0115 }
0116 #endif
0117 }
0118
0119 template <typename V, typename MT, typename IT>
0120 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
0121 V &v,
0122 const MT *mem,
0123 const IT &indexes,
0124 typename V::MaskArgument mask,
0125 enable_if<V::Size == 16> = nullarg)
0126 {
0127 unsigned int bits = mask.toInt();
0128 unsigned int low, high = 0;
0129 switch (Vc::Detail::popcnt16(bits)) {
0130 case 16:
0131 v.gather(mem, indexes);
0132 break;
0133 case 15:
0134 low = _bit_scan_forward(bits);
0135 bits ^= 1 << low;
0136 v[low] = mem[indexes[low]];
0137
0138 case 14:
0139 high = _bit_scan_reverse(bits);
0140 v[high] = mem[indexes[high]];
0141 high = (1 << high);
0142
0143 case 13:
0144 low = _bit_scan_forward(bits);
0145 bits ^= high | (1 << low);
0146 v[low] = mem[indexes[low]];
0147
0148 case 12:
0149 high = _bit_scan_reverse(bits);
0150 v[high] = mem[indexes[high]];
0151 high = (1 << high);
0152
0153 case 11:
0154 low = _bit_scan_forward(bits);
0155 bits ^= high | (1 << low);
0156 v[low] = mem[indexes[low]];
0157
0158 case 10:
0159 high = _bit_scan_reverse(bits);
0160 v[high] = mem[indexes[high]];
0161 high = (1 << high);
0162
0163 case 9:
0164 low = _bit_scan_forward(bits);
0165 bits ^= high | (1 << low);
0166 v[low] = mem[indexes[low]];
0167
0168 case 8:
0169 high = _bit_scan_reverse(bits);
0170 v[high] = mem[indexes[high]];
0171 high = (1 << high);
0172
0173 case 7:
0174 low = _bit_scan_forward(bits);
0175 bits ^= high | (1 << low);
0176 v[low] = mem[indexes[low]];
0177
0178 case 6:
0179 high = _bit_scan_reverse(bits);
0180 v[high] = mem[indexes[high]];
0181 high = (1 << high);
0182
0183 case 5:
0184 low = _bit_scan_forward(bits);
0185 bits ^= high | (1 << low);
0186 v[low] = mem[indexes[low]];
0187
0188 case 4:
0189 high = _bit_scan_reverse(bits);
0190 v[high] = mem[indexes[high]];
0191 high = (1 << high);
0192
0193 case 3:
0194 low = _bit_scan_forward(bits);
0195 bits ^= high | (1 << low);
0196 v[low] = mem[indexes[low]];
0197
0198 case 2:
0199 high = _bit_scan_reverse(bits);
0200 v[high] = mem[indexes[high]];
0201
0202 case 1:
0203 low = _bit_scan_forward(bits);
0204 v[low] = mem[indexes[low]];
0205
0206 case 0:
0207 break;
0208 }
0209 }
0210 template <typename V, typename MT, typename IT>
0211 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
0212 V &v,
0213 const MT *mem,
0214 const IT &indexes,
0215 typename V::MaskArgument mask,
0216 enable_if<V::Size == 8> = nullarg)
0217 {
0218 unsigned int bits = mask.toInt();
0219 unsigned int low, high = 0;
0220 switch (Vc::Detail::popcnt8(bits)) {
0221 case 8:
0222 v.gather(mem, indexes);
0223 break;
0224 case 7:
0225 low = _bit_scan_forward(bits);
0226 bits ^= 1 << low;
0227 v[low] = mem[indexes[low]];
0228
0229 case 6:
0230 high = _bit_scan_reverse(bits);
0231 v[high] = mem[indexes[high]];
0232 high = (1 << high);
0233
0234 case 5:
0235 low = _bit_scan_forward(bits);
0236 bits ^= high | (1 << low);
0237 v[low] = mem[indexes[low]];
0238
0239 case 4:
0240 high = _bit_scan_reverse(bits);
0241 v[high] = mem[indexes[high]];
0242 high = (1 << high);
0243
0244 case 3:
0245 low = _bit_scan_forward(bits);
0246 bits ^= high | (1 << low);
0247 v[low] = mem[indexes[low]];
0248
0249 case 2:
0250 high = _bit_scan_reverse(bits);
0251 v[high] = mem[indexes[high]];
0252
0253 case 1:
0254 low = _bit_scan_forward(bits);
0255 v[low] = mem[indexes[low]];
0256
0257 case 0:
0258 break;
0259 }
0260 }
0261 template <typename V, typename MT, typename IT>
0262 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
0263 V &v,
0264 const MT *mem,
0265 const IT &indexes,
0266 typename V::MaskArgument mask,
0267 enable_if<V::Size == 4> = nullarg)
0268 {
0269 unsigned int bits = mask.toInt();
0270 unsigned int low, high = 0;
0271 switch (Vc::Detail::popcnt4(bits)) {
0272 case 4:
0273 v.gather(mem, indexes);
0274 break;
0275 case 3:
0276 low = _bit_scan_forward(bits);
0277 bits ^= 1 << low;
0278 v[low] = mem[indexes[low]];
0279
0280 case 2:
0281 high = _bit_scan_reverse(bits);
0282 v[high] = mem[indexes[high]];
0283
0284 case 1:
0285 low = _bit_scan_forward(bits);
0286 v[low] = mem[indexes[low]];
0287
0288 case 0:
0289 break;
0290 }
0291 }
0292 template <typename V, typename MT, typename IT>
0293 Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
0294 V &v,
0295 const MT *mem,
0296 const IT &indexes,
0297 typename V::MaskArgument mask,
0298 enable_if<V::Size == 2> = nullarg)
0299 {
0300 unsigned int bits = mask.toInt();
0301 unsigned int low;
0302 switch (Vc::Detail::popcnt4(bits)) {
0303 case 2:
0304 v.gather(mem, indexes);
0305 break;
0306 case 1:
0307 low = _bit_scan_forward(bits);
0308 v[low] = mem[indexes[low]];
0309
0310 case 0:
0311 break;
0312 }
0313 }
0314
0315 }
0316 }
0317
0318 #endif