// adv_simd.h - written and placed in the public domain by Jeffrey Walton
//
// The SIMD implementations for ciphers that use SSE, NEON and Altivec share
// a common pattern: each provides a specialized AdvancedProcessBlocks that
// processes multiple blocks using hardware acceleration. After several
// implementations a lot of copy and paste was occurring, so this header
// provides templates to avoid it.
//
// The templates process 128-bit blocks. The number pair in a function name,
// like 6x2, states how many blocks the wide and the narrow kernels handle
// per call, and the suffix names the acceleration (NEON, SSE, ALTIVEC).
#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

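// Pull the BlockTransformation flag values into the anonymous namespace so
// the templates below can use the short names without qualification.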
CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);

ANONYMOUS_NAMESPACE_END

#if (CRYPTOPP_ARM_NEON_AVAILABLE) || (CRYPTOPP_ARM_ASIMD_AVAILABLE) || \
    defined(CRYPTOPP_DOXYGEN_PROCESSING)
NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 NEON SIMD words
///    at a time, falling back to F1 for the remaining blocks.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use
///    the same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

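    // w_one places 1<<24 in the last 32-bit lane. Reinterpreted as uint64x2_t
    // and added, it increments byte 15 of the vector, the low byte of the
    // big-endian counter; a wrap into byte 14 is left for the caller to fix.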
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;

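    // The input pointer is pinned when the in-block is a counter (it is
    // reread and updated in place) or when the caller pinned the pointers.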
    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

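    // BT_XorInput xors the xor-blocks into the input before the cipher runs;
    // otherwise the xor-blocks are applied to the cipher output.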
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

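    // Walk the buffers from the tail when the caller requires it, e.g. for
    // in-place modes where writing front-to-back would clobber unread input.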
    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
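                // Materialize six consecutive big-endian counters and write
                // the next starting counter back for the following pass.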
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

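        // Bump the low byte of the big-endian counter in place; the caller
        // (e.g. CTR_ModePolicy) detects the wrap and carries into byte 14.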
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

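// Example wiring (hypothetical; "MyCipher" and its kernel functions are
// illustrative, not part of this header). A cipher's AdvancedProcessBlocks
// override forwards its single-block and six-block NEON kernels here:
//
//   size_t MyCipher::Enc::AdvancedProcessBlocks(const byte *inBlocks,
//       const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
//   {
//       return AdvancedProcessBlocks128_6x1_NEON(MyCipher_Enc_Block,
//           MyCipher_Enc_6_Blocks, m_rk.begin(), m_rounds,
//           inBlocks, xorBlocks, outBlocks, length, flags);
//   }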

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 NEON SIMD words
///    at a time, falling back to F1 for the remaining blocks.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use
///    the same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
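                // Same counter scheme as the 6x1 template, but the working
                // type is uint32x4_t, so each add round-trips through a
                // 64-bit reinterpret.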
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD
///    words at a time. For a single block the template uses F2 with a zero
///    block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use
///    the same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
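        // func2 always expects two blocks, so the single-block tail pairs
        // the real block with a throw-away zero block.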
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSSE3_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief SunCC workaround
/// \details SunCC loses the const-ness of the subkey table when the
///    templates below are instantiated, so MAYBE_CONST and
///    MAYBE_UNCONST_CAST paper over the difference on that compiler.
# define MAYBE_CONST const
/// \brief SunCC workaround
# define MAYBE_UNCONST_CAST(T, x) (x)
#elif (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Cast a byte pointer to an __m128i pointer
/// \details The cast is made through a void pointer to sidestep
///    strict-aliasing warnings.
# define M128_CAST(x) ((__m128i *)(void *)(x))
/// \brief Cast a const byte pointer to a const __m128i pointer
/// \details The cast is made through a void pointer to sidestep
///    strict-aliasing warnings.
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#else
# ifndef M128_CAST
#  define M128_CAST(x) ((__m128i *)(void *)(x))
# endif
# ifndef CONST_M128_CAST
#  define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
# endif
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD
///    words at a time. For a single block the template uses F2 with a zero
///    block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use
///    the same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

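    // Same skeleton as the NEON templates above, with __m128i blocks and
    // unaligned SSE2 loads/stores; s_one is built inside each counter branch
    // with _mm_set_epi32 so byte 15 of the block receives the increment.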
    const size_t blockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

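        // Fold the remaining pairs with the 2-block kernel before dropping
        // to the single-block tail below.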
        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}


/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 SSE SIMD words
///    at a time, falling back to F1 for the remaining blocks.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use
///    the same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
    MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END

#endif  // CRYPTOPP_SSSE3_AVAILABLE

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 Altivec SIMD
///    words at a time, falling back to F1 for the remaining blocks.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use
///    the same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

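    // VecLoadBE loads the counter in big-endian order. The lane order of the
    // literal differs with host endianness, so the +1 always lands in the
    // last 32-bit word of the loaded block.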
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe
                // ARM-BE). CTR_ModePolicy::OperateKeystream expects the
                // in-memory counter to advance by the number of blocks
                // consumed, and it detects a wrap of the last byte itself.
                // The vector adds above only touch registers, so bump the
                // stored counter directly by the four blocks processed.
                const_cast<byte*>(inBlocks)[15] += 4;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 Altivec SIMD
///    words at a time, falling back to F1 for the remaining blocks.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use
///    the same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe
                // ARM-BE). CTR_ModePolicy::OperateKeystream expects the
                // stored counter to advance by the blocks consumed, with a
                // wrap of the last byte left for it to detect. The byte-wise
                // VecAdd advances byte 15 one past block5 (six in total)
                // without a word-element carry spilling into byte 14.
                uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END

#endif  // __ALTIVEC__

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES