// adv_simd.h - written and placed in the public domain by Jeffrey Walton

/// \file adv_simd.h
/// \brief Template for AdvancedProcessBlocks and SIMD processing

//    The SIMD-based implementations for ciphers that use SSE, NEON and Power7
//    have a common pattern. Namely, they have a specialized implementation of
//    AdvancedProcessBlocks which processes multiple blocks using hardware
//    acceleration. After several implementations we noticed a lot of copy and
//    paste occurring. adv_simd.h provides templates to avoid the copy and paste.
//
//    There are 7 templates provided in this file. The number following the
//    function name, 128, is the block size in bits. The name following the
//    block size is the arrangement and acceleration. For example 4x1_SSE means
//    Intel SSE using two encrypt (or decrypt) functions: one that operates on
//    4 SIMD words, and one that operates on 1 SIMD word.
//
//      * AdvancedProcessBlocks128_4x1_SSE
//      * AdvancedProcessBlocks128_6x2_SSE
//      * AdvancedProcessBlocks128_4x1_NEON
//      * AdvancedProcessBlocks128_6x1_NEON
//      * AdvancedProcessBlocks128_6x2_NEON
//      * AdvancedProcessBlocks128_4x1_ALTIVEC
//      * AdvancedProcessBlocks128_6x1_ALTIVEC
//
//    If an arrangement ends in 2, like 6x2, then the template will handle the
//    single-block case by padding with 0's and using the two-SIMD-word
//    function. This happens at most one time when processing multiple blocks.
//    The extra processing of a zero block is trivial and worth the tradeoff.
//
//    The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
//    of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
//    results in a failed link due to the const/non-const mismatch.
//
//    In July 2020 the library stopped using the 64-bit block version of
//    AdvancedProcessBlocks. Testing showed unreliable results and failed
//    self tests on occasion. Also see Issue 945 and
//    https://github.com/weidai11/cryptopp/commit/dd7598e638bb.
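//
//    A minimal sketch of how a cipher typically wires one of these templates,
//    with illustrative function names (the SIMON and SPECK SIMD sources in
//    this library follow this pattern):
//
//      size_t MyCipher_Enc_AdvancedProcessBlocks_NEON(const word64 *subKeys,
//          size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//          byte *outBlocks, size_t length, word32 flags)
//      {
//          return AdvancedProcessBlocks128_6x2_NEON(MyCipher_Enc_Block,
//              MyCipher_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks,
//              outBlocks, length, flags);
//      }
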
#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES

#include "config.h"
#include "misc.h"
#include "stdcpp.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <emmintrin.h>
# include <xmmintrin.h>
#endif

// SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <emmintrin.h>
# include <pmmintrin.h>
# include <xmmintrin.h>
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

// ************************ All block ciphers *********************** //

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::BlockTransformation;

CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput);
CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel);
CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter);
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection);
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers);

ANONYMOUS_NAMESPACE_END
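
// The BT_* constants above are local, unqualified copies of the
// BlockTransformation flags used by the templates below. As a typical
// (illustrative) combination: CTR mode processing sets BT_InBlockIsCounter
// and BT_AllowParallel, which is why each template special-cases the
// counter block and provides a wide parallel loop.
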
// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE) || (CRYPTOPP_ARM_ASIMD_AVAILABLE) || \
    defined(CRYPTOPP_DOXYGEN_PROCESSING)
NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
///  same word type.
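/// \details A sketch of the callback shapes this template expects, with
///  illustrative names (blocks are taken by reference and modified in place):
/// <pre>
///  void MyCipher_Enc_Block(uint64x2_t &block,
///      const word32 *subkeys, unsigned int rounds);
///  void MyCipher_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
///      uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
///      uint64x2_t &block5, const word32 *subkeys, unsigned int rounds);
/// </pre>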
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
            const W *subKeys, size_t rounds, const byte *inBlocks,
            const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

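    // Increment of 1 in big-endian compatible with the ctr byte array
    // (the same constant the SSE templates build with _mm_set_epi32 below).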
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. The vector type is
///  usually uint32x4_t or uint64x2_t. F1, F4, and W must use the same word and
///  vector type.
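/// \details An illustrative four-block callback (note the blocks here are
///  uint32x4_t values taken by reference, unlike the uint64x2_t blocks of
///  AdvancedProcessBlocks128_6x1_NEON):
/// <pre>
///  void MyCipher_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
///      uint32x4_t &block2, uint32x4_t &block3,
///      const word32 *subkeys, unsigned int rounds);
/// </pre>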
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
            const W *subKeys, size_t rounds, const byte *inBlocks,
            const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

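    // Increment of 1 in big-endian compatible with the ctr byte array.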
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                const uint32x4_t one = s_one;
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                block1 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block0), vreinterpretq_u64_u32(one)));
                block2 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block1), vreinterpretq_u64_u32(one)));
                block3 = vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(block2), vreinterpretq_u64_u32(one)));
                vst1q_u8(const_cast<byte*>(inBlocks), vreinterpretq_u8_u64(vaddq_u64(
                    vreinterpretq_u64_u32(block3), vreinterpretq_u64_u32(one))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_t block = vreinterpretq_u32_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u32(block, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u32(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
///  at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
///  same word type.
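/// \details In the single-block tail the template calls the two-block function
///  with a zero-valued second block, i.e. func2(block, zero, subkeys, rounds),
///  and stores only the first result.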
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
            const W *subKeys, size_t rounds, const byte *inBlocks,
            const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

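    // Increment of 1 in big-endian compatible with the ctr byte array.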
    const unsigned int w_one[] = {0, 0<<24, 0, 1<<24};
    const uint32x4_t s_one = vld1q_u32(w_one);

    const size_t blockSize = 16;
    // const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

// *************************** Intel SSE ************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
///  224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_CONST const
/// \brief SunCC workaround
/// \details SunCC loses the const on AES_Enc_Block and AES_Dec_Block
/// \sa <A HREF="http://github.com/weidai11/cryptopp/issues/224">Issue
///  224, SunCC and failed compile for rijndael.cpp</A>
# define MAYBE_UNCONST_CAST(T, x) (x)
#elif (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
# define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
#else
# define MAYBE_CONST const
# define MAYBE_UNCONST_CAST(T, x) (x)
#endif
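
// Usage note (illustrative): in the SSE templates below the subkey pointer is
// declared MAYBE_CONST W *subKeys. Under SunCC 5.13 and above MAYBE_CONST
// expands to nothing, so a caller holding a const subkey table can pass it
// through MAYBE_UNCONST_CAST(W*, table); on other compilers the cast is the
// identity.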

#if defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
///  20670, _mm_loadu_si128 parameter has wrong type</A>
# define M128_CAST(x) ((__m128i *)(void *)(x))
/// \brief Clang workaround
/// \details Clang issues spurious alignment warnings
/// \sa <A HREF="http://bugs.llvm.org/show_bug.cgi?id=20670">Issue
///  20670, _mm_loadu_si128 parameter has wrong type</A>
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#else
# ifndef M128_CAST
#  define M128_CAST(x) ((__m128i *)(void *)(x))
# endif
# ifndef CONST_M128_CAST
#  define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
# endif
#endif

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
///  at a time. For a single block the template uses F2 with a zero block.
/// \details The subkey type is usually word32 or word64. F2 and F6 must use the
///  same word type.
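/// \details An illustrative two-block callback (blocks are __m128i values
///  taken by reference; note the MAYBE_CONST on the subkey pointer):
/// <pre>
///  void MyCipher_Enc_2_Blocks(__m128i &block0, __m128i &block1,
///      MAYBE_CONST word32 *subkeys, unsigned int rounds);
/// </pre>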
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
        MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            __m128i block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                block4 = _mm_add_epi32(block3, s_one);
                block5 = _mm_add_epi32(block4, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block4);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block5);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            __m128i block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block, zero = _mm_setzero_si128();
        block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
///  same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
        MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // const size_t xmmBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Increment of 1 in big-endian compatible with the ctr byte array.
                const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                block1 = _mm_add_epi32(block0, s_one);
                block2 = _mm_add_epi32(block1, s_one);
                block3 = _mm_add_epi32(block2, s_one);
                _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
            }
            else
            {
                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            _mm_storeu_si128(M128_CAST(outBlocks), block0);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block1);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block2);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            _mm_storeu_si128(M128_CAST(outBlocks), block3);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));

        if (xorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));

        _mm_storeu_si128(M128_CAST(outBlocks), block);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END  // CryptoPP

#endif  // CRYPTOPP_SSSE3_AVAILABLE

// ************************** Altivec/Power 4 ************************** //

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

NAMESPACE_BEGIN(CryptoPP)

/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
///  same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
        const W *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one  = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif
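
    // Note: VecLoadBE loads the counter block as a big-endian byte array, but
    // vector element order differs between big- and little-endian targets.
    // The 32-bit lane holding counter byte 15 is element 3 on big-endian and
    // element 0 on little-endian, hence the two definitions of s_one above.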

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint32x4_p block0, block1, block2, block3;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                const_cast<byte*>(inBlocks)[15] += 4;
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func4(block0, block1, block2, block3, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
///  at a time.
/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
///  same word type.
template <typename F1, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
        const W *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one  = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif

    const size_t blockSize = 16;
    // const size_t simdBlockSize = 16;

    size_t inIncrement = (flags & (EnumToInt(BT_InBlockIsCounter)|EnumToInt(BT_DontIncrementInOutPointers))) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & EnumToInt(BT_DontIncrementInOutPointers)) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & EnumToInt(BT_XorInput));
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & EnumToInt(BT_XorInput));

    if (flags & BT_ReverseDirection)
    {
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint32x4_p block0, block1, block2, block3, block4, block5;

            if (flags & BT_InBlockIsCounter)
            {
                block0 = VecLoadBE(inBlocks);
                block1 = VecAdd(block0, s_one);
                block2 = VecAdd(block1, s_one);
                block3 = VecAdd(block2, s_one);
                block4 = VecAdd(block3, s_one);
                block5 = VecAdd(block4, s_one);

                // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
                // CTR_ModePolicy::OperateKeystream is wired such that after
                // returning from this function CTR_ModePolicy will detect wrap
                // on the last counter byte and increment the next to last byte.
                // The problem is, with a big-endian load, inBlocks[15] is really
                // located at index 15. The vector addition using a 32-bit element
                // generates a carry into inBlocks[14] and then CTR_ModePolicy
                // increments inBlocks[14] too.
                //
                // To find this bug we needed a test case with a ctr of 0xNN...FA.
                // The last octet is 0xFA and adding 6 creates the wrap to trigger
                // the issue. If the last octet was 0xFC then 4 would trigger it.
                // We dumb-lucked into the test with SPECK-128. The test case of
                // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
                uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
                VecStoreBE(temp, const_cast<byte*>(inBlocks));
            }
            else
            {
                block0 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = VecLoadBE(inBlocks);
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorOutput)
            {
                block0 = VecXor(block0, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = VecXor(block1, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = VecXor(block2, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = VecXor(block3, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = VecXor(block4, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = VecXor(block5, VecLoadBE(xorBlocks));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            VecStoreBE(block0, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block1, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block2, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block3, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block4, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            VecStoreBE(block5, outBlocks);
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint32x4_p block = VecLoadBE(inBlocks);

        if (xorInput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorOutput)
            block = VecXor(block, VecLoadBE(xorBlocks));

        VecStoreBE(block, outBlocks);

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}

NAMESPACE_END  // CryptoPP

#endif  // __ALTIVEC__

#endif  // CRYPTOPP_ADVANCED_SIMD_TEMPLATES