include/cryptopp/ppc_simd.h

0001 // ppc_simd.h - written and placed in public domain by Jeffrey Walton

0002
0003 /// \file ppc_simd.h

0004 /// \brief Support functions for PowerPC and vector operations

0005 /// \details This header provides an agnostic interface into Clang, GCC

0006 ///  and IBM XL C/C++ compilers modulo their different built-in functions

0007 ///  for accessing vector instructions.

0008 /// \details The abstractions are necessary to support back to GCC 4.8 and

0009 ///  XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the

0010 ///  default compiler for GCC112, GCC119 and others on the compile farm.

0011 ///  Older IBM XL C/C++ compilers also have the need due to lack of

0012 ///  <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern

0013 ///  compilers provide best support and don't need many of the hacks

0014 ///  below.

0015 /// \details The library is tested with the following PowerPC machines and

0016 ///  compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by

0017 ///  the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>

0018 ///  - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0

0019 ///  - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0

0020 ///  - GCC110, Linux, POWER7, GCC 4.8.5

0021 ///  - GCC110, Linux, POWER7, XLC 12.01

0022 ///  - GCC111, AIX, POWER7, GCC 4.8.1

0023 ///  - GCC111, AIX, POWER7, XLC 12.01

0024 ///  - GCC112, Linux, POWER8, GCC 4.8.5

0025 ///  - GCC112, Linux, POWER8, XLC 13.01

0026 ///  - GCC112, Linux, POWER8, Clang 7.0

0027 ///  - GCC119, AIX, POWER8, GCC 7.2.0

0028 ///  - GCC119, AIX, POWER8, XLC 13.01

0029 ///  - GCC135, Linux, POWER9, GCC 7.0

0030 /// \details 12 machines are used for testing because the three compilers form

0031 ///  five or six profiles. The profiles are listed below.

0032 ///  - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)

0033 ///  - XLC 13.0 and earlier (all IBM components)

0034 ///  - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)

0035 ///  - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)

0036 ///  - early LLVM Clang (traditional Clang compiler)

0037 ///  - late LLVM Clang (traditional Clang compiler)

0038 /// \details The LLVM front-end makes it tricky to write portable code because

0039 ///  LLVM pretends to be other compilers but cannot consume other compilers'

0040 ///  builtins. When using XLC with -qxlcompatmacros the compiler pretends to

0041 ///  be GCC, Clang and XLC all at once but it can only consume its variety

0042 ///  of builtins.

0043 /// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were

0044 ///  renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was

0045 ///  changed to <tt>VecAnd</tt>. The name change helped consolidate two

0046 ///  slightly different implementations.

0047 /// \details At Crypto++ 8.3 the library added select 64-bit functions for

0048 ///  32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>

0049 ///  take 32-bit vectors and add or subtract them as if they were vectors

0050 ///  with two 64-bit elements. The functions dramatically improve performance

0051 ///  for some algorithms on some platforms, like SIMON128 and SPECK128 on

0052 ///  Power6 and earlier. For example, SPECK128 improved from 70 cpb to

0053 ///  10 cpb on an old PowerMac. Use the functions like shown below.

0054 ///  <pre>

0055 ///    \#if defined(_ARCH_PWR8)

0056 ///    \#  define speck128_t uint64x2_p

0057 ///    \#else

0058 ///    \#  define speck128_t uint32x4_p

0059 ///    \#endif

0060 ///

0061 ///    speck128_t rk, x1, x2, y1, y2;

0062 ///    rk = (speck128_t)VecLoadAligned(ptr);

0063 ///    x1 = VecRotateRight64<8>(x1);

0064 ///    x1 = VecAdd64(x1, y1);

0065 ///    ...</pre>

0066 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0

0067
0068 // Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting

0069 // actual availability of the feature for the source file being compiled.

0070 // The preprocessor macros depend on compiler options like -maltivec; and

0071 // not compiler versions.

0072
0073 // For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html

0074 // For XLC see the Compiler Reference manual. For Clang you have to experiment.

0075 // Clang does not document the compiler options, does not reject options it does

0076 // not understand, and pretends to be other compilers even though it cannot

0077 // process the builtins and intrinsics. Clang will waste hours of your time.

0078
0079 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the

0080 // code paths guarded by preprocessor macros because XLC 12 generates

0081 // bad code in some places. To verify the bad code generation test on

0082 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.

0083 //

0084 //   inline uint32x4_p VecLoad(const byte src[16])

0085 //   {

0086 //   #if defined(__VSX__) || defined(_ARCH_PWR8)

0087 //       return (uint32x4_p) *(uint8x16_p*)((byte*)src);

0088 //   #else

0089 //       return VecLoad_ALTIVEC(src);

0090 //   #endif

0091 //   }

0092
0093 // We should be able to perform the load using inline asm on Power7 with

0094 // VSX or Power8. The inline asm will avoid C undefined behavior due to

0095 // casting from byte* to word32*. We are safe because our byte* are

0096 // 16-byte aligned for Altivec. Below is the big endian load. Little

0097 // endian would need to follow with xxpermdi for the reversal.

0098 //

0099 //   __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );

0100
0101 // GCC and XLC use integer math for the address (D-form or byte-offset

0102 // in the ISA manual). LLVM uses pointer math for the address (DS-form

0103 // or indexed in the ISA manual). To keep them consistent we calculate

0104 // the address from the offset and pass to a load or store function

0105 // using a 0 offset.

0106
0107 #ifndef CRYPTOPP_PPC_CRYPTO_H
0108 #define CRYPTOPP_PPC_CRYPTO_H
0109
0110 #include "config.h"
0111 #include "misc.h"
0112
0113 #if defined(__ALTIVEC__)
0114 # include <altivec.h>
0115 # undef vector
0116 # undef pixel
0117 # undef bool
0118 #endif
0119
0120 // XL C++ on AIX does not define VSX and does not

0121 // provide an option to set it. We have to set it

0122 // for the code below. This define must stay in

0123 // sync with the define in test_ppc_power7.cpp.

0124 #ifndef CRYPTOPP_DISABLE_POWER7
0125 # if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
0126 #  define __VSX__ 1
0127 # endif
0128 #endif
0129
0130 // XL C++ on AIX does not define CRYPTO and does not

0131 // provide an option to set it. We have to set it

0132 // for the code below. This define must stay in

0133 // sync with the define in test_ppc_power8.cpp

0134 #ifndef CRYPTOPP_DISABLE_POWER8
0135 # if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
0136 #  define __CRYPTO__ 1
0137 # endif
0138 #endif
0139
0140 /// \brief Cast a const byte array to a non-const byte pointer
0141 /// \details CONST_V8_CAST casts its argument to <tt>unsigned char*</tt>
0142 ///  so it can be handed to a vector load built-in. The Power ABI says source arrays
0143 ///  are non-const, so this define removes the const. XLC++ will
0144 ///  fail the compile if the source array is const. The argument may also be a <tt>uintptr_t</tt> address.
0145 #define CONST_V8_CAST(x)  ((unsigned char*)(x))
0146 /// \brief Cast a const word array to a non-const word pointer
0147 /// \details CONST_V32_CAST casts its argument to <tt>unsigned int*</tt>
0148 ///  so it can be handed to a vector load built-in. The Power ABI says source arrays
0149 ///  are non-const, so this define removes the const. XLC++ will
0150 ///  fail the compile if the source array is const. The argument may also be a <tt>uintptr_t</tt> address.
0151 #define CONST_V32_CAST(x) ((unsigned int*)(x))
0152 /// \brief Cast a const double word array to a non-const double word pointer
0153 /// \details CONST_V64_CAST casts its argument to
0154 ///  <tt>unsigned long long*</tt>. The Power ABI says source arrays
0155 ///  are non-const, so this define removes the const. XLC++ will
0156 ///  fail the compile if the source array is const.
0157 #define CONST_V64_CAST(x) ((unsigned long long*)(x))
0158 /// \brief Cast a byte array to a byte pointer
0159 /// \details NCONST_V8_CAST casts its argument to <tt>unsigned char*</tt>.
0160 ///  The expansion is the same as CONST_V8_CAST. The Power ABI says arrays
0161 ///  are non-const. NOTE(review): the name suggests this variant is meant
0162 ///  for arrays that are already non-const - confirm against the callers.
0163 #define NCONST_V8_CAST(x)  ((unsigned char*)(x))
0164 /// \brief Cast a word array to a word pointer
0165 /// \details NCONST_V32_CAST casts its argument to <tt>unsigned int*</tt>.
0166 ///  The expansion is the same as CONST_V32_CAST. The Power ABI says arrays
0167 ///  are non-const. NOTE(review): the name suggests this variant is meant
0168 ///  for arrays that are already non-const - confirm against the callers.
0169 #define NCONST_V32_CAST(x) ((unsigned int*)(x))
0170 /// \brief Cast a double word array to a double word pointer
0171 /// \details NCONST_V64_CAST casts its argument to <tt>unsigned long long*</tt>.
0172 ///  The expansion is the same as CONST_V64_CAST. The Power ABI says arrays
0173 ///  are non-const. NOTE(review): the name suggests this variant is meant
0174 ///  for arrays that are already non-const - confirm against the callers.
0175 #define NCONST_V64_CAST(x) ((unsigned long long*)(x))
0176
0177 // VecLoad_ALTIVEC and VecStore_ALTIVEC are

0178 // too noisy on modern compilers

0179 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
0180 # pragma GCC diagnostic push
0181 # pragma GCC diagnostic ignored "-Wdeprecated"
0182 #endif
0183
0184 NAMESPACE_BEGIN(CryptoPP)
0185
0186 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
0187
0188 /// \brief Vector of sixteen unsigned 8-bit elements
0189 /// \par Wraps
0190 ///  __vector unsigned char
0191 /// \since Crypto++ 6.0
0192 typedef __vector unsigned char   uint8x16_p;
0193 /// \brief Vector of eight unsigned 16-bit elements
0194 /// \par Wraps
0195 ///  __vector unsigned short
0196 /// \since Crypto++ 6.0
0197 typedef __vector unsigned short  uint16x8_p;
0198 /// \brief Vector of four unsigned 32-bit elements
0199 /// \par Wraps
0200 ///  __vector unsigned int
0201 /// \since Crypto++ 6.0
0202 typedef __vector unsigned int    uint32x4_p;
0203
0204 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
0205 /// \brief Vector of two unsigned 64-bit elements
0206 /// \details uint64x2_p is only declared when <tt>__VSX__</tt> or <tt>_ARCH_PWR8</tt> is defined, that is POWER7 with VSX and above. Most
0207 ///  supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
0208 ///  and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
0209 /// \par Wraps
0210 ///  __vector unsigned long long
0211 /// \since Crypto++ 6.0
0212 typedef __vector unsigned long long uint64x2_p;
0213 #endif  // VSX or ARCH_PWR8

0214
0215 /// \brief The 0 vector

0216 /// \return a 32-bit vector of 0's

0217 /// \since Crypto++ 8.0

0218 inline uint32x4_p VecZero()
0219 {
0220     const uint32x4_p v = {0,0,0,0};
0221     return v;
0222 }
0223
0224 /// \brief The 1 vector

0225 /// \return a 32-bit vector of 1's

0226 /// \since Crypto++ 8.0

0227 inline uint32x4_p VecOne()
0228 {
0229     const uint32x4_p v = {1,1,1,1};
0230     return v;
0231 }
0232
0233 /// \brief Reverse bytes in a vector

0234 /// \tparam T vector type

0235 /// \param data the vector

0236 /// \return vector

0237 /// \details VecReverse() reverses the bytes in a vector

0238 /// \par Wraps

0239 ///  vec_perm

0240 /// \since Crypto++ 6.0

0241 template <class T>
0242 inline T VecReverse(const T data)
0243 {
0244 #if defined(CRYPTOPP_BIG_ENDIAN)
0245     const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
0246     return (T)vec_perm(data, data, mask);
0247 #else
0248     const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
0249     return (T)vec_perm(data, data, mask);
0250 #endif
0251 }
0252
/// \brief Reverse bytes in a vector on little-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector on little-endian systems, otherwise
///  <tt>data</tt> unchanged
/// \details VecReverseLE() reverses the bytes in a vector when
///  <tt>CRYPTOPP_LITTLE_ENDIAN</tt> is defined. On other systems the
///  function is a no-op.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    // Nothing to do on big-endian systems.
    return data;
#else
    // Byte i of the result is byte 15-i of the input.
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reversal);
#endif
}
0272
/// \brief Reverse bytes in a vector on big-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector on big-endian systems, otherwise
///  <tt>data</tt> unchanged
/// \details VecReverseBE() reverses the bytes in a vector when
///  <tt>CRYPTOPP_BIG_ENDIAN</tt> is defined. On other systems the
///  function is a no-op.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Nothing to do on little-endian systems.
    return data;
#else
    // Byte i of the result is byte 15-i of the input.
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reversal);
#endif
}
0292
0293 /// \name LOAD OPERATIONS

0294 //@{

0295
0296 /// \brief Loads a vector from a byte array

0297 /// \param src the byte array

0298 /// \details Loads a vector in native endian format from a byte array.

0299 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address

0300 ///  of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,

0301 ///  <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using

0302 ///  <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so

0303 ///  you should provide aligned memory addresses.

0304 /// \par Wraps

0305 ///  vec_ld, vec_lvsl, vec_perm

0306 /// \sa VecLoad, VecLoadAligned

0307 /// \since Crypto++ 6.0

0308 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
0309 {
0310     // Avoid IsAlignedOn for convenience.

0311     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0312     if (addr % 16 == 0)
0313     {
0314         return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
0315     }
0316     else
0317     {
0318         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf

0319         const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
0320         const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
0321         const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
0322         return (uint32x4_p)vec_perm(low, high, perm);
0323     }
0324 }
0325
0326 /// \brief Loads a vector from a byte array

0327 /// \param src the byte array

0328 /// \param off offset into the src byte array

0329 /// \details Loads a vector in native endian format from a byte array.

0330 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address

0331 ///  of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,

0332 ///  <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.

0333 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are

0334 ///  relatively expensive so you should provide aligned memory addresses.

0335 /// \par Wraps

0336 ///  vec_ld, vec_lvsl, vec_perm

0337 /// \sa VecLoad, VecLoadAligned

0338 /// \since Crypto++ 6.0

0339 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
0340 {
0341     // Avoid IsAlignedOn for convenience.

0342     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0343     if (addr % 16 == 0)
0344     {
0345         return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
0346     }
0347     else
0348     {
0349         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf

0350         const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
0351         const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
0352         const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
0353         return (uint32x4_p)vec_perm(low, high, perm);
0354     }
0355 }
0356
0357 /// \brief Loads a vector from a byte array

0358 /// \param src the byte array

0359 /// \details VecLoad() loads a vector from a byte array.

0360 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.

0361 ///  The instruction does not require aligned effective memory addresses.

0362 ///  VecLoad_ALTIVEC() is used if POWER9 is not available.

0363 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions

0364 ///  are required to fix up unaligned memory addresses.

0365 /// \par Wraps

0366 ///  vec_xl on POWER9 and above, Altivec load on POWER8 and below

0367 /// \sa VecLoad_ALTIVEC, VecLoadAligned

0368 /// \since Crypto++ 6.0

0369 inline uint32x4_p VecLoad(const byte src[16])
0370 {
0371     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0372     // word pointers. The ISA lacks loads for short* and char*.

0373     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0374
0375     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0376     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
0377     CRYPTOPP_UNUSED(addr);
0378
0379 #if defined(_ARCH_PWR9)
0380     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
0381 #else
0382     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0383 #endif
0384 }
0385
0386 /// \brief Loads a vector from a byte array

0387 /// \param src the byte array

0388 /// \param off offset into the src byte array

0389 /// \details VecLoad() loads a vector from a byte array.

0390 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.

0391 ///  The instruction does not require aligned effective memory addresses.

0392 ///  VecLoad_ALTIVEC() is used if POWER9 is not available.

0393 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions

0394 ///  are required to fix up unaligned memory addresses.

0395 /// \par Wraps

0396 ///  vec_xl on POWER9 and above, Altivec load on POWER8 and below

0397 /// \sa VecLoad_ALTIVEC, VecLoadAligned

0398 /// \since Crypto++ 6.0

0399 inline uint32x4_p VecLoad(int off, const byte src[16])
0400 {
0401     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0402     // word pointers. The ISA lacks loads for short* and char*.

0403     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0404
0405     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0406     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
0407     CRYPTOPP_UNUSED(addr);
0408
0409 #if defined(_ARCH_PWR9)
0410     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
0411 #else
0412     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0413 #endif
0414 }
0415
0416 /// \brief Loads a vector from a word array

0417 /// \param src the word array

0418 /// \details VecLoad() loads a vector from a word array.

0419 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.

0420 ///  The instruction does not require aligned effective memory addresses.

0421 ///  VecLoad_ALTIVEC() is used if POWER7 is not available.

0422 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions

0423 ///  are required to fix up unaligned memory addresses.

0424 /// \par Wraps

0425 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below

0426 /// \sa VecLoad_ALTIVEC, VecLoadAligned

0427 /// \since Crypto++ 8.0

0428 inline uint32x4_p VecLoad(const word32 src[4])
0429 {
0430     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0431     // word pointers. The ISA lacks loads for short* and char*.

0432     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0433
0434     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0435     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
0436     CRYPTOPP_UNUSED(addr);
0437
0438 #if defined(_ARCH_PWR9)
0439     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
0440 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0441     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
0442 #else
0443     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0444 #endif
0445 }
0446
0447 /// \brief Loads a vector from a word array

0448 /// \param src the word array

0449 /// \param off offset into the word array

0450 /// \details VecLoad() loads a vector from a word array.

0451 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.

0452 ///  The instruction does not require aligned effective memory addresses.

0453 ///  VecLoad_ALTIVEC() is used if POWER7 is not available.

0454 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions

0455 ///  are required to fix up unaligned memory addresses.

0456 /// \par Wraps

0457 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below

0458 /// \sa VecLoad_ALTIVEC, VecLoadAligned

0459 /// \since Crypto++ 8.0

0460 inline uint32x4_p VecLoad(int off, const word32 src[4])
0461 {
0462     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0463     // word pointers. The ISA lacks loads for short* and char*.

0464     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0465
0466     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0467     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
0468     CRYPTOPP_UNUSED(addr);
0469
0470 #if defined(_ARCH_PWR9)
0471     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
0472 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0473     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
0474 #else
0475     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0476 #endif
0477 }
0478
0479 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
0480
0481 /// \brief Loads a vector from a double word array

0482 /// \param src the double word array

0483 /// \details VecLoad() loads a vector from a double word array.

0484 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.

0485 ///  The instruction does not require aligned effective memory addresses.

0486 ///  VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.

0487 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions

0488 ///  are required to fix up unaligned memory addresses.

0489 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.

0490 /// \par Wraps

0491 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below

0492 /// \sa VecLoad_ALTIVEC, VecLoadAligned

0493 /// \since Crypto++ 8.0

0494 inline uint64x2_p VecLoad(const word64 src[2])
0495 {
0496     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0497     // word pointers. The ISA lacks loads for short* and char*.

0498     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0499
0500     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0501     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
0502     CRYPTOPP_UNUSED(addr);
0503
0504 #if defined(_ARCH_PWR9)
0505     return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
0506 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0507     // The 32-bit cast is not a typo. Compiler workaround.

0508     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
0509 #else
0510     return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0511 #endif
0512 }
0513
0514 /// \brief Loads a vector from a double word array

0515 /// \param src the double word array

0516 /// \param off offset into the double word array

0517 /// \details VecLoad() loads a vector from a double word array.

0518 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.

0519 ///  The instruction does not require aligned effective memory addresses.

0520 ///  VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.

0521 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions

0522 ///  are required to fix up unaligned memory addresses.

0523 /// \details VecLoad() with 64-bit elements is available on POWER8 and above.

0524 /// \par Wraps

0525 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below

0526 /// \sa VecLoad_ALTIVEC, VecLoadAligned

0527 /// \since Crypto++ 8.0

0528 inline uint64x2_p VecLoad(int off, const word64 src[2])
0529 {
0530     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0531     // word pointers. The ISA lacks loads for short* and char*.

0532     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0533
0534     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0535     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
0536     CRYPTOPP_UNUSED(addr);
0537
0538 #if defined(_ARCH_PWR9)
0539     return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
0540 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0541     // The 32-bit cast is not a typo. Compiler workaround.

0542     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
0543 #else
0544     return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0545 #endif
0546 }
0547
0548 #endif  // VSX or ARCH_PWR8

0549
0550 /// \brief Loads a vector from an aligned byte array

0551 /// \param src the byte array

0552 /// \details VecLoadAligned() loads a vector from an aligned byte array.

0553 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.

0554 ///  <tt>vec_ld</tt> is used if POWER9 is not available. The effective

0555 ///  address of <tt>src</tt> must be 16-byte aligned for Altivec.

0556 /// \par Wraps

0557 ///  vec_xl on POWER9, vec_ld on POWER8 and below

0558 /// \sa VecLoad_ALTIVEC, VecLoad

0559 /// \since Crypto++ 8.0

0560 inline uint32x4_p VecLoadAligned(const byte src[16])
0561 {
0562     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0563     // word pointers. The ISA lacks loads for short* and char*.

0564     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0565
0566     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0567     CRYPTOPP_ASSERT(addr % 16 == 0);
0568     CRYPTOPP_UNUSED(addr);
0569
0570 #if defined(_ARCH_PWR9)
0571     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
0572 #else
0573     return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
0574 #endif
0575 }
0576
0577 /// \brief Loads a vector from an aligned byte array

0578 /// \param src the byte array

0579 /// \param off offset into the src byte array

0580 /// \details VecLoadAligned() loads a vector from an aligned byte array.

0581 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.

0582 ///  <tt>vec_ld</tt> is used if POWER9 is not available. The effective

0583 ///  address of <tt>src</tt> must be 16-byte aligned for Altivec.

0584 /// \par Wraps

0585 ///  vec_xl on POWER9, vec_ld on POWER8 and below

0586 /// \sa VecLoad_ALTIVEC, VecLoad

0587 /// \since Crypto++ 8.0

0588 inline uint32x4_p VecLoadAligned(int off, const byte src[16])
0589 {
0590     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0591     // word pointers. The ISA lacks loads for short* and char*.

0592     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0593
0594     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0595     CRYPTOPP_ASSERT(addr % 16 == 0);
0596     CRYPTOPP_UNUSED(addr);
0597
0598 #if defined(_ARCH_PWR9)
0599     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
0600 #else
0601     return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
0602 #endif
0603 }
0604
0605 /// \brief Loads a vector from an aligned word array

0606 /// \param src the word array

0607 /// \details VecLoadAligned() loads a vector from an aligned word array.

0608 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if

0609 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.

0610 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.

0611 /// \par Wraps

0612 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below

0613 /// \sa VecLoad_ALTIVEC, VecLoad

0614 /// \since Crypto++ 8.0

0615 inline uint32x4_p VecLoadAligned(const word32 src[4])
0616 {
0617     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0618     // word pointers. The ISA lacks loads for short* and char*.

0619     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0620
0621     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0622     CRYPTOPP_ASSERT(addr % 16 == 0);
0623     CRYPTOPP_UNUSED(addr);
0624
0625 #if defined(_ARCH_PWR9)
0626     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
0627 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0628     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
0629 #else
0630     return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
0631 #endif
0632 }
0633
0634 /// \brief Loads a vector from an aligned word array

0635 /// \param src the word array

0636 /// \param off offset into the src word array

0637 /// \details VecLoadAligned() loads a vector from an aligned word array.

0638 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if

0639 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.

0640 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.

0641 /// \par Wraps

0642 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below

0643 /// \sa VecLoad_ALTIVEC, VecLoad

0644 /// \since Crypto++ 8.0

0645 inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
0646 {
0647     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0648     // word pointers. The ISA lacks loads for short* and char*.

0649     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0650
0651     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0652     CRYPTOPP_ASSERT(addr % 16 == 0);
0653     CRYPTOPP_UNUSED(addr);
0654
0655 #if defined(_ARCH_PWR9)
0656     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
0657 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0658     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
0659 #else
0660     return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
0661 #endif
0662 }
0663
0664 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
0665
0666 /// \brief Loads a vector from an aligned double word array

0667 /// \param src the double word array

0668 /// \details VecLoadAligned() loads a vector from an aligned double word array.

0669 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if

0670 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.

0671 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.

0672 /// \par Wraps

0673 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below

0674 /// \sa VecLoad_ALTIVEC, VecLoad

0675 /// \since Crypto++ 8.0

0676 inline uint64x2_p VecLoadAligned(const word64 src[4])
0677 {
0678     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0679     // word pointers. The ISA lacks loads for short* and char*.

0680     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0681
0682     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0683     CRYPTOPP_ASSERT(addr % 16 == 0);
0684     CRYPTOPP_UNUSED(addr);
0685
0686 #if defined(_ARCH_PWR9)
0687     return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
0688 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0689     // The 32-bit cast is not a typo. Compiler workaround.

0690     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
0691 #else
0692     return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
0693 #endif
0694 }
0695
0696 /// \brief Loads a vector from an aligned double word array

0697 /// \param src the double word array

0698 /// \param off offset into the src double word array

0699 /// \details VecLoadAligned() loads a vector from an aligned double word array.

0700 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if

0701 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.

0702 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.

0703 /// \par Wraps

0704 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below

0705 /// \sa VecLoad_ALTIVEC, VecLoad

0706 /// \since Crypto++ 8.0

0707 inline uint64x2_p VecLoadAligned(int off, const word64 src[4])
0708 {
0709     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0710     // word pointers. The ISA lacks loads for short* and char*.

0711     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0712
0713     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0714     CRYPTOPP_ASSERT(addr % 16 == 0);
0715     CRYPTOPP_UNUSED(addr);
0716
0717 #if defined(_ARCH_PWR9)
0718     return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
0719 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0720     // The 32-bit cast is not a typo. Compiler workaround.

0721     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
0722 #else
0723     return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
0724 #endif
0725 }
0726
0727 #endif
0728
0729 /// \brief Loads a vector from a byte array

0730 /// \param src the byte array

0731 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE

0732 ///  will reverse all bytes in the array on a little endian system.

/// \details VecLoadBE() uses POWER9's <tt>vec_xl_be</tt> if available.
///  The instruction does not require aligned effective memory addresses.
///  VecLoad_ALTIVEC() is used if POWER9 is not available.
///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
///  are required to fix up unaligned memory addresses.
/// \par Wraps
///  vec_xl_be on POWER9 and above, Altivec load on POWER8 and below

0740 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned

0741 /// \since Crypto++ 6.0

0742 inline uint32x4_p VecLoadBE(const byte src[16])
0743 {
0744     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0745     // word pointers. The ISA lacks loads for short* and char*.

0746     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0747
0748     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
0749     // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);

0750     CRYPTOPP_UNUSED(addr);
0751
0752 #if defined(_ARCH_PWR9)
0753     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
0754     return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
0755 #elif defined(CRYPTOPP_BIG_ENDIAN)
0756     return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
0757 #else
0758     return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
0759 #endif
0760 }
0761
0762 /// \brief Loads a vector from a byte array

0763 /// \param src the byte array

0764 /// \param off offset into the src byte array

0765 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE

0766 ///  will reverse all bytes in the array on a little endian system.

/// \details VecLoadBE() uses POWER9's <tt>vec_xl_be</tt> if available.
///  The instruction does not require aligned effective memory addresses.
///  VecLoad_ALTIVEC() is used if POWER9 is not available.
///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
///  are required to fix up unaligned memory addresses.
/// \par Wraps
///  vec_xl_be on POWER9 and above, Altivec load on POWER8 and below

0774 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned

0775 /// \since Crypto++ 6.0

0776 inline uint32x4_p VecLoadBE(int off, const byte src[16])
0777 {
0778     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0779     // word pointers. The ISA lacks loads for short* and char*.

0780     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0781
0782     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
0783     // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);

0784     CRYPTOPP_UNUSED(addr);
0785
0786 #if defined(_ARCH_PWR9)
0787     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
0788     return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
0789 #elif defined(CRYPTOPP_BIG_ENDIAN)
0790     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
0791 #else
0792     return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
0793 #endif
0794 }
0795
0796 //@}

0797
0798 /// \name STORE OPERATIONS

0799 //@{

0800
0801 /// \brief Stores a vector to a byte array

0802 /// \tparam T vector type

0803 /// \param data the vector

0804 /// \param dest the byte array

0805 /// \details VecStore_ALTIVEC() stores a vector to a byte array.

0806 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address

0807 ///  of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.

0808 ///  <tt>vec_ste</tt> is relatively expensive so you should provide aligned

0809 ///  memory addresses.

/// \details VecStore_ALTIVEC() is used when POWER7 or above
///  and its unaligned stores are not available.

0812 /// \par Wraps

0813 ///  vec_st, vec_ste, vec_lvsr, vec_perm

0814 /// \sa VecStore, VecStoreAligned

0815 /// \since Crypto++ 8.0

0816 template<class T>
0817 inline void VecStore_ALTIVEC(const T data, byte dest[16])
0818 {
0819     // Avoid IsAlignedOn for convenience.

0820     uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
0821     if (addr % 16 == 0)
0822     {
0823         vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
0824     }
0825     else
0826     {
0827         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf

0828         uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
0829         vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
0830         vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
0831         vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
0832         vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
0833         vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
0834         vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
0835         vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
0836         vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
0837     }
0838 }
0839
0840 /// \brief Stores a vector to a byte array

0841 /// \tparam T vector type

0842 /// \param data the vector

0843 /// \param off offset into the dest byte array

0844 /// \param dest the byte array

0845 /// \details VecStore_ALTIVEC() stores a vector to a byte array.

0846 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address

0847 ///  of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.

0848 ///  <tt>vec_ste</tt> is relatively expensive so you should provide aligned

0849 ///  memory addresses.

/// \details VecStore_ALTIVEC() is used when POWER7 or above
///  and its unaligned stores are not available.

0852 /// \par Wraps

0853 ///  vec_st, vec_ste, vec_lvsr, vec_perm

0854 /// \sa VecStore, VecStoreAligned

0855 /// \since Crypto++ 8.0

0856 template<class T>
0857 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
0858 {
0859     // Avoid IsAlignedOn for convenience.

0860     uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
0861     if (addr % 16 == 0)
0862     {
0863         vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
0864     }
0865     else
0866     {
0867         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf

0868         uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
0869         vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
0870         vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
0871         vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
0872         vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
0873         vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
0874         vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
0875         vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
0876         vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
0877     }
0878 }
0879
0880 /// \brief Stores a vector to a byte array

0881 /// \tparam T vector type

0882 /// \param data the vector

0883 /// \param dest the byte array

0884 /// \details VecStore() stores a vector to a byte array.

0885 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.

0886 ///  The instruction does not require aligned effective memory addresses.

0887 ///  VecStore_ALTIVEC() is used if POWER9 is not available.

0888 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

0889 ///  are required to fix up unaligned memory addresses.

0890 /// \par Wraps

0891 ///  vec_xst on POWER9 and above, Altivec store on POWER8 and below

0892 /// \sa VecStore_ALTIVEC, VecStoreAligned

0893 /// \since Crypto++ 6.0

0894 template<class T>
0895 inline void VecStore(const T data, byte dest[16])
0896 {
0897     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0898     // word pointers. The ISA lacks loads for short* and char*.

0899     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0900
0901     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
0902     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
0903     CRYPTOPP_UNUSED(addr);
0904
0905 #if defined(_ARCH_PWR9)
0906     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
0907 #else
0908     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
0909 #endif
0910 }
0911
0912 /// \brief Stores a vector to a byte array

0913 /// \tparam T vector type

0914 /// \param data the vector

0915 /// \param off offset into the dest byte array

0916 /// \param dest the byte array

0917 /// \details VecStore() stores a vector to a byte array.

0918 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.

0919 ///  The instruction does not require aligned effective memory addresses.

0920 ///  VecStore_ALTIVEC() is used if POWER9 is not available.

0921 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

0922 ///  are required to fix up unaligned memory addresses.

0923 /// \par Wraps

0924 ///  vec_xst on POWER9 and above, Altivec store on POWER8 and below

0925 /// \sa VecStore_ALTIVEC, VecStoreAligned

0926 /// \since Crypto++ 6.0

0927 template<class T>
0928 inline void VecStore(const T data, int off, byte dest[16])
0929 {
0930     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

0931     // word pointers. The ISA lacks loads for short* and char*.

0932     // Power9/ISA 3.0 provides vec_xl for all datatypes.

0933
0934     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
0935     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
0936     CRYPTOPP_UNUSED(addr);
0937
0938 #if defined(_ARCH_PWR9)
0939     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
0940 #else
0941     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
0942 #endif
0943 }
0944
0945 /// \brief Stores a vector to a word array

0946 /// \tparam T vector type

0947 /// \param data the vector

0948 /// \param dest the word array

0949 /// \details VecStore() stores a vector to a word array.

0950 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

0951 ///  The instruction does not require aligned effective memory addresses.

0952 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.

0953 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

0954 ///  are required to fix up unaligned memory addresses.

0955 /// \par Wraps

0956 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below

0957 /// \sa VecStore_ALTIVEC, VecStoreAligned

0958 /// \since Crypto++ 8.0

0959 template<class T>
0960 inline void VecStore(const T data, word32 dest[4])
0961 {
0962     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

0963     // word pointers. The ISA lacks stores for short* and char*.

0964     // Power9/ISA 3.0 provides vec_xst for all datatypes.

0965
0966     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
0967     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
0968     CRYPTOPP_UNUSED(addr);
0969
0970 #if defined(_ARCH_PWR9)
0971     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
0972 #elif defined(__VSX__) || defined(_ARCH_PWR8)
0973     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
0974 #else
0975     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
0976 #endif
0977 }
0978
0979 /// \brief Stores a vector to a word array

0980 /// \tparam T vector type

0981 /// \param data the vector

0982 /// \param off offset into the dest word array

0983 /// \param dest the word array

0984 /// \details VecStore() stores a vector to a word array.

0985 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

0986 ///  The instruction does not require aligned effective memory addresses.

0987 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.

0988 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

0989 ///  are required to fix up unaligned memory addresses.

0990 /// \par Wraps

0991 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below

0992 /// \sa VecStore_ALTIVEC, VecStoreAligned

0993 /// \since Crypto++ 8.0

0994 template<class T>
0995 inline void VecStore(const T data, int off, word32 dest[4])
0996 {
0997     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

0998     // word pointers. The ISA lacks stores for short* and char*.

0999     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1000
1001     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003     CRYPTOPP_UNUSED(addr);
1004
1005 #if defined(_ARCH_PWR9)
1006     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1008     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009 #else
1010     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1011 #endif
1012 }
1013
1014 /// \brief Stores a vector to a word array

1015 /// \tparam T vector type

1016 /// \param data the vector

1017 /// \param dest the word array

1018 /// \details VecStore() stores a vector to a word array.

1019 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

1020 ///  The instruction does not require aligned effective memory addresses.

1021 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.

1022 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

1023 ///  are required to fix up unaligned memory addresses.

1024 /// \details VecStore() with 64-bit elements is available on POWER8 and above.

1025 /// \par Wraps

1026 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below

1027 /// \sa VecStore_ALTIVEC, VecStoreAligned

1028 /// \since Crypto++ 8.0

1029 template<class T>
1030 inline void VecStore(const T data, word64 dest[2])
1031 {
1032     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1033     // word pointers. The ISA lacks stores for short* and char*.

1034     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1035
1036     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038     CRYPTOPP_UNUSED(addr);
1039
1040 #if defined(_ARCH_PWR9)
1041     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1043     // 32-bit cast is not a typo. Compiler workaround.

1044     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045 #else
1046     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1047 #endif
1048 }
1049
1050 /// \brief Stores a vector to a word array

1051 /// \tparam T vector type

1052 /// \param data the vector

1053 /// \param off offset into the dest word array

1054 /// \param dest the word array

1055 /// \details VecStore() stores a vector to a word array.

1056 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

1057 ///  The instruction does not require aligned effective memory addresses.

1058 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.

1059 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

1060 ///  are required to fix up unaligned memory addresses.

1061 /// \details VecStore() with 64-bit elements is available on POWER8 and above.

1062 /// \par Wraps

1063 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below

1064 /// \sa VecStore_ALTIVEC, VecStoreAligned

1065 /// \since Crypto++ 8.0

1066 template<class T>
1067 inline void VecStore(const T data, int off, word64 dest[2])
1068 {
1069     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1070     // word pointers. The ISA lacks stores for short* and char*.

1071     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1072
1073     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075     CRYPTOPP_UNUSED(addr);
1076
1077 #if defined(_ARCH_PWR9)
1078     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1080     // 32-bit cast is not a typo. Compiler workaround.

1081     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082 #else
1083     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1084 #endif
1085 }
1086
1087 /// \brief Stores a vector to a byte array

1088 /// \tparam T vector type

1089 /// \param data the vector

1090 /// \param dest the byte array

/// \details VecStoreAligned() stores a vector to an aligned byte array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.

1093 ///  <tt>vec_st</tt> is used if POWER9 is not available. The effective

1094 ///  address of <tt>dest</tt> must be 16-byte aligned for Altivec.

1095 /// \par Wraps

1096 ///  vec_xst on POWER9 or above, vec_st on POWER8 and below

1097 /// \sa VecStore_ALTIVEC, VecStore

1098 /// \since Crypto++ 8.0

1099 template<class T>
1100 inline void VecStoreAligned(const T data, byte dest[16])
1101 {
1102     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

1103     // word pointers. The ISA lacks loads for short* and char*.

1104     // Power9/ISA 3.0 provides vec_xl for all datatypes.

1105
1106     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108     CRYPTOPP_UNUSED(addr);
1109
1110 #if defined(_ARCH_PWR9)
1111     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112 #else
1113     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114 #endif
1115 }
1116
1117 /// \brief Stores a vector to a byte array

1118 /// \tparam T vector type

1119 /// \param data the vector

1120 /// \param off offset into the dest byte array

1121 /// \param dest the byte array

/// \details VecStoreAligned() stores a vector to an aligned byte array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.

1124 ///  <tt>vec_st</tt> is used if POWER9 is not available. The effective

1125 ///  address of <tt>dest</tt> must be 16-byte aligned for Altivec.

1126 /// \par Wraps

1127 ///  vec_xst on POWER9 or above, vec_st on POWER8 and below

1128 /// \sa VecStore_ALTIVEC, VecStore

1129 /// \since Crypto++ 8.0

1130 template<class T>
1131 inline void VecStoreAligned(const T data, int off, byte dest[16])
1132 {
1133     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit

1134     // word pointers. The ISA lacks loads for short* and char*.

1135     // Power9/ISA 3.0 provides vec_xl for all datatypes.

1136
1137     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139     CRYPTOPP_UNUSED(addr);
1140
1141 #if defined(_ARCH_PWR9)
1142     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143 #else
1144     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145 #endif
1146 }
1147
1148 /// \brief Stores a vector to a word array

1149 /// \tparam T vector type

1150 /// \param data the vector

1151 /// \param dest the word array

/// \details VecStoreAligned() stores a vector to an aligned word array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.

1154 ///  POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>

1155 ///  is used if POWER7 is not available. The effective address of <tt>dest</tt>

1156 ///  must be 16-byte aligned for Altivec.

1157 /// \par Wraps

1158 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below

1159 /// \sa VecStore_ALTIVEC, VecStore

1160 /// \since Crypto++ 8.0

1161 template<class T>
1162 inline void VecStoreAligned(const T data, word32 dest[4])
1163 {
1164     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1165     // word pointers. The ISA lacks stores for short* and char*.

1166     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1167
1168     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170     CRYPTOPP_UNUSED(addr);
1171
1172 #if defined(_ARCH_PWR9)
1173     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1175     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176 #else
1177     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178 #endif
1179 }
1180
1181 /// \brief Stores a vector to a word array

1182 /// \tparam T vector type

1183 /// \param data the vector

1184 /// \param off offset into the dest word array

1185 /// \param dest the word array

/// \details VecStoreAligned() stores a vector to an aligned word array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.

1188 ///  POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>

1189 ///  is used if POWER7 is not available. The effective address of <tt>dest</tt>

1190 ///  must be 16-byte aligned for Altivec.

1191 /// \par Wraps

1192 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below

1193 /// \sa VecStore_ALTIVEC, VecStore

1194 /// \since Crypto++ 8.0

1195 template<class T>
1196 inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197 {
1198     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1199     // word pointers. The ISA lacks stores for short* and char*.

1200     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1201
1202     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204     CRYPTOPP_UNUSED(addr);
1205
1206 #if defined(_ARCH_PWR9)
1207     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1209     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210 #else
1211     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212 #endif
1213 }
1214
1215 /// \brief Stores a vector to a byte array

1216 /// \tparam T vector type

1217 /// \param data the vector

1218 /// \param dest the byte array

1219 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE

1220 ///  will reverse all bytes in the array on a little endian system.

1221 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

1222 ///  The instruction does not require aligned effective memory addresses.

1223 ///  VecStore_ALTIVEC() is used if POWER7 is not available.

1224 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

1225 ///  are required to fix up unaligned memory addresses.

1226 /// \par Wraps

1227 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below

1228 /// \sa VecStore_ALTIVEC, VecStoreAligned

1229 /// \since Crypto++ 6.0

1230 template <class T>
1231 inline void VecStoreBE(const T data, byte dest[16])
1232 {
1233     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1234     // word pointers. The ISA lacks stores for short* and char*.

1235     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1236
1237     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239     CRYPTOPP_UNUSED(addr);
1240
1241 #if defined(_ARCH_PWR9)
1242     vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243 #elif defined(CRYPTOPP_BIG_ENDIAN)
1244     VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245 #else
1246     VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1247 #endif
1248 }
1249
1250 /// \brief Stores a vector to a byte array

1251 /// \tparam T vector type

1252 /// \param data the vector

1253 /// \param off offset into the dest byte array

1254 /// \param dest the byte array

1255 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE

1256 ///  will reverse all bytes in the array on a little endian system.

1257 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

1258 ///  The instruction does not require aligned effective memory addresses.

1259 ///  VecStore_ALTIVEC() is used if POWER7 is not available.

1260 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

1261 ///  are required to fix up unaligned memory addresses.

1262 /// \par Wraps

1263 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below

1264 /// \sa VecStore_ALTIVEC, VecStoreAligned

1265 /// \since Crypto++ 6.0

1266 template <class T>
1267 inline void VecStoreBE(const T data, int off, byte dest[16])
1268 {
1269     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1270     // word pointers. The ISA lacks stores for short* and char*.

1271     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1272
1273     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275     CRYPTOPP_UNUSED(addr);
1276
1277 #if defined(_ARCH_PWR9)
1278     vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279 #elif defined(CRYPTOPP_BIG_ENDIAN)
1280     VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281 #else
1282     VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1283 #endif
1284 }
1285
1286 /// \brief Stores a vector to a word array

1287 /// \tparam T vector type

1288 /// \param data the vector

1289 /// \param dest the word array

1290 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE

1291 ///  will reverse all bytes in the array on a little endian system.

1292 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

1293 ///  The instruction does not require aligned effective memory addresses.

1294 ///  VecStore_ALTIVEC() is used if POWER7 is not available.

1295 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

1296 ///  are required to fix up unaligned memory addresses.

1297 /// \par Wraps

1298 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below

1299 /// \sa VecStore_ALTIVEC, VecStoreAligned

1300 /// \since Crypto++ 8.0

1301 template <class T>
1302 inline void VecStoreBE(const T data, word32 dest[4])
1303 {
1304     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1305     // word pointers. The ISA lacks stores for short* and char*.

1306     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1307
1308     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310     CRYPTOPP_UNUSED(addr);
1311
1312 #if defined(_ARCH_PWR9)
1313     vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314 #elif defined(CRYPTOPP_BIG_ENDIAN)
1315     VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316 #else
1317     VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1318 #endif
1319 }
1320
1321 /// \brief Stores a vector to a word array

1322 /// \tparam T vector type

1323 /// \param data the vector

1324 /// \param off offset into the dest word array

1325 /// \param dest the word array

1326 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE

///  will reverse all bytes in the array on a little endian system.

1328 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.

1329 ///  The instruction does not require aligned effective memory addresses.

1330 ///  VecStore_ALTIVEC() is used if POWER7 is not available.

1331 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions

1332 ///  are required to fix up unaligned memory addresses.

1333 /// \par Wraps

1334 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below

1335 /// \sa VecStore_ALTIVEC, VecStoreAligned

1336 /// \since Crypto++ 8.0

1337 template <class T>
1338 inline void VecStoreBE(const T data, int off, word32 dest[4])
1339 {
1340     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit

1341     // word pointers. The ISA lacks stores for short* and char*.

1342     // Power9/ISA 3.0 provides vec_xst for all datatypes.

1343
1344     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346     CRYPTOPP_UNUSED(addr);
1347
1348 #if defined(_ARCH_PWR9)
1349     vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350 #elif defined(CRYPTOPP_BIG_ENDIAN)
1351     VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352 #else
1353     VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1354 #endif
1355 }
1356
1357 //@}

1358
1359 /// \name LOGICAL OPERATIONS

1360 //@{

1361
1362 /// \brief AND two vectors

1363 /// \tparam T1 vector type

1364 /// \tparam T2 vector type

1365 /// \param vec1 the first vector

1366 /// \param vec2 the second vector

1367 /// \return vector

1368 /// \details VecAnd() performs <tt>vec1 & vec2</tt>.

1369 ///  vec2 is cast to the same type as vec1. The return vector

1370 ///  is the same type as vec1.

1371 /// \par Wraps

1372 ///  vec_and

1373 /// \sa VecAnd64

1374 /// \since Crypto++ 6.0

template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's vector type,
    // then AND the two.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
1380
1381 /// \brief OR two vectors

1382 /// \tparam T1 vector type

1383 /// \tparam T2 vector type

1384 /// \param vec1 the first vector

1385 /// \param vec2 the second vector

1386 /// \return vector

1387 /// \details VecOr() performs <tt>vec1 | vec2</tt>.

1388 ///  vec2 is cast to the same type as vec1. The return vector

1389 ///  is the same type as vec1.

1390 /// \par Wraps

1391 ///  vec_or

1392 /// \sa VecOr64

1393 /// \since Crypto++ 6.0

template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's vector type,
    // then OR the two.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
1399
1400 /// \brief XOR two vectors

1401 /// \tparam T1 vector type

1402 /// \tparam T2 vector type

1403 /// \param vec1 the first vector

1404 /// \param vec2 the second vector

1405 /// \return vector

1406 /// \details VecXor() performs <tt>vec1 ^ vec2</tt>.

1407 ///  vec2 is cast to the same type as vec1. The return vector

1408 ///  is the same type as vec1.

1409 /// \par Wraps

1410 ///  vec_xor

1411 /// \sa VecXor64

1412 /// \since Crypto++ 6.0

template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's vector type,
    // then XOR the two.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
1418
1419 //@}

1420
1421 /// \name ARITHMETIC OPERATIONS

1422 //@{

1423
/// \brief Element-wise addition of two vectors
/// \tparam T1 vector type of the first operand and of the result
/// \tparam T2 vector type of the second operand
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd() computes <tt>vec1 + vec2</tt>. vec2 is cast to T1
///  before the operation, and the result is returned as T1.
/// \par Wraps
///  vec_add
/// \sa VecAdd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Give both operands the same vector type before calling the built-in.
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
1442
/// \brief Element-wise subtraction of two vectors
/// \tparam T1 vector type of the first operand and of the result
/// \tparam T2 vector type of the second operand
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub() computes <tt>vec1 - vec2</tt>. vec2 is cast to T1
///  before the operation, and the result is returned as T1.
/// \par Wraps
///  vec_sub
/// \sa VecSub64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Give both operands the same vector type before calling the built-in.
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
1460
1461 //@}

1462
1463 /// \name PERMUTE OPERATIONS

1464 //@{

1465
1466 /// \brief Permutes a vector

1467 /// \tparam T1 vector type

1468 /// \tparam T2 vector type

1469 /// \param vec the vector

1470 /// \param mask vector mask

1471 /// \return vector

1472 /// \details VecPermute() creates a new vector from vec according to mask.

1473 ///  mask is an uint8x16_p vector. The return vector is the same type as vec.

1474 /// \par Wraps

1475 ///  vec_perm

1476 /// \since Crypto++ 6.0

1477 template <class T1, class T2>
1478 inline T1 VecPermute(const T1 vec, const T2 mask)
1479 {
1480     return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481 }
1482
1483 /// \brief Permutes two vectors

1484 /// \tparam T1 vector type

1485 /// \tparam T2 vector type

1486 /// \param vec1 the first vector

1487 /// \param vec2 the second vector

1488 /// \param mask vector mask

1489 /// \return vector

1490 /// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.

1491 ///  mask is an uint8x16_p vector. The return vector is the same type as vec.

1492 /// \par Wraps

1493 ///  vec_perm

1494 /// \since Crypto++ 6.0

1495 template <class T1, class T2>
1496 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497 {
1498     return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499 }
1500
1501 //@}

1502
1503 /// \name SHIFT AND ROTATE OPERATIONS

1504 //@{

1505
1506 /// \brief Shift a vector left

1507 /// \tparam C shift byte count

1508 /// \tparam T vector type

1509 /// \param vec the vector

1510 /// \return vector

1511 /// \details VecShiftLeftOctet() returns a new vector after shifting the

1512 ///  concatenation of the zero vector and the source vector by the specified

1513 ///  number of bytes. The return vector is the same type as vec.

1514 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,

1515 ///  c)</tt>. On little endian machines VecShiftLeftOctet() is translated to

1516 ///  <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as

1517 ///  if on a big endian machine as shown below.

1518 /// <pre>

1519 ///   uint8x16_p x = VecLoad(ptr);

1520 ///   uint8x16_p y = VecShiftLeftOctet<12>(x);

1521 /// </pre>

1522 /// \par Wraps

1523 ///  vec_sld

1524 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld

1525 ///  endian sensitive?</A> on Stack Overflow

1526 /// \since Crypto++ 6.0

1527 template <unsigned int C, class T>
1528 inline T VecShiftLeftOctet(const T vec)
1529 {
1530     const T zero = {0};
1531     if (C >= 16)
1532     {
1533         // Out of range

1534         return zero;
1535     }
1536     else if (C == 0)
1537     {
1538         // Noop

1539         return vec;
1540     }
1541     else
1542     {
1543 #if defined(CRYPTOPP_BIG_ENDIAN)
1544     enum { R=C&0xf };
1545     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546 #else
1547     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds

1548     return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549 #endif
1550     }
1551 }
1552
1553 /// \brief Shift a vector right

1554 /// \tparam C shift byte count

1555 /// \tparam T vector type

1556 /// \param vec the vector

1557 /// \return vector

1558 /// \details VecShiftRightOctet() returns a new vector after shifting the

1559 ///  concatenation of the zero vector and the source vector by the specified

1560 ///  number of bytes. The return vector is the same type as vec.

1561 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,

1562 ///  c)</tt>. On little endian machines VecShiftRightOctet() is translated to

1563 ///  <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as

1564 ///  if on a big endian machine as shown below.

1565 /// <pre>

1566 ///   uint8x16_p x = VecLoad(ptr);

1567 ///   uint8x16_p y = VecShiftRightOctet<12>(y);

1568 /// </pre>

1569 /// \par Wraps

1570 ///  vec_sld

1571 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld

1572 ///  endian sensitive?</A> on Stack Overflow

1573 /// \since Crypto++ 6.0

1574 template <unsigned int C, class T>
1575 inline T VecShiftRightOctet(const T vec)
1576 {
1577     const T zero = {0};
1578     if (C >= 16)
1579     {
1580         // Out of range

1581         return zero;
1582     }
1583     else if (C == 0)
1584     {
1585         // Noop

1586         return vec;
1587     }
1588     else
1589     {
1590 #if defined(CRYPTOPP_BIG_ENDIAN)
1591     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds

1592     return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593 #else
1594     enum { R=C&0xf };
1595     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596 #endif
1597     }
1598 }
1599
1600 /// \brief Rotate a vector left

1601 /// \tparam C shift byte count

1602 /// \tparam T vector type

1603 /// \param vec the vector

1604 /// \return vector

1605 /// \details VecRotateLeftOctet() returns a new vector after rotating the

1606 ///  concatenation of the source vector with itself by the specified

1607 ///  number of bytes. The return vector is the same type as vec.

1608 /// \par Wraps

1609 ///  vec_sld

1610 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld

1611 ///  endian sensitive?</A> on Stack Overflow

1612 /// \since Crypto++ 6.0

1613 template <unsigned int C, class T>
1614 inline T VecRotateLeftOctet(const T vec)
1615 {
1616 #if defined(CRYPTOPP_BIG_ENDIAN)
1617     enum { R = C&0xf };
1618     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619 #else
1620     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds

1621     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622 #endif
1623 }
1624
1625 /// \brief Rotate a vector right

1626 /// \tparam C shift byte count

1627 /// \tparam T vector type

1628 /// \param vec the vector

1629 /// \return vector

1630 /// \details VecRotateRightOctet() returns a new vector after rotating the

1631 ///  concatenation of the source vector with itself by the specified

1632 ///  number of bytes. The return vector is the same type as vec.

1633 /// \par Wraps

1634 ///  vec_sld

1635 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld

1636 ///  endian sensitive?</A> on Stack Overflow

1637 /// \since Crypto++ 6.0

1638 template <unsigned int C, class T>
1639 inline T VecRotateRightOctet(const T vec)
1640 {
1641 #if defined(CRYPTOPP_BIG_ENDIAN)
1642     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds

1643     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644 #else
1645     enum { R = C&0xf };
1646     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647 #endif
1648 }
1649
1650 /// \brief Rotate a vector left

1651 /// \tparam C rotate bit count

1652 /// \param vec the vector

1653 /// \return vector

1654 /// \details VecRotateLeft() rotates each element in a vector by

1655 ///  bit count. The return vector is the same type as vec.

1656 /// \par Wraps

1657 ///  vec_rl

1658 /// \since Crypto++ 7.0

1659 template<unsigned int C>
1660 inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
1661 {
1662     const uint32x4_p m = {C, C, C, C};
1663     return vec_rl(vec, m);
1664 }
1665
1666 /// \brief Rotate a vector right

1667 /// \tparam C rotate bit count

1668 /// \param vec the vector

1669 /// \return vector

1670 /// \details VecRotateRight() rotates each element in a vector

1671 ///  by bit count. The return vector is the same type as vec.

1672 /// \par Wraps

1673 ///  vec_rl

1674 /// \since Crypto++ 7.0

1675 template<unsigned int C>
1676 inline uint32x4_p VecRotateRight(const uint32x4_p vec)
1677 {
1678     const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679     return vec_rl(vec, m);
1680 }
1681
1682 /// \brief Shift a vector left

1683 /// \tparam C shift bit count

1684 /// \param vec the vector

1685 /// \return vector

1686 /// \details VecShiftLeft() rotates each element in a vector

1687 ///  by bit count. The return vector is the same type as vec.

1688 /// \par Wraps

1689 ///  vec_sl

1690 /// \since Crypto++ 8.1

1691 template<unsigned int C>
1692 inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
1693 {
1694     const uint32x4_p m = {C, C, C, C};
1695     return vec_sl(vec, m);
1696 }
1697
1698 /// \brief Shift a vector right

1699 /// \tparam C shift bit count

1700 /// \param vec the vector

1701 /// \return vector

1702 /// \details VecShiftRight() rotates each element in a vector

1703 ///  by bit count. The return vector is the same type as vec.

1704 /// \par Wraps

1705 ///  vec_rl

1706 /// \since Crypto++ 8.1

1707 template<unsigned int C>
1708 inline uint32x4_p VecShiftRight(const uint32x4_p vec)
1709 {
1710     const uint32x4_p m = {C, C, C, C};
1711     return vec_sr(vec, m);
1712 }
1713
1714 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8

1715 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716
1717 /// \brief Rotate a vector left

1718 /// \tparam C rotate bit count

1719 /// \param vec the vector

1720 /// \return vector

1721 /// \details VecRotateLeft() rotates each element in a vector

1722 ///  by bit count. The return vector is the same type as vec.

1723 /// \details VecRotateLeft() with 64-bit elements is available on

1724 ///  POWER8 and above.

1725 /// \par Wraps

1726 ///  vec_rl

1727 /// \since Crypto++ 8.0

1728 template<unsigned int C>
1729 inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
1730 {
1731     const uint64x2_p m = {C, C};
1732     return vec_rl(vec, m);
1733 }
1734
1735 /// \brief Shift a vector left

1736 /// \tparam C shift bit count

1737 /// \param vec the vector

1738 /// \return vector

1739 /// \details VecShiftLeft() rotates each element in a vector

1740 ///  by bit count. The return vector is the same type as vec.

1741 /// \details VecShiftLeft() with 64-bit elements is available on

1742 ///  POWER8 and above.

1743 /// \par Wraps

1744 ///  vec_sl

1745 /// \since Crypto++ 8.1

1746 template<unsigned int C>
1747 inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
1748 {
1749     const uint64x2_p m = {C, C};
1750     return vec_sl(vec, m);
1751 }
1752
1753 /// \brief Rotate a vector right

1754 /// \tparam C rotate bit count

1755 /// \param vec the vector

1756 /// \return vector

1757 /// \details VecRotateRight() rotates each element in a vector

1758 ///  by bit count. The return vector is the same type as vec.

1759 /// \details VecRotateRight() with 64-bit elements is available on

1760 ///  POWER8 and above.

1761 /// \par Wraps

1762 ///  vec_rl

1763 /// \since Crypto++ 8.0

1764 template<unsigned int C>
1765 inline uint64x2_p VecRotateRight(const uint64x2_p vec)
1766 {
1767     const uint64x2_p m = {64-C, 64-C};
1768     return vec_rl(vec, m);
1769 }
1770
1771 /// \brief Shift a vector right

1772 /// \tparam C shift bit count

1773 /// \param vec the vector

1774 /// \return vector

1775 /// \details VecShiftRight() rotates each element in a vector

1776 ///  by bit count. The return vector is the same type as vec.

1777 /// \details VecShiftRight() with 64-bit elements is available on

1778 ///  POWER8 and above.

1779 /// \par Wraps

1780 ///  vec_sr

1781 /// \since Crypto++ 8.1

1782 template<unsigned int C>
1783 inline uint64x2_p VecShiftRight(const uint64x2_p vec)
1784 {
1785     const uint64x2_p m = {C, C};
1786     return vec_sr(vec, m);
1787 }
1788
1789 #endif  // ARCH_PWR8

1790
1791 //@}

1792
1793 /// \name OTHER OPERATIONS

1794 //@{

1795
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeLow() forwards both operands to <tt>vec_mergel</tt>
///  and returns its result.
/// \par Wraps
///  vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    const T merged = vec_mergel(vec1, vec2);
    return merged;
}
1809
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeHigh() forwards both operands to <tt>vec_mergeh</tt>
///  and returns its result.
/// \par Wraps
///  vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    const T merged = vec_mergeh(vec1, vec2);
    return merged;
}
1823
1824 /// \brief Broadcast 32-bit word to a vector

1825 /// \param val the 32-bit value

1826 /// \return vector

1827 /// \par Wraps

1828 ///  vec_splats

1829 /// \since Crypto++ 8.3

1830 inline uint32x4_p VecSplatWord(word32 val)
1831 {
1832     // Fix spurious GCC warning???

1833     CRYPTOPP_UNUSED(val);
1834
1835     // Apple Altivec and XL C++ do not offer vec_splats.

1836     // GCC offers vec_splats back to -mcpu=power4.

1837 #if defined(_ARCH_PWR4) && defined(__GNUC__)
1838     return vec_splats(val);
1839 #else
1840     //const word32 x[4] = {val,val,val,val};

1841     //return VecLoad(x);

1842     const word32 x[4] = {val};
1843     return vec_splat(VecLoad(x),0);
1844 #endif
1845 }
1846
1847 /// \brief Broadcast 32-bit element to a vector

1848 /// \tparam the element number

1849 /// \param val the 32-bit value

1850 /// \return vector

1851 /// \par Wraps

1852 ///  vec_splat

1853 /// \since Crypto++ 8.3

1854 template <unsigned int N>
1855 inline uint32x4_p VecSplatElement(const uint32x4_p val)
1856 {
1857     return vec_splat(val, N);
1858 }
1859
1860 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1861 /// \brief Broadcast 64-bit double word to a vector

1862 /// \param val the 64-bit value

1863 /// \return vector

1864 /// \par Wraps

1865 ///  vec_splats

1866 /// \since Crypto++ 8.3

1867 inline uint64x2_p VecSplatWord(word64 val)
1868 {
1869     // The PPC64 ABI says so.

1870     return vec_splats((unsigned long long)val);
1871 }
1872
1873 /// \brief Broadcast 64-bit element to a vector

1874 /// \tparam the element number

1875 /// \param val the 64-bit value

1876 /// \return vector

1877 /// \par Wraps

1878 ///  vec_splat

1879 /// \since Crypto++ 8.3

1880 template <unsigned int N>
1881 inline uint64x2_p VecSplatElement(const uint64x2_p val)
1882 {
1883 #if defined(__VSX__) || defined(_ARCH_PWR8)
1884     return vec_splat(val, N);
1885 #else
1886     enum {E=N&1};
1887     if (E == 0)
1888     {
1889         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890         return vec_perm(val, val, m);
1891     }
1892     else // (E == 1)

1893     {
1894         const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895         return vec_perm(val, val, m);
1896     }
1897 #endif
1898 }
1899 #endif
1900
1901 /// \brief Extract a dword from a vector

1902 /// \tparam T vector type

1903 /// \param val the vector

1904 /// \return vector created from low dword

1905 /// \details VecGetLow() extracts the low dword from a vector. The low dword

1906 ///  is composed of the least significant bits and occupies bytes 8 through 15

1907 ///  when viewed as a big endian array. The return vector is the same type as

1908 ///  the original vector and padded with 0's in the most significant bit positions.

1909 /// \par Wraps

1910 ///  vec_sld

1911 /// \since Crypto++ 7.0

1912 template <class T>
1913 inline T VecGetLow(const T val)
1914 {
1915 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916     const T zero = {0};
1917     return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918 #else
1919     return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920 #endif
1921 }
1922
1923 /// \brief Extract a dword from a vector

1924 /// \tparam T vector type

1925 /// \param val the vector

1926 /// \return vector created from high dword

1927 /// \details VecGetHigh() extracts the high dword from a vector. The high dword

1928 ///  is composed of the most significant bits and occupies bytes 0 through 7

1929 ///  when viewed as a big endian array. The return vector is the same type as

1930 ///  the original vector and padded with 0's in the most significant bit positions.

1931 /// \par Wraps

1932 ///  vec_sld

1933 /// \since Crypto++ 7.0

1934 template <class T>
1935 inline T VecGetHigh(const T val)
1936 {
1937 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938     const T zero = {0};
1939     return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940 #else
1941     return VecShiftRightOctet<8>(val);
1942 #endif
1943 }
1944
1945 /// \brief Exchange high and low double words

1946 /// \tparam T vector type

1947 /// \param vec the vector

1948 /// \return vector

1949 /// \par Wraps

1950 ///  vec_sld

1951 /// \since Crypto++ 7.0

1952 template <class T>
1953 inline T VecSwapWords(const T vec)
1954 {
1955     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956 }
1957
1958 //@}

1959
1960 /// \name COMPARISON

1961 //@{

1962
1963 /// \brief Compare two vectors

1964 /// \tparam T1 vector type

1965 /// \tparam T2 vector type

1966 /// \param vec1 the first vector

1967 /// \param vec2 the second vector

1968 /// \return true if vec1 equals vec2, false otherwise

1969 /// \details VecEqual() performs a bitwise compare. The vector element types do

1970 ///  not matter.

1971 /// \par Wraps

1972 ///  vec_all_eq

1973 /// \since Crypto++ 8.0

1974 template <class T1, class T2>
1975 inline bool VecEqual(const T1 vec1, const T2 vec2)
1976 {
1977     return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978 }
1979
1980 /// \brief Compare two vectors

1981 /// \tparam T1 vector type

1982 /// \tparam T2 vector type

1983 /// \param vec1 the first vector

1984 /// \param vec2 the second vector

1985 /// \return true if vec1 does not equal vec2, false otherwise

1986 /// \details VecNotEqual() performs a bitwise compare. The vector element types do

1987 ///  not matter.

1988 /// \par Wraps

1989 ///  vec_all_eq

1990 /// \since Crypto++ 8.0

1991 template <class T1, class T2>
1992 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993 {
1994     return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995 }
1996
1997 //@}

1998
1999 ////////////////// 32-bit Altivec /////////////////

2000
2001 /// \name 32-BIT ALTIVEC

2002 //@{

2003
2004 /// \brief Add two vectors as if uint64x2_p

2005 /// \param vec1 the first vector

2006 /// \param vec2 the second vector

2007 /// \return vector

2008 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as

2009 ///  if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages

2010 ///  the carries from the elements.

2011 /// \par Wraps

2012 ///  vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec

2013 /// \since Crypto++ 8.3

2014 inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2015 {
2016     // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8

2017 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2018     return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2019 #else
2020     // The carry mask selects carrys for elements 1 and 3 and sets

2021     // remaining elements to 0. The results is then shifted so the

2022     // carried values are added to elements 0 and 2.

2023 #if defined(CRYPTOPP_BIG_ENDIAN)
2024     const uint32x4_p zero = {0, 0, 0, 0};
2025     const uint32x4_p mask = {0, 1, 0, 1};
2026 #else
2027     const uint32x4_p zero = {0, 0, 0, 0};
2028     const uint32x4_p mask = {1, 0, 1, 0};
2029 #endif
2030
2031     uint32x4_p cy = vec_addc(vec1, vec2);
2032     uint32x4_p res = vec_add(vec1, vec2);
2033     cy = vec_and(mask, cy);
2034     cy = vec_sld (cy, zero, 4);
2035     return vec_add(res, cy);
2036 #endif
2037 }
2038
2039 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2040 /// \brief Add two vectors as if uint64x2_p

2041 /// \param vec1 the first vector

2042 /// \param vec2 the second vector

2043 /// \return vector

2044 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as

2045 ///  if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages

2046 ///  the carries from the elements.

2047 /// \par Wraps

2048 ///  vec_add for POWER8

2049 /// \since Crypto++ 8.3

2050 inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2051 {
2052     // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8

2053     const uint64x2_p res = vec_add(vec1, vec2);
2054
2055 #if defined(CRYPTOPP_DEBUG)
2056     // Test 32-bit add in debug builds while we are here.

2057     const uint32x4_p x = (uint32x4_p)vec1;
2058     const uint32x4_p y = (uint32x4_p)vec2;
2059     const uint32x4_p r = VecAdd64(x, y);
2060
2061     CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2062 #endif
2063
2064     return res;
2065 }
2066 #endif
2067
2068 /// \brief Subtract two vectors as if uint64x2_p

2069 /// \param vec1 the first vector

2070 /// \param vec2 the second vector

2071 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as

2072 ///  if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()

2073 ///  manages the borrows from the elements.

2074 /// \par Wraps

2075 ///  vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec

2076 /// \since Crypto++ 8.3

2077 inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2078 {
2079 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2080     // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8

2081     return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2082 #else
2083     // The borrow mask selects borrows for elements 1 and 3 and sets

2084     // remaining elements to 0. The results is then shifted so the

2085     // borrowed values are subtracted from elements 0 and 2.

2086 #if defined(CRYPTOPP_BIG_ENDIAN)
2087     const uint32x4_p zero = {0, 0, 0, 0};
2088     const uint32x4_p mask = {0, 1, 0, 1};
2089 #else
2090     const uint32x4_p zero = {0, 0, 0, 0};
2091     const uint32x4_p mask = {1, 0, 1, 0};
2092 #endif
2093
2094     // subc sets the complement of borrow, so we have to

2095     // un-complement it using andc.

2096     uint32x4_p bw = vec_subc(vec1, vec2);
2097     uint32x4_p res = vec_sub(vec1, vec2);
2098     bw = vec_andc(mask, bw);
2099     bw = vec_sld (bw, zero, 4);
2100     return vec_sub(res, bw);
2101 #endif
2102 }
2103
2104 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2105 /// \brief Subtract two vectors as if uint64x2_p

2106 /// \param vec1 the first vector

2107 /// \param vec2 the second vector

2108 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as

2109 ///  if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()

2110 ///  manages the borrows from the elements.

2111 /// \par Wraps

2112 ///  vec_sub for POWER8

2113 /// \since Crypto++ 8.3

2114 inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2115 {
2116     // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8

2117     const uint64x2_p res = vec_sub(vec1, vec2);
2118
2119 #if defined(CRYPTOPP_DEBUG)
2120     // Test 32-bit sub in debug builds while we are here.

2121     const uint32x4_p x = (uint32x4_p)vec1;
2122     const uint32x4_p y = (uint32x4_p)vec2;
2123     const uint32x4_p r = VecSub64(x, y);
2124
2125     CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2126 #endif
2127
2128     return res;
2129 }
2130 #endif
2131
2132 /// \brief Rotate a vector left as if uint64x2_p

2133 /// \tparam C rotate bit count

2134 /// \param vec the vector

2135 /// \return vector

2136 /// \details VecRotateLeft() rotates each element in a vector by bit count.

2137 ///  vec is rotated as if uint64x2_p.

2138 /// \par Wraps

2139 ///  vec_rl

2140 /// \since Crypto++ 8.3

2141 template<unsigned int C>
2142 inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
2143 {
2144 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8

2146     return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147 #else
2148     // C=0, 32, or 64 needs special handling. That is S32 and S64 below.

2149     enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150
2151     // Get the low bits, shift them to high bits

2152     uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153     // Get the high bits, shift them to low bits

2154     uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155
2156     if (S64 == 0)
2157     {
2158         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159         return VecPermute(vec, m);
2160     }
2161     else if (S64 == 32)
2162     {
2163         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164         return VecPermute(vec, m);
2165     }
2166     else if (BR)  // Big rotate amount?

2167     {
2168         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169         t1 = VecPermute(t1, m);
2170     }
2171     else
2172     {
2173         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174         t2 = VecPermute(t2, m);
2175     }
2176
2177     return vec_or(t1, t2);
2178 #endif
2179 }
2180
2181 /// \brief Rotate a vector left as if uint64x2_p

2182 /// \param vec the vector

2183 /// \return vector

2184 /// \details VecRotateLeft<8>() rotates each element in a vector

2185 ///  by 8-bits. vec is rotated as if uint64x2_p. This specialization

2186 ///  is used by algorithms like Speck128.

2187 /// \par Wraps

2188 ///  vec_rl

2189 /// \since Crypto++ 8.3

2190 template<>
2191 inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
2192 {
2193 #if (CRYPTOPP_BIG_ENDIAN)
2194     const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195     return VecPermute(vec, m);
2196 #else
2197     const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198     return VecPermute(vec, m);
2199 #endif
2200 }
2201
2202 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2203 /// \brief Rotate a vector left as if uint64x2_p

2204 /// \tparam C rotate bit count

2205 /// \param vec the vector

2206 /// \return vector

2207 /// \details VecRotateLeft64() rotates each element in a vector by

2208 ///  bit count. vec is rotated as if uint64x2_p.

2209 /// \par Wraps

2210 ///  vec_rl

2211 /// \since Crypto++ 8.3

2212 template<unsigned int C>
2213 inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
2214 {
2215     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8

2216     const uint64x2_p res = VecRotateLeft<C>(vec);
2217
2218 #if defined(CRYPTOPP_DEBUG)
2219     // Test 32-bit rotate in debug builds while we are here.

2220     const uint32x4_p x = (uint32x4_p)vec;
2221     const uint32x4_p r = VecRotateLeft64<C>(x);
2222
2223     CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224 #endif
2225
2226     return res;
2227 }
2228 #endif
2229
2230 /// \brief Rotate a vector right as if uint64x2_p

2231 /// \tparam C rotate bit count

2232 /// \param vec the vector

2233 /// \return vector

2234 /// \details VecRotateRight64() rotates each element in a vector by

2235 ///  bit count. vec is rotated as if uint64x2_p.

2236 /// \par Wraps

2237 ///  vec_rl

2238 /// \since Crypto++ 8.3

2239 template<unsigned int C>
2240 inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
2241 {
2242 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8

2244     return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245 #else
2246     // C=0, 32, or 64 needs special handling. That is S32 and S64 below.

2247     enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248
2249     // Get the low bits, shift them to high bits

2250     uint32x4_p t1 = VecShiftRight<S32>(vec);
2251     // Get the high bits, shift them to low bits

2252     uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253
2254     if (S64 == 0)
2255     {
2256         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257         return VecPermute(vec, m);
2258     }
2259     else if (S64 == 32)
2260     {
2261         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262         return VecPermute(vec, m);
2263     }
2264     else if (BR)  // Big rotate amount?

2265     {
2266         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267         t1 = VecPermute(t1, m);
2268     }
2269     else
2270     {
2271         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272         t2 = VecPermute(t2, m);
2273     }
2274
2275     return vec_or(t1, t2);
2276 #endif
2277 }
2278
2279 /// \brief Rotate a vector right as if uint64x2_p

2280 /// \param vec the vector

2281 /// \return vector

2282 /// \details VecRotateRight64<8>() rotates each element in a vector

2283 ///  by 8-bits. vec is rotated as if uint64x2_p. This specialization

2284 ///  is used by algorithms like Speck128.

2285 /// \details vec is rotated as if uint64x2_p.

2286 /// \par Wraps

2287 ///  vec_rl

2288 /// \since Crypto++ 8.3

2289 template<>
2290 inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
2291 {
2292 #if (CRYPTOPP_BIG_ENDIAN)
2293     const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294     return VecPermute(vec, m);
2295 #else
2296     const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297     return VecPermute(vec, m);
2298 #endif
2299 }
2300
2301 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2302 /// \brief Rotate a vector right as if uint64x2_p

2303 /// \tparam C rotate bit count

2304 /// \param vec the vector

2305 /// \return vector

2306 /// \details VecRotateRight64() rotates each element in a vector by

2307 ///  bit count. vec is rotated as if uint64x2_p.

2308 /// \par Wraps

2309 ///  vec_rl

2310 /// \since Crypto++ 8.3

2311 template<unsigned int C>
2312 inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
2313 {
2314     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8

2315     const uint64x2_p res = VecRotateRight<C>(vec);
2316
2317 #if defined(CRYPTOPP_DEBUG)
2318     // Test 32-bit rotate in debug builds while we are here.

2319     const uint32x4_p x = (uint32x4_p)vec;
2320     const uint32x4_p r = VecRotateRight64<C>(x);
2321
2322     CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2323 #endif
2324
2325     return res;
2326 }
2327 #endif
2328
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type of the first operand and of the result
/// \tparam T2 vector type of the second operand
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() computes <tt>vec1 & vec2</tt>. vec2 is cast to T1
///  before the operation, and the result is returned as T1.
/// \details VecAnd64() is a convenience function that performs the same
///  operation as VecAnd().
/// \par Wraps
///  vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    // Give both operands the same vector type before calling the built-in.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
2347
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() computes <tt>vec1 | vec2</tt>. vec2 is first cast
///  to T1, and the result is returned as T1.
/// \details VecOr64() is a convenience function. The body is a plain
///  vec_or and does nothing specific to 64-bit elements.
/// \par Wraps
///  vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    // Bring the right operand to the left operand's type before combining.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
2366
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() computes <tt>vec1 ^ vec2</tt>. vec2 is first cast
///  to T1, and the result is returned as T1.
/// \details VecXor64() is a convenience function. The body is a plain
///  vec_xor and does nothing specific to 64-bit elements.
/// \par Wraps
///  vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    // Bring the right operand to the left operand's type before combining.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
2385
2386 /// \brief Broadcast 64-bit double word to a vector

2387 /// \param val the 64-bit value

2388 /// \return vector

2389 /// \par Wraps

2390 ///  vec_splats

2391 /// \since Crypto++ 8.3

2392 inline uint32x4_p VecSplatWord64(word64 val)
2393 {
2394 #if defined(_ARCH_PWR8)
2395     // The PPC64 ABI says so.

2396     return (uint32x4_p)vec_splats((unsigned long long)val);
2397 #else
2398     const word64 x[2] = {val,val};
2399     return (uint32x4_p)VecLoad((const word32*)x);
2400 #endif
2401 }
2402
2403 /// \brief Broadcast 64-bit element to a vector as if uint64x2_p

2404 /// \tparam the element number

2405 /// \param val the 64-bit value

2406 /// \return vector

2407 /// \par Wraps

2408 ///  vec_splat

2409 /// \since Crypto++ 8.3

2410 template <unsigned int N>
2411 inline uint32x4_p VecSplatElement64(const uint32x4_p val)
2412 {
2413 #if defined(__VSX__) || defined(_ARCH_PWR8)
2414     return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415 #else
2416     enum {E=N&1};
2417     if (E == 0)
2418     {
2419         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420         return (uint32x4_p)vec_perm(val, val, m);
2421     }
2422     else // (E == 1)

2423     {
2424         const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425         return (uint32x4_p)vec_perm(val, val, m);
2426     }
2427 #endif
2428 }
2429
2430 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2431 /// \brief Broadcast 64-bit element to a vector

2432 /// \tparam the element number

2433 /// \param val the 64-bit value

2434 /// \return vector

2435 /// \since Crypto++ 8.3

2436 template <unsigned int N>
2437 inline uint64x2_p VecSplatElement64(const uint64x2_p val)
2438 {
2439     return vec_splat(val, N);
2440 }
2441 #endif
2442
2443 //@}

2444
2445 //////////////////////// Power8 Crypto ////////////////////////

2446
2447 // __CRYPTO__ alone is not enough. Clang will define __CRYPTO__

2448 // when it is not available, like with Power7. Sigh...

2449 #if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450
2451 /// \name POLYNOMIAL MULTIPLICATION

2452 //@{

2453
2454 /// \brief Polynomial multiplication

2455 /// \param a the first term

2456 /// \param b the second term

2457 /// \return vector product

2458 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8

2459 ///  polynomial multiplication multiplies the high and low terms, and then

2460 ///  XOR's the high and low products. That is, the result is <tt>ah*bh XOR

2461 ///  al*bl</tt>. It is different behavior than Intel polynomial

2462 ///  multiplication. To obtain a single product without the XOR, then set

2463 ///  one of the high or low terms to 0. For example, setting <tt>ah=0</tt>

2464 ///  results in <tt>0*bh XOR al*bl = al*bl</tt>.

2465 /// \par Wraps

2466 ///  __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.

2467 /// \since Crypto++ 8.1

2468 inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
2469 {
2470 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471     return __vpmsumw (a, b);
2472 #elif defined(__clang__)
2473     return __builtin_altivec_crypto_vpmsumw (a, b);
2474 #else
2475     return __builtin_crypto_vpmsumw (a, b);
2476 #endif
2477 }
2478
2479 /// \brief Polynomial multiplication

2480 /// \param a the first term

2481 /// \param b the second term

2482 /// \return vector product

2483 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8

2484 ///  polynomial multiplication multiplies the high and low terms, and then

2485 ///  XOR's the high and low products. That is, the result is <tt>ah*bh XOR

2486 ///  al*bl</tt>. It is different behavior than Intel polynomial

2487 ///  multiplication. To obtain a single product without the XOR, then set

2488 ///  one of the high or low terms to 0. For example, setting <tt>ah=0</tt>

2489 ///  results in <tt>0*bh XOR al*bl = al*bl</tt>.

2490 /// \par Wraps

2491 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.

2492 /// \since Crypto++ 8.1

2493 inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
2494 {
2495 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496     return __vpmsumd (a, b);
2497 #elif defined(__clang__)
2498     return __builtin_altivec_crypto_vpmsumd (a, b);
2499 #else
2500     return __builtin_crypto_vpmsumd (a, b);
2501 #endif
2502 }
2503
2504 /// \brief Polynomial multiplication

2505 /// \param a the first term

2506 /// \param b the second term

2507 /// \return vector product

2508 /// \details VecIntelMultiply00() performs polynomial multiplication and presents

2509 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.

2510 ///  The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>

2511 ///  are multiplied.

2512 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit

2513 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.

2514 /// \par Wraps

2515 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.

2516 /// \since Crypto++ 8.0

2517 inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
2518 {
2519 #if defined(CRYPTOPP_BIG_ENDIAN)
2520     return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
2521 #else
2522     return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523 #endif
2524 }
2525
2526 /// \brief Polynomial multiplication

2527 /// \param a the first term

2528 /// \param b the second term

2529 /// \return vector product

2530 /// \details VecIntelMultiply01 performs() polynomial multiplication and presents

2531 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.

2532 ///  The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high

2533 ///  64-bits of <tt>b</tt> are multiplied.

2534 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit

2535 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.

2536 /// \par Wraps

2537 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.

2538 /// \since Crypto++ 8.0

2539 inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
2540 {
2541 #if defined(CRYPTOPP_BIG_ENDIAN)
2542     return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2543 #else
2544     return VecPolyMultiply(a, VecGetHigh(b));
2545 #endif
2546 }
2547
2548 /// \brief Polynomial multiplication

2549 /// \param a the first term

2550 /// \param b the second term

2551 /// \return vector product

2552 /// \details VecIntelMultiply10() performs polynomial multiplication and presents

2553 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.

2554 ///  The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low

2555 ///  64-bits of <tt>b</tt> are multiplied.

2556 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit

2557 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.

2558 /// \par Wraps

2559 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.

2560 /// \since Crypto++ 8.0

2561 inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
2562 {
2563 #if defined(CRYPTOPP_BIG_ENDIAN)
2564     return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2565 #else
2566     return VecPolyMultiply(VecGetHigh(a), b);
2567 #endif
2568 }
2569
2570 /// \brief Polynomial multiplication

2571 /// \param a the first term

2572 /// \param b the second term

2573 /// \return vector product

2574 /// \details VecIntelMultiply11() performs polynomial multiplication and presents

2575 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.

2576 ///  The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>

2577 ///  are multiplied.

2578 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit

2579 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.

2580 /// \par Wraps

2581 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.

2582 /// \since Crypto++ 8.0

2583 inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
2584 {
2585 #if defined(CRYPTOPP_BIG_ENDIAN)
2586     return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587 #else
2588     return VecPolyMultiply(VecGetLow(a), b);
2589 #endif
2590 }
2591
2592 //@}

2593
2594 /// \name AES ENCRYPTION

2595 //@{

2596
2597 /// \brief One round of AES encryption

2598 /// \tparam T1 vector type

2599 /// \tparam T2 vector type

2600 /// \param state the state vector

2601 /// \param key the subkey vector

2602 /// \details VecEncrypt() performs one round of AES encryption of state

2603 ///  using subkey key. The return vector is the same type as state.

2604 /// \details VecEncrypt() is available on POWER8 and above.

2605 /// \par Wraps

2606 ///  __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher

2607 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0

2608 template <class T1, class T2>
2609 inline T1 VecEncrypt(const T1 state, const T2 key)
2610 {
2611 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612     return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613 #elif defined(__clang__)
2614     return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615 #elif defined(__GNUC__)
2616     return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617 #else
2618     CRYPTOPP_ASSERT(0);
2619 #endif
2620 }
2621
2622 /// \brief Final round of AES encryption

2623 /// \tparam T1 vector type

2624 /// \tparam T2 vector type

2625 /// \param state the state vector

2626 /// \param key the subkey vector

2627 /// \details VecEncryptLast() performs the final round of AES encryption

2628 ///  of state using subkey key. The return vector is the same type as state.

2629 /// \details VecEncryptLast() is available on POWER8 and above.

2630 /// \par Wraps

2631 ///  __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast

2632 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0

2633 template <class T1, class T2>
2634 inline T1 VecEncryptLast(const T1 state, const T2 key)
2635 {
2636 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637     return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638 #elif defined(__clang__)
2639     return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640 #elif defined(__GNUC__)
2641     return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642 #else
2643     CRYPTOPP_ASSERT(0);
2644 #endif
2645 }
2646
2647 /// \brief One round of AES decryption

2648 /// \tparam T1 vector type

2649 /// \tparam T2 vector type

2650 /// \param state the state vector

2651 /// \param key the subkey vector

2652 /// \details VecDecrypt() performs one round of AES decryption of state

2653 ///  using subkey key. The return vector is the same type as state.

2654 /// \details VecDecrypt() is available on POWER8 and above.

2655 /// \par Wraps

2656 ///  __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher

2657 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0

2658 template <class T1, class T2>
2659 inline T1 VecDecrypt(const T1 state, const T2 key)
2660 {
2661 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662     return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663 #elif defined(__clang__)
2664     return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665 #elif defined(__GNUC__)
2666     return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667 #else
2668     CRYPTOPP_ASSERT(0);
2669 #endif
2670 }
2671
2672 /// \brief Final round of AES decryption

2673 /// \tparam T1 vector type

2674 /// \tparam T2 vector type

2675 /// \param state the state vector

2676 /// \param key the subkey vector

2677 /// \details VecDecryptLast() performs the final round of AES decryption

2678 ///  of state using subkey key. The return vector is the same type as state.

2679 /// \details VecDecryptLast() is available on POWER8 and above.

2680 /// \par Wraps

2681 ///  __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast

2682 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0

2683 template <class T1, class T2>
2684 inline T1 VecDecryptLast(const T1 state, const T2 key)
2685 {
2686 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687     return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688 #elif defined(__clang__)
2689     return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690 #elif defined(__GNUC__)
2691     return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692 #else
2693     CRYPTOPP_ASSERT(0);
2694 #endif
2695 }
2696
2697 //@}

2698
2699 /// \name SHA DIGESTS

2700 //@{

2701
2702 /// \brief SHA256 Sigma functions

2703 /// \tparam func function

2704 /// \tparam fmask function mask

2705 /// \tparam T vector type

2706 /// \param data the block to transform

2707 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on

2708 ///  func and fmask. The return vector is the same type as data.

2709 /// \details VecSHA256() is available on POWER8 and above.

2710 /// \par Wraps

2711 ///  __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw

2712 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0

2713 template <int func, int fmask, class T>
2714 inline T VecSHA256(const T data)
2715 {
2716 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717     return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718 #elif defined(__clang__)
2719     return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720 #elif defined(__GNUC__)
2721     return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722 #else
2723     CRYPTOPP_ASSERT(0);
2724 #endif
2725 }
2726
2727 /// \brief SHA512 Sigma functions

2728 /// \tparam func function

2729 /// \tparam fmask function mask

2730 /// \tparam T vector type

2731 /// \param data the block to transform

2732 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on

2733 ///  func and fmask. The return vector is the same type as data.

2734 /// \details VecSHA512() is available on POWER8 and above.

2735 /// \par Wraps

2736 ///  __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad

2737 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0

2738 template <int func, int fmask, class T>
2739 inline T VecSHA512(const T data)
2740 {
2741 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742     return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743 #elif defined(__clang__)
2744     return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745 #elif defined(__GNUC__)
2746     return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747 #else
2748     CRYPTOPP_ASSERT(0);
2749 #endif
2750 }
2751
2752 //@}

2753
2754 #endif  // __CRYPTO__

2755
2756 #endif  // _ALTIVEC_

2757
2758 NAMESPACE_END
2759
2760 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761 # pragma GCC diagnostic pop
2762 #endif
2763
2764 #endif  // CRYPTOPP_PPC_CRYPTO_H