// ppc_simd.h - written and placed in public domain by Jeffrey Walton

/// \file ppc_simd.h
/// \brief Support functions for PowerPC and vector operations
/// \details This header provides an agnostic interface into Clang, GCC
/// and IBM XL C/C++ compilers modulo their different built-in functions
/// for accessing vector instructions.
/// \details The abstractions are necessary to support back to GCC 4.8 and
/// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
/// default compiler for GCC112, GCC119 and others on the compile farm.
/// Older IBM XL C/C++ compilers also have the need due to lack of
/// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
/// compilers provide best support and don't need many of the hacks
/// below.
/// \details The library is tested with the following PowerPC machines and
/// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
/// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
/// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
/// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
/// - GCC110, Linux, POWER7, GCC 4.8.5
/// - GCC110, Linux, POWER7, XLC 12.01
/// - GCC111, AIX, POWER7, GCC 4.8.1
/// - GCC111, AIX, POWER7, XLC 12.01
/// - GCC112, Linux, POWER8, GCC 4.8.5
/// - GCC112, Linux, POWER8, XLC 13.01
/// - GCC112, Linux, POWER8, Clang 7.0
/// - GCC119, AIX, POWER8, GCC 7.2.0
/// - GCC119, AIX, POWER8, XLC 13.01
/// - GCC135, Linux, POWER9, GCC 7.0
/// \details 12 machines are used for testing because the three compilers form
/// the six profiles listed below.
/// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
/// - XLC 13.0 and earlier (all IBM components)
/// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
/// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
/// - early LLVM Clang (traditional Clang compiler)
/// - late LLVM Clang (traditional Clang compiler)
/// \details The LLVM front-end makes it tricky to write portable code because
/// LLVM pretends to be other compilers but cannot consume other compilers'
/// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
/// be GCC, Clang and XLC all at once but it can only consume its own variety
/// of builtins.
/// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
/// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
/// changed to <tt>VecAnd</tt>. The name change helped consolidate two
/// slightly different implementations.
/// \details At Crypto++ 8.3 the library added select 64-bit functions for
/// 32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
/// take 32-bit vectors and add or subtract them as if they were vectors
/// with two 64-bit elements. The functions dramatically improve performance
/// for some algorithms on some platforms, like SIMON128 and SPECK128 on
/// Power6 and earlier. For example, SPECK128 improved from 70 cpb to
/// 10 cpb on an old PowerMac. Use the functions as shown below.
/// <pre>
/// \#if defined(_ARCH_PWR8)
/// \#  define speck128_t uint64x2_p
/// \#else
/// \#  define speck128_t uint32x4_p
/// \#endif
///
/// speck128_t rk, x1, x2, y1, y2;
/// rk = (speck128_t)VecLoadAligned(ptr);
/// x1 = VecRotateRight64<8>(x1);
/// x1 = VecAdd64(x1, y1);
/// ...</pre>
/// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0

// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
// actual availability of the feature for the source file being compiled.
// The preprocessor macros depend on compiler options like -maltivec,
// and not compiler versions.

// For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
// For XLC see the Compiler Reference manual. For Clang you have to experiment.
// Clang does not document the compiler options, does not reject options it does
// not understand, and pretends to be other compilers even though it cannot
// process the builtins and intrinsics. Clang will waste hours of your time.

// DO NOT USE this pattern in VecLoad and VecStore. We have to use the
// code paths guarded by preprocessor macros because XLC 12 generates
// bad code in some places. To verify the bad code generation, test on
// GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
//
// inline uint32x4_p VecLoad(const byte src[16])
// {
// #if defined(__VSX__) || defined(_ARCH_PWR8)
//     return (uint32x4_p) *(uint8x16_p*)((byte*)src);
// #else
//     return VecLoad_ALTIVEC(src);
// #endif
// }

// We should be able to perform the load using inline asm on Power7 with
// VSX or Power8. The inline asm will avoid C undefined behavior due to
// casting from byte* to word32*. We are safe because our byte* are
// 16-byte aligned for Altivec. Below is the big endian load. Little
// endian would need to follow with xxpermdi for the reversal.
//
// __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );

// GCC and XLC use integer math for the address (D-form or byte-offset
// in the ISA manual). LLVM uses pointer math for the address (DS-form
// or indexed in the ISA manual). To keep them consistent we calculate
// the address from the offset and pass to a load or store function
// using a 0 offset.
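
// A minimal sketch of that convention (hypothetical names, not library
// code): the offset overloads below compute the effective address up
// front and then issue the load with a 0 offset.
//
//   const byte* src = ...;
//   const int off = 16;
//   const uintptr_t eff = reinterpret_cast<uintptr_t>(src)+off;
//   uint32x4_p v = (uint32x4_p)vec_ld(0, (unsigned char*)eff);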

#ifndef CRYPTOPP_PPC_CRYPTO_H
#define CRYPTOPP_PPC_CRYPTO_H

#include "config.h"
#include "misc.h"

#if defined(__ALTIVEC__)
# include <altivec.h>
# undef vector
# undef pixel
# undef bool
#endif

// XL C++ on AIX does not define VSX and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
// sync with the define in test_ppc_power7.cpp.
#ifndef CRYPTOPP_DISABLE_POWER7
# if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
#  define __VSX__ 1
# endif
#endif

// XL C++ on AIX does not define CRYPTO and does not
// provide an option to set it. We have to set it
// for the code below. This define must stay in
// sync with the define in test_ppc_power8.cpp.
#ifndef CRYPTOPP_DISABLE_POWER8
# if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
#  define __CRYPTO__ 1
# endif
#endif

/// \brief Cast array to vector pointer
/// \details CONST_V8_CAST casts a const array to a vector
/// pointer for a byte array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define CONST_V8_CAST(x)  ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V32_CAST casts a const array to a vector
/// pointer for a word array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define CONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V64_CAST casts a const array to a vector
/// pointer for a double word array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define CONST_V64_CAST(x) ((unsigned long long*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V8_CAST casts an array to a vector
/// pointer for a byte array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define NCONST_V8_CAST(x)  ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V32_CAST casts an array to a vector
/// pointer for a word array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define NCONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V64_CAST casts an array to a vector
/// pointer for a double word array. The Power ABI says source arrays
/// are non-const, so this define removes the const. XLC++ will
/// fail the compile if the source array is const.
#define NCONST_V64_CAST(x) ((unsigned long long*)(x))

// VecLoad_ALTIVEC and VecStore_ALTIVEC are
// too noisy on modern compilers
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated"
#endif

NAMESPACE_BEGIN(CryptoPP)

#if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Vector of 8-bit elements
/// \par Wraps
/// __vector unsigned char
/// \since Crypto++ 6.0
typedef __vector unsigned char uint8x16_p;
/// \brief Vector of 16-bit elements
/// \par Wraps
/// __vector unsigned short
/// \since Crypto++ 6.0
typedef __vector unsigned short uint16x8_p;
/// \brief Vector of 32-bit elements
/// \par Wraps
/// __vector unsigned int
/// \since Crypto++ 6.0
typedef __vector unsigned int uint32x4_p;

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Vector of 64-bit elements
/// \details uint64x2_p is available on POWER7 with VSX and above.
/// Most supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
/// and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
/// \par Wraps
/// __vector unsigned long long
/// \since Crypto++ 6.0
typedef __vector unsigned long long uint64x2_p;
#endif  // VSX or ARCH_PWR8

/// \brief The 0 vector
/// \return a 32-bit vector of 0's
/// \since Crypto++ 8.0
inline uint32x4_p VecZero()
{
    const uint32x4_p v = {0,0,0,0};
    return v;
}

/// \brief The 1 vector
/// \return a 32-bit vector of 1's
/// \since Crypto++ 8.0
inline uint32x4_p VecOne()
{
    const uint32x4_p v = {1,1,1,1};
    return v;
}

/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverse() reverses the bytes in a vector.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverse(const T data)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
    return (T)vec_perm(data, data, mask);
#endif
}

/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverseLE() reverses the bytes in a vector on
/// little-endian systems.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    return data;
#endif
}

/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverseBE() reverses the bytes in a vector on
/// big-endian systems.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    return data;
#endif
}
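
// A small usage sketch (assumed data, not library code). VecReverseLE()
// is a no-op on big-endian builds, so it can be called unconditionally
// to put a natively loaded vector into big-endian byte order.
//
//   uint32x4_p v = VecLoad(ptr);     // native byte order
//   uint32x4_p b = VecReverseLE(v);  // reversed on LE, unchanged on BE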

/// \name LOAD OPERATIONS
//@{

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
/// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
/// you should provide aligned memory addresses.
/// \par Wraps
/// vec_ld, vec_lvsl, vec_perm
/// \sa VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
{
    // Avoid IsAlignedOn for convenience.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    if (addr % 16 == 0)
    {
        return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
        const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
        const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
        return (uint32x4_p)vec_perm(low, high, perm);
    }
}

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details Loads a vector in native endian format from a byte array.
/// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
/// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
/// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
/// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
/// relatively expensive so you should provide aligned memory addresses.
/// \par Wraps
/// vec_ld, vec_lvsl, vec_perm
/// \sa VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
{
    // Avoid IsAlignedOn for convenience.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    if (addr % 16 == 0)
    {
        return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
        const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
        const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
        return (uint32x4_p)vec_perm(low, high, perm);
    }
}

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details VecLoad() loads a vector from a byte array.
/// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}
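
// Usage sketch for the offset overload that follows (hypothetical
// buffer, not library code). The offset is in bytes from src.
//
//   byte buf[32];  // two 16-byte blocks
//   uint32x4_p b0 = VecLoad(buf);
//   uint32x4_p b1 = VecLoad(16, buf);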

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoad() loads a vector from a byte array.
/// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoad(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a word array
/// \param src the word array
/// \details VecLoad() loads a vector from a word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint32x4_p VecLoad(const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a word array
/// \param src the word array
/// \param off offset into the word array
/// \details VecLoad() loads a vector from a word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint32x4_p VecLoad(int off, const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Loads a vector from a double word array
/// \param src the double word array
/// \details VecLoad() loads a vector from a double word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecLoad() with 64-bit elements is available on POWER7 with
/// VSX and above.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint64x2_p VecLoad(const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

/// \brief Loads a vector from a double word array
/// \param src the double word array
/// \param off offset into the double word array
/// \details VecLoad() loads a vector from a double word array.
/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecLoad() with 64-bit elements is available on POWER7 with
/// VSX and above.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoadAligned
/// \since Crypto++ 8.0
inline uint64x2_p VecLoad(int off, const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#endif
}

#endif  // VSX or ARCH_PWR8
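
// Guarded usage sketch (assumed data, not library code). The 64-bit
// overloads above only exist when VSX or POWER8 is available, so
// callers mirror the preprocessor guard.
//
//   #if defined(__VSX__) || defined(_ARCH_PWR8)
//   const word64 k[2] = {W64LIT(1), W64LIT(2)};
//   uint64x2_p x = VecLoad(k);
//   #endif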

/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \details VecLoadAligned() loads a vector from an aligned byte array.
/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on POWER9, vec_ld on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#else
    return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoadAligned() loads a vector from an aligned byte array.
/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on POWER9, vec_ld on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#else
    return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned word array
/// \param src the word array
/// \details VecLoadAligned() loads a vector from an aligned word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
#else
    return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned word array
/// \param src the word array
/// \param off offset into the src word array
/// \details VecLoadAligned() loads a vector from an aligned word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}
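
// Usage sketch (assumes the source really is 16-byte aligned, for
// example a CRYPTOPP_ALIGN_DATA(16) buffer; not library code).
//
//   CRYPTOPP_ALIGN_DATA(16) word32 w[4] = {1,2,3,4};
//   uint32x4_p v = VecLoadAligned(w);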

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Loads a vector from an aligned double word array
/// \param src the double word array
/// \details VecLoadAligned() loads a vector from an aligned double word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint64x2_p VecLoadAligned(const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
#else
    return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}

/// \brief Loads a vector from an aligned double word array
/// \param src the double word array
/// \param off offset into the src double word array
/// \details VecLoadAligned() loads a vector from an aligned double word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint64x2_p VecLoadAligned(int off, const word64 src[2])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // The 32-bit cast is not a typo. Compiler workaround.
    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}

#endif

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecLoadBE() uses POWER9's <tt>vec_xl_be</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl_be on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoadBE(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
#elif defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
#else
    return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
#endif
}
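
// Usage sketch (hypothetical message block, not library code).
// VecLoadBE() yields the same big-endian lane values on BE and LE
// builds, which suits algorithms specified in big-endian order.
//
//   const byte msg[16] = { /* big-endian data */ };
//   uint32x4_p w = VecLoadBE(msg);  // bytes reversed on little-endian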

/// \brief Loads a vector from a byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecLoadBE() uses POWER9's <tt>vec_xl_be</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecLoad_ALTIVEC() is used if POWER9 is not available.
/// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xl_be on POWER9 and above, Altivec load on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
/// \since Crypto++ 6.0
inline uint32x4_p VecLoadBE(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
#elif defined(CRYPTOPP_BIG_ENDIAN)
    return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
#else
    return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
#endif
}

//@}

/// \name STORE OPERATIONS
//@{

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStore_ALTIVEC() stores a vector to a byte array.
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VecStore_ALTIVEC() is used when the unaligned stores of
/// POWER7 and above are not available.
/// \par Wraps
/// vec_st, vec_ste, vec_lvsr, vec_perm
/// \sa VecStore, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore_ALTIVEC(const T data, byte dest[16])
{
    // Avoid IsAlignedOn for convenience.
    uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    if (addr % 16 == 0)
    {
        vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        // Rotate the data into store order, then write it out one
        // element at a time to cover all 16 unaligned bytes.
        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
        vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
    }
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStore_ALTIVEC() stores a vector to a byte array.
/// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
/// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
/// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
/// memory addresses.
/// \details VecStore_ALTIVEC() is used when the unaligned stores of
/// POWER7 and above are not available.
/// \par Wraps
/// vec_st, vec_ste, vec_lvsr, vec_perm
/// \sa VecStore, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
{
    // Avoid IsAlignedOn for convenience.
    uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    if (addr % 16 == 0)
    {
        vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
    }
    else
    {
        // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
        // Rotate the data into store order, then write it out one
        // element at a time to cover all 16 unaligned bytes.
        uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
        vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
        vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
        vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
    }
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStore() stores a vector to a byte array.
/// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER9 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on POWER9 and above, Altivec store on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template<class T>
inline void VecStore(const T data, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
#endif
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStore() stores a vector to a byte array.
/// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER9 is not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on POWER9 and above, Altivec store on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template<class T>
inline void VecStore(const T data, int off, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the word array
/// \details VecStore() stores a vector to a word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest word array
/// \param dest the word array
/// \details VecStore() stores a vector to a word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, int off, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the word array
/// \details VecStore() stores a vector to a word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecStore() with 64-bit elements is available on POWER8 and above.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, word64 dest[2])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // 32-bit cast is not a typo. Compiler workaround.
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}
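
// Round-trip usage sketch (assumed buffers, not library code): load a
// vector and store it back with the matching overload.
//
//   word32 in[4], out[4];
//   uint32x4_p s = VecLoad(in);
//   VecStore(s, out);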

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest word array
/// \param dest the word array
/// \details VecStore() stores a vector to a word array.
/// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
/// VecStore_ALTIVEC() can be relatively expensive if extra instructions
/// are required to fix up unaligned memory addresses.
/// \details VecStore() with 64-bit elements is available on POWER8 and above.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 8.0
template<class T>
inline void VecStore(const T data, int off, word64 dest[2])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // 32-bit cast is not a typo. Compiler workaround.
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStoreAligned() stores a vector to an aligned byte array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// <tt>vec_st</tt> is used if POWER9 is not available. The effective
/// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on POWER9 or above, vec_st on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#else
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStoreAligned() stores a vector to an aligned byte array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// <tt>vec_st</tt> is used if POWER9 is not available. The effective
/// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on POWER9 or above, vec_st on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, int off, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#else
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
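
// Usage sketch (assumes a 16-byte aligned destination, not library
// code): prefer VecStoreAligned() when alignment is guaranteed, since
// the vec_st path avoids unaligned fixups on older hardware.
//
//   CRYPTOPP_ALIGN_DATA(16) byte out[16];
//   VecStoreAligned(VecZero(), out);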

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param dest the word array
/// \details VecStoreAligned() stores a vector to an aligned word array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
/// is used if POWER7 is not available. The effective address of <tt>dest</tt>
/// must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a word array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest word array
/// \param dest the word array
/// \details VecStoreAligned() stores a vector to an aligned word array.
/// \details VecStoreAligned() uses POWER9's <tt>vec_xst</tt> if available.
/// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
/// is used if POWER7 is not available. The effective address of <tt>dest</tt>
/// must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
/// \sa VecStore_ALTIVEC, VecStore
/// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, int off, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param dest the byte array
/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER9's <tt>vec_xst_be</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore() is used if POWER9 is not available, and it can be
/// relatively expensive if extra instructions are required to fix up
/// unaligned memory addresses.
/// \par Wraps
/// vec_xst_be on POWER9 and above, vec_xst or Altivec store on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template <class T>
inline void VecStoreBE(const T data, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(CRYPTOPP_BIG_ENDIAN)
    VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
#else
    VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
#endif
}

/// \brief Stores a vector to a byte array
/// \tparam T vector type
/// \param data the vector
/// \param off offset into the dest byte array
/// \param dest the byte array
/// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
/// will reverse all bytes in the array on a little endian system.
/// \details VecStoreBE() uses POWER9's <tt>vec_xst_be</tt> if available.
/// The instruction does not require aligned effective memory addresses.
/// VecStore() is used if POWER9 is not available, and it can be
/// relatively expensive if extra instructions are required to fix up
/// unaligned memory addresses.
/// \par Wraps
/// vec_xst_be on POWER9 and above, vec_xst or Altivec store on POWER8 and below
/// \sa VecStore_ALTIVEC, VecStoreAligned
/// \since Crypto++ 6.0
template <class T>
inline void VecStoreBE(const T data, int off, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(CRYPTOPP_BIG_ENDIAN)
    VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
#else
    VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
#endif
}
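
// Usage sketch (hypothetical counter block, not library code).
// VecStoreBE() writes the same big-endian byte order on BE and LE
// builds, which suits wire formats and big-endian counters.
//
//   byte ctr[16];
//   VecStoreBE(VecOne(), ctr);  // big-endian byte order on both BE and LE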
//@}

/// \name LOGICAL OPERATIONS
//@{

/// \brief AND two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd() performs <tt>vec1 & vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_and
/// \sa VecAnd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    return (T1)vec_and(vec1, (T1)vec2);
}

/// \brief OR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr() performs <tt>vec1 | vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_or
/// \sa VecOr64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    return (T1)vec_or(vec1, (T1)vec2);
}

/// \brief XOR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor() performs <tt>vec1 ^ vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_xor
/// \sa VecXor64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    return (T1)vec_xor(vec1, (T1)vec2);
}
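// A logical operations sketch (illustrative only). A common pattern is
// XOR-whitening a state vector with a key vector; the mixed vector types
// are handled by the cast inside VecXor():
//
//   uint32x4_p state = VecLoad(statePtr);  // statePtr is an assumed pointer
//   uint8x16_p key = VecLoad(keyPtr);      // keyPtr is an assumed pointer
//   state = VecXor(state, key);            // result keeps type uint32x4_p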
//@}

/// \name ARITHMETIC OPERATIONS
//@{

/// \brief Add two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd() performs <tt>vec1 + vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_add
/// \sa VecAdd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    return (T1)vec_add(vec1, (T1)vec2);
}

/// \brief Subtract two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub() performs <tt>vec1 - vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \par Wraps
/// vec_sub
/// \sa VecSub64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    return (T1)vec_sub(vec1, (T1)vec2);
}
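// An element-wise arithmetic sketch (illustrative only). VecAdd() and
// VecSub() operate per 32-bit lane with wrap-around, so no carry crosses
// lane boundaries:
//
//   const uint32x4_p a = {0xFFFFFFFF, 1, 2, 3};
//   const uint32x4_p b = {1, 1, 1, 1};
//   const uint32x4_p s = VecAdd(a, b);  // {0, 2, 3, 4}
//   const uint32x4_p d = VecSub(s, b);  // {0xFFFFFFFF, 1, 2, 3} again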
//@}

/// \name PERMUTE OPERATIONS
//@{

/// \brief Permutes a vector
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec the vector
/// \param mask vector mask
/// \return vector
/// \details VecPermute() creates a new vector from vec according to mask.
/// mask is an uint8x16_p vector. The return vector is the same type as vec.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecPermute(const T1 vec, const T2 mask)
{
    return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
}

/// \brief Permutes two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \param mask vector mask
/// \return vector
/// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
/// mask is an uint8x16_p vector. The return vector is the same type as vec1.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
{
    return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
}
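// A permute sketch (illustrative only). The mask selects source bytes by
// index, so reversing all 16 bytes of a vector looks like:
//
//   const uint8x16_p rev = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
//   uint32x4_p x = VecLoad(ptr);   // ptr is an assumed word32 pointer
//   x = VecPermute(x, rev);        // result byte i is source byte rev[i]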
//@}

/// \name SHIFT AND ROTATE OPERATIONS
//@{

/// \brief Shift a vector left
/// \tparam C shift byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecShiftLeftOctet() returns a new vector after shifting the
/// concatenation of the zero vector and the source vector by the specified
/// number of bytes. The return vector is the same type as vec.
/// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
/// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
/// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftLeftOctet<12>(x);
/// </pre>
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecShiftLeftOctet(const T vec)
{
    const T zero = {0};
    if (C >= 16)
    {
        // Out of range
        return zero;
    }
    else if (C == 0)
    {
        // Noop
        return vec;
    }
    else
    {
#if defined(CRYPTOPP_BIG_ENDIAN)
        enum { R=C&0xf };
        return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
#else
        enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
        return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
#endif
    }
}

/// \brief Shift a vector right
/// \tparam C shift byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecShiftRightOctet() returns a new vector after shifting the
/// concatenation of the zero vector and the source vector by the specified
/// number of bytes. The return vector is the same type as vec.
/// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(z, a,
/// 16-c)</tt>. On little endian machines VecShiftRightOctet() is translated to
/// <tt>vec_sld(a, z, c)</tt>. You should always call the function as
/// if on a big endian machine as shown below.
/// <pre>
/// uint8x16_p x = VecLoad(ptr);
/// uint8x16_p y = VecShiftRightOctet<12>(x);
/// </pre>
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecShiftRightOctet(const T vec)
{
    const T zero = {0};
    if (C >= 16)
    {
        // Out of range
        return zero;
    }
    else if (C == 0)
    {
        // Noop
        return vec;
    }
    else
    {
#if defined(CRYPTOPP_BIG_ENDIAN)
        enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
        return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
#else
        enum { R=C&0xf };
        return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
#endif
    }
}

/// \brief Rotate a vector left
/// \tparam C rotate byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecRotateLeftOctet() returns a new vector after rotating the
/// concatenation of the source vector with itself by the specified
/// number of bytes. The return vector is the same type as vec.
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecRotateLeftOctet(const T vec)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    enum { R = C&0xf };
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#else
    enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#endif
}

/// \brief Rotate a vector right
/// \tparam C rotate byte count
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \details VecRotateRightOctet() returns a new vector after rotating the
/// concatenation of the source vector with itself by the specified
/// number of bytes. The return vector is the same type as vec.
/// \par Wraps
/// vec_sld
/// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
/// endian sensitive?</A> on Stack Overflow
/// \since Crypto++ 6.0
template <unsigned int C, class T>
inline T VecRotateRightOctet(const T vec)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#else
    enum { R = C&0xf };
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
#endif
}
/// \brief Rotate a vector left
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft() rotates each element in a vector by
/// bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 7.0
template<unsigned int C>
inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(vec, m);
}

/// \brief Rotate a vector right
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight() rotates each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 7.0
template<unsigned int C>
inline uint32x4_p VecRotateRight(const uint32x4_p vec)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(vec, m);
}

/// \brief Shift a vector left
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftLeft() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_sl
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_sl(vec, m);
}

/// \brief Shift a vector right
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftRight() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \par Wraps
/// vec_sr
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint32x4_p VecShiftRight(const uint32x4_p vec)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_sr(vec, m);
}
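// A rotate sketch (illustrative only). Per-element rotates are the
// workhorse of ARX ciphers; a ChaCha-style mixing step might read:
//
//   x = VecAdd(x, y);           // x, y, z are assumed uint32x4_p values
//   z = VecXor(z, x);
//   z = VecRotateLeft<16>(z);   // rotate each 32-bit element by 16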
// 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \brief Rotate a vector left
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft() rotates each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecRotateLeft() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.0
template<unsigned int C>
inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
{
    const uint64x2_p m = {C, C};
    return vec_rl(vec, m);
}

/// \brief Shift a vector left
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftLeft() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecShiftLeft() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_sl
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
{
    const uint64x2_p m = {C, C};
    return vec_sl(vec, m);
}

/// \brief Rotate a vector right
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight() rotates each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecRotateRight() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.0
template<unsigned int C>
inline uint64x2_p VecRotateRight(const uint64x2_p vec)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(vec, m);
}

/// \brief Shift a vector right
/// \tparam C shift bit count
/// \param vec the vector
/// \return vector
/// \details VecShiftRight() shifts each element in a vector
/// by bit count. The return vector is the same type as vec.
/// \details VecShiftRight() with 64-bit elements is available on
/// POWER8 and above.
/// \par Wraps
/// vec_sr
/// \since Crypto++ 8.1
template<unsigned int C>
inline uint64x2_p VecShiftRight(const uint64x2_p vec)
{
    const uint64x2_p m = {C, C};
    return vec_sr(vec, m);
}

#endif // ARCH_PWR8
//@}

/// \name OTHER OPERATIONS
//@{

/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \par Wraps
/// vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    return vec_mergel(vec1, vec2);
}

/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \par Wraps
/// vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    return vec_mergeh(vec1, vec2);
}

/// \brief Broadcast 32-bit word to a vector
/// \param val the 32-bit value
/// \return vector
/// \par Wraps
/// vec_splats
/// \since Crypto++ 8.3
inline uint32x4_p VecSplatWord(word32 val)
{
    // Fix spurious GCC warning???
    CRYPTOPP_UNUSED(val);

    // Apple Altivec and XL C++ do not offer vec_splats.
    // GCC offers vec_splats back to -mcpu=power4.
#if defined(_ARCH_PWR4) && defined(__GNUC__)
    return vec_splats(val);
#else
    //const word32 x[4] = {val,val,val,val};
    //return VecLoad(x);
    const word32 x[4] = {val};
    return vec_splat(VecLoad(x),0);
#endif
}

/// \brief Broadcast 32-bit element to a vector
/// \tparam N the element number
/// \param val the 32-bit value
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint32x4_p VecSplatElement(const uint32x4_p val)
{
    return vec_splat(val, N);
}

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Broadcast 64-bit double word to a vector
/// \param val the 64-bit value
/// \return vector
/// \par Wraps
/// vec_splats
/// \since Crypto++ 8.3
inline uint64x2_p VecSplatWord(word64 val)
{
    // The PPC64 ABI says so.
    return vec_splats((unsigned long long)val);
}

/// \brief Broadcast 64-bit element to a vector
/// \tparam N the element number
/// \param val the 64-bit value
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint64x2_p VecSplatElement(const uint64x2_p val)
{
#if defined(__VSX__) || defined(_ARCH_PWR8)
    return vec_splat(val, N);
#else
    enum {E=N&1};
    if (E == 0)
    {
        const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
        return vec_perm(val, val, m);
    }
    else // (E == 1)
    {
        const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
        return vec_perm(val, val, m);
    }
#endif
}
#endif
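// A splat sketch (illustrative only). Broadcasting a round constant and
// duplicating a single element are both common in key schedules:
//
//   const uint32x4_p rcon = VecSplatWord(0x9E3779B9);  // {c, c, c, c}
//   uint32x4_p k = VecLoad(keyPtr);        // keyPtr is an assumed pointer
//   k = VecXor(k, VecSplatElement<3>(k));  // XOR element 3 into every lane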
/// \brief Extract a dword from a vector
/// \tparam T vector type
/// \param val the vector
/// \return vector created from low dword
/// \details VecGetLow() extracts the low dword from a vector. The low dword
/// is composed of the least significant bits and occupies bytes 8 through 15
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
/// \par Wraps
/// vec_sld
/// \since Crypto++ 7.0
template <class T>
inline T VecGetLow(const T val)
{
#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
    const T zero = {0};
    return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
#else
    return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
#endif
}

/// \brief Extract a dword from a vector
/// \tparam T vector type
/// \param val the vector
/// \return vector created from high dword
/// \details VecGetHigh() extracts the high dword from a vector. The high dword
/// is composed of the most significant bits and occupies bytes 0 through 7
/// when viewed as a big endian array. The return vector is the same type as
/// the original vector and padded with 0's in the most significant bit positions.
/// \par Wraps
/// vec_sld
/// \since Crypto++ 7.0
template <class T>
inline T VecGetHigh(const T val)
{
#if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
    const T zero = {0};
    return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
#else
    return VecShiftRightOctet<8>(val);
#endif
}

/// \brief Exchange high and low double words
/// \tparam T vector type
/// \param vec the vector
/// \return vector
/// \par Wraps
/// vec_sld
/// \since Crypto++ 7.0
template <class T>
inline T VecSwapWords(const T vec)
{
    return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
}
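// A dword extraction sketch (illustrative only). Splitting a 128-bit
// value into its 64-bit halves and exchanging them looks like:
//
//   uint64x2_p v = (uint64x2_p)VecLoad(buf);  // buf is an assumed word32[4]
//   uint64x2_p lo = VecGetLow(v);    // low dword, upper half zeroed
//   uint64x2_p hi = VecGetHigh(v);   // high dword, upper half zeroed
//   uint64x2_p sw = VecSwapWords(v); // high and low dwords exchanged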
//@}

/// \name COMPARISON
//@{

/// \brief Compare two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return true if vec1 equals vec2, false otherwise
/// \details VecEqual() performs a bitwise compare. The vector element types do
/// not matter.
/// \par Wraps
/// vec_all_eq
/// \since Crypto++ 8.0
template <class T1, class T2>
inline bool VecEqual(const T1 vec1, const T2 vec2)
{
    return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}

/// \brief Compare two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return true if vec1 does not equal vec2, false otherwise
/// \details VecNotEqual() performs a bitwise compare. The vector element types do
/// not matter.
/// \par Wraps
/// vec_all_eq
/// \since Crypto++ 8.0
template <class T1, class T2>
inline bool VecNotEqual(const T1 vec1, const T2 vec2)
{
    return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
}
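// A comparison sketch (illustrative only). The compare is bitwise, so
// mixed element types are fine; a typical self-test reads:
//
//   const uint32x4_p a = VecLoad(buf);  // buf is an assumed word32[4]
//   const uint8x16_p b = (uint8x16_p)a;
//   CRYPTOPP_ASSERT(VecEqual(a, b));    // same 128 bits, so it holds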
//@}

////////////////// 32-bit Altivec /////////////////

/// \name 32-BIT ALTIVEC
//@{

/// \brief Add two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
/// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
/// the carries from the elements.
/// \par Wraps
/// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
/// \since Crypto++ 8.3
inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
    // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
    return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
#else
    // The carry mask selects carries for elements 1 and 3 and sets
    // remaining elements to 0. The result is then shifted so the
    // carried values are added to elements 0 and 2.
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint32x4_p zero = {0, 0, 0, 0};
    const uint32x4_p mask = {0, 1, 0, 1};
#else
    const uint32x4_p zero = {0, 0, 0, 0};
    const uint32x4_p mask = {1, 0, 1, 0};
#endif

    uint32x4_p cy = vec_addc(vec1, vec2);
    uint32x4_p res = vec_add(vec1, vec2);
    cy = vec_and(mask, cy);
    cy = vec_sld (cy, zero, 4);
    return vec_add(res, cy);
#endif
}

#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Add two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
/// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
/// the carries from the elements.
/// \par Wraps
/// vec_add for POWER8
/// \since Crypto++ 8.3
inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
{
    // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
    const uint64x2_p res = vec_add(vec1, vec2);

#if defined(CRYPTOPP_DEBUG)
    // Test 32-bit add in debug builds while we are here.
    const uint32x4_p x = (uint32x4_p)vec1;
    const uint32x4_p y = (uint32x4_p)vec2;
    const uint32x4_p r = VecAdd64(x, y);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif

    return res;
}
#endif

/// \brief Subtract two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
/// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
/// manages the borrows from the elements.
/// \par Wraps
/// vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
/// \since Crypto++ 8.3
inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
    // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
    return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
#else
    // The borrow mask selects borrows for elements 1 and 3 and sets
    // remaining elements to 0. The result is then shifted so the
    // borrowed values are subtracted from elements 0 and 2.
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint32x4_p zero = {0, 0, 0, 0};
    const uint32x4_p mask = {0, 1, 0, 1};
#else
    const uint32x4_p zero = {0, 0, 0, 0};
    const uint32x4_p mask = {1, 0, 1, 0};
#endif

    // subc sets the complement of borrow, so we have to
    // un-complement it using andc.
    uint32x4_p bw = vec_subc(vec1, vec2);
    uint32x4_p res = vec_sub(vec1, vec2);
    bw = vec_andc(mask, bw);
    bw = vec_sld (bw, zero, 4);
    return vec_sub(res, bw);
#endif
}

#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Subtract two vectors as if uint64x2_p
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
/// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
/// manages the borrows from the elements.
/// \par Wraps
/// vec_sub for POWER8
/// \since Crypto++ 8.3
inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
{
    // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
    const uint64x2_p res = vec_sub(vec1, vec2);

#if defined(CRYPTOPP_DEBUG)
    // Test 32-bit sub in debug builds while we are here.
    const uint32x4_p x = (uint32x4_p)vec1;
    const uint32x4_p y = (uint32x4_p)vec2;
    const uint32x4_p r = VecSub64(x, y);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif

    return res;
}
#endif
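// A carry sketch for the Altivec path (illustrative only, big endian
// element order shown). Adding 1 to 0x00000000FFFFFFFF must carry from
// the low 32-bit lane into the high lane of the same 64-bit element,
// which the mask-and-shift code in VecAdd64() provides on POWER7 and below:
//
//   const uint32x4_p a = {0x00000000, 0xFFFFFFFF, 0, 0};
//   const uint32x4_p b = {0x00000000, 0x00000001, 0, 0};
//   const uint32x4_p s = VecAdd64(a, b);  // {0x00000001, 0x00000000, 0, 0}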
/// \brief Rotate a vector left as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft64() rotates each element in a vector by bit count.
/// vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
    // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
    return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
#else
    // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
    enum {S64=C&63, S32=C&31, BR=(S64>=32)};

    // Get the low bits, shift them to high bits
    uint32x4_p t1 = VecShiftLeft<S32>(vec);
    // Get the high bits, shift them to low bits
    uint32x4_p t2 = VecShiftRight<32-S32>(vec);

    if (S64 == 0)
    {
        const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
        return VecPermute(vec, m);
    }
    else if (S64 == 32)
    {
        const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
        return VecPermute(vec, m);
    }
    else if (BR)  // Big rotate amount?
    {
        const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
        t1 = VecPermute(t1, m);
    }
    else
    {
        const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
        t2 = VecPermute(t2, m);
    }

    return vec_or(t1, t2);
#endif
}

/// \brief Rotate a vector left as if uint64x2_p
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft64<8>() rotates each element in a vector
/// by 8-bits. vec is rotated as if uint64x2_p. This specialization
/// is used by algorithms like Speck128.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<>
inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    return VecPermute(vec, m);
#else
    const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    return VecPermute(vec, m);
#endif
}

#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Rotate a vector left as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateLeft64() rotates each element in a vector by
/// bit count. vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
{
    // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
    const uint64x2_p res = VecRotateLeft<C>(vec);

#if defined(CRYPTOPP_DEBUG)
    // Test 32-bit rotate in debug builds while we are here.
    const uint32x4_p x = (uint32x4_p)vec;
    const uint32x4_p r = VecRotateLeft64<C>(x);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif

    return res;
}
#endif

/// \brief Rotate a vector right as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64() rotates each element in a vector by
/// bit count. vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
    // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
    return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
#else
    // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
    enum {S64=C&63, S32=C&31, BR=(S64>=32)};

    // Get the low bits, shift them to high bits
    uint32x4_p t1 = VecShiftRight<S32>(vec);
    // Get the high bits, shift them to low bits
    uint32x4_p t2 = VecShiftLeft<32-S32>(vec);

    if (S64 == 0)
    {
        const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
        return VecPermute(vec, m);
    }
    else if (S64 == 32)
    {
        const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
        return VecPermute(vec, m);
    }
    else if (BR)  // Big rotate amount?
    {
        const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
        t1 = VecPermute(t1, m);
    }
    else
    {
        const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
        t2 = VecPermute(t2, m);
    }

    return vec_or(t1, t2);
#endif
}

/// \brief Rotate a vector right as if uint64x2_p
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64<8>() rotates each element in a vector
/// by 8-bits. vec is rotated as if uint64x2_p. This specialization
/// is used by algorithms like Speck128.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<>
inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    return VecPermute(vec, m);
#else
    const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    return VecPermute(vec, m);
#endif
}

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Rotate a vector right as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64() rotates each element in a vector by
/// bit count. vec is rotated as if uint64x2_p.
/// \par Wraps
/// vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
{
    // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
    const uint64x2_p res = VecRotateRight<C>(vec);

#if defined(CRYPTOPP_DEBUG)
    // Test 32-bit rotate in debug builds while we are here.
    const uint32x4_p x = (uint32x4_p)vec;
    const uint32x4_p r = VecRotateRight64<C>(x);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif

    return res;
}
#endif
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 & vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \details VecAnd64() is a convenience function that simply performs a VecAnd().
/// \par Wraps
/// vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    return (T1)vec_and(vec1, (T1)vec2);
}

/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \details VecOr64() is a convenience function that simply performs a VecOr().
/// \par Wraps
/// vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    return (T1)vec_or(vec1, (T1)vec2);
}

/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>.
/// vec2 is cast to the same type as vec1. The return vector
/// is the same type as vec1.
/// \details VecXor64() is a convenience function that simply performs a VecXor().
/// \par Wraps
/// vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    return (T1)vec_xor(vec1, (T1)vec2);
}

/// \brief Broadcast 64-bit double word to a vector
/// \param val the 64-bit value
/// \return vector
/// \par Wraps
/// vec_splats
/// \since Crypto++ 8.3
inline uint32x4_p VecSplatWord64(word64 val)
{
#if defined(_ARCH_PWR8)
    // The PPC64 ABI says so.
    return (uint32x4_p)vec_splats((unsigned long long)val);
#else
    const word64 x[2] = {val,val};
    return (uint32x4_p)VecLoad((const word32*)x);
#endif
}

/// \brief Broadcast 64-bit element to a vector as if uint64x2_p
/// \tparam N the element number
/// \param val the 64-bit value
/// \return vector
/// \par Wraps
/// vec_splat
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint32x4_p VecSplatElement64(const uint32x4_p val)
{
#if defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_splat((uint64x2_p)val, N);
#else
    enum {E=N&1};
    if (E == 0)
    {
        const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
        return (uint32x4_p)vec_perm(val, val, m);
    }
    else // (E == 1)
    {
        const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
        return (uint32x4_p)vec_perm(val, val, m);
    }
#endif
}

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Broadcast 64-bit element to a vector
/// \tparam N the element number
/// \param val the 64-bit value
/// \return vector
/// \since Crypto++ 8.3
template <unsigned int N>
inline uint64x2_p VecSplatElement64(const uint64x2_p val)
{
    return vec_splat(val, N);
}
#endif

//@}
//////////////////////// Power8 Crypto ////////////////////////

// __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
// when it is not available, like with Power7. Sigh...
#if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)

/// \name POLYNOMIAL MULTIPLICATION
//@{

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
/// polynomial multiplication multiplies the high and low terms, and then
/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
/// al*bl</tt>. This behavior differs from Intel polynomial
/// multiplication. To obtain a single product without the XOR, set
/// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
/// \par Wraps
/// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
/// \since Crypto++ 8.1
inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return __vpmsumw (a, b);
#elif defined(__clang__)
    return __builtin_altivec_crypto_vpmsumw (a, b);
#else
    return __builtin_crypto_vpmsumw (a, b);
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecPolyMultiply() performs polynomial multiplication. POWER8
/// polynomial multiplication multiplies the high and low terms, and then
/// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
/// al*bl</tt>. This behavior differs from Intel polynomial
/// multiplication. To obtain a single product without the XOR, set
/// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
/// results in <tt>0*bh XOR al*bl = al*bl</tt>.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.1
inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return __vpmsumd (a, b);
#elif defined(__clang__)
    return __builtin_altivec_crypto_vpmsumd (a, b);
#else
    return __builtin_crypto_vpmsumd (a, b);
#endif
}
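// A polynomial multiply sketch (illustrative only). Zeroing the high
// dwords isolates the single product al*bl, as described above:
//
//   // a and b are assumed uint64x2_p values
//   const uint64x2_p al = VecGetLow(a);             // {0, a_low}
//   const uint64x2_p bl = VecGetLow(b);             // {0, b_low}
//   const uint64x2_p p  = VecPolyMultiply(al, bl);  // 0*0 XOR al*bl = al*bl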
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
#else
    return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply01() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
#else
    return VecPolyMultiply(a, VecGetHigh(b));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply10() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
#else
    return VecPolyMultiply(VecGetHigh(a), b);
#endif
}

/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details VecIntelMultiply11() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
/// \par Wraps
/// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
/// \since Crypto++ 8.0
inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
#else
    return VecPolyMultiply(VecGetLow(a), b);
#endif
}
//@}

/// \name AES ENCRYPTION
//@{

/// \brief One round of AES encryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecEncrypt() performs one round of AES encryption of state
/// using subkey key. The return vector is the same type as state.
/// \details VecEncrypt() is available on POWER8 and above.
/// \par Wraps
/// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecEncrypt(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief Final round of AES encryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecEncryptLast() performs the final round of AES encryption
/// of state using subkey key. The return vector is the same type as state.
/// \details VecEncryptLast() is available on POWER8 and above.
/// \par Wraps
/// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecEncryptLast(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief One round of AES decryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecDecrypt() performs one round of AES decryption of state
/// using subkey key. The return vector is the same type as state.
/// \details VecDecrypt() is available on POWER8 and above.
/// \par Wraps
/// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecDecrypt(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief Final round of AES decryption
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param state the state vector
/// \param key the subkey vector
/// \details VecDecryptLast() performs the final round of AES decryption
/// of state using subkey key. The return vector is the same type as state.
/// \details VecDecryptLast() is available on POWER8 and above.
/// \par Wraps
/// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <class T1, class T2>
inline T1 VecDecryptLast(const T1 state, const T2 key)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
#elif defined(__clang__)
    return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#elif defined(__GNUC__)
    return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
#else
    CRYPTOPP_ASSERT(0);
#endif
}
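// An AES-128 encryption sketch (illustrative only). It assumes subkeys is
// a byte array holding the 11 expanded round keys, 16 bytes apart and
// 16-byte aligned. The first round key is applied with a plain XOR, the
// middle rounds with VecEncrypt(), and the last with VecEncryptLast():
//
//   uint8x16_p block = VecLoadBE(ptr);   // ptr is an assumed byte[16]
//   block = VecXor(block, VecLoadAligned(subkeys));
//   for (int i = 1; i < 10; ++i)
//       block = VecEncrypt(block, VecLoadAligned(i*16, subkeys));
//   block = VecEncryptLast(block, VecLoadAligned(10*16, subkeys));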
//@}

/// \name SHA DIGESTS
//@{

/// \brief SHA256 Sigma functions
/// \tparam func function
/// \tparam fmask function mask
/// \tparam T vector type
/// \param data the block to transform
/// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
/// func and fmask. The return vector is the same type as data.
/// \details VecSHA256() is available on POWER8 and above.
/// \par Wraps
/// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <int func, int fmask, class T>
inline T VecSHA256(const T data)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
#elif defined(__clang__)
    return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
#elif defined(__GNUC__)
    return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
#else
    CRYPTOPP_ASSERT(0);
#endif
}

/// \brief SHA512 Sigma functions
/// \tparam func function
/// \tparam fmask function mask
/// \tparam T vector type
/// \param data the block to transform
/// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
/// func and fmask. The return vector is the same type as data.
/// \details VecSHA512() is available on POWER8 and above.
/// \par Wraps
/// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
/// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
template <int func, int fmask, class T>
inline T VecSHA512(const T data)
{
#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
    return (T)__vshasigmad((uint64x2_p)data, func, fmask);
#elif defined(__clang__)
    return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
#elif defined(__GNUC__)
    return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
#else
    CRYPTOPP_ASSERT(0);
#endif
}
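// A SHA sigma sketch (illustrative only). Assuming func selects between
// lower-case sigma (0) and upper-case Sigma (1), and fmask selects the
// 0/1 variant per element, the usual four functions are:
//
//   const uint32x4_p s0 = VecSHA256<0,0>(x);    // sigma0(x)
//   const uint32x4_p s1 = VecSHA256<0,0xf>(x);  // sigma1(x)
//   const uint32x4_p S0 = VecSHA256<1,0>(x);    // Sigma0(x)
//   const uint32x4_p S1 = VecSHA256<1,0xf>(x);  // Sigma1(x)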
//@}

#endif // __CRYPTO__

#endif // _ALTIVEC_

NAMESPACE_END

#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic pop
#endif

#endif // CRYPTOPP_PPC_CRYPTO_H