// arm_simd.h - written and placed in public domain by Jeffrey Walton

/// \file arm_simd.h
/// \brief Support functions for ARM and vector operations

#ifndef CRYPTOPP_ARM_SIMD_H
#define CRYPTOPP_ARM_SIMD_H

#include "config.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <stdint.h>
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name CRC32 checksum
//@{

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32B (uint32_t crc, uint8_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return __crc32b(crc, val);
#else
    __asm__ ("crc32b   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32W (uint32_t crc, uint32_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return __crc32w(crc, val);
#else
    __asm__ ("crc32w   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param vals the values to checksum
/// \return CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(CRYPTOPP_MSC_VERSION)
    return __crc32w(__crc32w(__crc32w(__crc32w(
             crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32w   %w0, %w0, %w1   \n\t"
             "crc32w   %w0, %w0, %w2   \n\t"
             "crc32w   %w0, %w0, %w3   \n\t"
             "crc32w   %w0, %w0, %w4   \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
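
// A usage sketch (not part of the original header): fold 16-byte blocks into
// a running CRC32 with CRC32Wx4(), then finish odd bytes with CRC32B(). The
// names 'state', 'block' and 'len' are illustrative only.
//
//   #include <cstring>
//   uint32_t state = 0xFFFFFFFF;          // customary initial value
//   while (len >= 16) {
//       uint32_t words[4];
//       std::memcpy(words, block, 16);    // avoids unaligned word loads
//       state = CRC32Wx4(state, words);   // four crc32w steps in one call
//       block += 16; len -= 16;
//   }
//   while (len--)
//       state = CRC32B(state, *block++);
//   state = ~state;                       // customary final inversion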

//@}

/// \name CRC32-C checksum
//@{

/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32-C value
/// \since Crypto++ 8.6
inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return __crc32cb(crc, val);
#else
    __asm__ ("crc32cb   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32-C value
/// \since Crypto++ 8.6
inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return __crc32cw(crc, val);
#else
    __asm__ ("crc32cw   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param vals the values to checksum
/// \return CRC32-C value
/// \since Crypto++ 8.6
inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(CRYPTOPP_MSC_VERSION)
    return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
             crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32cw   %w0, %w0, %w1   \n\t"
             "crc32cw   %w0, %w0, %w2   \n\t"
             "crc32cw   %w0, %w0, %w3   \n\t"
             "crc32cw   %w0, %w0, %w4   \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
//@}
#endif  // CRYPTOPP_ARM_CRC32_AVAILABLE

#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name Polynomial multiplication
//@{

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
///  The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
///  are multiplied.
/// \note An Intel XMM register is 128 bits wide. The leftmost bit is the
///  MSB, numbered 127, while the rightmost bit is the LSB, numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d   \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
///  The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
///  64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is 128 bits wide. The leftmost bit is the
///  MSB, numbered 127, while the rightmost bit is the LSB, numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d   \n\t"
            :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
///  The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
///  64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is 128 bits wide. The leftmost bit is the
///  MSB, numbered 127, while the rightmost bit is the LSB, numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d   \n\t"
            :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
///  The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
///  are multiplied.
/// \note An Intel XMM register is 128 bits wide. The leftmost bit is the
///  MSB, numbered 127, while the rightmost bit is the LSB, numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d   \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
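
// For reference, the lanes the four helpers above select, as read from the
// implementations:
//
//   PMULL_00(a, b)  multiplies  a[0] * b[0]
//   PMULL_01(a, b)  multiplies  a[0] * b[1]
//   PMULL_10(a, b)  multiplies  a[1] * b[0]
//   PMULL_11(a, b)  multiplies  a[1] * b[1]
//
// where a[i] is the i-th 64-bit lane. The multiplication is carry-less
// (GF(2)[x]): for example, 0x3 * 0x3 = 0x5, since (x+1)(x+1) = x^2 + 1 when
// coefficients are reduced mod 2.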

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL() performs vmull_p64(). PMULL is provided using GCC
///  inline assembly because some versions of Clang lack support for the
///  intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d   \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_HIGH() performs vmull_high_p64(). PMULL_HIGH is provided
///  using GCC inline assembly because some versions of Clang lack support
///  for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d   \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}

/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first value
/// \param b the second value
/// \return vector
/// \details VEXT_U8() extracts the upper <tt>16-C</tt> bytes of vector
///  <tt>a</tt> followed by the first <tt>C</tt> bytes of vector <tt>b</tt>.
///  VEXT_U8 is provided using GCC inline assembly because some versions of
///  Clang lack support for the intrinsic.
/// \since Crypto++ 8.0
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
    // https://github.com/weidai11/cryptopp/issues/366
#if defined(CRYPTOPP_MSC_VERSION)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    uint64x2_t r;
    __asm__ ("ext   %0.16b, %1.16b, %2.16b, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
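
// A worked example (illustrative, not part of the original header): with
// C=8 the result is the upper half of 'a' followed by the lower half of
// 'b', a common way to splice 64-bit lanes:
//
//   // a = {a0, a1}, b = {b0, b1} as 64-bit lanes
//   uint64x2_t r = VEXT_U8<8>(a, b);   // r = {a1, b0}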

//@}
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE

#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name ARMv8.2 operations
//@{

/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return three-way exclusive OR of the values
/// \details VEOR3() performs veor3q_u64(). VEOR3 is provided using GCC
///  inline assembly because some versions of Clang lack support for the
///  intrinsic.
/// \details VEOR3 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return veor3q_u64(a, b, c);
#else
    uint64x2_t r;
    __asm__ ("eor3   %0.16b, %1.16b, %2.16b, %3.16b   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "w" (c));
    return r;
#endif
}
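
// Note: EOR3 computes a ^ b ^ c in one instruction, so VEOR3(a, b, c) can
// stand in for veorq_u64(veorq_u64(a, b), c), saving a vector EOR in
// XOR-heavy code such as Keccak's theta step.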

/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param c the rotate amount
/// \return two-way exclusive OR of the values, then rotated by c
/// \details VXAR() performs vxarq_u64(). VXAR is provided using GCC inline
///  assembly because some versions of Clang lack support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return vxarq_u64(a, b, c);
#else
    uint64x2_t r;
    __asm__ ("xar   %0.2d, %1.2d, %2.2d, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c));
    return r;
#endif
}

/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated by C
/// \details VXAR() performs vxarq_u64(). VXAR is provided using GCC inline
///  assembly because some versions of Clang lack support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return vxarq_u64(a, b, C);
#else
    uint64x2_t r;
    __asm__ ("xar   %0.2d, %1.2d, %2.2d, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C));
    return r;
#endif
}
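
// Note: XAR rotates the XOR of its inputs right, per 64-bit lane:
//
//   VXAR<C>(a, b)  ==  ROR64(a ^ b, C)  in each lane
//
// so a left rotation by R bits (as in Keccak's rho step) can be written as
// VXAR<64-R>(a, b) for 0 < R < 64.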

/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated 1-bit
/// \details VRAX1() performs vrax1q_u64(). VRAX1 is provided using GCC
///  inline assembly because some versions of Clang lack support for the
///  intrinsic.
/// \details VRAX1 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if defined(CRYPTOPP_MSC_VERSION)
    return vrax1q_u64(a, b);
#else
    uint64x2_t r;
    __asm__ ("rax1   %0.2d, %1.2d, %2.2d   \n\t"
            :"=w" (r) : "w" (a), "w" (b));
    return r;
#endif
}
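
// Note: RAX1 folds a one-bit left rotation into the XOR, per 64-bit lane:
//
//   VRAX1(a, b)  ==  a ^ ROL64(b, 1)  in each lane
//
// which matches the C[x-1] ^ ROL64(C[x+1], 1) pattern in Keccak's theta
// step.
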
//@}
#endif  // CRYPTOPP_ARM_SHA3_AVAILABLE

#endif  // CRYPTOPP_ARM_SIMD_H