#if (CRYPTOPP_CLMUL_AVAILABLE)
# include <emmintrin.h>
# include <wmmintrin.h>
#endif

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# include "arm_simd.h"
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN
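// The helpers in this anonymous namespace implement multiplication and squaring of
// 256-bit polynomials over GF(2), followed by reduction modulo the trinomial
// x^233 + x^74 + 1, i.e. arithmetic in GF(2^233) (GF2NT_233, the field of the NIST
// B-233/K-233 curves). Three backends are provided: CLMUL on x86, PMULL on ARMv8,
// and VMULL on POWER8.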
#if (CRYPTOPP_ARM_PMULL_AVAILABLE)

// c1:c0 = a * b (carry-less)
inline void
F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& a, const uint64x2_t& b)
{
    uint64x2_t t1, t2, z0={0};

    c0 = PMULL_00(a, b);
    c1 = PMULL_11(a, b);
    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
    t1 = veorq_u64(a, t1);
    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
    t2 = veorq_u64(b, t2);
    t1 = PMULL_00(t1, t2);
    t1 = veorq_u64(c0, t1);
    t1 = veorq_u64(c1, t1);
    t2 = t1;
    t1 = vextq_u64(z0, t1, 1);
    t2 = vextq_u64(t2, z0, 1);
    c0 = veorq_u64(c0, t1);
    c1 = veorq_u64(c1, t2);
}
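// The three PMULLs implement one level of Karatsuba over GF(2)[x]: with
// a = a1*x^64 + a0 and b = b1*x^64 + b0,
//   a*b = a1*b1*x^128 + ((a0 + a1)*(b0 + b1) + a1*b1 + a0*b0)*x^64 + a0*b0,
// where '+' is XOR. PMULL_00/PMULL_11 give a0*b0 and a1*b1, the third PMULL gives
// the middle term, and the vextq_u64 pair splits that 128-bit middle term across
// the low and high halves (c0, c1) of the 256-bit product.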
// c3:c2:c1:c0 = a1:a0 * b1:b0 (carry-less)
inline void
F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& b1, const uint64x2_t& b0,
    const uint64x2_t& a1, const uint64x2_t& a0)
{
    uint64x2_t c4, c5;
    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);

    x0 = veorq_u64(x0, x1);
    y0 = veorq_u64(y0, y1);

    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);

    c4 = veorq_u64(c4, c0);
    c4 = veorq_u64(c4, c2);
    c5 = veorq_u64(c5, c1);
    c5 = veorq_u64(c5, c3);
    c1 = veorq_u64(c1, c4);
    c2 = veorq_u64(c2, c5);
}
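// Second Karatsuba level over 128-bit limbs: c1:c0 = a0*b0, c3:c2 = a1*b1, and
// c5:c4 = (a0 + a1)*(b0 + b1). XORing a0*b0 and a1*b1 into c5:c4 recovers the
// middle term, which is then folded into the middle 256 bits (c2:c1) of the
// 512-bit product.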
// c3:c2:c1:c0 = a1:a0 * a1:a0; the body mirrors F2N_Square_256_CLMUL below
inline void
F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    c0 = PMULL_00(a0, a0);
    c1 = PMULL_11(a0, a0);
    c2 = PMULL_00(a1, a1);
    c3 = PMULL_11(a1, a1);
}
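// Squaring over GF(2)[x] has no cross terms: (a1*x^64 + a0)^2 = a1^2*x^128 + a0^2,
// since the mixed term 2*a0*a1*x^64 vanishes in characteristic 2. A square therefore
// needs only the four lane-by-itself carry-less multiplies, with no Karatsuba fixup.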
// x = x << N (128-bit shift across both 64-bit lanes)
template <unsigned int N>
inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
{
    uint64x2_t u=x, v, z={0};
    x = vshlq_n_u64(x, N);
    u = vshrq_n_u64(u, (64-N));
    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
    x = vorrq_u64(x, v);
    return x;
}
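// vshlq_n_u64 shifts each 64-bit lane independently, so the N bits that fall off the
// top of the low lane must be reinserted at the bottom of the high lane: u holds the
// per-lane carries (x >> (64-N)), and v keeps only the low lane's carry, placed in
// the high lane with the low lane zeroed, before being ORed back into x.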
// c1:c0 = c3:c2:c1:c0 mod x^233 + x^74 + 1
inline void
GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
{
    const unsigned int mask[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
    };

    uint64x2_t b3, b2, b1, a1, a0, m0, z0={0};
    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
    b1 = c1; a1 = c1;
    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
    a1 = vshlq_n_u64(a1, 23);
    a1 = vshrq_n_u64(a1, 23);
    c1 = vorrq_u64(a1, a0);
    b2 = vshrq_n_u64(c2, (64-23));
    c3 = ShiftLeft128_ARMv8<23>(c3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    c3 = vorrq_u64(c3, a0);
    b1 = vshrq_n_u64(b1, (64-23));
    c2 = ShiftLeft128_ARMv8<23>(c2);
    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
    c2 = vorrq_u64(c2, a0);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, a0);
    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
    b3 = veorq_u64(b3, a0);
    b1 = vshrq_n_u64(b3, (64-23));
    b3 = ShiftLeft128_ARMv8<23>(b3);
    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b1);
    c2 = veorq_u64(c2, b3);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_ARMv8<10>(b2);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
    c2 = veorq_u64(c2, a0);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    a0 = vorrq_u64(a0, a1);
    c3 = veorq_u64(c3, a0);
    c0 = veorq_u64(c0, c2);
    c1 = veorq_u64(c1, c3);
    c1 = vandq_u64(c1, m0);
}
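// The reduction uses x^233 = x^74 + 1 (mod p). Bit 233 of the product c3:c2:c1:c0
// lies at bit 41 of the upper word of c1, so the first block splits the product at
// that boundary using the 23-bit shifts (23 = 64 - 41): c1:c0 keeps the low 233 bits
// and c3:c2 receives the high part aligned to bit 0. The high part is then folded
// back in at bit 0 and at bit 74 (the 10-bit shifts supply 74 = 64 + 10), the small
// second-order overflow is folded once more, and the final AND with m0 truncates the
// result to 233 bits (the low 41 bits of the top word).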
#endif

#if (CRYPTOPP_CLMUL_AVAILABLE)

using CryptoPP::word;
// c1:c0 = a * b (carry-less)
inline void
F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0,
    const __m128i& a, const __m128i& b)
{
    __m128i t1, t2;

    c0 = _mm_clmulepi64_si128(a, b, 0x00);
    c1 = _mm_clmulepi64_si128(a, b, 0x11);
    t1 = _mm_shuffle_epi32(a, 0xEE);
    t1 = _mm_xor_si128(a, t1);
    t2 = _mm_shuffle_epi32(b, 0xEE);
    t2 = _mm_xor_si128(b, t2);
    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
    t1 = _mm_xor_si128(c0, t1);
    t1 = _mm_xor_si128(c1, t1);
    t2 = t1;
    t1 = _mm_slli_si128(t1, 8);
    t2 = _mm_srli_si128(t2, 8);
    c0 = _mm_xor_si128(c0, t1);
    c1 = _mm_xor_si128(c1, t2);
}
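// _mm_clmulepi64_si128 selects which 64-bit halves to multiply through its
// immediate: 0x00 uses the low qwords of both operands (a0*b0), 0x11 the high
// qwords (a1*b1). The 0xEE shuffle copies the high qword into both halves, so the
// following XOR leaves a0^a1 (resp. b0^b1) in the low qword for the third multiply,
// which produces the Karatsuba middle term exactly as in the ARMv8 code above.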
// c3:c2:c1:c0 = a1:a0 * b1:b0 (carry-less)
inline void
F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
    const __m128i& b1, const __m128i& b0,
    const __m128i& a1, const __m128i& a0)
{
    __m128i c4, c5;
    __m128i x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);

    x0 = _mm_xor_si128(x0, x1);
    y0 = _mm_xor_si128(y0, y1);

    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);

    c4 = _mm_xor_si128(c4, c0);
    c4 = _mm_xor_si128(c4, c2);
    c5 = _mm_xor_si128(c5, c1);
    c5 = _mm_xor_si128(c5, c3);
    c1 = _mm_xor_si128(c1, c4);
    c2 = _mm_xor_si128(c2, c5);
}
// c3:c2:c1:c0 = a1:a0 * a1:a0 (carry-less square)
inline void
F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
    __m128i& c0, const __m128i& a1, const __m128i& a0)
{
    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
}
// x = x << N (128-bit shift across both 64-bit lanes); z must be zero
template <unsigned int N>
inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
{
    __m128i u=x, v;
    x = _mm_slli_epi64(x, N);
    u = _mm_srli_epi64(u, (64-N));
    v = _mm_unpacklo_epi64(z, u);
    x = _mm_or_si128(x, v);
    return x;
}
// c1:c0 = c3:c2:c1:c0 mod x^233 + x^74 + 1
inline void
GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
{
    const unsigned int m[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
    };

    __m128i b3, b2, b1, a1, a0, m0, z0;
    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
    z0 = _mm_setzero_si128();
    b1 = c1; a1 = c1;
    a0 = _mm_move_epi64(c1);
    a1 = _mm_slli_epi64(a1, 23);
    a1 = _mm_srli_epi64(a1, 23);
    c1 = _mm_or_si128(a1, a0);
    b2 = _mm_srli_epi64(c2, (64-23));
    c3 = ShiftLeft128_SSE<23>(c3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    c3 = _mm_or_si128(c3, a0);
    b1 = _mm_srli_epi64(b1, (64-23));
    c2 = ShiftLeft128_SSE<23>(c2, z0);
    a0 = _mm_unpackhi_epi64(b1, z0);
    c2 = _mm_or_si128(c2, a0);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, a0);
    a0 = _mm_unpackhi_epi64(c3, z0);
    b3 = _mm_xor_si128(b3, a0);
    b1 = _mm_srli_epi64(b3, (64-23));
    b3 = ShiftLeft128_SSE<23>(b3, z0);
    b3 = _mm_unpackhi_epi64(b3, z0);
    b3 = _mm_or_si128(b3, b1);
    c2 = _mm_xor_si128(c2, b3);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    b2 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_SSE<10>(b2, z0);
    a0 = _mm_unpacklo_epi64(z0, b2);
    c2 = _mm_xor_si128(c2, a0);
    a0 = _mm_unpacklo_epi64(z0, b3);
    a1 = _mm_unpackhi_epi64(b2, z0);
    a0 = _mm_or_si128(a0, a1);
    c3 = _mm_xor_si128(c3, a0);
    c0 = _mm_xor_si128(c0, c2);
    c1 = _mm_xor_si128(c1, c3);
    c1 = _mm_and_si128(c1, m0);
}
#endif

#if (CRYPTOPP_POWER8_VMULL_AVAILABLE)

using CryptoPP::byte;
using CryptoPP::word;
using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VecXor;
using CryptoPP::VecShiftLeft;
using CryptoPP::VecShiftRight;

// c3:c2:c1:c0 = a1:a0 * b1:b0 (carry-less). Same Karatsuba structure as the ARMv8
// and CLMUL versions above; F2N_Multiply_128x128_POWER8 is the POWER8 counterpart
// of the 128x128 helpers, built on VecPolyMultiply00LE/VecPolyMultiply11LE.
inline void
F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& b1, const uint64x2_p& b0,
    const uint64x2_p& a1, const uint64x2_p& a0)
{
    uint64x2_p c4, c5;
    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);

    x0 = VecXor(x0, x1);
    y0 = VecXor(y0, y1);

    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);

    c4 = VecXor(c4, c0);
    c4 = VecXor(c4, c2);
    c5 = VecXor(c5, c1);
    c5 = VecXor(c5, c3);
    c1 = VecXor(c1, c4);
    c2 = VecXor(c2, c5);
}
// x = x << N (128-bit shift). As with ShiftLeft128_SSE and ShiftLeft128_ARMv8, the
// bits shifted out of the low 64-bit lane still have to be carried into the high
// lane; only the per-lane shift is shown here.
template <unsigned int N>
inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
{
    x = VecShiftLeft<N>(x);
    // carry the low lane's top N bits into the high lane, as in the SSE and
    // ARMv8 helpers above
    return x;
}

// c1:c0 = c3:c2:c1:c0 mod x^233 + x^74 + 1. The reduction follows the same sequence
// as GF2NT_233_Reduce_CLMUL above; the modulus constant and the shift steps are
// shown, and the interleaved merge/OR/XOR fix-ups between them, ending with a
// VecAnd against the 233-bit mask, follow the CLMUL version.
inline void
GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
{
    // all of the low word plus the low 41 bits of the high word, i.e. 233 bits
    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};

    uint64x2_p b3, b2, a1;

    // clear everything above bit 233 in c1's upper word (23 = 64 - 41)
    a1 = VecShiftLeft<23>(a1);
    a1 = VecShiftRight<23>(a1);

    // split the product at bit 233 and align the high part to bit 0
    c3 = ShiftLeft128_POWER8<23>(c3);
    c2 = ShiftLeft128_POWER8<23>(c2);

    // fold the high part back in at bit 0 and at bit 74 (10 = 74 - 64),
    // including the second-order overflow, then mask to 233 bits with 'mod'
    b3 = ShiftLeft128_POWER8<10>(b3);
    b3 = ShiftLeft128_POWER8<23>(b3);
    b3 = ShiftLeft128_POWER8<10>(b3);
    b2 = ShiftLeft128_POWER8<10>(b2);
}
#endif

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
#if (CRYPTOPP_CLMUL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    const __m128i* pBB = reinterpret_cast<const __m128i*>(pB);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);
    __m128i b0 = _mm_loadu_si128(pBB+0);
    __m128i b1 = _mm_loadu_si128(pBB+1);

    __m128i c0, c1, c2, c3;
    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}
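// GF2NT_233_Multiply_Reduce_CLMUL and the related routines below operate on 256-bit
// element buffers passed as word pointers, least significant bits first, using
// unaligned 128-bit loads and stores. A hedged usage sketch; the caller shown here
// is illustrative and not part of this file:
//
//   word a[4], b[4], c[4];   // assumes a 64-bit word; use 8 words if word is 32-bit
//   // ... fill a and b with reduced 233-bit elements ...
//   GF2NT_233_Multiply_Reduce_CLMUL(a, b, c);   // c = a * b mod x^233 + x^74 + 1
//
// The caller in gf2n.cpp is expected to dispatch to this routine, or to the
// PMULL/VMULL variants below, based on run-time CPU feature detection.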
void
GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);

    __m128i c0, c1, c2, c3;
    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}
#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);

    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}
void
GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}
#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    const byte* pBB = reinterpret_cast<const byte*>(pB);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);

#if (CRYPTOPP_BIG_ENDIAN)
    // on big-endian targets the loaded and stored vectors are permuted so the
    // 64-bit lanes hold the element in the order the helpers above assume
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
    b0 = VecPermute(b0, m);
    b1 = VecPermute(b1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore((uint8x16_p)c0, pCC+0);
    VecStore((uint8x16_p)c1, pCC+16);
}
void
GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore((uint8x16_p)c0, pCC+0);
    VecStore((uint8x16_p)c1, pCC+16);
}

#endif

NAMESPACE_END