// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton
// Also based on PCLMULQDQ code by Jankowski, Laurent and
// O'Mahony from Intel (see reference below).
//
// This source file uses intrinsics and built-ins to gain access to
// CLMUL, ARMv8a, and Power8 instructions. A separate source file is
// needed because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
//
// Several speedups were taken from "Intel Polynomial Multiplication
// Instruction and its Usage for Elliptic Curve Cryptography" by
// Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony,
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf
// There may be more speedups available; see https://eprint.iacr.org/2011/589.pdf.
// The IACR paper performs some optimizations by hand that the compiler is
// expected to perform, like Common Subexpression Elimination to save
// on variables (among others). The compiler may miss those optimizations,
// so the IACR paper remains useful. However, that code is GPL3 licensed,
// which makes it off-limits for some users of the library.

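// The GF2NT_233_* routines below work in GF(2^233). The reduction code
// assumes the trinomial x^233 + x^74 + 1 (the polynomial used for the
// NIST B-233 and K-233 curves), and the shift counts that appear in it
// come from that polynomial and the 256-bit register layout:
//   256 - 233 = 23, the shift that aligns the upper half of the product
//                   with the x^233 boundary, and
//   74 - 64   = 10, the shift that accounts for the middle term x^74.
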
#include "pch.h"
#include "config.h"

#include "gf2n.h"

#if (CRYPTOPP_CLMUL_AVAILABLE)
# include <emmintrin.h>
# include <wmmintrin.h>
#endif

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# include "arm_simd.h"
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN

// ************************** ARMv8 ************************** //

using CryptoPP::word;

#if (CRYPTOPP_ARM_PMULL_AVAILABLE)

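// The 128x128 multiplications below use one level of Karatsuba rather than
// four 64x64 carry-less multiplies. With a = a1*x^64 + a0 and
// b = b1*x^64 + b0 over GF(2)[x]:
//   a*b = a1*b1*x^128 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 ^ a0*b0
// Only three PMULL/PCLMUL instructions are needed; the middle term is then
// split across the two halves of the 256-bit result.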
// c1c0 = a * b
inline void
F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b)
{
    uint64x2_t t1, t2, z0={0};

    c0 = PMULL_00(a, b);
    c1 = PMULL_11(a, b);
    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
    t1 = veorq_u64(a, t1);
    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
    t2 = veorq_u64(b, t2);
    t1 = PMULL_00(t1, t2);
    t1 = veorq_u64(c0, t1);
    t1 = veorq_u64(c1, t1);
    t2 = t1;
    t1 = vextq_u64(z0, t1, 1);
    t2 = vextq_u64(t2, z0, 1);
    c0 = veorq_u64(c0, t1);
    c1 = veorq_u64(c1, t2);
}

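// The 256x256 product is formed the same way one level up: three 128x128
// Karatsuba multiplies (a0*b0, a1*b1 and (a0^a1)*(b0^b1)), with the middle
// term folded into the c2:c1 words of the 512-bit result.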
// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    uint64x2_t c4, c5;
    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);

    x0 = veorq_u64(x0, x1);
    y0 = veorq_u64(y0, y1);

    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);

    c4 = veorq_u64(c4, c0);
    c4 = veorq_u64(c4, c2);
    c5 = veorq_u64(c5, c1);
    c5 = veorq_u64(c5, c3);
    c1 = veorq_u64(c1, c4);
    c2 = veorq_u64(c2, c5);
}

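// Squaring is linear over GF(2)[x] ((a + b)^2 = a^2 + b^2, since the cross
// terms cancel in characteristic 2), so the square of a 256-bit polynomial
// is just its four 64-bit words squared independently; each carry-less
// multiply of a word with itself spreads its bits into the even bit
// positions of a 128-bit result.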
// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
{
    c0 = PMULL_00(a0, a0);
    c1 = PMULL_11(a0, a0);
    c2 = PMULL_00(a1, a1);
    c3 = PMULL_11(a1, a1);
}

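// 128-bit left shift by a constant N < 64: shift each 64-bit lane left by N,
// then carry the bits shifted out of the low lane into the high lane.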
// x = (x << n), z = 0
template <unsigned int N>
inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
{
    uint64x2_t u=x, v, z={0};
    x = vshlq_n_u64(x, N);
    u = vshrq_n_u64(u, (64-N));
    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
    x = vorrq_u64(x, v);
    return x;
}

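// In outline, the reduction folds everything above bit 233 of the 512-bit
// product (held in c3:c2 and the top of c1) back onto the low words using
// the shifts by 23 and 10 described at the top of the file, then masks c1
// so that only 233 = 128 + 64 + 41 bits of c1c0 remain (the 0x000001ff in
// the top word keeps the final 9 bits).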
// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
// Intel paper or https://github.com/antonblanchard/crc32-vpmsum
// for background.
inline void
GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
{
    const unsigned int mask[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
    };

    uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0};
    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
    b1 = c1; a1 = c1;
    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
    a1 = vshlq_n_u64(a1, 23);
    a1 = vshrq_n_u64(a1, 23);
    c1 = vorrq_u64(a1, a0);
    b2 = vshrq_n_u64(c2, (64-23));
    c3 = ShiftLeft128_ARMv8<23>(c3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    c3 = vorrq_u64(c3, a0);
    b1 = vshrq_n_u64(b1, (64-23));
    c2 = ShiftLeft128_ARMv8<23>(c2);
    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
    c2 = vorrq_u64(c2, a0);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, a0);
    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
    b3 = veorq_u64(b3, a0);
    b1 = vshrq_n_u64(b3, (64-23));
    b3 = ShiftLeft128_ARMv8<23>(b3);
    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b1);
    c2 = veorq_u64(c2, b3);
    b3 = c3;
    b2 = vshrq_n_u64(c2, (64-10));
    b3 = ShiftLeft128_ARMv8<10>(b3);
    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    b3 = vorrq_u64(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_ARMv8<10>(b2);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
    c2 = veorq_u64(c2, a0);
    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
    a0 = vorrq_u64(a0, a1);
    c3 = veorq_u64(c3, a0);
    c0 = veorq_u64(c0, c2);
    c1 = veorq_u64(c1, c3);
    c1 = vandq_u64(c1, m0);
}

#endif

// ************************** SSE ************************** //

#if (CRYPTOPP_CLMUL_AVAILABLE)

using CryptoPP::word;

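// The CLMUL routines below mirror the ARMv8 routines above: the same
// Karatsuba multiply, squaring and reduction steps, expressed with
// SSE2/PCLMULQDQ intrinsics instead of NEON/PMULL.
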
// c1c0 = a * b
inline void
F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b)
{
    __m128i t1, t2;

    c0 = _mm_clmulepi64_si128(a, b, 0x00);
    c1 = _mm_clmulepi64_si128(a, b, 0x11);
    t1 = _mm_shuffle_epi32(a, 0xEE);
    t1 = _mm_xor_si128(a, t1);
    t2 = _mm_shuffle_epi32(b, 0xEE);
    t2 = _mm_xor_si128(b, t2);
    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
    t1 = _mm_xor_si128(c0, t1);
    t1 = _mm_xor_si128(c1, t1);
    t2 = t1;
    t1 = _mm_slli_si128(t1, 8);
    t2 = _mm_srli_si128(t2, 8);
    c0 = _mm_xor_si128(c0, t1);
    c1 = _mm_xor_si128(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
{
    __m128i c4, c5;
    __m128i x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);

    x0 = _mm_xor_si128(x0, x1);
    y0 = _mm_xor_si128(y0, y1);

    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);

    c4 = _mm_xor_si128(c4, c0);
    c4 = _mm_xor_si128(c4, c2);
    c5 = _mm_xor_si128(c5, c1);
    c5 = _mm_xor_si128(c5, c3);
    c1 = _mm_xor_si128(c1, c4);
    c2 = _mm_xor_si128(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
    __m128i& c0, const __m128i& a1, const __m128i& a0)
{
    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
}

// x = (x << n), z = 0
template <unsigned int N>
inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
{
    __m128i u=x, v;
    x = _mm_slli_epi64(x, N);
    u = _mm_srli_epi64(u, (64-N));
    v = _mm_unpacklo_epi64(z, u);
    x = _mm_or_si128(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
// Intel paper or https://github.com/antonblanchard/crc32-vpmsum
// for background.
inline void
GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
{
    const unsigned int m[4] = {
        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
    };

    __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0;
    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
    z0 = _mm_setzero_si128();
    b1 = c1; a1 = c1;
    a0 = _mm_move_epi64(c1);
    a1 = _mm_slli_epi64(a1, 23);
    a1 = _mm_srli_epi64(a1, 23);
    c1 = _mm_or_si128(a1, a0);
    b2 = _mm_srli_epi64(c2, (64-23));
    c3 = ShiftLeft128_SSE<23>(c3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    c3 = _mm_or_si128(c3, a0);
    b1 = _mm_srli_epi64(b1, (64-23));
    c2 = ShiftLeft128_SSE<23>(c2, z0);
    a0 = _mm_unpackhi_epi64(b1, z0);
    c2 = _mm_or_si128(c2, a0);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    a0 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, a0);
    a0 = _mm_unpackhi_epi64(c3, z0);
    b3 = _mm_xor_si128(b3, a0);
    b1 = _mm_srli_epi64(b3, (64-23));
    b3 = ShiftLeft128_SSE<23>(b3, z0);
    b3 = _mm_unpackhi_epi64(b3, z0);
    b3 = _mm_or_si128(b3, b1);
    c2 = _mm_xor_si128(c2, b3);
    b3 = c3;
    b2 = _mm_srli_epi64(c2, (64-10));
    b3 = ShiftLeft128_SSE<10>(b3, z0);
    b2 = _mm_unpackhi_epi64(b2, z0);
    b3 = _mm_or_si128(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_SSE<10>(b2, z0);
    a0 = _mm_unpacklo_epi64(z0, b2);
    c2 = _mm_xor_si128(c2, a0);
    a0 = _mm_unpacklo_epi64(z0, b3);
    a1 = _mm_unpackhi_epi64(b2, z0);
    a0 = _mm_or_si128(a0, a1);
    c3 = _mm_xor_si128(c3, a0);
    c0 = _mm_xor_si128(c0, c2);
    c1 = _mm_xor_si128(c1, c3);
    c1 = _mm_and_si128(c1, m0);
}

#endif

// ************************* Power8 ************************* //

#if (CRYPTOPP_POWER8_VMULL_AVAILABLE)

using CryptoPP::byte;
using CryptoPP::word;
using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecLoad;
using CryptoPP::VecStore;

using CryptoPP::VecOr;
using CryptoPP::VecXor;
using CryptoPP::VecAnd;

using CryptoPP::VecPermute;
using CryptoPP::VecMergeLow;
using CryptoPP::VecMergeHigh;
using CryptoPP::VecShiftLeft;
using CryptoPP::VecShiftRight;

using CryptoPP::VecPolyMultiply00LE;
using CryptoPP::VecPolyMultiply11LE;

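// The Power8 routines below mirror the ARMv8 routines above: the same
// Karatsuba multiply, squaring and reduction steps, expressed with the
// vector helpers wrapped by ppc_simd.h.
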
// c1c0 = a * b
inline void
F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b)
{
    uint64x2_p t1, t2;
    const uint64x2_p z0={0};

    c0 = VecPolyMultiply00LE(a, b);
    c1 = VecPolyMultiply11LE(a, b);
    t1 = VecMergeLow(a, a);
    t1 = VecXor(a, t1);
    t2 = VecMergeLow(b, b);
    t2 = VecXor(b, t2);
    t1 = VecPolyMultiply00LE(t1, t2);
    t1 = VecXor(c0, t1);
    t1 = VecXor(c1, t1);
    t2 = t1;
    t1 = VecMergeHigh(z0, t1);
    t2 = VecMergeLow(t2, z0);
    c0 = VecXor(c0, t1);
    c1 = VecXor(c1, t2);
}

// c3c2c1c0 = a1a0 * b1b0
inline void
F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    uint64x2_p c4, c5;
    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;

    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);

    x0 = VecXor(x0, x1);
    y0 = VecXor(y0, y1);

    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);

    c4 = VecXor(c4, c0);
    c4 = VecXor(c4, c2);
    c5 = VecXor(c5, c1);
    c5 = VecXor(c5, c3);
    c1 = VecXor(c1, c4);
    c2 = VecXor(c2, c5);
}

// c3c2c1c0 = a1a0 * a1a0
inline void
F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
{
    c0 = VecPolyMultiply00LE(a0, a0);
    c1 = VecPolyMultiply11LE(a0, a0);
    c2 = VecPolyMultiply00LE(a1, a1);
    c3 = VecPolyMultiply11LE(a1, a1);
}

// x = (x << n), z = 0
template <unsigned int N>
inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
{
    uint64x2_p u=x, v;
    const uint64x2_p z={0};

    x = VecShiftLeft<N>(x);
    u = VecShiftRight<64-N>(u);
    v = VecMergeHigh(z, u);
    x = VecOr(x, v);
    return x;
}

// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
// Intel paper or https://github.com/antonblanchard/crc32-vpmsum
// for background.
inline void
GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
{
    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);

    uint64x2_p b3, b2, b1, /*b0,*/ a1, a0;
    const uint64x2_p z0={0};

    b1 = c1; a1 = c1;
    a0 = VecMergeHigh(c1, z0);
    a1 = VecShiftLeft<23>(a1);
    a1 = VecShiftRight<23>(a1);
    c1 = VecOr(a1, a0);
    b2 = VecShiftRight<64-23>(c2);
    c3 = ShiftLeft128_POWER8<23>(c3);
    a0 = VecMergeLow(b2, z0);
    c3 = VecOr(c3, a0);
    b1 = VecShiftRight<64-23>(b1);
    c2 = ShiftLeft128_POWER8<23>(c2);
    a0 = VecMergeLow(b1, z0);
    c2 = VecOr(c2, a0);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    a0 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, a0);
    a0 = VecMergeLow(c3, z0);
    b3 = VecXor(b3, a0);
    b1 = VecShiftRight<64-23>(b3);
    b3 = ShiftLeft128_POWER8<23>(b3);
    b3 = VecMergeLow(b3, z0);
    b3 = VecOr(b3, b1);
    c2 = VecXor(c2, b3);
    b3 = c3;
    b2 = VecShiftRight<64-10>(c2);
    b3 = ShiftLeft128_POWER8<10>(b3);
    b2 = VecMergeLow(b2, z0);
    b3 = VecOr(b3, b2);
    b2 = c2;
    b2 = ShiftLeft128_POWER8<10>(b2);
    a0 = VecMergeHigh(z0, b2);
    c2 = VecXor(c2, a0);
    a0 = VecMergeHigh(z0, b3);
    a1 = VecMergeLow(b2, z0);
    a0 = VecOr(a0, a1);
    c3 = VecXor(c3, a0);
    c0 = VecXor(c0, c2);
    c1 = VecXor(c1, c3);
    c1 = VecAnd(c1, m0);
}

#endif

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

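// The exported GF2NT_233_* functions below all have the same shape: load the
// 256-bit operand(s) from word arrays, multiply or square with the Karatsuba
// helpers, reduce modulo the trinomial, and store the low 256 bits of the
// result (only 233 bits are significant). The callers (see gf2n.cpp) are
// expected to pick the CLMUL, ARMv8 or Power8 version based on run-time CPU
// feature detection. A minimal usage sketch, assuming a 64-bit word so that
// four words hold one 256-bit operand:
//
//   word a[4], b[4], c[4];   // 256-bit buffers, low 233 bits significant
//   ...                      // fill a and b
//   GF2NT_233_Multiply_Reduce_CLMUL(a, b, c);
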
#if (CRYPTOPP_CLMUL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    const __m128i* pBB = reinterpret_cast<const __m128i*>(pB);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);
    __m128i b0 = _mm_loadu_si128(pBB+0);
    __m128i b1 = _mm_loadu_si128(pBB+1);

    __m128i c0, c1, c2, c3;
    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}

void
GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
{
    const __m128i* pAA = reinterpret_cast<const __m128i*>(pA);
    __m128i a0 = _mm_loadu_si128(pAA+0);
    __m128i a1 = _mm_loadu_si128(pAA+1);

    __m128i c0, c1, c2, c3;
    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);

    __m128i* pCC = reinterpret_cast<__m128i*>(pC);
    _mm_storeu_si128(pCC+0, c0);
    _mm_storeu_si128(pCC+1, c1);
}

#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)

void
GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);

    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

void
GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a 32-bit pointer to avoid possible alignment issues.
    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));

    uint64x2_t c0, c1, c2, c3;
    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);

    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
}

#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE)

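// The Power8 drivers mirror the CLMUL and ARMv8 drivers above. On big-endian
// targets the loads and stores are additionally wrapped with a VecPermute
// that swaps the 32-bit halves of each 64-bit lane, presumably so the data
// matches the little-endian lane order assumed by the *LE polynomial
// multiply helpers.
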
void
GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    const byte* pBB = reinterpret_cast<const byte*>(pB);

    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
    b0 = VecPermute(b0, m);
    b1 = VecPermute(b1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

void
GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
{
    // word is either 32-bit or 64-bit, depending on the platform.
    // Load using a byte pointer to avoid possible alignment issues.
    const byte* pAA = reinterpret_cast<const byte*>(pA);
    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
    a0 = VecPermute(a0, m);
    a1 = VecPermute(a1, m);
#endif

    uint64x2_p c0, c1, c2, c3;
    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);

#if (CRYPTOPP_BIG_ENDIAN)
    c0 = VecPermute(c0, m);
    c1 = VecPermute(c1, m);
#endif

    byte* pCC = reinterpret_cast<byte*>(pC);
    VecStore(c0, pCC+0);
    VecStore(c1, pCC+16);
}

#endif

NAMESPACE_END