blake2b_simd.cpp
// blake2b_simd.cpp - written and placed in the public domain by
// Samuel Neves, Jeffrey Walton, Uri Blumenthal
// and Marcel Raad.
//
// This source file uses intrinsics to gain access to ARMv7a/ARMv8a
// NEON, Power8 and SSE4.1 instructions. A separate source file is
// needed because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
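//
// For example, GCC and Clang builds typically pass flags along these
// lines for this translation unit (illustrative only; the authoritative
// flags live in the library's makefiles):
//   x86/x64:  -msse4.1
//   Aarch64:  -march=armv8-a
//   PowerPC:  -mcpu=power8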

#include "pch.h"
#include "config.h"
#include "misc.h"
#include "blake2.h"

// Uncomment for benchmarking C++ against SSE4.1, NEON or Altivec.
// Do so in both blake2.cpp and blake2b_simd.cpp.
// #undef CRYPTOPP_SSE41_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
// #undef CRYPTOPP_ALTIVEC_AVAILABLE

// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif

// BLAKE2s bug on AIX 7.1 (POWER7) with XLC 12.01
// https://github.com/weidai11/cryptopp/issues/743
#if defined(__xlC__) && (__xlC__ < 0x0d01)
# define CRYPTOPP_DISABLE_ALTIVEC 1
# undef CRYPTOPP_POWER7_AVAILABLE
# undef CRYPTOPP_ALTIVEC_AVAILABLE
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_POWER8_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char BLAKE2B_SIMD_FNAME[] = __FILE__;

NAMESPACE_BEGIN(CryptoPP)

// Exported by blake2.cpp
extern const word32 BLAKE2S_IV[8];
extern const word64 BLAKE2B_IV[8];

#if CRYPTOPP_SSE41_AVAILABLE

#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void*)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))
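
// LOADU and STOREU wrap the unaligned SSE2 load/store so neither the
// message nor the state words need 16-byte alignment. TOF and TOI are
// float/integer bitcasts; they are not used by the 64-bit compressor
// below and appear to be kept for parity with the BLAKE2s SSE code.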

void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
{
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m1); \
        b1 = _mm_unpacklo_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m0, m1); \
        b1 = _mm_unpackhi_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m5); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m5); \
        b1 = _mm_unpackhi_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m2); \
        b1 = _mm_unpackhi_epi64(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { \
        b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
        b1 = _mm_unpackhi_epi64(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m1); \
        b1 = _mm_unpackhi_epi64(m3, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m6, m5, 8); \
        b1 = _mm_unpackhi_epi64(m2, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m0); \
        b1 = _mm_blend_epi16(m1, m6, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m5, m1, 0xF0); \
        b1 = _mm_unpackhi_epi64(m3, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m3); \
        b1 = _mm_alignr_epi8(m2, m0, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m3, m1); \
        b1 = _mm_unpackhi_epi64(m6, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m0); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m1, m2, 0xF0); \
        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m3, m5); \
        b1 = _mm_unpacklo_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m2); \
        b1 = _mm_unpacklo_epi64(m1, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m0, m3, 0xF0); \
        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m7, m5, 0xF0); \
        b1 = _mm_blend_epi16(m3, m1, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m6, m0, 8); \
        b1 = _mm_blend_epi16(m4, m6, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m1, m3); \
        b1 = _mm_unpacklo_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m5); \
        b1 = _mm_unpackhi_epi64(m5, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m2, m3, 0xF0); \
        b1 = _mm_unpackhi_epi64(m7, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m6, m2); \
        b1 = _mm_blend_epi16(m7, m4, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m6, m0, 0xF0); \
        b1 = _mm_unpacklo_epi64(m7, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m2, m7); \
        b1 = _mm_alignr_epi8(m5, m6, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m3); \
        b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m3, m1); \
        b1 = _mm_blend_epi16(m1, m5, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m6, m3); \
        b1 = _mm_blend_epi16(m6, m1, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m7, m5, 8); \
        b1 = _mm_unpackhi_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m2, m7); \
        b1 = _mm_unpacklo_epi64(m4, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m2); \
        b1 = _mm_unpacklo_epi64(m3, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m3, m7); \
        b1 = _mm_alignr_epi8(m0, m5, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m7, m4); \
        b1 = _mm_alignr_epi8(m4, m1, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { \
        b0 = m6; \
        b1 = _mm_alignr_epi8(m5, m0, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m1, m3, 0xF0); \
        b1 = m2; \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_unpackhi_epi64(m3, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m1, m2); \
        b1 = _mm_blend_epi16(m3, m2, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m7, m4); \
        b1 = _mm_unpackhi_epi64(m1, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m7, m5, 8); \
        b1 = _mm_unpacklo_epi64(m6, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m1); \
        b1 = _mm_unpacklo_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m0, m1); \
        b1 = _mm_unpackhi_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m5); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m5); \
        b1 = _mm_unpackhi_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m2); \
        b1 = _mm_unpackhi_epi64(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { \
        b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
        b1 = _mm_unpackhi_epi64(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m1); \
        b1 = _mm_unpackhi_epi64(m3, m1); \
    } while(0)

#ifdef __XOP__
# define MM_ROTI_EPI64(r, c) \
    _mm_roti_epi64(r, c)
#else
# define MM_ROTI_EPI64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#endif
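
// Reference semantics of the rotation being emulated, in scalar form:
//   rotr64(x, n) = (x >> n) | (x << (64 - n))
// Rotations by 16 and 24 bits move whole bytes, so they map to PSHUFB
// byte shuffles (the r16/r24 masks below); rotation by 32 is a 32-bit
// lane swap; and rotr64(x, 63) equals rotl64(x, 1), computed here as
// (x + x) ^ (x >> 63) because the add is cheaper than a 64-bit shift.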

#define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    \
    row4l = MM_ROTI_EPI64(row4l, -32); \
    row4h = MM_ROTI_EPI64(row4h, -32); \
    \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    \
    row2l = MM_ROTI_EPI64(row2l, -24); \
    row2h = MM_ROTI_EPI64(row2h, -24);

#define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    \
    row4l = MM_ROTI_EPI64(row4l, -16); \
    row4h = MM_ROTI_EPI64(row4h, -16); \
    \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    \
    row2l = MM_ROTI_EPI64(row2l, -63); \
    row2h = MM_ROTI_EPI64(row2h, -63);

#define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row4l; \
    t1 = row2l; \
    row4l = row3l; \
    row3l = row3h; \
    row3h = row4l; \
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

#define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row3l; \
    row3l = row3h; \
    row3h = t0; \
    t0 = row2l; \
    t1 = row4l; \
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
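
// DIAGONALIZE rotates rows 2, 3 and 4 of the 4x4 word matrix left by
// 1, 2 and 3 positions so the same G function can operate on the
// diagonals; UNDIAGONALIZE rotates them back. Because each logical row
// spans a low/high register pair, the rotation is built from unpack
// shuffles rather than a single lane permute.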

#define BLAKE2B_ROUND(r) \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
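
// A round is a column step (G1 and G2 applied to the four columns)
// followed by a diagonal step on the four diagonals. The LOAD_MSG
// macros hard-code the message-word selection from the BLAKE2b sigma
// permutation schedule for each of the twelve rounds.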

    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;

    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
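
    // r16 and r24 are PSHUFB control masks: within each 64-bit lane the
    // bytes are rotated right by 2 and 3 positions respectively, which
    // realizes the 16- and 24-bit rotations used by MM_ROTI_EPI64.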

    const __m128i m0 = LOADU(input + 00);
    const __m128i m1 = LOADU(input + 16);
    const __m128i m2 = LOADU(input + 32);
    const __m128i m3 = LOADU(input + 48);
    const __m128i m4 = LOADU(input + 64);
    const __m128i m5 = LOADU(input + 80);
    const __m128i m6 = LOADU(input + 96);
    const __m128i m7 = LOADU(input + 112);

    row1l = LOADU(state.h()+0);
    row1h = LOADU(state.h()+2);
    row2l = LOADU(state.h()+4);
    row2h = LOADU(state.h()+6);
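
    // Rows 3 and 4 hold the IV; the 128-bit counter t() and the
    // finalization flags f() are folded into row 4 by XOR, per the
    // BLAKE2 specification.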
    row3l = LOADU(BLAKE2B_IV+0);
    row3h = LOADU(BLAKE2B_IV+2);
    row4l = _mm_xor_si128(LOADU(BLAKE2B_IV+4), LOADU(state.t()+0));
    row4h = _mm_xor_si128(LOADU(BLAKE2B_IV+6), LOADU(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    row1l = _mm_xor_si128(row3l, row1l);
    row1h = _mm_xor_si128(row3h, row1h);
    STOREU(state.h()+0, _mm_xor_si128(LOADU(state.h()+0), row1l));
    STOREU(state.h()+2, _mm_xor_si128(LOADU(state.h()+2), row1h));
    row2l = _mm_xor_si128(row4l, row2l);
    row2h = _mm_xor_si128(row4h, row2h);
    STOREU(state.h()+4, _mm_xor_si128(LOADU(state.h()+4), row2l));
    STOREU(state.h()+6, _mm_xor_si128(LOADU(state.h()+6), row2h));
}
#endif // CRYPTOPP_SSE41_AVAILABLE

#if CRYPTOPP_ARM_NEON_AVAILABLE
void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
{
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

    #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))

    #define vrorq_n_u64_24(x) vcombine_u64( \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))

    #define vrorq_n_u64_16(x) vcombine_u64( \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

    #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
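
    // NEON lacks a 64-bit rotate, so each amount gets its own idiom:
    // rotating by 32 swaps the 32-bit halves of each lane (vrev64q_u32),
    // 24 and 16 are whole-byte rotations via vext_u8, and rotating right
    // by 63 is rotating left by 1, i.e. (x + x) ^ (x >> 63). The G and
    // (un)diagonalize macros below mirror the SSE4.1 versions above.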

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
        row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
        row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
        row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
        row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
        row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
        row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
        row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
    } while(0)

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
        row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
        row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
        row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
        row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
        row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
        row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
        row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
    } while(0)

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
        uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
        uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
        row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
        t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
        row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
        uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
        uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
        row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
        t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
        row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_ROUND(r) \
    do { \
        uint64x2_t b0, b1; \
        BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
        BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
        BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
        BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
        BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
        BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
    const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
    const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
    const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));
    const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));
    const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));
    const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));
    const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));

    uint64x2_t row1l, row1h, row2l, row2h;
    uint64x2_t row3l, row3h, row4l, row4h;

    const uint64x2_t h0 = row1l = vld1q_u64(state.h()+0);
    const uint64x2_t h1 = row1h = vld1q_u64(state.h()+2);
    const uint64x2_t h2 = row2l = vld1q_u64(state.h()+4);
    const uint64x2_t h3 = row2h = vld1q_u64(state.h()+6);

    row3l = vld1q_u64(BLAKE2B_IV+0);
    row3h = vld1q_u64(BLAKE2B_IV+2);
    row4l = veorq_u64(vld1q_u64(BLAKE2B_IV+4), vld1q_u64(state.t()+0));
    row4h = veorq_u64(vld1q_u64(BLAKE2B_IV+6), vld1q_u64(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    vst1q_u64(state.h()+0, veorq_u64(h0, veorq_u64(row1l, row3l)));
    vst1q_u64(state.h()+2, veorq_u64(h1, veorq_u64(row1h, row3h)));
    vst1q_u64(state.h()+4, veorq_u64(h2, veorq_u64(row2l, row4l)));
    vst1q_u64(state.h()+6, veorq_u64(h3, veorq_u64(row2h, row4h)));
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_POWER8_AVAILABLE)

inline uint64x2_p VecLoad64(const void* p)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint64x2_p)vec_xl(0, (uint8_t*)p);
#else
    return (uint64x2_p)vec_vsx_ld(0, (uint8_t*)p);
#endif
}

inline uint64x2_p VecLoad64LE(const void* p)
{
#if __BIG_ENDIAN__
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    const uint64x2_p v = VecLoad64(p);
    return VecPermute(v, v, m);
#else
    return VecLoad64(p);
#endif
}

inline void VecStore64(void* p, const uint64x2_p x)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    vec_xst((uint8x16_p)x, 0, (uint8_t*)p);
#else
    vec_vsx_st((uint8x16_p)x, 0, (uint8_t*)p);
#endif
}

inline void VecStore64LE(void* p, const uint64x2_p x)
{
#if __BIG_ENDIAN__
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    VecStore64(p, VecPermute(x, x, m));
#else
    VecStore64(p, x);
#endif
}

template <unsigned int C>
inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
{
#if __BIG_ENDIAN__
    return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
#else
    return (uint64x2_p)vec_sld((uint8x16_p)b, (uint8x16_p)a, 16-C);
#endif
}

#define vec_shl_octet(a,b,c) VecShiftLeftOctet<c*8>(a, b)
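
// vec_shl_octet(a,b,1) extracts the 16 bytes starting 8 bytes into the
// concatenation a||b, i.e. { a[1], b[0] } in 64-bit element terms. It is
// the counterpart of _mm_alignr_epi8(b, a, 8) in the SSE4.1 path and
// vextq_u64(a, b, 1) in the NEON path; the template above handles the
// endian-dependent operand order of vec_sld.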

// vec_mergeh(a,b) is equivalent to VecPermute(a,b,HH_MASK), and
// vec_mergel(a,b) is equivalent to VecPermute(a,b,LL_MASK). Benchmarks
// show vec_mergeh and vec_mergel are faster on little-endian
// machines by 0.4 cpb, while VecPermute is faster on big-endian
// machines by 1.5 cpb. The code that uses vec_mergeh and
// vec_mergel is also about 880 bytes shorter.

#if defined(__GNUC__) && (__BIG_ENDIAN__)
# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
#else
# define vec_merge_hi(a,b) vec_mergeh(a,b)
# define vec_merge_lo(a,b) vec_mergel(a,b)
#endif

void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
{
    // Permute masks. High is element 0 (most significant),
    // low is element 1 (least significant).

#if defined(__GNUC__) && (__BIG_ENDIAN__)
    const uint8x16_p HH_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
    const uint8x16_p LL_MASK = { 8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31 };
#endif

    const uint8x16_p HL_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
    const uint8x16_p LH_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };

    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m0, m1); \
        b1 = vec_merge_hi(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { \
        b0 = vec_merge_lo(m0, m1); \
        b1 = vec_merge_lo(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { \
        b0 = vec_merge_hi(m4, m5); \
        b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { \
        b0 = vec_merge_lo(m4, m5); \
        b1 = vec_merge_lo(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m7, m2); \
        b1 = vec_merge_lo(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { \
        b0 = vec_merge_hi(m5, m4); \
        b1 = vec_shl_octet(m7, m3, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { \
        b0 = vec_shl_octet(m0, m0, 1); \
        b1 = vec_merge_lo(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { \
        b0 = vec_merge_hi(m6, m1); \
        b1 = vec_merge_lo(m3, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { \
        b0 = vec_shl_octet(m5, m6, 1); \
        b1 = vec_merge_lo(m2, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { \
        b0 = vec_merge_hi(m4, m0); \
        b1 = VecPermute(m1, m6, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { \
        b0 = VecPermute(m5, m1, HL_MASK); \
        b1 = vec_merge_lo(m3, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { \
        b0 = vec_merge_hi(m7, m3); \
        b1 = vec_shl_octet(m0, m2, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { \
        b0 = vec_merge_lo(m3, m1); \
        b1 = vec_merge_lo(m6, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { \
        b0 = vec_merge_lo(m4, m0); \
        b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { \
        b0 = VecPermute(m1, m2, HL_MASK); \
        b1 = VecPermute(m2, m7, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { \
        b0 = vec_merge_hi(m3, m5); \
        b1 = vec_merge_hi(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { \
        b0 = vec_merge_lo(m4, m2); \
        b1 = vec_merge_hi(m1, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { \
        b0 = VecPermute(m0, m3, HL_MASK); \
        b1 = VecPermute(m2, m7, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { \
        b0 = VecPermute(m7, m5, HL_MASK); \
        b1 = VecPermute(m3, m1, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { \
        b0 = vec_shl_octet(m0, m6, 1); \
        b1 = VecPermute(m4, m6, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m1, m3); \
        b1 = vec_merge_hi(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { \
        b0 = vec_merge_hi(m6, m5); \
        b1 = vec_merge_lo(m5, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { \
        b0 = VecPermute(m2, m3, HL_MASK); \
        b1 = vec_merge_lo(m7, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { \
        b0 = vec_merge_lo(m6, m2); \
        b1 = VecPermute(m7, m4, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { \
        b0 = VecPermute(m6, m0, HL_MASK); \
        b1 = vec_merge_hi(m7, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { \
        b0 = vec_merge_lo(m2, m7); \
        b1 = vec_shl_octet(m6, m5, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { \
        b0 = vec_merge_hi(m0, m3); \
        b1 = vec_shl_octet(m4, m4, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { \
        b0 = vec_merge_lo(m3, m1); \
        b1 = VecPermute(m1, m5, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { \
        b0 = vec_merge_lo(m6, m3); \
        b1 = VecPermute(m6, m1, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { \
        b0 = vec_shl_octet(m5, m7, 1); \
        b1 = vec_merge_lo(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { \
        b0 = vec_merge_lo(m2, m7); \
        b1 = vec_merge_hi(m4, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { \
        b0 = vec_merge_hi(m0, m2); \
        b1 = vec_merge_hi(m3, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m3, m7); \
        b1 = vec_shl_octet(m5, m0, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { \
        b0 = vec_merge_lo(m7, m4); \
        b1 = vec_shl_octet(m1, m4, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { \
        b0 = m6; \
        b1 = vec_shl_octet(m0, m5, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { \
        b0 = VecPermute(m1, m3, HL_MASK); \
        b1 = m2; \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m5, m4); \
        b1 = vec_merge_lo(m3, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { \
        b0 = vec_merge_hi(m1, m2); \
        b1 = VecPermute(m3, m2, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { \
        b0 = vec_merge_lo(m7, m4); \
        b1 = vec_merge_lo(m1, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { \
        b0 = vec_shl_octet(m5, m7, 1); \
        b1 = vec_merge_hi(m6, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m0, m1); \
        b1 = vec_merge_hi(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { \
        b0 = vec_merge_lo(m0, m1); \
        b1 = vec_merge_lo(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { \
        b0 = vec_merge_hi(m4, m5); \
        b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { \
        b0 = vec_merge_lo(m4, m5); \
        b1 = vec_merge_lo(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { \
        b0 = vec_merge_hi(m7, m2); \
        b1 = vec_merge_lo(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { \
        b0 = vec_merge_hi(m5, m4); \
        b1 = vec_shl_octet(m7, m3, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { \
        b0 = vec_shl_octet(m0, m0, 1); \
        b1 = vec_merge_lo(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { \
        b0 = vec_merge_hi(m6, m1); \
        b1 = vec_merge_lo(m3, m1); \
    } while(0)

    // POWER8 has a packed 64-bit rotate, but only a rotate-left, so a
    // right rotation by n is expressed as a left rotation by 64-n.
    const uint64x2_p ROR16_MASK = { 64-16, 64-16 };
    const uint64x2_p ROR24_MASK = { 64-24, 64-24 };
    const uint64x2_p ROR32_MASK = { 64-32, 64-32 };
    const uint64x2_p ROR63_MASK = { 64-63, 64-63 };

    #define vec_ror_32(x) vec_rl(x, ROR32_MASK)
    #define vec_ror_24(x) vec_rl(x, ROR24_MASK)
    #define vec_ror_16(x) vec_rl(x, ROR16_MASK)
    #define vec_ror_63(x) vec_rl(x, ROR63_MASK)
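
    // For example, vec_ror_24(x) rotates each lane left by 40 bits,
    // which is the same as rotating right by 24.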

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
        row1l = VecAdd(VecAdd(row1l, b0), row2l); \
        row1h = VecAdd(VecAdd(row1h, b1), row2h); \
        row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
        row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
        row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
        row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
        row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
    } while(0)

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
        row1l = VecAdd(VecAdd(row1l, b0), row2l); \
        row1h = VecAdd(VecAdd(row1h, b1), row2h); \
        row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
        row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
        row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
        row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
        row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
    } while(0)

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
        uint64x2_p t0 = vec_shl_octet(row2l, row2h, 1); \
        uint64x2_p t1 = vec_shl_octet(row2h, row2l, 1); \
        row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
        t0 = vec_shl_octet(row4h, row4l, 1); t1 = vec_shl_octet(row4l, row4h, 1); \
        row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
        uint64x2_p t0 = vec_shl_octet(row2h, row2l, 1); \
        uint64x2_p t1 = vec_shl_octet(row2l, row2h, 1); \
        row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
        t0 = vec_shl_octet(row4l, row4h, 1); t1 = vec_shl_octet(row4h, row4l, 1); \
        row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_ROUND(r) \
    do { \
        uint64x2_p b0, b1; \
        BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
        BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
        BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
        BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
        BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
        BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
        BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_p m0 = VecLoad64LE(input + 00);
    const uint64x2_p m1 = VecLoad64LE(input + 16);
    const uint64x2_p m2 = VecLoad64LE(input + 32);
    const uint64x2_p m3 = VecLoad64LE(input + 48);
    const uint64x2_p m4 = VecLoad64LE(input + 64);
    const uint64x2_p m5 = VecLoad64LE(input + 80);
    const uint64x2_p m6 = VecLoad64LE(input + 96);
    const uint64x2_p m7 = VecLoad64LE(input + 112);

    uint64x2_p row1l, row1h, row2l, row2h;
    uint64x2_p row3l, row3h, row4l, row4h;

    const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
    const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
    const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
    const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);

    row3l = VecLoad64(BLAKE2B_IV+0);
    row3h = VecLoad64(BLAKE2B_IV+2);
    row4l = VecXor(VecLoad64(BLAKE2B_IV+4), VecLoad64(state.t()+0));
    row4h = VecXor(VecLoad64(BLAKE2B_IV+6), VecLoad64(state.f()+0));

    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
    VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
    VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
    VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END