#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif

#if defined(__xlC__) && (__xlC__ < 0x0d01)
# define CRYPTOPP_DISABLE_ALTIVEC 1
# undef CRYPTOPP_POWER7_AVAILABLE
# undef CRYPTOPP_ALTIVEC_AVAILABLE
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_POWER8_AVAILABLE)
# include "ppc_simd.h"
#endif

extern const char BLAKE2B_SIMD_FNAME[] = __FILE__;
NAMESPACE_BEGIN(CryptoPP)

extern const word32 BLAKE2S_IV[8];
extern const word64 BLAKE2B_IV[8];
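// Each BLAKE2_Compress64_* routine below applies the 12-round BLAKE2b
// compression function to one 128-byte message block. The sixteen 64-bit
// working words are held in eight 128-bit vector registers
// (row1l/row1h .. row4l/row4h), two words per register.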
#if CRYPTOPP_SSE41_AVAILABLE

#define LOADU(p)  _mm_loadu_si128((const __m128i *)(const void*)(p))
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r)
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
{
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m1); \
    b1 = _mm_unpacklo_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m0, m1); \
    b1 = _mm_unpackhi_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m4, m5); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m5); \
    b1 = _mm_unpackhi_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m7, m2); \
    b1 = _mm_unpackhi_epi64(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { \
    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    b1 = _mm_unpackhi_epi64(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m6, m1); \
    b1 = _mm_unpackhi_epi64(m3, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m6, m5, 8); \
    b1 = _mm_unpackhi_epi64(m2, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m4, m0); \
    b1 = _mm_blend_epi16(m1, m6, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m5, m1, 0xF0); \
    b1 = _mm_unpackhi_epi64(m3, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m7, m3); \
    b1 = _mm_alignr_epi8(m2, m0, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m3, m1); \
    b1 = _mm_unpackhi_epi64(m6, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m0); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m1, m2, 0xF0); \
    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m3, m5); \
    b1 = _mm_unpacklo_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m2); \
    b1 = _mm_unpacklo_epi64(m1, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m0, m3, 0xF0); \
    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m7, m5, 0xF0); \
    b1 = _mm_blend_epi16(m3, m1, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m6, m0, 8); \
    b1 = _mm_blend_epi16(m4, m6, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m1, m3); \
    b1 = _mm_unpacklo_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m6, m5); \
    b1 = _mm_unpackhi_epi64(m5, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m2, m3, 0xF0); \
    b1 = _mm_unpackhi_epi64(m7, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m6, m2); \
    b1 = _mm_blend_epi16(m7, m4, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m6, m0, 0xF0); \
    b1 = _mm_unpacklo_epi64(m7, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m2, m7); \
    b1 = _mm_alignr_epi8(m5, m6, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m3); \
    b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m3, m1); \
    b1 = _mm_blend_epi16(m1, m5, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m6, m3); \
    b1 = _mm_blend_epi16(m6, m1, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m7, m5, 8); \
    b1 = _mm_unpackhi_epi64(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m2, m7); \
    b1 = _mm_unpacklo_epi64(m4, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m2); \
    b1 = _mm_unpacklo_epi64(m3, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m3, m7); \
    b1 = _mm_alignr_epi8(m0, m5, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m7, m4); \
    b1 = _mm_alignr_epi8(m4, m1, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { \
    b0 = m6; \
    b1 = _mm_alignr_epi8(m5, m0, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { \
    b0 = _mm_blend_epi16(m1, m3, 0xF0); \
    b1 = m2; \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_unpackhi_epi64(m3, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m1, m2); \
    b1 = _mm_blend_epi16(m3, m2, 0xF0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m7, m4); \
    b1 = _mm_unpackhi_epi64(m1, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { \
    b0 = _mm_alignr_epi8(m7, m5, 8); \
    b1 = _mm_unpacklo_epi64(m6, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m0, m1); \
    b1 = _mm_unpacklo_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m0, m1); \
    b1 = _mm_unpackhi_epi64(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m4, m5); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { \
    b0 = _mm_unpackhi_epi64(m4, m5); \
    b1 = _mm_unpackhi_epi64(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m7, m2); \
    b1 = _mm_unpackhi_epi64(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { \
    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    b1 = _mm_unpackhi_epi64(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { \
    b0 = _mm_unpacklo_epi64(m6, m1); \
    b1 = _mm_unpackhi_epi64(m3, m1); \
    } while(0)

#if defined(__XOP__)
    # define MM_ROTI_EPI64(r, c) \
        _mm_roti_epi64(r, c)
#else
    # define MM_ROTI_EPI64(x, c) \
        (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
        : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
        : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
        : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
        : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#endif

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    row4l = MM_ROTI_EPI64(row4l, -32); \
    row4h = MM_ROTI_EPI64(row4h, -32); \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    row2l = MM_ROTI_EPI64(row2l, -24); \
    row2h = MM_ROTI_EPI64(row2h, -24);

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    row4l = MM_ROTI_EPI64(row4l, -16); \
    row4h = MM_ROTI_EPI64(row4h, -16); \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    row2l = MM_ROTI_EPI64(row2l, -63); \
    row2h = MM_ROTI_EPI64(row2h, -63);

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row4l; \
    t1 = row2l; \
    row4l = row3l; \
    row3l = row3h; \
    row3h = row4l; \
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    t0 = row3l; \
    row3l = row3h; \
    row3h = t0; \
    t0 = row2l; \
    t1 = row4l; \
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))

    #define BLAKE2B_ROUND(r) \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;
    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
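    // PSHUFB masks: permuting the bytes of each 64-bit lane by r16/r24
    // rotates that lane right by 16 and 24 bits, respectively.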
    const __m128i m0 = LOADU(input + 00);
    const __m128i m1 = LOADU(input + 16);
    const __m128i m2 = LOADU(input + 32);
    const __m128i m3 = LOADU(input + 48);
    const __m128i m4 = LOADU(input + 64);
    const __m128i m5 = LOADU(input + 80);
    const __m128i m6 = LOADU(input + 96);
    const __m128i m7 = LOADU(input + 112);
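    // m0..m7 hold the 128-byte message block; the BLAKE2B_LOAD_MSG_r_i macros
    // select and permute these words according to the BLAKE2b sigma schedule.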
    row1l = LOADU(state.h()+0);
    row1h = LOADU(state.h()+2);
    row2l = LOADU(state.h()+4);
    row2h = LOADU(state.h()+6);
    row3l = LOADU(BLAKE2B_IV+0);
    row3h = LOADU(BLAKE2B_IV+2);
    row4l = _mm_xor_si128(LOADU(BLAKE2B_IV+4), LOADU(state.t()+0));
    row4h = _mm_xor_si128(LOADU(BLAKE2B_IV+6), LOADU(state.f()+0));
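    // Rows 1-2 start from the chaining value h, row 3 from IV[0..3], and row 4
    // from IV[4..7] XORed with the block counter t() and finalization flags f().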
    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    row1l = _mm_xor_si128(row3l, row1l);
    row1h = _mm_xor_si128(row3h, row1h);
    STOREU(state.h()+0, _mm_xor_si128(LOADU(state.h()+0), row1l));
    STOREU(state.h()+2, _mm_xor_si128(LOADU(state.h()+2), row1h));
    row2l = _mm_xor_si128(row4l, row2l);
    row2h = _mm_xor_si128(row4h, row2h);
    STOREU(state.h()+4, _mm_xor_si128(LOADU(state.h()+4), row2l));
    STOREU(state.h()+6, _mm_xor_si128(LOADU(state.h()+6), row2h));
}
#endif  // CRYPTOPP_SSE41_AVAILABLE

#if CRYPTOPP_ARM_NEON_AVAILABLE
void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
{
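    // NEON analogue of the SSE4 routine: vcombine/vget_low/vget_high stand in
    // for the unpacklo/unpackhi shuffles, and vextq_u64 for _mm_alignr_epi8.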
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)

    #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))

    #define vrorq_n_u64_24(x) vcombine_u64( \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))

    #define vrorq_n_u64_16(x) vcombine_u64( \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
        vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))

    #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
    row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
    row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
    row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
    row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
    row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
    row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
    row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
    } while(0)

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
    row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
    row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
    row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
    row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
    row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
    row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
    row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
    } while(0)

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
    uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
    uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
    row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
    uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
    uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
    row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_ROUND(r) \
    do { \
    uint64x2_t b0, b1; \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
    const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
    const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
    const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));
    const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));
    const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));
    const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));
    const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));
    uint64x2_t row1l, row1h, row2l, row2h;
    uint64x2_t row3l, row3h, row4l, row4h;
    const uint64x2_t h0 = row1l = vld1q_u64(state.h()+0);
    const uint64x2_t h1 = row1h = vld1q_u64(state.h()+2);
    const uint64x2_t h2 = row2l = vld1q_u64(state.h()+4);
    const uint64x2_t h3 = row2h = vld1q_u64(state.h()+6);
    row3l = vld1q_u64(BLAKE2B_IV+0);
    row3h = vld1q_u64(BLAKE2B_IV+2);
    row4l = veorq_u64(vld1q_u64(BLAKE2B_IV+4), vld1q_u64(state.t()+0));
    row4h = veorq_u64(vld1q_u64(BLAKE2B_IV+6), vld1q_u64(state.f()+0));
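    // h0..h3 preserve the incoming chaining value so the final feed-forward,
    // h[i] ^= v[i] ^ v[i+8], can be computed by the stores below.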
    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    vst1q_u64(state.h()+0, veorq_u64(h0, veorq_u64(row1l, row3l)));
    vst1q_u64(state.h()+2, veorq_u64(h1, veorq_u64(row1h, row3h)));
    vst1q_u64(state.h()+4, veorq_u64(h2, veorq_u64(row2l, row4l)));
    vst1q_u64(state.h()+6, veorq_u64(h3, veorq_u64(row2h, row4h)));
}
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_POWER8_AVAILABLE)

inline uint64x2_p VecLoad64(const void* p)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    return (uint64x2_p)vec_xl(0, (uint8_t*)p);
#else
    return (uint64x2_p)vec_vsx_ld(0, (uint8_t*)p);
#endif
}

inline uint64x2_p VecLoad64LE(const void* p)
{
#if __BIG_ENDIAN__
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    const uint64x2_p v = VecLoad64(p);
    return VecPermute(v, v, m);
#else
    return VecLoad64(p);
#endif
}
inline void VecStore64(void* p, const uint64x2_p x)
{
#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
    vec_xst((uint8x16_p)x, 0, (uint8_t*)p);
#else
    vec_vsx_st((uint8x16_p)x, 0, (uint8_t*)p);
#endif
}

inline void VecStore64LE(void* p, const uint64x2_p x)
{
#if __BIG_ENDIAN__
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    VecStore64(p, VecPermute(x, x, m));
#else
    VecStore64(p, x);
#endif
}
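// The {7,6,5,4, 3,2,1,0, ...} permute mask byte-reverses each 64-bit lane, so
// big-endian targets read and write BLAKE2b's little-endian words correctly.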
template <unsigned int C>
inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
{
#if __BIG_ENDIAN__
    return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
#else
    return (uint64x2_p)vec_sld((uint8x16_p)b, (uint8x16_p)a, 16-C);
#endif
}
#define vec_shl_octet(a,b,c) VecShiftLeftOctet<c*8>(a, b)

#if defined(__GNUC__) && (__BIG_ENDIAN__)
# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
#else
# define vec_merge_hi(a,b) vec_mergeh(a,b)
# define vec_merge_lo(a,b) vec_mergel(a,b)
#endif

void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
{
#if defined(__GNUC__) && (__BIG_ENDIAN__)
    const uint8x16_p HH_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
    const uint8x16_p LL_MASK = { 8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31 };
#endif

    const uint8x16_p HL_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
    const uint8x16_p LH_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };
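    // These permute masks splice one 64-bit doubleword from each operand: the
    // HH/LL pair emulates vec_mergeh/vec_mergel, while HL_MASK serves where the
    // SSE4 path uses _mm_blend_epi16(..., 0xF0).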
    #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m0, m1); \
    b1 = vec_merge_hi(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
    do { \
    b0 = vec_merge_lo(m0, m1); \
    b1 = vec_merge_lo(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
    do { \
    b0 = vec_merge_hi(m4, m5); \
    b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
    do { \
    b0 = vec_merge_lo(m4, m5); \
    b1 = vec_merge_lo(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m7, m2); \
    b1 = vec_merge_lo(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
    do { \
    b0 = vec_merge_hi(m5, m4); \
    b1 = vec_shl_octet(m7, m3, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
    do { \
    b0 = vec_shl_octet(m0, m0, 1); \
    b1 = vec_merge_lo(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
    do { \
    b0 = vec_merge_hi(m6, m1); \
    b1 = vec_merge_lo(m3, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
    do { \
    b0 = vec_shl_octet(m5, m6, 1); \
    b1 = vec_merge_lo(m2, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
    do { \
    b0 = vec_merge_hi(m4, m0); \
    b1 = VecPermute(m1, m6, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
    do { \
    b0 = VecPermute(m5, m1, HL_MASK); \
    b1 = vec_merge_lo(m3, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
    do { \
    b0 = vec_merge_hi(m7, m3); \
    b1 = vec_shl_octet(m0, m2, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
    do { \
    b0 = vec_merge_lo(m3, m1); \
    b1 = vec_merge_lo(m6, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
    do { \
    b0 = vec_merge_lo(m4, m0); \
    b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
    do { \
    b0 = VecPermute(m1, m2, HL_MASK); \
    b1 = VecPermute(m2, m7, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
    do { \
    b0 = vec_merge_hi(m3, m5); \
    b1 = vec_merge_hi(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
    do { \
    b0 = vec_merge_lo(m4, m2); \
    b1 = vec_merge_hi(m1, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
    do { \
    b0 = VecPermute(m0, m3, HL_MASK); \
    b1 = VecPermute(m2, m7, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
    do { \
    b0 = VecPermute(m7, m5, HL_MASK); \
    b1 = VecPermute(m3, m1, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
    do { \
    b0 = vec_shl_octet(m0, m6, 1); \
    b1 = VecPermute(m4, m6, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m1, m3); \
    b1 = vec_merge_hi(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
    do { \
    b0 = vec_merge_hi(m6, m5); \
    b1 = vec_merge_lo(m5, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
    do { \
    b0 = VecPermute(m2, m3, HL_MASK); \
    b1 = vec_merge_lo(m7, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
    do { \
    b0 = vec_merge_lo(m6, m2); \
    b1 = VecPermute(m7, m4, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
    do { \
    b0 = VecPermute(m6, m0, HL_MASK); \
    b1 = vec_merge_hi(m7, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
    do { \
    b0 = vec_merge_lo(m2, m7); \
    b1 = vec_shl_octet(m6, m5, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
    do { \
    b0 = vec_merge_hi(m0, m3); \
    b1 = vec_shl_octet(m4, m4, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
    do { \
    b0 = vec_merge_lo(m3, m1); \
    b1 = VecPermute(m1, m5, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
    do { \
    b0 = vec_merge_lo(m6, m3); \
    b1 = VecPermute(m6, m1, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
    do { \
    b0 = vec_shl_octet(m5, m7, 1); \
    b1 = vec_merge_lo(m0, m4); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
    do { \
    b0 = vec_merge_lo(m2, m7); \
    b1 = vec_merge_hi(m4, m1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
    do { \
    b0 = vec_merge_hi(m0, m2); \
    b1 = vec_merge_hi(m3, m5); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m3, m7); \
    b1 = vec_shl_octet(m5, m0, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
    do { \
    b0 = vec_merge_lo(m7, m4); \
    b1 = vec_shl_octet(m1, m4, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
    do { \
    b0 = m6; \
    b1 = vec_shl_octet(m0, m5, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
    do { \
    b0 = VecPermute(m1, m3, HL_MASK); \
    b1 = m2; \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m5, m4); \
    b1 = vec_merge_lo(m3, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
    do { \
    b0 = vec_merge_hi(m1, m2); \
    b1 = VecPermute(m3, m2, HL_MASK); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
    do { \
    b0 = vec_merge_lo(m7, m4); \
    b1 = vec_merge_lo(m1, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
    do { \
    b0 = vec_shl_octet(m5, m7, 1); \
    b1 = vec_merge_hi(m6, m0); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m0, m1); \
    b1 = vec_merge_hi(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
    do { \
    b0 = vec_merge_lo(m0, m1); \
    b1 = vec_merge_lo(m2, m3); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
    do { \
    b0 = vec_merge_hi(m4, m5); \
    b1 = vec_merge_hi(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
    do { \
    b0 = vec_merge_lo(m4, m5); \
    b1 = vec_merge_lo(m6, m7); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
    do { \
    b0 = vec_merge_hi(m7, m2); \
    b1 = vec_merge_lo(m4, m6); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
    do { \
    b0 = vec_merge_hi(m5, m4); \
    b1 = vec_shl_octet(m7, m3, 1); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
    do { \
    b0 = vec_shl_octet(m0, m0, 1); \
    b1 = vec_merge_lo(m5, m2); \
    } while(0)

    #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
    do { \
    b0 = vec_merge_hi(m6, m1); \
    b1 = vec_merge_lo(m3, m1); \
    } while(0)
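    // vec_rl rotates each 64-bit element left; rotating left by (64-n) yields
    // the right-rotations by 32, 24, 16 and 63 that BLAKE2b requires.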
    const uint64x2_p ROR16_MASK = { 64-16, 64-16 };
    const uint64x2_p ROR24_MASK = { 64-24, 64-24 };
    const uint64x2_p ROR32_MASK = { 64-32, 64-32 };
    const uint64x2_p ROR63_MASK = { 64-63, 64-63 };
    #define vec_ror_32(x) vec_rl(x, ROR32_MASK)
    #define vec_ror_24(x) vec_rl(x, ROR24_MASK)
    #define vec_ror_16(x) vec_rl(x, ROR16_MASK)
    #define vec_ror_63(x) vec_rl(x, ROR63_MASK)

    #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
    row1l = VecAdd(VecAdd(row1l, b0), row2l); \
    row1h = VecAdd(VecAdd(row1h, b1), row2h); \
    row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
    row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
    row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
    row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
    row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
    } while(0)

    #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
    do { \
    row1l = VecAdd(VecAdd(row1l, b0), row2l); \
    row1h = VecAdd(VecAdd(row1h, b1), row2h); \
    row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
    row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
    row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
    row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
    row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
    } while(0)

    #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
    uint64x2_p t0 = vec_shl_octet(row2l, row2h, 1); \
    uint64x2_p t1 = vec_shl_octet(row2h, row2l, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vec_shl_octet(row4h, row4l, 1); t1 = vec_shl_octet(row4l, row4h, 1); \
    row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
    do { \
    uint64x2_p t0 = vec_shl_octet(row2h, row2l, 1); \
    uint64x2_p t1 = vec_shl_octet(row2l, row2h, 1); \
    row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
    t0 = vec_shl_octet(row4l, row4h, 1); t1 = vec_shl_octet(row4h, row4l, 1); \
    row4l = t0; row4h = t1; \
    } while(0)

    #define BLAKE2B_ROUND(r) \
    do { \
    uint64x2_p b0, b1; \
    BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
    BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
    BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
    BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
    } while(0)

    const uint64x2_p m0 = VecLoad64LE(input + 00);
    const uint64x2_p m1 = VecLoad64LE(input + 16);
    const uint64x2_p m2 = VecLoad64LE(input + 32);
    const uint64x2_p m3 = VecLoad64LE(input + 48);
    const uint64x2_p m4 = VecLoad64LE(input + 64);
    const uint64x2_p m5 = VecLoad64LE(input + 80);
    const uint64x2_p m6 = VecLoad64LE(input + 96);
    const uint64x2_p m7 = VecLoad64LE(input + 112);
    uint64x2_p row1l, row1h, row2l, row2h;
    uint64x2_p row3l, row3h, row4l, row4h;

    const uint64x2_p h0 = row1l = VecLoad64LE(state.h()+0);
    const uint64x2_p h1 = row1h = VecLoad64LE(state.h()+2);
    const uint64x2_p h2 = row2l = VecLoad64LE(state.h()+4);
    const uint64x2_p h3 = row2h = VecLoad64LE(state.h()+6);
    row3l = VecLoad64(BLAKE2B_IV+0);
    row3h = VecLoad64(BLAKE2B_IV+2);
    row4l = VecXor(VecLoad64(BLAKE2B_IV+4), VecLoad64(state.t()+0));
    row4h = VecXor(VecLoad64(BLAKE2B_IV+6), VecLoad64(state.f()+0));
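    // As in the SSE4 and NEON paths, rows 1-2 come from the chaining value and
    // rows 3-4 from the IV, with the counter and flags folded into row 4.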
    BLAKE2B_ROUND(0);
    BLAKE2B_ROUND(1);
    BLAKE2B_ROUND(2);
    BLAKE2B_ROUND(3);
    BLAKE2B_ROUND(4);
    BLAKE2B_ROUND(5);
    BLAKE2B_ROUND(6);
    BLAKE2B_ROUND(7);
    BLAKE2B_ROUND(8);
    BLAKE2B_ROUND(9);
    BLAKE2B_ROUND(10);
    BLAKE2B_ROUND(11);

    VecStore64LE(state.h()+0, VecXor(h0, VecXor(row1l, row3l)));
    VecStore64LE(state.h()+2, VecXor(h1, VecXor(row1h, row3h)));
    VecStore64LE(state.h()+4, VecXor(h2, VecXor(row2l, row4l)));
    VecStore64LE(state.h()+6, VecXor(h3, VecXor(row2h, row4h)));
}
#endif  // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END