45 #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES 46 #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES 52 #if (CRYPTOPP_ARM_NEON_AVAILABLE) 53 # include <arm_neon.h> 56 #if (CRYPTOPP_ARM_ACLE_AVAILABLE) 58 # include <arm_acle.h> 61 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) 62 # include <emmintrin.h> 63 # include <xmmintrin.h> 67 #if (CRYPTOPP_SSSE3_AVAILABLE) 68 # include <emmintrin.h> 69 # include <pmmintrin.h> 70 # include <xmmintrin.h> 73 #if defined(__ALTIVEC__) 77 #ifndef CRYPTOPP_INLINE 78 # if defined(CRYPTOPP_DEBUG) 79 # define CRYPTOPP_INLINE static 81 # define CRYPTOPP_INLINE inline 87 ANONYMOUS_NAMESPACE_BEGIN
89 using CryptoPP::BlockTransformation;
97 ANONYMOUS_NAMESPACE_END
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 NEON words (4 cipher blocks) at a time
/// \tparam F6 function to process 6 NEON words (12 cipher blocks) at a time
/// \tparam W word type of the subkey table
/// \param func2 function to process 2 NEON words
/// \param func6 function to process 6 NEON words
/// \param subKeys pointer to the subkey table
/// \param rounds number of cipher rounds
/// \param inBlocks input byte buffer
/// \param xorBlocks optional XOR byte buffer (NULLPTR when not used)
/// \param outBlocks output byte buffer
/// \param length number of bytes to process (multiple of the 8-byte block size)
/// \param flags BlockTransformation flags (counter mode, XOR placement,
///  pointer increments, processing direction)
/// \return number of bytes not processed
/// \details Processes 6 and then 2 NEON SIMD words at a time. A lone
///  8-byte block is handled by calling func2 with a zero word in the
///  second slot.
template <typename F2, typename F6, typename W>
CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    // Each 32-bit counter is stored big-endian inside the little-endian
    // SIMD word, so the increments live in the high byte (n<<24).
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
    const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
    const uint32x4_t s_two = {0, 2, 0, 2};
#endif

    // The cipher block is 8 bytes; one 16-byte NEON word carries two blocks.
    const size_t blockSize = 8;
    const size_t neonBlockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        // Start at the last SIMD word and walk backwards.
        inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
        xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
        outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*neonBlockSize)
        {
            uint32x4_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // The CTR block is 8 bytes. After the dup load two counters
                // sit in one NEON word; increment the low counter by 0 and
                // the high counter by 1.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));

                // After the initial {0,1} increment the remaining counters
                // advance by {2,2}.
                block1 = vaddq_u32(s_two, block0);
                block2 = vaddq_u32(s_two, block1);
                block3 = vaddq_u32(s_two, block2);
                block4 = vaddq_u32(s_two, block3);
                block5 = vaddq_u32(s_two, block4);

                // Save the next counter. inBlocks is writable when
                // BT_InBlockIsCounter is set (backed by the counter array).
                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*neonBlockSize;
        }

        while (length >= 2*neonBlockSize)
        {
            uint32x4_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Same counter scheme as the 6-block loop, for 2 NEON words.
                const uint8x8_t ctr = vld1_u8(inBlocks);
                block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
                block1 = vaddq_u32(s_two, block0);

                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
                    vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
            }
            else
            {
                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*neonBlockSize;
        }
    }

    if (length)
    {
        // Switch the strides from the 16-byte SIMD word to the cipher's
        // real 8-byte block size for the single-block tail.
        if (flags & BT_ReverseDirection)
        {
            inIncrement += inIncrement ? blockSize : 0;
            xorIncrement += xorIncrement ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
            inBlocks = PtrSub(inBlocks, inIncrement);
            xorBlocks = PtrSub(xorBlocks, xorIncrement);
            outBlocks = PtrSub(outBlocks, outIncrement);
        }
        else
        {
            inIncrement -= inIncrement ? blockSize : 0;
            xorIncrement -= xorIncrement ? blockSize : 0;
            outIncrement -= outIncrement ? blockSize : 0;
        }

        while (length >= blockSize)
        {
            uint32x4_t block, zero = {0};

            // Duplicate the 8-byte block into both halves of the NEON word.
            const uint8x8_t v = vld1_u8(inBlocks);
            block = vreinterpretq_u32_u8(vcombine_u8(v,v));

            if (xorInput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            // Bump the low-order (big-endian last) counter byte in place.
            if (flags & BT_InBlockIsCounter)
                const_cast<byte *>(inBlocks)[7]++;

            func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                const uint8x8_t x = vld1_u8(xorBlocks);
                block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
            }

            vst1_u8(const_cast<byte*>(outBlocks),
                vget_low_u8(vreinterpretq_u8_u32(block)));

            inBlocks = PtrAdd(inBlocks, inIncrement);
            outBlocks = PtrAdd(outBlocks, outIncrement);
            xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            length -= blockSize;
        }
    }

    return length;
}
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \param func1 function to process 1 128-bit block
/// \param func6 function to process 6 128-bit blocks
/// \param subKeys pointer to the subkey table
/// \param rounds number of cipher rounds
/// \param inBlocks input byte buffer
/// \param xorBlocks optional XOR byte buffer (NULLPTR when not used)
/// \param outBlocks output byte buffer
/// \param length number of bytes to process (multiple of the 16-byte block size)
/// \param flags BlockTransformation flags (counter mode, XOR placement,
///  pointer increments, processing direction)
/// \return number of bytes not processed
/// \details Processes 6 NEON SIMD words at a time and falls back to one
///  block at a time for the remainder.
template <typename F1, typename F6, typename W>
CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    // The 32-bit counter is stored big-endian within the little-endian
    // SIMD word, hence the 1<<24 increment.
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
#endif

    // One NEON word is exactly one 16-byte cipher block.
    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        // Start at the last block and walk backwards.
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Generate six consecutive counter blocks from the one in
                // inBlocks, then write the next counter back in place.
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block;
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        // Bump the low-order (big-endian last) counter byte in place.
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \tparam V vector type the functions F1 and F4 operate on
/// \param func1 function to process 1 128-bit block
/// \param func4 function to process 4 128-bit blocks
/// \param unused dummy argument that only carries the vector type V
/// \param subKeys pointer to the subkey table
/// \param rounds number of cipher rounds
/// \param inBlocks input byte buffer
/// \param xorBlocks optional XOR byte buffer (NULLPTR when not used)
/// \param outBlocks output byte buffer
/// \param length number of bytes to process (multiple of the 16-byte block size)
/// \param flags BlockTransformation flags (counter mode, XOR placement,
///  pointer increments, processing direction)
/// \return number of bytes not processed
/// \details Processes 4 NEON SIMD words at a time and falls back to one
///  block at a time for the remainder. Blocks are kept as uint64x2_t and
///  reinterpreted to V at the call sites.
template <typename F1, typename F4, typename W, typename V>
CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
    const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    // V is only used to select the functors' vector type.
    CRYPTOPP_UNUSED(unused);

#if (CRYPTOPP_LITTLE_ENDIAN)
    // The 32-bit counter is stored big-endian within the little-endian
    // SIMD word, hence the 1<<24 increment.
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
#endif

    // One NEON word is exactly one 16-byte cipher block.
    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        // Start at the last block and walk backwards.
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            uint64x2_t block0, block1, block2, block3;
            if (flags & BT_InBlockIsCounter)
            {
                // Generate four consecutive counter blocks from the one in
                // inBlocks, then write the next counter back in place.
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block3, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            // Reinterpret the blocks to the functor's vector type V.
            func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        // Bump the low-order (big-endian last) counter byte in place.
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1((V&)block, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \param func2 function to process 2 128-bit blocks
/// \param func6 function to process 6 128-bit blocks
/// \param subKeys pointer to the subkey table
/// \param rounds number of cipher rounds
/// \param inBlocks input byte buffer
/// \param xorBlocks optional XOR byte buffer (NULLPTR when not used)
/// \param outBlocks output byte buffer
/// \param length number of bytes to process (multiple of the 16-byte block size)
/// \param flags BlockTransformation flags (counter mode, XOR placement,
///  pointer increments, processing direction)
/// \return number of bytes not processed
/// \details Processes 6 and then 2 NEON SIMD words at a time. A lone
///  block is handled by calling func2 with a zero word in the second slot.
template <typename F2, typename F6, typename W>
CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
    const W *subKeys, size_t rounds, const byte *inBlocks,
    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    // The 32-bit counter is stored big-endian within the little-endian
    // SIMD word, hence the 1<<24 increment.
    const uint32x4_t s_one = {0, 0, 0, 1<<24};
#else
    const uint32x4_t s_one = {0, 0, 0, 1};
#endif

    // One NEON word is exactly one 16-byte cipher block.
    const size_t blockSize = 16;

    size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
    size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;

    // Clang and Coverity are generating findings using xorBlocks as a flag.
    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);

    if (flags & BT_ReverseDirection)
    {
        // Start at the last block and walk backwards.
        inBlocks = PtrAdd(inBlocks, length - blockSize);
        xorBlocks = PtrAdd(xorBlocks, length - blockSize);
        outBlocks = PtrAdd(outBlocks, length - blockSize);
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint64x2_t block0, block1, block2, block3, block4, block5;
            if (flags & BT_InBlockIsCounter)
            {
                // Generate six consecutive counter blocks from the one in
                // inBlocks, then write the next counter back in place.
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                block2 = vaddq_u64(block1, one);
                block3 = vaddq_u64(block2, one);
                block4 = vaddq_u64(block3, one);
                block5 = vaddq_u64(block4, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block5, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 6*blockSize;
        }

        while (length >= 2*blockSize)
        {
            uint64x2_t block0, block1;
            if (flags & BT_InBlockIsCounter)
            {
                // Same counter scheme as the 6-block loop, for 2 blocks.
                const uint64x2_t one = vreinterpretq_u64_u32(s_one);
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                block1 = vaddq_u64(block0, one);
                vst1q_u8(const_cast<byte*>(inBlocks),
                    vreinterpretq_u8_u64(vaddq_u64(block1, one)));
            }
            else
            {
                block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
                inBlocks = PtrAdd(inBlocks, inIncrement);
            }

            if (xorInput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));

            if (xorOutput)
            {
                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
                xorBlocks = PtrAdd(xorBlocks, xorIncrement);
            }

            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
            outBlocks = PtrAdd(outBlocks, outIncrement);
            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
            outBlocks = PtrAdd(outBlocks, outIncrement);

            length -= 2*blockSize;
        }
    }

    while (length >= blockSize)
    {
        // A lone block rides in the first slot; the second slot is zero.
        uint64x2_t block, zero = {0,0};
        block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

        if (xorInput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        // Bump the low-order (big-endian last) counter byte in place.
        if (flags & BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func2(block, zero, subKeys, static_cast<unsigned int>(rounds));

        if (xorOutput)
            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

        inBlocks = PtrAdd(inBlocks, inIncrement);
        outBlocks = PtrAdd(outBlocks, outIncrement);
        xorBlocks = PtrAdd(xorBlocks, xorIncrement);
        length -= blockSize;
    }

    return length;
}
859 #endif // CRYPTOPP_ARM_NEON_AVAILABLE 863 #if defined(CRYPTOPP_SSSE3_AVAILABLE) 866 #if (__SUNPRO_CC >= 0x5130) 868 # define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x) 870 # define MAYBE_CONST const 871 # define MAYBE_UNCONST_CAST(T, x) (x) 876 # define M128_CAST(x) ((__m128i *)(void *)(x)) 878 #ifndef CONST_M128_CAST 879 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) 892 template <
typename F1,
typename F2,
typename W>
893 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
894 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
895 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
902 const size_t blockSize = 8;
903 const size_t xmmBlockSize = 16;
905 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
906 size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
907 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
910 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
911 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
913 if (flags & BT_ReverseDirection)
915 inBlocks =
PtrAdd(inBlocks, length - xmmBlockSize);
916 xorBlocks =
PtrAdd(xorBlocks, length - xmmBlockSize);
917 outBlocks =
PtrAdd(outBlocks, length - xmmBlockSize);
918 inIncrement = 0-inIncrement;
919 xorIncrement = 0-xorIncrement;
920 outIncrement = 0-outIncrement;
923 if (flags & BT_AllowParallel)
926 while (length >= 2*xmmBlockSize)
928 __m128i block0, block1;
929 if (flags & BT_InBlockIsCounter)
932 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
933 const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
938 std::memcpy(temp, inBlocks, blockSize);
939 block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
942 block1 = _mm_add_epi32(s_two, block0);
946 _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
947 std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
951 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
952 inBlocks =
PtrAdd(inBlocks, inIncrement);
953 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
954 inBlocks =
PtrAdd(inBlocks, inIncrement);
959 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
960 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
961 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
962 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
965 func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
969 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
970 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
971 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
972 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
975 _mm_storeu_si128(M128_CAST(outBlocks), block0);
976 outBlocks =
PtrAdd(outBlocks, outIncrement);
977 _mm_storeu_si128(M128_CAST(outBlocks), block1);
978 outBlocks =
PtrAdd(outBlocks, outIncrement);
980 length -= 2*xmmBlockSize;
987 if (flags & BT_ReverseDirection)
989 inIncrement += inIncrement ? blockSize : 0;
990 xorIncrement += xorIncrement ? blockSize : 0;
991 outIncrement += outIncrement ? blockSize : 0;
992 inBlocks =
PtrSub(inBlocks, inIncrement);
993 xorBlocks =
PtrSub(xorBlocks, xorIncrement);
994 outBlocks =
PtrSub(outBlocks, outIncrement);
998 inIncrement -= inIncrement ? blockSize : 0;
999 xorIncrement -= xorIncrement ? blockSize : 0;
1000 outIncrement -= outIncrement ? blockSize : 0;
1003 while (length >= blockSize)
1006 std::memcpy(temp, inBlocks, blockSize);
1007 __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1011 std::memcpy(temp, xorBlocks, blockSize);
1012 block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1015 if (flags & BT_InBlockIsCounter)
1016 const_cast<byte *
>(inBlocks)[7]++;
1018 func1(block, subKeys, static_cast<unsigned int>(rounds));
1022 std::memcpy(temp, xorBlocks, blockSize);
1023 block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1026 _mm_store_sd(temp, _mm_castsi128_pd(block));
1027 std::memcpy(outBlocks, temp, blockSize);
1029 inBlocks =
PtrAdd(inBlocks, inIncrement);
1030 outBlocks =
PtrAdd(outBlocks, outIncrement);
1031 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1032 length -= blockSize;
1047 template <
typename F2,
typename F6,
typename W>
1048 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
1049 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
1050 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
1057 const size_t blockSize = 8;
1058 const size_t xmmBlockSize = 16;
1060 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1061 size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1062 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1065 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1066 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1068 if (flags & BT_ReverseDirection)
1070 inBlocks =
PtrAdd(inBlocks, length - xmmBlockSize);
1071 xorBlocks =
PtrAdd(xorBlocks, length - xmmBlockSize);
1072 outBlocks =
PtrAdd(outBlocks, length - xmmBlockSize);
1073 inIncrement = 0-inIncrement;
1074 xorIncrement = 0-xorIncrement;
1075 outIncrement = 0-outIncrement;
1078 if (flags & BT_AllowParallel)
1081 while (length >= 6*xmmBlockSize)
1083 __m128i block0, block1, block2, block3, block4, block5;
1084 if (flags & BT_InBlockIsCounter)
1087 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1088 const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1093 std::memcpy(temp, inBlocks, blockSize);
1094 block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1097 block1 = _mm_add_epi32(s_two, block0);
1098 block2 = _mm_add_epi32(s_two, block1);
1099 block3 = _mm_add_epi32(s_two, block2);
1100 block4 = _mm_add_epi32(s_two, block3);
1101 block5 = _mm_add_epi32(s_two, block4);
1105 _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
1106 std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1110 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1111 inBlocks =
PtrAdd(inBlocks, inIncrement);
1112 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1113 inBlocks =
PtrAdd(inBlocks, inIncrement);
1114 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1115 inBlocks =
PtrAdd(inBlocks, inIncrement);
1116 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1117 inBlocks =
PtrAdd(inBlocks, inIncrement);
1118 block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1119 inBlocks =
PtrAdd(inBlocks, inIncrement);
1120 block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1121 inBlocks =
PtrAdd(inBlocks, inIncrement);
1126 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1127 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1128 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1129 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1130 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1131 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1132 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1133 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1134 block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1135 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1136 block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1137 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1140 func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1144 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1145 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1146 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1147 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1148 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1149 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1150 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1151 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1152 block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1153 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1154 block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1155 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1158 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1159 outBlocks =
PtrAdd(outBlocks, outIncrement);
1160 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1161 outBlocks =
PtrAdd(outBlocks, outIncrement);
1162 _mm_storeu_si128(M128_CAST(outBlocks), block2);
1163 outBlocks =
PtrAdd(outBlocks, outIncrement);
1164 _mm_storeu_si128(M128_CAST(outBlocks), block3);
1165 outBlocks =
PtrAdd(outBlocks, outIncrement);
1166 _mm_storeu_si128(M128_CAST(outBlocks), block4);
1167 outBlocks =
PtrAdd(outBlocks, outIncrement);
1168 _mm_storeu_si128(M128_CAST(outBlocks), block5);
1169 outBlocks =
PtrAdd(outBlocks, outIncrement);
1171 length -= 6*xmmBlockSize;
1174 while (length >= 2*xmmBlockSize)
1176 __m128i block0, block1;
1177 if (flags & BT_InBlockIsCounter)
1180 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1181 const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1186 std::memcpy(temp, inBlocks, blockSize);
1187 block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1190 block1 = _mm_add_epi32(s_two, block0);
1194 _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
1195 std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1199 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1200 inBlocks =
PtrAdd(inBlocks, inIncrement);
1201 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1202 inBlocks =
PtrAdd(inBlocks, inIncrement);
1207 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1208 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1209 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1210 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1213 func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1217 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1218 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1219 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1220 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1223 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1224 outBlocks =
PtrAdd(outBlocks, outIncrement);
1225 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1226 outBlocks =
PtrAdd(outBlocks, outIncrement);
1228 length -= 2*xmmBlockSize;
1235 if (flags & BT_ReverseDirection)
1237 inIncrement += inIncrement ? blockSize : 0;
1238 xorIncrement += xorIncrement ? blockSize : 0;
1239 outIncrement += outIncrement ? blockSize : 0;
1240 inBlocks =
PtrSub(inBlocks, inIncrement);
1241 xorBlocks =
PtrSub(xorBlocks, xorIncrement);
1242 outBlocks =
PtrSub(outBlocks, outIncrement);
1246 inIncrement -= inIncrement ? blockSize : 0;
1247 xorIncrement -= xorIncrement ? blockSize : 0;
1248 outIncrement -= outIncrement ? blockSize : 0;
1251 while (length >= blockSize)
1254 __m128i block, zero = _mm_setzero_si128();
1255 std::memcpy(temp, inBlocks, blockSize);
1256 block = _mm_castpd_si128(_mm_load_sd(temp));
1260 std::memcpy(temp, xorBlocks, blockSize);
1261 block = _mm_xor_si128(block,
1262 _mm_castpd_si128(_mm_load_sd(temp)));
1265 if (flags & BT_InBlockIsCounter)
1266 const_cast<byte *
>(inBlocks)[7]++;
1268 func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1272 std::memcpy(temp, xorBlocks, blockSize);
1273 block = _mm_xor_si128(block,
1274 _mm_castpd_si128(_mm_load_sd(temp)));
1277 _mm_store_sd(temp, _mm_castsi128_pd(block));
1278 std::memcpy(outBlocks, temp, blockSize);
1280 inBlocks =
PtrAdd(inBlocks, inIncrement);
1281 outBlocks =
PtrAdd(outBlocks, outIncrement);
1282 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1283 length -= blockSize;
1298 template <
typename F2,
typename F6,
typename W>
1299 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
1300 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
1301 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
1308 const size_t blockSize = 16;
1311 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1312 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1313 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1316 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1317 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1319 if (flags & BT_ReverseDirection)
1321 inBlocks =
PtrAdd(inBlocks, length - blockSize);
1322 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
1323 outBlocks =
PtrAdd(outBlocks, length - blockSize);
1324 inIncrement = 0-inIncrement;
1325 xorIncrement = 0-xorIncrement;
1326 outIncrement = 0-outIncrement;
1329 if (flags & BT_AllowParallel)
1331 while (length >= 6*blockSize)
1333 __m128i block0, block1, block2, block3, block4, block5;
1334 if (flags & BT_InBlockIsCounter)
1337 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1338 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1339 block1 = _mm_add_epi32(block0, s_one);
1340 block2 = _mm_add_epi32(block1, s_one);
1341 block3 = _mm_add_epi32(block2, s_one);
1342 block4 = _mm_add_epi32(block3, s_one);
1343 block5 = _mm_add_epi32(block4, s_one);
1344 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
1348 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1349 inBlocks =
PtrAdd(inBlocks, inIncrement);
1350 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1351 inBlocks =
PtrAdd(inBlocks, inIncrement);
1352 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1353 inBlocks =
PtrAdd(inBlocks, inIncrement);
1354 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1355 inBlocks =
PtrAdd(inBlocks, inIncrement);
1356 block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1357 inBlocks =
PtrAdd(inBlocks, inIncrement);
1358 block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1359 inBlocks =
PtrAdd(inBlocks, inIncrement);
1364 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1365 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1366 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1367 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1368 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1369 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1370 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1371 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1372 block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1373 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1374 block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1375 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1378 func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1382 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1383 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1384 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1385 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1386 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1387 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1388 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1389 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1390 block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1391 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1392 block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1393 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1396 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1397 outBlocks =
PtrAdd(outBlocks, outIncrement);
1398 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1399 outBlocks =
PtrAdd(outBlocks, outIncrement);
1400 _mm_storeu_si128(M128_CAST(outBlocks), block2);
1401 outBlocks =
PtrAdd(outBlocks, outIncrement);
1402 _mm_storeu_si128(M128_CAST(outBlocks), block3);
1403 outBlocks =
PtrAdd(outBlocks, outIncrement);
1404 _mm_storeu_si128(M128_CAST(outBlocks), block4);
1405 outBlocks =
PtrAdd(outBlocks, outIncrement);
1406 _mm_storeu_si128(M128_CAST(outBlocks), block5);
1407 outBlocks =
PtrAdd(outBlocks, outIncrement);
1409 length -= 6*blockSize;
1412 while (length >= 2*blockSize)
1414 __m128i block0, block1;
1415 if (flags & BT_InBlockIsCounter)
1418 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1419 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1420 block1 = _mm_add_epi32(block0, s_one);
1421 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
1425 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1426 inBlocks =
PtrAdd(inBlocks, inIncrement);
1427 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1428 inBlocks =
PtrAdd(inBlocks, inIncrement);
1433 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1434 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1435 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1436 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1439 func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1443 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1444 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1445 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1446 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1449 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1450 outBlocks =
PtrAdd(outBlocks, outIncrement);
1451 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1452 outBlocks =
PtrAdd(outBlocks, outIncrement);
1454 length -= 2*blockSize;
1458 while (length >= blockSize)
1460 __m128i block, zero = _mm_setzero_si128();
1461 block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1464 block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1466 if (flags & BT_InBlockIsCounter)
1467 const_cast<byte *
>(inBlocks)[15]++;
1469 func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1472 block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1474 _mm_storeu_si128(M128_CAST(outBlocks), block);
1476 inBlocks =
PtrAdd(inBlocks, inIncrement);
1477 outBlocks =
PtrAdd(outBlocks, outIncrement);
1478 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1479 length -= blockSize;
1493 template <
typename F1,
typename F4,
typename W>
1494 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1495 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
1496 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
1503 const size_t blockSize = 16;
1506 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1507 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1508 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1511 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1512 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1514 if (flags & BT_ReverseDirection)
1516 inBlocks =
PtrAdd(inBlocks, length - blockSize);
1517 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
1518 outBlocks =
PtrAdd(outBlocks, length - blockSize);
1519 inIncrement = 0-inIncrement;
1520 xorIncrement = 0-xorIncrement;
1521 outIncrement = 0-outIncrement;
1524 if (flags & BT_AllowParallel)
1526 while (length >= 4*blockSize)
1528 __m128i block0, block1, block2, block3;
1529 if (flags & BT_InBlockIsCounter)
1532 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1533 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1534 block1 = _mm_add_epi32(block0, s_one);
1535 block2 = _mm_add_epi32(block1, s_one);
1536 block3 = _mm_add_epi32(block2, s_one);
1537 _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
1541 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1542 inBlocks =
PtrAdd(inBlocks, inIncrement);
1543 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1544 inBlocks =
PtrAdd(inBlocks, inIncrement);
1545 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1546 inBlocks =
PtrAdd(inBlocks, inIncrement);
1547 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1548 inBlocks =
PtrAdd(inBlocks, inIncrement);
1553 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1554 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1555 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1556 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1557 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1558 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1559 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1560 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1563 func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1567 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1568 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1569 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1570 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1571 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1572 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1573 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1574 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1577 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1578 outBlocks =
PtrAdd(outBlocks, outIncrement);
1579 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1580 outBlocks =
PtrAdd(outBlocks, outIncrement);
1581 _mm_storeu_si128(M128_CAST(outBlocks), block2);
1582 outBlocks =
PtrAdd(outBlocks, outIncrement);
1583 _mm_storeu_si128(M128_CAST(outBlocks), block3);
1584 outBlocks =
PtrAdd(outBlocks, outIncrement);
1586 length -= 4*blockSize;
1590 while (length >= blockSize)
1592 __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1595 block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1597 if (flags & BT_InBlockIsCounter)
1598 const_cast<byte *
>(inBlocks)[15]++;
1600 func1(block, subKeys, static_cast<unsigned int>(rounds));
1603 block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1605 _mm_storeu_si128(M128_CAST(outBlocks), block);
1607 inBlocks =
PtrAdd(inBlocks, inIncrement);
1608 outBlocks =
PtrAdd(outBlocks, outIncrement);
1609 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1610 length -= blockSize;
1624 template <
typename F1,
typename F4,
typename W>
1625 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
1626 MAYBE_CONST W *subKeys,
size_t rounds,
const byte *inBlocks,
1627 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
1634 const size_t blockSize = 8;
1635 const size_t xmmBlockSize = 16;
1637 size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1638 size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1639 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1642 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1643 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1645 if (flags & BT_ReverseDirection)
1647 inBlocks =
PtrAdd(inBlocks, length - xmmBlockSize);
1648 xorBlocks =
PtrAdd(xorBlocks, length - xmmBlockSize);
1649 outBlocks =
PtrAdd(outBlocks, length - xmmBlockSize);
1650 inIncrement = 0 - inIncrement;
1651 xorIncrement = 0 - xorIncrement;
1652 outIncrement = 0 - outIncrement;
1655 if (flags & BT_AllowParallel)
1657 while (length >= 4*xmmBlockSize)
1659 __m128i block0, block1, block2, block3;
1660 if (flags & BT_InBlockIsCounter)
1663 const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1664 const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1670 std::memcpy(temp, inBlocks, blockSize);
1671 block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1674 block1 = _mm_add_epi32(s_two, block0);
1675 block2 = _mm_add_epi32(s_two, block1);
1676 block3 = _mm_add_epi32(s_two, block2);
1680 _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
1681 std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1685 block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1686 inBlocks =
PtrAdd(inBlocks, inIncrement);
1687 block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1688 inBlocks =
PtrAdd(inBlocks, inIncrement);
1689 block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1690 inBlocks =
PtrAdd(inBlocks, inIncrement);
1691 block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1692 inBlocks =
PtrAdd(inBlocks, inIncrement);
1697 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1698 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1699 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1700 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1701 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1702 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1703 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1704 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1707 func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1711 block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1712 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1713 block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1714 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1715 block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1716 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1717 block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1718 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1721 _mm_storeu_si128(M128_CAST(outBlocks), block0);
1722 outBlocks =
PtrAdd(outBlocks, outIncrement);
1723 _mm_storeu_si128(M128_CAST(outBlocks), block1);
1724 outBlocks =
PtrAdd(outBlocks, outIncrement);
1725 _mm_storeu_si128(M128_CAST(outBlocks), block2);
1726 outBlocks =
PtrAdd(outBlocks, outIncrement);
1727 _mm_storeu_si128(M128_CAST(outBlocks), block3);
1728 outBlocks =
PtrAdd(outBlocks, outIncrement);
1730 length -= 4*xmmBlockSize;
1737 if (flags & BT_ReverseDirection)
1739 inIncrement += inIncrement ? blockSize : 0;
1740 xorIncrement += xorIncrement ? blockSize : 0;
1741 outIncrement += outIncrement ? blockSize : 0;
1742 inBlocks =
PtrSub(inBlocks, inIncrement);
1743 xorBlocks =
PtrSub(xorBlocks, xorIncrement);
1744 outBlocks =
PtrSub(outBlocks, outIncrement);
1748 inIncrement -= inIncrement ? blockSize : 0;
1749 xorIncrement -= xorIncrement ? blockSize : 0;
1750 outIncrement -= outIncrement ? blockSize : 0;
1753 while (length >= blockSize)
1756 std::memcpy(temp, inBlocks, blockSize);
1757 __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1761 std::memcpy(temp, xorBlocks, blockSize);
1762 block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1765 if (flags & BT_InBlockIsCounter)
1766 const_cast<byte *
>(inBlocks)[7]++;
1768 func1(block, subKeys, static_cast<unsigned int>(rounds));
1772 std::memcpy(temp, xorBlocks, blockSize);
1773 block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1776 _mm_store_sd(temp, _mm_castsi128_pd(block));
1777 std::memcpy(outBlocks, temp, blockSize);
1779 inBlocks =
PtrAdd(inBlocks, inIncrement);
1780 outBlocks =
PtrAdd(outBlocks, outIncrement);
1781 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1782 length -= blockSize;
// ---------------------------------------------------------------------
// NOTE(review): the remainder of this region is a formatting-mangled
// extract of the ALTIVEC/POWER implementation. Braces, the vector
// load/store statements (VecLoadBE/VecStoreBE), and the s_one/s_two
// counter constants are missing from the extract, and the integers
// fused into the lines below (1791, 1807, ...) are line numbers from
// the original header, not code. The code is left byte-for-byte as
// extracted; comments only annotate the visible structure.
// ---------------------------------------------------------------------
1791 #endif // CRYPTOPP_SSSE3_AVAILABLE 1795 #if defined(__ALTIVEC__) 1807 template <
typename F2,
typename F6,
typename W>
// AdvancedProcessBlocks64_6x2_ALTIVEC: 8-byte blocks on POWER/AltiVec.
// Two 8-byte blocks are packed per 16-byte vector, so func6 handles 12
// blocks per call and func2 handles 4 (or 2 in the tail, with a zero
// companion word). Returns the remaining unprocessed byte count.
1808 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
1809 const W *subKeys,
size_t rounds,
const byte *inBlocks,
1810 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
// Both endian branches use the same offsets here; temp+LowOffset and
// temp+HighOffset place the duplicated 8-byte block in each half of
// the 16-byte scratch vector.
1817 #if (CRYPTOPP_LITTLE_ENDIAN) 1818 enum {LowOffset=8, HighOffset=0};
1822 enum {LowOffset=8, HighOffset=0};
1827 const size_t blockSize = 8;
1828 const size_t vsxBlockSize = 16;
// 16-byte aligned scratch for moving 8-byte blocks through the vector unit.
1829 CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];
1831 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
1832 size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
1833 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;
// Evaluate xorBlocks-as-a-flag once instead of on every iteration.
1836 const
bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1837 const
bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
// Reverse mode: start at the last 16-byte word and step backwards.
1839 if (flags & BT_ReverseDirection)
1841 inBlocks =
PtrAdd(inBlocks, length - vsxBlockSize);
1842 xorBlocks =
PtrAdd(xorBlocks, length - vsxBlockSize);
1843 outBlocks =
PtrAdd(outBlocks, length - vsxBlockSize);
1844 inIncrement = 0-inIncrement;
1845 xorIncrement = 0-xorIncrement;
1846 outIncrement = 0-outIncrement;
1849 if (flags & BT_AllowParallel)
1851 while (length >= 6*vsxBlockSize)
1853 uint32x4_p block0, block1, block2, block3, block4, block5;
1854 if (flags & BT_InBlockIsCounter)
// Duplicate the 8-byte counter into both halves of the scratch vector.
// (The extract is missing the VecLoadBE of temp into `ctr` and the
// s_one/s_two constant definitions that follow in the original.)
1858 std::memcpy(temp+LowOffset, inBlocks, 8);
1859 std::memcpy(temp+HighOffset, inBlocks, 8);
// Low half +0, high half +1; then both halves advance by 2 per word.
1866 block0 =
VecAdd(s_one, ctr);
1870 block1 =
VecAdd(s_two, block0);
1871 block2 =
VecAdd(s_two, block1);
1872 block3 =
VecAdd(s_two, block2);
1873 block4 =
VecAdd(s_two, block3);
1874 block5 =
VecAdd(s_two, block4);
// Advance the caller's counter byte: 6 vectors x 2 blocks = 12 blocks
// consumed; byte 7 is the low-order byte of the big-endian counter.
1877 const_cast<byte*
>(inBlocks)[7] += 12;
// Non-counter path: six vector loads (the VecLoadBE calls are missing
// from the extract), each followed by a pointer advance.
1882 inBlocks =
PtrAdd(inBlocks, inIncrement);
1884 inBlocks =
PtrAdd(inBlocks, inIncrement);
1886 inBlocks =
PtrAdd(inBlocks, inIncrement);
1888 inBlocks =
PtrAdd(inBlocks, inIncrement);
1890 inBlocks =
PtrAdd(inBlocks, inIncrement);
1892 inBlocks =
PtrAdd(inBlocks, inIncrement);
// xorInput path: XOR each block with xorBlocks before the transform.
1898 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1900 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1902 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1904 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1906 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1908 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Transform six vectors (twelve 8-byte blocks) at once.
1911 func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
// xorOutput path: XOR each block with xorBlocks after the transform.
1916 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1918 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1920 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1922 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1924 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1926 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Store the six result vectors (VecStoreBE calls missing from extract).
1930 outBlocks =
PtrAdd(outBlocks, outIncrement);
1932 outBlocks =
PtrAdd(outBlocks, outIncrement);
1934 outBlocks =
PtrAdd(outBlocks, outIncrement);
1936 outBlocks =
PtrAdd(outBlocks, outIncrement);
1938 outBlocks =
PtrAdd(outBlocks, outIncrement);
1940 outBlocks =
PtrAdd(outBlocks, outIncrement);
1942 length -= 6*vsxBlockSize;
// Two-vector (four-block) leg of the parallel path.
1945 while (length >= 2*vsxBlockSize)
1948 if (flags & BT_InBlockIsCounter)
1952 std::memcpy(temp+LowOffset, inBlocks, 8);
1953 std::memcpy(temp+HighOffset, inBlocks, 8);
1960 block0 =
VecAdd(s_one, ctr);
1964 block1 =
VecAdd(s_two, block0);
// 2 vectors x 2 blocks = 4 blocks consumed per iteration.
1967 const_cast<byte*
>(inBlocks)[7] += 4;
1972 inBlocks =
PtrAdd(inBlocks, inIncrement);
1974 inBlocks =
PtrAdd(inBlocks, inIncrement);
1980 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1982 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1985 func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1990 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1992 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
1996 outBlocks =
PtrAdd(outBlocks, outIncrement);
1998 outBlocks =
PtrAdd(outBlocks, outIncrement);
2000 length -= 2*vsxBlockSize;
// Tail: switch from the 16-byte SIMD stride to the 8-byte block stride.
2007 if (flags & BT_ReverseDirection)
2009 inIncrement += inIncrement ? blockSize : 0;
2010 xorIncrement += xorIncrement ? blockSize : 0;
2011 outIncrement += outIncrement ? blockSize : 0;
2012 inBlocks =
PtrSub(inBlocks, inIncrement);
2013 xorBlocks =
PtrSub(xorBlocks, xorIncrement);
2014 outBlocks =
PtrSub(outBlocks, outIncrement);
2018 inIncrement -= inIncrement ? blockSize : 0;
2019 xorIncrement -= xorIncrement ? blockSize : 0;
2020 outIncrement -= outIncrement ? blockSize : 0;
// Single-block tail: one 8-byte block per iteration via func2 with a
// zero companion word (the `block`/`zero`/`x` declarations and the
// VecLoadBE/VecStoreBE statements are missing from the extract).
2023 while (length >= blockSize)
2031 std::memcpy(temp+LowOffset, inBlocks, 8);
2032 std::memcpy(temp+HighOffset, inBlocks, 8);
// xorInput: load the XOR mask the same duplicated way, then XOR.
2037 std::memcpy(temp+LowOffset, xorBlocks, 8);
2038 std::memcpy(temp+HighOffset, xorBlocks, 8);
2040 block =
VecXor(block, x);
// Byte 7 is the low-order byte of the big-endian counter.
2044 if (flags & BT_InBlockIsCounter)
2045 const_cast<byte *
>(inBlocks)[7]++;
2047 func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
2051 std::memcpy(temp+LowOffset, xorBlocks, 8);
2052 std::memcpy(temp+HighOffset, xorBlocks, 8);
2054 block =
VecXor(block, x);
// Only the low 8-byte half of the result vector is the output block.
2058 std::memcpy(outBlocks, temp+LowOffset, 8);
2060 inBlocks =
PtrAdd(inBlocks, inIncrement);
2061 outBlocks =
PtrAdd(outBlocks, outIncrement);
2062 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2063 length -= blockSize;
// NOTE(review): formatting-mangled extract, same caveats as the block
// above -- braces, vector loads/stores, and the s_one constant are
// missing, and the fused integers are original-file line numbers.
2078 template <
typename F1,
typename F4,
typename W>
// AdvancedProcessBlocks128_4x1_ALTIVEC: 16-byte blocks on POWER/AltiVec.
// func4 transforms four blocks per call, func1 transforms one.
2079 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
2080 const W *subKeys,
size_t rounds,
const byte *inBlocks,
2081 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
2088 #if (CRYPTOPP_LITTLE_ENDIAN) 2094 const size_t blockSize = 16;
2097 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2098 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2099 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
// Evaluate xorBlocks-as-a-flag once instead of on every iteration.
2102 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2103 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
// Reverse mode: start at the last block and step backwards.
2105 if (flags & BT_ReverseDirection)
2107 inBlocks =
PtrAdd(inBlocks, length - blockSize);
2108 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
2109 outBlocks =
PtrAdd(outBlocks, length - blockSize);
2110 inIncrement = 0-inIncrement;
2111 xorIncrement = 0-xorIncrement;
2112 outIncrement = 0-outIncrement;
2115 if (flags & BT_AllowParallel)
2117 while (length >= 4*blockSize)
2121 if (flags & BT_InBlockIsCounter)
// Counter path: block0's load and the s_one definition are missing
// from the extract; each successive block adds one to the counter.
2124 block1 =
VecAdd(block0, s_one);
2125 block2 =
VecAdd(block1, s_one);
2126 block3 =
VecAdd(block2, s_one);
// NOTE(review): this loop consumes 4 blocks per iteration but the
// caller's counter byte advances by 6 -- looks like an off-by-two
// carried over from a 6-way variant; verify against the upstream
// header before relying on CTR output here.
2136 const_cast<byte*
>(inBlocks)[15] += 6;
// Non-counter path: four vector loads (missing from extract), each
// followed by a pointer advance.
2141 inBlocks =
PtrAdd(inBlocks, inIncrement);
2143 inBlocks =
PtrAdd(inBlocks, inIncrement);
2145 inBlocks =
PtrAdd(inBlocks, inIncrement);
2147 inBlocks =
PtrAdd(inBlocks, inIncrement);
// xorInput path: XOR each block with xorBlocks before the transform.
2153 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2155 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2157 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2159 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Note: rounds is passed without the static_cast used by the SSE twins.
2162 func4(block0, block1, block2, block3, subKeys, rounds);
// xorOutput path: XOR each block with xorBlocks after the transform.
2167 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2169 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2171 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2173 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
// Store the four result vectors (stores missing from extract).
2177 outBlocks =
PtrAdd(outBlocks, outIncrement);
2179 outBlocks =
PtrAdd(outBlocks, outIncrement);
2181 outBlocks =
PtrAdd(outBlocks, outIncrement);
2183 outBlocks =
PtrAdd(outBlocks, outIncrement);
2185 length -= 4*blockSize;
// Single-block tail via func1.
2189 while (length >= blockSize)
// Byte 15 is the low-order byte of the big-endian counter.
2196 if (flags & BT_InBlockIsCounter)
2197 const_cast<byte *
>(inBlocks)[15]++;
2199 func1(block, subKeys, rounds);
2206 inBlocks =
PtrAdd(inBlocks, inIncrement);
2207 outBlocks =
PtrAdd(outBlocks, outIncrement);
2208 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2209 length -= blockSize;
2223 template <
typename F1,
typename F6,
typename W>
2224 CRYPTOPP_INLINE
size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
2225 const W *subKeys,
size_t rounds,
const byte *inBlocks,
2226 const byte *xorBlocks, byte *outBlocks,
size_t length, word32 flags)
2233 #if (CRYPTOPP_LITTLE_ENDIAN) 2239 const size_t blockSize = 16;
2242 size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2243 size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2244 size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2247 const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2248 const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2250 if (flags & BT_ReverseDirection)
2252 inBlocks =
PtrAdd(inBlocks, length - blockSize);
2253 xorBlocks =
PtrAdd(xorBlocks, length - blockSize);
2254 outBlocks =
PtrAdd(outBlocks, length - blockSize);
2255 inIncrement = 0-inIncrement;
2256 xorIncrement = 0-xorIncrement;
2257 outIncrement = 0-outIncrement;
2260 if (flags & BT_AllowParallel)
2262 while (length >= 6*blockSize)
2264 uint32x4_p block0, block1, block2, block3, block4, block5;
2266 if (flags & BT_InBlockIsCounter)
2269 block1 =
VecAdd(block0, s_one);
2270 block2 =
VecAdd(block1, s_one);
2271 block3 =
VecAdd(block2, s_one);
2272 block4 =
VecAdd(block3, s_one);
2273 block5 =
VecAdd(block4, s_one);
2290 VecStoreBE(temp, const_cast<byte*>(inBlocks));
2295 inBlocks =
PtrAdd(inBlocks, inIncrement);
2297 inBlocks =
PtrAdd(inBlocks, inIncrement);
2299 inBlocks =
PtrAdd(inBlocks, inIncrement);
2301 inBlocks =
PtrAdd(inBlocks, inIncrement);
2303 inBlocks =
PtrAdd(inBlocks, inIncrement);
2305 inBlocks =
PtrAdd(inBlocks, inIncrement);
2311 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2313 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2315 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2317 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2319 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2321 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2324 func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
2329 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2331 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2333 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2335 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2337 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2339 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2343 outBlocks =
PtrAdd(outBlocks, outIncrement);
2345 outBlocks =
PtrAdd(outBlocks, outIncrement);
2347 outBlocks =
PtrAdd(outBlocks, outIncrement);
2349 outBlocks =
PtrAdd(outBlocks, outIncrement);
2351 outBlocks =
PtrAdd(outBlocks, outIncrement);
2353 outBlocks =
PtrAdd(outBlocks, outIncrement);
2355 length -= 6*blockSize;
2359 while (length >= blockSize)
2366 if (flags & BT_InBlockIsCounter)
2367 const_cast<byte *
>(inBlocks)[15]++;
2369 func1(block, subKeys, rounds);
2376 inBlocks =
PtrAdd(inBlocks, inIncrement);
2377 outBlocks =
PtrAdd(outBlocks, outIncrement);
2378 xorBlocks =
PtrAdd(xorBlocks, xorIncrement);
2379 length -= blockSize;
2387 #endif // __ALTIVEC__ 2389 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES
Utility functions for the Crypto++ library.
Library configuration file.
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Support functions for PowerPC and vector operations.
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
PTR PtrSub(PTR pointer, OFF offset)
Create a pointer with an offset.
PTR PtrAdd(PTR pointer, OFF offset)
Create a pointer with an offset.
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Crypto++ library namespace.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.