#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <smmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap;
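
// SIMON-64 splits each 64-bit block into two 32-bit words (x, y). The routines
// below pack the halves of several blocks into 128-bit SIMD words, de-interleave
// the x and y words, run the rounds, and re-interleave the result.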
#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
inline T UnpackHigh32(const T& a, const T& b)
{
    const uint32x2_t x(vget_high_u32((uint32x4_t)a));
    const uint32x2_t y(vget_high_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}

template <class T>
inline T UnpackLow32(const T& a, const T& b)
{
    const uint32x2_t x(vget_low_u32((uint32x4_t)a));
    const uint32x2_t y(vget_low_u32((uint32x4_t)b));
    const uint32x2x2_t r = vzip_u32(x, y);
    return (T)vcombine_u32(r.val[0], r.val[1]);
}
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

template <unsigned int R>
inline uint32x4_t RotateRight32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
    const uint32x4_t b(vshrq_n_u32(val, R));
    return vorrq_u32(a, b);
}
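
// A rotation by 8 only moves whole bytes, so on Aarch32/Aarch64 the
// specializations below use the vqtbl1q_u8 table lookup instead of two
// shifts and an OR.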
#if defined(__aarch32__) || defined(__aarch64__)
template <>
inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}

template <>
inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u32_u8(
        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
}
#endif // __aarch32__ or __aarch64__
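
// The SIMON round function, f(x) = ((x <<< 1) & (x <<< 8)) ^ (x <<< 2),
// applied to four 32-bit words at a time.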
inline uint32x4_t SIMON64_f(const uint32x4_t& val)
{
    return veorq_u32(RotateLeft32<2>(val),
        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
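
// The block routines below run two SIMON rounds per loop iteration so the
// x and y words swap roles implicitly; an odd round count is handled by a
// single extra round plus an explicit swap outside the loop.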
inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave the two SIMD words into the x (odd lanes) and y (even lanes) halves
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave the two SIMD words into the x (odd lanes) and y (even lanes) halves
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
}
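
// The six-word variants apply the same round schedule to three independent
// pairs of SIMD words so the processor can overlap the dependent
// rotate/AND/XOR chains.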
inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave each pair of SIMD words into the x (odd lanes) and y (even lanes) halves
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
    uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave each pair of SIMD words into the x (odd lanes) and y (even lanes) halves
    uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
    uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
    uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
    uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
    uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
    uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);

        y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
        y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
        y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
        x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
        x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
        x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);

        const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
        y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
        y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
        y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = UnpackLow32(y1, x1);
    block1 = UnpackHigh32(y1, x1);
    block2 = UnpackLow32(y2, x2);
    block3 = UnpackHigh32(y2, x2);
    block4 = UnpackLow32(y3, x3);
    block5 = UnpackHigh32(y3, x3);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)

inline void Swap128(__m128i& a,__m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // Early SunCC (through Studio 12.3) cannot consume std::swap on
    // __m128i, so use the library's vec_swap instead.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}
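
// When XOP is available the rotates below compile to a single
// _mm_roti_epi32; otherwise they fall back to shift-and-OR, or to a
// PSHUFB byte shuffle for the rotate-by-8 specializations.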
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-R);
#else
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
#endif
}
template <>
inline __m128i RotateLeft32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm_shuffle_epi8(val, mask);
#endif
}

template <>
inline __m128i RotateRight32<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi32(val, 32-8);
#else
    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
inline __m128i SIMON64_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft32<2>(v),
        _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
}
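
// _mm_shuffle_ps can pick 32-bit lanes from two different registers, so the
// blocks are cast to __m128 and de-interleaved into the x and y words with a
// single shuffle per word.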
inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave the two SIMD words into the x (odd lanes) and y (even lanes) halves
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        Swap128(x1, y1);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave the two SIMD words into the x (odd lanes) and y (even lanes) halves
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
}
inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave each pair of SIMD words into the x (odd lanes) and y (even lanes) halves
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word32 *subkeys, unsigned int rounds)
{
    // De-interleave each pair of SIMD words into the x (odd lanes) and y (even lanes) halves
    const __m128 t0 = _mm_castsi128_ps(block0);
    const __m128 t1 = _mm_castsi128_ps(block1);
    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t2 = _mm_castsi128_ps(block2);
    const __m128 t3 = _mm_castsi128_ps(block3);
    __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
    __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));

    const __m128 t4 = _mm_castsi128_ps(block4);
    const __m128 t5 = _mm_castsi128_ps(block5);
    __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
    __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));

    if (rounds & 1)
    {
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);

        const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
    }

    // Re-interleave the halves back into the caller's block order
    block0 = _mm_unpacklo_epi32(y1, x1);
    block1 = _mm_unpackhi_epi32(y1, x1);
    block2 = _mm_unpacklo_epi32(y2, x2);
    block3 = _mm_unpackhi_epi32(y2, x2);
    block4 = _mm_unpacklo_epi32(y3, x3);
    block5 = _mm_unpackhi_epi32(y3, x3);
}
#endif // CRYPTOPP_SSE41_AVAILABLE

#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecLoad;
using CryptoPP::VecPermute;

template<unsigned int C>
inline uint32x4_p RotateLeft32(const uint32x4_p val)
{
    const uint32x4_p m = {C, C, C, C};
    return vec_rl(val, m);
}

template<unsigned int C>
inline uint32x4_p RotateRight32(const uint32x4_p val)
{
    const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
    return vec_rl(val, m);
}

inline uint32x4_p SIMON64_f(const uint32x4_p val)
{
    return VecXor(RotateLeft32<2>(val),
        VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
}
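
// The POWER block routines use explicit vec_perm byte masks to de-interleave
// and re-interleave the x and y words; the masks differ between big- and
// little-endian targets.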
inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // De-interleave the two SIMD words into the x and y halves
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        // Broadcast the 32-bit subkeys to all four lanes
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i), VecLoad(subkeys+i), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i+1), VecLoad(subkeys+i+1), m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), VecLoad(subkeys+rounds-1), m);
#endif
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // Re-interleave the halves back into the caller's block order
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
}
inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // De-interleave the two SIMD words into the x and y halves
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        // Broadcast the 32-bit subkey to all four lanes
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), VecLoad(subkeys+rounds-1), m);
#endif
        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i+1), VecLoad(subkeys+i+1), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i), VecLoad(subkeys+i), m);
#endif

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // Re-interleave the halves back into the caller's block order
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
}
inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // De-interleave each pair of SIMD words into the x and y halves
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);
    uint32x4_p x2 = VecPermute(block2, block3, m1);
    uint32x4_p y2 = VecPermute(block2, block3, m2);
    uint32x4_p x3 = VecPermute(block4, block5, m1);
    uint32x4_p y3 = VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i]);
        const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
        // Broadcast the 32-bit subkeys to all four lanes
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i), VecLoad(subkeys+i), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i+1), VecLoad(subkeys+i+1), m);
#endif

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
    }

    if (rounds & 1)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), VecLoad(subkeys+rounds-1), m);
#endif
        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // Re-interleave the halves back into the caller's block order
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
    block2 = VecPermute(x2, y2, m3);
    block3 = VecPermute(x2, y2, m4);
    block4 = VecPermute(x3, y3, m3);
    block5 = VecPermute(x3, y3, m4);
}
inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
    const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
#else
    const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
    const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
#endif

    // De-interleave each pair of SIMD words into the x and y halves
    uint32x4_p x1 = VecPermute(block0, block1, m1);
    uint32x4_p y1 = VecPermute(block0, block1, m2);
    uint32x4_p x2 = VecPermute(block2, block3, m1);
    uint32x4_p y2 = VecPermute(block2, block3, m2);
    uint32x4_p x3 = VecPermute(block4, block5, m1);
    uint32x4_p y3 = VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
        // Broadcast the 32-bit subkey to all four lanes
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk = VecPermute(VecLoad(subkeys+rounds-1), VecLoad(subkeys+rounds-1), m);
#endif
        y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
#if CRYPTOPP_POWER7_AVAILABLE
        const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
        const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
        const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
        const uint32x4_p rk1 = VecPermute(VecLoad(subkeys+i+1), VecLoad(subkeys+i+1), m);
        const uint32x4_p rk2 = VecPermute(VecLoad(subkeys+i), VecLoad(subkeys+i), m);
#endif

        x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);

        y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
    const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
#else
    const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
    const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
#endif

    // Re-interleave the halves back into the caller's block order
    block0 = VecPermute(x1, y1, m3);
    block1 = VecPermute(x1, y1, m4);
    block2 = VecPermute(x2, y2, m3);
    block3 = VecPermute(x2, y2, m4);
    block4 = VecPermute(x3, y3, m3);
    block5 = VecPermute(x3, y3, m4);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)
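
// The functions below are the entry points the rest of the library dispatches
// to. Each one adapts the block routines above to the shared
// AdvancedProcessBlocks64_6x2 template, which takes care of loading the input,
// XORing with xorBlocks when requested, and processing any tail blocks.
//
// A minimal usage sketch (hypothetical caller, not part of this file),
// assuming 'rk' holds the expanded round keys and 'rounds' their count:
//
//   byte in[16], out[16];   // two 8-byte SIMON-64 blocks
//   size_t left = SIMON64_Enc_AdvancedProcessBlocks_NEON(
//       rk, rounds, in, NULLPTR, out, sizeof(in), 0 /*flags*/);
//   // 'left' is the number of bytes that were not processed.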
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

#if defined(CRYPTOPP_SSE41_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSE41_AVAILABLE
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ALTIVEC_AVAILABLE

NAMESPACE_END