Crypto++ 8.1
Free C++ class library of cryptographic schemes
simon128_simd.cpp
// simon128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"

#include "simon.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon128_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
#  include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;
using CryptoPP::vec_swap;  // SunCC

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

// Missing from Microsoft's ARM A-32 implementation
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}
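
// R is a template parameter because vshlq_n_u64 and vshrq_n_u64 require
// compile-time shift counts; R must lie in 1..63 so that the complementary
// 64-R shift is also in range.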

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif
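
// An 8-bit rotate moves whole bytes, so the specializations above replace
// the shift/shift/or sequence with a single table lookup (vqtbl1q_u8). Each
// mask lists, for every destination byte, the source byte that lands there;
// separate tables cover big- and little-endian lane layouts.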

inline uint64x2_t SIMON128_f(const uint64x2_t& val)
{
    return veorq_u64(RotateLeft64<2>(val),
        vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}
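
// SIMON's round function is f(x) = (x <<< 1 & x <<< 8) ^ (x <<< 2). The
// encryption loops below apply the Feistel update y ^= f(x) ^ k two rounds
// at a time, alternating the roles of x and y instead of swapping each round.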

inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
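
// The Unpack transposition gathers the x halves of two blocks into one
// vector and the y halves into another, so every vector operation above
// advances two SIMON-128 blocks. The 6-block variants below apply the same
// idea to three vector pairs at once.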

inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
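
// Decryption consumes the subkeys in reverse. An odd round count is handled
// first, undoing the final encryption round (note the swap precedes the f()
// application here, mirroring encryption where it follows it), and the loop
// then peels off two rounds per iteration.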

inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
        y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif

inline void Swap128(__m128i& a, __m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2], and support for swapping it was
    // not added until C++11. SunCC 12.1 - 12.3 fail to consume the swap,
    // while SunCC 12.4 consumes it even without -std=c++11.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}

template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
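
// As on NEON, a byte shuffle (_mm_shuffle_epi8) performs the 8-bit rotate in
// one instruction. _mm_set_epi8 takes arguments from the most significant
// byte down, which is why these tables read reversed relative to the NEON
// maskb arrays.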

inline __m128i SIMON128_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft64<2>(v),
        _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
}

inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1)));

        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
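
// The _mm_loaddup_pd above compiles to a single movddup that broadcasts one
// 64-bit subkey into both lanes; CONST_DOUBLE_CAST and _mm_castpd_si128 only
// reinterpret types and emit no instructions.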

inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecAnd;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint64x2_p RotateLeft64(const uint64x2_p val)
{
    const uint64x2_p m = {C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint64x2_p RotateRight64(const uint64x2_p val)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(val, m);
}
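
// Altivec provides only a rotate-left (vec_rl), so RotateRight64 rotates
// left by the complementary count: a left rotate by 64-C equals a right
// rotate by C.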

inline uint64x2_p SIMON128_f(const uint64x2_p val)
{
    return VecXor(RotateLeft64<2>(val),
        VecAnd(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}
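
// There is no 64-bit unpack step here. The POWER8 path transposes blocks
// with VecPermute and explicit byte masks, with separate tables for big- and
// little-endian targets. The single-block functions below also take one
// uint32x4_p rather than a pair, since this section feeds the 6x1 (not 6x2)
// AdvancedProcessBlocks dispatcher.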

inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

inline void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
        x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2);
        x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

inline void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
        y1 = VecXor(VecXor(y1, rk), SIMON128_f(x1));
        y2 = VecXor(VecXor(y2, rk), SIMON128_f(x2));
        y3 = VecXor(VecXor(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i+1]);
        x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk1);
        x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk1);
        x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk1);

        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i]);
        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk2);
        y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk2);
        y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_POWER8_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END
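
// Usage sketch (illustrative; not part of this file): simon.cpp selects one
// of the routines above at runtime from the library's CPU feature probes,
// roughly as follows. The member names m_rkeys and m_rounds are assumptions
// standing in for the cipher's key schedule state.
//
//   #if (CRYPTOPP_SSSE3_AVAILABLE)
//     if (HasSSSE3())
//         return SIMON128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
//             inBlocks, xorBlocks, outBlocks, length, flags);
//   #endif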