Crypto++  8.0
Free C++ class library of cryptographic schemes
simon64_simd.cpp
1 // simon64_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "simon.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
15 // Do so in both simon.cpp and simon64_simd.cpp.
16 // #undef CRYPTOPP_SSE41_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 # include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if (CRYPTOPP_SSE41_AVAILABLE)
26 # include <smmintrin.h>
27 #endif
28 
29 #if defined(__XOP__)
30 # include <ammintrin.h>
31 #endif
32 
33 #if defined(__AVX512F__)
34 # define CRYPTOPP_AVX512_ROTATE 1
35 # include <immintrin.h>
36 #endif
37 
38 // C1189: error: This header is specific to ARM targets
39 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
40 # include "adv_simd.h"
41 # ifndef _M_ARM64
42 # include <arm_neon.h>
43 # endif
44 #endif
45 
46 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
47 # include <stdint.h>
48 # include <arm_acle.h>
49 #endif
50 
51 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
52 # include "adv_simd.h"
53 # include "ppc_simd.h"
54 #endif
55 
56 // Squash MS LNK4221 and libtool warnings
57 extern const char SIMON64_SIMD_FNAME[] = __FILE__;
58 
59 ANONYMOUS_NAMESPACE_BEGIN
60 
61 using CryptoPP::byte;
62 using CryptoPP::word32;
63 using CryptoPP::word64;
64 using CryptoPP::vec_swap; // SunCC
65 
66 // *************************** ARM NEON ************************** //
67 
68 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
69 
70 template <class T>
71 inline T UnpackHigh32(const T& a, const T& b)
72 {
73  const uint32x2_t x(vget_high_u32((uint32x4_t)a));
74  const uint32x2_t y(vget_high_u32((uint32x4_t)b));
75  const uint32x2x2_t r = vzip_u32(x, y);
76  return (T)vcombine_u32(r.val[0], r.val[1]);
77 }
78 
79 template <class T>
80 inline T UnpackLow32(const T& a, const T& b)
81 {
82  const uint32x2_t x(vget_low_u32((uint32x4_t)a));
83  const uint32x2_t y(vget_low_u32((uint32x4_t)b));
84  const uint32x2x2_t r = vzip_u32(x, y);
85  return (T)vcombine_u32(r.val[0], r.val[1]);
86 }
87 
88 template <unsigned int R>
89 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
90 {
91  const uint32x4_t a(vshlq_n_u32(val, R));
92  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
93  return vorrq_u32(a, b);
94 }
95 
96 template <unsigned int R>
97 inline uint32x4_t RotateRight32(const uint32x4_t& val)
98 {
99  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
100  const uint32x4_t b(vshrq_n_u32(val, R));
101  return vorrq_u32(a, b);
102 }
103 
104 #if defined(__aarch32__) || defined(__aarch64__)
105 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
106 template <>
107 inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
108 {
109 #if (CRYPTOPP_BIG_ENDIAN)
110  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
111  const uint8x16_t mask = vld1q_u8(maskb);
112 #else
113  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
114  const uint8x16_t mask = vld1q_u8(maskb);
115 #endif
116 
117  return vreinterpretq_u32_u8(
118  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
119 }
120 
121 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
122 template <>
123 inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
124 {
125 #if (CRYPTOPP_BIG_ENDIAN)
126  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
127  const uint8x16_t mask = vld1q_u8(maskb);
128 #else
129  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
130  const uint8x16_t mask = vld1q_u8(maskb);
131 #endif
132 
133  return vreinterpretq_u32_u8(
134  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
135 }
136 #endif
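// The two specializations above exploit the fact that rotating a 32-bit word by
// 8 bits only permutes its bytes, so one table lookup (vqtbl1q_u8) replaces the
// shift/shift/or sequence of the generic templates. A minimal scalar sketch of
// the left-rotate case on a little-endian target; RotateLeft8ViaBytes is a
// hypothetical helper used nowhere else, and <cstring> is assumed for memcpy:
inline word32 RotateLeft8ViaBytes(word32 v)
{
    byte in[4], out[4];
    std::memcpy(in, &v, 4);          // little-endian: in[0] is the least significant byte
    out[0] = in[3]; out[1] = in[0];  // same permutation as the first mask group {3,0,1,2}
    out[2] = in[1]; out[3] = in[2];
    word32 r;
    std::memcpy(&r, out, 4);
    return r;                        // equals (v << 8) | (v >> 24)
}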
137 
138 inline uint32x4_t SIMON64_f(const uint32x4_t& val)
139 {
140  return veorq_u32(RotateLeft32<2>(val),
141  vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
142 }
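// SIMON64_f above is the standard Simon round function applied to four 32-bit
// words at once: f(x) = (x <<< 1 & x <<< 8) ^ (x <<< 2), and one Feistel step
// computes y ^= f(x) ^ k. A minimal scalar sketch for a single word; rotl32 and
// SIMON64_f_scalar are hypothetical helpers for illustration only:
inline word32 rotl32(word32 x, unsigned int r)
{
    return static_cast<word32>((x << r) | (x >> (32 - r)));
}

inline word32 SIMON64_f_scalar(word32 x)
{
    // (x <<< 1 & x <<< 8) ^ (x <<< 2), exactly what the vector code computes per lane
    return static_cast<word32>(rotl32(x, 2) ^ (rotl32(x, 1) & rotl32(x, 8)));
}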
143 
144 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
145  const word32 *subkeys, unsigned int rounds)
146 {
147  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
148  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
149  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
150 
151  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
152  {
153  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
154  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
155 
156  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
157  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
158  }
159 
160  if (rounds & 1)
161  {
162  const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1);
163 
164  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
165  std::swap(x1, y1);
166  }
167 
168  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
169  block0 = UnpackLow32(y1, x1);
170  block1 = UnpackHigh32(y1, x1);
171 }
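// Each uint32x4_t argument above carries two 8-byte SIMON-64 blocks, so one call
// encrypts four blocks. vuzpq_u32 de-interleaves the words so the even-indexed
// words collect in y1 and the odd-indexed words in x1, letting one broadcast
// round key act on all four blocks; UnpackLow32/UnpackHigh32 undo the shuffle at
// the end. A plain-array sketch of the forward shuffle; Deinterleave2x64 is a
// hypothetical helper for illustration only:
inline void Deinterleave2x64(const word32 a[4], const word32 b[4],
                             word32 x[4], word32 y[4])
{
    // y receives the even-indexed words (vuzpq .val[0]): [A1 A3 B1 B3]
    y[0] = a[0]; y[1] = a[2]; y[2] = b[0]; y[3] = b[2];
    // x receives the odd-indexed words (vuzpq .val[1]): [A2 A4 B2 B4]
    x[0] = a[1]; x[1] = a[3]; x[2] = b[1]; x[3] = b[3];
}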
172 
173 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
174  const word32 *subkeys, unsigned int rounds)
175 {
176  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
177  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
178  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
179 
180  if (rounds & 1)
181  {
182  std::swap(x1, y1);
183  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
184 
185  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
186  rounds--;
187  }
188 
189  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
190  {
191  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1);
192  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
193 
194  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i);
195  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
196  }
197 
198  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
199  block0 = UnpackLow32(y1, x1);
200  block1 = UnpackHigh32(y1, x1);
201 }
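// Decryption consumes the round keys in reverse. When the number of rounds is
// odd, the last key is applied first after swapping the halves, and the
// remaining keys are then used in reverse-order pairs. A scalar sketch of the
// same control flow for one block, reusing the hypothetical SIMON64_f_scalar
// helper from the earlier sketch (illustration only):
inline void SIMON64_Dec_Scalar(word32 &x, word32 &y,
                               const word32 *subkeys, unsigned int rounds)
{
    if (rounds & 1)
    {
        std::swap(x, y);
        y ^= subkeys[rounds - 1] ^ SIMON64_f_scalar(x);
        rounds--;
    }

    for (int i = static_cast<int>(rounds) - 2; i >= 0; i -= 2)
    {
        x ^= SIMON64_f_scalar(y) ^ subkeys[i + 1];
        y ^= SIMON64_f_scalar(x) ^ subkeys[i];
    }
}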
202 
203 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
204  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
205  const word32 *subkeys, unsigned int rounds)
206 {
207  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
208  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
209  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
210  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
211  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
212  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
213  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
214 
215  for (int i = 0; i < static_cast<int>(rounds & ~1) - 1; i += 2)
216  {
217  const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i);
218  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1);
219  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1);
220  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1);
221 
222  const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1);
223  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2);
224  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2);
225  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2);
226  }
227 
228  if (rounds & 1)
229  {
230  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
231 
232  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk);
233  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk);
234  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk);
235  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
236  }
237 
238  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
239  block0 = UnpackLow32(y1, x1);
240  block1 = UnpackHigh32(y1, x1);
241  block2 = UnpackLow32(y2, x2);
242  block3 = UnpackHigh32(y2, x2);
243  block4 = UnpackLow32(y3, x3);
244  block5 = UnpackHigh32(y3, x3);
245 }
246 
247 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
248  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
249  const word32 *subkeys, unsigned int rounds)
250 {
251  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
252  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
253  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
254  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
255  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
256  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
257  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
258 
259  if (rounds & 1)
260  {
261  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
262  const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1);
263 
264  y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1));
265  y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2));
266  y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3));
267  rounds--;
268  }
269 
270  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
271  {
272  const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1);
273  x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1);
274  x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1);
275  x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1);
276 
277  const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i);
278  y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2);
279  y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2);
280  y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2);
281  }
282 
283  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
284  block0 = UnpackLow32(y1, x1);
285  block1 = UnpackHigh32(y1, x1);
286  block2 = UnpackLow32(y2, x2);
287  block3 = UnpackHigh32(y2, x2);
288  block4 = UnpackLow32(y3, x3);
289  block5 = UnpackHigh32(y3, x3);
290 }
291 
292 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
293 
294 // ***************************** IA-32 ***************************** //
295 
296 #if defined(CRYPTOPP_SSE41_AVAILABLE)
297 
298 inline void Swap128(__m128i& a,__m128i& b)
299 {
300 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
301  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
302 // SunCC 12.1 - 12.3 fail to consume the swap, while SunCC 12.4 consumes it without -std=c++11.
303  vec_swap(a, b);
304 #else
305  std::swap(a, b);
306 #endif
307 }
308 
309 template <unsigned int R>
310 inline __m128i RotateLeft32(const __m128i& val)
311 {
312 #if defined(__XOP__)
313  return _mm_roti_epi32(val, R);
314 #else
315  return _mm_or_si128(
316  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
317 #endif
318 }
319 
320 template <unsigned int R>
321 inline __m128i RotateRight32(const __m128i& val)
322 {
323 #if defined(__XOP__)
324  return _mm_roti_epi32(val, 32-R);
325 #else
326  return _mm_or_si128(
327  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
328 #endif
329 }
330 
331 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
332 template <>
333 __m128i RotateLeft32<8>(const __m128i& val)
334 {
335 #if defined(__XOP__)
336  return _mm_roti_epi32(val, 8);
337 #else
338  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
339  return _mm_shuffle_epi8(val, mask);
340 #endif
341 }
342 
343 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
344 template <>
345 __m128i RotateRight32<8>(const __m128i& val)
346 {
347 #if defined(__XOP__)
348  return _mm_roti_epi32(val, 32-8);
349 #else
350  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
351  return _mm_shuffle_epi8(val, mask);
352 #endif
353 }
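// _mm_set_epi8 lists its arguments from the most significant byte (element 15)
// down to the least significant (element 0), so the masks above are the same
// byte permutations the NEON specializations load with vld1q_u8. Written with
// _mm_setr_epi8, which takes elements 0..15 in memory order, the
// rotate-left-by-8 mask reads as follows; RotateLeft8Mask is a hypothetical
// helper for illustration only:
inline __m128i RotateLeft8Mask()
{
    return _mm_setr_epi8(3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14);
}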
354 
355 inline __m128i SIMON64_f(const __m128i& v)
356 {
357  return _mm_xor_si128(RotateLeft32<2>(v),
358  _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v)));
359 }
360 
361 inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
362  const word32 *subkeys, unsigned int rounds)
363 {
364  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
365  const __m128 t0 = _mm_castsi128_ps(block0);
366  const __m128 t1 = _mm_castsi128_ps(block1);
367  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
368  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
369 
370  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
371  {
372  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
373  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
374 
375  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
376  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
377  }
378 
379  if (rounds & 1)
380  {
381  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
382  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
383  Swap128(x1, y1);
384  }
385 
386  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
387  block0 = _mm_unpacklo_epi32(y1, x1);
388  block1 = _mm_unpackhi_epi32(y1, x1);
389 }
390 
391 inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
392  const word32 *subkeys, unsigned int rounds)
393 {
394  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
395  const __m128 t0 = _mm_castsi128_ps(block0);
396  const __m128 t1 = _mm_castsi128_ps(block1);
397  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
398  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
399 
400  if (rounds & 1)
401  {
402  Swap128(x1, y1);
403  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
404  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
405  rounds--;
406  }
407 
408  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
409  {
410  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
411  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
412 
413  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
414  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
415  }
416 
417  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
418  block0 = _mm_unpacklo_epi32(y1, x1);
419  block1 = _mm_unpackhi_epi32(y1, x1);
420 }
421 
422 inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
423  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
424  const word32 *subkeys, unsigned int rounds)
425 {
426  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
427  const __m128 t0 = _mm_castsi128_ps(block0);
428  const __m128 t1 = _mm_castsi128_ps(block1);
429  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
430  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
431 
432  const __m128 t2 = _mm_castsi128_ps(block2);
433  const __m128 t3 = _mm_castsi128_ps(block3);
434  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
435  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
436 
437  const __m128 t4 = _mm_castsi128_ps(block4);
438  const __m128 t5 = _mm_castsi128_ps(block5);
439  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
440  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
441 
442  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
443  {
444  const __m128i rk1 = _mm_set1_epi32(subkeys[i]);
445  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1);
446  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1);
447  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1);
448 
449  const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]);
450  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2);
451  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2);
452  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2);
453  }
454 
455  if (rounds & 1)
456  {
457  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
458  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk);
459  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk);
460  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk);
461  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
462  }
463 
464  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
465  block0 = _mm_unpacklo_epi32(y1, x1);
466  block1 = _mm_unpackhi_epi32(y1, x1);
467  block2 = _mm_unpacklo_epi32(y2, x2);
468  block3 = _mm_unpackhi_epi32(y2, x2);
469  block4 = _mm_unpacklo_epi32(y3, x3);
470  block5 = _mm_unpackhi_epi32(y3, x3);
471 }
472 
473 inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
474  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
475  const word32 *subkeys, unsigned int rounds)
476 {
477  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
478  const __m128 t0 = _mm_castsi128_ps(block0);
479  const __m128 t1 = _mm_castsi128_ps(block1);
480  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
481  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
482 
483  const __m128 t2 = _mm_castsi128_ps(block2);
484  const __m128 t3 = _mm_castsi128_ps(block3);
485  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
486  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
487 
488  const __m128 t4 = _mm_castsi128_ps(block4);
489  const __m128 t5 = _mm_castsi128_ps(block5);
490  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
491  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
492 
493  if (rounds & 1)
494  {
495  Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
496  const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]);
497  y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1));
498  y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2));
499  y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3));
500  rounds--;
501  }
502 
503  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
504  {
505  const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]);
506  x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1);
507  x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1);
508  x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1);
509 
510  const __m128i rk2 = _mm_set1_epi32(subkeys[i]);
511  y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
512  y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2);
513  y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
514  }
515 
516  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
517  block0 = _mm_unpacklo_epi32(y1, x1);
518  block1 = _mm_unpackhi_epi32(y1, x1);
519  block2 = _mm_unpacklo_epi32(y2, x2);
520  block3 = _mm_unpackhi_epi32(y2, x2);
521  block4 = _mm_unpacklo_epi32(y3, x3);
522  block5 = _mm_unpackhi_epi32(y3, x3);
523 }
524 
525 #endif // CRYPTOPP_SSE41_AVAILABLE
526 
527 // ***************************** Altivec ***************************** //
528 
529 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
530 
531 using CryptoPP::uint8x16_p;
532 using CryptoPP::uint32x4_p;
533 
534 using CryptoPP::VecAnd;
535 using CryptoPP::VecXor;
536 using CryptoPP::VecLoad;
537 using CryptoPP::VecLoadBE;
538 using CryptoPP::VecPermute;
539 
540 // Rotate left by bit count
541 template<unsigned int C>
542 inline uint32x4_p RotateLeft32(const uint32x4_p val)
543 {
544  const uint32x4_p m = {C, C, C, C};
545  return vec_rl(val, m);
546 }
547 
548 // Rotate right by bit count
549 template<unsigned int C>
550 inline uint32x4_p RotateRight32(const uint32x4_p val)
551 {
552  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
553  return vec_rl(val, m);
554 }
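// vec_rl always rotates left by the per-element amounts in its second operand,
// so the right rotation above is expressed as a left rotation by 32-C bits;
// for instance RotateRight32<8>(v) equals RotateLeft32<24>(v). A minimal sketch
// of the identity; RotateRightViaLeft32 is a hypothetical helper for
// illustration only:
template<unsigned int C>
inline uint32x4_p RotateRightViaLeft32(const uint32x4_p val)
{
    return RotateLeft32<32-C>(val);
}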
555 
556 inline uint32x4_p SIMON64_f(const uint32x4_p val)
557 {
558  return VecXor(RotateLeft32<2>(val),
559  VecAnd(RotateLeft32<1>(val), RotateLeft32<8>(val)));
560 }
561 
562 inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
563  const word32 *subkeys, unsigned int rounds)
564 {
565 #if (CRYPTOPP_BIG_ENDIAN)
566  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
567  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
568 #else
569  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
570  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
571 #endif
572 
573  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
574  uint32x4_p x1 = VecPermute(block0, block1, m1);
575  uint32x4_p y1 = VecPermute(block0, block1, m2);
576 
577  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
578  {
579 #if CRYPTOPP_POWER7_AVAILABLE
580  const uint32x4_p rk1 = vec_splats(subkeys[i]);
581  const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
582 #else
583  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
584  uint32x4_p rk1 = VecLoad(subkeys+i);
585  uint32x4_p rk2 = VecLoad(subkeys+i+1);
586  rk1 = VecPermute(rk1, rk1, m);
587  rk2 = VecPermute(rk2, rk2, m);
588 #endif
589  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
590  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
591  }
592 
593  if (rounds & 1)
594  {
595 #if CRYPTOPP_POWER7_AVAILABLE
596  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
597 #else
598  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
599  uint32x4_p rk = VecLoad(subkeys+rounds-1);
600  rk = VecPermute(rk, rk, m);
601 #endif
602  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
603  std::swap(x1, y1);
604  }
605 
606 #if (CRYPTOPP_BIG_ENDIAN)
607  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
608  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
609 #else
610  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
611  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
612 #endif
613 
614  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
615  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
616  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
617 }
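// In the loop above, POWER7 and later broadcast the round key with vec_splats;
// the fallback path loads the key with VecLoad and replicates its first 32-bit
// lane using the {0,1,2,3, 0,1,2,3, ...} permute mask. Either way, every lane of
// the round-key vector ends up holding the same subkey. A minimal sketch of that
// broadcast; BroadcastSubkey is a hypothetical helper for illustration only, and
// the braced vector initializer is assumed to be supported by the compiler:
inline uint32x4_p BroadcastSubkey(word32 k)
{
    const uint32x4_p r = {k, k, k, k};   // all four lanes hold the same 32-bit subkey
    return r;
}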
618 
619 inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
620  const word32 *subkeys, unsigned int rounds)
621 {
622 #if (CRYPTOPP_BIG_ENDIAN)
623  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
624  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
625 #else
626  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
627  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
628 #endif
629 
630  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
631  uint32x4_p x1 = VecPermute(block0, block1, m1);
632  uint32x4_p y1 = VecPermute(block0, block1, m2);
633 
634  if (rounds & 1)
635  {
636  std::swap(x1, y1);
637 #if CRYPTOPP_POWER7_AVAILABLE
638  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
639 #else
640  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
641  uint32x4_p rk = VecLoad(subkeys+rounds-1);
642  rk = VecPermute(rk, rk, m);
643 #endif
644  y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
645  rounds--;
646  }
647 
648  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
649  {
650 #if CRYPTOPP_POWER7_AVAILABLE
651  const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
652  const uint32x4_p rk2 = vec_splats(subkeys[i]);
653 #else
654  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
655  uint32x4_p rk1 = VecLoad(subkeys+i+1);
656  uint32x4_p rk2 = VecLoad(subkeys+i);
657  rk1 = VecPermute(rk1, rk1, m);
658  rk2 = VecPermute(rk2, rk2, m);
659 #endif
660  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
661  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
662  }
663 
664 #if (CRYPTOPP_BIG_ENDIAN)
665  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
666  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
667 #else
668  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
669  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
670 #endif
671 
672  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
673  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
674  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
675 }
676 
677 inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
678  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
679  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
680 {
681 #if (CRYPTOPP_BIG_ENDIAN)
682  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
683  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
684 #else
685  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
686  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
687 #endif
688 
689  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
690  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
691  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
692  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
693  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
694  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
695  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
696 
697  for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
698  {
699 #if CRYPTOPP_POWER7_AVAILABLE
700  const uint32x4_p rk1 = vec_splats(subkeys[i]);
701  const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
702 #else
703  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
704  uint32x4_p rk1 = VecLoad(subkeys+i);
705  uint32x4_p rk2 = VecLoad(subkeys+i+1);
706  rk1 = VecPermute(rk1, rk1, m);
707  rk2 = VecPermute(rk2, rk2, m);
708 #endif
709  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk1);
710  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk1);
711  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk1);
712 
713  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk2);
714  x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk2);
715  x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk2);
716  }
717 
718  if (rounds & 1)
719  {
720 #if CRYPTOPP_POWER7_AVAILABLE
721  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
722 #else
723  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
724  uint32x4_p rk = VecLoad(subkeys+rounds-1);
725  rk = VecPermute(rk, rk, m);
726 #endif
727  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk);
728  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk);
729  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk);
730  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
731  }
732 
733 #if (CRYPTOPP_BIG_ENDIAN)
734  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
735  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
736 #else
737  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
738  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
739 #endif
740 
741  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
742  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
743  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
744  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
745  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
746  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
747  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
748 }
749 
750 inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
751  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
752  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
753 {
754 #if (CRYPTOPP_BIG_ENDIAN)
755  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
756  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
757 #else
758  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
759  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
760 #endif
761 
762  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
763  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
764  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
765  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
766  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
767  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
768  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
769 
770  if (rounds & 1)
771  {
772  std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
773 
774 #if CRYPTOPP_POWER7_AVAILABLE
775  const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
776 #else
777  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
778  uint32x4_p rk = VecLoad(subkeys+rounds-1);
779  rk = VecPermute(rk, rk, m);
780 #endif
781  y1 = VecXor(VecXor(y1, rk), SIMON64_f(x1));
782  y2 = VecXor(VecXor(y2, rk), SIMON64_f(x2));
783  y3 = VecXor(VecXor(y3, rk), SIMON64_f(x3));
784  rounds--;
785  }
786 
787  for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
788  {
789 #if CRYPTOPP_POWER7_AVAILABLE
790  const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
791  const uint32x4_p rk2 = vec_splats(subkeys[i]);
792 #else
793  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
794  uint32x4_p rk1 = VecLoad(subkeys+i+1);
795  uint32x4_p rk2 = VecLoad(subkeys+i);
796  rk1 = VecPermute(rk1, rk1, m);
797  rk2 = VecPermute(rk2, rk2, m);
798 #endif
799  x1 = VecXor(VecXor(x1, SIMON64_f(y1)), rk1);
800  x2 = VecXor(VecXor(x2, SIMON64_f(y2)), rk1);
801  x3 = VecXor(VecXor(x3, SIMON64_f(y3)), rk1);
802 
803  y1 = VecXor(VecXor(y1, SIMON64_f(x1)), rk2);
804  y2 = VecXor(VecXor(y2, SIMON64_f(x2)), rk2);
805  y3 = VecXor(VecXor(y3, SIMON64_f(x3)), rk2);
806  }
807 
808 #if (CRYPTOPP_BIG_ENDIAN)
809  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
810  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
811 #else
812  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
813  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
814 #endif
815 
816  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
817  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
818  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
819  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
820  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
821  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
822  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
823 }
824 
825 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
826 
827 ANONYMOUS_NAMESPACE_END
828 
829 ///////////////////////////////////////////////////////////////////////
830 
831 NAMESPACE_BEGIN(CryptoPP)
832 
833 // *************************** ARM NEON **************************** //
834 
835 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
836 size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
837  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
838 {
839  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
840  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
841 }
842 
843 size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
844  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
845 {
846  return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
847  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
848 }
849 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
850 
851 // ***************************** IA-32 ***************************** //
852 
853 #if defined(CRYPTOPP_SSE41_AVAILABLE)
854 size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
855  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
856 {
857  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
858  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
859 }
860 
861 size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
862  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
863 {
864  return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
865  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
866 }
867 #endif
868 
869 // ***************************** Altivec ***************************** //
870 
871 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
872 size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
873  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
874 {
875  return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
876  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
877 }
878 
879 size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
880  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
881 {
882  return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
883  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
884 }
885 #endif
886 
887 NAMESPACE_END
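// A minimal usage sketch of one of the dispatch functions above. Assumptions,
// for illustration only: the round keys come from SIMON64's key schedule in
// simon.cpp, 44 rounds corresponds to SIMON-64/128, flags of 0 requests no
// special processing options (the flag bits are defined in cryptlib.h and
// consumed in adv_simd.h), and the return value is the number of input bytes
// left unprocessed.
#if defined(CRYPTOPP_SSE41_AVAILABLE)
void Simon64EncryptSketch(const CryptoPP::word32 subkeys[44],
    const CryptoPP::byte* in, CryptoPP::byte* out, size_t length)
{
    const size_t left = CryptoPP::SIMON64_Enc_AdvancedProcessBlocks_SSE41(
        subkeys, 44, in, NULLPTR, out, length, 0);
    CRYPTOPP_UNUSED(left);   // expected to be 0 when every 8-byte block was consumed
}
#endif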