speck128_simd.cpp
// speck128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
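//
// As an illustration (not the library's exact build rules), a GNU-style build
// might compile this translation unit with -mssse3 for the x86 paths,
// -march=armv8-a for the ARMv8/NEON paths, or -mcpu=power8 for the Altivec
// paths.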

#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"
#include "adv_simd.h"

#ifndef CRYPTOPP_INLINE
# if defined(CRYPTOPP_DEBUG)
#  define CRYPTOPP_INLINE static
# else
#  define CRYPTOPP_INLINE inline
# endif
#endif

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck128_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

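// AVX-512VL provides native 64-bit rotate instructions that also operate on
// 128-bit registers (_mm_rol_epi64 and _mm_ror_epi64). When both AVX512F and
// AVX512VL are available, the rotate helpers below use them instead of a
// shift/shift/or sequence.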
#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
// compilers don't follow ACLE conventions for the include.
#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

template <class T>
CRYPTOPP_INLINE T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
CRYPTOPP_INLINE T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
CRYPTOPP_INLINE uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
CRYPTOPP_INLINE uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}

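// A rotation by 8 bits moves whole bytes, so on ARMv8 it can be performed
// with a single byte-wise table lookup (vqtbl1q_u8) instead of two shifts
// and an OR. The masks below list, for each destination byte position, the
// source byte to copy; each 64-bit lane is permuted independently.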
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
CRYPTOPP_INLINE uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
CRYPTOPP_INLINE uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif

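// SPECK-128 round function, applied to each 64-bit lane:
//   x = (RotateRight64<8>(x) + y) ^ k
//   y = RotateLeft64<3>(y) ^ x
// The decryption routines below apply the inverse steps in reverse key order.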
CRYPTOPP_INLINE void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x1 = vaddq_u64(x1, y1);
        x1 = veorq_u64(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = veorq_u64(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

CRYPTOPP_INLINE void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = vaddq_u64(x1, y1);
        x2 = vaddq_u64(x2, y2);
        x3 = vaddq_u64(x3, y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

CRYPTOPP_INLINE void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = veorq_u64(x1, rk);
        x1 = vsubq_u64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}

CRYPTOPP_INLINE void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif
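// CONST_DOUBLE_CAST is used below to broadcast a 64-bit round key to both
// lanes with _mm_loaddup_pd followed by _mm_castpd_si128; routing the pointer
// cast through 'void*' helps avoid aliasing-related warnings on some
// compilers (see the link above).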

template <unsigned int R>
CRYPTOPP_INLINE __m128i RotateLeft64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
CRYPTOPP_INLINE __m128i RotateRight64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}

CRYPTOPP_INLINE void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x1 = _mm_add_epi64(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

CRYPTOPP_INLINE void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = _mm_add_epi64(x1, y1);
        x2 = _mm_add_epi64(x2, y2);
        x3 = _mm_add_epi64(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

CRYPTOPP_INLINE void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

CRYPTOPP_INLINE void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x2 = _mm_sub_epi64(x2, y2);
        x3 = _mm_sub_epi64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
CRYPTOPP_INLINE uint64x2_p RotateLeft64(const uint64x2_p val)
{
    const uint64x2_p m = {C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
CRYPTOPP_INLINE uint64x2_p RotateRight64(const uint64x2_p val)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(val, m);
}
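
// Altivec only provides a rotate-left (vec_rl), so RotateRight64 rotates
// left by 64-C instead. The permute masks used below index into the 32-byte
// concatenation of the two VecPermute operands: byte indices 0..15 select
// from the first vector and 16..31 from the second, which is how the 64-bit
// halves of each block are interleaved and de-interleaved.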

void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        x1 = RotateRight64<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft64<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        y1 = VecXor(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);
        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);
        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_POWER8_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

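// Each entry point below simply hands the single-block and six-block workers
// defined above to the corresponding AdvancedProcessBlocks128_* template in
// adv_simd.h, which takes care of loading the input blocks, applying
// xorBlocks when requested by flags, and writing the results to outBlocks.
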
// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END