Crypto++ 8.1
Free C++ class library of cryptographic schemes
speck128_simd.cpp
// speck128_simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.

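// Representative, toolchain-dependent flags for this file (examples only;
// the build system is authoritative):
//   GCC/Clang on x86/x64:  -mssse3   (plus -mxop or -mavx512f where used)
//   GCC/Clang on ARMv8:    -march=armv8-a
//   GCC/Clang on POWER8:   -mcpu=power8 -maltivec
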
#include "pch.h"
#include "config.h"

#include "speck.h"
#include "misc.h"

// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both speck.cpp and speck128_simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
#endif

#if defined(__AVX512F__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif
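// Note (not from the original source): the 128-bit rotate intrinsics
// _mm_rol_epi64/_mm_ror_epi64 used below are documented as requiring
// AVX-512VL in addition to AVX-512F, so some toolchains may also want
// the guard to test __AVX512VL__.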

// C1189: error: This header is specific to ARM targets
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "adv_simd.h"
# ifndef _M_ARM64
# include <arm_neon.h>
# endif
#endif

#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SPECK128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

// Missing from Microsoft's ARM A-32 implementation
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif

template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}

#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
#else
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);
#endif

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif
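// Why a table lookup works here (explanatory note, not from the original
// file): rotating a 64-bit lane by a multiple of 8 bits only moves whole
// bytes, so a single vqtbl1q_u8 can do it. For RotateLeft64<8> on a
// little-endian lane, destination byte j takes source byte (j+7) mod 8,
// which is exactly the {7,0,1,2,...} pattern in the mask above.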

inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x1 = vaddq_u64(x1, y1);
        x1 = veorq_u64(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = veorq_u64(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
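// For reference, each 64-bit lane above performs the scalar SPECK-128
// encryption round with round key k:
//
//   x = (ROTR64(x, 8) + y) ^ k;
//   y =  ROTL64(y, 3) ^ x;
//
// The unpack steps only regroup the two blocks so that both x words share
// one vector and both y words share another.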

inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = vaddq_u64(x1, y1);
        x2 = vaddq_u64(x2, y2);
        x3 = vaddq_u64(x3, y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
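// The 6-block variant keeps three independent (x, y) vector pairs in flight,
// each pair covering two blocks, so the rotate/add/xor latencies of one pair
// can overlap with work on the others on out-of-order cores. The "6x2" in
// AdvancedProcessBlocks128_6x2_NEON below names the block counts of the two
// kernels it is given: this 6-block routine and the 2-block routine above.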

inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = veorq_u64(x1, rk);
        x1 = vsubq_u64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
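// For reference, this is the scalar inverse round, applied with the round
// keys in reverse order:
//
//   y = ROTR64(y ^ x, 3);
//   x = ROTL64((x ^ k) - y, 8);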

inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);

        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)

// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif
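// These casts exist so the 64-bit subkeys can be handed to intrinsics that
// take __m128i* or double* (for example the _mm_loaddup_pd loads below)
// without tripping Clang and GCC cast diagnostics; they do not change bytes.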

template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(CRYPTOPP_AVX512_ROTATE)
    return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
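// Note on the masks (explanatory, not from the original file): _mm_set_epi8
// lists bytes from the most significant (byte 15) down to byte 0, so these
// are the same byte-rotation patterns as the little-endian NEON masks above,
// just written in reverse argument order for _mm_shuffle_epi8.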

inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x1 = _mm_add_epi64(x1, y1);
        x1 = _mm_xor_si128(x1, rk);
        y1 = RotateLeft64<3>(y1);
        y1 = _mm_xor_si128(y1, x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
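// The _mm_loaddup_pd(...) in these round loops typically compiles to a single
// MOVDDUP that loads the 64-bit round key and duplicates it into both lanes;
// the cast through double (CONST_DOUBLE_CAST) only satisfies the intrinsic's
// pointer type and does not reinterpret the key bits.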

inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = _mm_add_epi64(x1, y1);
        x2 = _mm_add_epi64(x2, y2);
        x3 = _mm_add_epi64(x3, y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = _mm_xor_si128(x1, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}

inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));

        y1 = _mm_xor_si128(y1, x1);
        y2 = _mm_xor_si128(y2, x2);
        y3 = _mm_xor_si128(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        x1 = _mm_xor_si128(x1, rk);
        x2 = _mm_xor_si128(x2, rk);
        x3 = _mm_xor_si128(x3, rk);
        x1 = _mm_sub_epi64(x1, y1);
        x2 = _mm_sub_epi64(x2, y2);
        x3 = _mm_sub_epi64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;

using CryptoPP::VecAdd;
using CryptoPP::VecSub;
using CryptoPP::VecXor;
using CryptoPP::VecPermute;

// Rotate left by bit count
template<unsigned int C>
inline uint64x2_p RotateLeft64(const uint64x2_p val)
{
    const uint64x2_p m = {C, C};
    return vec_rl(val, m);
}

// Rotate right by bit count
template<unsigned int C>
inline uint64x2_p RotateRight64(const uint64x2_p val)
{
    const uint64x2_p m = {64-C, 64-C};
    return vec_rl(val, m);
}

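// Altivec's vec_rl only rotates left, so RotateRight64<C> is expressed as a
// left rotation by 64-C (rotation counts are taken modulo the element width).
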
void SPECK128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        x1 = RotateRight64<8>(x1);
        x1 = VecAdd(x1, y1);
        x1 = VecXor(x1, rk);

        y1 = RotateLeft64<3>(y1);
        y1 = VecXor(y1, x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}
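// About the permute masks (explanatory note): VecPermute selects bytes 0..15
// from its first operand and 16..31 from its second, so m1/m2 gather the x
// and y 64-bit words of the block(s), fixing up the byte order of each word
// for the host endianness, while m3/m4 reassemble the output blocks.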

void SPECK128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block, block, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block, block, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        y1 = VecXor(y1, x1);
        y1 = RotateRight64<3>(y1);
        x1 = VecXor(x1, rk);
        x1 = VecSub(x1, y1);
        x1 = RotateLeft64<8>(x1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    //const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    //const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}

void SPECK128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i=0; i < static_cast<int>(rounds); ++i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        x1 = RotateRight64<8>(x1);
        x2 = RotateRight64<8>(x2);
        x3 = RotateRight64<8>(x3);
        x1 = VecAdd(x1, y1);
        x2 = VecAdd(x2, y2);
        x3 = VecAdd(x3, y3);
        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);

        y1 = RotateLeft64<3>(y1);
        y2 = RotateLeft64<3>(y2);
        y3 = RotateLeft64<3>(y3);
        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

void SPECK128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_p x1 = (uint64x2_p)VecPermute(block0, block1, m1);
    uint64x2_p y1 = (uint64x2_p)VecPermute(block0, block1, m2);
    uint64x2_p x2 = (uint64x2_p)VecPermute(block2, block3, m1);
    uint64x2_p y2 = (uint64x2_p)VecPermute(block2, block3, m2);
    uint64x2_p x3 = (uint64x2_p)VecPermute(block4, block5, m1);
    uint64x2_p y3 = (uint64x2_p)VecPermute(block4, block5, m2);

    for (int i = static_cast<int>(rounds-1); i >= 0; --i)
    {
        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[i]);

        y1 = VecXor(y1, x1);
        y2 = VecXor(y2, x2);
        y3 = VecXor(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);

        x1 = VecXor(x1, rk);
        x2 = VecXor(x2, rk);
        x3 = VecXor(x3, rk);
        x1 = VecSub(x1, y1);
        x2 = VecSub(x2, y2);
        x3 = VecSub(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_POWER8_AVAILABLE

ANONYMOUS_NAMESPACE_END

///////////////////////////////////////////////////////////////////////

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON **************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
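// For context, a sketch (an assumption, not code from this file) of how
// speck.cpp dispatches to these wrappers after its runtime CPU feature
// checks; the member names m_rkeys and m_rounds are illustrative:
//
//   #if (CRYPTOPP_ARM_NEON_AVAILABLE)
//       if (HasNEON())
//           return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
//               inBlocks, xorBlocks, outBlocks, length, flags);
//   #endif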

// ***************************** IA-32 ***************************** //

#if defined(CRYPTOPP_SSSE3_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_SSSE3_AVAILABLE

// ***************************** Power8 ***************************** //

#if defined(CRYPTOPP_POWER8_AVAILABLE)
size_t SPECK128_Enc_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SPECK128_Dec_AdvancedProcessBlocks_POWER8(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x1_ALTIVEC(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_POWER8_AVAILABLE

NAMESPACE_END