Crypto++ 7.0
Free C++ class library of cryptographic schemes
speck64_simd.cpp
1 // speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
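//
// The SIMD kernels below all evaluate the same SPECK-64 round function.
// Two 128-bit vectors carry four 64-bit blocks; each block's two 32-bit
// words, x and y, are deinterleaved into separate vectors so the round
// can be applied lane-wise:
//   encrypt: x = (RotateRight32<8>(x) + y) ^ k;  y = RotateLeft32<3>(y) ^ x
//   decrypt: y = RotateRight32<3>(y ^ x);  x = RotateLeft32<8>((x ^ k) - y)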
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "speck.h"
12 #include "misc.h"
13 #include "adv_simd.h"
14 
15 // Uncomment for benchmarking C++ against SSE or NEON.
16 // Do so in both speck.cpp and speck64_simd.cpp.
17 // #undef CRYPTOPP_SSE41_AVAILABLE
18 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
19 
20 #if (CRYPTOPP_SSSE3_AVAILABLE)
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if (CRYPTOPP_SSE41_AVAILABLE)
26 # include <smmintrin.h>
27 #endif
28 
29 #if defined(__XOP__)
30 # include <ammintrin.h>
31 #endif
32 
33 #if defined(__AVX512F__) && defined(__AVX512VL__)
34 # define CRYPTOPP_AVX512_ROTATE 1
35 # include <immintrin.h>
36 #endif
37 
38 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
39 # include <arm_neon.h>
40 #endif
41 
42 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
43 // compilers don't follow ACLE conventions for the include.
44 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
45 # include <stdint.h>
46 # include <arm_acle.h>
47 #endif
48 
49 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
50 # include "ppc_simd.h"
51 #endif
52 
53 #ifndef CRYPTOPP_INLINE
54 # if defined(CRYPTOPP_DEBUG)
55 # define CRYPTOPP_INLINE static
56 # else
57 # define CRYPTOPP_INLINE inline
58 # endif
59 #endif
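// Using 'static' rather than 'inline' in debug builds is presumably meant to
// keep each helper as its own out-of-line symbol, which makes the code easier
// to step through and profile when optimizations are off.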
60 
61 // Squash MS LNK4221 and libtool warnings
62 extern const char SPECK64_SIMD_FNAME[] = __FILE__;
63 
64 ANONYMOUS_NAMESPACE_BEGIN
65 
66 using CryptoPP::byte;
67 using CryptoPP::word32;
68 using CryptoPP::word64;
69 
70 // *************************** ARM NEON ************************** //
71 
72 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
73 
74 template <class T>
75 CRYPTOPP_INLINE T UnpackHigh32(const T& a, const T& b)
76 {
77  const uint32x2_t x(vget_high_u32((uint32x4_t)a));
78  const uint32x2_t y(vget_high_u32((uint32x4_t)b));
79  const uint32x2x2_t r = vzip_u32(x, y);
80  return (T)vcombine_u32(r.val[0], r.val[1]);
81 }
82 
83 template <class T>
84 CRYPTOPP_INLINE T UnpackLow32(const T& a, const T& b)
85 {
86  const uint32x2_t x(vget_low_u32((uint32x4_t)a));
87  const uint32x2_t y(vget_low_u32((uint32x4_t)b));
88  const uint32x2x2_t r = vzip_u32(x, y);
89  return (T)vcombine_u32(r.val[0], r.val[1]);
90 }
91 
92 template <unsigned int R>
93 CRYPTOPP_INLINE uint32x4_t RotateLeft32(const uint32x4_t& val)
94 {
95  const uint32x4_t a(vshlq_n_u32(val, R));
96  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
97  return vorrq_u32(a, b);
98 }
99 
100 template <unsigned int R>
101 CRYPTOPP_INLINE uint32x4_t RotateRight32(const uint32x4_t& val)
102 {
103  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
104  const uint32x4_t b(vshrq_n_u32(val, R));
105  return vorrq_u32(a, b);
106 }
107 
108 #if defined(__aarch32__) || defined(__aarch64__)
109 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
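// Rotating each 32-bit lane by 8 bits moves whole bytes, so one table
// lookup (vqtbl1q_u8) with a fixed byte mask replaces the shift/shift/or
// sequence across all four lanes.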
110 template <>
111 CRYPTOPP_INLINE uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
112 {
113 #if (CRYPTOPP_BIG_ENDIAN)
114  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
115  const uint8x16_t mask = vld1q_u8(maskb);
116 #else
117  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
118  const uint8x16_t mask = vld1q_u8(maskb);
119 #endif
120 
121  return vreinterpretq_u32_u8(
122  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
123 }
124 
125 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
126 template <>
127 CRYPTOPP_INLINE uint32x4_t RotateRight32<8>(const uint32x4_t& val)
128 {
129 #if (CRYPTOPP_BIG_ENDIAN)
130  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
131  const uint8x16_t mask = vld1q_u8(maskb);
132 #else
133  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
134  const uint8x16_t mask = vld1q_u8(maskb);
135 #endif
136 
137  return vreinterpretq_u32_u8(
138  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
139 }
140 #endif // Aarch32 or Aarch64
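// The block kernels below deinterleave with vuzpq_u32 on entry (even-indexed
// words into y, odd-indexed words into x) and re-interleave with
// UnpackLow32/UnpackHigh32 (vzip) on exit to restore the original block order.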
141 
142 CRYPTOPP_INLINE void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
143  const word32 *subkeys, unsigned int rounds)
144 {
145  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
146  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
147  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
148 
149  for (int i=0; i < static_cast<int>(rounds); ++i)
150  {
151  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
152 
153  x1 = RotateRight32<8>(x1);
154  x1 = vaddq_u32(x1, y1);
155  x1 = veorq_u32(x1, rk);
156  y1 = RotateLeft32<3>(y1);
157  y1 = veorq_u32(y1, x1);
158  }
159 
160  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
161  block0 = UnpackLow32(y1, x1);
162  block1 = UnpackHigh32(y1, x1);
163 }
164 
165 CRYPTOPP_INLINE void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
166  const word32 *subkeys, unsigned int rounds)
167 {
168  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
169  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
170  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
171 
172  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
173  {
174  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
175 
176  y1 = veorq_u32(y1, x1);
177  y1 = RotateRight32<3>(y1);
178  x1 = veorq_u32(x1, rk);
179  x1 = vsubq_u32(x1, y1);
180  x1 = RotateLeft32<8>(x1);
181  }
182 
183  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
184  block0 = UnpackLow32(y1, x1);
185  block1 = UnpackHigh32(y1, x1);
186 }
187 
188 CRYPTOPP_INLINE void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
189  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
190  const word32 *subkeys, unsigned int rounds)
191 {
192  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
193  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
194  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
195  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
196  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
197  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
198  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
199 
200  for (int i=0; i < static_cast<int>(rounds); ++i)
201  {
202  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
203 
204  x1 = RotateRight32<8>(x1);
205  x2 = RotateRight32<8>(x2);
206  x3 = RotateRight32<8>(x3);
207  x1 = vaddq_u32(x1, y1);
208  x2 = vaddq_u32(x2, y2);
209  x3 = vaddq_u32(x3, y3);
210  x1 = veorq_u32(x1, rk);
211  x2 = veorq_u32(x2, rk);
212  x3 = veorq_u32(x3, rk);
213  y1 = RotateLeft32<3>(y1);
214  y2 = RotateLeft32<3>(y2);
215  y3 = RotateLeft32<3>(y3);
216  y1 = veorq_u32(y1, x1);
217  y2 = veorq_u32(y2, x2);
218  y3 = veorq_u32(y3, x3);
219  }
220 
221  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
222  block0 = UnpackLow32(y1, x1);
223  block1 = UnpackHigh32(y1, x1);
224  block2 = UnpackLow32(y2, x2);
225  block3 = UnpackHigh32(y2, x2);
226  block4 = UnpackLow32(y3, x3);
227  block5 = UnpackHigh32(y3, x3);
228 }
229 
230 CRYPTOPP_INLINE void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
231  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
232  const word32 *subkeys, unsigned int rounds)
233 {
234  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
235  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
236  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
237  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
238  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
239  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
240  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
241 
242  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
243  {
244  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
245 
246  y1 = veorq_u32(y1, x1);
247  y2 = veorq_u32(y2, x2);
248  y3 = veorq_u32(y3, x3);
249  y1 = RotateRight32<3>(y1);
250  y2 = RotateRight32<3>(y2);
251  y3 = RotateRight32<3>(y3);
252  x1 = veorq_u32(x1, rk);
253  x2 = veorq_u32(x2, rk);
254  x3 = veorq_u32(x3, rk);
255  x1 = vsubq_u32(x1, y1);
256  x2 = vsubq_u32(x2, y2);
257  x3 = vsubq_u32(x3, y3);
258  x1 = RotateLeft32<8>(x1);
259  x2 = RotateLeft32<8>(x2);
260  x3 = RotateLeft32<8>(x3);
261  }
262 
263  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
264  block0 = UnpackLow32(y1, x1);
265  block1 = UnpackHigh32(y1, x1);
266  block2 = UnpackLow32(y2, x2);
267  block3 = UnpackHigh32(y2, x2);
268  block4 = UnpackLow32(y3, x3);
269  block5 = UnpackHigh32(y3, x3);
270 }
271 
272 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
273 
274 // ***************************** IA-32 ***************************** //
275 
276 #if defined(CRYPTOPP_SSE41_AVAILABLE)
277 
278 template <unsigned int R>
279 CRYPTOPP_INLINE __m128i RotateLeft32(const __m128i& val)
280 {
281 #if defined(__XOP__)
282  return _mm_roti_epi32(val, R);
283 #else
284  return _mm_or_si128(
285  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
286 #endif
287 }
288 
289 template <unsigned int R>
290 CRYPTOPP_INLINE __m128i RotateRight32(const __m128i& val)
291 {
292 #if defined(__XOP__)
293  return _mm_roti_epi32(val, 32-R);
294 #else
295  return _mm_or_si128(
296  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
297 #endif
298 }
299 
300 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
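// Same byte-rotation trick as the NEON code, using _mm_shuffle_epi8 (SSSE3).
// _mm_set_epi8 lists bytes from the most significant element down, which is
// why the mask appears reversed relative to the NEON table.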
301 template <>
302 __m128i RotateLeft32<8>(const __m128i& val)
303 {
304 #if defined(__XOP__)
305  return _mm_roti_epi32(val, 8);
306 #else
307  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
308  return _mm_shuffle_epi8(val, mask);
309 #endif
310 }
311 
312 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
313 template <>
314 __m128i RotateRight32<8>(const __m128i& val)
315 {
316 #if defined(__XOP__)
317  return _mm_roti_epi32(val, 32-8);
318 #else
319  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
320  return _mm_shuffle_epi8(val, mask);
321 #endif
322 }
323 
324 CRYPTOPP_INLINE void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
325  const word32 *subkeys, unsigned int rounds)
326 {
327  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
328  const __m128 t0 = _mm_castsi128_ps(block0);
329  const __m128 t1 = _mm_castsi128_ps(block1);
330  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
331  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
332 
333  for (int i=0; i < static_cast<int>(rounds); ++i)
334  {
335  const __m128i rk = _mm_set1_epi32(subkeys[i]);
336 
337  x1 = RotateRight32<8>(x1);
338  x1 = _mm_add_epi32(x1, y1);
339  x1 = _mm_xor_si128(x1, rk);
340  y1 = RotateLeft32<3>(y1);
341  y1 = _mm_xor_si128(y1, x1);
342  }
343 
344  // This is roughly the SSE equivalent of ARM vzip.32
345  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
346  block0 = _mm_unpacklo_epi32(y1, x1);
347  block1 = _mm_unpackhi_epi32(y1, x1);
348 }
349 
350 CRYPTOPP_INLINE void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
351  const word32 *subkeys, unsigned int rounds)
352 {
353  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
354  const __m128 t0 = _mm_castsi128_ps(block0);
355  const __m128 t1 = _mm_castsi128_ps(block1);
356  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
357  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
358 
359  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
360  {
361  const __m128i rk = _mm_set1_epi32(subkeys[i]);
362 
363  y1 = _mm_xor_si128(y1, x1);
364  y1 = RotateRight32<3>(y1);
365  x1 = _mm_xor_si128(x1, rk);
366  x1 = _mm_sub_epi32(x1, y1);
367  x1 = RotateLeft32<8>(x1);
368  }
369 
370  // This is roughly the SSE equivalent of ARM vzip.32
371  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
372  block0 = _mm_unpacklo_epi32(y1, x1);
373  block1 = _mm_unpackhi_epi32(y1, x1);
374 }
375 
376 CRYPTOPP_INLINE void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
377  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
378  const word32 *subkeys, unsigned int rounds)
379 {
380  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
381  const __m128 t0 = _mm_castsi128_ps(block0);
382  const __m128 t1 = _mm_castsi128_ps(block1);
383  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
384  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
385 
386  const __m128 t2 = _mm_castsi128_ps(block2);
387  const __m128 t3 = _mm_castsi128_ps(block3);
388  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
389  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
390 
391  const __m128 t4 = _mm_castsi128_ps(block4);
392  const __m128 t5 = _mm_castsi128_ps(block5);
393  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
394  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
395 
396  for (int i=0; i < static_cast<int>(rounds); ++i)
397  {
398  const __m128i rk = _mm_set1_epi32(subkeys[i]);
399 
400  x1 = RotateRight32<8>(x1);
401  x2 = RotateRight32<8>(x2);
402  x3 = RotateRight32<8>(x3);
403  x1 = _mm_add_epi32(x1, y1);
404  x2 = _mm_add_epi32(x2, y2);
405  x3 = _mm_add_epi32(x3, y3);
406  x1 = _mm_xor_si128(x1, rk);
407  x2 = _mm_xor_si128(x2, rk);
408  x3 = _mm_xor_si128(x3, rk);
409  y1 = RotateLeft32<3>(y1);
410  y2 = RotateLeft32<3>(y2);
411  y3 = RotateLeft32<3>(y3);
412  y1 = _mm_xor_si128(y1, x1);
413  y2 = _mm_xor_si128(y2, x2);
414  y3 = _mm_xor_si128(y3, x3);
415  }
416 
417  // This is roughly the SSE equivalent of ARM vzip.32
418  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
419  block0 = _mm_unpacklo_epi32(y1, x1);
420  block1 = _mm_unpackhi_epi32(y1, x1);
421  block2 = _mm_unpacklo_epi32(y2, x2);
422  block3 = _mm_unpackhi_epi32(y2, x2);
423  block4 = _mm_unpacklo_epi32(y3, x3);
424  block5 = _mm_unpackhi_epi32(y3, x3);
425 }
426 
427 CRYPTOPP_INLINE void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
428  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
429  const word32 *subkeys, unsigned int rounds)
430 {
431  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
432  const __m128 t0 = _mm_castsi128_ps(block0);
433  const __m128 t1 = _mm_castsi128_ps(block1);
434  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
435  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
436 
437  const __m128 t2 = _mm_castsi128_ps(block2);
438  const __m128 t3 = _mm_castsi128_ps(block3);
439  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
440  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
441 
442  const __m128 t4 = _mm_castsi128_ps(block4);
443  const __m128 t5 = _mm_castsi128_ps(block5);
444  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
445  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
446 
447  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
448  {
449  const __m128i rk = _mm_set1_epi32(subkeys[i]);
450 
451  y1 = _mm_xor_si128(y1, x1);
452  y2 = _mm_xor_si128(y2, x2);
453  y3 = _mm_xor_si128(y3, x3);
454  y1 = RotateRight32<3>(y1);
455  y2 = RotateRight32<3>(y2);
456  y3 = RotateRight32<3>(y3);
457  x1 = _mm_xor_si128(x1, rk);
458  x2 = _mm_xor_si128(x2, rk);
459  x3 = _mm_xor_si128(x3, rk);
460  x1 = _mm_sub_epi32(x1, y1);
461  x2 = _mm_sub_epi32(x2, y2);
462  x3 = _mm_sub_epi32(x3, y3);
463  x1 = RotateLeft32<8>(x1);
464  x2 = RotateLeft32<8>(x2);
465  x3 = RotateLeft32<8>(x3);
466  }
467 
468  // This is roughly the SSE equivalent of ARM vzip.32
469  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
470  block0 = _mm_unpacklo_epi32(y1, x1);
471  block1 = _mm_unpackhi_epi32(y1, x1);
472  block2 = _mm_unpacklo_epi32(y2, x2);
473  block3 = _mm_unpackhi_epi32(y2, x2);
474  block4 = _mm_unpacklo_epi32(y3, x3);
475  block5 = _mm_unpackhi_epi32(y3, x3);
476 }
477 
478 #endif // CRYPTOPP_SSE41_AVAILABLE
479 
480 // ***************************** Altivec ***************************** //
481 
482 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
483 using CryptoPP::uint8x16_p;
484 using CryptoPP::uint32x4_p;
485 
486 using CryptoPP::VecAdd;
487 using CryptoPP::VecSub;
488 using CryptoPP::VecXor;
489 using CryptoPP::VecLoad;
490 using CryptoPP::VecPermute;
491 
492 // Rotate left by bit count
493 template<unsigned int C>
494 CRYPTOPP_INLINE uint32x4_p RotateLeft32(const uint32x4_p val)
495 {
496  const uint32x4_p m = {C, C, C, C};
497  return vec_rl(val, m);
498 }
499 
500 // Rotate right by bit count
501 template<unsigned int C>
502 CRYPTOPP_INLINE uint32x4_p RotateRight32(const uint32x4_p val)
503 {
504  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
505  return vec_rl(val, m);
506 }
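// Altivec has only a rotate-left (vec_rl), so RotateRight32 rotates left
// by 32-C to get the same result.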
507 
508 void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
509  const word32 *subkeys, unsigned int rounds)
510 {
511 #if (CRYPTOPP_BIG_ENDIAN)
512  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
513  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
514 #else
515  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
516  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
517 #endif
518 
519  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
520  uint32x4_p x1 = VecPermute(block0, block1, m1);
521  uint32x4_p y1 = VecPermute(block0, block1, m2);
522 
523  for (int i=0; i < static_cast<int>(rounds); ++i)
524  {
525 #if CRYPTOPP_POWER7_AVAILABLE
526  const uint32x4_p rk = vec_splats(subkeys[i]);
527 #else
528  // subkeys has extra elements so memory backs the last subkey
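 // Without POWER7's vec_splats, a full 16-byte VecLoad of subkeys+i can read
 // past the last subkey; the padded key table keeps that read in bounds, and
 // the permute below broadcasts the first 32-bit word to all four lanes.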
529  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
530  uint32x4_p rk = VecLoad(subkeys+i);
531  rk = VecPermute(rk, rk, m);
532 #endif
533 
534  x1 = RotateRight32<8>(x1);
535  x1 = VecAdd(x1, y1);
536  x1 = VecXor(x1, rk);
537 
538  y1 = RotateLeft32<3>(y1);
539  y1 = VecXor(y1, x1);
540  }
541 
542 #if (CRYPTOPP_BIG_ENDIAN)
543  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
544  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
545 #else
546  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
547  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
548 #endif
549 
550  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
551  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
552  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
553 }
554 
555 void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
556  const word32 *subkeys, unsigned int rounds)
557 {
558 #if (CRYPTOPP_BIG_ENDIAN)
559  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
560  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
561 #else
562  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
563  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
564 #endif
565 
566  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
567  uint32x4_p x1 = VecPermute(block0, block1, m1);
568  uint32x4_p y1 = VecPermute(block0, block1, m2);
569 
570  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
571  {
572 #if CRYPTOPP_POWER7_AVAILABLE
573  const uint32x4_p rk = vec_splats(subkeys[i]);
574 #else
575  // subkeys has extra elements so memory backs the last subkey
576  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
577  uint32x4_p rk = VecLoad(subkeys+i);
578  rk = VecPermute(rk, rk, m);
579 #endif
580 
581  y1 = VecXor(y1, x1);
582  y1 = RotateRight32<3>(y1);
583 
584  x1 = VecXor(x1, rk);
585  x1 = VecSub(x1, y1);
586  x1 = RotateLeft32<8>(x1);
587  }
588 
589 #if (CRYPTOPP_BIG_ENDIAN)
590  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
591  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
592 #else
593  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
594  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
595 #endif
596 
597  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
598  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
599  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
600 }
601 
602 void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
603  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
604  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
605 {
606 #if (CRYPTOPP_BIG_ENDIAN)
607  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
608  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
609 #else
610  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
611  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
612 #endif
613 
614  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
615  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
616  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
617  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
618  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
619  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
620  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
621 
622  for (int i=0; i < static_cast<int>(rounds); ++i)
623  {
624 #if CRYPTOPP_POWER7_AVAILABLE
625  const uint32x4_p rk = vec_splats(subkeys[i]);
626 #else
627  // subkeys has extra elements so memory backs the last subkey
628  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
629  uint32x4_p rk = VecLoad(subkeys+i);
630  rk = VecPermute(rk, rk, m);
631 #endif
632 
633  x1 = RotateRight32<8>(x1);
634  x2 = RotateRight32<8>(x2);
635  x3 = RotateRight32<8>(x3);
636 
637  x1 = VecAdd(x1, y1);
638  x2 = VecAdd(x2, y2);
639  x3 = VecAdd(x3, y3);
640 
641  x1 = VecXor(x1, rk);
642  x2 = VecXor(x2, rk);
643  x3 = VecXor(x3, rk);
644 
645  y1 = RotateLeft32<3>(y1);
646  y2 = RotateLeft32<3>(y2);
647  y3 = RotateLeft32<3>(y3);
648 
649  y1 = VecXor(y1, x1);
650  y2 = VecXor(y2, x2);
651  y3 = VecXor(y3, x3);
652  }
653 
654 #if (CRYPTOPP_BIG_ENDIAN)
655  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
656  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
657 #else
658  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
659  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
660 #endif
661 
662  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
663  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
664  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
665  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
666  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
667  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
668  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
669 }
670 
671 void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
672  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
673  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
674 {
675 #if (CRYPTOPP_BIG_ENDIAN)
676  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
677  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
678 #else
679  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
680  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
681 #endif
682 
683  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
684  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
685  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
686  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
687  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
688  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
689  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
690 
691  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
692  {
693 #if CRYPTOPP_POWER7_AVAILABLE
694  const uint32x4_p rk = vec_splats(subkeys[i]);
695 #else
696  // subkeys has extra elements so memory backs the last subkey
697  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
698  uint32x4_p rk = VecLoad(subkeys+i);
699  rk = VecPermute(rk, rk, m);
700 #endif
701 
702  y1 = VecXor(y1, x1);
703  y2 = VecXor(y2, x2);
704  y3 = VecXor(y3, x3);
705 
706  y1 = RotateRight32<3>(y1);
707  y2 = RotateRight32<3>(y2);
708  y3 = RotateRight32<3>(y3);
709 
710  x1 = VecXor(x1, rk);
711  x2 = VecXor(x2, rk);
712  x3 = VecXor(x3, rk);
713 
714  x1 = VecSub(x1, y1);
715  x2 = VecSub(x2, y2);
716  x3 = VecSub(x3, y3);
717 
718  x1 = RotateLeft32<8>(x1);
719  x2 = RotateLeft32<8>(x2);
720  x3 = RotateLeft32<8>(x3);
721  }
722 
723 #if (CRYPTOPP_BIG_ENDIAN)
724  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
725  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
726 #else
727  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
728  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
729 #endif
730 
731  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
732  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
733  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
734  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
735  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
736  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
737  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
738 }
739 
740 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
741 
742 ANONYMOUS_NAMESPACE_END
743 
744 ///////////////////////////////////////////////////////////////////////
745 
746 NAMESPACE_BEGIN(CryptoPP)
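// The functions below adapt the single-block and six-block kernels above to
// the generic 6x2 AdvancedProcessBlocks templates declared in adv_simd.h,
// which handle the block loads and stores, optional xorBlocks masking, and
// any leftover blocks.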
747 
748 // *************************** ARM NEON **************************** //
749 
750 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
751 size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
752  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
753 {
754  return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
755  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
756 }
757 
758 size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
759  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
760 {
761  return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
762  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
763 }
764 #endif
765 
766 // ***************************** IA-32 ***************************** //
767 
768 #if defined(CRYPTOPP_SSE41_AVAILABLE)
769 size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
770  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
771 {
772  return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
773  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
774 }
775 
776 size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
777  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
778 {
779  return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
780  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
781 }
782 #endif
783 
784 // ***************************** Altivec ***************************** //
785 
786 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
787 size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
788  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
789 {
790  return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
791  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
792 }
793 
794 size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
795  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
796 {
797  return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
798  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
799 }
800 #endif
801 
802 NAMESPACE_END