Crypto++ 8.0
Free C++ class library of cryptographic schemes
speck64_simd.cpp
1 // speck64_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 // This source file uses intrinsics and built-ins to gain access to
4 // SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
5 // source file is needed because additional CXXFLAGS are required to enable
6 // the appropriate instruction sets in some build configurations.
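// For example, GNU-style builds typically compile this file with flags such
// as -mssse3 -msse4.1 on x86, -march=armv7-a -mfpu=neon on 32-bit ARM, or
// -maltivec on PowerPC; the exact flags depend on the build configuration.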
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "speck.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
15 // Do so in both speck.cpp and speck64_simd.cpp.
16 // #undef CRYPTOPP_SSE41_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 # include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if (CRYPTOPP_SSE41_AVAILABLE)
26 # include <smmintrin.h>
27 #endif
28 
29 #if defined(__XOP__)
30 # include <ammintrin.h>
31 #endif
32 
33 #if defined(__AVX512F__)
34 # define CRYPTOPP_AVX512_ROTATE 1
35 # include <immintrin.h>
36 #endif
37 
38 // C1189: error: This header is specific to ARM targets
39 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
40 # include "adv_simd.h"
41 # ifndef _M_ARM64
42 # include <arm_neon.h>
43 # endif
44 #endif
45 
46 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
47 # include <stdint.h>
48 # include <arm_acle.h>
49 #endif
50 
51 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
52 # include "adv_simd.h"
53 # include "ppc_simd.h"
54 #endif
55 
56 // Squash MS LNK4221 and libtool warnings
57 extern const char SPECK64_SIMD_FNAME[] = __FILE__;
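// Defining a file-scope symbol guarantees this translation unit is never
// empty, even when none of the SIMD paths below are compiled in, which is
// what would otherwise trigger those warnings.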
58 
59 ANONYMOUS_NAMESPACE_BEGIN
60 
61 using CryptoPP::byte;
62 using CryptoPP::word32;
63 using CryptoPP::word64;
64 
65 // *************************** ARM NEON ************************** //
66 
67 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
68 
69 template <class T>
70 inline T UnpackHigh32(const T& a, const T& b)
71 {
72  const uint32x2_t x(vget_high_u32((uint32x4_t)a));
73  const uint32x2_t y(vget_high_u32((uint32x4_t)b));
74  const uint32x2x2_t r = vzip_u32(x, y);
75  return (T)vcombine_u32(r.val[0], r.val[1]);
76 }
77 
78 template <class T>
79 inline T UnpackLow32(const T& a, const T& b)
80 {
81  const uint32x2_t x(vget_low_u32((uint32x4_t)a));
82  const uint32x2_t y(vget_low_u32((uint32x4_t)b));
83  const uint32x2x2_t r = vzip_u32(x, y);
84  return (T)vcombine_u32(r.val[0], r.val[1]);
85 }
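// Lane-level effect: UnpackLow32(a, b) yields [a0 b0 a1 b1] and
// UnpackHigh32(a, b) yields [a2 b2 a3 b3], mirroring the SSE intrinsics
// _mm_unpacklo_epi32 and _mm_unpackhi_epi32 used in the IA-32 path below.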
86 
87 template <unsigned int R>
88 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
89 {
90  const uint32x4_t a(vshlq_n_u32(val, R));
91  const uint32x4_t b(vshrq_n_u32(val, 32 - R));
92  return vorrq_u32(a, b);
93 }
94 
95 template <unsigned int R>
96 inline uint32x4_t RotateRight32(const uint32x4_t& val)
97 {
98  const uint32x4_t a(vshlq_n_u32(val, 32 - R));
99  const uint32x4_t b(vshrq_n_u32(val, R));
100  return vorrq_u32(a, b);
101 }
102 
103 #if defined(__aarch32__) || defined(__aarch64__)
104 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
105 template <>
106 inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
107 {
108 #if (CRYPTOPP_BIG_ENDIAN)
109  const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
110  const uint8x16_t mask = vld1q_u8(maskb);
111 #else
112  const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
113  const uint8x16_t mask = vld1q_u8(maskb);
114 #endif
115 
116  return vreinterpretq_u32_u8(
117  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
118 }
119 
120 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
121 template <>
122 inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
123 {
124 #if (CRYPTOPP_BIG_ENDIAN)
125  const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
126  const uint8x16_t mask = vld1q_u8(maskb);
127 #else
128  const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
129  const uint8x16_t mask = vld1q_u8(maskb);
130 #endif
131 
132  return vreinterpretq_u32_u8(
133  vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
134 }
135 #endif // Aarch32 or Aarch64
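// Worked example for the little-endian RotateLeft32<8> mask: a lane holding
// 0x11223344 is stored as bytes {44,33,22,11}. Rotating left by 8 gives
// 0x22334411, stored as {11,44,33,22}, which is exactly what the byte
// indices {3,0,1,2} select from the original lane.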
136 
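// Data layout used by the block functions below: with block0 = [A1 A2 A3 A4]
// and block1 = [B1 B2 B3 B4], vuzpq_u32 deinterleaves the lanes so that
// y1 = val[0] = [A1 A3 B1 B3] and x1 = val[1] = [A2 A4 B2 B4]. Each vector
// pair therefore carries four SPECK64 blocks, and a round is the usual
//   x = ROTR32(x, 8); x += y; x ^= k;
//   y = ROTL32(y, 3); y ^= x;
// (ROTR32/ROTL32 are just notation for 32-bit rotations, not library calls).
// UnpackLow32/UnpackHigh32 restore the original layout after the rounds.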
137 inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
138  const word32 *subkeys, unsigned int rounds)
139 {
140  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
141  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
142  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
143 
144  for (int i=0; i < static_cast<int>(rounds); ++i)
145  {
146  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
147 
148  x1 = RotateRight32<8>(x1);
149  x1 = vaddq_u32(x1, y1);
150  x1 = veorq_u32(x1, rk);
151  y1 = RotateLeft32<3>(y1);
152  y1 = veorq_u32(y1, x1);
153  }
154 
155  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
156  block0 = UnpackLow32(y1, x1);
157  block1 = UnpackHigh32(y1, x1);
158 }
159 
160 inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
161  const word32 *subkeys, unsigned int rounds)
162 {
163  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
164  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
165  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
166 
167  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
168  {
169  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
170 
171  y1 = veorq_u32(y1, x1);
172  y1 = RotateRight32<3>(y1);
173  x1 = veorq_u32(x1, rk);
174  x1 = vsubq_u32(x1, y1);
175  x1 = RotateLeft32<8>(x1);
176  }
177 
178  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
179  block0 = UnpackLow32(y1, x1);
180  block1 = UnpackHigh32(y1, x1);
181 }
182 
183 inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
184  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
185  const word32 *subkeys, unsigned int rounds)
186 {
187  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
188  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
189  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
190  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
191  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
192  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
193  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
194 
195  for (int i=0; i < static_cast<int>(rounds); ++i)
196  {
197  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
198 
199  x1 = RotateRight32<8>(x1);
200  x2 = RotateRight32<8>(x2);
201  x3 = RotateRight32<8>(x3);
202  x1 = vaddq_u32(x1, y1);
203  x2 = vaddq_u32(x2, y2);
204  x3 = vaddq_u32(x3, y3);
205  x1 = veorq_u32(x1, rk);
206  x2 = veorq_u32(x2, rk);
207  x3 = veorq_u32(x3, rk);
208  y1 = RotateLeft32<3>(y1);
209  y2 = RotateLeft32<3>(y2);
210  y3 = RotateLeft32<3>(y3);
211  y1 = veorq_u32(y1, x1);
212  y2 = veorq_u32(y2, x2);
213  y3 = veorq_u32(y3, x3);
214  }
215 
216  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
217  block0 = UnpackLow32(y1, x1);
218  block1 = UnpackHigh32(y1, x1);
219  block2 = UnpackLow32(y2, x2);
220  block3 = UnpackHigh32(y2, x2);
221  block4 = UnpackLow32(y3, x3);
222  block5 = UnpackHigh32(y3, x3);
223 }
224 
225 inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
226  uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
227  const word32 *subkeys, unsigned int rounds)
228 {
229  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
230  uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
231  uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
232  uint32x4_t x2 = vuzpq_u32(block2, block3).val[1];
233  uint32x4_t y2 = vuzpq_u32(block2, block3).val[0];
234  uint32x4_t x3 = vuzpq_u32(block4, block5).val[1];
235  uint32x4_t y3 = vuzpq_u32(block4, block5).val[0];
236 
237  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
238  {
239  const uint32x4_t rk = vdupq_n_u32(subkeys[i]);
240 
241  y1 = veorq_u32(y1, x1);
242  y2 = veorq_u32(y2, x2);
243  y3 = veorq_u32(y3, x3);
244  y1 = RotateRight32<3>(y1);
245  y2 = RotateRight32<3>(y2);
246  y3 = RotateRight32<3>(y3);
247  x1 = veorq_u32(x1, rk);
248  x2 = veorq_u32(x2, rk);
249  x3 = veorq_u32(x3, rk);
250  x1 = vsubq_u32(x1, y1);
251  x2 = vsubq_u32(x2, y2);
252  x3 = vsubq_u32(x3, y3);
253  x1 = RotateLeft32<8>(x1);
254  x2 = RotateLeft32<8>(x2);
255  x3 = RotateLeft32<8>(x3);
256  }
257 
258  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
259  block0 = UnpackLow32(y1, x1);
260  block1 = UnpackHigh32(y1, x1);
261  block2 = UnpackLow32(y2, x2);
262  block3 = UnpackHigh32(y2, x2);
263  block4 = UnpackLow32(y3, x3);
264  block5 = UnpackHigh32(y3, x3);
265 }
266 
267 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
268 
269 // ***************************** IA-32 ***************************** //
270 
271 #if defined(CRYPTOPP_SSE41_AVAILABLE)
272 
273 template <unsigned int R>
274 inline __m128i RotateLeft32(const __m128i& val)
275 {
276 #if defined(__XOP__)
277  return _mm_roti_epi32(val, R);
278 #else
279  return _mm_or_si128(
280  _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
281 #endif
282 }
283 
284 template <unsigned int R>
285 inline __m128i RotateRight32(const __m128i& val)
286 {
287 #if defined(__XOP__)
288  return _mm_roti_epi32(val, 32-R);
289 #else
290  return _mm_or_si128(
291  _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
292 #endif
293 }
294 
295 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
296 template <>
297 __m128i RotateLeft32<8>(const __m128i& val)
298 {
299 #if defined(__XOP__)
300  return _mm_roti_epi32(val, 8);
301 #else
302  const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
303  return _mm_shuffle_epi8(val, mask);
304 #endif
305 }
306 
307 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
308 template <>
309 __m128i RotateRight32<8>(const __m128i& val)
310 {
311 #if defined(__XOP__)
312  return _mm_roti_epi32(val, 32-8);
313 #else
314  const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
315  return _mm_shuffle_epi8(val, mask);
316 #endif
317 }
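// Note on the masks: _mm_set_epi8 lists bytes from element 15 down to
// element 0, so the argument list reads as the NEON tables above written in
// reverse; for RotateLeft32<8> the resulting shuffle still moves source
// byte 3 to byte 0, byte 0 to byte 1, and so on within each 32-bit lane.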
318 
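// In the block functions below the deinterleave is done with _mm_shuffle_ps:
// _MM_SHUFFLE(3,1,3,1) selects lanes 1 and 3 of each input, giving
// x1 = [A2 A4 B2 B4], while _MM_SHUFFLE(2,0,2,0) selects lanes 0 and 2,
// giving y1 = [A1 A3 B1 B3], the same split as the NEON vuzpq_u32 path.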
319 inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
320  const word32 *subkeys, unsigned int rounds)
321 {
322  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
323  const __m128 t0 = _mm_castsi128_ps(block0);
324  const __m128 t1 = _mm_castsi128_ps(block1);
325  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
326  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
327 
328  for (int i=0; i < static_cast<int>(rounds); ++i)
329  {
330  const __m128i rk = _mm_set1_epi32(subkeys[i]);
331 
332  x1 = RotateRight32<8>(x1);
333  x1 = _mm_add_epi32(x1, y1);
334  x1 = _mm_xor_si128(x1, rk);
335  y1 = RotateLeft32<3>(y1);
336  y1 = _mm_xor_si128(y1, x1);
337  }
338 
339  // This is roughly the SSE equivalent to ARM vzip.32
340  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
341  block0 = _mm_unpacklo_epi32(y1, x1);
342  block1 = _mm_unpackhi_epi32(y1, x1);
343 }
344 
345 inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
346  const word32 *subkeys, unsigned int rounds)
347 {
348  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
349  const __m128 t0 = _mm_castsi128_ps(block0);
350  const __m128 t1 = _mm_castsi128_ps(block1);
351  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
352  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
353 
354  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
355  {
356  const __m128i rk = _mm_set1_epi32(subkeys[i]);
357 
358  y1 = _mm_xor_si128(y1, x1);
359  y1 = RotateRight32<3>(y1);
360  x1 = _mm_xor_si128(x1, rk);
361  x1 = _mm_sub_epi32(x1, y1);
362  x1 = RotateLeft32<8>(x1);
363  }
364 
365  // This is roughly the SSE equivalent to ARM vzip.32
366  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
367  block0 = _mm_unpacklo_epi32(y1, x1);
368  block1 = _mm_unpackhi_epi32(y1, x1);
369 }
370 
371 inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
372  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
373  const word32 *subkeys, unsigned int rounds)
374 {
375  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
376  const __m128 t0 = _mm_castsi128_ps(block0);
377  const __m128 t1 = _mm_castsi128_ps(block1);
378  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
379  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
380 
381  const __m128 t2 = _mm_castsi128_ps(block2);
382  const __m128 t3 = _mm_castsi128_ps(block3);
383  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
384  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
385 
386  const __m128 t4 = _mm_castsi128_ps(block4);
387  const __m128 t5 = _mm_castsi128_ps(block5);
388  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
389  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
390 
391  for (int i=0; i < static_cast<int>(rounds); ++i)
392  {
393  const __m128i rk = _mm_set1_epi32(subkeys[i]);
394 
395  x1 = RotateRight32<8>(x1);
396  x2 = RotateRight32<8>(x2);
397  x3 = RotateRight32<8>(x3);
398  x1 = _mm_add_epi32(x1, y1);
399  x2 = _mm_add_epi32(x2, y2);
400  x3 = _mm_add_epi32(x3, y3);
401  x1 = _mm_xor_si128(x1, rk);
402  x2 = _mm_xor_si128(x2, rk);
403  x3 = _mm_xor_si128(x3, rk);
404  y1 = RotateLeft32<3>(y1);
405  y2 = RotateLeft32<3>(y2);
406  y3 = RotateLeft32<3>(y3);
407  y1 = _mm_xor_si128(y1, x1);
408  y2 = _mm_xor_si128(y2, x2);
409  y3 = _mm_xor_si128(y3, x3);
410  }
411 
412  // This is roughly the SSE equivalent to ARM vzip.32
413  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
414  block0 = _mm_unpacklo_epi32(y1, x1);
415  block1 = _mm_unpackhi_epi32(y1, x1);
416  block2 = _mm_unpacklo_epi32(y2, x2);
417  block3 = _mm_unpackhi_epi32(y2, x2);
418  block4 = _mm_unpacklo_epi32(y3, x3);
419  block5 = _mm_unpackhi_epi32(y3, x3);
420 }
421 
422 inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
423  __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
424  const word32 *subkeys, unsigned int rounds)
425 {
426  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
427  const __m128 t0 = _mm_castsi128_ps(block0);
428  const __m128 t1 = _mm_castsi128_ps(block1);
429  __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
430  __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
431 
432  const __m128 t2 = _mm_castsi128_ps(block2);
433  const __m128 t3 = _mm_castsi128_ps(block3);
434  __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1)));
435  __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0)));
436 
437  const __m128 t4 = _mm_castsi128_ps(block4);
438  const __m128 t5 = _mm_castsi128_ps(block5);
439  __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1)));
440  __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0)));
441 
442  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
443  {
444  const __m128i rk = _mm_set1_epi32(subkeys[i]);
445 
446  y1 = _mm_xor_si128(y1, x1);
447  y2 = _mm_xor_si128(y2, x2);
448  y3 = _mm_xor_si128(y3, x3);
449  y1 = RotateRight32<3>(y1);
450  y2 = RotateRight32<3>(y2);
451  y3 = RotateRight32<3>(y3);
452  x1 = _mm_xor_si128(x1, rk);
453  x2 = _mm_xor_si128(x2, rk);
454  x3 = _mm_xor_si128(x3, rk);
455  x1 = _mm_sub_epi32(x1, y1);
456  x2 = _mm_sub_epi32(x2, y2);
457  x3 = _mm_sub_epi32(x3, y3);
458  x1 = RotateLeft32<8>(x1);
459  x2 = RotateLeft32<8>(x2);
460  x3 = RotateLeft32<8>(x3);
461  }
462 
463  // This is roughly the SSE equivalent to ARM vzip.32
464  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
465  block0 = _mm_unpacklo_epi32(y1, x1);
466  block1 = _mm_unpackhi_epi32(y1, x1);
467  block2 = _mm_unpacklo_epi32(y2, x2);
468  block3 = _mm_unpackhi_epi32(y2, x2);
469  block4 = _mm_unpacklo_epi32(y3, x3);
470  block5 = _mm_unpackhi_epi32(y3, x3);
471 }
472 
473 #endif // CRYPTOPP_SSE41_AVAILABLE
474 
475 // ***************************** Altivec ***************************** //
476 
477 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
478 using CryptoPP::uint8x16_p;
479 using CryptoPP::uint32x4_p;
480 
481 using CryptoPP::VecAdd;
482 using CryptoPP::VecSub;
483 using CryptoPP::VecXor;
484 using CryptoPP::VecLoad;
485 using CryptoPP::VecPermute;
486 
487 // Rotate left by bit count
488 template<unsigned int C>
489 inline uint32x4_p RotateLeft32(const uint32x4_p val)
490 {
491  const uint32x4_p m = {C, C, C, C};
492  return vec_rl(val, m);
493 }
494 
495 // Rotate right by bit count
496 template<unsigned int C>
497 inline uint32x4_p RotateRight32(const uint32x4_p val)
498 {
499  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
500  return vec_rl(val, m);
501 }
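// vec_rl only rotates left, so a right rotation by C bits is expressed as a
// left rotation by 32-C (the two are equivalent for 0 < C < 32).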
502 
503 void SPECK64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
504  const word32 *subkeys, unsigned int rounds)
505 {
506 #if (CRYPTOPP_BIG_ENDIAN)
507  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
508  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
509 #else
510  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
511  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
512 #endif
513 
514  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
515  uint32x4_p x1 = VecPermute(block0, block1, m1);
516  uint32x4_p y1 = VecPermute(block0, block1, m2);
517 
518  for (int i=0; i < static_cast<int>(rounds); ++i)
519  {
520 #if CRYPTOPP_POWER7_AVAILABLE
521  const uint32x4_p rk = vec_splats(subkeys[i]);
522 #else
523  // subkeys has extra elements so memory backs the last subkey
524  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
525  uint32x4_p rk = VecLoad(subkeys+i);
526  rk = VecPermute(rk, rk, m);
527 #endif
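 // The {0,1,2,3, ...} permute broadcasts element 0 of the loaded vector,
 // i.e. subkeys[i], into all four lanes, matching vec_splats(subkeys[i]);
 // the key table is padded so the 16-byte VecLoad at the last subkey stays
 // within allocated memory.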
528 
529  x1 = RotateRight32<8>(x1);
530  x1 = VecAdd(x1, y1);
531  x1 = VecXor(x1, rk);
532 
533  y1 = RotateLeft32<3>(y1);
534  y1 = VecXor(y1, x1);
535  }
536 
537 #if (CRYPTOPP_BIG_ENDIAN)
538  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
539  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
540 #else
541  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
542  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
543 #endif
544 
545  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
546  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
547  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
548 }
549 
550 void SPECK64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
551  const word32 *subkeys, unsigned int rounds)
552 {
553 #if (CRYPTOPP_BIG_ENDIAN)
554  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
555  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
556 #else
557  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
558  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
559 #endif
560 
561  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
562  uint32x4_p x1 = VecPermute(block0, block1, m1);
563  uint32x4_p y1 = VecPermute(block0, block1, m2);
564 
565  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
566  {
567 #if CRYPTOPP_POWER7_AVAILABLE
568  const uint32x4_p rk = vec_splats(subkeys[i]);
569 #else
570  // subkeys has extra elements so memory backs the last subkey
571  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
572  uint32x4_p rk = VecLoad(subkeys+i);
573  rk = VecPermute(rk, rk, m);
574 #endif
575 
576  y1 = VecXor(y1, x1);
577  y1 = RotateRight32<3>(y1);
578 
579  x1 = VecXor(x1, rk);
580  x1 = VecSub(x1, y1);
581  x1 = RotateLeft32<8>(x1);
582  }
583 
584 #if (CRYPTOPP_BIG_ENDIAN)
585  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
586  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
587 #else
588  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
589  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
590 #endif
591 
592  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
593  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
594  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
595 }
596 
597 void SPECK64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
598  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
599  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
600 {
601 #if (CRYPTOPP_BIG_ENDIAN)
602  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
603  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
604 #else
605  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
606  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
607 #endif
608 
609  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
610  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
611  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
612  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
613  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
614  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
615  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
616 
617  for (int i=0; i < static_cast<int>(rounds); ++i)
618  {
619 #if CRYPTOPP_POWER7_AVAILABLE
620  const uint32x4_p rk = vec_splats(subkeys[i]);
621 #else
622  // subkeys has extra elements so memory backs the last subkey
623  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
624  uint32x4_p rk = VecLoad(subkeys+i);
625  rk = VecPermute(rk, rk, m);
626 #endif
627 
628  x1 = RotateRight32<8>(x1);
629  x2 = RotateRight32<8>(x2);
630  x3 = RotateRight32<8>(x3);
631 
632  x1 = VecAdd(x1, y1);
633  x2 = VecAdd(x2, y2);
634  x3 = VecAdd(x3, y3);
635 
636  x1 = VecXor(x1, rk);
637  x2 = VecXor(x2, rk);
638  x3 = VecXor(x3, rk);
639 
640  y1 = RotateLeft32<3>(y1);
641  y2 = RotateLeft32<3>(y2);
642  y3 = RotateLeft32<3>(y3);
643 
644  y1 = VecXor(y1, x1);
645  y2 = VecXor(y2, x2);
646  y3 = VecXor(y3, x3);
647  }
648 
649 #if (CRYPTOPP_BIG_ENDIAN)
650  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
651  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
652 #else
653  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
654  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
655 #endif
656 
657  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
658  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
659  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
660  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
661  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
662  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
663  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
664 }
665 
666 void SPECK64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
667  uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
668  uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
669 {
670 #if (CRYPTOPP_BIG_ENDIAN)
671  const uint8x16_p m1 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
672  const uint8x16_p m2 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
673 #else
674  const uint8x16_p m1 = {3,2,1,0, 11,10,9,8, 19,18,17,16, 27,26,25,24};
675  const uint8x16_p m2 = {7,6,5,4, 15,14,13,12, 23,22,21,20, 31,30,29,28};
676 #endif
677 
678  // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
679  uint32x4_p x1 = (uint32x4_p)VecPermute(block0, block1, m1);
680  uint32x4_p y1 = (uint32x4_p)VecPermute(block0, block1, m2);
681  uint32x4_p x2 = (uint32x4_p)VecPermute(block2, block3, m1);
682  uint32x4_p y2 = (uint32x4_p)VecPermute(block2, block3, m2);
683  uint32x4_p x3 = (uint32x4_p)VecPermute(block4, block5, m1);
684  uint32x4_p y3 = (uint32x4_p)VecPermute(block4, block5, m2);
685 
686  for (int i = static_cast<int>(rounds-1); i >= 0; --i)
687  {
688 #if CRYPTOPP_POWER7_AVAILABLE
689  const uint32x4_p rk = vec_splats(subkeys[i]);
690 #else
691  // subkeys has extra elements so memory backs the last subkey
692  const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
693  uint32x4_p rk = VecLoad(subkeys+i);
694  rk = VecPermute(rk, rk, m);
695 #endif
696 
697  y1 = VecXor(y1, x1);
698  y2 = VecXor(y2, x2);
699  y3 = VecXor(y3, x3);
700 
701  y1 = RotateRight32<3>(y1);
702  y2 = RotateRight32<3>(y2);
703  y3 = RotateRight32<3>(y3);
704 
705  x1 = VecXor(x1, rk);
706  x2 = VecXor(x2, rk);
707  x3 = VecXor(x3, rk);
708 
709  x1 = VecSub(x1, y1);
710  x2 = VecSub(x2, y2);
711  x3 = VecSub(x3, y3);
712 
713  x1 = RotateLeft32<8>(x1);
714  x2 = RotateLeft32<8>(x2);
715  x3 = RotateLeft32<8>(x3);
716  }
717 
718 #if (CRYPTOPP_BIG_ENDIAN)
719  const uint8x16_p m3 = {19,18,17,16, 3,2,1,0, 23,22,21,20, 7,6,5,4};
720  const uint8x16_p m4 = {27,26,25,24, 11,10,9,8, 31,30,29,28, 15,14,13,12};
721 #else
722  const uint8x16_p m3 = {3,2,1,0, 19,18,17,16, 7,6,5,4, 23,22,21,20};
723  const uint8x16_p m4 = {11,10,9,8, 27,26,25,24, 15,14,13,12, 31,30,29,28};
724 #endif
725 
726  // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
727  block0 = (uint32x4_p)VecPermute(x1, y1, m3);
728  block1 = (uint32x4_p)VecPermute(x1, y1, m4);
729  block2 = (uint32x4_p)VecPermute(x2, y2, m3);
730  block3 = (uint32x4_p)VecPermute(x2, y2, m4);
731  block4 = (uint32x4_p)VecPermute(x3, y3, m3);
732  block5 = (uint32x4_p)VecPermute(x3, y3, m4);
733 }
734 
735 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
736 
737 ANONYMOUS_NAMESPACE_END
738 
739 ///////////////////////////////////////////////////////////////////////
740 
741 NAMESPACE_BEGIN(CryptoPP)
742 
743 // *************************** ARM NEON **************************** //
744 
745 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
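// These thin wrappers hand the block routines above to the
// AdvancedProcessBlocks64_6x2_* templates from adv_simd.h, which take care
// of loading and storing the 64-bit blocks and of the xorBlocks and flags
// handling for each SIMD path.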
746 size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
747  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
748 {
749  return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
750  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
751 }
752 
753 size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
754  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
755 {
756  return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
757  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
758 }
759 #endif
760 
761 // ***************************** IA-32 ***************************** //
762 
763 #if defined(CRYPTOPP_SSE41_AVAILABLE)
764 size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
765  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
766 {
767  return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
768  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
769 }
770 
771 size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
772  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
773 {
774  return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
775  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
776 }
777 #endif
778 
779 // ***************************** Altivec ***************************** //
780 
781 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
782 size_t SPECK64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
783  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
784 {
785  return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
786  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
787 }
788 
789 size_t SPECK64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
790  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
791 {
792  return AdvancedProcessBlocks64_6x2_ALTIVEC(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
793  subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
794 }
795 #endif
796 
797 NAMESPACE_END