sm4_simd.cpp
// sm4_simd.cpp - written and placed in the public domain by
// Markku-Juhani O. Saarinen and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AESNI, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
//
// AES-NI based on Markku-Juhani O. Saarinen's work at https://github.com/mjosaarinen/sm4ni.
//
// ARMv8 is upcoming.
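//
// As a rough guide to the CXXFLAGS mentioned above (the exact switches
// belong to the build system, not this file): GCC and Clang typically need
// SSSE3 and AES-NI enabled, for example "-mssse3 -maes", because the code
// below uses _mm_shuffle_epi8 and _mm_aesenclast_si128. MSVC generally
// exposes these intrinsics without extra options.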

#include "pch.h"
#include "config.h"

#include "sm4.h"
#include "misc.h"
#include "adv_simd.h"

// Uncomment for benchmarking C++ against SSE.
// Do so in both sm4.cpp and sm4_simd.cpp.
// #undef CRYPTOPP_AESNI_AVAILABLE

#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
#endif

#if (CRYPTOPP_AESNI_AVAILABLE)
# include <tmmintrin.h>
# include <wmmintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE) && 0
# include <arm_neon.h>
#endif

// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
// compilers don't follow ACLE conventions for the include.
#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <stdint.h>
# include <arm_acle.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SM4_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;

#if (CRYPTOPP_AESNI_AVAILABLE)

template <unsigned int R>
inline __m128i ShiftLeft(const __m128i& val)
{
    return _mm_slli_epi32(val, R);
}

template <unsigned int R>
inline __m128i ShiftRight(const __m128i& val)
{
    return _mm_srli_epi32(val, R);
}

template <unsigned int R>
inline __m128i ShiftLeft64(const __m128i& val)
{
    return _mm_slli_epi64(val, R);
}

template <unsigned int R>
inline __m128i ShiftRight64(const __m128i& val)
{
    return _mm_srli_epi64(val, R);
}

template <unsigned int R>
inline __m128i RotateLeft(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
}

template <unsigned int R>
inline __m128i RotateRight(const __m128i& val)
{
    return _mm_or_si128(
        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
}

template <>
inline __m128i RotateLeft<8>(const __m128i& val)
{
    const __m128i r08 = _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003);
    return _mm_shuffle_epi8(val, r08);
}

template <>
inline __m128i RotateLeft<16>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi32(0x0D0C0F0E, 0x09080B0A, 0x05040706, 0x01000302);
    return _mm_shuffle_epi8(val, mask);
}

template <>
inline __m128i RotateLeft<24>(const __m128i& val)
{
    const __m128i mask = _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201);
    return _mm_shuffle_epi8(val, mask);
}

/// \brief Unpack XMM words
/// \tparam IDX the element from each XMM word
/// \param a the first XMM word
/// \param b the second XMM word
/// \param c the third XMM word
/// \param d the fourth XMM word
/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation
/// equivalent to <tt>a[IDX] || b[IDX] || c[IDX] || d[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
    CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
    CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_unpacklo_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_unpackhi_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_unpacklo_epi64(r1, r2);
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    const __m128i r1 = _mm_unpackhi_epi32(a, b);
    const __m128i r2 = _mm_unpackhi_epi32(c, d);
    return _mm_unpackhi_epi64(r1, r2);
}

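// Taken together, UnpackXMM<0>..UnpackXMM<3> transpose a 4x4 matrix of
// 32-bit words. Given four 16-byte blocks a, b, c and d:
//
//   UnpackXMM<0>(a,b,c,d) = { a[0], b[0], c[0], d[0] }
//   UnpackXMM<1>(a,b,c,d) = { a[1], b[1], c[1], d[1] }
//   UnpackXMM<2>(a,b,c,d) = { a[2], b[2], c[2], d[2] }
//   UnpackXMM<3>(a,b,c,d) = { a[3], b[3], c[3], d[3] }
//
// SM4_Encrypt below relies on this layout so that each XMM register holds
// the same SM4 state word for four different blocks, and one round of the
// cipher advances all four blocks in parallel.
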
/// \brief Unpack an XMM word
/// \tparam IDX the element from each XMM word
/// \param v the XMM word
/// \details UnpackXMM selects the IDX element from v and returns a concatenation
/// equivalent to <tt>v[IDX] || v[IDX] || v[IDX] || v[IDX]</tt>.
template <unsigned int IDX>
inline __m128i UnpackXMM(const __m128i& v)
{
    // Should not be instantiated
    CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
    return _mm_setzero_si128();
}

template <>
inline __m128i UnpackXMM<0>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
}

template <>
inline __m128i UnpackXMM<1>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
}

template <>
inline __m128i UnpackXMM<2>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
}

template <>
inline __m128i UnpackXMM<3>(const __m128i& v)
{
    // Splat to all lanes
    return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
    return UnpackXMM<IDX>(a, b, c, d);
}

template <unsigned int IDX>
inline __m128i RepackXMM(const __m128i& v)
{
    return UnpackXMM<IDX>(v);
}

inline void SM4_Encrypt(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys)
{
    // nibble mask
    const __m128i c0f = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

    // reverse byte order within each 32-bit word
    const __m128i flp = _mm_set_epi32(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);

    // inverse ShiftRows
    const __m128i shr = _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00);

    // Affine transform 1 (low and high nibbles)
    const __m128i m1l = _mm_set_epi32(0xC7C1B4B2, 0x22245157, 0x9197E2E4, 0x74720701);
    const __m128i m1h = _mm_set_epi32(0xF052B91B, 0xF95BB012, 0xE240AB09, 0xEB49A200);

    // Affine transform 2 (low and high nibbles)
    const __m128i m2l = _mm_set_epi32(0xEDD14478, 0x172BBE82, 0x5B67F2CE, 0xA19D0834);
    const __m128i m2h = _mm_set_epi32(0x11CDBE62, 0xCC1063BF, 0xAE7201DD, 0x73AFDC00);
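
    // The constants above come from the sm4ni technique referenced in the file
    // header: the SM4 S-box is evaluated through _mm_aesenclast_si128 by
    // sandwiching the AES S-box between two nibble-wise affine transforms.
    // m1l/m1h apply the input affine, the 'shr' shuffle pre-applies inverse
    // ShiftRows so that the ShiftRows step inside AESENCLAST cancels, and
    // m2l/m2h apply the output affine. See Saarinen's write-up at
    // https://github.com/mjosaarinen/sm4ni for the derivation.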

    __m128i t0 = UnpackXMM<0>(block0, block1, block2, block3);
    __m128i t1 = UnpackXMM<1>(block0, block1, block2, block3);
    __m128i t2 = UnpackXMM<2>(block0, block1, block2, block3);
    __m128i t3 = UnpackXMM<3>(block0, block1, block2, block3);

    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    const unsigned int ROUNDS = 32;
    for (unsigned int i = 0; i < ROUNDS; i++)
    {
        const __m128i k = _mm_shuffle_epi32(_mm_castps_si128(
            _mm_load_ss((const float*)(subkeys+i))), _MM_SHUFFLE(0,0,0,0));

        __m128i x, y;
        x = _mm_xor_si128(t1, _mm_xor_si128(t2, _mm_xor_si128(t3, k)));

        y = _mm_and_si128(x, c0f);          // inner affine
        y = _mm_shuffle_epi8(m1l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m1h, x), y);

        x = _mm_shuffle_epi8(x, shr);       // inverse ShiftRows
        x = _mm_aesenclast_si128(x, c0f);   // AESNI instruction

        y = _mm_andnot_si128(x, c0f);       // outer affine
        y = _mm_shuffle_epi8(m2l, y);
        x = _mm_and_si128(ShiftRight64<4>(x), c0f);
        x = _mm_xor_si128(_mm_shuffle_epi8(m2h, x), y);

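        // For reference: the four statements below evaluate SM4's linear
        // transform L on every 32-bit lane. With B denoting x after the
        // S-box step:
        //
        //   y = B ^ rotl(B,8) ^ rotl(B,16)
        //   rotl(y,2) = rotl(B,2) ^ rotl(B,10) ^ rotl(B,18)   (the shift pair below)
        //   x = B ^ rotl(y,2) ^ rotl(B,24)
        //     = B ^ rotl(B,2) ^ rotl(B,10) ^ rotl(B,18) ^ rotl(B,24) = L(B)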
        // 4 parallel L1 linear transforms
        y = _mm_xor_si128(x, RotateLeft<8>(x));
        y = _mm_xor_si128(y, RotateLeft<16>(x));
        y = _mm_xor_si128(ShiftLeft<2>(y), ShiftRight<30>(y));
        x = _mm_xor_si128(x, _mm_xor_si128(y, RotateLeft<24>(x)));

        // rotate registers
        x = _mm_xor_si128(x, t0);
        t0 = t1; t1 = t2;
        t2 = t3; t3 = x;
    }

    t0 = _mm_shuffle_epi8(t0, flp);
    t1 = _mm_shuffle_epi8(t1, flp);
    t2 = _mm_shuffle_epi8(t2, flp);
    t3 = _mm_shuffle_epi8(t3, flp);

    block0 = RepackXMM<0>(t3,t2,t1,t0);
    block1 = RepackXMM<1>(t3,t2,t1,t0);
    block2 = RepackXMM<2>(t3,t2,t1,t0);
    block3 = RepackXMM<3>(t3,t2,t1,t0);
}

inline void SM4_Enc_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    SM4_Encrypt(block0, block1, block2, block3, subkeys);
}

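// SM4 decryption is the encryption transform run with the round keys in
// reverse order, so the decryption routines below simply reuse SM4_Encrypt
// and rely on the caller to pass an already-reversed subkey table
// (presumably arranged by the key schedule in sm4.cpp).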
inline void SM4_Dec_4_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int /*rounds*/)
{
    SM4_Encrypt(block0, block1, block2, block3, subkeys);
}

inline void SM4_Enc_Block(__m128i &block0,
    const word32 *subkeys, unsigned int /*rounds*/)
{
    __m128i t1 = _mm_setzero_si128();
    __m128i t2 = _mm_setzero_si128();
    __m128i t3 = _mm_setzero_si128();

    SM4_Encrypt(block0, t1, t2, t3, subkeys);
}

inline void SM4_Dec_Block(__m128i &block0,
    const word32 *subkeys, unsigned int /*rounds*/)
{
    __m128i t1 = _mm_setzero_si128();
    __m128i t2 = _mm_setzero_si128();
    __m128i t3 = _mm_setzero_si128();

    SM4_Encrypt(block0, t1, t2, t3, subkeys);
}

#endif // CRYPTOPP_AESNI_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if defined(CRYPTOPP_AESNI_AVAILABLE)
size_t SM4_Enc_AdvancedProcessBlocks_AESNI(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(SM4_Enc_Block, SM4_Enc_4_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_AESNI_AVAILABLE
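
// A rough sketch of how a caller in sm4.cpp might route to the function
// above (illustrative only: HasAES() is the library's runtime CPU probe,
// while the member name m_rkeys and the exact override shown here are
// assumptions):
//
//   size_t SM4::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags) const
//   {
//       if (HasAES())
//           return SM4_Enc_AdvancedProcessBlocks_AESNI(m_rkeys, 32,
//               inBlocks, xorBlocks, outBlocks, length, flags);
//       return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
//   }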

#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) && 0
size_t SM4_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    uint32x4_t unused;  // Avoid template argument deduction/substitution failures
    return AdvancedProcessBlocks128_4x1_NEON(SM4_Enc_Block, SM4_Enc_4_Blocks,
        unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SM4_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    uint32x4_t unused;  // Avoid template argument deduction/substitution failures
    return AdvancedProcessBlocks128_4x1_NEON(SM4_Dec_Block, SM4_Dec_4_Blocks,
        unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE

NAMESPACE_END