Crypto++ 8.8
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2018: Added support for ARMv7 AES instructions via Cryptogams ASM.
9  See the head notes in aes_armv4.S for copyright and license.
10 */
11 
12 /*
13 September 2017: Added support for Power8 AES instructions via compiler intrinsics.
14 */
15 
16 /*
17 July 2017: Added support for ARMv8 AES instructions via compiler intrinsics.
18 */
19 
20 /*
21 July 2010: Added support for AES-NI instructions via compiler intrinsics.
22 */
23 
24 /*
25 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
26 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
27 and Peter Schwabe in their paper "New AES software speed records". The round
28 function was also modified to include a trick similar to one in Brian Gladman's
29 x86 assembly code, doing an 8-bit register move to minimize the number of
30 register spills. Also switched to compressed tables and copying round keys to
31 the stack.
32 
33 The C++ implementation uses compressed tables if
34 CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined.
35 It is defined by default on x86 platforms but not on others.
36 */
37 
38 /*
39 July 2006: Defense against timing attacks was added by Wei Dai.
40 
41 The code now uses smaller tables in the first and last rounds,
42 and preloads them into L1 cache before usage (by loading at least
43 one element in each cache line).
44 
45 We try to delay subsequent accesses to each table (used in the first
46 and last rounds) until all of the table has been preloaded. Hopefully
47 the compiler isn't smart enough to optimize that code away.
48 
49 After preloading the table, we also try not to access any memory location
50 other than the table and the stack, in order to prevent table entries from
51 being unloaded from L1 cache, until that round is finished.
52 (Some popular CPUs have 2-way associative caches.)
53 */
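// Editor's note (not part of rijndael.cpp): a minimal, self-contained sketch of
// the preload idiom described above. The table name, its size and the 64-byte
// cache-line constant are illustrative assumptions; the library queries the
// line size at run time. Touching one word per cache line pulls the whole table
// into L1 before any data-dependent lookups, and folding the loads into a value
// that feeds the state keeps the compiler from discarding them (u is always 0,
// but the compiler cannot prove that because it originates from a volatile read).

#include <cstddef>
#include <cstdint>

static std::uint32_t kTable[256];                 // hypothetical lookup table

std::uint32_t PreloadThenMix(std::uint32_t state)
{
    const std::size_t lineSize = 64;              // assumed cache-line size
    volatile std::uint32_t zero = 0;
    std::uint32_t u = zero;                       // runtime value unknown to the compiler (actually 0)
    for (std::size_t i = 0; i < sizeof(kTable); i += lineSize)
        u &= *reinterpret_cast<const std::uint32_t*>(
                 reinterpret_cast<const std::uint8_t*>(kTable) + i);
    u &= kTable[255];                             // mirrors the final Te[255] touch in the real code
    return state | u;                             // state is unchanged, but the loads cannot be elided
}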
54 
55 // This is the original introductory comment:
56 
57 /**
58  * version 3.0 (December 2000)
59  *
60  * Optimised ANSI C code for the Rijndael cipher (now AES)
61  *
62  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
63  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
64  * author Paulo Barreto <paulo.barreto@terra.com.br>
65  *
66  * This code is hereby placed in the public domain.
67  *
68  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
69  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
70  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
71  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
72  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
73  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
74  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
75  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
76  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
77  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
78  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79  */
80 
81 #include "pch.h"
82 #include "config.h"
83 
84 #ifndef CRYPTOPP_IMPORTS
85 #ifndef CRYPTOPP_GENERATE_X64_MASM
86 
87 #include "rijndael.h"
88 #include "misc.h"
89 #include "cpu.h"
90 
91 // VS2017 and global optimization bug. Also see
92 // https://github.com/weidai11/cryptopp/issues/649
93 #if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)
94 # ifndef CRYPTOPP_DEBUG
95 # pragma optimize("", off)
96 # pragma optimize("ts", on)
97 # endif
98 #endif
99 
100 NAMESPACE_BEGIN(CryptoPP)
101 
102 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
103 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE))
104 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
105 #endif
106 
107 // Clang intrinsic casts
108 #define M128I_CAST(x) ((__m128i *)(void *)(x))
109 #define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
110 
111 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
112 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
113 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
114 using namespace rdtable;
115 # else
116 static word64 Te[256];
117 # endif
118 static word64 Td[256];
119 #else // Not CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
120 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
121 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
122 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
123 # endif
124 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
125 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
126 #endif // CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS
127 
128 static volatile bool s_TeFilled = false, s_TdFilled = false;
129 
130 ANONYMOUS_NAMESPACE_BEGIN
131 
132 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
133 
134 // Determine whether the range between begin and end overlaps
135 // with the same 4k block offsets as the Te table. Logically,
136 // the code is trying to create the condition:
137 //
138 // Two separate memory pages:
139 //
140 // +-----+ +-----+
141 // |XXXXX| |YYYYY|
142 // |XXXXX| |YYYYY|
143 // | | | |
144 // | | | |
145 // +-----+ +-----+
146 // Te Table Locals
147 //
148 // Have a logical cache view of (X and Y may be inverted):
149 //
150 // +-----+
151 // |XXXXX|
152 // |XXXXX|
153 // |YYYYY|
154 // |YYYYY|
155 // +-----+
156 //
157 static inline bool AliasedWithTable(const byte *begin, const byte *end)
158 {
159  ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
160  ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
161  if (t1 > t0)
162  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
163  else
164  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
165 }
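// Editor's note (not part of the original file): a concrete instance of the
// check above. Addresses that share the same offset within a 4 KB page land in
// the same sets of a typical virtually-indexed L1 cache, e.g.
//   0x00403100 % 4096 == 0x100   and   0x00A0F100 % 4096 == 0x100
// so a Locals workspace at the second address would contend with a Te table at
// the first. When AliasedWithTable() reports a conflict, the caller in
// AdvancedProcessBlocks() below slides the workspace forward 256 bytes and
// tests again until the two ranges no longer collide modulo the page size.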
166 
167 struct Locals
168 {
169  word32 subkeys[4*12], workspace[8];
170  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
171  byte *outBlocks;
172  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
173  size_t regSpill, lengthAndCounterFlag, keysBegin;
174 };
175 
176 const size_t s_aliasPageSize = 4096;
177 const size_t s_aliasBlockSize = 256;
178 const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
179 
180 #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
181 
182 ANONYMOUS_NAMESPACE_END
183 
184 // ************************* Portable Code ************************************
185 
186 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
187  a ^= L(T, 3, byte(t)); t >>= 8;\
188  b ^= L(T, 2, byte(t)); t >>= 8;\
189  c ^= L(T, 1, byte(t)); t >>= 8;\
190  d ^= L(T, 0, t);
191 
192 #define QUARTER_ROUND_LE(t, a, b, c, d) \
193  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
194  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
195  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
196  tempBlock[d] = ((byte *)(Te+t))[1];
197 
198 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
199  #define QUARTER_ROUND_LD(t, a, b, c, d) \
200  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
201  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
202  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
203  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
204 #else
205  #define QUARTER_ROUND_LD(t, a, b, c, d) \
206  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
207  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
208  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
209  tempBlock[d] = Sd[t];
210 #endif
211 
212 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
213 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
214 
215 #if (CRYPTOPP_LITTLE_ENDIAN)
216  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
217  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
218  #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
219  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
220  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
221  #else
222  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
223  #define TL_M(T, i, x) T[i*256 + x]
224  #endif
225 #else
226  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
227  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
228  #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
229  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
230  #define TL_M TL_F
231  #else
232  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
233  #define TL_M(T, i, x) T[i*256 + x]
234  #endif
235 #endif
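// Editor's note (not part of the original file): the macros above are a
// hardened, endianness-aware specialization of the classic T-table round. In
// its textbook form, with T0..T3 holding the four byte-rotations of Te, k the
// round-key word, and s0..s3 the state columns, one output column is
//
//   t0 = T0[s0 >> 24] ^ T1[(s1 >> 16) & 0xff]
//      ^ T2[(s2 >> 8) & 0xff] ^ T3[s3 & 0xff] ^ k;
//
// QUARTER_ROUND plays the same role: it takes one state word t and folds the
// lookups for its four bytes into the accumulators a, b, c and d, while
// TL_F/TL_M select either the compressed 64-bit-entry table or the four
// expanded tables, depending on CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS.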
236 
237 
238 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
239 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
240 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
241 
242 #define f3(x) (f2(x) ^ x)
243 #define f9(x) (f8(x) ^ x)
244 #define fb(x) (f8(x) ^ f2(x) ^ x)
245 #define fd(x) (f8(x) ^ f4(x) ^ x)
246 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
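// Editor's note (not part of the original file): a standalone sanity check of
// the GF(2^8) helpers above, using the AES reduction polynomial
// x^8 + x^4 + x^3 + x + 1 (0x11b). The expected values follow the FIPS-197
// multiplication example: {57}*{02} = {ae}, hence {57}*{03} = {ae} ^ {57} = {f9}.
// C++11 static_assert is used here purely for illustration.

static_assert(f2(0x57) == 0xae, "f2: multiply by {02} in GF(2^8)");
static_assert(f3(0x57) == 0xf9, "f3: multiply by {03} in GF(2^8)");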
247 
248 unsigned int Rijndael::Base::OptimalDataAlignment() const
249 {
250 #if (CRYPTOPP_AESNI_AVAILABLE)
251  if (HasAESNI())
252  return 16; // load __m128i
253 #endif
254 #if (CRYPTOPP_ARM_AES_AVAILABLE)
255  if (HasAES())
256  return 4; // load uint32x4_t
257 #endif
258 #if (CRYPTOGAMS_ARM_AES)
259  // Must use 1 here for Cryptogams AES. Also see
260  // https://github.com/weidai11/cryptopp/issues/683
261  if (HasARMv7())
262  return 1;
263 #endif
264 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
265  if (HasAES())
266  return 16; // load uint32x4_p
267 #endif
268  return BlockTransformation::OptimalDataAlignment();
269 }
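// Editor's note (not part of the original file): a standalone sketch of how a
// caller might honor the alignment hint above. IsAlignedOn() lives in misc.h;
// the helper name and the separate includes are assumptions for this example.

#include "aes.h"    // AES is Rijndael with a 16-byte block
#include "misc.h"

bool InputIsOptimallyAligned(const CryptoPP::byte* input)
{
    CryptoPP::AES::Encryption enc;
    // 16 with AES-NI or Power8, 4 with ARMv8, 1 for Cryptogams ARMv7
    return CryptoPP::IsAlignedOn(input, enc.OptimalDataAlignment());
}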
270 
271 void Rijndael::Base::FillEncTable()
272 {
273  for (int i=0; i<256; i++)
274  {
275  byte x = Se[i];
276 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
277  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
278  Te[i] = word64(y | f3(x))<<32 | y;
279 #else
280  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
281  for (int j=0; j<4; j++)
282  {
283  Te[i+j*256] = y;
284  y = rotrConstant<8>(y);
285  }
286 #endif
287  }
288 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
289  Te[256] = Te[257] = 0;
290 #endif
291  s_TeFilled = true;
292 }
293 
294 void Rijndael::Base::FillDecTable()
295 {
296  for (int i=0; i<256; i++)
297  {
298  byte x = Sd[i];
299 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
300  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
301  Td[i] = word64(y | fb(x))<<32 | y | x;
302 #else
303  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
304  for (int j=0; j<4; j++)
305  {
306  Td[i+j*256] = y;
307  y = rotrConstant<8>(y);
308  }
309 #endif
310  }
311  s_TdFilled = true;
312 }
313 
314 #if (CRYPTOPP_AESNI_AVAILABLE)
315 extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
316 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
317 
318 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
319  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
320 extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
321  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
322 #endif
323 
324 #if (CRYPTOPP_ARM_AES_AVAILABLE)
325 extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
326  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
327 extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
328  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
329 #endif
330 
331 #if (CRYPTOGAMS_ARM_AES)
332 extern "C" int cryptogams_AES_set_encrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
333 extern "C" int cryptogams_AES_set_decrypt_key(const unsigned char *userKey, const int bitLen, word32 *rkey);
334 extern "C" void cryptogams_AES_encrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
335 extern "C" void cryptogams_AES_decrypt_block(const unsigned char *in, unsigned char *out, const word32 *rkey);
336 #endif
337 
338 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
339 extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
340  word32* rk, const byte* Se);
341 
342 extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
343  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
344 extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
345  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
346 #endif
347 
348 #if (CRYPTOGAMS_ARM_AES)
349 int CRYPTOGAMS_set_encrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
350 {
351  return cryptogams_AES_set_encrypt_key(userKey, bitLen, rkey);
352 }
353 int CRYPTOGAMS_set_decrypt_key(const byte *userKey, const int bitLen, word32 *rkey)
354 {
355  return cryptogams_AES_set_decrypt_key(userKey, bitLen, rkey);
356 }
357 void CRYPTOGAMS_encrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
358 {
359  cryptogams_AES_encrypt_block(inBlock, outBlock, rkey);
360  if (xorBlock)
361  xorbuf (outBlock, xorBlock, 16);
362 }
363 void CRYPTOGAMS_decrypt(const byte *inBlock, const byte *xorBlock, byte *outBlock, const word32 *rkey)
364 {
365  cryptogams_AES_decrypt_block(inBlock, outBlock, rkey);
366  if (xorBlock)
367  xorbuf (outBlock, xorBlock, 16);
368 }
369 #endif
370 
371 std::string Rijndael::Base::AlgorithmProvider() const
372 {
373 #if (CRYPTOPP_AESNI_AVAILABLE)
374  if (HasAESNI())
375  return "AESNI";
376 #endif
377 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
378  if (HasSSE2())
379  return "SSE2";
380 #endif
381 #if (CRYPTOPP_ARM_AES_AVAILABLE)
382  if (HasAES())
383  return "ARMv8";
384 #endif
385 #if (CRYPTOGAMS_ARM_AES)
386  if (HasARMv7())
387  return "ARMv7";
388 #endif
389 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
390  if (HasAES())
391  return "Power8";
392 #endif
393  return "C++";
394 }
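// Editor's note (not part of the original file): a small standalone usage
// sketch. AlgorithmProvider() is part of the public Algorithm interface and
// needs no key; the surrounding program is an assumption for this example.

#include "aes.h"
#include <iostream>

int main()
{
    CryptoPP::AES::Encryption enc;
    // prints one of the providers selected above, e.g. "AESNI" or "C++"
    std::cout << "AES provider: " << enc.AlgorithmProvider() << std::endl;
    return 0;
}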
395 
396 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
397 {
398  AssertValidKeyLength(keyLen);
399 
400 #if (CRYPTOGAMS_ARM_AES)
401  if (HasARMv7())
402  {
403  m_rounds = keyLen/4 + 6;
404  m_key.New(4*(14+1)+4);
405 
406  if (IsForwardTransformation())
407  CRYPTOGAMS_set_encrypt_key(userKey, keyLen*8, m_key.begin());
408  else
409  CRYPTOGAMS_set_decrypt_key(userKey, keyLen*8, m_key.begin());
410  return;
411  }
412 #endif
413 
414 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
415  m_aliasBlock.New(s_sizeToAllocate);
416  // The alias block is only used on IA-32 when unaligned data access is in effect.
417  // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
418  m_aliasBlock.SetMark(0);
419 #endif
420 
421  m_rounds = keyLen/4 + 6;
422  m_key.New(4*(m_rounds+1));
423  word32 *rk = m_key;
424 
425 #if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(CRYPTOPP_MSC_VERSION) || CRYPTOPP_MSC_VERSION >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
426  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
427  if (HasAESNI() && HasSSE41())
428  {
429  // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
430  // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
431  Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
432  if (!IsForwardTransformation())
433  Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
434 
435  return;
436  }
437 #endif
438 
439 #if CRYPTOPP_POWER8_AES_AVAILABLE
440  if (HasAES())
441  {
442  // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
443  // The IBM docs on AES suck. Intel's docs on AES-NI put IBM's to shame.
444  Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, Se);
445  return;
446  }
447 #endif
448 
449  GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
450  const word32 *rc = rcon;
451  word32 temp;
452 
453  while (true)
454  {
455  temp = rk[keyLen/4-1];
456  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
457  (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
458  rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
459  rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
460  rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
461  rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
462 
463  if (rk + keyLen/4 + 4 == m_key.end())
464  break;
465 
466  if (keyLen == 24)
467  {
468  rk[10] = rk[ 4] ^ rk[ 9];
469  rk[11] = rk[ 5] ^ rk[10];
470  }
471  else if (keyLen == 32)
472  {
473  temp = rk[11];
474  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
475  rk[13] = rk[ 5] ^ rk[12];
476  rk[14] = rk[ 6] ^ rk[13];
477  rk[15] = rk[ 7] ^ rk[14];
478  }
479  rk += keyLen/4;
480  }
481 
482  rk = m_key;
483 
484  if (IsForwardTransformation())
485  {
486  if (!s_TeFilled)
487  FillEncTable();
488 
489  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
490  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
491  }
492  else
493  {
494  if (!s_TdFilled)
495  FillDecTable();
496 
497  #define InverseMixColumn(x) \
498  TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
499  TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
500 
501  unsigned int i, j;
502  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
503  {
504  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
505  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
506  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
507  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
508  }
509 
510  rk[i+0] = InverseMixColumn(rk[i+0]);
511  rk[i+1] = InverseMixColumn(rk[i+1]);
512  rk[i+2] = InverseMixColumn(rk[i+2]);
513  rk[i+3] = InverseMixColumn(rk[i+3]);
514 
515  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
516  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
517  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
518  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
519  }
520 
521 #if CRYPTOPP_AESNI_AVAILABLE
522  if (HasAESNI())
523  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
524 #endif
525 #if CRYPTOPP_ARM_AES_AVAILABLE
526  if (HasAES())
527  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
528 #endif
529 }
530 
531 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
532 {
533 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
534 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
535  if (HasSSE2())
536 # else
537  if (HasAESNI())
538 # endif
539  {
540  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
541  return;
542  }
543 #endif
544 
545 #if (CRYPTOPP_ARM_AES_AVAILABLE)
546  if (HasAES())
547  {
548  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
549  return;
550  }
551 #endif
552 
553 #if (CRYPTOGAMS_ARM_AES)
554  if (HasARMv7())
555  {
556  CRYPTOGAMS_encrypt(inBlock, xorBlock, outBlock, m_key.begin());
557  return;
558  }
559 #endif
560 
561 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
562  if (HasAES())
563  {
564  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
565  return;
566  }
567 #endif
568 
569  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
570 
571  word32 s0, s1, s2, s3, t0, t1, t2, t3;
572  Block::Get(inBlock)(s0)(s1)(s2)(s3);
573 
574  const word32 *rk = m_key;
575  s0 ^= rk[0];
576  s1 ^= rk[1];
577  s2 ^= rk[2];
578  s3 ^= rk[3];
579  t0 = rk[4];
580  t1 = rk[5];
581  t2 = rk[6];
582  t3 = rk[7];
583  rk += 8;
584 
585  // timing attack countermeasure. see comments at top for more details.
586  // also see http://github.com/weidai11/cryptopp/issues/146
587  const int cacheLineSize = GetCacheLineSize();
588  unsigned int i;
589  volatile word32 _u = 0;
590  word32 u = _u;
591 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
592  for (i=0; i<2048; i+=cacheLineSize)
593 #else
594  for (i=0; i<1024; i+=cacheLineSize)
595 #endif
596  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
597  u &= Te[255];
598  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
599 
600  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
601  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
602  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
603  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
604 
605  // Nr - 2 full rounds:
606  unsigned int r = m_rounds/2 - 1;
607  do
608  {
609  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
610 
611  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
612  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
613  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
614  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
615 
616  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
617 
618  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
619  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
620  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
621  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
622 
623  rk += 8;
624  } while (--r);
625 
626  word32 tbw[4];
627  byte *const tempBlock = (byte *)tbw;
628 
629  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
630  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
631  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
632  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
633 
634  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
635 }
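// Editor's note (not part of the original file): an illustrative standalone
// sketch of single-block encryption through the public API, which dispatches
// into UncheckedSetKey() and ProcessAndXorBlock() above. The all-zero key and
// block are placeholders for the example only.

#include "aes.h"

void EncryptOneBlock()
{
    using namespace CryptoPP;

    byte key[AES::DEFAULT_KEYLENGTH] = {0};   // 16-byte placeholder key
    byte block[AES::BLOCKSIZE] = {0};         // 16-byte block, encrypted in place

    AES::Encryption enc;
    enc.SetKey(key, sizeof(key));             // runs the key schedule (UncheckedSetKey)
    enc.ProcessBlock(block);                  // modes of operation supply the xorBlock path
}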
636 
637 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
638 {
639 #if CRYPTOPP_AESNI_AVAILABLE
640  if (HasAESNI())
641  {
642  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
643  return;
644  }
645 #endif
646 
647 #if (CRYPTOPP_ARM_AES_AVAILABLE)
648  if (HasAES())
649  {
650  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
651  return;
652  }
653 #endif
654 
655 #if (CRYPTOGAMS_ARM_AES)
656  if (HasARMv7())
657  {
658  CRYPTOGAMS_decrypt(inBlock, xorBlock, outBlock, m_key.begin());
659  return;
660  }
661 #endif
662 
663 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
664  if (HasAES())
665  {
666  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
667  return;
668  }
669 #endif
670 
671  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
672 
673  word32 s0, s1, s2, s3, t0, t1, t2, t3;
674  Block::Get(inBlock)(s0)(s1)(s2)(s3);
675 
676  const word32 *rk = m_key;
677  s0 ^= rk[0];
678  s1 ^= rk[1];
679  s2 ^= rk[2];
680  s3 ^= rk[3];
681  t0 = rk[4];
682  t1 = rk[5];
683  t2 = rk[6];
684  t3 = rk[7];
685  rk += 8;
686 
687  // timing attack countermeasure. see comments at top for more details.
688  // also see http://github.com/weidai11/cryptopp/issues/146
689  const int cacheLineSize = GetCacheLineSize();
690  unsigned int i;
691  volatile word32 _u = 0;
692  word32 u = _u;
693 #if defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
694  for (i=0; i<2048; i+=cacheLineSize)
695 #else
696  for (i=0; i<1024; i+=cacheLineSize)
697 #endif
698  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
699  u &= Td[255];
700  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
701 
702  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
703  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
704  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
705  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
706 
707  // Nr - 2 full rounds:
708  unsigned int r = m_rounds/2 - 1;
709  do
710  {
711  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
712 
713  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
714  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
715  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
716  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
717 
718  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
719 
720  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
721  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
722  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
723  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
724 
725  rk += 8;
726  } while (--r);
727 
728 #if !(defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
729  // timing attack countermeasure. see comments at top for more details
730  // If CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS is defined,
731  // QUARTER_ROUND_LD will use Td, which is already preloaded.
732  u = _u;
733  for (i=0; i<256; i+=cacheLineSize)
734  u &= *(const word32 *)(const void *)(Sd+i);
735  u &= *(const word32 *)(const void *)(Sd+252);
736  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
737 #endif
738 
739  word32 tbw[4];
740  byte *const tempBlock = (byte *)tbw;
741 
742  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
743  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
744  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
745  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
746 
747  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
748 }
749 
750 // ************************* Assembly Code ************************************
751 
752 #if CRYPTOPP_MSC_VERSION
753 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
754 #endif
755 
756 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
757 
758 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
759 
760 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k)
761 {
762  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
763 
764 #if CRYPTOPP_BOOL_X86
765 
766 #define L_REG esp
767 #define L_INDEX(i) (L_REG+768+i)
768 #define L_INXORBLOCKS L_INBLOCKS+4
769 #define L_OUTXORBLOCKS L_INBLOCKS+8
770 #define L_OUTBLOCKS L_INBLOCKS+12
771 #define L_INCREMENTS L_INDEX(16*15)
772 #define L_SP L_INDEX(16*16)
773 #define L_LENGTH L_INDEX(16*16+4)
774 #define L_KEYS_BEGIN L_INDEX(16*16+8)
775 
776 #define MOVD movd
777 #define MM(i) mm##i
778 
779 #define MXOR(a,b,c) \
780  AS2( movzx esi, b)\
781  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
782  AS2( pxor MM(a), mm7)\
783 
784 #define MMOV(a,b,c) \
785  AS2( movzx esi, b)\
786  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
787 
788 #else
789 
790 #define L_REG r8
791 #define L_INDEX(i) (L_REG+i)
792 #define L_INXORBLOCKS L_INBLOCKS+8
793 #define L_OUTXORBLOCKS L_INBLOCKS+16
794 #define L_OUTBLOCKS L_INBLOCKS+24
795 #define L_INCREMENTS L_INDEX(16*16)
796 #define L_LENGTH L_INDEX(16*18+8)
797 #define L_KEYS_BEGIN L_INDEX(16*19)
798 
799 #define MOVD mov
800 #define MM_0 r9d
801 #define MM_1 r12d
802 #ifdef __GNUC__
803 #define MM_2 r11d
804 #else
805 #define MM_2 r10d
806 #endif
807 #define MM(i) MM_##i
808 
809 #define MXOR(a,b,c) \
810  AS2( movzx esi, b)\
811  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
812 
813 #define MMOV(a,b,c) \
814  AS2( movzx esi, b)\
815  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
816 
817 #endif
818 
819 #define L_SUBKEYS L_INDEX(0)
820 #define L_SAVED_X L_SUBKEYS
821 #define L_KEY12 L_INDEX(16*12)
822 #define L_LASTROUND L_INDEX(16*13)
823 #define L_INBLOCKS L_INDEX(16*14)
824 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
825 
826 #define XOR(a,b,c) \
827  AS2( movzx esi, b)\
828  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
829 
830 #define MOV(a,b,c) \
831  AS2( movzx esi, b)\
832  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
833 
834 #ifdef CRYPTOPP_GENERATE_X64_MASM
835  ALIGN 8
836  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
837  rex_push_reg rsi
838  push_reg rdi
839  push_reg rbx
840  push_reg r12
841  .endprolog
842  mov L_REG, rcx
843  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
844  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
845 #elif defined(__GNUC__)
846  __asm__ __volatile__
847  (
848  INTEL_NOPREFIX
849  #if CRYPTOPP_BOOL_X64
850  AS2( mov L_REG, rcx)
851  #endif
852  AS_PUSH_IF86(bx)
853  AS_PUSH_IF86(bp)
854  AS2( mov AS_REG_7, WORD_REG(si))
855 #else
856  AS_PUSH_IF86(si)
857  AS_PUSH_IF86(di)
858  AS_PUSH_IF86(bx)
859  AS_PUSH_IF86(bp)
860  AS2( lea AS_REG_7, [Te])
861  AS2( mov edi, [g_cacheLineSize])
862 #endif
863 
864 #if CRYPTOPP_BOOL_X86
865  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
866  AS2( lea esp, [ecx-768])
867 #endif
868 
869  // copy subkeys to stack
870  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
871  AS2( mov WORD_REG(ax), 16)
872  AS2( and WORD_REG(ax), WORD_REG(si))
873  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
874  AS2( movdqa [L_KEY12], xmm3)
875  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
876  AS2( sub WORD_REG(ax), WORD_REG(si))
877  ASL(0)
878  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
879  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
880  AS2( add WORD_REG(si), 16)
881  AS2( cmp WORD_REG(si), 16*12)
882  ATT_NOPREFIX
883  ASJ( jl, 0, b)
884  INTEL_NOPREFIX
885 
886  // read subkeys 0, 1 and last
887  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
888  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
889  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
890  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
891  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
892  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
893 
894  // load table into cache
895  AS2( xor WORD_REG(ax), WORD_REG(ax))
896  ASL(9)
897  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
898  AS2( add WORD_REG(ax), WORD_REG(di))
899  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
900  AS2( add WORD_REG(ax), WORD_REG(di))
901  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
902  AS2( add WORD_REG(ax), WORD_REG(di))
903  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
904  AS2( add WORD_REG(ax), WORD_REG(di))
905  AS2( cmp WORD_REG(ax), 2048)
906  ATT_NOPREFIX
907  ASJ( jl, 9, b)
908  INTEL_NOPREFIX
909  AS1( lfence)
910 
911  AS2( test DWORD PTR [L_LENGTH], 1)
912  ATT_NOPREFIX
913  ASJ( jz, 8, f)
914  INTEL_NOPREFIX
915 
916  // counter mode one-time setup
917  AS2( mov WORD_REG(si), [L_INBLOCKS])
918  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
919  AS2( pxor xmm2, xmm1)
920  AS2( psrldq xmm1, 14)
921  AS2( movd eax, xmm1)
922  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
923  AS2( MOVD MM(2), eax)
924 #if CRYPTOPP_BOOL_X86
925  AS2( mov eax, 1)
926  AS2( movd mm3, eax)
927 #endif
928 
929  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
930  AS2( movd eax, xmm2)
931  AS2( psrldq xmm2, 4)
932  AS2( movd edi, xmm2)
933  AS2( psrldq xmm2, 4)
934  MXOR( 1, al, 0) // 0
935  XOR( edx, ah, 1) // 1
936  AS2( shr eax, 16)
937  XOR( ecx, al, 2) // 2
938  XOR( ebx, ah, 3) // 3
939  AS2( mov eax, edi)
940  AS2( movd edi, xmm2)
941  AS2( psrldq xmm2, 4)
942  XOR( ebx, al, 0) // 4
943  MXOR( 1, ah, 1) // 5
944  AS2( shr eax, 16)
945  XOR( edx, al, 2) // 6
946  XOR( ecx, ah, 3) // 7
947  AS2( mov eax, edi)
948  AS2( movd edi, xmm2)
949  XOR( ecx, al, 0) // 8
950  XOR( ebx, ah, 1) // 9
951  AS2( shr eax, 16)
952  MXOR( 1, al, 2) // 10
953  XOR( edx, ah, 3) // 11
954  AS2( mov eax, edi)
955  XOR( edx, al, 0) // 12
956  XOR( ecx, ah, 1) // 13
957  AS2( shr eax, 16)
958  XOR( ebx, al, 2) // 14
959  AS2( psrldq xmm2, 3)
960 
961  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
962  AS2( mov eax, [L_KEY12+0*4])
963  AS2( mov edi, [L_KEY12+2*4])
964  AS2( MOVD MM(0), [L_KEY12+3*4])
965  MXOR( 0, cl, 3) /* 11 */
966  XOR( edi, bl, 3) /* 7 */
967  MXOR( 0, bh, 2) /* 6 */
968  AS2( shr ebx, 16) /* 4,5 */
969  XOR( eax, bl, 1) /* 5 */
970  MOV( ebx, bh, 0) /* 4 */
971  AS2( xor ebx, [L_KEY12+1*4])
972  XOR( eax, ch, 2) /* 10 */
973  AS2( shr ecx, 16) /* 8,9 */
974  XOR( eax, dl, 3) /* 15 */
975  XOR( ebx, dh, 2) /* 14 */
976  AS2( shr edx, 16) /* 12,13 */
977  XOR( edi, ch, 0) /* 8 */
978  XOR( ebx, cl, 1) /* 9 */
979  XOR( edi, dl, 1) /* 13 */
980  MXOR( 0, dh, 0) /* 12 */
981 
982  AS2( movd ecx, xmm2)
983  AS2( MOVD edx, MM(1))
984  AS2( MOVD [L_SAVED_X+3*4], MM(0))
985  AS2( mov [L_SAVED_X+0*4], eax)
986  AS2( mov [L_SAVED_X+1*4], ebx)
987  AS2( mov [L_SAVED_X+2*4], edi)
988  ATT_NOPREFIX
989  ASJ( jmp, 5, f)
990  INTEL_NOPREFIX
991  ASL(3)
992  // non-counter mode per-block setup
993  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
994  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
995  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
996  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
997  ASL(8)
998  AS2( mov WORD_REG(ax), [L_INBLOCKS])
999  AS2( movdqu xmm2, [WORD_REG(ax)])
1000  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
1001  AS2( movdqu xmm5, [WORD_REG(si)])
1002  AS2( pxor xmm2, xmm1)
1003  AS2( pxor xmm2, xmm5)
1004 
1005  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
1006  AS2( movd eax, xmm2)
1007  AS2( psrldq xmm2, 4)
1008  AS2( movd edi, xmm2)
1009  AS2( psrldq xmm2, 4)
1010  MXOR( 1, al, 0) // 0
1011  XOR( edx, ah, 1) // 1
1012  AS2( shr eax, 16)
1013  XOR( ecx, al, 2) // 2
1014  XOR( ebx, ah, 3) // 3
1015  AS2( mov eax, edi)
1016  AS2( movd edi, xmm2)
1017  AS2( psrldq xmm2, 4)
1018  XOR( ebx, al, 0) // 4
1019  MXOR( 1, ah, 1) // 5
1020  AS2( shr eax, 16)
1021  XOR( edx, al, 2) // 6
1022  XOR( ecx, ah, 3) // 7
1023  AS2( mov eax, edi)
1024  AS2( movd edi, xmm2)
1025  XOR( ecx, al, 0) // 8
1026  XOR( ebx, ah, 1) // 9
1027  AS2( shr eax, 16)
1028  MXOR( 1, al, 2) // 10
1029  XOR( edx, ah, 3) // 11
1030  AS2( mov eax, edi)
1031  XOR( edx, al, 0) // 12
1032  XOR( ecx, ah, 1) // 13
1033  AS2( shr eax, 16)
1034  XOR( ebx, al, 2) // 14
1035  MXOR( 1, ah, 3) // 15
1036  AS2( MOVD eax, MM(1))
1037 
1038  AS2( add L_REG, [L_KEYS_BEGIN])
1039  AS2( add L_REG, 4*16)
1040  ATT_NOPREFIX
1041  ASJ( jmp, 2, f)
1042  INTEL_NOPREFIX
1043  ASL(1)
1044  // counter-mode per-block setup
1045  AS2( MOVD ecx, MM(2))
1046  AS2( MOVD edx, MM(1))
1047  AS2( mov eax, [L_SAVED_X+0*4])
1048  AS2( mov ebx, [L_SAVED_X+1*4])
1049  AS2( xor cl, ch)
1050  AS2( and WORD_REG(cx), 255)
1051  ASL(5)
1052 #if CRYPTOPP_BOOL_X86
1053  AS2( paddb MM(2), mm3)
1054 #else
1055  AS2( add MM(2), 1)
1056 #endif
1057  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
1058  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
1059  XOR( ebx, dl, 3)
1060  MOV( ecx, dh, 2)
1061  AS2( shr edx, 16)
1062  AS2( xor ecx, [L_SAVED_X+2*4])
1063  XOR( eax, dh, 0)
1064  MOV( edx, dl, 1)
1065  AS2( xor edx, [L_SAVED_X+3*4])
1066 
1067  AS2( add L_REG, [L_KEYS_BEGIN])
1068  AS2( add L_REG, 3*16)
1069  ATT_NOPREFIX
1070  ASJ( jmp, 4, f)
1071  INTEL_NOPREFIX
1072 
1073 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
1074 // out: eax, ebx, edi, mm0
1075 #define ROUND() \
1076  MXOR( 0, cl, 3) /* 11 */\
1077  AS2( mov cl, al) /* 8,9,10,3 */\
1078  XOR( edi, ah, 2) /* 2 */\
1079  AS2( shr eax, 16) /* 0,1 */\
1080  XOR( edi, bl, 3) /* 7 */\
1081  MXOR( 0, bh, 2) /* 6 */\
1082  AS2( shr ebx, 16) /* 4,5 */\
1083  MXOR( 0, al, 1) /* 1 */\
1084  MOV( eax, ah, 0) /* 0 */\
1085  XOR( eax, bl, 1) /* 5 */\
1086  MOV( ebx, bh, 0) /* 4 */\
1087  XOR( eax, ch, 2) /* 10 */\
1088  XOR( ebx, cl, 3) /* 3 */\
1089  AS2( shr ecx, 16) /* 8,9 */\
1090  XOR( eax, dl, 3) /* 15 */\
1091  XOR( ebx, dh, 2) /* 14 */\
1092  AS2( shr edx, 16) /* 12,13 */\
1093  XOR( edi, ch, 0) /* 8 */\
1094  XOR( ebx, cl, 1) /* 9 */\
1095  XOR( edi, dl, 1) /* 13 */\
1096  MXOR( 0, dh, 0) /* 12 */\
1097 
1098  ASL(2) // 2-round loop
1099  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
1100  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
1101  ROUND()
1102  AS2( mov ecx, edi)
1103  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
1104  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
1105  AS2( MOVD edx, MM(0))
1106 
1107  ASL(4)
1108  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
1109  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
1110  ROUND()
1111  AS2( mov ecx, edi)
1112  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
1113  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
1114  AS2( MOVD edx, MM(0))
1115 
1116  AS2( add L_REG, 32)
1117  AS2( test L_REG, 255)
1118  ATT_NOPREFIX
1119  ASJ( jnz, 2, b)
1120  INTEL_NOPREFIX
1121  AS2( sub L_REG, 16*16)
1122 
1123 #define LAST(a, b, c) \
1124  AS2( movzx esi, a )\
1125  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1126  AS2( movzx esi, b )\
1127  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
1128  AS2( mov WORD PTR [L_LASTROUND+c], di )\
1129 
1130  // last round
1131  LAST(ch, dl, 2)
1132  LAST(dh, al, 6)
1133  AS2( shr edx, 16)
1134  LAST(ah, bl, 10)
1135  AS2( shr eax, 16)
1136  LAST(bh, cl, 14)
1137  AS2( shr ebx, 16)
1138  LAST(dh, al, 12)
1139  AS2( shr ecx, 16)
1140  LAST(ah, bl, 0)
1141  LAST(bh, cl, 4)
1142  LAST(ch, dl, 8)
1143 
1144  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
1145  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
1146 
1147  AS2( mov WORD_REG(cx), [L_LENGTH])
1148  AS2( sub WORD_REG(cx), 16)
1149 
1150  AS2( movdqu xmm2, [WORD_REG(ax)])
1151  AS2( pxor xmm2, xmm4)
1152 
1153 #if CRYPTOPP_BOOL_X86
1154  AS2( movdqa xmm0, [L_INCREMENTS])
1155  AS2( paddd xmm0, [L_INBLOCKS])
1156  AS2( movdqa [L_INBLOCKS], xmm0)
1157 #else
1158  AS2( movdqa xmm0, [L_INCREMENTS+16])
1159  AS2( paddq xmm0, [L_INBLOCKS+16])
1160  AS2( movdqa [L_INBLOCKS+16], xmm0)
1161 #endif
1162 
1163  AS2( pxor xmm2, [L_LASTROUND])
1164  AS2( movdqu [WORD_REG(bx)], xmm2)
1165 
1166  ATT_NOPREFIX
1167  ASJ( jle, 7, f)
1168  INTEL_NOPREFIX
1169  AS2( mov [L_LENGTH], WORD_REG(cx))
1170  AS2( test WORD_REG(cx), 1)
1171  ATT_NOPREFIX
1172  ASJ( jnz, 1, b)
1173  INTEL_NOPREFIX
1174 #if CRYPTOPP_BOOL_X64
1175  AS2( movdqa xmm0, [L_INCREMENTS])
1176  AS2( paddq xmm0, [L_INBLOCKS])
1177  AS2( movdqa [L_INBLOCKS], xmm0)
1178 #endif
1179  ATT_NOPREFIX
1180  ASJ( jmp, 3, b)
1181  INTEL_NOPREFIX
1182 
1183  ASL(7)
1184  // erase keys on stack
1185  AS2( xorps xmm0, xmm0)
1186  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
1187  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
1188  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
1189  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
1190  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1191  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1192  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1193  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1194  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1195  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1196  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1197  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1198  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1199  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1200  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1201 #if CRYPTOPP_BOOL_X86
1202  AS2( mov esp, [L_SP])
1203  AS1( emms)
1204 #endif
1205  AS_POP_IF86(bp)
1206  AS_POP_IF86(bx)
1207 #if defined(CRYPTOPP_MSC_VERSION) && CRYPTOPP_BOOL_X86
1208  AS_POP_IF86(di)
1209  AS_POP_IF86(si)
1210  AS1(ret)
1211 #endif
1212 #ifdef CRYPTOPP_GENERATE_X64_MASM
1213  pop r12
1214  pop rbx
1215  pop rdi
1216  pop rsi
1217  ret
1218  Rijndael_Enc_AdvancedProcessBlocks ENDP
1219 #endif
1220 #ifdef __GNUC__
1221  ATT_PREFIX
1222  :
1223  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1224  : "memory", "cc", "%eax"
1225  #if CRYPTOPP_BOOL_X64
1226  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1227  #endif
1228  );
1229 #endif
1230 }
1231 
1232 #endif
1233 
1234 #ifndef CRYPTOPP_GENERATE_X64_MASM
1235 
1236 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1237 extern "C" {
1238 void Rijndael_Enc_AdvancedProcessBlocks_SSE2(void *locals, const word32 *k);
1239 }
1240 #endif
1241 
1242 #if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1243 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1244 {
1245 #if CRYPTOPP_AESNI_AVAILABLE
1246  if (HasAESNI())
1247  return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1248 #endif
1249 #if CRYPTOPP_ARM_AES_AVAILABLE
1250  if (HasAES())
1251  return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1252 #endif
1253 #if CRYPTOPP_POWER8_AES_AVAILABLE
1254  if (HasAES())
1255  return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1256 #endif
1257 
1258 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1259  if (HasSSE2())
1260  {
1261  if (length < BLOCKSIZE)
1262  return length;
1263 
1264  static const byte *zeros = (const byte*)(Te+256);
1265  m_aliasBlock.SetMark(m_aliasBlock.size());
1266  byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1267 
1268  // round up to nearest 256 byte boundary
1269  space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1270  while (AliasedWithTable(space, space + sizeof(Locals)))
1271  {
1272  space += 256;
1273  CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1274  }
1275 
1276  size_t increment = BLOCKSIZE;
1277  if (flags & BT_ReverseDirection)
1278  {
1279  CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1280  inBlocks += length - BLOCKSIZE;
1281  xorBlocks += length - BLOCKSIZE;
1282  outBlocks += length - BLOCKSIZE;
1283  increment = 0-increment;
1284  }
1285 
1286  Locals &locals = *(Locals *)(void *)space;
1287 
1288  locals.inBlocks = inBlocks;
1289  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1290  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1291  locals.outBlocks = outBlocks;
1292 
1293  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1294  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1295  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1296  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1297 
1298  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1299  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1300  locals.keysBegin = (12-keysToCopy)*16;
1301 
1302  Rijndael_Enc_AdvancedProcessBlocks_SSE2(&locals, m_key);
1303 
1304  return length % BLOCKSIZE;
1305  }
1306 #endif
1307 
1308  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1309 }
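// Editor's note (not part of the original file): an illustrative standalone
// call into the multi-block entry point above via the public
// BlockTransformation interface. The flag value and buffer sizes are chosen
// for the example; the return value is the number of bytes left unprocessed.

#include "aes.h"

size_t EncryptFourBlocks(const CryptoPP::byte key[16],
                         const CryptoPP::byte in[64], CryptoPP::byte out[64])
{
    using namespace CryptoPP;
    AES::Encryption enc(key, 16);
    return enc.AdvancedProcessBlocks(in, NULLPTR, out, 64,
                                     BlockTransformation::BT_AllowParallel);
}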
1310 
1311 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1312 {
1313 #if CRYPTOPP_AESNI_AVAILABLE
1314  if (HasAESNI())
1315  return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1316 #endif
1317 #if CRYPTOPP_ARM_AES_AVAILABLE
1318  if (HasAES())
1319  return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1320 #endif
1321 #if CRYPTOPP_POWER8_AES_AVAILABLE
1322  if (HasAES())
1323  return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1324 #endif
1325 
1326  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1327 }
1328 #endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1329 
1330 NAMESPACE_END
1331 
1332 #endif
1333 #endif