Crypto++  5.6.5
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2017: Added support for ARM AES instructions via compiler intrinsics.
9 */
10 
11 /*
12 July 2010: Added support for AES-NI instructions via compiler intrinsics.
13 */
14 
15 /*
16 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
17 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
18 and Peter Schwabe in their paper "New AES software speed records". The round
19 function was also modified to include a trick similar to one in Brian Gladman's
20 x86 assembly code, doing an 8-bit register move to minimize the number of
21 register spills. Also switched to compressed tables and copying round keys to
22 the stack.
23 
24 The C++ implementation now uses compressed tables if
25 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
26 */
27 
28 /*
29 July 2006: Defense against timing attacks was added by Wei Dai.
30 
31 The code now uses smaller tables in the first and last rounds,
32 and preloads them into L1 cache before usage (by loading at least
33 one element in each cache line).
34 
35 We try to delay subsequent accesses to each table (used in the first
36 and last rounds) until all of the table has been preloaded. Hopefully
37 the compiler isn't smart enough to optimize that code away.
38 
39 After preloading the table, we also try not to access any memory location
40 other than the table and the stack, in order to prevent table entries from
41 being unloaded from L1 cache, until that round is finished.
42 (Some popular CPUs have 2-way associative caches.)
43 */
44 
45 // This is the original introductory comment:
46 
47 /**
48  * version 3.0 (December 2000)
49  *
50  * Optimised ANSI C code for the Rijndael cipher (now AES)
51  *
52  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
53  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
54  * author Paulo Barreto <paulo.barreto@terra.com.br>
55  *
56  * This code is hereby placed in the public domain.
57  *
58  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
59  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
60  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
62  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
63  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
64  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
65  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
66  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
67  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
68  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
69  */
70 
71 #include "pch.h"
72 #include "config.h"
73 
74 #ifndef CRYPTOPP_IMPORTS
75 #ifndef CRYPTOPP_GENERATE_X64_MASM
76 
77 #include "rijndael.h"
78 #include "misc.h"
79 #include "cpu.h"
80 
81 NAMESPACE_BEGIN(CryptoPP)
82 
83 // Clang 3.3 integrated assembler crash on Linux
84 #if CRYPTOPP_BOOL_X32 || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION < 30400))
85 # define CRYPTOPP_DISABLE_RIJNDAEL_ASM
86 #endif
87 
88 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
89 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
90 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
91 #endif
92 
93 // Clang __m128i casts
94 #define M128I_CAST(x) ((__m128i *)(void *)(x))
95 #define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))
96 
97 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
98 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
99 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
100 using namespace rdtable;
101 # else
102 static word64 Te[256];
103 # endif
104 static word64 Td[256];
105 #else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
106 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
107 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
108 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
109 # endif
110 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
111 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
112 #endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
113 
114 static volatile bool s_TeFilled = false, s_TdFilled = false;
115 
116 ANONYMOUS_NAMESPACE_BEGIN
117 
118 CRYPTOPP_ALIGN_DATA(16)
119 const word32 s_one[] = {0, 0, 0, 1<<24};
120 
121 /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
122 CRYPTOPP_ALIGN_DATA(16)
123 const word32 s_rconLE[] = {
124  0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
125 };
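// Note: interpreted as 16 bytes on a little-endian machine, s_one is the
// 128-bit big-endian value 1 (presumably the increment for a CTR-mode counter
// block), and s_rconLE holds the ten AES round constants x^(i-1) in GF(2^8)
// (0x01..0x36) in the low byte rather than the usual rcon<<24 position.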
126 
127 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
128 
129 // Determine whether the range between begin and end overlaps
130 // with the same 4k block offsets as the Te table. Logically,
131 // the code is trying to create the condition:
132 //
133 // Two separate memory pages:
134 //
135 // +-----+ +-----+
136 // |XXXXX| |YYYYY|
137 // |XXXXX| |YYYYY|
138 // | | | |
139 // | | | |
140 // +-----+ +-----+
141 // Te Table Locals
142 //
143 // Have a logical cache view of (X and Y may be inverted):
144 //
145 // +-----+
146 // |XXXXX|
147 // |XXXXX|
148 // |YYYYY|
149 // |YYYYY|
150 // +-----+
151 //
152 static inline bool AliasedWithTable(const byte *begin, const byte *end)
153 {
154  ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
155  ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
156  if (t1 > t0)
157  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
158  else
159  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
160 }
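// Note: the test above works on offsets within a 4 KB page. s0/s1 are the
// page offsets of begin/end, and t0/t1 those of the table's start and
// one-past-end. When the table does not wrap past a page boundary (t1 > t0),
// the ranges collide if begin lands in [t0, t1) or end lands in (t0, t1];
// the else branch handles the wrapped table. AdvancedProcessBlocks (below)
// slides its Locals block in 256-byte steps until this returns false, so the
// stack workspace and Te never map to the same cache sets.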
161 
162 struct Locals
163 {
164  word32 subkeys[4*12], workspace[8];
165  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
166  byte *outBlocks;
167  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
168  size_t regSpill, lengthAndCounterFlag, keysBegin;
169 };
170 
171 const size_t s_aliasPageSize = 4096;
172 const size_t s_aliasBlockSize = 256;
173 const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
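// Note: the alias block is over-allocated by a full page plus one 256-byte
// block so that AdvancedProcessBlocks can round the Locals area up to a
// 256-byte boundary and then slide it within the page until AliasedWithTable
// reports no overlap with Te.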
174 
175 #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
176 
177 ANONYMOUS_NAMESPACE_END
178 
179 // ************************* Portable Code ************************************
180 
181 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
182  a ^= L(T, 3, byte(t)); t >>= 8;\
183  b ^= L(T, 2, byte(t)); t >>= 8;\
184  c ^= L(T, 1, byte(t)); t >>= 8;\
185  d ^= L(T, 0, t);
186 
187 #define QUARTER_ROUND_LE(t, a, b, c, d) \
188  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
189  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
190  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
191  tempBlock[d] = ((byte *)(Te+t))[1];
192 
193 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
194  #define QUARTER_ROUND_LD(t, a, b, c, d) \
195  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
196  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
197  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
198  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
199 #else
200  #define QUARTER_ROUND_LD(t, a, b, c, d) \
201  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
202  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
203  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
204  tempBlock[d] = Sd[t];
205 #endif
206 
207 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
208 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
209 
210 #ifdef CRYPTOPP_LITTLE_ENDIAN
211  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
212  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
213  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
214  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
215  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
216  #else
217  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
218  #define TL_M(T, i, x) T[i*256 + x]
219  #endif
220 #else
221  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
222  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
223  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
224  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
225  #define TL_M TL_F
226  #else
227  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
228  #define TL_M(T, i, x) T[i*256 + x]
229  #endif
230 #endif
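// Note on naming: QUARTER_ROUND_FE/FD are the first encryption/decryption
// round (they use the TL_F lookup, which rotates or re-indexes the table),
// QUARTER_ROUND_E/D are the full middle rounds (TL_M), and QUARTER_ROUND_LE/LD
// are the last rounds, which use only the S-box byte of each table entry.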
231 
232 
233 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
234 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
235 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
236 
237 #define f3(x) (f2(x) ^ x)
238 #define f9(x) (f8(x) ^ x)
239 #define fb(x) (f8(x) ^ f2(x) ^ x)
240 #define fd(x) (f8(x) ^ f4(x) ^ x)
241 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
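// Note: f2() is the GF(2^8) doubling ("xtime") with the AES polynomial 0x11b;
// e.g. f2(0x80) = 0x100 ^ 0x11b = 0x1b. f4() and f8() apply the same reduction
// for the additional overflow bits, and f3/f9/fb/fd/fe build the MixColumns
// coefficient 03 and the InvMixColumns coefficients 09, 0B, 0D and 0E.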
242 
243 void Rijndael::Base::FillEncTable()
244 {
245  for (int i=0; i<256; i++)
246  {
247  byte x = Se[i];
248 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
249  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
250  Te[i] = word64(y | f3(x))<<32 | y;
251 #else
252  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
253  for (int j=0; j<4; j++)
254  {
255  Te[i+j*256] = y;
256  y = rotrConstant<8>(y);
257  }
258 #endif
259  }
260 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
261  Te[256] = Te[257] = 0;
262 #endif
263  s_TeFilled = true;
264 }
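// Note: in the compressed layout each word64 Te entry for x = Se[i] holds the
// bytes {00, x, x, 2x, 3x, x, x, 2x} (little-endian byte order); any four
// consecutive bytes at offsets 1..4 give one of the four rotations of the
// MixColumns column, which is what TL_M/TL_F index with x*8 + offset.
// Te[256] and Te[257] stay zero and double as the 16-byte "zeros" block used
// by the SSE2 path below. The uncompressed layout is the classic set of four
// 256-entry word32 tables, each a byte rotation of the first.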
265 
266 void Rijndael::Base::FillDecTable()
267 {
268  for (int i=0; i<256; i++)
269  {
270  byte x = Sd[i];
271 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
272  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
273  Td[i] = word64(y | fb(x))<<32 | y | x;
274 #else
275  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
276  for (int j=0; j<4; j++)
277  {
278  Td[i+j*256] = y;
279  y = rotrConstant<8>(y);
280  }
281 #endif
282  }
283  s_TdFilled = true;
284 }
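// Note: each compressed Td entry packs Sd[i] in its least significant byte
// together with the InvMixColumns products 0D, 09, 0E and 0B times Sd[i];
// the GetNativeByteOrder()*7 offset in QUARTER_ROUND_LD picks that plain Sd
// byte back out on either endianness for the final round.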
285 
286 #if (CRYPTOPP_AESNI_AVAILABLE)
287 extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk, unsigned int rounds);
288 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
289 
290 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
291  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
292 extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
293  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
294 #endif
295 
296 #if (CRYPTOPP_ARM_AES_AVAILABLE)
297 extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
298  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
299 extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds,
300  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
301 #endif
302 
303 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
304 extern void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen,
305  word32* rk, const word32* rc, const byte* Se);
306 
307 extern size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
308  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
309 extern size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(const word32 *subkeys, size_t rounds,
310  const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
311 #endif
312 
313 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
314 {
315  AssertValidKeyLength(keyLen);
316 
317 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
318  m_aliasBlock.New(s_sizeToAllocate);
319  // The alias block is only used on IA-32 when unaligned data access is in effect.
320  // Setting the low water mark to 0 avoids zeroization when m_aliasBlock is unused.
321  m_aliasBlock.SetMark(0);
322 #endif
323 
324  m_rounds = keyLen/4 + 6;
325  m_key.New(4*(m_rounds+1));
326  word32 *rk = m_key;
327 
328 #if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
329  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
330  if (HasAESNI() && HasSSE41())
331  {
332  // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
333  // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
334  Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk, m_rounds);
335  if (!IsForwardTransformation())
336  Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
337 
338  return;
339  }
340 #endif
341 
342 #if CRYPTOPP_POWER8_AES_AVAILABLE
343  if (HasAES())
344  {
345  // We still need rcon and Se to fall back to C/C++ for AES-192 and AES-256.
346  // The IBM docs on AES suck. Intel's docs on AESNI put IBM's to shame.
347  Rijndael_UncheckedSetKey_POWER8(userKey, keyLen, rk, rcon, Se);
348  return;
349  }
350 #endif
351 
352  GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen);
353  const word32 *rc = rcon;
354  word32 temp;
355 
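 // The loop below is the FIPS 197 key expansion with Nk = keyLen/4 words and
 // Nr = Nk + 6 rounds:
 //   W[i] = W[i-Nk] ^ SubWord(RotWord(W[i-1])) ^ Rcon[i/Nk]   if i % Nk == 0
 //   W[i] = W[i-Nk] ^ SubWord(W[i-1])                         if Nk > 6 and i % Nk == 4
 //   W[i] = W[i-Nk] ^ W[i-1]                                  otherwise
 // computed Nk words at a time until 4*(Nr+1) words have been produced.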
356  while (true)
357  {
358  temp = rk[keyLen/4-1];
359  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^
360  (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
361  rk[keyLen/4] = rk[0] ^ x ^ *(rc++);
362  rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
363  rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
364  rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
365 
366  if (rk + keyLen/4 + 4 == m_key.end())
367  break;
368 
369  if (keyLen == 24)
370  {
371  rk[10] = rk[ 4] ^ rk[ 9];
372  rk[11] = rk[ 5] ^ rk[10];
373  }
374  else if (keyLen == 32)
375  {
376  temp = rk[11];
377  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
378  rk[13] = rk[ 5] ^ rk[12];
379  rk[14] = rk[ 6] ^ rk[13];
380  rk[15] = rk[ 7] ^ rk[14];
381  }
382  rk += keyLen/4;
383  }
384 
385  rk = m_key;
386 
387  if (IsForwardTransformation())
388  {
389  if (!s_TeFilled)
390  FillEncTable();
391 
393  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
394  }
395  else
396  {
397  if (!s_TdFilled)
398  FillDecTable();
399 
400  #define InverseMixColumn(x) \
401  TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ \
402  TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
403 
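 // Applying InverseMixColumn to the inner round keys (and swapping them
 // end-for-end below) yields the key schedule of the "equivalent inverse
 // cipher" (FIPS 197, Section 5.3.5), so decryption can reuse the same
 // table-lookup round structure as encryption.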
404  unsigned int i, j;
405  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
406  {
407  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
408  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
409  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
410  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
411  }
412 
413  rk[i+0] = InverseMixColumn(rk[i+0]);
414  rk[i+1] = InverseMixColumn(rk[i+1]);
415  rk[i+2] = InverseMixColumn(rk[i+2]);
416  rk[i+3] = InverseMixColumn(rk[i+3]);
417 
418  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
419  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
420  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
421  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
422  }
423 
424 #if CRYPTOPP_AESNI_AVAILABLE
425  if (HasAESNI())
426  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
427 #endif
428 #if CRYPTOPP_ARM_AES_AVAILABLE
429  if (HasAES())
430  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
431 #endif
432 }
433 
434 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
435 {
436 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
437 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
438  if (HasSSE2())
439 # else
440  if (HasAESNI())
441 # endif
442  {
443  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
444  return;
445  }
446 #endif
447 
448 #if (CRYPTOPP_ARM_AES_AVAILABLE)
449  if (HasAES())
450  {
451  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
452  return;
453  }
454 #endif
455 
456 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
457  if (HasAES())
458  {
459  (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
460  return;
461  }
462 #endif
463 
465 
466  word32 s0, s1, s2, s3, t0, t1, t2, t3;
467  Block::Get(inBlock)(s0)(s1)(s2)(s3);
468 
469  const word32 *rk = m_key;
470  s0 ^= rk[0];
471  s1 ^= rk[1];
472  s2 ^= rk[2];
473  s3 ^= rk[3];
474  t0 = rk[4];
475  t1 = rk[5];
476  t2 = rk[6];
477  t3 = rk[7];
478  rk += 8;
479 
480  // timing attack countermeasure. see comments at top for more details.
481  // also see http://github.com/weidai11/cryptopp/issues/146
482  const int cacheLineSize = GetCacheLineSize();
483  unsigned int i;
484  volatile word32 _u = 0;
485  word32 u = _u;
486 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
487  for (i=0; i<2048; i+=cacheLineSize)
488 #else
489  for (i=0; i<1024; i+=cacheLineSize)
490 #endif
491  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
492  u &= Te[255];
493  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
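 // Note: _u is volatile, so the compiler cannot prove u is zero and must keep
 // the loads above, which touch one word per cache line across the part of Te
 // used in the first and last rounds (2 KB compressed, 1 KB otherwise). Since
 // u really is zero, the OR into s0..s3 leaves the state unchanged but makes
 // the preload part of the data flow, so it cannot be optimized away or
 // reordered past the first round.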
494 
495  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
496  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
497  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
498  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
499 
500  // Nr - 2 full rounds:
501  unsigned int r = m_rounds/2 - 1;
502  do
503  {
504  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
505 
506  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
507  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
508  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
509  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
510 
511  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
512 
513  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
514  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
515  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
516  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
517 
518  rk += 8;
519  } while (--r);
520 
521  word32 tbw[4];
522  byte *const tempBlock = (byte *)tbw;
523 
524  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
525  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
526  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
527  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
528 
529  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
530 }
531 
532 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
533 {
534 #if CRYPTOPP_AESNI_AVAILABLE
535  if (HasAESNI())
536  {
537  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
538  return;
539  }
540 #endif
541 
542 #if (CRYPTOPP_ARM_AES_AVAILABLE)
543  if (HasAES())
544  {
545  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
546  return;
547  }
548 #endif
549 
550 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
551  if (HasAES())
552  {
553  (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
554  return;
555  }
556 #endif
557 
559 
560  word32 s0, s1, s2, s3, t0, t1, t2, t3;
561  Block::Get(inBlock)(s0)(s1)(s2)(s3);
562 
563  const word32 *rk = m_key;
564  s0 ^= rk[0];
565  s1 ^= rk[1];
566  s2 ^= rk[2];
567  s3 ^= rk[3];
568  t0 = rk[4];
569  t1 = rk[5];
570  t2 = rk[6];
571  t3 = rk[7];
572  rk += 8;
573 
574  // timing attack countermeasure. see comments at top for more details.
575  // also see http://github.com/weidai11/cryptopp/issues/146
576  const int cacheLineSize = GetCacheLineSize();
577  unsigned int i;
578  volatile word32 _u = 0;
579  word32 u = _u;
580 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
581  for (i=0; i<2048; i+=cacheLineSize)
582 #else
583  for (i=0; i<1024; i+=cacheLineSize)
584 #endif
585  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
586  u &= Td[255];
587  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
588 
589  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
590  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
591  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
592  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
593 
594  // Nr - 2 full rounds:
595  unsigned int r = m_rounds/2 - 1;
596  do
597  {
598  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
599 
600  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
601  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
602  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
603  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
604 
605  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
606 
607  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
608  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
609  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
610  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
611 
612  rk += 8;
613  } while (--r);
614 
615 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
616  // timing attack countermeasure. see comments at top for more details
617  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
618  // QUARTER_ROUND_LD will use Td, which is already preloaded.
619  u = _u;
620  for (i=0; i<256; i+=cacheLineSize)
621  u &= *(const word32 *)(const void *)(Sd+i);
622  u &= *(const word32 *)(const void *)(Sd+252);
623  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
624 #endif
625 
626  word32 tbw[4];
627  byte *const tempBlock = (byte *)tbw;
628 
629  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
630  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
631  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
632  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
633 
634  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
635 }
636 
637 // ************************* Assembly Code ************************************
638 
639 #if CRYPTOPP_MSC_VERSION
640 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
641 #endif
642 
643 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
644 
645 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
646 
647 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
648 {
649  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
650 
651 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
652 
653 #define L_REG esp
654 #define L_INDEX(i) (L_REG+768+i)
655 #define L_INXORBLOCKS L_INBLOCKS+4
656 #define L_OUTXORBLOCKS L_INBLOCKS+8
657 #define L_OUTBLOCKS L_INBLOCKS+12
658 #define L_INCREMENTS L_INDEX(16*15)
659 #define L_SP L_INDEX(16*16)
660 #define L_LENGTH L_INDEX(16*16+4)
661 #define L_KEYS_BEGIN L_INDEX(16*16+8)
662 
663 #define MOVD movd
664 #define MM(i) mm##i
665 
666 #define MXOR(a,b,c) \
667  AS2( movzx esi, b)\
668  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
669  AS2( pxor MM(a), mm7)\
670 
671 #define MMOV(a,b,c) \
672  AS2( movzx esi, b)\
673  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
674 
675 #else
676 
677 #define L_REG r8
678 #define L_INDEX(i) (L_REG+i)
679 #define L_INXORBLOCKS L_INBLOCKS+8
680 #define L_OUTXORBLOCKS L_INBLOCKS+16
681 #define L_OUTBLOCKS L_INBLOCKS+24
682 #define L_INCREMENTS L_INDEX(16*16)
683 #define L_LENGTH L_INDEX(16*18+8)
684 #define L_KEYS_BEGIN L_INDEX(16*19)
685 
686 #define MOVD mov
687 #define MM_0 r9d
688 #define MM_1 r12d
689 #ifdef __GNUC__
690 #define MM_2 r11d
691 #else
692 #define MM_2 r10d
693 #endif
694 #define MM(i) MM_##i
695 
696 #define MXOR(a,b,c) \
697  AS2( movzx esi, b)\
698  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
699 
700 #define MMOV(a,b,c) \
701  AS2( movzx esi, b)\
702  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
703 
704 #endif
705 
706 #define L_SUBKEYS L_INDEX(0)
707 #define L_SAVED_X L_SUBKEYS
708 #define L_KEY12 L_INDEX(16*12)
709 #define L_LASTROUND L_INDEX(16*13)
710 #define L_INBLOCKS L_INDEX(16*14)
711 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
712 
713 #define XOR(a,b,c) \
714  AS2( movzx esi, b)\
715  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
716 
717 #define MOV(a,b,c) \
718  AS2( movzx esi, b)\
719  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
720 
721 #ifdef CRYPTOPP_GENERATE_X64_MASM
722  ALIGN 8
723  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
724  rex_push_reg rsi
725  push_reg rdi
726  push_reg rbx
727  push_reg r12
728  .endprolog
729  mov L_REG, rcx
730  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
731  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
732 #elif defined(__GNUC__)
733  __asm__ __volatile__
734  (
735  INTEL_NOPREFIX
736  #if CRYPTOPP_BOOL_X64
737  AS2( mov L_REG, rcx)
738  #endif
739  AS_PUSH_IF86(bx)
740  AS_PUSH_IF86(bp)
741  AS2( mov AS_REG_7, WORD_REG(si))
742 #else
743  AS_PUSH_IF86(si)
744  AS_PUSH_IF86(di)
745  AS_PUSH_IF86(bx)
746  AS_PUSH_IF86(bp)
747  AS2( lea AS_REG_7, [Te])
748  AS2( mov edi, [g_cacheLineSize])
749 #endif
750 
751 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
752  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
753  AS2( lea esp, [ecx-768])
754 #endif
755 
756  // copy subkeys to stack
757  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
758  AS2( mov WORD_REG(ax), 16)
759  AS2( and WORD_REG(ax), WORD_REG(si))
760  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
761  AS2( movdqa [L_KEY12], xmm3)
762  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
763  AS2( sub WORD_REG(ax), WORD_REG(si))
764  ASL(0)
765  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
766  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
767  AS2( add WORD_REG(si), 16)
768  AS2( cmp WORD_REG(si), 16*12)
769  ATT_NOPREFIX
770  ASJ( jl, 0, b)
771  INTEL_NOPREFIX
772 
773  // read subkeys 0, 1 and last
774  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
775  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
776  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
777  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
778  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
779  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
780 
781  // load table into cache
782  AS2( xor WORD_REG(ax), WORD_REG(ax))
783  ASL(9)
784  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
785  AS2( add WORD_REG(ax), WORD_REG(di))
786  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
787  AS2( add WORD_REG(ax), WORD_REG(di))
788  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
789  AS2( add WORD_REG(ax), WORD_REG(di))
790  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
791  AS2( add WORD_REG(ax), WORD_REG(di))
792  AS2( cmp WORD_REG(ax), 2048)
793  ATT_NOPREFIX
794  ASJ( jl, 9, b)
795  INTEL_NOPREFIX
796  AS1( lfence)
797 
798  AS2( test DWORD PTR [L_LENGTH], 1)
799  ATT_NOPREFIX
800  ASJ( jz, 8, f)
801  INTEL_NOPREFIX
802 
803  // counter mode one-time setup
804  AS2( mov WORD_REG(si), [L_INBLOCKS])
805  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
806  AS2( pxor xmm2, xmm1)
807  AS2( psrldq xmm1, 14)
808  AS2( movd eax, xmm1)
809  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
810  AS2( MOVD MM(2), eax)
811 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
812  AS2( mov eax, 1)
813  AS2( movd mm3, eax)
814 #endif
815 
816  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
817  AS2( movd eax, xmm2)
818  AS2( psrldq xmm2, 4)
819  AS2( movd edi, xmm2)
820  AS2( psrldq xmm2, 4)
821  MXOR( 1, al, 0) // 0
822  XOR( edx, ah, 1) // 1
823  AS2( shr eax, 16)
824  XOR( ecx, al, 2) // 2
825  XOR( ebx, ah, 3) // 3
826  AS2( mov eax, edi)
827  AS2( movd edi, xmm2)
828  AS2( psrldq xmm2, 4)
829  XOR( ebx, al, 0) // 4
830  MXOR( 1, ah, 1) // 5
831  AS2( shr eax, 16)
832  XOR( edx, al, 2) // 6
833  XOR( ecx, ah, 3) // 7
834  AS2( mov eax, edi)
835  AS2( movd edi, xmm2)
836  XOR( ecx, al, 0) // 8
837  XOR( ebx, ah, 1) // 9
838  AS2( shr eax, 16)
839  MXOR( 1, al, 2) // 10
840  XOR( edx, ah, 3) // 11
841  AS2( mov eax, edi)
842  XOR( edx, al, 0) // 12
843  XOR( ecx, ah, 1) // 13
844  AS2( shr eax, 16)
845  XOR( ebx, al, 2) // 14
846  AS2( psrldq xmm2, 3)
847 
848  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
849  AS2( mov eax, [L_KEY12+0*4])
850  AS2( mov edi, [L_KEY12+2*4])
851  AS2( MOVD MM(0), [L_KEY12+3*4])
852  MXOR( 0, cl, 3) /* 11 */
853  XOR( edi, bl, 3) /* 7 */
854  MXOR( 0, bh, 2) /* 6 */
855  AS2( shr ebx, 16) /* 4,5 */
856  XOR( eax, bl, 1) /* 5 */
857  MOV( ebx, bh, 0) /* 4 */
858  AS2( xor ebx, [L_KEY12+1*4])
859  XOR( eax, ch, 2) /* 10 */
860  AS2( shr ecx, 16) /* 8,9 */
861  XOR( eax, dl, 3) /* 15 */
862  XOR( ebx, dh, 2) /* 14 */
863  AS2( shr edx, 16) /* 12,13 */
864  XOR( edi, ch, 0) /* 8 */
865  XOR( ebx, cl, 1) /* 9 */
866  XOR( edi, dl, 1) /* 13 */
867  MXOR( 0, dh, 0) /* 12 */
868 
869  AS2( movd ecx, xmm2)
870  AS2( MOVD edx, MM(1))
871  AS2( MOVD [L_SAVED_X+3*4], MM(0))
872  AS2( mov [L_SAVED_X+0*4], eax)
873  AS2( mov [L_SAVED_X+1*4], ebx)
874  AS2( mov [L_SAVED_X+2*4], edi)
875  ATT_NOPREFIX
876  ASJ( jmp, 5, f)
877  INTEL_NOPREFIX
878  ASL(3)
879  // non-counter mode per-block setup
880  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
881  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
882  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
883  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
884  ASL(8)
885  AS2( mov WORD_REG(ax), [L_INBLOCKS])
886  AS2( movdqu xmm2, [WORD_REG(ax)])
887  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
888  AS2( movdqu xmm5, [WORD_REG(si)])
889  AS2( pxor xmm2, xmm1)
890  AS2( pxor xmm2, xmm5)
891 
892  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
893  AS2( movd eax, xmm2)
894  AS2( psrldq xmm2, 4)
895  AS2( movd edi, xmm2)
896  AS2( psrldq xmm2, 4)
897  MXOR( 1, al, 0) // 0
898  XOR( edx, ah, 1) // 1
899  AS2( shr eax, 16)
900  XOR( ecx, al, 2) // 2
901  XOR( ebx, ah, 3) // 3
902  AS2( mov eax, edi)
903  AS2( movd edi, xmm2)
904  AS2( psrldq xmm2, 4)
905  XOR( ebx, al, 0) // 4
906  MXOR( 1, ah, 1) // 5
907  AS2( shr eax, 16)
908  XOR( edx, al, 2) // 6
909  XOR( ecx, ah, 3) // 7
910  AS2( mov eax, edi)
911  AS2( movd edi, xmm2)
912  XOR( ecx, al, 0) // 8
913  XOR( ebx, ah, 1) // 9
914  AS2( shr eax, 16)
915  MXOR( 1, al, 2) // 10
916  XOR( edx, ah, 3) // 11
917  AS2( mov eax, edi)
918  XOR( edx, al, 0) // 12
919  XOR( ecx, ah, 1) // 13
920  AS2( shr eax, 16)
921  XOR( ebx, al, 2) // 14
922  MXOR( 1, ah, 3) // 15
923  AS2( MOVD eax, MM(1))
924 
925  AS2( add L_REG, [L_KEYS_BEGIN])
926  AS2( add L_REG, 4*16)
927  ATT_NOPREFIX
928  ASJ( jmp, 2, f)
929  INTEL_NOPREFIX
930  ASL(1)
931  // counter-mode per-block setup
932  AS2( MOVD ecx, MM(2))
933  AS2( MOVD edx, MM(1))
934  AS2( mov eax, [L_SAVED_X+0*4])
935  AS2( mov ebx, [L_SAVED_X+1*4])
936  AS2( xor cl, ch)
937  AS2( and WORD_REG(cx), 255)
938  ASL(5)
939 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
940  AS2( paddb MM(2), mm3)
941 #else
942  AS2( add MM(2), 1)
943 #endif
944  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
945  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
946  XOR( ebx, dl, 3)
947  MOV( ecx, dh, 2)
948  AS2( shr edx, 16)
949  AS2( xor ecx, [L_SAVED_X+2*4])
950  XOR( eax, dh, 0)
951  MOV( edx, dl, 1)
952  AS2( xor edx, [L_SAVED_X+3*4])
953 
954  AS2( add L_REG, [L_KEYS_BEGIN])
955  AS2( add L_REG, 3*16)
956  ATT_NOPREFIX
957  ASJ( jmp, 4, f)
958  INTEL_NOPREFIX
959 
960 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
961 // out: eax, ebx, edi, mm0
962 #define ROUND() \
963  MXOR( 0, cl, 3) /* 11 */\
964  AS2( mov cl, al) /* 8,9,10,3 */\
965  XOR( edi, ah, 2) /* 2 */\
966  AS2( shr eax, 16) /* 0,1 */\
967  XOR( edi, bl, 3) /* 7 */\
968  MXOR( 0, bh, 2) /* 6 */\
969  AS2( shr ebx, 16) /* 4,5 */\
970  MXOR( 0, al, 1) /* 1 */\
971  MOV( eax, ah, 0) /* 0 */\
972  XOR( eax, bl, 1) /* 5 */\
973  MOV( ebx, bh, 0) /* 4 */\
974  XOR( eax, ch, 2) /* 10 */\
975  XOR( ebx, cl, 3) /* 3 */\
976  AS2( shr ecx, 16) /* 8,9 */\
977  XOR( eax, dl, 3) /* 15 */\
978  XOR( ebx, dh, 2) /* 14 */\
979  AS2( shr edx, 16) /* 12,13 */\
980  XOR( edi, ch, 0) /* 8 */\
981  XOR( ebx, cl, 1) /* 9 */\
982  XOR( edi, dl, 1) /* 13 */\
983  MXOR( 0, dh, 0) /* 12 */\
984 
985  ASL(2) // 2-round loop
986  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
987  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
988  ROUND()
989  AS2( mov ecx, edi)
990  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
991  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
992  AS2( MOVD edx, MM(0))
993 
994  ASL(4)
995  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
996  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
997  ROUND()
998  AS2( mov ecx, edi)
999  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
1000  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
1001  AS2( MOVD edx, MM(0))
1002 
1003  AS2( add L_REG, 32)
1004  AS2( test L_REG, 255)
1005  ATT_NOPREFIX
1006  ASJ( jnz, 2, b)
1007  INTEL_NOPREFIX
1008  AS2( sub L_REG, 16*16)
1009 
1010 #define LAST(a, b, c) \
1011  AS2( movzx esi, a )\
1012  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
1013  AS2( movzx esi, b )\
1014  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
1015  AS2( mov WORD PTR [L_LASTROUND+c], di )\
1016 
1017  // last round
1018  LAST(ch, dl, 2)
1019  LAST(dh, al, 6)
1020  AS2( shr edx, 16)
1021  LAST(ah, bl, 10)
1022  AS2( shr eax, 16)
1023  LAST(bh, cl, 14)
1024  AS2( shr ebx, 16)
1025  LAST(dh, al, 12)
1026  AS2( shr ecx, 16)
1027  LAST(ah, bl, 0)
1028  LAST(bh, cl, 4)
1029  LAST(ch, dl, 8)
1030 
1031  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
1032  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
1033 
1034  AS2( mov WORD_REG(cx), [L_LENGTH])
1035  AS2( sub WORD_REG(cx), 16)
1036 
1037  AS2( movdqu xmm2, [WORD_REG(ax)])
1038  AS2( pxor xmm2, xmm4)
1039 
1040 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
1041  AS2( movdqa xmm0, [L_INCREMENTS])
1042  AS2( paddd xmm0, [L_INBLOCKS])
1043  AS2( movdqa [L_INBLOCKS], xmm0)
1044 #else
1045  AS2( movdqa xmm0, [L_INCREMENTS+16])
1046  AS2( paddq xmm0, [L_INBLOCKS+16])
1047  AS2( movdqa [L_INBLOCKS+16], xmm0)
1048 #endif
1049 
1050  AS2( pxor xmm2, [L_LASTROUND])
1051  AS2( movdqu [WORD_REG(bx)], xmm2)
1052 
1053  ATT_NOPREFIX
1054  ASJ( jle, 7, f)
1055  INTEL_NOPREFIX
1056  AS2( mov [L_LENGTH], WORD_REG(cx))
1057  AS2( test WORD_REG(cx), 1)
1058  ATT_NOPREFIX
1059  ASJ( jnz, 1, b)
1060  INTEL_NOPREFIX
1061 #if CRYPTOPP_BOOL_X64
1062  AS2( movdqa xmm0, [L_INCREMENTS])
1063  AS2( paddq xmm0, [L_INBLOCKS])
1064  AS2( movdqa [L_INBLOCKS], xmm0)
1065 #endif
1066  ATT_NOPREFIX
1067  ASJ( jmp, 3, b)
1068  INTEL_NOPREFIX
1069 
1070  ASL(7)
1071  // erase keys on stack
1072  AS2( xorps xmm0, xmm0)
1073  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
1074  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
1075  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
1076  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
1077  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1078  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1079  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1080  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1081  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1082  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1083  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1084  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1085  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1086  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1087  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1088 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
1089  AS2( mov esp, [L_SP])
1090  AS1( emms)
1091 #endif
1092  AS_POP_IF86(bp)
1093  AS_POP_IF86(bx)
1094 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1095  AS_POP_IF86(di)
1096  AS_POP_IF86(si)
1097  AS1(ret)
1098 #endif
1099 #ifdef CRYPTOPP_GENERATE_X64_MASM
1100  pop r12
1101  pop rbx
1102  pop rdi
1103  pop rsi
1104  ret
1105  Rijndael_Enc_AdvancedProcessBlocks ENDP
1106 #endif
1107 #ifdef __GNUC__
1108  ATT_PREFIX
1109  :
1110  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1111  : "memory", "cc", "%eax"
1112  #if CRYPTOPP_BOOL_X64
1113  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1114  #endif
1115  );
1116 #endif
1117 }
1118 
1119 #endif
1120 
1121 #ifndef CRYPTOPP_GENERATE_X64_MASM
1122 
1123 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1124 extern "C" {
1125 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1126 }
1127 #endif
1128 
1129 #if CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1130 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1131 {
1132 #if CRYPTOPP_AESNI_AVAILABLE
1133  if (HasAESNI())
1134  return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1135 #endif
1136 #if CRYPTOPP_ARM_AES_AVAILABLE
1137  if (HasAES())
1138  return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1139 #endif
1140 #if CRYPTOPP_POWER8_AES_AVAILABLE
1141  if (HasAES())
1142  return Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1143 #endif
1144 
1145 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1146  if (HasSSE2())
1147  {
1148  if (length < BLOCKSIZE)
1149  return length;
1150 
1151  static const byte *zeros = (const byte*)(Te+256);
1152  m_aliasBlock.SetMark(m_aliasBlock.size());
1153  byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1154 
1155  // round up to nearest 256 byte boundary
1156  space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1157  while (AliasedWithTable(space, space + sizeof(Locals)))
1158  {
1159  space += 256;
1160  CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1161  }
1162 
1163  size_t increment = BLOCKSIZE;
1164  if (flags & BT_ReverseDirection)
1165  {
1166  CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1167  inBlocks += length - BLOCKSIZE;
1168  xorBlocks += length - BLOCKSIZE;
1169  outBlocks += length - BLOCKSIZE;
1170  increment = 0-increment;
1171  }
1172 
1173  Locals &locals = *(Locals *)(void *)space;
1174 
1175  locals.inBlocks = inBlocks;
1176  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1177  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1178  locals.outBlocks = outBlocks;
1179 
1180  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1181  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1182  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1183  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1184 
1185  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1186  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1187  locals.keysBegin = (12-keysToCopy)*16;
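 // lengthAndCounterFlag is the byte count rounded down to a whole block, minus
 // one when the input block is a counter; the multiple of 16 keeps the low bit
 // free, so the assembly's "test [L_LENGTH], 1" recovers the counter-mode flag.
 // keysBegin tells the assembly where to start copying round keys into its
 // 12-slot stack area.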
1188 
1189  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1190 
1191  return length % BLOCKSIZE;
1192  }
1193 #endif
1194 
1195  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1196 }
1197 
1198 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1199 {
1200 #if CRYPTOPP_AESNI_AVAILABLE
1201  if (HasAESNI())
1202  return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1203 #endif
1204 #if CRYPTOPP_ARM_AES_AVAILABLE
1205  if (HasAES())
1206  return Rijndael_Dec_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1207 #endif
1208 #if CRYPTOPP_POWER8_AES_AVAILABLE
1209  if (HasAES())
1210  return Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1211 #endif
1212 
1213  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1214 }
1215 #endif // CRYPTOPP_RIJNDAEL_ADVANCED_PROCESS_BLOCKS
1216 
1217 NAMESPACE_END
1218 
1219 #endif
1220 #endif
Utility functions for the Crypto++ library.
bool HasAES()
Determine if an ARM processor has AES available.
Definition: cpu.h:386
Library configuration file.
int GetCacheLineSize()
Provides the cache line size.
Definition: cpu.h:298
Access a block of memory.
Definition: misc.h:2397
Rijndael block cipher.
Definition: rijndael.h:39
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianness.
Definition: misc.h:1974
virtual size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
Encrypt and xor multiple blocks using additional flags.
Definition: cryptlib.cpp:145
byte order is big-endian
Definition: cryptlib.h:145
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:60
Classes for Rijndael encryption algorithm.
Functions for CPU features and intrinsics.
bool HasAESNI()
Determines AES-NI availability.
Definition: cpu.h:162
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:114
bool HasSSE41()
Determines SSE4.1 availability.
Definition: cpu.h:140
Crypto++ library namespace.
Interface for retrieving values given their names.
Definition: cryptlib.h:291