Crypto++ 5.6.4
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "stdcpp.h" // alloca
75 #include "misc.h"
76 #include "cpu.h"
77 
78 NAMESPACE_BEGIN(CryptoPP)
79 
80 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
81 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
82 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
83 #endif
84 
85 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
86 #if (__SUNPRO_CC >= 0x5130)
87 # define MAYBE_CONST
88 #else
89 # define MAYBE_CONST const
90 #endif
91 
92 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
93 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
94 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
95 using namespace rdtable;
96 # else
97 static word64 Te[256];
98 # endif
99 static word64 Td[256];
100 #else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
101 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
102 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
103 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
104 # endif
105 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
106 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
107 #endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
108 
109 static volatile bool s_TeFilled = false, s_TdFilled = false;
110 
111 // ************************* Portable Code ************************************
112 
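// QUARTER_ROUND consumes one 32-bit state word t a byte at a time (low byte
// first) and XORs a table lookup for each byte into a different output word;
// the L argument (TL_F for the first round, TL_M for the main rounds) selects
// how the table column is read. The _LE/_LD variants implement the last round,
// writing single S-box bytes into tempBlock.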
113 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
114  a ^= L(T, 3, byte(t)); t >>= 8;\
115  b ^= L(T, 2, byte(t)); t >>= 8;\
116  c ^= L(T, 1, byte(t)); t >>= 8;\
117  d ^= L(T, 0, t);
118 
119 #define QUARTER_ROUND_LE(t, a, b, c, d) \
120  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
121  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
122  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
123  tempBlock[d] = ((byte *)(Te+t))[1];
124 
125 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
126  #define QUARTER_ROUND_LD(t, a, b, c, d) \
127  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
128  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
129  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
130  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
131 #else
132  #define QUARTER_ROUND_LD(t, a, b, c, d) \
133  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
134  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
135  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
136  tempBlock[d] = Sd[t];
137 #endif
138 
139 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
140 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
141 
142 #ifdef IS_LITTLE_ENDIAN
143  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
144  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
145  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
146  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
147  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
148  #else
149  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
150  #define TL_M(T, i, x) T[i*256 + x]
151  #endif
152 #else
153  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
154  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
155  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
156  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
157  #define TL_M TL_F
158  #else
159  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
160  #define TL_M(T, i, x) T[i*256 + x]
161  #endif
162 #endif
163 
164 
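// GF(2^8) helpers for the MixColumns coefficients: f2, f4 and f8 multiply a
// byte by 2, 4 and 8 modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b), and
// f3, f9, fb, fd, fe combine them into multiplication by 3, 9, 0x0b, 0x0d and
// 0x0e, as needed to build the Te and Td tables below.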
165 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
166 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
167 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
168 
169 #define f3(x) (f2(x) ^ x)
170 #define f9(x) (f8(x) ^ x)
171 #define fb(x) (f8(x) ^ f2(x) ^ x)
172 #define fd(x) (f8(x) ^ f4(x) ^ x)
173 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
174 
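// Each Te entry holds the S-box output S = Se[i] multiplied by the MixColumns
// coefficients 2, 1, 1 and 3. The compressed (unaligned-access) build stores
// that column twice in a word64 so rotated views can be read at byte offsets;
// the portable build keeps four byte-rotated word32 tables. Td is built the
// same way from Sd with the InvMixColumns coefficients 0x0e, 0x09, 0x0d and
// 0x0b; the compressed Td also keeps the plain Sd byte in its lowest position
// for use in the last round.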
175 void Rijndael::Base::FillEncTable()
176 {
177  for (int i=0; i<256; i++)
178  {
179  byte x = Se[i];
180 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
181  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
182  Te[i] = word64(y | f3(x))<<32 | y;
183 #else
184  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
185  for (int j=0; j<4; j++)
186  {
187  Te[i+j*256] = y;
188  y = rotrFixed(y, 8);
189  }
190 #endif
191  }
192 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
193  Te[256] = Te[257] = 0;
194 #endif
195  s_TeFilled = true;
196 }
197 
198 void Rijndael::Base::FillDecTable()
199 {
200  for (int i=0; i<256; i++)
201  {
202  byte x = Sd[i];
203 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
204  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
205  Td[i] = word64(y | fb(x))<<32 | y | x;
206 #else
207  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
208  for (int j=0; j<4; j++)
209  {
210  Td[i+j*256] = y;
211  y = rotrFixed(y, 8);
212  }
213 #endif
214  }
215  s_TdFilled = true;
216 }
217 
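// Standard AES key expansion: Nk = keylen/4 key words give Nr = Nk+6 rounds
// and 4*(Nr+1) round-key words. With AES-NI and SSE4.1 the schedule is built
// using _mm_aeskeygenassist_si128. For decryption the inner round keys are
// run through InverseMixColumn (or _mm_aesimc_si128) and the key order is
// reversed, so decryption can reuse the encryption round structure.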
218 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
219 {
220  AssertValidKeyLength(keylen);
221 
222  m_rounds = keylen/4 + 6;
223  m_key.New(4*(m_rounds+1));
224 
225  word32 *rk = m_key;
226 
227 #if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
228  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
229  if (HasAESNI() && HasSSE4())
230  {
231  static const word32 rcLE[] = {
232  0x01, 0x02, 0x04, 0x08,
233  0x10, 0x20, 0x40, 0x80,
234  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
235  };
236  const word32 *rc = rcLE;
237 
238  __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
239  memcpy(rk, userKey, keylen);
240 
241  while (true)
242  {
243  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
244  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
245  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
246  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
247 
248  if (rk + keylen/4 + 4 == m_key.end())
249  break;
250 
251  if (keylen == 24)
252  {
253  rk[10] = rk[ 4] ^ rk[ 9];
254  rk[11] = rk[ 5] ^ rk[10];
255  temp = _mm_insert_epi32(temp, rk[11], 3);
256  }
257  else if (keylen == 32)
258  {
259  temp = _mm_insert_epi32(temp, rk[11], 3);
260  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
261  rk[13] = rk[ 5] ^ rk[12];
262  rk[14] = rk[ 6] ^ rk[13];
263  rk[15] = rk[ 7] ^ rk[14];
264  temp = _mm_insert_epi32(temp, rk[15], 3);
265  }
266  else
267  temp = _mm_insert_epi32(temp, rk[7], 3);
268 
269  rk += keylen/4;
270  }
271 
272  if (!IsForwardTransformation())
273  {
274  rk = m_key;
275  unsigned int i, j;
276 
277 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
278  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
279  // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
280  vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
281 #else
282  std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
283 #endif
284  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
285  {
286  temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
287  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
288  *(__m128i *)(void *)(rk+j) = temp;
289  }
290 
291  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
292  }
293 
294  return;
295  }
296 #endif
297 
298  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
299  const word32 *rc = rcon;
300  word32 temp;
301 
302  while (true)
303  {
304  temp = rk[keylen/4-1];
305  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
306  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
307  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
308  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
309  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
310 
311  if (rk + keylen/4 + 4 == m_key.end())
312  break;
313 
314  if (keylen == 24)
315  {
316  rk[10] = rk[ 4] ^ rk[ 9];
317  rk[11] = rk[ 5] ^ rk[10];
318  }
319  else if (keylen == 32)
320  {
321  temp = rk[11];
322  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
323  rk[13] = rk[ 5] ^ rk[12];
324  rk[14] = rk[ 6] ^ rk[13];
325  rk[15] = rk[ 7] ^ rk[14];
326  }
327  rk += keylen/4;
328  }
329 
330  rk = m_key;
331 
332  if (IsForwardTransformation())
333  {
334  if (!s_TeFilled)
335  FillEncTable();
336 
337  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
338  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
339  }
340  else
341  {
342  if (!s_TdFilled)
343  FillDecTable();
344 
345  unsigned int i, j;
346 
347 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
348 
349  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
350  {
351  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
352  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
353  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
354  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
355  }
356 
357  rk[i+0] = InverseMixColumn(rk[i+0]);
358  rk[i+1] = InverseMixColumn(rk[i+1]);
359  rk[i+2] = InverseMixColumn(rk[i+2]);
360  rk[i+3] = InverseMixColumn(rk[i+3]);
361 
362  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
363  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
364  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
365  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
366  }
367 
368 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
369  if (HasAESNI())
370  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
371 #endif
372 }
373 
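// Single-block software path. When SSE2 assembly or AES-NI is available the
// work is routed through AdvancedProcessBlocks instead. Otherwise: add the
// first round key, preload Te as a timing-attack countermeasure, do the first
// round with the TL_F lookups, the Nr-2 middle rounds two per loop iteration,
// and the last round by pulling S-box bytes straight out of the Te entries.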
374 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
375 {
376 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
377 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
378  if (HasSSE2())
379 #else
380  if (HasAESNI())
381 #endif
382  {
383  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
384  }
385 #endif
386 
387  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
388 
389  word32 s0, s1, s2, s3, t0, t1, t2, t3;
390  Block::Get(inBlock)(s0)(s1)(s2)(s3);
391 
392  const word32 *rk = m_key;
393  s0 ^= rk[0];
394  s1 ^= rk[1];
395  s2 ^= rk[2];
396  s3 ^= rk[3];
397  t0 = rk[4];
398  t1 = rk[5];
399  t2 = rk[6];
400  t3 = rk[7];
401  rk += 8;
402 
403  // timing attack countermeasure. see comments at top for more details.
404  // also see http://github.com/weidai11/cryptopp/issues/146
405  const int cacheLineSize = GetCacheLineSize();
406  unsigned int i;
407  volatile word32 _u = 0;
408  word32 u = _u;
409 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
410  for (i=0; i<2048; i+=cacheLineSize)
411 #else
412  for (i=0; i<1024; i+=cacheLineSize)
413 #endif
414  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
415  u &= Te[255];
416  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
417 
418  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
419  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
420  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
421  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
422 
423  // Nr - 2 full rounds:
424  unsigned int r = m_rounds/2 - 1;
425  do
426  {
427  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
428 
429  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
430  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
431  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
432  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
433 
434  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
435 
436  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
437  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
438  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
439  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
440 
441  rk += 8;
442  } while (--r);
443 
444  word32 tbw[4];
445  byte *const tempBlock = (byte *)tbw;
446 
447  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
448  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
449  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
450  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
451 
452  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
453 }
454 
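// Decryption mirrors the encryption path using Td and the inverse S-box Sd.
// When compressed tables are not in use, Sd is preloaded separately just
// before the last round (see the block ahead of QUARTER_ROUND_LD below).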
455 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
456 {
457 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
458  if (HasAESNI())
459  {
460  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
461  return;
462  }
463 #endif
464 
465  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
466 
467  word32 s0, s1, s2, s3, t0, t1, t2, t3;
468  Block::Get(inBlock)(s0)(s1)(s2)(s3);
469 
470  const word32 *rk = m_key;
471  s0 ^= rk[0];
472  s1 ^= rk[1];
473  s2 ^= rk[2];
474  s3 ^= rk[3];
475  t0 = rk[4];
476  t1 = rk[5];
477  t2 = rk[6];
478  t3 = rk[7];
479  rk += 8;
480 
481  // timing attack countermeasure. see comments at top for more details.
482  // also see http://github.com/weidai11/cryptopp/issues/146
483  const int cacheLineSize = GetCacheLineSize();
484  unsigned int i;
485  volatile word32 _u = 0;
486  word32 u = _u;
487 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
488  for (i=0; i<2048; i+=cacheLineSize)
489 #else
490  for (i=0; i<1024; i+=cacheLineSize)
491 #endif
492  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
493  u &= Td[255];
494  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
495 
496  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
497  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
498  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
499  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
500 
501  // Nr - 2 full rounds:
502  unsigned int r = m_rounds/2 - 1;
503  do
504  {
505  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
506 
507  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
508  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
509  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
510  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
511 
512  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
513 
514  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
515  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
516  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
517  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
518 
519  rk += 8;
520  } while (--r);
521 
522 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
523  // timing attack countermeasure. see comments at top for more details
524  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
525  // QUARTER_ROUND_LD will use Td, which is already preloaded.
526  u = _u;
527  for (i=0; i<256; i+=cacheLineSize)
528  u &= *(const word32 *)(const void *)(Sd+i);
529  u &= *(const word32 *)(const void *)(Sd+252);
530  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
531 #endif
532 
533  word32 tbw[4];
534  byte *const tempBlock = (byte *)tbw;
535 
536  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
537  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
538  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
539  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
540 
541  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
542 }
543 
544 // ************************* Assembly Code ************************************
545 
546 #if CRYPTOPP_MSC_VERSION
547 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
548 #endif
549 
550 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
551 
552 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
553 
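// Hand-written SSE2 routine shared by the x86, x32 and x64 builds. The AS1/AS2
// macros expand to MSVC or GCC inline assembly, or to MASM text when the file
// is preprocessed with CRYPTOPP_GENERATE_X64_MASM. 'locals' points to the
// aligned Locals workspace prepared in AdvancedProcessBlocks and 'k' to the
// expanded key; round keys are copied into the workspace and the Te table is
// preloaded before any data-dependent loads.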
554 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
555 {
556  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
557 
558 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
559 
560 #define L_REG esp
561 #define L_INDEX(i) (L_REG+768+i)
562 #define L_INXORBLOCKS L_INBLOCKS+4
563 #define L_OUTXORBLOCKS L_INBLOCKS+8
564 #define L_OUTBLOCKS L_INBLOCKS+12
565 #define L_INCREMENTS L_INDEX(16*15)
566 #define L_SP L_INDEX(16*16)
567 #define L_LENGTH L_INDEX(16*16+4)
568 #define L_KEYS_BEGIN L_INDEX(16*16+8)
569 
570 #define MOVD movd
571 #define MM(i) mm##i
572 
573 #define MXOR(a,b,c) \
574  AS2( movzx esi, b)\
575  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
576  AS2( pxor MM(a), mm7)\
577 
578 #define MMOV(a,b,c) \
579  AS2( movzx esi, b)\
580  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
581 
582 #else
583 
584 #define L_REG r8
585 #define L_INDEX(i) (L_REG+i)
586 #define L_INXORBLOCKS L_INBLOCKS+8
587 #define L_OUTXORBLOCKS L_INBLOCKS+16
588 #define L_OUTBLOCKS L_INBLOCKS+24
589 #define L_INCREMENTS L_INDEX(16*16)
590 #define L_LENGTH L_INDEX(16*18+8)
591 #define L_KEYS_BEGIN L_INDEX(16*19)
592 
593 #define MOVD mov
594 #define MM_0 r9d
595 #define MM_1 r12d
596 #ifdef __GNUC__
597 #define MM_2 r11d
598 #else
599 #define MM_2 r10d
600 #endif
601 #define MM(i) MM_##i
602 
603 #define MXOR(a,b,c) \
604  AS2( movzx esi, b)\
605  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
606 
607 #define MMOV(a,b,c) \
608  AS2( movzx esi, b)\
609  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
610 
611 #endif
612 
613 #define L_SUBKEYS L_INDEX(0)
614 #define L_SAVED_X L_SUBKEYS
615 #define L_KEY12 L_INDEX(16*12)
616 #define L_LASTROUND L_INDEX(16*13)
617 #define L_INBLOCKS L_INDEX(16*14)
618 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
619 
620 #define XOR(a,b,c) \
621  AS2( movzx esi, b)\
622  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
623 
624 #define MOV(a,b,c) \
625  AS2( movzx esi, b)\
626  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
627 
628 #ifdef CRYPTOPP_GENERATE_X64_MASM
629  ALIGN 8
630  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
631  rex_push_reg rsi
632  push_reg rdi
633  push_reg rbx
634  push_reg r12
635  .endprolog
636  mov L_REG, rcx
637  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
638  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
639 #elif defined(__GNUC__)
640  __asm__ __volatile__
641  (
642  INTEL_NOPREFIX
643  #if CRYPTOPP_BOOL_X64
644  AS2( mov L_REG, rcx)
645  #endif
646  AS_PUSH_IF86(bx)
647  AS_PUSH_IF86(bp)
648  AS2( mov AS_REG_7, WORD_REG(si))
649 #else
650  AS_PUSH_IF86(si)
651  AS_PUSH_IF86(di)
652  AS_PUSH_IF86(bx)
653  AS_PUSH_IF86(bp)
654  AS2( lea AS_REG_7, [Te])
655  AS2( mov edi, [g_cacheLineSize])
656 #endif
657 
658 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
659  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
660  AS2( lea esp, [ecx-768])
661 #endif
662 
663  // copy subkeys to stack
664  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
665  AS2( mov WORD_REG(ax), 16)
666  AS2( and WORD_REG(ax), WORD_REG(si))
667  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
668  AS2( movdqa [L_KEY12], xmm3)
669  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
670  AS2( sub WORD_REG(ax), WORD_REG(si))
671  ASL(0)
672  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
673  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
674  AS2( add WORD_REG(si), 16)
675  AS2( cmp WORD_REG(si), 16*12)
676  ATT_NOPREFIX
677  ASJ( jl, 0, b)
678  INTEL_NOPREFIX
679 
680  // read subkeys 0, 1 and last
681  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
682  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
683  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
684  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
685  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
686  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
687 
688  // load table into cache
689  AS2( xor WORD_REG(ax), WORD_REG(ax))
690  ASL(9)
691  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
692  AS2( add WORD_REG(ax), WORD_REG(di))
693  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
694  AS2( add WORD_REG(ax), WORD_REG(di))
695  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
696  AS2( add WORD_REG(ax), WORD_REG(di))
697  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
698  AS2( add WORD_REG(ax), WORD_REG(di))
699  AS2( cmp WORD_REG(ax), 2048)
700  ATT_NOPREFIX
701  ASJ( jl, 9, b)
702  INTEL_NOPREFIX
703  AS1( lfence)
704 
705  AS2( test DWORD PTR [L_LENGTH], 1)
706  ATT_NOPREFIX
707  ASJ( jz, 8, f)
708  INTEL_NOPREFIX
709 
710  // counter mode one-time setup
711  AS2( mov WORD_REG(si), [L_INBLOCKS])
712  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
713  AS2( pxor xmm2, xmm1)
714  AS2( psrldq xmm1, 14)
715  AS2( movd eax, xmm1)
716  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
717  AS2( MOVD MM(2), eax)
718 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
719  AS2( mov eax, 1)
720  AS2( movd mm3, eax)
721 #endif
722 
723  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
724  AS2( movd eax, xmm2)
725  AS2( psrldq xmm2, 4)
726  AS2( movd edi, xmm2)
727  AS2( psrldq xmm2, 4)
728  MXOR( 1, al, 0) // 0
729  XOR( edx, ah, 1) // 1
730  AS2( shr eax, 16)
731  XOR( ecx, al, 2) // 2
732  XOR( ebx, ah, 3) // 3
733  AS2( mov eax, edi)
734  AS2( movd edi, xmm2)
735  AS2( psrldq xmm2, 4)
736  XOR( ebx, al, 0) // 4
737  MXOR( 1, ah, 1) // 5
738  AS2( shr eax, 16)
739  XOR( edx, al, 2) // 6
740  XOR( ecx, ah, 3) // 7
741  AS2( mov eax, edi)
742  AS2( movd edi, xmm2)
743  XOR( ecx, al, 0) // 8
744  XOR( ebx, ah, 1) // 9
745  AS2( shr eax, 16)
746  MXOR( 1, al, 2) // 10
747  XOR( edx, ah, 3) // 11
748  AS2( mov eax, edi)
749  XOR( edx, al, 0) // 12
750  XOR( ecx, ah, 1) // 13
751  AS2( shr eax, 16)
752  XOR( ebx, al, 2) // 14
753  AS2( psrldq xmm2, 3)
754 
755  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
756  AS2( mov eax, [L_KEY12+0*4])
757  AS2( mov edi, [L_KEY12+2*4])
758  AS2( MOVD MM(0), [L_KEY12+3*4])
759  MXOR( 0, cl, 3) /* 11 */
760  XOR( edi, bl, 3) /* 7 */
761  MXOR( 0, bh, 2) /* 6 */
762  AS2( shr ebx, 16) /* 4,5 */
763  XOR( eax, bl, 1) /* 5 */
764  MOV( ebx, bh, 0) /* 4 */
765  AS2( xor ebx, [L_KEY12+1*4])
766  XOR( eax, ch, 2) /* 10 */
767  AS2( shr ecx, 16) /* 8,9 */
768  XOR( eax, dl, 3) /* 15 */
769  XOR( ebx, dh, 2) /* 14 */
770  AS2( shr edx, 16) /* 12,13 */
771  XOR( edi, ch, 0) /* 8 */
772  XOR( ebx, cl, 1) /* 9 */
773  XOR( edi, dl, 1) /* 13 */
774  MXOR( 0, dh, 0) /* 12 */
775 
776  AS2( movd ecx, xmm2)
777  AS2( MOVD edx, MM(1))
778  AS2( MOVD [L_SAVED_X+3*4], MM(0))
779  AS2( mov [L_SAVED_X+0*4], eax)
780  AS2( mov [L_SAVED_X+1*4], ebx)
781  AS2( mov [L_SAVED_X+2*4], edi)
782  ATT_NOPREFIX
783  ASJ( jmp, 5, f)
784  INTEL_NOPREFIX
785  ASL(3)
786  // non-counter mode per-block setup
787  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
788  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
789  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
790  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
791  ASL(8)
792  AS2( mov WORD_REG(ax), [L_INBLOCKS])
793  AS2( movdqu xmm2, [WORD_REG(ax)])
794  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
795  AS2( movdqu xmm5, [WORD_REG(si)])
796  AS2( pxor xmm2, xmm1)
797  AS2( pxor xmm2, xmm5)
798 
799  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
800  AS2( movd eax, xmm2)
801  AS2( psrldq xmm2, 4)
802  AS2( movd edi, xmm2)
803  AS2( psrldq xmm2, 4)
804  MXOR( 1, al, 0) // 0
805  XOR( edx, ah, 1) // 1
806  AS2( shr eax, 16)
807  XOR( ecx, al, 2) // 2
808  XOR( ebx, ah, 3) // 3
809  AS2( mov eax, edi)
810  AS2( movd edi, xmm2)
811  AS2( psrldq xmm2, 4)
812  XOR( ebx, al, 0) // 4
813  MXOR( 1, ah, 1) // 5
814  AS2( shr eax, 16)
815  XOR( edx, al, 2) // 6
816  XOR( ecx, ah, 3) // 7
817  AS2( mov eax, edi)
818  AS2( movd edi, xmm2)
819  XOR( ecx, al, 0) // 8
820  XOR( ebx, ah, 1) // 9
821  AS2( shr eax, 16)
822  MXOR( 1, al, 2) // 10
823  XOR( edx, ah, 3) // 11
824  AS2( mov eax, edi)
825  XOR( edx, al, 0) // 12
826  XOR( ecx, ah, 1) // 13
827  AS2( shr eax, 16)
828  XOR( ebx, al, 2) // 14
829  MXOR( 1, ah, 3) // 15
830  AS2( MOVD eax, MM(1))
831 
832  AS2( add L_REG, [L_KEYS_BEGIN])
833  AS2( add L_REG, 4*16)
834  ATT_NOPREFIX
835  ASJ( jmp, 2, f)
836  INTEL_NOPREFIX
837  ASL(1)
838  // counter-mode per-block setup
839  AS2( MOVD ecx, MM(2))
840  AS2( MOVD edx, MM(1))
841  AS2( mov eax, [L_SAVED_X+0*4])
842  AS2( mov ebx, [L_SAVED_X+1*4])
843  AS2( xor cl, ch)
844  AS2( and WORD_REG(cx), 255)
845  ASL(5)
846 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
847  AS2( paddb MM(2), mm3)
848 #else
849  AS2( add MM(2), 1)
850 #endif
851  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
852  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
853  XOR( ebx, dl, 3)
854  MOV( ecx, dh, 2)
855  AS2( shr edx, 16)
856  AS2( xor ecx, [L_SAVED_X+2*4])
857  XOR( eax, dh, 0)
858  MOV( edx, dl, 1)
859  AS2( xor edx, [L_SAVED_X+3*4])
860 
861  AS2( add L_REG, [L_KEYS_BEGIN])
862  AS2( add L_REG, 3*16)
863  ATT_NOPREFIX
864  ASJ( jmp, 4, f)
865  INTEL_NOPREFIX
866 
867 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
868 // out: eax, ebx, edi, mm0
869 #define ROUND() \
870  MXOR( 0, cl, 3) /* 11 */\
871  AS2( mov cl, al) /* 8,9,10,3 */\
872  XOR( edi, ah, 2) /* 2 */\
873  AS2( shr eax, 16) /* 0,1 */\
874  XOR( edi, bl, 3) /* 7 */\
875  MXOR( 0, bh, 2) /* 6 */\
876  AS2( shr ebx, 16) /* 4,5 */\
877  MXOR( 0, al, 1) /* 1 */\
878  MOV( eax, ah, 0) /* 0 */\
879  XOR( eax, bl, 1) /* 5 */\
880  MOV( ebx, bh, 0) /* 4 */\
881  XOR( eax, ch, 2) /* 10 */\
882  XOR( ebx, cl, 3) /* 3 */\
883  AS2( shr ecx, 16) /* 8,9 */\
884  XOR( eax, dl, 3) /* 15 */\
885  XOR( ebx, dh, 2) /* 14 */\
886  AS2( shr edx, 16) /* 12,13 */\
887  XOR( edi, ch, 0) /* 8 */\
888  XOR( ebx, cl, 1) /* 9 */\
889  XOR( edi, dl, 1) /* 13 */\
890  MXOR( 0, dh, 0) /* 12 */\
891 
892  ASL(2) // 2-round loop
893  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
894  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
895  ROUND()
896  AS2( mov ecx, edi)
897  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
898  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
899  AS2( MOVD edx, MM(0))
900 
901  ASL(4)
902  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
903  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
904  ROUND()
905  AS2( mov ecx, edi)
906  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
907  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
908  AS2( MOVD edx, MM(0))
909 
910  AS2( add L_REG, 32)
911  AS2( test L_REG, 255)
912  ATT_NOPREFIX
913  ASJ( jnz, 2, b)
914  INTEL_NOPREFIX
915  AS2( sub L_REG, 16*16)
916 
917 #define LAST(a, b, c) \
918  AS2( movzx esi, a )\
919  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
920  AS2( movzx esi, b )\
921  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
922  AS2( mov WORD PTR [L_LASTROUND+c], di )\
923 
924  // last round
925  LAST(ch, dl, 2)
926  LAST(dh, al, 6)
927  AS2( shr edx, 16)
928  LAST(ah, bl, 10)
929  AS2( shr eax, 16)
930  LAST(bh, cl, 14)
931  AS2( shr ebx, 16)
932  LAST(dh, al, 12)
933  AS2( shr ecx, 16)
934  LAST(ah, bl, 0)
935  LAST(bh, cl, 4)
936  LAST(ch, dl, 8)
937 
938  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
939  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
940 
941  AS2( mov WORD_REG(cx), [L_LENGTH])
942  AS2( sub WORD_REG(cx), 16)
943 
944  AS2( movdqu xmm2, [WORD_REG(ax)])
945  AS2( pxor xmm2, xmm4)
946 
947 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
948  AS2( movdqa xmm0, [L_INCREMENTS])
949  AS2( paddd xmm0, [L_INBLOCKS])
950  AS2( movdqa [L_INBLOCKS], xmm0)
951 #else
952  AS2( movdqa xmm0, [L_INCREMENTS+16])
953  AS2( paddq xmm0, [L_INBLOCKS+16])
954  AS2( movdqa [L_INBLOCKS+16], xmm0)
955 #endif
956 
957  AS2( pxor xmm2, [L_LASTROUND])
958  AS2( movdqu [WORD_REG(bx)], xmm2)
959 
960  ATT_NOPREFIX
961  ASJ( jle, 7, f)
962  INTEL_NOPREFIX
963  AS2( mov [L_LENGTH], WORD_REG(cx))
964  AS2( test WORD_REG(cx), 1)
965  ATT_NOPREFIX
966  ASJ( jnz, 1, b)
967  INTEL_NOPREFIX
968 #if CRYPTOPP_BOOL_X64
969  AS2( movdqa xmm0, [L_INCREMENTS])
970  AS2( paddq xmm0, [L_INBLOCKS])
971  AS2( movdqa [L_INBLOCKS], xmm0)
972 #endif
973  ATT_NOPREFIX
974  ASJ( jmp, 3, b)
975  INTEL_NOPREFIX
976 
977  ASL(7)
978  // erase keys on stack
979  AS2( xorps xmm0, xmm0)
980  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
981  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
982  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
983  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
984  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
985  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
986  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
987  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
988  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
989  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
990  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
991  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
992  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
993  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
994  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
995 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
996  AS2( mov esp, [L_SP])
997  AS1( emms)
998 #endif
999  AS_POP_IF86(bp)
1000  AS_POP_IF86(bx)
1001 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1002  AS_POP_IF86(di)
1003  AS_POP_IF86(si)
1004  AS1(ret)
1005 #endif
1006 #ifdef CRYPTOPP_GENERATE_X64_MASM
1007  pop r12
1008  pop rbx
1009  pop rdi
1010  pop rsi
1011  ret
1012  Rijndael_Enc_AdvancedProcessBlocks ENDP
1013 #endif
1014 #ifdef __GNUC__
1015  ATT_PREFIX
1016  :
1017  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1018  : "memory", "cc", "%eax"
1019  #if CRYPTOPP_BOOL_X64
1020  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1021  #endif
1022  );
1023 #endif
1024 }
1025 
1026 #endif
1027 
1028 #ifndef CRYPTOPP_GENERATE_X64_MASM
1029 
1030 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1031 extern "C" {
1032 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1033 }
1034 #endif
1035 
1036 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1037 
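// Returns true when [begin, end) overlaps the Te table modulo 4096. The caller
// below retries its stack allocation until this is false, so the workspace
// does not alias the preloaded table (see the timing-attack notes at the top
// of the file).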
1038 static inline bool AliasedWithTable(const byte *begin, const byte *end)
1039 {
1040  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
1041  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
1042  if (t1 > t0)
1043  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
1044  else
1045  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1046 }
1047 
1048 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1049 
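// AES-NI primitives: XOR in the whitening key, then one _mm_aesenc_si128 (or
// _mm_aesdec_si128) per inner round and a final _mm_aesenclast_si128 /
// _mm_aesdeclast_si128. The 4-block variants interleave independent blocks to
// hide instruction latency.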
1050 inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1051 {
1052  block = _mm_xor_si128(block, subkeys[0]);
1053  for (unsigned int i=1; i<rounds-1; i+=2)
1054  {
1055  block = _mm_aesenc_si128(block, subkeys[i]);
1056  block = _mm_aesenc_si128(block, subkeys[i+1]);
1057  }
1058  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1059  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1060 }
1061 
1062 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1063 {
1064  __m128i rk = subkeys[0];
1065  block0 = _mm_xor_si128(block0, rk);
1066  block1 = _mm_xor_si128(block1, rk);
1067  block2 = _mm_xor_si128(block2, rk);
1068  block3 = _mm_xor_si128(block3, rk);
1069  for (unsigned int i=1; i<rounds; i++)
1070  {
1071  rk = subkeys[i];
1072  block0 = _mm_aesenc_si128(block0, rk);
1073  block1 = _mm_aesenc_si128(block1, rk);
1074  block2 = _mm_aesenc_si128(block2, rk);
1075  block3 = _mm_aesenc_si128(block3, rk);
1076  }
1077  rk = subkeys[rounds];
1078  block0 = _mm_aesenclast_si128(block0, rk);
1079  block1 = _mm_aesenclast_si128(block1, rk);
1080  block2 = _mm_aesenclast_si128(block2, rk);
1081  block3 = _mm_aesenclast_si128(block3, rk);
1082 }
1083 
1084 inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1085 {
1086  block = _mm_xor_si128(block, subkeys[0]);
1087  for (unsigned int i=1; i<rounds-1; i+=2)
1088  {
1089  block = _mm_aesdec_si128(block, subkeys[i]);
1090  block = _mm_aesdec_si128(block, subkeys[i+1]);
1091  }
1092  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1093  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1094 }
1095 
1096 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1097 {
1098  __m128i rk = subkeys[0];
1099  block0 = _mm_xor_si128(block0, rk);
1100  block1 = _mm_xor_si128(block1, rk);
1101  block2 = _mm_xor_si128(block2, rk);
1102  block3 = _mm_xor_si128(block3, rk);
1103  for (unsigned int i=1; i<rounds; i++)
1104  {
1105  rk = subkeys[i];
1106  block0 = _mm_aesdec_si128(block0, rk);
1107  block1 = _mm_aesdec_si128(block1, rk);
1108  block2 = _mm_aesdec_si128(block2, rk);
1109  block3 = _mm_aesdec_si128(block3, rk);
1110  }
1111  rk = subkeys[rounds];
1112  block0 = _mm_aesdeclast_si128(block0, rk);
1113  block1 = _mm_aesdeclast_si128(block1, rk);
1114  block2 = _mm_aesdeclast_si128(block2, rk);
1115  block3 = _mm_aesdeclast_si128(block3, rk);
1116 }
1117 
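// Counter increment for the 4-block path: 1<<24 occupies byte 15 of the
// 128-bit value, so _mm_add_epi32 steps the low byte of a big-endian counter
// block, matching the byte-15 increment in the single-block loop below.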
1118 CRYPTOPP_ALIGN_DATA(16)
1119 static const word32 s_one[] = {0, 0, 0, 1<<24};
1120 
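// Generic AES-NI driver: handles BT_ReverseDirection, input/output XOR and
// in-place counter increments, and hands batches of four blocks to func4 when
// BT_AllowParallel is set, falling back to func1 for the remainder.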
1121 template <typename F1, typename F4>
1122 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1123 {
1124  size_t blockSize = 16;
1125  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1126  size_t xorIncrement = xorBlocks ? blockSize : 0;
1127  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1128 
1129  if (flags & BlockTransformation::BT_ReverseDirection)
1130  {
1131  CRYPTOPP_ASSERT(length % blockSize == 0);
1132  inBlocks += length - blockSize;
1133  xorBlocks += length - blockSize;
1134  outBlocks += length - blockSize;
1135  inIncrement = 0-inIncrement;
1136  xorIncrement = 0-xorIncrement;
1137  outIncrement = 0-outIncrement;
1138  }
1139 
1140  if (flags & BlockTransformation::BT_AllowParallel)
1141  {
1142  while (length >= 4*blockSize)
1143  {
1144  __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
1145  if (flags & BlockTransformation::BT_InBlockIsCounter)
1146  {
1147  const __m128i be1 = *(const __m128i *)(const void *)s_one;
1148  block1 = _mm_add_epi32(block0, be1);
1149  block2 = _mm_add_epi32(block1, be1);
1150  block3 = _mm_add_epi32(block2, be1);
1151  _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
1152  }
1153  else
1154  {
1155  inBlocks += inIncrement;
1156  block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1157  inBlocks += inIncrement;
1158  block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1159  inBlocks += inIncrement;
1160  block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1161  inBlocks += inIncrement;
1162  }
1163 
1164  if (flags & BlockTransformation::BT_XorInput)
1165  {
1166  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1167  xorBlocks += xorIncrement;
1168  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1169  xorBlocks += xorIncrement;
1170  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1171  xorBlocks += xorIncrement;
1172  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1173  xorBlocks += xorIncrement;
1174  }
1175 
1176  func4(block0, block1, block2, block3, subkeys, rounds);
1177 
1178  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1179  {
1180  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1181  xorBlocks += xorIncrement;
1182  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1183  xorBlocks += xorIncrement;
1184  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1185  xorBlocks += xorIncrement;
1186  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1187  xorBlocks += xorIncrement;
1188  }
1189 
1190  _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
1191  outBlocks += outIncrement;
1192  _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
1193  outBlocks += outIncrement;
1194  _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
1195  outBlocks += outIncrement;
1196  _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
1197  outBlocks += outIncrement;
1198 
1199  length -= 4*blockSize;
1200  }
1201  }
1202 
1203  while (length >= blockSize)
1204  {
1205  __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1206 
1207  if (flags & BlockTransformation::BT_XorInput)
1208  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1209 
1210  if (flags & BlockTransformation::BT_InBlockIsCounter)
1211  const_cast<byte *>(inBlocks)[15]++;
1212 
1213  func1(block, subkeys, rounds);
1214 
1215  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1216  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1217 
1218  _mm_storeu_si128((__m128i *)(void *)outBlocks, block);
1219 
1220  inBlocks += inIncrement;
1221  outBlocks += outIncrement;
1222  xorBlocks += xorIncrement;
1223  length -= blockSize;
1224  }
1225 
1226  return length;
1227 }
1228 #endif
1229 
1230 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1231 {
1232 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1233  if (HasAESNI())
1234  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1235 #endif
1236 
1237 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1238  if (HasSSE2())
1239  {
1240  if (length < BLOCKSIZE)
1241  return length;
1242 
1243  struct Locals
1244  {
1245  word32 subkeys[4*12], workspace[8];
1246  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1247  byte *outBlocks;
1248  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1249  size_t regSpill, lengthAndCounterFlag, keysBegin;
1250  };
1251 
1252  const byte* zeros = (byte *)(Te+256);
1253  byte *space = NULL;
1254 
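 // Carve a 256-byte aligned workspace off the stack and retry until it
 // does not alias the Te table (see AliasedWithTable above).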
1255  do {
1256 #if (CRYPTOPP_MSC_VERSION >= 1400)
1257  // http://msdn.microsoft.com/en-us/library/5471dc8s.aspx
1258  space = (byte *)_malloca(255+sizeof(Locals));
1259  space += (256-(size_t)space%256)%256;
1260 #else
1261  space = (byte *)alloca(255+sizeof(Locals));
1262  space += (256-(size_t)space%256)%256;
1263 #endif
1264  }
1265  while (AliasedWithTable(space, space+sizeof(Locals)));
1266 
1267  size_t increment = BLOCKSIZE;
1268  if (flags & BT_ReverseDirection)
1269  {
1270  CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1271  inBlocks += length - BLOCKSIZE;
1272  xorBlocks += length - BLOCKSIZE;
1273  outBlocks += length - BLOCKSIZE;
1274  increment = 0-increment;
1275  }
1276 
1277  Locals &locals = *(Locals *)(void *)space;
1278 
1279  locals.inBlocks = inBlocks;
1280  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1281  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1282  locals.outBlocks = outBlocks;
1283 
1284  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1285  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1286  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1287  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1288 
1289  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1290  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1291  locals.keysBegin = (12-keysToCopy)*16;
1292 
1293  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1294 
1295 #if (CRYPTOPP_MSC_VERSION >= 1400)
1296  _freea(space);
1297 #endif
1298 
1299  return length % BLOCKSIZE;
1300  }
1301 #endif
1302 
1303  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1304 }
1305 
1306 #endif
1307 
1308 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1309 
1310 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1311 {
1312  if (HasAESNI())
1313  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1314 
1315  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1316 }
1317 
1318 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1319 
1320 NAMESPACE_END
1321 
1322 #endif
1323 #endif