Crypto++  5.6.3
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
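/*
A rough sketch of the counter-mode caching idea (illustrative, not a description
of any public API): with a big-endian counter, up to 256 consecutive blocks
differ only in the last byte, so the key whitening and most of the first two
rounds depend only on bytes that never change. The assembly below computes that
part once (see L_SAVED_X) and, per block, redoes only the work that depends on
the changing counter byte.
*/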
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
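/*
A minimal sketch of that preload pattern as it appears below in
ProcessAndXorBlock (Te, cacheLineSize and s0..s3 are names from this file;
tableSize stands in for the 1024- or 2048-byte bound actually used):

	word32 u = 0;
	for (unsigned int i = 0; i < tableSize; i += cacheLineSize)
		u &= *(const word32 *)((const byte *)Te + i); // touch one word per cache line; u stays 0
	u &= Te[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;               // ORing zero changes nothing

Since u starts at zero and is only ANDed, the ORs are a data no-op; the code
relies on the compiler not noticing that and keeping the loads, as noted above.
*/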
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "stdcpp.h" // alloca
75 #include "misc.h"
76 #include "cpu.h"
77 
78 NAMESPACE_BEGIN(CryptoPP)
79 
80 // Hack for https://github.com/weidai11/cryptopp/issues/42
81 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
82 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
83 #endif
84 
85 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
86 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
87 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
88 using namespace rdtable;
89 # else
90 static word64 Te[256];
91 # endif
92 static word64 Td[256];
93 #else
94 static word32 Te[256*4], Td[256*4];
95 #endif
96 static volatile bool s_TeFilled = false, s_TdFilled = false;
97 
98 // ************************* Portable Code ************************************
99 
100 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
101  a ^= L(T, 3, byte(t)); t >>= 8;\
102  b ^= L(T, 2, byte(t)); t >>= 8;\
103  c ^= L(T, 1, byte(t)); t >>= 8;\
104  d ^= L(T, 0, t);
105 
106 #define QUARTER_ROUND_LE(t, a, b, c, d) \
107  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
108  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
109  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
110  tempBlock[d] = ((byte *)(Te+t))[1];
111 
112 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
113  #define QUARTER_ROUND_LD(t, a, b, c, d) \
114  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
115  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
116  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
117  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
118 #else
119  #define QUARTER_ROUND_LD(t, a, b, c, d) \
120  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
121  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
122  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
123  tempBlock[d] = Sd[t];
124 #endif
125 
126 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
127 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
128 
129 #ifdef IS_LITTLE_ENDIAN
130  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
131  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
132  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
133  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
134  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
135  #else
136  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
137  #define TL_M(T, i, x) T[i*256 + x]
138  #endif
139 #else
140  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
141  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
142  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
143  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
144  #define TL_M TL_F
145  #else
146  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
147  #define TL_M(T, i, x) T[i*256 + x]
148  #endif
149 #endif
150 
151 
152 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
153 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
154 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
155 
156 #define f3(x) (f2(x) ^ x)
157 #define f9(x) (f8(x) ^ x)
158 #define fb(x) (f8(x) ^ f2(x) ^ x)
159 #define fd(x) (f8(x) ^ f4(x) ^ x)
160 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
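// These macros multiply by small constants in GF(2^8) with the AES reduction
// polynomial x^8+x^4+x^3+x+1 (0x11b). A quick worked example of f2, the classic
// "xtime" step: f2(0x80) = (0x80<<1) ^ 0x11b = 0x100 ^ 0x11b = 0x1b, and
// f3(0x80) = f2(0x80) ^ 0x80 = 0x9b. FillEncTable/FillDecTable below use them to
// build the MixColumns (02,03,01,01) and InvMixColumns (0e,0b,0d,09) columns.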
161 
162 void Rijndael::Base::FillEncTable()
163 {
164  for (int i=0; i<256; i++)
165  {
166  byte x = Se[i];
167 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
168  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
169  Te[i] = word64(y | f3(x))<<32 | y;
170 #else
171  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
172  for (int j=0; j<4; j++)
173  {
174  Te[i+j*256] = y;
175  y = rotrFixed(y, 8);
176  }
177 #endif
178  }
179 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
180  Te[256] = Te[257] = 0;
181 #endif
182  s_TeFilled = true;
183 }
184 
185 void Rijndael::Base::FillDecTable()
186 {
187  for (int i=0; i<256; i++)
188  {
189  byte x = Sd[i];
190 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
191  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
192  Td[i] = word64(y | fb(x))<<32 | y | x;
193 #else
194  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
195  for (int j=0; j<4; j++)
196  {
197  Td[i+j*256] = y;
198  y = rotrFixed(y, 8);
199  }
200 #endif
201  }
202  s_TdFilled = true;
203 }
204 
205 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
206 {
207  AssertValidKeyLength(keylen);
208 
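  // keylen/4 + 6 is the standard AES round count: 10, 12 or 14 rounds for
  // 16-, 24- or 32-byte keys; m_key then holds 4*(m_rounds+1) round key words.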
209  m_rounds = keylen/4 + 6;
210  m_key.New(4*(m_rounds+1));
211 
212  word32 *rk = m_key;
213 
214 #if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
215  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
216  if (HasAESNI())
217  {
218  static const word32 rcLE[] = {
219  0x01, 0x02, 0x04, 0x08,
220  0x10, 0x20, 0x40, 0x80,
221  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
222  };
223  const word32 *rc = rcLE;
224 
225  __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
226  memcpy(rk, userKey, keylen);
227 
228  while (true)
229  {
230  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
231  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
232  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
233  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
234 
235  if (rk + keylen/4 + 4 == m_key.end())
236  break;
237 
238  if (keylen == 24)
239  {
240  rk[10] = rk[ 4] ^ rk[ 9];
241  rk[11] = rk[ 5] ^ rk[10];
242  temp = _mm_insert_epi32(temp, rk[11], 3);
243  }
244  else if (keylen == 32)
245  {
246  temp = _mm_insert_epi32(temp, rk[11], 3);
247  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
248  rk[13] = rk[ 5] ^ rk[12];
249  rk[14] = rk[ 6] ^ rk[13];
250  rk[15] = rk[ 7] ^ rk[14];
251  temp = _mm_insert_epi32(temp, rk[15], 3);
252  }
253  else
254  temp = _mm_insert_epi32(temp, rk[7], 3);
255 
256  rk += keylen/4;
257  }
258 
259  if (!IsForwardTransformation())
260  {
261  rk = m_key;
262  unsigned int i, j;
263 
264  std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
265 
266  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
267  {
268  temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
269  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
270  *(__m128i *)(void *)(rk+j) = temp;
271  }
272 
273  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
274  }
275 
276  return;
277  }
278 #endif
279 
280  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
281  const word32 *rc = rcon;
282  word32 temp;
283 
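  // Standard AES key expansion: each group of keylen/4 words starts from
  // W[i] = W[i-Nk] ^ SubWord(RotWord(W[i-1])) ^ Rcon; the x computed below is
  // SubWord(RotWord(temp)) assembled from the S-box Se.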
284  while (true)
285  {
286  temp = rk[keylen/4-1];
287  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
288  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
289  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
290  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
291  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
292 
293  if (rk + keylen/4 + 4 == m_key.end())
294  break;
295 
296  if (keylen == 24)
297  {
298  rk[10] = rk[ 4] ^ rk[ 9];
299  rk[11] = rk[ 5] ^ rk[10];
300  }
301  else if (keylen == 32)
302  {
303  temp = rk[11];
304  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
305  rk[13] = rk[ 5] ^ rk[12];
306  rk[14] = rk[ 6] ^ rk[13];
307  rk[15] = rk[ 7] ^ rk[14];
308  }
309  rk += keylen/4;
310  }
311 
312  rk = m_key;
313 
314  if (IsForwardTransformation())
315  {
316  if (!s_TeFilled)
317  FillEncTable();
318 
319  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
320  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
321  }
322  else
323  {
324  if (!s_TdFilled)
325  FillDecTable();
326 
327  unsigned int i, j;
328 
329 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
330 
331  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
332  {
333  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
334  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
335  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
336  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
337  }
338 
339  rk[i+0] = InverseMixColumn(rk[i+0]);
340  rk[i+1] = InverseMixColumn(rk[i+1]);
341  rk[i+2] = InverseMixColumn(rk[i+2]);
342  rk[i+3] = InverseMixColumn(rk[i+3]);
343 
344  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
345  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
346  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
347  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
348  }
349 
350 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
351  if (HasAESNI())
352  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
353 #endif
354 }
355 
356 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
357 {
358 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
359 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
360  if (HasSSE2())
361 #else
362  if (HasAESNI())
363 #endif
364  {
365  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
366  }
367 #endif
368 
369  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
370 
371  word32 s0, s1, s2, s3, t0, t1, t2, t3;
372  Block::Get(inBlock)(s0)(s1)(s2)(s3);
373 
374  const word32 *rk = m_key;
375  s0 ^= rk[0];
376  s1 ^= rk[1];
377  s2 ^= rk[2];
378  s3 ^= rk[3];
379  t0 = rk[4];
380  t1 = rk[5];
381  t2 = rk[6];
382  t3 = rk[7];
383  rk += 8;
384 
385  // timing attack countermeasure. see comments at top for more details
386  const int cacheLineSize = GetCacheLineSize();
387  unsigned int i;
388  word32 u = 0;
389 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
390  for (i=0; i<2048; i+=cacheLineSize)
391 #else
392  for (i=0; i<1024; i+=cacheLineSize)
393 #endif
394  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
395  u &= Te[255];
396  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
397 
398  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
399  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
400  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
401  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
402 
403  // Nr - 2 full rounds:
404  unsigned int r = m_rounds/2 - 1;
405  do
406  {
407  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
408 
409  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
410  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
411  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
412  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
413 
414  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
415 
416  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
417  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
418  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
419  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
420 
421  rk += 8;
422  } while (--r);
423 
424  word32 tbw[4];
425  byte *const tempBlock = (byte *)tbw;
426 
427  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
428  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
429  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
430  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
431 
432  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
433 }
434 
435 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
436 {
437 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
438  if (HasAESNI())
439  {
440  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
441  return;
442  }
443 #endif
444 
445  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
446 
447  word32 s0, s1, s2, s3, t0, t1, t2, t3;
448  Block::Get(inBlock)(s0)(s1)(s2)(s3);
449 
450  const word32 *rk = m_key;
451  s0 ^= rk[0];
452  s1 ^= rk[1];
453  s2 ^= rk[2];
454  s3 ^= rk[3];
455  t0 = rk[4];
456  t1 = rk[5];
457  t2 = rk[6];
458  t3 = rk[7];
459  rk += 8;
460 
461  // timing attack countermeasure. see comments at top for more details
462  const int cacheLineSize = GetCacheLineSize();
463  unsigned int i;
464  word32 u = 0;
465 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
466  for (i=0; i<2048; i+=cacheLineSize)
467 #else
468  for (i=0; i<1024; i+=cacheLineSize)
469 #endif
470  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
471  u &= Td[255];
472  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
473 
474  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
475  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
476  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
477  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
478 
479  // Nr - 2 full rounds:
480  unsigned int r = m_rounds/2 - 1;
481  do
482  {
483  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
484 
485  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
486  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
487  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
488  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
489 
490  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
491 
492  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
493  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
494  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
495  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
496 
497  rk += 8;
498  } while (--r);
499 
500 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
501  // timing attack countermeasure. see comments at top for more details
502  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
503  // QUARTER_ROUND_LD will use Td, which is already preloaded.
504  u = 0;
505  for (i=0; i<256; i+=cacheLineSize)
506  u &= *(const word32 *)(const void *)(Sd+i);
507  u &= *(const word32 *)(const void *)(Sd+252);
508  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
509 #endif
510 
511  word32 tbw[4];
512  byte *const tempBlock = (byte *)tbw;
513 
514  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
515  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
516  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
517  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
518 
519  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
520 }
521 
522 // ************************* Assembly Code ************************************
523 
524 #if CRYPTOPP_MSC_VERSION
525 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
526 #endif
527 
528 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
529 
530 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
531 
532 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
533 {
534  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
535 
536 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
537 
538 #define L_REG esp
539 #define L_INDEX(i) (L_REG+768+i)
540 #define L_INXORBLOCKS L_INBLOCKS+4
541 #define L_OUTXORBLOCKS L_INBLOCKS+8
542 #define L_OUTBLOCKS L_INBLOCKS+12
543 #define L_INCREMENTS L_INDEX(16*15)
544 #define L_SP L_INDEX(16*16)
545 #define L_LENGTH L_INDEX(16*16+4)
546 #define L_KEYS_BEGIN L_INDEX(16*16+8)
547 
548 #define MOVD movd
549 #define MM(i) mm##i
550 
551 #define MXOR(a,b,c) \
552  AS2( movzx esi, b)\
553  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
554  AS2( pxor MM(a), mm7)\
555 
556 #define MMOV(a,b,c) \
557  AS2( movzx esi, b)\
558  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
559 
560 #else
561 
562 #define L_REG r8
563 #define L_INDEX(i) (L_REG+i)
564 #define L_INXORBLOCKS L_INBLOCKS+8
565 #define L_OUTXORBLOCKS L_INBLOCKS+16
566 #define L_OUTBLOCKS L_INBLOCKS+24
567 #define L_INCREMENTS L_INDEX(16*16)
568 #define L_LENGTH L_INDEX(16*18+8)
569 #define L_KEYS_BEGIN L_INDEX(16*19)
570 
571 #define MOVD mov
572 #define MM_0 r9d
573 #define MM_1 r12d
574 #ifdef __GNUC__
575 #define MM_2 r11d
576 #else
577 #define MM_2 r10d
578 #endif
579 #define MM(i) MM_##i
580 
581 #define MXOR(a,b,c) \
582  AS2( movzx esi, b)\
583  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
584 
585 #define MMOV(a,b,c) \
586  AS2( movzx esi, b)\
587  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
588 
589 #endif
590 
591 #define L_SUBKEYS L_INDEX(0)
592 #define L_SAVED_X L_SUBKEYS
593 #define L_KEY12 L_INDEX(16*12)
594 #define L_LASTROUND L_INDEX(16*13)
595 #define L_INBLOCKS L_INDEX(16*14)
596 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
597 
598 #define XOR(a,b,c) \
599  AS2( movzx esi, b)\
600  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
601 
602 #define MOV(a,b,c) \
603  AS2( movzx esi, b)\
604  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
605 
606 #ifdef CRYPTOPP_GENERATE_X64_MASM
607  ALIGN 8
608  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
609  rex_push_reg rsi
610  push_reg rdi
611  push_reg rbx
612  push_reg r12
613  .endprolog
614  mov L_REG, rcx
615  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
616  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
617 #elif defined(__GNUC__)
618  __asm__ __volatile__
619  (
620  INTEL_NOPREFIX
621  #if CRYPTOPP_BOOL_X64
622  AS2( mov L_REG, rcx)
623  #endif
624  AS_PUSH_IF86(bx)
625  AS_PUSH_IF86(bp)
626  AS2( mov AS_REG_7, WORD_REG(si))
627 #else
628  AS_PUSH_IF86(si)
629  AS_PUSH_IF86(di)
630  AS_PUSH_IF86(bx)
631  AS_PUSH_IF86(bp)
632  AS2( lea AS_REG_7, [Te])
633  AS2( mov edi, [g_cacheLineSize])
634 #endif
635 
636 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
637  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
638  AS2( lea esp, [ecx-768])
639 #endif
640 
641  // copy subkeys to stack
642  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
643  AS2( mov WORD_REG(ax), 16)
644  AS2( and WORD_REG(ax), WORD_REG(si))
645  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
646  AS2( movdqa [L_KEY12], xmm3)
647  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
648  AS2( sub WORD_REG(ax), WORD_REG(si))
649  ASL(0)
650  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
651  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
652  AS2( add WORD_REG(si), 16)
653  AS2( cmp WORD_REG(si), 16*12)
654  ATT_NOPREFIX
655  ASJ( jl, 0, b)
656  INTEL_NOPREFIX
657 
658  // read subkeys 0, 1 and last
659  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
660  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
661  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
662  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
663  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
664  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
665 
666  // load table into cache
667  AS2( xor WORD_REG(ax), WORD_REG(ax))
668  ASL(9)
669  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
670  AS2( add WORD_REG(ax), WORD_REG(di))
671  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
672  AS2( add WORD_REG(ax), WORD_REG(di))
673  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
674  AS2( add WORD_REG(ax), WORD_REG(di))
675  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
676  AS2( add WORD_REG(ax), WORD_REG(di))
677  AS2( cmp WORD_REG(ax), 2048)
678  ATT_NOPREFIX
679  ASJ( jl, 9, b)
680  INTEL_NOPREFIX
681  AS1( lfence)
682 
683  AS2( test DWORD PTR [L_LENGTH], 1)
684  ATT_NOPREFIX
685  ASJ( jz, 8, f)
686  INTEL_NOPREFIX
687 
688  // counter mode one-time setup
689  AS2( mov WORD_REG(si), [L_INBLOCKS])
690  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
691  AS2( pxor xmm2, xmm1)
692  AS2( psrldq xmm1, 14)
693  AS2( movd eax, xmm1)
694  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
695  AS2( MOVD MM(2), eax)
696 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
697  AS2( mov eax, 1)
698  AS2( movd mm3, eax)
699 #endif
700 
701  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
702  AS2( movd eax, xmm2)
703  AS2( psrldq xmm2, 4)
704  AS2( movd edi, xmm2)
705  AS2( psrldq xmm2, 4)
706  MXOR( 1, al, 0) // 0
707  XOR( edx, ah, 1) // 1
708  AS2( shr eax, 16)
709  XOR( ecx, al, 2) // 2
710  XOR( ebx, ah, 3) // 3
711  AS2( mov eax, edi)
712  AS2( movd edi, xmm2)
713  AS2( psrldq xmm2, 4)
714  XOR( ebx, al, 0) // 4
715  MXOR( 1, ah, 1) // 5
716  AS2( shr eax, 16)
717  XOR( edx, al, 2) // 6
718  XOR( ecx, ah, 3) // 7
719  AS2( mov eax, edi)
720  AS2( movd edi, xmm2)
721  XOR( ecx, al, 0) // 8
722  XOR( ebx, ah, 1) // 9
723  AS2( shr eax, 16)
724  MXOR( 1, al, 2) // 10
725  XOR( edx, ah, 3) // 11
726  AS2( mov eax, edi)
727  XOR( edx, al, 0) // 12
728  XOR( ecx, ah, 1) // 13
729  AS2( shr eax, 16)
730  XOR( ebx, al, 2) // 14
731  AS2( psrldq xmm2, 3)
732 
733  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
734  AS2( mov eax, [L_KEY12+0*4])
735  AS2( mov edi, [L_KEY12+2*4])
736  AS2( MOVD MM(0), [L_KEY12+3*4])
737  MXOR( 0, cl, 3) /* 11 */
738  XOR( edi, bl, 3) /* 7 */
739  MXOR( 0, bh, 2) /* 6 */
740  AS2( shr ebx, 16) /* 4,5 */
741  XOR( eax, bl, 1) /* 5 */
742  MOV( ebx, bh, 0) /* 4 */
743  AS2( xor ebx, [L_KEY12+1*4])
744  XOR( eax, ch, 2) /* 10 */
745  AS2( shr ecx, 16) /* 8,9 */
746  XOR( eax, dl, 3) /* 15 */
747  XOR( ebx, dh, 2) /* 14 */
748  AS2( shr edx, 16) /* 12,13 */
749  XOR( edi, ch, 0) /* 8 */
750  XOR( ebx, cl, 1) /* 9 */
751  XOR( edi, dl, 1) /* 13 */
752  MXOR( 0, dh, 0) /* 12 */
753 
754  AS2( movd ecx, xmm2)
755  AS2( MOVD edx, MM(1))
756  AS2( MOVD [L_SAVED_X+3*4], MM(0))
757  AS2( mov [L_SAVED_X+0*4], eax)
758  AS2( mov [L_SAVED_X+1*4], ebx)
759  AS2( mov [L_SAVED_X+2*4], edi)
760  ATT_NOPREFIX
761  ASJ( jmp, 5, f)
762  INTEL_NOPREFIX
763  ASL(3)
764  // non-counter mode per-block setup
765  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
766  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
767  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
768  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
769  ASL(8)
770  AS2( mov WORD_REG(ax), [L_INBLOCKS])
771  AS2( movdqu xmm2, [WORD_REG(ax)])
772  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
773  AS2( movdqu xmm5, [WORD_REG(si)])
774  AS2( pxor xmm2, xmm1)
775  AS2( pxor xmm2, xmm5)
776 
777  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
778  AS2( movd eax, xmm2)
779  AS2( psrldq xmm2, 4)
780  AS2( movd edi, xmm2)
781  AS2( psrldq xmm2, 4)
782  MXOR( 1, al, 0) // 0
783  XOR( edx, ah, 1) // 1
784  AS2( shr eax, 16)
785  XOR( ecx, al, 2) // 2
786  XOR( ebx, ah, 3) // 3
787  AS2( mov eax, edi)
788  AS2( movd edi, xmm2)
789  AS2( psrldq xmm2, 4)
790  XOR( ebx, al, 0) // 4
791  MXOR( 1, ah, 1) // 5
792  AS2( shr eax, 16)
793  XOR( edx, al, 2) // 6
794  XOR( ecx, ah, 3) // 7
795  AS2( mov eax, edi)
796  AS2( movd edi, xmm2)
797  XOR( ecx, al, 0) // 8
798  XOR( ebx, ah, 1) // 9
799  AS2( shr eax, 16)
800  MXOR( 1, al, 2) // 10
801  XOR( edx, ah, 3) // 11
802  AS2( mov eax, edi)
803  XOR( edx, al, 0) // 12
804  XOR( ecx, ah, 1) // 13
805  AS2( shr eax, 16)
806  XOR( ebx, al, 2) // 14
807  MXOR( 1, ah, 3) // 15
808  AS2( MOVD eax, MM(1))
809 
810  AS2( add L_REG, [L_KEYS_BEGIN])
811  AS2( add L_REG, 4*16)
812  ATT_NOPREFIX
813  ASJ( jmp, 2, f)
814  INTEL_NOPREFIX
815  ASL(1)
816  // counter-mode per-block setup
817  AS2( MOVD ecx, MM(2))
818  AS2( MOVD edx, MM(1))
819  AS2( mov eax, [L_SAVED_X+0*4])
820  AS2( mov ebx, [L_SAVED_X+1*4])
821  AS2( xor cl, ch)
822  AS2( and WORD_REG(cx), 255)
823  ASL(5)
824 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
825  AS2( paddb MM(2), mm3)
826 #else
827  AS2( add MM(2), 1)
828 #endif
829  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
830  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
831  XOR( ebx, dl, 3)
832  MOV( ecx, dh, 2)
833  AS2( shr edx, 16)
834  AS2( xor ecx, [L_SAVED_X+2*4])
835  XOR( eax, dh, 0)
836  MOV( edx, dl, 1)
837  AS2( xor edx, [L_SAVED_X+3*4])
838 
839  AS2( add L_REG, [L_KEYS_BEGIN])
840  AS2( add L_REG, 3*16)
841  ATT_NOPREFIX
842  ASJ( jmp, 4, f)
843  INTEL_NOPREFIX
844 
845 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
846 // out: eax, ebx, edi, mm0
847 #define ROUND() \
848  MXOR( 0, cl, 3) /* 11 */\
849  AS2( mov cl, al) /* 8,9,10,3 */\
850  XOR( edi, ah, 2) /* 2 */\
851  AS2( shr eax, 16) /* 0,1 */\
852  XOR( edi, bl, 3) /* 7 */\
853  MXOR( 0, bh, 2) /* 6 */\
854  AS2( shr ebx, 16) /* 4,5 */\
855  MXOR( 0, al, 1) /* 1 */\
856  MOV( eax, ah, 0) /* 0 */\
857  XOR( eax, bl, 1) /* 5 */\
858  MOV( ebx, bh, 0) /* 4 */\
859  XOR( eax, ch, 2) /* 10 */\
860  XOR( ebx, cl, 3) /* 3 */\
861  AS2( shr ecx, 16) /* 8,9 */\
862  XOR( eax, dl, 3) /* 15 */\
863  XOR( ebx, dh, 2) /* 14 */\
864  AS2( shr edx, 16) /* 12,13 */\
865  XOR( edi, ch, 0) /* 8 */\
866  XOR( ebx, cl, 1) /* 9 */\
867  XOR( edi, dl, 1) /* 13 */\
868  MXOR( 0, dh, 0) /* 12 */\
869 
870  ASL(2) // 2-round loop
871  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
872  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
873  ROUND()
874  AS2( mov ecx, edi)
875  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
876  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
877  AS2( MOVD edx, MM(0))
878 
879  ASL(4)
880  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
881  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
882  ROUND()
883  AS2( mov ecx, edi)
884  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
885  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
886  AS2( MOVD edx, MM(0))
887 
888  AS2( add L_REG, 32)
889  AS2( test L_REG, 255)
890  ATT_NOPREFIX
891  ASJ( jnz, 2, b)
892  INTEL_NOPREFIX
893  AS2( sub L_REG, 16*16)
894 
895 #define LAST(a, b, c) \
896  AS2( movzx esi, a )\
897  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
898  AS2( movzx esi, b )\
899  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
900  AS2( mov WORD PTR [L_LASTROUND+c], di )\
901 
902  // last round
903  LAST(ch, dl, 2)
904  LAST(dh, al, 6)
905  AS2( shr edx, 16)
906  LAST(ah, bl, 10)
907  AS2( shr eax, 16)
908  LAST(bh, cl, 14)
909  AS2( shr ebx, 16)
910  LAST(dh, al, 12)
911  AS2( shr ecx, 16)
912  LAST(ah, bl, 0)
913  LAST(bh, cl, 4)
914  LAST(ch, dl, 8)
915 
916  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
917  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
918 
919  AS2( mov WORD_REG(cx), [L_LENGTH])
920  AS2( sub WORD_REG(cx), 16)
921 
922  AS2( movdqu xmm2, [WORD_REG(ax)])
923  AS2( pxor xmm2, xmm4)
924 
925 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
926  AS2( movdqa xmm0, [L_INCREMENTS])
927  AS2( paddd xmm0, [L_INBLOCKS])
928  AS2( movdqa [L_INBLOCKS], xmm0)
929 #else
930  AS2( movdqa xmm0, [L_INCREMENTS+16])
931  AS2( paddq xmm0, [L_INBLOCKS+16])
932  AS2( movdqa [L_INBLOCKS+16], xmm0)
933 #endif
934 
935  AS2( pxor xmm2, [L_LASTROUND])
936  AS2( movdqu [WORD_REG(bx)], xmm2)
937 
938  ATT_NOPREFIX
939  ASJ( jle, 7, f)
940  INTEL_NOPREFIX
941  AS2( mov [L_LENGTH], WORD_REG(cx))
942  AS2( test WORD_REG(cx), 1)
943  ATT_NOPREFIX
944  ASJ( jnz, 1, b)
945  INTEL_NOPREFIX
946 #if CRYPTOPP_BOOL_X64
947  AS2( movdqa xmm0, [L_INCREMENTS])
948  AS2( paddq xmm0, [L_INBLOCKS])
949  AS2( movdqa [L_INBLOCKS], xmm0)
950 #endif
951  ATT_NOPREFIX
952  ASJ( jmp, 3, b)
953  INTEL_NOPREFIX
954 
955  ASL(7)
956  // erase keys on stack
957  AS2( xorps xmm0, xmm0)
958  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
959  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
960  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
961  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
962  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
963  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
964  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
965  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
966  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
967  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
968  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
969  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
970  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
971  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
972  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
973 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
974  AS2( mov esp, [L_SP])
975  AS1( emms)
976 #endif
977  AS_POP_IF86(bp)
978  AS_POP_IF86(bx)
979 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
980  AS_POP_IF86(di)
981  AS_POP_IF86(si)
982  AS1(ret)
983 #endif
984 #ifdef CRYPTOPP_GENERATE_X64_MASM
985  pop r12
986  pop rbx
987  pop rdi
988  pop rsi
989  ret
990  Rijndael_Enc_AdvancedProcessBlocks ENDP
991 #endif
992 #ifdef __GNUC__
993  ATT_PREFIX
994  :
995  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
996  : "memory", "cc", "%eax"
997  #if CRYPTOPP_BOOL_X64
998  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
999  #endif
1000  );
1001 #endif
1002 }
1003 
1004 #endif
1005 
1006 #ifndef CRYPTOPP_GENERATE_X64_MASM
1007 
1008 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1009 extern "C" {
1010 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1011 }
1012 #endif
1013 
1014 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1015 
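// Returns true when [begin, end) overlaps the Te table modulo 4096. The SSE2
// path below keeps re-drawing its stack workspace until this is false, so that
// workspace accesses are unlikely to evict Te's lines from low-associativity
// caches (see the timing-attack notes at the top of the file).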
1016 static inline bool AliasedWithTable(const byte *begin, const byte *end)
1017 {
1018  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
1019  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
1020  if (t1 > t0)
1021  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
1022  else
1023  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1024 }
1025 
1026 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1027 
1028 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1029 {
1030  block = _mm_xor_si128(block, subkeys[0]);
1031  for (unsigned int i=1; i<rounds-1; i+=2)
1032  {
1033  block = _mm_aesenc_si128(block, subkeys[i]);
1034  block = _mm_aesenc_si128(block, subkeys[i+1]);
1035  }
1036  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1037  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1038 }
1039 
1040 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1041 {
1042  __m128i rk = subkeys[0];
1043  block0 = _mm_xor_si128(block0, rk);
1044  block1 = _mm_xor_si128(block1, rk);
1045  block2 = _mm_xor_si128(block2, rk);
1046  block3 = _mm_xor_si128(block3, rk);
1047  for (unsigned int i=1; i<rounds; i++)
1048  {
1049  rk = subkeys[i];
1050  block0 = _mm_aesenc_si128(block0, rk);
1051  block1 = _mm_aesenc_si128(block1, rk);
1052  block2 = _mm_aesenc_si128(block2, rk);
1053  block3 = _mm_aesenc_si128(block3, rk);
1054  }
1055  rk = subkeys[rounds];
1056  block0 = _mm_aesenclast_si128(block0, rk);
1057  block1 = _mm_aesenclast_si128(block1, rk);
1058  block2 = _mm_aesenclast_si128(block2, rk);
1059  block3 = _mm_aesenclast_si128(block3, rk);
1060 }
1061 
1062 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1063 {
1064  block = _mm_xor_si128(block, subkeys[0]);
1065  for (unsigned int i=1; i<rounds-1; i+=2)
1066  {
1067  block = _mm_aesdec_si128(block, subkeys[i]);
1068  block = _mm_aesdec_si128(block, subkeys[i+1]);
1069  }
1070  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1071  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1072 }
1073 
1074 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1075 {
1076  __m128i rk = subkeys[0];
1077  block0 = _mm_xor_si128(block0, rk);
1078  block1 = _mm_xor_si128(block1, rk);
1079  block2 = _mm_xor_si128(block2, rk);
1080  block3 = _mm_xor_si128(block3, rk);
1081  for (unsigned int i=1; i<rounds; i++)
1082  {
1083  rk = subkeys[i];
1084  block0 = _mm_aesdec_si128(block0, rk);
1085  block1 = _mm_aesdec_si128(block1, rk);
1086  block2 = _mm_aesdec_si128(block2, rk);
1087  block3 = _mm_aesdec_si128(block3, rk);
1088  }
1089  rk = subkeys[rounds];
1090  block0 = _mm_aesdeclast_si128(block0, rk);
1091  block1 = _mm_aesdeclast_si128(block1, rk);
1092  block2 = _mm_aesdeclast_si128(block2, rk);
1093  block3 = _mm_aesdeclast_si128(block3, rk);
1094 }
1095 
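// Added with _mm_add_epi32, s_one (1<<24 in the highest 32-bit lane) bumps the
// most significant byte of that lane, which is byte 15 of the loaded block:
// it increments the big-endian counter's low byte, matching the inBlocks[15]++
// in the single-block loop further down.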
1096 static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
1097 
1098 template <typename F1, typename F4>
1099 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1100 {
1101  size_t blockSize = 16;
1102  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1103  size_t xorIncrement = xorBlocks ? blockSize : 0;
1104  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1105 
1106  if (flags & BlockTransformation::BT_ReverseDirection)
1107  {
1108  assert(length % blockSize == 0);
1109  inBlocks += length - blockSize;
1110  xorBlocks += length - blockSize;
1111  outBlocks += length - blockSize;
1112  inIncrement = 0-inIncrement;
1113  xorIncrement = 0-xorIncrement;
1114  outIncrement = 0-outIncrement;
1115  }
1116 
1117  if (flags & BlockTransformation::BT_AllowParallel)
1118  {
1119  while (length >= 4*blockSize)
1120  {
1121  __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
1122  if (flags & BlockTransformation::BT_InBlockIsCounter)
1123  {
1124  const __m128i be1 = *(const __m128i *)(const void *)s_one;
1125  block1 = _mm_add_epi32(block0, be1);
1126  block2 = _mm_add_epi32(block1, be1);
1127  block3 = _mm_add_epi32(block2, be1);
1128  _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
1129  }
1130  else
1131  {
1132  inBlocks += inIncrement;
1133  block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1134  inBlocks += inIncrement;
1135  block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1136  inBlocks += inIncrement;
1137  block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1138  inBlocks += inIncrement;
1139  }
1140 
1141  if (flags & BlockTransformation::BT_XorInput)
1142  {
1143  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1144  xorBlocks += xorIncrement;
1145  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1146  xorBlocks += xorIncrement;
1147  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1148  xorBlocks += xorIncrement;
1149  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1150  xorBlocks += xorIncrement;
1151  }
1152 
1153  func4(block0, block1, block2, block3, subkeys, rounds);
1154 
1155  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1156  {
1157  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1158  xorBlocks += xorIncrement;
1159  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1160  xorBlocks += xorIncrement;
1161  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1162  xorBlocks += xorIncrement;
1163  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1164  xorBlocks += xorIncrement;
1165  }
1166 
1167  _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
1168  outBlocks += outIncrement;
1169  _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
1170  outBlocks += outIncrement;
1171  _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
1172  outBlocks += outIncrement;
1173  _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
1174  outBlocks += outIncrement;
1175 
1176  length -= 4*blockSize;
1177  }
1178  }
1179 
1180  while (length >= blockSize)
1181  {
1182  __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1183 
1184  if (flags & BlockTransformation::BT_XorInput)
1185  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1186 
1187  if (flags & BlockTransformation::BT_InBlockIsCounter)
1188  const_cast<byte *>(inBlocks)[15]++;
1189 
1190  func1(block, subkeys, rounds);
1191 
1192  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1193  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1194 
1195  _mm_storeu_si128((__m128i *)(void *)outBlocks, block);
1196 
1197  inBlocks += inIncrement;
1198  outBlocks += outIncrement;
1199  xorBlocks += xorIncrement;
1200  length -= blockSize;
1201  }
1202 
1203  return length;
1204 }
1205 #endif
1206 
1207 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1208 {
1209 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1210  if (HasAESNI())
1211  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1212 #endif
1213 
1214 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1215  if (HasSSE2())
1216  {
1217  if (length < BLOCKSIZE)
1218  return length;
1219 
1220  struct Locals
1221  {
1222  word32 subkeys[4*12], workspace[8];
1223  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1224  byte *outBlocks;
1225  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1226  size_t regSpill, lengthAndCounterFlag, keysBegin;
1227  };
1228 
1229  size_t increment = BLOCKSIZE;
1230  const byte* zeros = (byte *)(Te+256);
1231  byte *space;
1232 
1233  do {
1234  // https://msdn.microsoft.com/en-us/library/5471dc8s.aspx
1235 #if (CRYPTOPP_MSC_VERSION >= 1400)
1236  space = (byte *)_malloca(255+sizeof(Locals));
1237  space += (256-(size_t)space%256)%256;
1238 #else
1239  space = (byte *)alloca(255+sizeof(Locals));
1240  space += (256-(size_t)space%256)%256;
1241 #endif
1242  }
1243  while (AliasedWithTable(space, space+sizeof(Locals)));
1244 
1245  if (flags & BT_ReverseDirection)
1246  {
1247  assert(length % BLOCKSIZE == 0);
1248  inBlocks += length - BLOCKSIZE;
1249  xorBlocks += length - BLOCKSIZE;
1250  outBlocks += length - BLOCKSIZE;
1251  increment = 0-increment;
1252  }
1253 
1254  Locals &locals = *(Locals *)(void *)space;
1255 
1256  locals.inBlocks = inBlocks;
1257  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1258  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1259  locals.outBlocks = outBlocks;
1260 
1261  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1262  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1263  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1264  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1265 
1266  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1267  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1268  locals.keysBegin = (12-keysToCopy)*16;
1269 
1270  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1271  return length % BLOCKSIZE;
1272  }
1273 #endif
1274 
1275  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1276 }
1277 
1278 #endif
1279 
1280 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1281 
1282 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1283 {
1284  if (HasAESNI())
1285  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1286 
1287  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1288 }
1289 
1290 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1291 
1292 NAMESPACE_END
1293 
1294 #endif
1295 #endif