Crypto++ 5.6.5
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "misc.h"
75 #include "cpu.h"
76 
77 NAMESPACE_BEGIN(CryptoPP)
78 
79 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
80 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
81 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
82 #endif
83 
84 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
85 #if (__SUNPRO_CC >= 0x5130)
86 # define MAYBE_CONST
87 #else
88 # define MAYBE_CONST const
89 #endif
90 
91 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
92 # if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
93 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
94 using namespace rdtable;
95 # else
96 static word64 Te[256];
97 # endif
98 static word64 Td[256];
99 #else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
100 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
101 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
102 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
103 # endif
104 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
105 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
106 #endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
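// Two table layouts are used by the portable code. When unaligned data access is
// allowed, Te/Td are compressed tables: each 8-byte entry packs S[i] times the
// (Inv)MixColumns coefficients so that 32-bit reads at byte offsets 1-4 yield the
// four rotations of the usual round-table word. Otherwise Te/Td are the classic
// four 256-entry word32 tables, each a byte rotation of the previous one (see
// FillEncTable/FillDecTable below).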
107 
108 static volatile bool s_TeFilled = false, s_TdFilled = false;
109 
110 // ************************* Portable Code ************************************
111 
112 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
113  a ^= L(T, 3, byte(t)); t >>= 8;\
114  b ^= L(T, 2, byte(t)); t >>= 8;\
115  c ^= L(T, 1, byte(t)); t >>= 8;\
116  d ^= L(T, 0, t);
117 
118 #define QUARTER_ROUND_LE(t, a, b, c, d) \
119  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
120  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
121  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
122  tempBlock[d] = ((byte *)(Te+t))[1];
123 
124 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
125  #define QUARTER_ROUND_LD(t, a, b, c, d) \
126  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
127  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
128  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
129  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
130 #else
131  #define QUARTER_ROUND_LD(t, a, b, c, d) \
132  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
133  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
134  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
135  tempBlock[d] = Sd[t];
136 #endif
137 
138 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
139 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
140 
141 #ifdef IS_LITTLE_ENDIAN
142  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
143  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
144  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
145  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
146  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
147  #else
148  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
149  #define TL_M(T, i, x) T[i*256 + x]
150  #endif
151 #else
152  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
153  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
154  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
155  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
156  #define TL_M TL_F
157  #else
158  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
159  #define TL_M(T, i, x) T[i*256 + x]
160  #endif
161 #endif
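// QUARTER_ROUND xors a table lookup for each byte of the state word t into four
// different output words. The _FE/_FD variants (first round) use TL_F, which only
// touches table memory already warmed by the preload loops in ProcessAndXorBlock;
// the _E/_D variants (middle rounds) use TL_M. With compressed tables both are
// unaligned 32-bit reads at different byte offsets into the 8-byte entries; with
// full tables TL_F rotates an entry of the first table and TL_M indexes one of the
// four rotated copies.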
162 
163 
164 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
165 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
166 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
167 
168 #define f3(x) (f2(x) ^ x)
169 #define f9(x) (f8(x) ^ x)
170 #define fb(x) (f8(x) ^ f2(x) ^ x)
171 #define fd(x) (f8(x) ^ f4(x) ^ x)
172 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
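// f2/f4/f8 multiply a byte by 2, 4 and 8 in GF(2^8) modulo the AES polynomial
// x^8+x^4+x^3+x+1 (0x11b); for example f2(0x80) = 0x100 ^ 0x11b = 0x1b. The
// remaining macros combine them into the MixColumns/InvMixColumns coefficients
// 03, 09, 0b, 0d and 0e used to build the tables below.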
173 
174 void Rijndael::Base::FillEncTable()
175 {
176  for (int i=0; i<256; i++)
177  {
178  byte x = Se[i];
179 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
180  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
181  Te[i] = word64(y | f3(x))<<32 | y;
182 #else
183  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
184  for (int j=0; j<4; j++)
185  {
186  Te[i+j*256] = y;
187  y = rotrFixed(y, 8);
188  }
189 #endif
190  }
191 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
192  Te[256] = Te[257] = 0;
193 #endif
194  s_TeFilled = true;
195 }
196 
197 void Rijndael::Base::FillDecTable()
198 {
199  for (int i=0; i<256; i++)
200  {
201  byte x = Sd[i];
202 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
203  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
204  Td[i] = word64(y | fb(x))<<32 | y | x;
205 #else
206  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
207  for (int j=0; j<4; j++)
208  {
209  Td[i+j*256] = y;
210  y = rotrFixed(y, 8);
211  }
212 #endif
213  }
214  s_TdFilled = true;
215 }
216 
217 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
218 {
219  AssertValidKeyLength(keylen);
220 
221  m_rounds = keylen/4 + 6;
222  m_key.New(4*(m_rounds+1));
223 
224  word32 *rk = m_key;
225 
226 #if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE4_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_X86 || CRYPTOPP_X32))
227  // MSVC 2008 SP1 generates bad code for MM_EXTRACT_EPI32() when compiling for X64
228  if (HasAESNI() && HasSSE4())
229  {
230  static const word32 rcLE[] = {
231  0x01, 0x02, 0x04, 0x08,
232  0x10, 0x20, 0x40, 0x80,
233  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
234  };
235 
236  // Coverity finding, appears to be false positive. Assert the condition.
237  const word32 *ro = rcLE, *rc = rcLE;
238  CRYPTOPP_UNUSED(ro);
239 
240  __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
241  memcpy(rk, userKey, keylen);
242 
243  while (true)
244  {
245  // Coverity finding, appears to be false positive. Assert the condition.
246  CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
247  rk[keylen/4] = rk[0] ^ MM_EXTRACT_EPI32(MM_AESKEYGENASSIST_SI128(temp, 0), 3) ^ *(rc++);
248  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
249  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
250  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
251 
252  if (rk + keylen/4 + 4 == m_key.end())
253  break;
254 
255  if (keylen == 24)
256  {
257  rk[10] = rk[ 4] ^ rk[ 9];
258  rk[11] = rk[ 5] ^ rk[10];
259  // Coverity finding, appears to be false positive. Assert the condition.
260  CRYPTOPP_ASSERT(m_key.size() >= 12);
261  temp = MM_INSERT_EPI32(temp, rk[11], 3);
262  }
263  else if (keylen == 32)
264  {
265  // Coverity finding, appears to be false positive. Assert the condition.
266  CRYPTOPP_ASSERT(m_key.size() >= 12);
267  temp = MM_INSERT_EPI32(temp, rk[11], 3);
268  rk[12] = rk[ 4] ^ MM_EXTRACT_EPI32(MM_AESKEYGENASSIST_SI128(temp, 0), 2);
269  rk[13] = rk[ 5] ^ rk[12];
270  rk[14] = rk[ 6] ^ rk[13];
271  rk[15] = rk[ 7] ^ rk[14];
272  // Coverity finding, appears to be false positive. Assert the condition.
273  CRYPTOPP_ASSERT(m_key.size() >= 16);
274  temp = MM_INSERT_EPI32(temp, rk[15], 3);
275  }
276  else
277  {
278  // Coverity finding, appears to be false positive. Assert the condition.
279  CRYPTOPP_ASSERT(m_key.size() >= 8);
280  temp = MM_INSERT_EPI32(temp, rk[7], 3);
281  }
282 
283  rk += keylen/4;
284  }
285 
286  if (!IsForwardTransformation())
287  {
288  rk = m_key;
289  unsigned int i, j;
290 
291 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
292  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
293  // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
294  vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
295 #else
296  std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
297 #endif
298  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
299  {
300  temp = MM_AESIMC_SI128(*(__m128i *)(void *)(rk+i));
301  *(__m128i *)(void *)(rk+i) = MM_AESIMC_SI128(*(__m128i *)(void *)(rk+j));
302  *(__m128i *)(void *)(rk+j) = temp;
303  }
304 
305  *(__m128i *)(void *)(rk+i) = MM_AESIMC_SI128(*(__m128i *)(void *)(rk+i));
306  }
307 
308  return;
309  }
310 #endif
311 
312  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
313  const word32 *rc = rcon;
314  word32 temp;
315 
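// Portable FIPS-197 key expansion. temp is the last word of the previous group of
// keylen/4 round-key words; the Se[] lookups below compute SubWord(RotWord(temp))
// on the big-endian words produced by GetUserKey, and the result is xored with the
// next round constant from rcon.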
316  while (true)
317  {
318  temp = rk[keylen/4-1];
319  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
320  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
321  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
322  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
323  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
324 
325  if (rk + keylen/4 + 4 == m_key.end())
326  break;
327 
328  if (keylen == 24)
329  {
330  rk[10] = rk[ 4] ^ rk[ 9];
331  rk[11] = rk[ 5] ^ rk[10];
332  }
333  else if (keylen == 32)
334  {
335  temp = rk[11];
336  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
337  rk[13] = rk[ 5] ^ rk[12];
338  rk[14] = rk[ 6] ^ rk[13];
339  rk[15] = rk[ 7] ^ rk[14];
340  }
341  rk += keylen/4;
342  }
343 
344  rk = m_key;
345 
346  if (IsForwardTransformation())
347  {
348  if (!s_TeFilled)
349  FillEncTable();
350 
351  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
352  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
353  }
354  else
355  {
356  if (!s_TdFilled)
357  FillDecTable();
358 
359  unsigned int i, j;
360 
361 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
362 
363  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
364  {
365  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
366  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
367  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
368  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
369  }
370 
371  rk[i+0] = InverseMixColumn(rk[i+0]);
372  rk[i+1] = InverseMixColumn(rk[i+1]);
373  rk[i+2] = InverseMixColumn(rk[i+2]);
374  rk[i+3] = InverseMixColumn(rk[i+3]);
375 
376  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
377  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
378  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
379  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
380  }
381 
382 #if CRYPTOPP_AESNI_AVAILABLE
383  if (HasAESNI())
384  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
385 #endif
386 }
387 
388 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
389 {
390 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE
391 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
392  if (HasSSE2())
393 #else
394  if (HasAESNI())
395 #endif
396  {
397  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
398  }
399 #endif
400 
401  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
402 
403  word32 s0, s1, s2, s3, t0, t1, t2, t3;
404  Block::Get(inBlock)(s0)(s1)(s2)(s3);
405 
406  const word32 *rk = m_key;
407  s0 ^= rk[0];
408  s1 ^= rk[1];
409  s2 ^= rk[2];
410  s3 ^= rk[3];
411  t0 = rk[4];
412  t1 = rk[5];
413  t2 = rk[6];
414  t3 = rk[7];
415  rk += 8;
416 
417  // timing attack countermeasure. see comments at top for more details.
418  // also see http://github.com/weidai11/cryptopp/issues/146
419  const int cacheLineSize = GetCacheLineSize();
420  unsigned int i;
421  volatile word32 _u = 0;
422  word32 u = _u;
423 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
424  for (i=0; i<2048; i+=cacheLineSize)
425 #else
426  for (i=0; i<1024; i+=cacheLineSize)
427 #endif
428  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
429  u &= Te[255];
430  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
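// u starts from a volatile zero and is only ever ANDed with table words, so it is
// still zero here and the |= above leaves s0..s3 unchanged. Because the compiler
// cannot prove that, the loads (one per cache line) survive and pull the table into
// L1 cache before the first round, as described in the July 2006 note at the top.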
431 
432  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
433  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
434  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
435  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
436 
437  // Nr - 2 full rounds:
438  unsigned int r = m_rounds/2 - 1;
439  do
440  {
441  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
442 
443  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
444  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
445  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
446  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
447 
448  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
449 
450  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
451  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
452  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
453  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
454 
455  rk += 8;
456  } while (--r);
457 
458  word32 tbw[4];
459  byte *const tempBlock = (byte *)tbw;
460 
461  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
462  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
463  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
464  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
465 
466  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
467 }
468 
469 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
470 {
471 #if CRYPTOPP_AESNI_AVAILABLE
472  if (HasAESNI())
473  {
474  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
475  return;
476  }
477 #endif
478 
479  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
480 
481  word32 s0, s1, s2, s3, t0, t1, t2, t3;
482  Block::Get(inBlock)(s0)(s1)(s2)(s3);
483 
484  const word32 *rk = m_key;
485  s0 ^= rk[0];
486  s1 ^= rk[1];
487  s2 ^= rk[2];
488  s3 ^= rk[3];
489  t0 = rk[4];
490  t1 = rk[5];
491  t2 = rk[6];
492  t3 = rk[7];
493  rk += 8;
494 
495  // timing attack countermeasure. see comments at top for more details.
496  // also see http://github.com/weidai11/cryptopp/issues/146
497  const int cacheLineSize = GetCacheLineSize();
498  unsigned int i;
499  volatile word32 _u = 0;
500  word32 u = _u;
501 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
502  for (i=0; i<2048; i+=cacheLineSize)
503 #else
504  for (i=0; i<1024; i+=cacheLineSize)
505 #endif
506  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
507  u &= Td[255];
508  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
509 
510  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
511  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
512  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
513  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
514 
515  // Nr - 2 full rounds:
516  unsigned int r = m_rounds/2 - 1;
517  do
518  {
519  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
520 
521  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
522  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
523  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
524  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
525 
526  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
527 
528  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
529  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
530  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
531  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
532 
533  rk += 8;
534  } while (--r);
535 
536 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
537  // timing attack countermeasure. see comments at top for more details
538  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
539  // QUARTER_ROUND_LD will use Td, which is already preloaded.
540  u = _u;
541  for (i=0; i<256; i+=cacheLineSize)
542  u &= *(const word32 *)(const void *)(Sd+i);
543  u &= *(const word32 *)(const void *)(Sd+252);
544  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
545 #endif
546 
547  word32 tbw[4];
548  byte *const tempBlock = (byte *)tbw;
549 
550  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
551  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
552  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
553  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
554 
555  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
556 }
557 
558 // ************************* Assembly Code ************************************
559 
560 #if CRYPTOPP_MSC_VERSION
561 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
562 #endif
563 
564 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
565 
566 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
567 
568 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
569 {
570  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
571 
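// locals points at the 256-byte-aligned Locals workspace chosen by
// Rijndael::Enc::AdvancedProcessBlocks further below, and k points at the round
// keys. On x86/x32 the routine saves esp to L_SP and redirects it so that the L_*
// offsets below address the workspace directly; on x64 r8 (L_REG) is used instead.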
572 #if CRYPTOPP_X86 || CRYPTOPP_X32
573 
574 #define L_REG esp
575 #define L_INDEX(i) (L_REG+768+i)
576 #define L_INXORBLOCKS L_INBLOCKS+4
577 #define L_OUTXORBLOCKS L_INBLOCKS+8
578 #define L_OUTBLOCKS L_INBLOCKS+12
579 #define L_INCREMENTS L_INDEX(16*15)
580 #define L_SP L_INDEX(16*16)
581 #define L_LENGTH L_INDEX(16*16+4)
582 #define L_KEYS_BEGIN L_INDEX(16*16+8)
583 
584 #define MOVD movd
585 #define MM(i) mm##i
586 
587 #define MXOR(a,b,c) \
588  AS2( movzx esi, b)\
589  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
590  AS2( pxor MM(a), mm7)\
591 
592 #define MMOV(a,b,c) \
593  AS2( movzx esi, b)\
594  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
595 
596 #else
597 
598 #define L_REG r8
599 #define L_INDEX(i) (L_REG+i)
600 #define L_INXORBLOCKS L_INBLOCKS+8
601 #define L_OUTXORBLOCKS L_INBLOCKS+16
602 #define L_OUTBLOCKS L_INBLOCKS+24
603 #define L_INCREMENTS L_INDEX(16*16)
604 #define L_LENGTH L_INDEX(16*18+8)
605 #define L_KEYS_BEGIN L_INDEX(16*19)
606 
607 #define MOVD mov
608 #define MM_0 r9d
609 #define MM_1 r12d
610 #ifdef __GNUC__
611 #define MM_2 r11d
612 #else
613 #define MM_2 r10d
614 #endif
615 #define MM(i) MM_##i
616 
617 #define MXOR(a,b,c) \
618  AS2( movzx esi, b)\
619  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
620 
621 #define MMOV(a,b,c) \
622  AS2( movzx esi, b)\
623  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
624 
625 #endif
626 
627 #define L_SUBKEYS L_INDEX(0)
628 #define L_SAVED_X L_SUBKEYS
629 #define L_KEY12 L_INDEX(16*12)
630 #define L_LASTROUND L_INDEX(16*13)
631 #define L_INBLOCKS L_INDEX(16*14)
632 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
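// MAP0TO4 remaps column index 0 to byte offset 4 so that the 32-bit loads in
// XOR/MOV/MXOR/MMOV always read at offsets 1-4 of an 8-byte rdtable::Te entry,
// which is where the four rotations of the round-table word are stored.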
633 
634 #define XOR(a,b,c) \
635  AS2( movzx esi, b)\
636  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
637 
638 #define MOV(a,b,c) \
639  AS2( movzx esi, b)\
640  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
641 
642 #ifdef CRYPTOPP_GENERATE_X64_MASM
643  ALIGN 8
644  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
645  rex_push_reg rsi
646  push_reg rdi
647  push_reg rbx
648  push_reg r12
649  .endprolog
650  mov L_REG, rcx
651  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
652  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
653 #elif defined(__GNUC__)
654  __asm__ __volatile__
655  (
656  INTEL_NOPREFIX
657  #if CRYPTOPP_X64
658  AS2( mov L_REG, rcx)
659  #endif
660  AS_PUSH_IF86(bx)
661  AS_PUSH_IF86(bp)
662  AS2( mov AS_REG_7, WORD_REG(si))
663 #else
664  AS_PUSH_IF86(si)
665  AS_PUSH_IF86(di)
666  AS_PUSH_IF86(bx)
667  AS_PUSH_IF86(bp)
668  AS2( lea AS_REG_7, [Te])
669  AS2( mov edi, [g_cacheLineSize])
670 #endif
671 
672 #if CRYPTOPP_X86 || CRYPTOPP_X32
673  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
674  AS2( lea esp, [ecx-768])
675 #endif
676 
677  // copy subkeys to stack
678  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
679  AS2( mov WORD_REG(ax), 16)
680  AS2( and WORD_REG(ax), WORD_REG(si))
681  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
682  AS2( movdqa [L_KEY12], xmm3)
683  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
684  AS2( sub WORD_REG(ax), WORD_REG(si))
685  ASL(0)
686  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
687  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
688  AS2( add WORD_REG(si), 16)
689  AS2( cmp WORD_REG(si), 16*12)
690  ATT_NOPREFIX
691  ASJ( jl, 0, b)
692  INTEL_NOPREFIX
693 
694  // read subkeys 0, 1 and last
695  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
696  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
697  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
698  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
699  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
700  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
701 
702  // load table into cache
703  AS2( xor WORD_REG(ax), WORD_REG(ax))
704  ASL(9)
705  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
706  AS2( add WORD_REG(ax), WORD_REG(di))
707  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
708  AS2( add WORD_REG(ax), WORD_REG(di))
709  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
710  AS2( add WORD_REG(ax), WORD_REG(di))
711  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
712  AS2( add WORD_REG(ax), WORD_REG(di))
713  AS2( cmp WORD_REG(ax), 2048)
714  ATT_NOPREFIX
715  ASJ( jl, 9, b)
716  INTEL_NOPREFIX
717  AS1( lfence)
718 
719  AS2( test DWORD PTR [L_LENGTH], 1)
720  ATT_NOPREFIX
721  ASJ( jz, 8, f)
722  INTEL_NOPREFIX
723 
724  // counter mode one-time setup
725  AS2( mov WORD_REG(si), [L_INBLOCKS])
726  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
727  AS2( pxor xmm2, xmm1)
728  AS2( psrldq xmm1, 14)
729  AS2( movd eax, xmm1)
730  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
731  AS2( MOVD MM(2), eax)
732 #if CRYPTOPP_X86 || CRYPTOPP_X32
733  AS2( mov eax, 1)
734  AS2( movd mm3, eax)
735 #endif
736 
737  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
738  AS2( movd eax, xmm2)
739  AS2( psrldq xmm2, 4)
740  AS2( movd edi, xmm2)
741  AS2( psrldq xmm2, 4)
742  MXOR( 1, al, 0) // 0
743  XOR( edx, ah, 1) // 1
744  AS2( shr eax, 16)
745  XOR( ecx, al, 2) // 2
746  XOR( ebx, ah, 3) // 3
747  AS2( mov eax, edi)
748  AS2( movd edi, xmm2)
749  AS2( psrldq xmm2, 4)
750  XOR( ebx, al, 0) // 4
751  MXOR( 1, ah, 1) // 5
752  AS2( shr eax, 16)
753  XOR( edx, al, 2) // 6
754  XOR( ecx, ah, 3) // 7
755  AS2( mov eax, edi)
756  AS2( movd edi, xmm2)
757  XOR( ecx, al, 0) // 8
758  XOR( ebx, ah, 1) // 9
759  AS2( shr eax, 16)
760  MXOR( 1, al, 2) // 10
761  XOR( edx, ah, 3) // 11
762  AS2( mov eax, edi)
763  XOR( edx, al, 0) // 12
764  XOR( ecx, ah, 1) // 13
765  AS2( shr eax, 16)
766  XOR( ebx, al, 2) // 14
767  AS2( psrldq xmm2, 3)
768 
769  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
770  AS2( mov eax, [L_KEY12+0*4])
771  AS2( mov edi, [L_KEY12+2*4])
772  AS2( MOVD MM(0), [L_KEY12+3*4])
773  MXOR( 0, cl, 3) /* 11 */
774  XOR( edi, bl, 3) /* 7 */
775  MXOR( 0, bh, 2) /* 6 */
776  AS2( shr ebx, 16) /* 4,5 */
777  XOR( eax, bl, 1) /* 5 */
778  MOV( ebx, bh, 0) /* 4 */
779  AS2( xor ebx, [L_KEY12+1*4])
780  XOR( eax, ch, 2) /* 10 */
781  AS2( shr ecx, 16) /* 8,9 */
782  XOR( eax, dl, 3) /* 15 */
783  XOR( ebx, dh, 2) /* 14 */
784  AS2( shr edx, 16) /* 12,13 */
785  XOR( edi, ch, 0) /* 8 */
786  XOR( ebx, cl, 1) /* 9 */
787  XOR( edi, dl, 1) /* 13 */
788  MXOR( 0, dh, 0) /* 12 */
789 
790  AS2( movd ecx, xmm2)
791  AS2( MOVD edx, MM(1))
792  AS2( MOVD [L_SAVED_X+3*4], MM(0))
793  AS2( mov [L_SAVED_X+0*4], eax)
794  AS2( mov [L_SAVED_X+1*4], ebx)
795  AS2( mov [L_SAVED_X+2*4], edi)
796  ATT_NOPREFIX
797  ASJ( jmp, 5, f)
798  INTEL_NOPREFIX
799  ASL(3)
800  // non-counter mode per-block setup
801  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
802  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
803  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
804  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
805  ASL(8)
806  AS2( mov WORD_REG(ax), [L_INBLOCKS])
807  AS2( movdqu xmm2, [WORD_REG(ax)])
808  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
809  AS2( movdqu xmm5, [WORD_REG(si)])
810  AS2( pxor xmm2, xmm1)
811  AS2( pxor xmm2, xmm5)
812 
813  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
814  AS2( movd eax, xmm2)
815  AS2( psrldq xmm2, 4)
816  AS2( movd edi, xmm2)
817  AS2( psrldq xmm2, 4)
818  MXOR( 1, al, 0) // 0
819  XOR( edx, ah, 1) // 1
820  AS2( shr eax, 16)
821  XOR( ecx, al, 2) // 2
822  XOR( ebx, ah, 3) // 3
823  AS2( mov eax, edi)
824  AS2( movd edi, xmm2)
825  AS2( psrldq xmm2, 4)
826  XOR( ebx, al, 0) // 4
827  MXOR( 1, ah, 1) // 5
828  AS2( shr eax, 16)
829  XOR( edx, al, 2) // 6
830  XOR( ecx, ah, 3) // 7
831  AS2( mov eax, edi)
832  AS2( movd edi, xmm2)
833  XOR( ecx, al, 0) // 8
834  XOR( ebx, ah, 1) // 9
835  AS2( shr eax, 16)
836  MXOR( 1, al, 2) // 10
837  XOR( edx, ah, 3) // 11
838  AS2( mov eax, edi)
839  XOR( edx, al, 0) // 12
840  XOR( ecx, ah, 1) // 13
841  AS2( shr eax, 16)
842  XOR( ebx, al, 2) // 14
843  MXOR( 1, ah, 3) // 15
844  AS2( MOVD eax, MM(1))
845 
846  AS2( add L_REG, [L_KEYS_BEGIN])
847  AS2( add L_REG, 4*16)
848  ATT_NOPREFIX
849  ASJ( jmp, 2, f)
850  INTEL_NOPREFIX
851  ASL(1)
852  // counter-mode per-block setup
853  AS2( MOVD ecx, MM(2))
854  AS2( MOVD edx, MM(1))
855  AS2( mov eax, [L_SAVED_X+0*4])
856  AS2( mov ebx, [L_SAVED_X+1*4])
857  AS2( xor cl, ch)
858  AS2( and WORD_REG(cx), 255)
859  ASL(5)
860 #if CRYPTOPP_X86 || CRYPTOPP_X32
861  AS2( paddb MM(2), mm3)
862 #else
863  AS2( add MM(2), 1)
864 #endif
865  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
866  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
867  XOR( ebx, dl, 3)
868  MOV( ecx, dh, 2)
869  AS2( shr edx, 16)
870  AS2( xor ecx, [L_SAVED_X+2*4])
871  XOR( eax, dh, 0)
872  MOV( edx, dl, 1)
873  AS2( xor edx, [L_SAVED_X+3*4])
874 
875  AS2( add L_REG, [L_KEYS_BEGIN])
876  AS2( add L_REG, 3*16)
877  ATT_NOPREFIX
878  ASJ( jmp, 4, f)
879  INTEL_NOPREFIX
880 
881 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
882 // out: eax, ebx, edi, mm0
883 #define ROUND() \
884  MXOR( 0, cl, 3) /* 11 */\
885  AS2( mov cl, al) /* 8,9,10,3 */\
886  XOR( edi, ah, 2) /* 2 */\
887  AS2( shr eax, 16) /* 0,1 */\
888  XOR( edi, bl, 3) /* 7 */\
889  MXOR( 0, bh, 2) /* 6 */\
890  AS2( shr ebx, 16) /* 4,5 */\
891  MXOR( 0, al, 1) /* 1 */\
892  MOV( eax, ah, 0) /* 0 */\
893  XOR( eax, bl, 1) /* 5 */\
894  MOV( ebx, bh, 0) /* 4 */\
895  XOR( eax, ch, 2) /* 10 */\
896  XOR( ebx, cl, 3) /* 3 */\
897  AS2( shr ecx, 16) /* 8,9 */\
898  XOR( eax, dl, 3) /* 15 */\
899  XOR( ebx, dh, 2) /* 14 */\
900  AS2( shr edx, 16) /* 12,13 */\
901  XOR( edi, ch, 0) /* 8 */\
902  XOR( ebx, cl, 1) /* 9 */\
903  XOR( edi, dl, 1) /* 13 */\
904  MXOR( 0, dh, 0) /* 12 */\
905 
906  ASL(2) // 2-round loop
907  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
908  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
909  ROUND()
910  AS2( mov ecx, edi)
911  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
912  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
913  AS2( MOVD edx, MM(0))
914 
915  ASL(4)
916  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
917  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
918  ROUND()
919  AS2( mov ecx, edi)
920  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
921  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
922  AS2( MOVD edx, MM(0))
923 
924  AS2( add L_REG, 32)
925  AS2( test L_REG, 255)
926  ATT_NOPREFIX
927  ASJ( jnz, 2, b)
928  INTEL_NOPREFIX
929  AS2( sub L_REG, 16*16)
930 
931 #define LAST(a, b, c) \
932  AS2( movzx esi, a )\
933  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
934  AS2( movzx esi, b )\
935  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
936  AS2( mov WORD PTR [L_LASTROUND+c], di )\
937 
938  // last round
939  LAST(ch, dl, 2)
940  LAST(dh, al, 6)
941  AS2( shr edx, 16)
942  LAST(ah, bl, 10)
943  AS2( shr eax, 16)
944  LAST(bh, cl, 14)
945  AS2( shr ebx, 16)
946  LAST(dh, al, 12)
947  AS2( shr ecx, 16)
948  LAST(ah, bl, 0)
949  LAST(bh, cl, 4)
950  LAST(ch, dl, 8)
951 
952  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
953  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
954 
955  AS2( mov WORD_REG(cx), [L_LENGTH])
956  AS2( sub WORD_REG(cx), 16)
957 
958  AS2( movdqu xmm2, [WORD_REG(ax)])
959  AS2( pxor xmm2, xmm4)
960 
961 #if CRYPTOPP_X86 || CRYPTOPP_X32
962  AS2( movdqa xmm0, [L_INCREMENTS])
963  AS2( paddd xmm0, [L_INBLOCKS])
964  AS2( movdqa [L_INBLOCKS], xmm0)
965 #else
966  AS2( movdqa xmm0, [L_INCREMENTS+16])
967  AS2( paddq xmm0, [L_INBLOCKS+16])
968  AS2( movdqa [L_INBLOCKS+16], xmm0)
969 #endif
970 
971  AS2( pxor xmm2, [L_LASTROUND])
972  AS2( movdqu [WORD_REG(bx)], xmm2)
973 
974  ATT_NOPREFIX
975  ASJ( jle, 7, f)
976  INTEL_NOPREFIX
977  AS2( mov [L_LENGTH], WORD_REG(cx))
978  AS2( test WORD_REG(cx), 1)
979  ATT_NOPREFIX
980  ASJ( jnz, 1, b)
981  INTEL_NOPREFIX
982 #if CRYPTOPP_X64
983  AS2( movdqa xmm0, [L_INCREMENTS])
984  AS2( paddq xmm0, [L_INBLOCKS])
985  AS2( movdqa [L_INBLOCKS], xmm0)
986 #endif
987  ATT_NOPREFIX
988  ASJ( jmp, 3, b)
989  INTEL_NOPREFIX
990 
991  ASL(7)
992  // erase keys on stack
993  AS2( xorps xmm0, xmm0)
994  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
995  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
996  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
997  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
998  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
999  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1000  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1001  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1002  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1003  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1004  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1005  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1006  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1007  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1008  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1009 #if CRYPTOPP_X86 || CRYPTOPP_X32
1010  AS2( mov esp, [L_SP])
1011  AS1( emms)
1012 #endif
1013  AS_POP_IF86(bp)
1014  AS_POP_IF86(bx)
1015 #if defined(_MSC_VER) && CRYPTOPP_X86
1016  AS_POP_IF86(di)
1017  AS_POP_IF86(si)
1018  AS1(ret)
1019 #endif
1020 #ifdef CRYPTOPP_GENERATE_X64_MASM
1021  pop r12
1022  pop rbx
1023  pop rdi
1024  pop rsi
1025  ret
1026  Rijndael_Enc_AdvancedProcessBlocks ENDP
1027 #endif
1028 #ifdef __GNUC__
1029  ATT_PREFIX
1030  :
1031  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1032  : "memory", "cc", "%eax"
1033  #if CRYPTOPP_X64
1034  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1035  #endif
1036  );
1037 #endif
1038 }
1039 
1040 #endif
1041 
1042 #ifndef CRYPTOPP_GENERATE_X64_MASM
1043 
1044 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1045 extern "C" {
1046 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1047 }
1048 #endif
1049 
1050 #if CRYPTOPP_X64 || CRYPTOPP_X32 || CRYPTOPP_X86
1051 
1052 // Determine whether the range between begin and end overlaps
1053 // with the same 4k block offsets as the Te table. Logically,
1054 // the code is trying to create the condition:
1055 //
1056 // Two separate memory pages:
1057 //
1058 //    +-----+   +-----+
1059 //    |XXXXX|   |YYYYY|
1060 //    |XXXXX|   |YYYYY|
1061 //    |     |   |     |
1062 //    |     |   |     |
1063 //    +-----+   +-----+
1064 //   Te Table    Locals
1065 //
1066 // Have a logical cache view of (X and Y may be inverted):
1067 //
1068 // +-----+
1069 // |XXXXX|
1070 // |XXXXX|
1071 // |YYYYY|
1072 // |YYYYY|
1073 // +-----+
1074 //
1075 static inline bool AliasedWithTable(const byte *begin, const byte *end)
1076 {
1077  ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
1078  ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
1079  if (t1 > t0)
1080  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
1081  else
1082  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1083 }
1084 
1085 #if CRYPTOPP_AESNI_AVAILABLE
1086 
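// AES-NI helpers: xor in the whitening key subkeys[0], run rounds-1 full rounds
// with AESENC/AESDEC, and finish with AESENCLAST/AESDECLAST on subkeys[rounds].
// The 4-block variants keep four blocks in flight so the pipelined AES units stay
// busy; AESNI_AdvancedProcessBlocks below picks between the two.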
1087 inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1088 {
1089  block = _mm_xor_si128(block, subkeys[0]);
1090  for (unsigned int i=1; i<rounds-1; i+=2)
1091  {
1092  block = MM_AESENC_SI128(block, subkeys[i]);
1093  block = MM_AESENC_SI128(block, subkeys[i+1]);
1094  }
1095  block = MM_AESENC_SI128(block, subkeys[rounds-1]);
1096  block = MM_AESENCLAST_SI128(block, subkeys[rounds]);
1097 }
1098 
1099 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1100 {
1101  __m128i rk = subkeys[0];
1102  block0 = _mm_xor_si128(block0, rk);
1103  block1 = _mm_xor_si128(block1, rk);
1104  block2 = _mm_xor_si128(block2, rk);
1105  block3 = _mm_xor_si128(block3, rk);
1106  for (unsigned int i=1; i<rounds; i++)
1107  {
1108  rk = subkeys[i];
1109  block0 = MM_AESENC_SI128(block0, rk);
1110  block1 = MM_AESENC_SI128(block1, rk);
1111  block2 = MM_AESENC_SI128(block2, rk);
1112  block3 = MM_AESENC_SI128(block3, rk);
1113  }
1114  rk = subkeys[rounds];
1115  block0 = MM_AESENCLAST_SI128(block0, rk);
1116  block1 = MM_AESENCLAST_SI128(block1, rk);
1117  block2 = MM_AESENCLAST_SI128(block2, rk);
1118  block3 = MM_AESENCLAST_SI128(block3, rk);
1119 }
1120 
1121 inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1122 {
1123  block = _mm_xor_si128(block, subkeys[0]);
1124  for (unsigned int i=1; i<rounds-1; i+=2)
1125  {
1126  block = MM_AESDEC_SI128(block, subkeys[i]);
1127  block = MM_AESDEC_SI128(block, subkeys[i+1]);
1128  }
1129  block = MM_AESDEC_SI128(block, subkeys[rounds-1]);
1130  block = MM_AESDECLAST_SI128(block, subkeys[rounds]);
1131 }
1132 
1133 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1134 {
1135  __m128i rk = subkeys[0];
1136  block0 = _mm_xor_si128(block0, rk);
1137  block1 = _mm_xor_si128(block1, rk);
1138  block2 = _mm_xor_si128(block2, rk);
1139  block3 = _mm_xor_si128(block3, rk);
1140  for (unsigned int i=1; i<rounds; i++)
1141  {
1142  rk = subkeys[i];
1143  block0 = MM_AESDEC_SI128(block0, rk);
1144  block1 = MM_AESDEC_SI128(block1, rk);
1145  block2 = MM_AESDEC_SI128(block2, rk);
1146  block3 = MM_AESDEC_SI128(block3, rk);
1147  }
1148  rk = subkeys[rounds];
1149  block0 = MM_AESDECLAST_SI128(block0, rk);
1150  block1 = MM_AESDECLAST_SI128(block1, rk);
1151  block2 = MM_AESDECLAST_SI128(block2, rk);
1152  block3 = MM_AESDECLAST_SI128(block3, rk);
1153 }
1154 
1155 CRYPTOPP_ALIGN_DATA(16)
1156 static const word32 s_one[] = {0, 0, 0, 1<<24};
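// 1<<24 in the last 32-bit lane is a byte value of 1 at offset 15, so adding s_one
// with _mm_add_epi32 bumps the last byte of the big-endian counter block, matching
// the single-block path below which simply does inBlocks[15]++.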
1157 
1158 template <typename F1, typename F4>
1159 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1160 {
1161  size_t blockSize = 16;
1162  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1163  size_t xorIncrement = xorBlocks ? blockSize : 0;
1164  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1165 
1166  if (flags & BlockTransformation::BT_ReverseDirection)
1167  {
1168  CRYPTOPP_ASSERT(length % blockSize == 0);
1169  inBlocks += length - blockSize;
1170  xorBlocks += length - blockSize;
1171  outBlocks += length - blockSize;
1172  inIncrement = 0-inIncrement;
1173  xorIncrement = 0-xorIncrement;
1174  outIncrement = 0-outIncrement;
1175  }
1176 
1177  if (flags & BlockTransformation::BT_AllowParallel)
1178  {
1179  while (length >= 4*blockSize)
1180  {
1181  __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
1182  if (flags & BlockTransformation::BT_InBlockIsCounter)
1183  {
1184  const __m128i be1 = *(const __m128i *)(const void *)s_one;
1185  block1 = _mm_add_epi32(block0, be1);
1186  block2 = _mm_add_epi32(block1, be1);
1187  block3 = _mm_add_epi32(block2, be1);
1188  _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
1189  }
1190  else
1191  {
1192  inBlocks += inIncrement;
1193  block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1194  inBlocks += inIncrement;
1195  block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1196  inBlocks += inIncrement;
1197  block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1198  inBlocks += inIncrement;
1199  }
1200 
1201  if (flags & BlockTransformation::BT_XorInput)
1202  {
1203  // Coverity finding, appears to be false positive. Assert the condition.
1204  CRYPTOPP_ASSERT(xorBlocks);
1205  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1206  xorBlocks += xorIncrement;
1207  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1208  xorBlocks += xorIncrement;
1209  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1210  xorBlocks += xorIncrement;
1211  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1212  xorBlocks += xorIncrement;
1213  }
1214 
1215  func4(block0, block1, block2, block3, subkeys, rounds);
1216 
1217  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1218  {
1219  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1220  xorBlocks += xorIncrement;
1221  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1222  xorBlocks += xorIncrement;
1223  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1224  xorBlocks += xorIncrement;
1225  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1226  xorBlocks += xorIncrement;
1227  }
1228 
1229  _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
1230  outBlocks += outIncrement;
1231  _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
1232  outBlocks += outIncrement;
1233  _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
1234  outBlocks += outIncrement;
1235  _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
1236  outBlocks += outIncrement;
1237 
1238  length -= 4*blockSize;
1239  }
1240  }
1241 
1242  while (length >= blockSize)
1243  {
1244  __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1245 
1246  if (flags & BlockTransformation::BT_XorInput)
1247  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1248 
1249  if (flags & BlockTransformation::BT_InBlockIsCounter)
1250  const_cast<byte *>(inBlocks)[15]++;
1251 
1252  func1(block, subkeys, rounds);
1253 
1254  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1255  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1256 
1257  _mm_storeu_si128((__m128i *)(void *)outBlocks, block);
1258 
1259  inBlocks += inIncrement;
1260  outBlocks += outIncrement;
1261  xorBlocks += xorIncrement;
1262  length -= blockSize;
1263  }
1264 
1265  return length;
1266 }
1267 #endif
1268 
1269 #if CRYPTOPP_X64 || CRYPTOPP_X32 || CRYPTOPP_X86
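// Locals mirrors the workspace that Rijndael_Enc_AdvancedProcessBlocks addresses
// through the L_* offsets: subkeys spans L_SUBKEYS up to L_KEY12, workspace backs
// L_KEY12 and L_LASTROUND, the block pointers sit at L_INBLOCKS and friends, and
// the trailing size_t fields back L_INCREMENTS, L_LENGTH and L_KEYS_BEGIN (plus
// L_SP on x86).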
1270 struct Locals
1271 {
1272  word32 subkeys[4*12], workspace[8];
1273  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1274  byte *outBlocks;
1275  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1276  size_t regSpill, lengthAndCounterFlag, keysBegin;
1277 };
1278 
1279 const size_t s_aliasPageSize = 4096;
1280 const size_t s_aliasBlockSize = 256;
1281 const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);
1282 
1283 Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
1284 #endif
1285 
1286 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1287 {
1288 #if CRYPTOPP_AESNI_AVAILABLE
1289  if (HasAESNI())
1290  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1291 #endif
1292 
1293 #if (CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1294  if (HasSSE2())
1295  {
1296  if (length < BLOCKSIZE)
1297  return length;
1298 
1299  static const byte *zeros = (const byte*)(Te+256);
1300  byte *space = NULLPTR, *originalSpace = const_cast<byte*>(m_aliasBlock.data());
1301 
1302  // round up to nearest 256 byte boundary
1303  space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
1304  while (AliasedWithTable(space, space + sizeof(Locals)))
1305  {
1306  space += 256;
1307  CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
1308  }
1309 
1310  size_t increment = BLOCKSIZE;
1311  if (flags & BT_ReverseDirection)
1312  {
1313  CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
1314  inBlocks += length - BLOCKSIZE;
1315  xorBlocks += length - BLOCKSIZE;
1316  outBlocks += length - BLOCKSIZE;
1317  increment = 0-increment;
1318  }
1319 
1320  Locals &locals = *(Locals *)(void *)space;
1321 
1322  locals.inBlocks = inBlocks;
1323  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1324  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1325  locals.outBlocks = outBlocks;
1326 
1327  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1328  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1329  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1330  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1331 
1332  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
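// length is rounded down to a whole number of blocks and then decremented by one
// when BT_InBlockIsCounter is set, so the low bit tells the assembly (via
// "test DWORD PTR [L_LENGTH], 1") whether to run the counter-mode setup.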
1333  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1334  locals.keysBegin = (12-keysToCopy)*16;
1335 
1336  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1337 
1338  return length % BLOCKSIZE;
1339  }
1340 #endif
1341 
1342  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1343 }
1344 
1345 #endif
1346 
1347 #if CRYPTOPP_X64 || CRYPTOPP_X32 || CRYPTOPP_X86
1348 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1349 {
1350 #if CRYPTOPP_AESNI_AVAILABLE
1351  if (HasAESNI())
1352  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1353 #endif
1354 
1355  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1356 }
1357 #endif // CRYPTOPP_X64 || CRYPTOPP_X32 || CRYPTOPP_X86
1358 
1359 NAMESPACE_END
1360 
1361 #endif
1362 #endif