Crypto++  5.6.5
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Baretto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten in by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
23 
24 /*
25 July 2006: Defense against timing attacks was added in by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "stdcpp.h" // alloca
75 #include "misc.h"
76 #include "cpu.h"
77 
78 NAMESPACE_BEGIN(CryptoPP)
79 
80 // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132
81 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
82 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
83 #endif
84 
85 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
86 #if (__SUNPRO_CC >= 0x5130)
87 # define MAYBE_CONST
88 #else
89 # define MAYBE_CONST const
90 #endif
91 
// AES lookup tables and their lazy-initialization flags. The storage layout
// depends on the build configuration:
// - Unaligned-access builds use "compressed" tables: 256 word64 entries each.
//   When the SSE2/MASM assembler is also enabled, Te gets 2 extra entries and
//   external linkage (namespace rdtable) so the assembly code can reference it.
// - Aligned-only builds use the classic layout: four rotated 256-entry word32
//   tables stored contiguously (indexed as T[i*256 + x] by TL_M below).
92 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
93 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
94 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
95 using namespace rdtable;
96 # else
97 static word64 Te[256];
98 # endif
99 static word64 Td[256];
100 #else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
101 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
102 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
103 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
104 # endif
105 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
106 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
107 #endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
108 
// Set once the corresponding table has been populated (FillEncTable /
// FillDecTable). NOTE(review): volatile is not a synchronization primitive;
// concurrent first-time key setup from multiple threads could race here —
// confirm callers serialize initialization.
109 static volatile bool s_TeFilled = false, s_TdFilled = false;
110 
111 // ************************* Portable Code ************************************
112 
// One quarter of an AES round: consumes the four bytes of t (low to high) and
// XORs table lookups (via lookup macro L into table T) into a, b, c and d.
113 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
114  a ^= L(T, 3, byte(t)); t >>= 8;\
115  b ^= L(T, 2, byte(t)); t >>= 8;\
116  c ^= L(T, 1, byte(t)); t >>= 8;\
117  d ^= L(T, 0, t);
118 
// Last encryption round (compressed-table builds): only the S-box byte is
// needed, read as byte [1] of each word64 Te entry into tempBlock.
119 #define QUARTER_ROUND_LE(t, a, b, c, d) \
120  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
121  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
122  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
123  tempBlock[d] = ((byte *)(Te+t))[1];
124 
// Last decryption round. Unaligned builds extract the inverse S-box byte from
// the word64 Td entries (byte offset depends on native byte order); aligned
// builds read the inverse S-box Sd directly.
125 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
126  #define QUARTER_ROUND_LD(t, a, b, c, d) \
127  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
128  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
129  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
130  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
131 #endif
132  #define QUARTER_ROUND_LD(t, a, b, c, d) \
133  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
134  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
135  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
136  tempBlock[d] = Sd[t];
137 #endif
138 
// Middle rounds: TL_M lookup into the encryption (Te) or decryption (Td) table.
139 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
140 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
141 
// First-round (TL_F) and middle-round (TL_M) lookups; the byte-ordering of the
// operands is flipped on little-endian targets. Unaligned builds read a word32
// at a byte offset inside the word64 table entry; aligned builds use either a
// rotation (first round) or one of the four pre-rotated tables (middle rounds).
142 #ifdef IS_LITTLE_ENDIAN
143  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
144  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
145  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
146  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
147  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
148  #else
149  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
150  #define TL_M(T, i, x) T[i*256 + x]
151  #endif
152 #else
153  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
154  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
155  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
156  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
157  #define TL_M TL_F
158  #else
159  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
160  #define TL_M(T, i, x) T[i*256 + x]
161  #endif
162 #endif
163 
164 
// GF(2^8) doubling/quadrupling/octupling modulo the AES polynomial 0x11b
// (x^8 + x^4 + x^3 + x + 1): f2 = multiply by 2, f4 = by 4, f8 = by 8.
165 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
166 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
167 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
168 
// Derived GF(2^8) multipliers: 3 (MixColumns) and 9/11/13/14 (InvMixColumns).
169 #define f3(x) (f2(x) ^ x)
170 #define f9(x) (f8(x) ^ x)
171 #define fb(x) (f8(x) ^ f2(x) ^ x)
172 #define fd(x) (f8(x) ^ f4(x) ^ x)
173 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
174 
// Lazily builds the AES encryption lookup table Te from the S-box Se,
// combining each S-box output with the MixColumns multipliers (f2 = x2,
// f3 = x3 in GF(2^8); see the f-macros above).
175 void Rijndael::Base::FillEncTable()
176 {
177  for (int i=0; i<256; i++)
178  {
179  byte x = Se[i];
180 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
// Compressed layout: one word64 per entry packing the four column bytes;
// TL_F/TL_M read word32 slices at byte offsets inside the entry.
181  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
182  Te[i] = word64(y | f3(x))<<32 | y;
183 #else
// Classic layout: four 256-entry tables, each a byte-rotation of the first.
184  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
185  for (int j=0; j<4; j++)
186  {
187  Te[i+j*256] = y;
188  y = rotrFixed(y, 8);
189  }
190 #endif
191  }
192 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
// The assembler build declares Te[256+2] (see rdtable above); zero the two
// extra entries so the assembly code reads defined values.
193  Te[256] = Te[257] = 0;
194 #endif
195  s_TeFilled = true;
196 }
197 
// Lazily builds the AES decryption lookup table Td from the inverse S-box Sd.
// Each entry combines Sd[i] with the InvMixColumns multipliers: f9/fb/fd/fe
// are GF(2^8) multiplications by 0x09/0x0b/0x0d/0x0e (see the f-macros above).
// Compressed build: one word64 per entry whose low byte carries Sd[i], so the
// last-round macro QUARTER_ROUND_LD can extract the plain inverse S-box byte.
// Classic build: four 256-entry word32 tables, each a rotation of the first.
// Fix: removed a stray ';;' after the word32 initializer (harmless empty
// statement, no behavior change).
198 void Rijndael::Base::FillDecTable()
199 {
200  for (int i=0; i<256; i++)
201  {
202  byte x = Sd[i];
203 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
204  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
205  Td[i] = word64(y | fb(x))<<32 | y | x;
206 #else
207  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
208  for (int j=0; j<4; j++)
209  {
210  Td[i+j*256] = y;
211  y = rotrFixed(y, 8);
212  }
213 #endif
214  }
215  s_TdFilled = true;
216 }
217 
// Expands the user key into the round-key schedule stored in m_key.
// keylen is 16/24/32 bytes, giving m_rounds = keylen/4 + 6 (10/12/14 rounds)
// and a schedule of 4*(m_rounds+1) word32 round keys.
// Two paths: an AES-NI/SSE4 hardware path (AESKEYGENASSIST/AESIMC) and a
// portable path using the Se S-box and the rcon round constants.
218 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
219 {
220  AssertValidKeyLength(keylen);
221 
222  m_rounds = keylen/4 + 6;
223  m_key.New(4*(m_rounds+1));
224 
225  word32 *rk = m_key;
226 
227 #if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
228  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
229  if (HasAESNI() && HasSSE4())
230  {
// Round constants in little-endian word form, consumed one per key-schedule
// iteration via *(rc++).
231  static const word32 rcLE[] = {
232  0x01, 0x02, 0x04, 0x08,
233  0x10, 0x20, 0x40, 0x80,
234  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
235  };
236 
237  // Coverity finding, appears to be false positive. Assert the condition.
238  const word32 *ro = rcLE, *rc = rcLE;
239  CRYPTOPP_UNUSED(ro);
240 
// temp holds the last 16 bytes of the user key; the key itself seeds the
// first keylen bytes of the schedule.
241  __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
242  memcpy(rk, userKey, keylen);
243 
// Each iteration derives the next keylen/4 schedule words from the previous
// block, using AESKEYGENASSIST for the RotWord/SubWord/rcon step.
244  while (true)
245  {
246  // Coverity finding, appears to be false positive. Assert the condition.
247  CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
248  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
249  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
250  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
251  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
252 
// Stop once the schedule is full (rk walks forward keylen/4 words per pass).
253  if (rk + keylen/4 + 4 == m_key.end())
254  break;
255 
256  if (keylen == 24)
257  {
258  rk[10] = rk[ 4] ^ rk[ 9];
259  rk[11] = rk[ 5] ^ rk[10];
260  // Coverity finding, appears to be false positive. Assert the condition.
261  CRYPTOPP_ASSERT(m_key.size() >= 12);
262  temp = _mm_insert_epi32(temp, rk[11], 3);
263  }
264  else if (keylen == 32)
265  {
266  // Coverity finding, appears to be false positive. Assert the condition.
267  CRYPTOPP_ASSERT(m_key.size() >= 12);
268  temp = _mm_insert_epi32(temp, rk[11], 3);
// 256-bit keys have an extra SubWord step (no rotation) mid-block.
269  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
270  rk[13] = rk[ 5] ^ rk[12];
271  rk[14] = rk[ 6] ^ rk[13];
272  rk[15] = rk[ 7] ^ rk[14];
273  // Coverity finding, appears to be false positive. Assert the condition.
274  CRYPTOPP_ASSERT(m_key.size() >= 16);
275  temp = _mm_insert_epi32(temp, rk[15], 3);
276  }
277  else
278  {
279  // Coverity finding, appears to be false positive. Assert the condition.
280  CRYPTOPP_ASSERT(m_key.size() >= 8);
281  temp = _mm_insert_epi32(temp, rk[7], 3);
282  }
283 
284  rk += keylen/4;
285  }
286 
// For decryption: swap first/last round keys and run AESIMC (inverse
// MixColumns) over the middle round keys, reversing their order, as required
// by the AESDEC equivalent-inverse-cipher key layout.
287  if (!IsForwardTransformation())
288  {
289  rk = m_key;
290  unsigned int i, j;
291 
292 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
293  // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11.
294  // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11.
295  vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
296 #else
297  std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
298 #endif
299  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
300  {
301  temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
302  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
303  *(__m128i *)(void *)(rk+j) = temp;
304  }
305 
// Middle element (i == j after the loop) still needs InvMixColumns.
306  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
307  }
308 
309  return;
310  }
311 #endif
312 
// Portable path: load key big-endian, then the standard FIPS-197 key
// expansion using the Se S-box and the rcon round constants.
313  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
314  const word32 *rc = rcon;
315  word32 temp;
316 
317  while (true)
318  {
// RotWord+SubWord of the previous word, then XOR with rcon.
319  temp = rk[keylen/4-1];
320  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
321  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
322  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
323  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
324  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
325 
326  if (rk + keylen/4 + 4 == m_key.end())
327  break;
328 
329  if (keylen == 24)
330  {
331  rk[10] = rk[ 4] ^ rk[ 9];
332  rk[11] = rk[ 5] ^ rk[10];
333  }
334  else if (keylen == 32)
335  {
// 256-bit keys apply SubWord (no rotation) to the mid-block word.
336  temp = rk[11];
337  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
338  rk[13] = rk[ 5] ^ rk[12];
339  rk[14] = rk[ 6] ^ rk[13];
340  rk[15] = rk[ 7] ^ rk[14];
341  }
342  rk += keylen/4;
343  }
344 
345  rk = m_key;
346 
347  if (IsForwardTransformation())
348  {
349  if (!s_TeFilled)
350  FillEncTable();
351 
// Last round key is stored byte-reversed for the encryption output step.
353  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
354  }
355  else
356  {
357  if (!s_TdFilled)
358  FillDecTable();
359 
360  unsigned int i, j;
361 
// InvMixColumns of a round-key word via the decryption table (equivalent
// inverse cipher: apply Se then look up the InvMixColumns columns in Td).
362 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
363 
// Reverse the order of the middle round keys while applying InvMixColumns.
364  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
365  {
366  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
367  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
368  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
369  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
370  }
371 
// Middle round key (i == j) transformed in place.
372  rk[i+0] = InverseMixColumn(rk[i+0]);
373  rk[i+1] = InverseMixColumn(rk[i+1]);
374  rk[i+2] = InverseMixColumn(rk[i+2]);
375  rk[i+3] = InverseMixColumn(rk[i+3]);
376 
// Swap first and last round keys, byte-reversing both in transit.
377  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
378  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
379  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
380  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
381  }
382 
// AES-NI block processing expects the middle round keys byte-reversed.
383 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
384  if (HasAESNI())
385  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
386 #endif
387 }
388 
// Encrypts one 16-byte block with the expanded key schedule and writes the
// result (combined with xorBlock via Block::Put) to outBlock. Dispatches to
// the SSE2-assembly or AES-NI implementation when available; otherwise runs
// the portable table-driven cipher with a cache-timing countermeasure.
389 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
390 {
391 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
392 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
393  if (HasSSE2())
394 #else
395  if (HasAESNI())
396 #endif
397  {
// Hardware/assembly path handles the single block (length 16, flags 0).
398  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
399  }
400 #endif
401 
403 
// Load the block and apply the initial AddRoundKey; t0..t3 pre-load the
// second round key.
404  word32 s0, s1, s2, s3, t0, t1, t2, t3;
405  Block::Get(inBlock)(s0)(s1)(s2)(s3);
406 
407  const word32 *rk = m_key;
408  s0 ^= rk[0];
409  s1 ^= rk[1];
410  s2 ^= rk[2];
411  s3 ^= rk[3];
412  t0 = rk[4];
413  t1 = rk[5];
414  t2 = rk[6];
415  t3 = rk[7];
416  rk += 8;
417 
418  // timing attack countermeasure. see comments at top for more details.
419  // also see http://github.com/weidai11/cryptopp/issues/146
// Touch one word per cache line across the whole of Te (2048 bytes for the
// word64 layout, 1024 for this word32 path per iteration bound), folding the
// loads into u (always 0) so the compiler cannot elide them.
420  const int cacheLineSize = GetCacheLineSize();
421  unsigned int i;
422  volatile word32 _u = 0;
423  word32 u = _u;
424 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
425  for (i=0; i<2048; i+=cacheLineSize)
426 #else
427  for (i=0; i<1024; i+=cacheLineSize)
428 #endif
429  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
430  u &= Te[255];
431  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
432 
// First full round, using the smaller-footprint TL_F lookups.
433  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
434  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
435  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
436  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
437 
438  // Nr - 2 full rounds:
// Two rounds per loop iteration, alternating s* and t* as state.
439  unsigned int r = m_rounds/2 - 1;
440  do
441  {
442  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
443 
444  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
445  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
446  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
447  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
448 
449  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
450 
451  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
452  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
453  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
454  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
455 
456  rk += 8;
457  } while (--r);
458 
// Last round: write S-box output bytes into tempBlock in ShiftRows order.
459  word32 tbw[4];
460  byte *const tempBlock = (byte *)tbw;
461 
462  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
463  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
464  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
465  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
466 
// Final AddRoundKey, then output (XORed with xorBlock by Block::Put).
467  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
468 }
469 
// Decrypts one 16-byte block with the expanded key schedule and writes the
// result (combined with xorBlock via Block::Put) to outBlock. Uses AES-NI
// when available; otherwise the portable table-driven inverse cipher with
// cache-timing countermeasures (note: unlike Enc, there is no SSE2-assembly
// dispatch here).
470 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
471 {
472 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
473  if (HasAESNI())
474  {
475  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
476  return;
477  }
478 #endif
479 
481 
// Load the block and apply the initial AddRoundKey; t0..t3 pre-load the
// second round key.
482  word32 s0, s1, s2, s3, t0, t1, t2, t3;
483  Block::Get(inBlock)(s0)(s1)(s2)(s3);
484 
485  const word32 *rk = m_key;
486  s0 ^= rk[0];
487  s1 ^= rk[1];
488  s2 ^= rk[2];
489  s3 ^= rk[3];
490  t0 = rk[4];
491  t1 = rk[5];
492  t2 = rk[6];
493  t3 = rk[7];
494  rk += 8;
495 
496  // timing attack countermeasure. see comments at top for more details.
497  // also see http://github.com/weidai11/cryptopp/issues/146
// Preload Td into L1 by touching one word per cache line, folded into u
// (always 0) so the loads cannot be optimized away.
498  const int cacheLineSize = GetCacheLineSize();
499  unsigned int i;
500  volatile word32 _u = 0;
501  word32 u = _u;
502 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
503  for (i=0; i<2048; i+=cacheLineSize)
504 #else
505  for (i=0; i<1024; i+=cacheLineSize)
506 #endif
507  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
508  u &= Td[255];
509  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
510 
// First full inverse round (note the reversed operand rotation vs. Enc).
511  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
512  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
513  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
514  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
515 
516  // Nr - 2 full rounds:
// Two rounds per loop iteration, alternating s* and t* as state.
517  unsigned int r = m_rounds/2 - 1;
518  do
519  {
520  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
521 
522  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
523  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
524  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
525  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
526 
527  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
528 
529  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
530  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
531  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
532  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
533 
534  rk += 8;
535  } while (--r);
536 
537 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
538  // timing attack countermeasure. see comments at top for more details
539  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
540  // QUARTER_ROUND_LD will use Td, which is already preloaded.
// The aligned build's last round reads Sd directly, so preload Sd too.
541  u = _u;
542  for (i=0; i<256; i+=cacheLineSize)
543  u &= *(const word32 *)(const void *)(Sd+i);
544  u &= *(const word32 *)(const void *)(Sd+252);
545  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
546 #endif
547 
// Last round: inverse S-box bytes written into tempBlock in inverse
// ShiftRows order.
548  word32 tbw[4];
549  byte *const tempBlock = (byte *)tbw;
550 
551  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
552  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
553  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
554  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
555 
// Final AddRoundKey, then output (XORed with xorBlock by Block::Put).
556  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
557 }
558 
559 // ************************* Assembly Code ************************************
560 
561 #if CRYPTOPP_MSC_VERSION
562 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
563 #endif
564 
565 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
566 
567 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
568 
569 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
570 {
571  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
572 
573 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
574 
575 #define L_REG esp
576 #define L_INDEX(i) (L_REG+768+i)
577 #define L_INXORBLOCKS L_INBLOCKS+4
578 #define L_OUTXORBLOCKS L_INBLOCKS+8
579 #define L_OUTBLOCKS L_INBLOCKS+12
580 #define L_INCREMENTS L_INDEX(16*15)
581 #define L_SP L_INDEX(16*16)
582 #define L_LENGTH L_INDEX(16*16+4)
583 #define L_KEYS_BEGIN L_INDEX(16*16+8)
584 
585 #define MOVD movd
586 #define MM(i) mm##i
587 
588 #define MXOR(a,b,c) \
589  AS2( movzx esi, b)\
590  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
591  AS2( pxor MM(a), mm7)\
592 
593 #define MMOV(a,b,c) \
594  AS2( movzx esi, b)\
595  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
596 
597 #else
598 
599 #define L_REG r8
600 #define L_INDEX(i) (L_REG+i)
601 #define L_INXORBLOCKS L_INBLOCKS+8
602 #define L_OUTXORBLOCKS L_INBLOCKS+16
603 #define L_OUTBLOCKS L_INBLOCKS+24
604 #define L_INCREMENTS L_INDEX(16*16)
605 #define L_LENGTH L_INDEX(16*18+8)
606 #define L_KEYS_BEGIN L_INDEX(16*19)
607 
608 #define MOVD mov
609 #define MM_0 r9d
610 #define MM_1 r12d
611 #ifdef __GNUC__
612 #define MM_2 r11d
613 #else
614 #define MM_2 r10d
615 #endif
616 #define MM(i) MM_##i
617 
618 #define MXOR(a,b,c) \
619  AS2( movzx esi, b)\
620  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
621 
622 #define MMOV(a,b,c) \
623  AS2( movzx esi, b)\
624  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
625 
626 #endif
627 
628 #define L_SUBKEYS L_INDEX(0)
629 #define L_SAVED_X L_SUBKEYS
630 #define L_KEY12 L_INDEX(16*12)
631 #define L_LASTROUND L_INDEX(16*13)
632 #define L_INBLOCKS L_INDEX(16*14)
633 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
634 
635 #define XOR(a,b,c) \
636  AS2( movzx esi, b)\
637  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
638 
639 #define MOV(a,b,c) \
640  AS2( movzx esi, b)\
641  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
642 
643 #ifdef CRYPTOPP_GENERATE_X64_MASM
644  ALIGN 8
645  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
646  rex_push_reg rsi
647  push_reg rdi
648  push_reg rbx
649  push_reg r12
650  .endprolog
651  mov L_REG, rcx
652  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
653  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
654 #elif defined(__GNUC__)
655  __asm__ __volatile__
656  (
657  INTEL_NOPREFIX
658  #if CRYPTOPP_BOOL_X64
659  AS2( mov L_REG, rcx)
660  #endif
661  AS_PUSH_IF86(bx)
662  AS_PUSH_IF86(bp)
663  AS2( mov AS_REG_7, WORD_REG(si))
664 #else
665  AS_PUSH_IF86(si)
666  AS_PUSH_IF86(di)
667  AS_PUSH_IF86(bx)
668  AS_PUSH_IF86(bp)
669  AS2( lea AS_REG_7, [Te])
670  AS2( mov edi, [g_cacheLineSize])
671 #endif
672 
673 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
674  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
675  AS2( lea esp, [ecx-768])
676 #endif
677 
678  // copy subkeys to stack
679  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
680  AS2( mov WORD_REG(ax), 16)
681  AS2( and WORD_REG(ax), WORD_REG(si))
682  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
683  AS2( movdqa [L_KEY12], xmm3)
684  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
685  AS2( sub WORD_REG(ax), WORD_REG(si))
686  ASL(0)
687  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
688  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
689  AS2( add WORD_REG(si), 16)
690  AS2( cmp WORD_REG(si), 16*12)
691  ATT_NOPREFIX
692  ASJ( jl, 0, b)
693  INTEL_NOPREFIX
694 
695  // read subkeys 0, 1 and last
696  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
697  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
698  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
699  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
700  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
701  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
702 
703  // load table into cache
704  AS2( xor WORD_REG(ax), WORD_REG(ax))
705  ASL(9)
706  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
707  AS2( add WORD_REG(ax), WORD_REG(di))
708  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
709  AS2( add WORD_REG(ax), WORD_REG(di))
710  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
711  AS2( add WORD_REG(ax), WORD_REG(di))
712  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
713  AS2( add WORD_REG(ax), WORD_REG(di))
714  AS2( cmp WORD_REG(ax), 2048)
715  ATT_NOPREFIX
716  ASJ( jl, 9, b)
717  INTEL_NOPREFIX
718  AS1( lfence)
719 
720  AS2( test DWORD PTR [L_LENGTH], 1)
721  ATT_NOPREFIX
722  ASJ( jz, 8, f)
723  INTEL_NOPREFIX
724 
725  // counter mode one-time setup
726  AS2( mov WORD_REG(si), [L_INBLOCKS])
727  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
728  AS2( pxor xmm2, xmm1)
729  AS2( psrldq xmm1, 14)
730  AS2( movd eax, xmm1)
731  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
732  AS2( MOVD MM(2), eax)
733 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
734  AS2( mov eax, 1)
735  AS2( movd mm3, eax)
736 #endif
737 
738  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
739  AS2( movd eax, xmm2)
740  AS2( psrldq xmm2, 4)
741  AS2( movd edi, xmm2)
742  AS2( psrldq xmm2, 4)
743  MXOR( 1, al, 0) // 0
744  XOR( edx, ah, 1) // 1
745  AS2( shr eax, 16)
746  XOR( ecx, al, 2) // 2
747  XOR( ebx, ah, 3) // 3
748  AS2( mov eax, edi)
749  AS2( movd edi, xmm2)
750  AS2( psrldq xmm2, 4)
751  XOR( ebx, al, 0) // 4
752  MXOR( 1, ah, 1) // 5
753  AS2( shr eax, 16)
754  XOR( edx, al, 2) // 6
755  XOR( ecx, ah, 3) // 7
756  AS2( mov eax, edi)
757  AS2( movd edi, xmm2)
758  XOR( ecx, al, 0) // 8
759  XOR( ebx, ah, 1) // 9
760  AS2( shr eax, 16)
761  MXOR( 1, al, 2) // 10
762  XOR( edx, ah, 3) // 11
763  AS2( mov eax, edi)
764  XOR( edx, al, 0) // 12
765  XOR( ecx, ah, 1) // 13
766  AS2( shr eax, 16)
767  XOR( ebx, al, 2) // 14
768  AS2( psrldq xmm2, 3)
769 
770  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
771  AS2( mov eax, [L_KEY12+0*4])
772  AS2( mov edi, [L_KEY12+2*4])
773  AS2( MOVD MM(0), [L_KEY12+3*4])
774  MXOR( 0, cl, 3) /* 11 */
775  XOR( edi, bl, 3) /* 7 */
776  MXOR( 0, bh, 2) /* 6 */
777  AS2( shr ebx, 16) /* 4,5 */
778  XOR( eax, bl, 1) /* 5 */
779  MOV( ebx, bh, 0) /* 4 */
780  AS2( xor ebx, [L_KEY12+1*4])
781  XOR( eax, ch, 2) /* 10 */
782  AS2( shr ecx, 16) /* 8,9 */
783  XOR( eax, dl, 3) /* 15 */
784  XOR( ebx, dh, 2) /* 14 */
785  AS2( shr edx, 16) /* 12,13 */
786  XOR( edi, ch, 0) /* 8 */
787  XOR( ebx, cl, 1) /* 9 */
788  XOR( edi, dl, 1) /* 13 */
789  MXOR( 0, dh, 0) /* 12 */
790 
791  AS2( movd ecx, xmm2)
792  AS2( MOVD edx, MM(1))
793  AS2( MOVD [L_SAVED_X+3*4], MM(0))
794  AS2( mov [L_SAVED_X+0*4], eax)
795  AS2( mov [L_SAVED_X+1*4], ebx)
796  AS2( mov [L_SAVED_X+2*4], edi)
797  ATT_NOPREFIX
798  ASJ( jmp, 5, f)
799  INTEL_NOPREFIX
800  ASL(3)
801  // non-counter mode per-block setup
802  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
803  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
804  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
805  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
806  ASL(8)
807  AS2( mov WORD_REG(ax), [L_INBLOCKS])
808  AS2( movdqu xmm2, [WORD_REG(ax)])
809  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
810  AS2( movdqu xmm5, [WORD_REG(si)])
811  AS2( pxor xmm2, xmm1)
812  AS2( pxor xmm2, xmm5)
813 
814  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
815  AS2( movd eax, xmm2)
816  AS2( psrldq xmm2, 4)
817  AS2( movd edi, xmm2)
818  AS2( psrldq xmm2, 4)
819  MXOR( 1, al, 0) // 0
820  XOR( edx, ah, 1) // 1
821  AS2( shr eax, 16)
822  XOR( ecx, al, 2) // 2
823  XOR( ebx, ah, 3) // 3
824  AS2( mov eax, edi)
825  AS2( movd edi, xmm2)
826  AS2( psrldq xmm2, 4)
827  XOR( ebx, al, 0) // 4
828  MXOR( 1, ah, 1) // 5
829  AS2( shr eax, 16)
830  XOR( edx, al, 2) // 6
831  XOR( ecx, ah, 3) // 7
832  AS2( mov eax, edi)
833  AS2( movd edi, xmm2)
834  XOR( ecx, al, 0) // 8
835  XOR( ebx, ah, 1) // 9
836  AS2( shr eax, 16)
837  MXOR( 1, al, 2) // 10
838  XOR( edx, ah, 3) // 11
839  AS2( mov eax, edi)
840  XOR( edx, al, 0) // 12
841  XOR( ecx, ah, 1) // 13
842  AS2( shr eax, 16)
843  XOR( ebx, al, 2) // 14
844  MXOR( 1, ah, 3) // 15
845  AS2( MOVD eax, MM(1))
846 
847  AS2( add L_REG, [L_KEYS_BEGIN])
848  AS2( add L_REG, 4*16)
849  ATT_NOPREFIX
850  ASJ( jmp, 2, f)
851  INTEL_NOPREFIX
852  ASL(1)
853  // counter-mode per-block setup
854  AS2( MOVD ecx, MM(2))
855  AS2( MOVD edx, MM(1))
856  AS2( mov eax, [L_SAVED_X+0*4])
857  AS2( mov ebx, [L_SAVED_X+1*4])
858  AS2( xor cl, ch)
859  AS2( and WORD_REG(cx), 255)
860  ASL(5)
861 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
862  AS2( paddb MM(2), mm3)
863 #else
864  AS2( add MM(2), 1)
865 #endif
866  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
867  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
868  XOR( ebx, dl, 3)
869  MOV( ecx, dh, 2)
870  AS2( shr edx, 16)
871  AS2( xor ecx, [L_SAVED_X+2*4])
872  XOR( eax, dh, 0)
873  MOV( edx, dl, 1)
874  AS2( xor edx, [L_SAVED_X+3*4])
875 
876  AS2( add L_REG, [L_KEYS_BEGIN])
877  AS2( add L_REG, 3*16)
878  ATT_NOPREFIX
879  ASJ( jmp, 4, f)
880  INTEL_NOPREFIX
881 
882 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
883 // out: eax, ebx, edi, mm0
884 #define ROUND() \
885  MXOR( 0, cl, 3) /* 11 */\
886  AS2( mov cl, al) /* 8,9,10,3 */\
887  XOR( edi, ah, 2) /* 2 */\
888  AS2( shr eax, 16) /* 0,1 */\
889  XOR( edi, bl, 3) /* 7 */\
890  MXOR( 0, bh, 2) /* 6 */\
891  AS2( shr ebx, 16) /* 4,5 */\
892  MXOR( 0, al, 1) /* 1 */\
893  MOV( eax, ah, 0) /* 0 */\
894  XOR( eax, bl, 1) /* 5 */\
895  MOV( ebx, bh, 0) /* 4 */\
896  XOR( eax, ch, 2) /* 10 */\
897  XOR( ebx, cl, 3) /* 3 */\
898  AS2( shr ecx, 16) /* 8,9 */\
899  XOR( eax, dl, 3) /* 15 */\
900  XOR( ebx, dh, 2) /* 14 */\
901  AS2( shr edx, 16) /* 12,13 */\
902  XOR( edi, ch, 0) /* 8 */\
903  XOR( ebx, cl, 1) /* 9 */\
904  XOR( edi, dl, 1) /* 13 */\
905  MXOR( 0, dh, 0) /* 12 */\
906 
907  ASL(2) // 2-round loop
908  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
909  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
910  ROUND()
911  AS2( mov ecx, edi)
912  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
913  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
914  AS2( MOVD edx, MM(0))
915 
916  ASL(4)
917  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
918  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
919  ROUND()
920  AS2( mov ecx, edi)
921  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
922  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
923  AS2( MOVD edx, MM(0))
924 
925  AS2( add L_REG, 32)
926  AS2( test L_REG, 255)
927  ATT_NOPREFIX
928  ASJ( jnz, 2, b)
929  INTEL_NOPREFIX
930  AS2( sub L_REG, 16*16)
931 
932 #define LAST(a, b, c) \
933  AS2( movzx esi, a )\
934  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
935  AS2( movzx esi, b )\
936  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
937  AS2( mov WORD PTR [L_LASTROUND+c], di )\
938 
939  // last round
940  LAST(ch, dl, 2)
941  LAST(dh, al, 6)
942  AS2( shr edx, 16)
943  LAST(ah, bl, 10)
944  AS2( shr eax, 16)
945  LAST(bh, cl, 14)
946  AS2( shr ebx, 16)
947  LAST(dh, al, 12)
948  AS2( shr ecx, 16)
949  LAST(ah, bl, 0)
950  LAST(bh, cl, 4)
951  LAST(ch, dl, 8)
952 
953  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
954  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
955 
956  AS2( mov WORD_REG(cx), [L_LENGTH])
957  AS2( sub WORD_REG(cx), 16)
958 
959  AS2( movdqu xmm2, [WORD_REG(ax)])
960  AS2( pxor xmm2, xmm4)
961 
962 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
963  AS2( movdqa xmm0, [L_INCREMENTS])
964  AS2( paddd xmm0, [L_INBLOCKS])
965  AS2( movdqa [L_INBLOCKS], xmm0)
966 #else
967  AS2( movdqa xmm0, [L_INCREMENTS+16])
968  AS2( paddq xmm0, [L_INBLOCKS+16])
969  AS2( movdqa [L_INBLOCKS+16], xmm0)
970 #endif
971 
972  AS2( pxor xmm2, [L_LASTROUND])
973  AS2( movdqu [WORD_REG(bx)], xmm2)
974 
975  ATT_NOPREFIX
976  ASJ( jle, 7, f)
977  INTEL_NOPREFIX
978  AS2( mov [L_LENGTH], WORD_REG(cx))
979  AS2( test WORD_REG(cx), 1)
980  ATT_NOPREFIX
981  ASJ( jnz, 1, b)
982  INTEL_NOPREFIX
983 #if CRYPTOPP_BOOL_X64
984  AS2( movdqa xmm0, [L_INCREMENTS])
985  AS2( paddq xmm0, [L_INBLOCKS])
986  AS2( movdqa [L_INBLOCKS], xmm0)
987 #endif
988  ATT_NOPREFIX
989  ASJ( jmp, 3, b)
990  INTEL_NOPREFIX
991 
992  ASL(7)
993  // erase keys on stack
994  AS2( xorps xmm0, xmm0)
995  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
996  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
997  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
998  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
999  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
1000  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
1001  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
1002  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
1003  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
1004  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
1005  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
1006  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
1007  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
1008  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
1009  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
1010 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
1011  AS2( mov esp, [L_SP])
1012  AS1( emms)
1013 #endif
1014  AS_POP_IF86(bp)
1015  AS_POP_IF86(bx)
1016 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
1017  AS_POP_IF86(di)
1018  AS_POP_IF86(si)
1019  AS1(ret)
1020 #endif
1021 #ifdef CRYPTOPP_GENERATE_X64_MASM
1022  pop r12
1023  pop rbx
1024  pop rdi
1025  pop rsi
1026  ret
1027  Rijndael_Enc_AdvancedProcessBlocks ENDP
1028 #endif
1029 #ifdef __GNUC__
1030  ATT_PREFIX
1031  :
1032  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1033  : "memory", "cc", "%eax"
1034  #if CRYPTOPP_BOOL_X64
1035  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1036  #endif
1037  );
1038 #endif
1039 }
1040 
1041 #endif
1042 
1043 #ifndef CRYPTOPP_GENERATE_X64_MASM
1044 
1045 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1046 extern "C" {
1047 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1048 }
1049 #endif
1050 
1051 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1052 
1053 // Determine whether the range between begin and end overlaps
1054 // with the same 4k block offsets as the Te table. Logically,
1055 // the code is trying to create the condition:
1056 //
// Two separate memory pages:
1058 //
1059 // +-----+ +-----+
1060 // |XXXXX| |YYYYY|
1061 // |XXXXX| |YYYYY|
1062 // | | | |
1063 // | | | |
1064 // +-----+ +-----+
1065 // Te Table Locals
1066 //
1067 // Have a logical cache view of (X and Y may be inverted):
1068 //
1069 // +-----+
1070 // |XXXXX|
1071 // |XXXXX|
1072 // |YYYYY|
1073 // |YYYYY|
1074 // +-----+
1075 //
// Returns true when the byte range [begin, end) shares any 4 KiB page
// offsets with the Te table.  Part of the timing-attack defense: the
// caller slides its stack workspace until this returns false, so the
// workspace cannot evict Te entries from a low-associativity L1 cache.
static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
	// Only offsets within a 4096-byte page matter for cache aliasing,
	// not the absolute addresses.
	ptrdiff_t s0 = uintptr_t(begin)%4096, s1 = uintptr_t(end)%4096;
	ptrdiff_t t0 = uintptr_t(Te)%4096, t1 = (uintptr_t(Te)+sizeof(Te))%4096;
	if (t1 > t0)
		// Table offsets do not wrap around the page boundary.
		return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
	else
		// Table offsets wrap past the end of the page.
		// NOTE(review): this disjunction is satisfied by nearly every
		// input, i.e. it over-reports aliasing in the wrap case.  That
		// is safe (the caller merely tries the next 256-byte slot) but
		// looks stricter than intended — confirm against upstream.
		return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
1085 
1086 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1087 
1088 inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1089 {
1090  block = _mm_xor_si128(block, subkeys[0]);
1091  for (unsigned int i=1; i<rounds-1; i+=2)
1092  {
1093  block = _mm_aesenc_si128(block, subkeys[i]);
1094  block = _mm_aesenc_si128(block, subkeys[i+1]);
1095  }
1096  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1097  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1098 }
1099 
1100 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1101 {
1102  __m128i rk = subkeys[0];
1103  block0 = _mm_xor_si128(block0, rk);
1104  block1 = _mm_xor_si128(block1, rk);
1105  block2 = _mm_xor_si128(block2, rk);
1106  block3 = _mm_xor_si128(block3, rk);
1107  for (unsigned int i=1; i<rounds; i++)
1108  {
1109  rk = subkeys[i];
1110  block0 = _mm_aesenc_si128(block0, rk);
1111  block1 = _mm_aesenc_si128(block1, rk);
1112  block2 = _mm_aesenc_si128(block2, rk);
1113  block3 = _mm_aesenc_si128(block3, rk);
1114  }
1115  rk = subkeys[rounds];
1116  block0 = _mm_aesenclast_si128(block0, rk);
1117  block1 = _mm_aesenclast_si128(block1, rk);
1118  block2 = _mm_aesenclast_si128(block2, rk);
1119  block3 = _mm_aesenclast_si128(block3, rk);
1120 }
1121 
1122 inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1123 {
1124  block = _mm_xor_si128(block, subkeys[0]);
1125  for (unsigned int i=1; i<rounds-1; i+=2)
1126  {
1127  block = _mm_aesdec_si128(block, subkeys[i]);
1128  block = _mm_aesdec_si128(block, subkeys[i+1]);
1129  }
1130  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1131  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1132 }
1133 
1134 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
1135 {
1136  __m128i rk = subkeys[0];
1137  block0 = _mm_xor_si128(block0, rk);
1138  block1 = _mm_xor_si128(block1, rk);
1139  block2 = _mm_xor_si128(block2, rk);
1140  block3 = _mm_xor_si128(block3, rk);
1141  for (unsigned int i=1; i<rounds; i++)
1142  {
1143  rk = subkeys[i];
1144  block0 = _mm_aesdec_si128(block0, rk);
1145  block1 = _mm_aesdec_si128(block1, rk);
1146  block2 = _mm_aesdec_si128(block2, rk);
1147  block3 = _mm_aesdec_si128(block3, rk);
1148  }
1149  rk = subkeys[rounds];
1150  block0 = _mm_aesdeclast_si128(block0, rk);
1151  block1 = _mm_aesdeclast_si128(block1, rk);
1152  block2 = _mm_aesdeclast_si128(block2, rk);
1153  block3 = _mm_aesdeclast_si128(block3, rk);
1154 }
1155 
// The constant 1 as a big-endian 32-bit value in the last lane of a
// 128-bit block (bytes ...00 00 00 01 on the little-endian x86 targets
// this code runs on).  Added with _mm_add_epi32 to step the CTR-mode
// counter held in the final bytes of the input block.
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
1158 
1159 template <typename F1, typename F4>
1160 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1161 {
1162  size_t blockSize = 16;
1164  size_t xorIncrement = xorBlocks ? blockSize : 0;
1165  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1166 
1168  {
1169  CRYPTOPP_ASSERT(length % blockSize == 0);
1170  inBlocks += length - blockSize;
1171  xorBlocks += length - blockSize;
1172  outBlocks += length - blockSize;
1173  inIncrement = 0-inIncrement;
1174  xorIncrement = 0-xorIncrement;
1175  outIncrement = 0-outIncrement;
1176  }
1177 
1178  if (flags & BlockTransformation::BT_AllowParallel)
1179  {
1180  while (length >= 4*blockSize)
1181  {
1182  __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
1184  {
1185  const __m128i be1 = *(const __m128i *)(const void *)s_one;
1186  block1 = _mm_add_epi32(block0, be1);
1187  block2 = _mm_add_epi32(block1, be1);
1188  block3 = _mm_add_epi32(block2, be1);
1189  _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
1190  }
1191  else
1192  {
1193  inBlocks += inIncrement;
1194  block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1195  inBlocks += inIncrement;
1196  block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1197  inBlocks += inIncrement;
1198  block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1199  inBlocks += inIncrement;
1200  }
1201 
1202  if (flags & BlockTransformation::BT_XorInput)
1203  {
1204  // Coverity finding, appears to be false positive. Assert the condition.
1205  CRYPTOPP_ASSERT(xorBlocks);
1206  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1207  xorBlocks += xorIncrement;
1208  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1209  xorBlocks += xorIncrement;
1210  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1211  xorBlocks += xorIncrement;
1212  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1213  xorBlocks += xorIncrement;
1214  }
1215 
1216  func4(block0, block1, block2, block3, subkeys, rounds);
1217 
1218  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1219  {
1220  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1221  xorBlocks += xorIncrement;
1222  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1223  xorBlocks += xorIncrement;
1224  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1225  xorBlocks += xorIncrement;
1226  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1227  xorBlocks += xorIncrement;
1228  }
1229 
1230  _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
1231  outBlocks += outIncrement;
1232  _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
1233  outBlocks += outIncrement;
1234  _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
1235  outBlocks += outIncrement;
1236  _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
1237  outBlocks += outIncrement;
1238 
1239  length -= 4*blockSize;
1240  }
1241  }
1242 
1243  while (length >= blockSize)
1244  {
1245  __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1246 
1247  if (flags & BlockTransformation::BT_XorInput)
1248  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1249 
1250  if (flags & BlockTransformation::BT_InBlockIsCounter)
1251  const_cast<byte *>(inBlocks)[15]++;
1252 
1253  func1(block, subkeys, rounds);
1254 
1255  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1256  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1257 
1258  _mm_storeu_si128((__m128i *)(void *)outBlocks, block);
1259 
1260  inBlocks += inIncrement;
1261  outBlocks += outIncrement;
1262  xorBlocks += xorIncrement;
1263  length -= blockSize;
1264  }
1265 
1266  return length;
1267 }
1268 #endif
1269 
1270 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
// Stack-resident working area for the SSE2 assembly implementation
// above.  The assembly addresses these fields through fixed byte
// offsets (the L_* macros), so the member order and sizes must not
// change without updating the assembly.
struct Locals
{
	word32 subkeys[4*12], workspace[8];                 // copied round keys + scratch space
	const byte *inBlocks, *inXorBlocks, *outXorBlocks;  // input and pre/post-xor sources
	byte *outBlocks;                                    // destination
	size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
	size_t regSpill, lengthAndCounterFlag, keysBegin;   // spill slot; length with counter bit; first key offset
};
1279 
// The Locals workspace is allocated oversized so it can be slid in
// 256-byte steps within a 4 KiB page until it no longer aliases the
// Te table's cache lines (see AliasedWithTable above).
const size_t s_aliasPageSize = 4096;
const size_t s_aliasBlockSize = 256;
const size_t s_sizeToAllocate = s_aliasPageSize + s_aliasBlockSize + sizeof(Locals);

// Pre-allocate the alias-avoidance block once per encryption object.
Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { }
1285 #endif
1286 
// Dispatches bulk encryption to the fastest available implementation:
// AES-NI intrinsics, then the SSE2/MASM assembly, then the generic C++
// base-class loop.  Returns the number of trailing bytes (< BLOCKSIZE)
// left unprocessed.
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	// Fastest path: hardware AES instructions.
	if (HasAESNI())
		return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
	// Second path: the hand-written SSE2 assembly above.
	if (HasSSE2())
	{
		if (length < BLOCKSIZE)
			return length;

		// NOTE(review): "zeros" serves as a do-nothing xor source; it
		// presumably points at zero padding following the Te table --
		// confirm against the Te definition earlier in this file.
		static const byte *zeros = (const byte*)(Te+256);
		byte *space = NULL, *originalSpace = const_cast<byte*>(m_aliasBlock.data());

		// round up to nearest 256 byte boundary
		space = originalSpace + (s_aliasBlockSize - (uintptr_t)originalSpace % s_aliasBlockSize) % s_aliasBlockSize;
		// Slide the workspace in 256-byte steps until it no longer
		// shares 4 KiB page offsets with Te (timing-attack defense).
		while (AliasedWithTable(space, space + sizeof(Locals)))
		{
			space += 256;
			CRYPTOPP_ASSERT(space < (originalSpace + s_aliasPageSize));
		}

		size_t increment = BLOCKSIZE;
		if (flags & BT_ReverseDirection)
		{
			// Process from the last block backwards.
			CRYPTOPP_ASSERT(length % BLOCKSIZE == 0);
			inBlocks += length - BLOCKSIZE;
			xorBlocks += length - BLOCKSIZE;
			outBlocks += length - BLOCKSIZE;
			increment = 0-increment;
		}

		Locals &locals = *(Locals *)(void *)space;

		// When no xor source applies, point the assembly at "zeros" so
		// it can xor unconditionally without branching.
		locals.inBlocks = inBlocks;
		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
		locals.outBlocks = outBlocks;

		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

		// Whole-block byte count with the counter-mode flag folded into
		// the low bit, as the assembly loop expects.
		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
		// The assembly copies only the trailing round keys to the stack;
		// counter mode needs one more round key cached than normal mode.
		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
		locals.keysBegin = (12-keysToCopy)*16;

		Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);

		return length % BLOCKSIZE;
	}
#endif

	// Portable fallback implemented in the base class.
	return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
1345 
1346 #endif
1347 
1348 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1349 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1350 {
1351 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1352  if (HasAESNI())
1353  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1354 #endif
1355 
1356  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1357 }
1358 #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1359 
1360 NAMESPACE_END
1361 
1362 #endif
1363 #endif
Utility functions for the Crypto++ library.
bool HasSSE4()
Determines SSE4 availability.
Definition: cpu.h:190
Library configuration file.
should not modify block pointers
Definition: cryptlib.h:796
int GetCacheLineSize()
Provides the cache line size.
Definition: cpu.h:314
virtual size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
Encrypt and xor multiple blocks using additional flags.
Definition: cryptlib.cpp:178
Access a block of memory.
Definition: misc.h:2233
Rijndael block cipher implementation details.
Definition: rijndael.h:29
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianess.
Definition: misc.h:1807
#define COUNTOF(arr)
Counts elements in an array.
Definition: misc.h:168
byte order is big-endian
Definition: cryptlib.h:128
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:62
Classes for Rijndael encryption algorithm.
Functions for CPU features and intrinsics.
bool HasAESNI()
Determines AES-NI availability.
Definition: cpu.h:200
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:165
perform the transformation in reverse
Definition: cryptlib.h:800
Crypto++ library namespace.
T rotrFixed(T x, unsigned int y)
Performs a right rotate.
Definition: misc.h:1285
Interface for retrieving values given their names.
Definition: cryptlib.h:279
void vec_swap(T &a, T &b)
Swaps two variables which are arrays.
Definition: misc.h:449