Crypto++  5.6.3
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
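/*
"Compressed tables" refers to the word64 Te[256+2]/Td[256] layout defined below:
one 256-entry table of 64-bit words instead of four 256-entry tables of 32-bit
words, roughly halving the table footprint that has to stay resident in L1
cache. The TL_M/TL_F macros then pull the needed 32-bit rotation out of each
64-bit entry with a single unaligned load.
*/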
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
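/*
A minimal illustration of the preload pattern described above, as used in
ProcessAndXorBlock() below (cacheLineSize, Te and s0..s3 refer to names in that
function): one word is read from every cache line covered by the table and
folded into a value derived from a volatile, and that value is then OR'd into
the cipher state. The accumulated value is always zero, but the compiler cannot
prove it, so the table-touching loads are not optimized away.

    volatile word32 _u = 0;
    word32 u = _u;
    for (i = 0; i < 2048; i += cacheLineSize)   // 2048 or 1024 bytes, depending on the table layout
        u &= *(const word32 *)(const void *)(((const byte *)Te) + i);
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;
*/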
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "stdcpp.h" // alloca
75 #include "misc.h"
76 #include "cpu.h"
77 
78 NAMESPACE_BEGIN(CryptoPP)
79 
80 // Hack for https://github.com/weidai11/cryptopp/issues/42 and https://github.com/weidai11/cryptopp/issues/132
81 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
82 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
83 #endif
84 
85 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
86 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
87 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
88 using namespace rdtable;
89 # else
90 static word64 Te[256];
91 # endif
92 static word64 Td[256];
93 #else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
94 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
95 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
96 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
97 # endif
98 CRYPTOPP_ALIGN_DATA(16) static word32 Te[256*4];
99 CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
100 #endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
101 
102 static volatile bool s_TeFilled = false, s_TdFilled = false;
103 
104 // ************************* Portable Code ************************************
105 
106 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
107  a ^= L(T, 3, byte(t)); t >>= 8;\
108  b ^= L(T, 2, byte(t)); t >>= 8;\
109  c ^= L(T, 1, byte(t)); t >>= 8;\
110  d ^= L(T, 0, t);
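// QUARTER_ROUND consumes the 32-bit column t one byte at a time (low byte
// first) and XORs a table lookup for each byte into a different output word;
// the L parameter selects the lookup macro (TL_F or TL_M) and T selects the
// table (Te or Td).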
111 
112 #define QUARTER_ROUND_LE(t, a, b, c, d) \
113  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
114  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
115  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
116  tempBlock[d] = ((byte *)(Te+t))[1];
117 
118 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
119  #define QUARTER_ROUND_LD(t, a, b, c, d) \
120  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
121  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
122  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
123  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
124 #else
125  #define QUARTER_ROUND_LD(t, a, b, c, d) \
126  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
127  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
128  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
129  tempBlock[d] = Sd[t];
130 #endif
131 
132 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
133 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
134 
135 #ifdef IS_LITTLE_ENDIAN
136  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
137  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
138  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
139  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
140  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
141  #else
142  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
143  #define TL_M(T, i, x) T[i*256 + x]
144  #endif
145 #else
146  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
147  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
148  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
149  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
150  #define TL_M TL_F
151  #else
152  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
153  #define TL_M(T, i, x) T[i*256 + x]
154  #endif
155 #endif
156 
157 
158 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
159 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
160 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
161 
162 #define f3(x) (f2(x) ^ x)
163 #define f9(x) (f8(x) ^ x)
164 #define fb(x) (f8(x) ^ f2(x) ^ x)
165 #define fd(x) (f8(x) ^ f4(x) ^ x)
166 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
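// The f2/f4/f8 macros multiply a byte by x, x^2 and x^3 in GF(2^8) with the AES
// reduction polynomial x^8 + x^4 + x^3 + x + 1 (0x11b); the masked terms fold
// any bits shifted past bit 7 back into the field. Worked example:
// f2(0x80) = (0x80<<1) ^ 0x11b = 0x100 ^ 0x11b = 0x1b. f3/f9/fb/fd/fe combine
// these to form the MixColumns (1,2,3) and InvMixColumns (9,0xb,0xd,0xe)
// coefficients used when filling Te and Td.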
167 
168 void Rijndael::Base::FillEncTable()
169 {
170  for (int i=0; i<256; i++)
171  {
172  byte x = Se[i];
173 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
174  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
175  Te[i] = word64(y | f3(x))<<32 | y;
176 #else
177  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
178  for (int j=0; j<4; j++)
179  {
180  Te[i+j*256] = y;
181  y = rotrFixed(y, 8);
182  }
183 #endif
184  }
185 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
186  Te[256] = Te[257] = 0;
187 #endif
188  s_TeFilled = true;
189 }
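// Layout built above: for each byte i, with S = Se[i], the compressed (word64)
// entry holds the byte sequence 0, S, S, 2S, 3S, S, S, 2S on a little-endian
// machine, so every rotation of the MixColumns column (2S, S, S, 3S) can be
// fetched with one unaligned 32-bit load at byte offsets 1..4, which is what
// TL_M/TL_F do. The uncompressed layout stores the four pre-rotated 32-bit
// words in four 256-entry sub-tables instead.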
190 
191 void Rijndael::Base::FillDecTable()
192 {
193  for (int i=0; i<256; i++)
194  {
195  byte x = Sd[i];
196 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
197  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
198  Td[i] = word64(y | fb(x))<<32 | y | x;
199 #else
200  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
201  for (int j=0; j<4; j++)
202  {
203  Td[i+j*256] = y;
204  y = rotrFixed(y, 8);
205  }
206 #endif
207  }
208  s_TdFilled = true;
209 }
210 
211 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
212 {
213  AssertValidKeyLength(keylen);
214 
215  m_rounds = keylen/4 + 6;
216  m_key.New(4*(m_rounds+1));
217 
218  word32 *rk = m_key;
219 
220 #if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
221  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
222  if (HasAESNI() && HasSSE4())
223  {
224  static const word32 rcLE[] = {
225  0x01, 0x02, 0x04, 0x08,
226  0x10, 0x20, 0x40, 0x80,
227  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
228  };
229  const word32 *rc = rcLE;
230 
231  __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
232  memcpy(rk, userKey, keylen);
233 
234  while (true)
235  {
236  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
237  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
238  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
239  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
240 
241  if (rk + keylen/4 + 4 == m_key.end())
242  break;
243 
244  if (keylen == 24)
245  {
246  rk[10] = rk[ 4] ^ rk[ 9];
247  rk[11] = rk[ 5] ^ rk[10];
248  temp = _mm_insert_epi32(temp, rk[11], 3);
249  }
250  else if (keylen == 32)
251  {
252  temp = _mm_insert_epi32(temp, rk[11], 3);
253  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
254  rk[13] = rk[ 5] ^ rk[12];
255  rk[14] = rk[ 6] ^ rk[13];
256  rk[15] = rk[ 7] ^ rk[14];
257  temp = _mm_insert_epi32(temp, rk[15], 3);
258  }
259  else
260  temp = _mm_insert_epi32(temp, rk[7], 3);
261 
262  rk += keylen/4;
263  }
264 
265  if (!IsForwardTransformation())
266  {
267  rk = m_key;
268  unsigned int i, j;
269 
270  std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
271 
272  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
273  {
274  temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
275  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
276  *(__m128i *)(void *)(rk+j) = temp;
277  }
278 
279  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
280  }
281 
282  return;
283  }
284 #endif
285 
286  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
287  const word32 *rc = rcon;
288  word32 temp;
289 
290  while (true)
291  {
292  temp = rk[keylen/4-1];
293  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
294  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
295  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
296  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
297  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
298 
299  if (rk + keylen/4 + 4 == m_key.end())
300  break;
301 
302  if (keylen == 24)
303  {
304  rk[10] = rk[ 4] ^ rk[ 9];
305  rk[11] = rk[ 5] ^ rk[10];
306  }
307  else if (keylen == 32)
308  {
309  temp = rk[11];
310  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
311  rk[13] = rk[ 5] ^ rk[12];
312  rk[14] = rk[ 6] ^ rk[13];
313  rk[15] = rk[ 7] ^ rk[14];
314  }
315  rk += keylen/4;
316  }
317 
318  rk = m_key;
319 
320  if (IsForwardTransformation())
321  {
322  if (!s_TeFilled)
323  FillEncTable();
324 
325  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, keylen);
326  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
327  }
328  else
329  {
330  if (!s_TdFilled)
331  FillDecTable();
332 
333  unsigned int i, j;
334 
335 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
336 
337  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
338  {
339  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
340  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
341  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
342  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
343  }
344 
345  rk[i+0] = InverseMixColumn(rk[i+0]);
346  rk[i+1] = InverseMixColumn(rk[i+1]);
347  rk[i+2] = InverseMixColumn(rk[i+2]);
348  rk[i+3] = InverseMixColumn(rk[i+3]);
349 
350  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
351  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
352  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
353  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
354  }
355 
356 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
357  if (HasAESNI())
358  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
359 #endif
360 }
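/*
Usage sketch (not part of this file; it relies only on the public API declared
in aes.h and cryptlib.h): SetKey() on a keyed object is what ultimately reaches
UncheckedSetKey() above, and ProcessBlock() drives ProcessAndXorBlock() below.

    #include "aes.h"
    using namespace CryptoPP;

    byte key[AES::DEFAULT_KEYLENGTH] = {};   // 16 key bytes (all zero here, for illustration only)
    byte block[AES::BLOCKSIZE] = {};         // one 16-byte block, encrypted in place

    AES::Encryption enc;
    enc.SetKey(key, sizeof(key));            // runs the key schedule
    enc.ProcessBlock(block);                 // single-block encryption
*/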
361 
362 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
363 {
364 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
365 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
366  if (HasSSE2())
367 #else
368  if (HasAESNI())
369 #endif
370  {
371  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
372  }
373 #endif
374 
375  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
376 
377  word32 s0, s1, s2, s3, t0, t1, t2, t3;
378  Block::Get(inBlock)(s0)(s1)(s2)(s3);
379 
380  const word32 *rk = m_key;
381  s0 ^= rk[0];
382  s1 ^= rk[1];
383  s2 ^= rk[2];
384  s3 ^= rk[3];
385  t0 = rk[4];
386  t1 = rk[5];
387  t2 = rk[6];
388  t3 = rk[7];
389  rk += 8;
390 
391  // timing attack countermeasure. see comments at top for more details.
392  // also see http://github.com/weidai11/cryptopp/issues/146
393  const int cacheLineSize = GetCacheLineSize();
394  unsigned int i;
395  volatile word32 _u = 0;
396  word32 u = _u;
397 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
398  for (i=0; i<2048; i+=cacheLineSize)
399 #else
400  for (i=0; i<1024; i+=cacheLineSize)
401 #endif
402  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
403  u &= Te[255];
404  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
405 
406  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
407  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
408  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
409  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
410 
411  // Nr - 2 full rounds:
412  unsigned int r = m_rounds/2 - 1;
413  do
414  {
415  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
416 
417  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
418  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
419  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
420  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
421 
422  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
423 
424  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
425  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
426  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
427  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
428 
429  rk += 8;
430  } while (--r);
431 
432  word32 tbw[4];
433  byte *const tempBlock = (byte *)tbw;
434 
435  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
436  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
437  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
438  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
439 
440  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
441 }
442 
443 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
444 {
445 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
446  if (HasAESNI())
447  {
448  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
449  return;
450  }
451 #endif
452 
453  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
454 
455  word32 s0, s1, s2, s3, t0, t1, t2, t3;
456  Block::Get(inBlock)(s0)(s1)(s2)(s3);
457 
458  const word32 *rk = m_key;
459  s0 ^= rk[0];
460  s1 ^= rk[1];
461  s2 ^= rk[2];
462  s3 ^= rk[3];
463  t0 = rk[4];
464  t1 = rk[5];
465  t2 = rk[6];
466  t3 = rk[7];
467  rk += 8;
468 
469  // timing attack countermeasure. see comments at top for more details.
470  // also see http://github.com/weidai11/cryptopp/issues/146
471  const int cacheLineSize = GetCacheLineSize();
472  unsigned int i;
473  volatile word32 _u = 0;
474  word32 u = _u;
475 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
476  for (i=0; i<2048; i+=cacheLineSize)
477 #else
478  for (i=0; i<1024; i+=cacheLineSize)
479 #endif
480  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
481  u &= Td[255];
482  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
483 
484  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
485  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
486  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
487  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
488 
489  // Nr - 2 full rounds:
490  unsigned int r = m_rounds/2 - 1;
491  do
492  {
493  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
494 
495  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
496  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
497  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
498  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
499 
500  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
501 
502  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
503  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
504  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
505  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
506 
507  rk += 8;
508  } while (--r);
509 
510 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
511  // timing attack countermeasure. see comments at top for more details
512  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
513  // QUARTER_ROUND_LD will use Td, which is already preloaded.
514  u = _u;
515  for (i=0; i<256; i+=cacheLineSize)
516  u &= *(const word32 *)(const void *)(Sd+i);
517  u &= *(const word32 *)(const void *)(Sd+252);
518  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
519 #endif
520 
521  word32 tbw[4];
522  byte *const tempBlock = (byte *)tbw;
523 
524  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
525  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
526  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
527  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
528 
529  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
530 }
531 
532 // ************************* Assembly Code ************************************
533 
534 #if CRYPTOPP_MSC_VERSION
535 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
536 #endif
537 
538 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
539 
540 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
541 
542 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
543 {
544  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
545 
546 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
547 
548 #define L_REG esp
549 #define L_INDEX(i) (L_REG+768+i)
550 #define L_INXORBLOCKS L_INBLOCKS+4
551 #define L_OUTXORBLOCKS L_INBLOCKS+8
552 #define L_OUTBLOCKS L_INBLOCKS+12
553 #define L_INCREMENTS L_INDEX(16*15)
554 #define L_SP L_INDEX(16*16)
555 #define L_LENGTH L_INDEX(16*16+4)
556 #define L_KEYS_BEGIN L_INDEX(16*16+8)
557 
558 #define MOVD movd
559 #define MM(i) mm##i
560 
561 #define MXOR(a,b,c) \
562  AS2( movzx esi, b)\
563  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
564  AS2( pxor MM(a), mm7)\
565 
566 #define MMOV(a,b,c) \
567  AS2( movzx esi, b)\
568  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
569 
570 #else
571 
572 #define L_REG r8
573 #define L_INDEX(i) (L_REG+i)
574 #define L_INXORBLOCKS L_INBLOCKS+8
575 #define L_OUTXORBLOCKS L_INBLOCKS+16
576 #define L_OUTBLOCKS L_INBLOCKS+24
577 #define L_INCREMENTS L_INDEX(16*16)
578 #define L_LENGTH L_INDEX(16*18+8)
579 #define L_KEYS_BEGIN L_INDEX(16*19)
580 
581 #define MOVD mov
582 #define MM_0 r9d
583 #define MM_1 r12d
584 #ifdef __GNUC__
585 #define MM_2 r11d
586 #else
587 #define MM_2 r10d
588 #endif
589 #define MM(i) MM_##i
590 
591 #define MXOR(a,b,c) \
592  AS2( movzx esi, b)\
593  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
594 
595 #define MMOV(a,b,c) \
596  AS2( movzx esi, b)\
597  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
598 
599 #endif
600 
601 #define L_SUBKEYS L_INDEX(0)
602 #define L_SAVED_X L_SUBKEYS
603 #define L_KEY12 L_INDEX(16*12)
604 #define L_LASTROUND L_INDEX(16*13)
605 #define L_INBLOCKS L_INDEX(16*14)
606 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
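// MAP0TO4 maps byte-lane index 0 to offset 4 and leaves 1..3 unchanged, i.e. it
// selects which of the rotations stored in a compressed 8-byte Te entry (byte
// offsets 1..4) the assembly reads, mirroring the byte-offset addressing of the
// C++ TL_M/TL_F macros.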
607 
608 #define XOR(a,b,c) \
609  AS2( movzx esi, b)\
610  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
611 
612 #define MOV(a,b,c) \
613  AS2( movzx esi, b)\
614  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
615 
616 #ifdef CRYPTOPP_GENERATE_X64_MASM
617  ALIGN 8
618  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
619  rex_push_reg rsi
620  push_reg rdi
621  push_reg rbx
622  push_reg r12
623  .endprolog
624  mov L_REG, rcx
625  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
626  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
627 #elif defined(__GNUC__)
628  __asm__ __volatile__
629  (
630  INTEL_NOPREFIX
631  #if CRYPTOPP_BOOL_X64
632  AS2( mov L_REG, rcx)
633  #endif
634  AS_PUSH_IF86(bx)
635  AS_PUSH_IF86(bp)
636  AS2( mov AS_REG_7, WORD_REG(si))
637 #else
638  AS_PUSH_IF86(si)
639  AS_PUSH_IF86(di)
640  AS_PUSH_IF86(bx)
641  AS_PUSH_IF86(bp)
642  AS2( lea AS_REG_7, [Te])
643  AS2( mov edi, [g_cacheLineSize])
644 #endif
645 
646 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
647  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
648  AS2( lea esp, [ecx-768])
649 #endif
650 
651  // copy subkeys to stack
652  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
653  AS2( mov WORD_REG(ax), 16)
654  AS2( and WORD_REG(ax), WORD_REG(si))
655  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
656  AS2( movdqa [L_KEY12], xmm3)
657  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
658  AS2( sub WORD_REG(ax), WORD_REG(si))
659  ASL(0)
660  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
661  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
662  AS2( add WORD_REG(si), 16)
663  AS2( cmp WORD_REG(si), 16*12)
664  ATT_NOPREFIX
665  ASJ( jl, 0, b)
666  INTEL_NOPREFIX
667 
668  // read subkeys 0, 1 and last
669  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
670  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
671  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
672  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
673  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
674  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
675 
676  // load table into cache
677  AS2( xor WORD_REG(ax), WORD_REG(ax))
678  ASL(9)
679  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
680  AS2( add WORD_REG(ax), WORD_REG(di))
681  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
682  AS2( add WORD_REG(ax), WORD_REG(di))
683  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
684  AS2( add WORD_REG(ax), WORD_REG(di))
685  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
686  AS2( add WORD_REG(ax), WORD_REG(di))
687  AS2( cmp WORD_REG(ax), 2048)
688  ATT_NOPREFIX
689  ASJ( jl, 9, b)
690  INTEL_NOPREFIX
691  AS1( lfence)
692 
693  AS2( test DWORD PTR [L_LENGTH], 1)
694  ATT_NOPREFIX
695  ASJ( jz, 8, f)
696  INTEL_NOPREFIX
697 
698  // counter mode one-time setup
699  AS2( mov WORD_REG(si), [L_INBLOCKS])
700  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
701  AS2( pxor xmm2, xmm1)
702  AS2( psrldq xmm1, 14)
703  AS2( movd eax, xmm1)
704  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
705  AS2( MOVD MM(2), eax)
706 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
707  AS2( mov eax, 1)
708  AS2( movd mm3, eax)
709 #endif
710 
711  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
712  AS2( movd eax, xmm2)
713  AS2( psrldq xmm2, 4)
714  AS2( movd edi, xmm2)
715  AS2( psrldq xmm2, 4)
716  MXOR( 1, al, 0) // 0
717  XOR( edx, ah, 1) // 1
718  AS2( shr eax, 16)
719  XOR( ecx, al, 2) // 2
720  XOR( ebx, ah, 3) // 3
721  AS2( mov eax, edi)
722  AS2( movd edi, xmm2)
723  AS2( psrldq xmm2, 4)
724  XOR( ebx, al, 0) // 4
725  MXOR( 1, ah, 1) // 5
726  AS2( shr eax, 16)
727  XOR( edx, al, 2) // 6
728  XOR( ecx, ah, 3) // 7
729  AS2( mov eax, edi)
730  AS2( movd edi, xmm2)
731  XOR( ecx, al, 0) // 8
732  XOR( ebx, ah, 1) // 9
733  AS2( shr eax, 16)
734  MXOR( 1, al, 2) // 10
735  XOR( edx, ah, 3) // 11
736  AS2( mov eax, edi)
737  XOR( edx, al, 0) // 12
738  XOR( ecx, ah, 1) // 13
739  AS2( shr eax, 16)
740  XOR( ebx, al, 2) // 14
741  AS2( psrldq xmm2, 3)
742 
743  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
744  AS2( mov eax, [L_KEY12+0*4])
745  AS2( mov edi, [L_KEY12+2*4])
746  AS2( MOVD MM(0), [L_KEY12+3*4])
747  MXOR( 0, cl, 3) /* 11 */
748  XOR( edi, bl, 3) /* 7 */
749  MXOR( 0, bh, 2) /* 6 */
750  AS2( shr ebx, 16) /* 4,5 */
751  XOR( eax, bl, 1) /* 5 */
752  MOV( ebx, bh, 0) /* 4 */
753  AS2( xor ebx, [L_KEY12+1*4])
754  XOR( eax, ch, 2) /* 10 */
755  AS2( shr ecx, 16) /* 8,9 */
756  XOR( eax, dl, 3) /* 15 */
757  XOR( ebx, dh, 2) /* 14 */
758  AS2( shr edx, 16) /* 12,13 */
759  XOR( edi, ch, 0) /* 8 */
760  XOR( ebx, cl, 1) /* 9 */
761  XOR( edi, dl, 1) /* 13 */
762  MXOR( 0, dh, 0) /* 12 */
763 
764  AS2( movd ecx, xmm2)
765  AS2( MOVD edx, MM(1))
766  AS2( MOVD [L_SAVED_X+3*4], MM(0))
767  AS2( mov [L_SAVED_X+0*4], eax)
768  AS2( mov [L_SAVED_X+1*4], ebx)
769  AS2( mov [L_SAVED_X+2*4], edi)
770  ATT_NOPREFIX
771  ASJ( jmp, 5, f)
772  INTEL_NOPREFIX
773  ASL(3)
774  // non-counter mode per-block setup
775  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
776  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
777  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
778  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
779  ASL(8)
780  AS2( mov WORD_REG(ax), [L_INBLOCKS])
781  AS2( movdqu xmm2, [WORD_REG(ax)])
782  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
783  AS2( movdqu xmm5, [WORD_REG(si)])
784  AS2( pxor xmm2, xmm1)
785  AS2( pxor xmm2, xmm5)
786 
787  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
788  AS2( movd eax, xmm2)
789  AS2( psrldq xmm2, 4)
790  AS2( movd edi, xmm2)
791  AS2( psrldq xmm2, 4)
792  MXOR( 1, al, 0) // 0
793  XOR( edx, ah, 1) // 1
794  AS2( shr eax, 16)
795  XOR( ecx, al, 2) // 2
796  XOR( ebx, ah, 3) // 3
797  AS2( mov eax, edi)
798  AS2( movd edi, xmm2)
799  AS2( psrldq xmm2, 4)
800  XOR( ebx, al, 0) // 4
801  MXOR( 1, ah, 1) // 5
802  AS2( shr eax, 16)
803  XOR( edx, al, 2) // 6
804  XOR( ecx, ah, 3) // 7
805  AS2( mov eax, edi)
806  AS2( movd edi, xmm2)
807  XOR( ecx, al, 0) // 8
808  XOR( ebx, ah, 1) // 9
809  AS2( shr eax, 16)
810  MXOR( 1, al, 2) // 10
811  XOR( edx, ah, 3) // 11
812  AS2( mov eax, edi)
813  XOR( edx, al, 0) // 12
814  XOR( ecx, ah, 1) // 13
815  AS2( shr eax, 16)
816  XOR( ebx, al, 2) // 14
817  MXOR( 1, ah, 3) // 15
818  AS2( MOVD eax, MM(1))
819 
820  AS2( add L_REG, [L_KEYS_BEGIN])
821  AS2( add L_REG, 4*16)
822  ATT_NOPREFIX
823  ASJ( jmp, 2, f)
824  INTEL_NOPREFIX
825  ASL(1)
826  // counter-mode per-block setup
827  AS2( MOVD ecx, MM(2))
828  AS2( MOVD edx, MM(1))
829  AS2( mov eax, [L_SAVED_X+0*4])
830  AS2( mov ebx, [L_SAVED_X+1*4])
831  AS2( xor cl, ch)
832  AS2( and WORD_REG(cx), 255)
833  ASL(5)
834 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
835  AS2( paddb MM(2), mm3)
836 #else
837  AS2( add MM(2), 1)
838 #endif
839  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
840  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
841  XOR( ebx, dl, 3)
842  MOV( ecx, dh, 2)
843  AS2( shr edx, 16)
844  AS2( xor ecx, [L_SAVED_X+2*4])
845  XOR( eax, dh, 0)
846  MOV( edx, dl, 1)
847  AS2( xor edx, [L_SAVED_X+3*4])
848 
849  AS2( add L_REG, [L_KEYS_BEGIN])
850  AS2( add L_REG, 3*16)
851  ATT_NOPREFIX
852  ASJ( jmp, 4, f)
853  INTEL_NOPREFIX
854 
855 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
856 // out: eax, ebx, edi, mm0
857 #define ROUND() \
858  MXOR( 0, cl, 3) /* 11 */\
859  AS2( mov cl, al) /* 8,9,10,3 */\
860  XOR( edi, ah, 2) /* 2 */\
861  AS2( shr eax, 16) /* 0,1 */\
862  XOR( edi, bl, 3) /* 7 */\
863  MXOR( 0, bh, 2) /* 6 */\
864  AS2( shr ebx, 16) /* 4,5 */\
865  MXOR( 0, al, 1) /* 1 */\
866  MOV( eax, ah, 0) /* 0 */\
867  XOR( eax, bl, 1) /* 5 */\
868  MOV( ebx, bh, 0) /* 4 */\
869  XOR( eax, ch, 2) /* 10 */\
870  XOR( ebx, cl, 3) /* 3 */\
871  AS2( shr ecx, 16) /* 8,9 */\
872  XOR( eax, dl, 3) /* 15 */\
873  XOR( ebx, dh, 2) /* 14 */\
874  AS2( shr edx, 16) /* 12,13 */\
875  XOR( edi, ch, 0) /* 8 */\
876  XOR( ebx, cl, 1) /* 9 */\
877  XOR( edi, dl, 1) /* 13 */\
878  MXOR( 0, dh, 0) /* 12 */\
879 
880  ASL(2) // 2-round loop
881  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
882  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
883  ROUND()
884  AS2( mov ecx, edi)
885  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
886  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
887  AS2( MOVD edx, MM(0))
888 
889  ASL(4)
890  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
891  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
892  ROUND()
893  AS2( mov ecx, edi)
894  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
895  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
896  AS2( MOVD edx, MM(0))
897 
898  AS2( add L_REG, 32)
899  AS2( test L_REG, 255)
900  ATT_NOPREFIX
901  ASJ( jnz, 2, b)
902  INTEL_NOPREFIX
903  AS2( sub L_REG, 16*16)
904 
905 #define LAST(a, b, c) \
906  AS2( movzx esi, a )\
907  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
908  AS2( movzx esi, b )\
909  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
910  AS2( mov WORD PTR [L_LASTROUND+c], di )\
911 
912  // last round
913  LAST(ch, dl, 2)
914  LAST(dh, al, 6)
915  AS2( shr edx, 16)
916  LAST(ah, bl, 10)
917  AS2( shr eax, 16)
918  LAST(bh, cl, 14)
919  AS2( shr ebx, 16)
920  LAST(dh, al, 12)
921  AS2( shr ecx, 16)
922  LAST(ah, bl, 0)
923  LAST(bh, cl, 4)
924  LAST(ch, dl, 8)
925 
926  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
927  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
928 
929  AS2( mov WORD_REG(cx), [L_LENGTH])
930  AS2( sub WORD_REG(cx), 16)
931 
932  AS2( movdqu xmm2, [WORD_REG(ax)])
933  AS2( pxor xmm2, xmm4)
934 
935 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
936  AS2( movdqa xmm0, [L_INCREMENTS])
937  AS2( paddd xmm0, [L_INBLOCKS])
938  AS2( movdqa [L_INBLOCKS], xmm0)
939 #else
940  AS2( movdqa xmm0, [L_INCREMENTS+16])
941  AS2( paddq xmm0, [L_INBLOCKS+16])
942  AS2( movdqa [L_INBLOCKS+16], xmm0)
943 #endif
944 
945  AS2( pxor xmm2, [L_LASTROUND])
946  AS2( movdqu [WORD_REG(bx)], xmm2)
947 
948  ATT_NOPREFIX
949  ASJ( jle, 7, f)
950  INTEL_NOPREFIX
951  AS2( mov [L_LENGTH], WORD_REG(cx))
952  AS2( test WORD_REG(cx), 1)
953  ATT_NOPREFIX
954  ASJ( jnz, 1, b)
955  INTEL_NOPREFIX
956 #if CRYPTOPP_BOOL_X64
957  AS2( movdqa xmm0, [L_INCREMENTS])
958  AS2( paddq xmm0, [L_INBLOCKS])
959  AS2( movdqa [L_INBLOCKS], xmm0)
960 #endif
961  ATT_NOPREFIX
962  ASJ( jmp, 3, b)
963  INTEL_NOPREFIX
964 
965  ASL(7)
966  // erase keys on stack
967  AS2( xorps xmm0, xmm0)
968  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
969  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
970  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
971  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
972  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
973  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
974  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
975  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
976  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
977  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
978  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
979  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
980  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
981  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
982  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
983 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
984  AS2( mov esp, [L_SP])
985  AS1( emms)
986 #endif
987  AS_POP_IF86(bp)
988  AS_POP_IF86(bx)
989 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
990  AS_POP_IF86(di)
991  AS_POP_IF86(si)
992  AS1(ret)
993 #endif
994 #ifdef CRYPTOPP_GENERATE_X64_MASM
995  pop r12
996  pop rbx
997  pop rdi
998  pop rsi
999  ret
1000  Rijndael_Enc_AdvancedProcessBlocks ENDP
1001 #endif
1002 #ifdef __GNUC__
1003  ATT_PREFIX
1004  :
1005  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1006  : "memory", "cc", "%eax"
1007  #if CRYPTOPP_BOOL_X64
1008  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1009  #endif
1010  );
1011 #endif
1012 }
1013 
1014 #endif
1015 
1016 #ifndef CRYPTOPP_GENERATE_X64_MASM
1017 
1018 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1019 extern "C" {
1020 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1021 }
1022 #endif
1023 
1024 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1025 
1026 static inline bool AliasedWithTable(const byte *begin, const byte *end)
1027 {
1028  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
1029  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
1030  if (t1 > t0)
1031  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
1032  else
1033  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1034 }
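// The assembly path allocates its stack workspace with alloca and retries (see
// Rijndael::Enc::AdvancedProcessBlocks below) until AliasedWithTable() reports
// that the workspace's 4 KB page offset does not overlap Te's, so writes to the
// workspace cannot evict preloaded table lines from a low-associativity L1
// cache (see the 2-way associativity note at the top of this file).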
1035 
1036 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1037 
1038 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1039 {
1040  block = _mm_xor_si128(block, subkeys[0]);
1041  for (unsigned int i=1; i<rounds-1; i+=2)
1042  {
1043  block = _mm_aesenc_si128(block, subkeys[i]);
1044  block = _mm_aesenc_si128(block, subkeys[i+1]);
1045  }
1046  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1047  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1048 }
1049 
1050 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1051 {
1052  __m128i rk = subkeys[0];
1053  block0 = _mm_xor_si128(block0, rk);
1054  block1 = _mm_xor_si128(block1, rk);
1055  block2 = _mm_xor_si128(block2, rk);
1056  block3 = _mm_xor_si128(block3, rk);
1057  for (unsigned int i=1; i<rounds; i++)
1058  {
1059  rk = subkeys[i];
1060  block0 = _mm_aesenc_si128(block0, rk);
1061  block1 = _mm_aesenc_si128(block1, rk);
1062  block2 = _mm_aesenc_si128(block2, rk);
1063  block3 = _mm_aesenc_si128(block3, rk);
1064  }
1065  rk = subkeys[rounds];
1066  block0 = _mm_aesenclast_si128(block0, rk);
1067  block1 = _mm_aesenclast_si128(block1, rk);
1068  block2 = _mm_aesenclast_si128(block2, rk);
1069  block3 = _mm_aesenclast_si128(block3, rk);
1070 }
1071 
1072 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1073 {
1074  block = _mm_xor_si128(block, subkeys[0]);
1075  for (unsigned int i=1; i<rounds-1; i+=2)
1076  {
1077  block = _mm_aesdec_si128(block, subkeys[i]);
1078  block = _mm_aesdec_si128(block, subkeys[i+1]);
1079  }
1080  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1081  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1082 }
1083 
1084 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1085 {
1086  __m128i rk = subkeys[0];
1087  block0 = _mm_xor_si128(block0, rk);
1088  block1 = _mm_xor_si128(block1, rk);
1089  block2 = _mm_xor_si128(block2, rk);
1090  block3 = _mm_xor_si128(block3, rk);
1091  for (unsigned int i=1; i<rounds; i++)
1092  {
1093  rk = subkeys[i];
1094  block0 = _mm_aesdec_si128(block0, rk);
1095  block1 = _mm_aesdec_si128(block1, rk);
1096  block2 = _mm_aesdec_si128(block2, rk);
1097  block3 = _mm_aesdec_si128(block3, rk);
1098  }
1099  rk = subkeys[rounds];
1100  block0 = _mm_aesdeclast_si128(block0, rk);
1101  block1 = _mm_aesdeclast_si128(block1, rk);
1102  block2 = _mm_aesdeclast_si128(block2, rk);
1103  block3 = _mm_aesdeclast_si128(block3, rk);
1104 }
1105 
1106 CRYPTOPP_ALIGN_DATA(16)
1107 static const word32 s_one[] = {0, 0, 0, 1<<24};
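// Viewed as 16 bytes in memory, s_one is fifteen zero bytes followed by 0x01,
// so _mm_add_epi32(counter, s_one) bumps the last byte of a big-endian counter
// block by one (modulo 256), matching the inBlocks[15]++ used in the
// single-block loop of AESNI_AdvancedProcessBlocks below.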
1108 
1109 template <typename F1, typename F4>
1110 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1111 {
1112  size_t blockSize = 16;
1113  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1114  size_t xorIncrement = xorBlocks ? blockSize : 0;
1115  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1116 
1117  if (flags & BlockTransformation::BT_ReverseDirection)
1118  {
1119  assert(length % blockSize == 0);
1120  inBlocks += length - blockSize;
1121  xorBlocks += length - blockSize;
1122  outBlocks += length - blockSize;
1123  inIncrement = 0-inIncrement;
1124  xorIncrement = 0-xorIncrement;
1125  outIncrement = 0-outIncrement;
1126  }
1127 
1128  if (flags & BlockTransformation::BT_AllowParallel)
1129  {
1130  while (length >= 4*blockSize)
1131  {
1132  __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
1133  if (flags & BlockTransformation::BT_InBlockIsCounter)
1134  {
1135  const __m128i be1 = *(const __m128i *)(const void *)s_one;
1136  block1 = _mm_add_epi32(block0, be1);
1137  block2 = _mm_add_epi32(block1, be1);
1138  block3 = _mm_add_epi32(block2, be1);
1139  _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
1140  }
1141  else
1142  {
1143  inBlocks += inIncrement;
1144  block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1145  inBlocks += inIncrement;
1146  block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1147  inBlocks += inIncrement;
1148  block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1149  inBlocks += inIncrement;
1150  }
1151 
1152  if (flags & BlockTransformation::BT_XorInput)
1153  {
1154  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1155  xorBlocks += xorIncrement;
1156  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1157  xorBlocks += xorIncrement;
1158  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1159  xorBlocks += xorIncrement;
1160  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1161  xorBlocks += xorIncrement;
1162  }
1163 
1164  func4(block0, block1, block2, block3, subkeys, rounds);
1165 
1166  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1167  {
1168  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1169  xorBlocks += xorIncrement;
1170  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1171  xorBlocks += xorIncrement;
1172  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1173  xorBlocks += xorIncrement;
1174  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1175  xorBlocks += xorIncrement;
1176  }
1177 
1178  _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
1179  outBlocks += outIncrement;
1180  _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
1181  outBlocks += outIncrement;
1182  _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
1183  outBlocks += outIncrement;
1184  _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
1185  outBlocks += outIncrement;
1186 
1187  length -= 4*blockSize;
1188  }
1189  }
1190 
1191  while (length >= blockSize)
1192  {
1193  __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1194 
1195  if (flags & BlockTransformation::BT_XorInput)
1196  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1197 
1198  if (flags & BlockTransformation::BT_InBlockIsCounter)
1199  const_cast<byte *>(inBlocks)[15]++;
1200 
1201  func1(block, subkeys, rounds);
1202 
1203  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1204  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1205 
1206  _mm_storeu_si128((__m128i *)(void *)outBlocks, block);
1207 
1208  inBlocks += inIncrement;
1209  outBlocks += outIncrement;
1210  xorBlocks += xorIncrement;
1211  length -= blockSize;
1212  }
1213 
1214  return length;
1215 }
1216 #endif
1217 
1218 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1219 {
1220 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1221  if (HasAESNI())
1222  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1223 #endif
1224 
1225 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1226  if (HasSSE2())
1227  {
1228  if (length < BLOCKSIZE)
1229  return length;
1230 
1231  struct Locals
1232  {
1233  word32 subkeys[4*12], workspace[8];
1234  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1235  byte *outBlocks;
1236  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1237  size_t regSpill, lengthAndCounterFlag, keysBegin;
1238  };
1239 
1240  const byte* zeros = (byte *)(Te+256);
1241  byte *space = NULL;
1242 
1243  do {
1244 #if (CRYPTOPP_MSC_VERSION >= 1400)
1245  // https://msdn.microsoft.com/en-us/library/5471dc8s.aspx
1246  space = (byte *)_malloca(255+sizeof(Locals));
1247  space += (256-(size_t)space%256)%256;
1248 #else
1249  space = (byte *)alloca(255+sizeof(Locals));
1250  space += (256-(size_t)space%256)%256;
1251 #endif
1252  }
1253  while (AliasedWithTable(space, space+sizeof(Locals)));
1254 
1255  size_t increment = BLOCKSIZE;
1256  if (flags & BT_ReverseDirection)
1257  {
1258  assert(length % BLOCKSIZE == 0);
1259  inBlocks += length - BLOCKSIZE;
1260  xorBlocks += length - BLOCKSIZE;
1261  outBlocks += length - BLOCKSIZE;
1262  increment = 0-increment;
1263  }
1264 
1265  Locals &locals = *(Locals *)(void *)space;
1266 
1267  locals.inBlocks = inBlocks;
1268  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1269  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1270  locals.outBlocks = outBlocks;
1271 
1272  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1273  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1274  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1275  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1276 
1277  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1278  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1279  locals.keysBegin = (12-keysToCopy)*16;
1280 
1281  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1282 
1283 #if (CRYPTOPP_MSC_VERSION >= 1400)
1284  _freea(space);
1285 #endif
1286 
1287  return length % BLOCKSIZE;
1288  }
1289 #endif
1290 
1291  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1292 }
1293 
1294 #endif
1295 
1296 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1297 
1298 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1299 {
1300  if (HasAESNI())
1301  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1302 
1303  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1304 }
1305 
1306 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1307 
1308 NAMESPACE_END
1309 
1310 #endif
1311 #endif