Crypto++  5.6.3
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
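// Editor's illustration (not part of the original file): the preload described above is done in
// ProcessAndXorBlock below. A minimal sketch of the pattern, assuming a table T, the cache line
// size in bytes, and state words s0..s3:
//
//   volatile word32 _u = 0;        // volatile, so the loads cannot be proven dead and removed
//   word32 u = _u;
//   for (unsigned int i = 0; i < sizeof(T); i += cacheLineSize)
//       u &= *(const word32 *)((const byte *)T + i);   // touch one word per cache line
//   s0 |= u; s1 |= u; s2 |= u; s3 |= u;                 // u is 0, so the state is unchanged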
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "stdcpp.h" // alloca
75 #include "misc.h"
76 #include "cpu.h"
77 
78 NAMESPACE_BEGIN(CryptoPP)
79 
80 // Hack for https://github.com/weidai11/cryptopp/issues/42 and https://github.com/weidai11/cryptopp/issues/132
81 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
82 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
83 #endif
84 
85 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
86 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
87 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
88 using namespace rdtable;
89 # else
90 static word64 Te[256];
91 # endif
92 static word64 Td[256];
93 #else // Not CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
94 # if defined(CRYPTOPP_X64_MASM_AVAILABLE)
95 // Unused; avoids linker error on Microsoft X64 non-AESNI platforms
96 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
97 # endif
98 static CRYPTOPP_ALIGN_DATA(16) word32 Te[256*4];
99 static CRYPTOPP_ALIGN_DATA(16) word32 Td[256*4];
100 #endif // CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
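// Editor's note on the table layout: with unaligned access allowed, Te/Td are 256 word64 entries
// (2 KB each) whose bytes hold the round-table word so that each of its four rotations can be read
// at a small byte offset. Otherwise they are four pre-rotated copies of 256 word32 entries
// (4 KB each), indexed as T[i*256 + x]. The two extra Te entries on the assembly path are zeroed
// in FillEncTable() and double as a 16-byte block of zeros.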
101 
102 static volatile bool s_TeFilled = false, s_TdFilled = false;
103 
104 // ************************* Portable Code ************************************
105 
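// Editor's note: QUARTER_ROUND consumes one state word t a byte at a time (least significant byte
// first), XORing a table lookup for each byte into the four output words a..d. The L parameter
// selects how the table entry is read: TL_F is used for the first round and TL_M for the middle
// rounds. QUARTER_ROUND_LE/LD perform the last round, storing plain S-box output bytes into tempBlock.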
106 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
107  a ^= L(T, 3, byte(t)); t >>= 8;\
108  b ^= L(T, 2, byte(t)); t >>= 8;\
109  c ^= L(T, 1, byte(t)); t >>= 8;\
110  d ^= L(T, 0, t);
111 
112 #define QUARTER_ROUND_LE(t, a, b, c, d) \
113  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
114  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
115  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
116  tempBlock[d] = ((byte *)(Te+t))[1];
117 
118 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
119  #define QUARTER_ROUND_LD(t, a, b, c, d) \
120  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
121  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
122  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
123  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
124 #else
125  #define QUARTER_ROUND_LD(t, a, b, c, d) \
126  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
127  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
128  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
129  tempBlock[d] = Sd[t];
130 #endif
131 
132 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
133 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
134 
135 #ifdef IS_LITTLE_ENDIAN
136  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
137  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
138  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
139  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (6-i)%4+1))
140  #define TL_M(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (i+3)%4+1))
141  #else
142  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
143  #define TL_M(T, i, x) T[i*256 + x]
144  #endif
145 #else
146  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
147  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
148  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
149  #define TL_F(T, i, x) (*(word32 *)(void *)((byte *)T + x*8 + (4-i)%4))
150  #define TL_M TL_F
151  #else
152  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
153  #define TL_M(T, i, x) T[i*256 + x]
154  #endif
155 #endif
156 
157 
158 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
159 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
160 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
161 
162 #define f3(x) (f2(x) ^ x)
163 #define f9(x) (f8(x) ^ x)
164 #define fb(x) (f8(x) ^ f2(x) ^ x)
165 #define fd(x) (f8(x) ^ f4(x) ^ x)
166 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
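// Editor's note: these macros multiply a byte by small constants in GF(2^8) modulo the AES
// polynomial x^8 + x^4 + x^3 + x + 1 (0x11b): f2 is multiplication by 2 ("xtime"), f3 by 3, and
// so on. Worked example: f2(0x57) = 0xae (high bit clear, no reduction);
// f2(0xae) = 0x15c ^ 0x11b = 0x47; f3(0x57) = f2(0x57) ^ 0x57 = 0xf9.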
167 
168 void Rijndael::Base::FillEncTable()
169 {
170  for (int i=0; i<256; i++)
171  {
172  byte x = Se[i];
173 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
174  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
175  Te[i] = word64(y | f3(x))<<32 | y;
176 #else
177  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
178  for (int j=0; j<4; j++)
179  {
180  Te[i+j*256] = y;
181  y = rotrFixed(y, 8);
182  }
183 #endif
184  }
185 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
186  Te[256] = Te[257] = 0;
187 #endif
188  s_TeFilled = true;
189 }
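// Editor's note: each Te entry packs the MixColumns multiples {02,01,01,03} of one S-box output.
// For i = 0: Se[0] = 0x63, f2(0x63) = 0xc6, f3(0x63) = 0xa5, so the table word is 0xc66363a5;
// the compressed word64 form stores its bytes so that each rotation of that word can be read at
// a byte offset.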
190 
191 void Rijndael::Base::FillDecTable()
192 {
193  for (int i=0; i<256; i++)
194  {
195  byte x = Sd[i];
196 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
197  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
198  Td[i] = word64(y | fb(x))<<32 | y | x;
199 #else
200  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
201  for (int j=0; j<4; j++)
202  {
203  Td[i+j*256] = y;
204  y = rotrFixed(y, 8);
205  }
206 #endif
207  }
208  s_TdFilled = true;
209 }
210 
211 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
212 {
213  AssertValidKeyLength(keylen);
214 
215  m_rounds = keylen/4 + 6;
216  m_key.New(4*(m_rounds+1));
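 // keylen is 16, 24 or 32 bytes, so m_rounds is 10, 12 or 14 and the schedule holds
 // 4*(m_rounds+1) = 44, 52 or 60 round-key words.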
217 
218  word32 *rk = m_key;
219 
220 #if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
221  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
222  if (HasAESNI())
223  {
224  static const word32 rcLE[] = {
225  0x01, 0x02, 0x04, 0x08,
226  0x10, 0x20, 0x40, 0x80,
227  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
228  };
229  const word32 *rc = rcLE;
230 
231  __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16));
232  memcpy(rk, userKey, keylen);
233 
234  while (true)
235  {
236  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
237  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
238  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
239  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
240 
241  if (rk + keylen/4 + 4 == m_key.end())
242  break;
243 
244  if (keylen == 24)
245  {
246  rk[10] = rk[ 4] ^ rk[ 9];
247  rk[11] = rk[ 5] ^ rk[10];
248  temp = _mm_insert_epi32(temp, rk[11], 3);
249  }
250  else if (keylen == 32)
251  {
252  temp = _mm_insert_epi32(temp, rk[11], 3);
253  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
254  rk[13] = rk[ 5] ^ rk[12];
255  rk[14] = rk[ 6] ^ rk[13];
256  rk[15] = rk[ 7] ^ rk[14];
257  temp = _mm_insert_epi32(temp, rk[15], 3);
258  }
259  else
260  temp = _mm_insert_epi32(temp, rk[7], 3);
261 
262  rk += keylen/4;
263  }
264 
265  if (!IsForwardTransformation())
266  {
267  rk = m_key;
268  unsigned int i, j;
269 
270  std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds));
271 
272  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
273  {
274  temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
275  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j));
276  *(__m128i *)(void *)(rk+j) = temp;
277  }
278 
279  *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i));
280  }
281 
282  return;
283  }
284 #endif
285 
286  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
287  const word32 *rc = rcon;
288  word32 temp;
289 
290  while (true)
291  {
292  temp = rk[keylen/4-1];
293  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
294  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
295  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
296  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
297  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
298 
299  if (rk + keylen/4 + 4 == m_key.end())
300  break;
301 
302  if (keylen == 24)
303  {
304  rk[10] = rk[ 4] ^ rk[ 9];
305  rk[11] = rk[ 5] ^ rk[10];
306  }
307  else if (keylen == 32)
308  {
309  temp = rk[11];
310  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
311  rk[13] = rk[ 5] ^ rk[12];
312  rk[14] = rk[ 6] ^ rk[13];
313  rk[15] = rk[ 7] ^ rk[14];
314  }
315  rk += keylen/4;
316  }
317 
318  rk = m_key;
319 
320  if (IsForwardTransformation())
321  {
322  if (!s_TeFilled)
323  FillEncTable();
324 
325  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
326  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
327  }
328  else
329  {
330  if (!s_TdFilled)
331  FillDecTable();
332 
333  unsigned int i, j;
334 
335 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
336 
337  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
338  {
339  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
340  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
341  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
342  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
343  }
344 
345  rk[i+0] = InverseMixColumn(rk[i+0]);
346  rk[i+1] = InverseMixColumn(rk[i+1]);
347  rk[i+2] = InverseMixColumn(rk[i+2]);
348  rk[i+3] = InverseMixColumn(rk[i+3]);
349 
350  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
351  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
352  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
353  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
354  }
355 
356 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
357  if (HasAESNI())
358  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
359 #endif
360 }
361 
362 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
363 {
364 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
365 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
366  if (HasSSE2())
367 #else
368  if (HasAESNI())
369 #endif
370  {
371  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
372  }
373 #endif
374 
375  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
376 
377  word32 s0, s1, s2, s3, t0, t1, t2, t3;
378  Block::Get(inBlock)(s0)(s1)(s2)(s3);
379 
380  const word32 *rk = m_key;
381  s0 ^= rk[0];
382  s1 ^= rk[1];
383  s2 ^= rk[2];
384  s3 ^= rk[3];
385  t0 = rk[4];
386  t1 = rk[5];
387  t2 = rk[6];
388  t3 = rk[7];
389  rk += 8;
390 
391  // timing attack countermeasure. see comments at top for more details.
392  // also see http://github.com/weidai11/cryptopp/issues/146
393  const int cacheLineSize = GetCacheLineSize();
394  unsigned int i;
395  volatile word32 _u = 0;
396  word32 u = _u;
397 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
398  for (i=0; i<2048; i+=cacheLineSize)
399 #else
400  for (i=0; i<1024; i+=cacheLineSize)
401 #endif
402  u &= *(const word32 *)(const void *)(((const byte *)Te)+i);
403  u &= Te[255];
404  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
405 
406  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
407  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
408  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
409  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
410 
411  // Nr - 2 full rounds:
412  unsigned int r = m_rounds/2 - 1;
413  do
414  {
415  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
416 
417  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
418  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
419  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
420  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
421 
422  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
423 
424  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
425  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
426  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
427  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
428 
429  rk += 8;
430  } while (--r);
431 
432  word32 tbw[4];
433  byte *const tempBlock = (byte *)tbw;
434 
435  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
436  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
437  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
438  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
439 
440  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
441 }
442 
443 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
444 {
445 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
446  if (HasAESNI())
447  {
448  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
449  return;
450  }
451 #endif
452 
453  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
454 
455  word32 s0, s1, s2, s3, t0, t1, t2, t3;
456  Block::Get(inBlock)(s0)(s1)(s2)(s3);
457 
458  const word32 *rk = m_key;
459  s0 ^= rk[0];
460  s1 ^= rk[1];
461  s2 ^= rk[2];
462  s3 ^= rk[3];
463  t0 = rk[4];
464  t1 = rk[5];
465  t2 = rk[6];
466  t3 = rk[7];
467  rk += 8;
468 
469  // timing attack countermeasure. see comments at top for more details.
470  // also see http://github.com/weidai11/cryptopp/issues/146
471  const int cacheLineSize = GetCacheLineSize();
472  unsigned int i;
473  volatile word32 _u = 0;
474  word32 u = _u;
475 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
476  for (i=0; i<2048; i+=cacheLineSize)
477 #else
478  for (i=0; i<1024; i+=cacheLineSize)
479 #endif
480  u &= *(const word32 *)(const void *)(((const byte *)Td)+i);
481  u &= Td[255];
482  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
483 
484  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
485  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
486  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
487  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
488 
489  // Nr - 2 full rounds:
490  unsigned int r = m_rounds/2 - 1;
491  do
492  {
493  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
494 
495  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
496  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
497  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
498  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
499 
500  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
501 
502  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
503  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
504  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
505  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
506 
507  rk += 8;
508  } while (--r);
509 
510 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
511  // timing attack countermeasure. see comments at top for more details
512  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
513  // QUARTER_ROUND_LD will use Td, which is already preloaded.
514  u = _u;
515  for (i=0; i<256; i+=cacheLineSize)
516  u &= *(const word32 *)(const void *)(Sd+i);
517  u &= *(const word32 *)(const void *)(Sd+252);
518  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
519 #endif
520 
521  word32 tbw[4];
522  byte *const tempBlock = (byte *)tbw;
523 
524  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
525  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
526  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
527  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
528 
529  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
530 }
531 
532 // ************************* Assembly Code ************************************
533 
534 #if CRYPTOPP_MSC_VERSION
535 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
536 #endif
537 
538 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
539 
540 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
541 
542 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
543 {
544  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
545 
546 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
547 
548 #define L_REG esp
549 #define L_INDEX(i) (L_REG+768+i)
550 #define L_INXORBLOCKS L_INBLOCKS+4
551 #define L_OUTXORBLOCKS L_INBLOCKS+8
552 #define L_OUTBLOCKS L_INBLOCKS+12
553 #define L_INCREMENTS L_INDEX(16*15)
554 #define L_SP L_INDEX(16*16)
555 #define L_LENGTH L_INDEX(16*16+4)
556 #define L_KEYS_BEGIN L_INDEX(16*16+8)
557 
558 #define MOVD movd
559 #define MM(i) mm##i
560 
561 #define MXOR(a,b,c) \
562  AS2( movzx esi, b)\
563  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
564  AS2( pxor MM(a), mm7)\
565 
566 #define MMOV(a,b,c) \
567  AS2( movzx esi, b)\
568  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
569 
570 #else
571 
572 #define L_REG r8
573 #define L_INDEX(i) (L_REG+i)
574 #define L_INXORBLOCKS L_INBLOCKS+8
575 #define L_OUTXORBLOCKS L_INBLOCKS+16
576 #define L_OUTBLOCKS L_INBLOCKS+24
577 #define L_INCREMENTS L_INDEX(16*16)
578 #define L_LENGTH L_INDEX(16*18+8)
579 #define L_KEYS_BEGIN L_INDEX(16*19)
580 
581 #define MOVD mov
582 #define MM_0 r9d
583 #define MM_1 r12d
584 #ifdef __GNUC__
585 #define MM_2 r11d
586 #else
587 #define MM_2 r10d
588 #endif
589 #define MM(i) MM_##i
590 
591 #define MXOR(a,b,c) \
592  AS2( movzx esi, b)\
593  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
594 
595 #define MMOV(a,b,c) \
596  AS2( movzx esi, b)\
597  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
598 
599 #endif
600 
601 #define L_SUBKEYS L_INDEX(0)
602 #define L_SAVED_X L_SUBKEYS
603 #define L_KEY12 L_INDEX(16*12)
604 #define L_LASTROUND L_INDEX(16*13)
605 #define L_INBLOCKS L_INDEX(16*14)
606 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
607 
608 #define XOR(a,b,c) \
609  AS2( movzx esi, b)\
610  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
611 
612 #define MOV(a,b,c) \
613  AS2( movzx esi, b)\
614  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
615 
616 #ifdef CRYPTOPP_GENERATE_X64_MASM
617  ALIGN 8
618  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
619  rex_push_reg rsi
620  push_reg rdi
621  push_reg rbx
622  push_reg r12
623  .endprolog
624  mov L_REG, rcx
625  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
626  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
627 #elif defined(__GNUC__)
628  __asm__ __volatile__
629  (
630  INTEL_NOPREFIX
631  #if CRYPTOPP_BOOL_X64
632  AS2( mov L_REG, rcx)
633  #endif
634  AS_PUSH_IF86(bx)
635  AS_PUSH_IF86(bp)
636  AS2( mov AS_REG_7, WORD_REG(si))
637 #else
638  AS_PUSH_IF86(si)
639  AS_PUSH_IF86(di)
640  AS_PUSH_IF86(bx)
641  AS_PUSH_IF86(bp)
642  AS2( lea AS_REG_7, [Te])
643  AS2( mov edi, [g_cacheLineSize])
644 #endif
645 
646 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
647  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
648  AS2( lea esp, [ecx-768])
649 #endif
650 
651  // copy subkeys to stack
652  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
653  AS2( mov WORD_REG(ax), 16)
654  AS2( and WORD_REG(ax), WORD_REG(si))
655  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
656  AS2( movdqa [L_KEY12], xmm3)
657  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
658  AS2( sub WORD_REG(ax), WORD_REG(si))
659  ASL(0)
660  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
661  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
662  AS2( add WORD_REG(si), 16)
663  AS2( cmp WORD_REG(si), 16*12)
664  ATT_NOPREFIX
665  ASJ( jl, 0, b)
666  INTEL_NOPREFIX
667 
668  // read subkeys 0, 1 and last
669  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
670  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
671  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
672  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
673  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
674  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
675 
676  // load table into cache
677  AS2( xor WORD_REG(ax), WORD_REG(ax))
678  ASL(9)
679  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
680  AS2( add WORD_REG(ax), WORD_REG(di))
681  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
682  AS2( add WORD_REG(ax), WORD_REG(di))
683  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
684  AS2( add WORD_REG(ax), WORD_REG(di))
685  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
686  AS2( add WORD_REG(ax), WORD_REG(di))
687  AS2( cmp WORD_REG(ax), 2048)
688  ATT_NOPREFIX
689  ASJ( jl, 9, b)
690  INTEL_NOPREFIX
691  AS1( lfence)
692 
693  AS2( test DWORD PTR [L_LENGTH], 1)
694  ATT_NOPREFIX
695  ASJ( jz, 8, f)
696  INTEL_NOPREFIX
697 
698  // counter mode one-time setup
699  AS2( mov WORD_REG(si), [L_INBLOCKS])
700  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
701  AS2( pxor xmm2, xmm1)
702  AS2( psrldq xmm1, 14)
703  AS2( movd eax, xmm1)
704  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
705  AS2( MOVD MM(2), eax)
706 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
707  AS2( mov eax, 1)
708  AS2( movd mm3, eax)
709 #endif
710 
711  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
712  AS2( movd eax, xmm2)
713  AS2( psrldq xmm2, 4)
714  AS2( movd edi, xmm2)
715  AS2( psrldq xmm2, 4)
716  MXOR( 1, al, 0) // 0
717  XOR( edx, ah, 1) // 1
718  AS2( shr eax, 16)
719  XOR( ecx, al, 2) // 2
720  XOR( ebx, ah, 3) // 3
721  AS2( mov eax, edi)
722  AS2( movd edi, xmm2)
723  AS2( psrldq xmm2, 4)
724  XOR( ebx, al, 0) // 4
725  MXOR( 1, ah, 1) // 5
726  AS2( shr eax, 16)
727  XOR( edx, al, 2) // 6
728  XOR( ecx, ah, 3) // 7
729  AS2( mov eax, edi)
730  AS2( movd edi, xmm2)
731  XOR( ecx, al, 0) // 8
732  XOR( ebx, ah, 1) // 9
733  AS2( shr eax, 16)
734  MXOR( 1, al, 2) // 10
735  XOR( edx, ah, 3) // 11
736  AS2( mov eax, edi)
737  XOR( edx, al, 0) // 12
738  XOR( ecx, ah, 1) // 13
739  AS2( shr eax, 16)
740  XOR( ebx, al, 2) // 14
741  AS2( psrldq xmm2, 3)
742 
743  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
744  AS2( mov eax, [L_KEY12+0*4])
745  AS2( mov edi, [L_KEY12+2*4])
746  AS2( MOVD MM(0), [L_KEY12+3*4])
747  MXOR( 0, cl, 3) /* 11 */
748  XOR( edi, bl, 3) /* 7 */
749  MXOR( 0, bh, 2) /* 6 */
750  AS2( shr ebx, 16) /* 4,5 */
751  XOR( eax, bl, 1) /* 5 */
752  MOV( ebx, bh, 0) /* 4 */
753  AS2( xor ebx, [L_KEY12+1*4])
754  XOR( eax, ch, 2) /* 10 */
755  AS2( shr ecx, 16) /* 8,9 */
756  XOR( eax, dl, 3) /* 15 */
757  XOR( ebx, dh, 2) /* 14 */
758  AS2( shr edx, 16) /* 12,13 */
759  XOR( edi, ch, 0) /* 8 */
760  XOR( ebx, cl, 1) /* 9 */
761  XOR( edi, dl, 1) /* 13 */
762  MXOR( 0, dh, 0) /* 12 */
763 
764  AS2( movd ecx, xmm2)
765  AS2( MOVD edx, MM(1))
766  AS2( MOVD [L_SAVED_X+3*4], MM(0))
767  AS2( mov [L_SAVED_X+0*4], eax)
768  AS2( mov [L_SAVED_X+1*4], ebx)
769  AS2( mov [L_SAVED_X+2*4], edi)
770  ATT_NOPREFIX
771  ASJ( jmp, 5, f)
772  INTEL_NOPREFIX
773  ASL(3)
774  // non-counter mode per-block setup
775  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
776  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
777  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
778  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
779  ASL(8)
780  AS2( mov WORD_REG(ax), [L_INBLOCKS])
781  AS2( movdqu xmm2, [WORD_REG(ax)])
782  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
783  AS2( movdqu xmm5, [WORD_REG(si)])
784  AS2( pxor xmm2, xmm1)
785  AS2( pxor xmm2, xmm5)
786 
787  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
788  AS2( movd eax, xmm2)
789  AS2( psrldq xmm2, 4)
790  AS2( movd edi, xmm2)
791  AS2( psrldq xmm2, 4)
792  MXOR( 1, al, 0) // 0
793  XOR( edx, ah, 1) // 1
794  AS2( shr eax, 16)
795  XOR( ecx, al, 2) // 2
796  XOR( ebx, ah, 3) // 3
797  AS2( mov eax, edi)
798  AS2( movd edi, xmm2)
799  AS2( psrldq xmm2, 4)
800  XOR( ebx, al, 0) // 4
801  MXOR( 1, ah, 1) // 5
802  AS2( shr eax, 16)
803  XOR( edx, al, 2) // 6
804  XOR( ecx, ah, 3) // 7
805  AS2( mov eax, edi)
806  AS2( movd edi, xmm2)
807  XOR( ecx, al, 0) // 8
808  XOR( ebx, ah, 1) // 9
809  AS2( shr eax, 16)
810  MXOR( 1, al, 2) // 10
811  XOR( edx, ah, 3) // 11
812  AS2( mov eax, edi)
813  XOR( edx, al, 0) // 12
814  XOR( ecx, ah, 1) // 13
815  AS2( shr eax, 16)
816  XOR( ebx, al, 2) // 14
817  MXOR( 1, ah, 3) // 15
818  AS2( MOVD eax, MM(1))
819 
820  AS2( add L_REG, [L_KEYS_BEGIN])
821  AS2( add L_REG, 4*16)
822  ATT_NOPREFIX
823  ASJ( jmp, 2, f)
824  INTEL_NOPREFIX
825  ASL(1)
826  // counter-mode per-block setup
827  AS2( MOVD ecx, MM(2))
828  AS2( MOVD edx, MM(1))
829  AS2( mov eax, [L_SAVED_X+0*4])
830  AS2( mov ebx, [L_SAVED_X+1*4])
831  AS2( xor cl, ch)
832  AS2( and WORD_REG(cx), 255)
833  ASL(5)
834 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
835  AS2( paddb MM(2), mm3)
836 #else
837  AS2( add MM(2), 1)
838 #endif
839  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
840  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
841  XOR( ebx, dl, 3)
842  MOV( ecx, dh, 2)
843  AS2( shr edx, 16)
844  AS2( xor ecx, [L_SAVED_X+2*4])
845  XOR( eax, dh, 0)
846  MOV( edx, dl, 1)
847  AS2( xor edx, [L_SAVED_X+3*4])
848 
849  AS2( add L_REG, [L_KEYS_BEGIN])
850  AS2( add L_REG, 3*16)
851  ATT_NOPREFIX
852  ASJ( jmp, 4, f)
853  INTEL_NOPREFIX
854 
855 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
856 // out: eax, ebx, edi, mm0
857 #define ROUND() \
858  MXOR( 0, cl, 3) /* 11 */\
859  AS2( mov cl, al) /* 8,9,10,3 */\
860  XOR( edi, ah, 2) /* 2 */\
861  AS2( shr eax, 16) /* 0,1 */\
862  XOR( edi, bl, 3) /* 7 */\
863  MXOR( 0, bh, 2) /* 6 */\
864  AS2( shr ebx, 16) /* 4,5 */\
865  MXOR( 0, al, 1) /* 1 */\
866  MOV( eax, ah, 0) /* 0 */\
867  XOR( eax, bl, 1) /* 5 */\
868  MOV( ebx, bh, 0) /* 4 */\
869  XOR( eax, ch, 2) /* 10 */\
870  XOR( ebx, cl, 3) /* 3 */\
871  AS2( shr ecx, 16) /* 8,9 */\
872  XOR( eax, dl, 3) /* 15 */\
873  XOR( ebx, dh, 2) /* 14 */\
874  AS2( shr edx, 16) /* 12,13 */\
875  XOR( edi, ch, 0) /* 8 */\
876  XOR( ebx, cl, 1) /* 9 */\
877  XOR( edi, dl, 1) /* 13 */\
878  MXOR( 0, dh, 0) /* 12 */\
879 
880  ASL(2) // 2-round loop
881  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
882  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
883  ROUND()
884  AS2( mov ecx, edi)
885  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
886  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
887  AS2( MOVD edx, MM(0))
888 
889  ASL(4)
890  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
891  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
892  ROUND()
893  AS2( mov ecx, edi)
894  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
895  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
896  AS2( MOVD edx, MM(0))
897 
898  AS2( add L_REG, 32)
899  AS2( test L_REG, 255)
900  ATT_NOPREFIX
901  ASJ( jnz, 2, b)
902  INTEL_NOPREFIX
903  AS2( sub L_REG, 16*16)
904 
905 #define LAST(a, b, c) \
906  AS2( movzx esi, a )\
907  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
908  AS2( movzx esi, b )\
909  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
910  AS2( mov WORD PTR [L_LASTROUND+c], di )\
911 
912  // last round
913  LAST(ch, dl, 2)
914  LAST(dh, al, 6)
915  AS2( shr edx, 16)
916  LAST(ah, bl, 10)
917  AS2( shr eax, 16)
918  LAST(bh, cl, 14)
919  AS2( shr ebx, 16)
920  LAST(dh, al, 12)
921  AS2( shr ecx, 16)
922  LAST(ah, bl, 0)
923  LAST(bh, cl, 4)
924  LAST(ch, dl, 8)
925 
926  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
927  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
928 
929  AS2( mov WORD_REG(cx), [L_LENGTH])
930  AS2( sub WORD_REG(cx), 16)
931 
932  AS2( movdqu xmm2, [WORD_REG(ax)])
933  AS2( pxor xmm2, xmm4)
934 
935 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
936  AS2( movdqa xmm0, [L_INCREMENTS])
937  AS2( paddd xmm0, [L_INBLOCKS])
938  AS2( movdqa [L_INBLOCKS], xmm0)
939 #else
940  AS2( movdqa xmm0, [L_INCREMENTS+16])
941  AS2( paddq xmm0, [L_INBLOCKS+16])
942  AS2( movdqa [L_INBLOCKS+16], xmm0)
943 #endif
944 
945  AS2( pxor xmm2, [L_LASTROUND])
946  AS2( movdqu [WORD_REG(bx)], xmm2)
947 
948  ATT_NOPREFIX
949  ASJ( jle, 7, f)
950  INTEL_NOPREFIX
951  AS2( mov [L_LENGTH], WORD_REG(cx))
952  AS2( test WORD_REG(cx), 1)
953  ATT_NOPREFIX
954  ASJ( jnz, 1, b)
955  INTEL_NOPREFIX
956 #if CRYPTOPP_BOOL_X64
957  AS2( movdqa xmm0, [L_INCREMENTS])
958  AS2( paddq xmm0, [L_INBLOCKS])
959  AS2( movdqa [L_INBLOCKS], xmm0)
960 #endif
961  ATT_NOPREFIX
962  ASJ( jmp, 3, b)
963  INTEL_NOPREFIX
964 
965  ASL(7)
966  // erase keys on stack
967  AS2( xorps xmm0, xmm0)
968  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
969  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
970  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
971  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
972  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
973  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
974  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
975  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
976  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
977  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
978  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
979  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
980  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
981  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
982  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
983 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
984  AS2( mov esp, [L_SP])
985  AS1( emms)
986 #endif
987  AS_POP_IF86(bp)
988  AS_POP_IF86(bx)
989 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
990  AS_POP_IF86(di)
991  AS_POP_IF86(si)
992  AS1(ret)
993 #endif
994 #ifdef CRYPTOPP_GENERATE_X64_MASM
995  pop r12
996  pop rbx
997  pop rdi
998  pop rsi
999  ret
1000  Rijndael_Enc_AdvancedProcessBlocks ENDP
1001 #endif
1002 #ifdef __GNUC__
1003  ATT_PREFIX
1004  :
1005  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
1006  : "memory", "cc", "%eax"
1007  #if CRYPTOPP_BOOL_X64
1008  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
1009  #endif
1010  );
1011 #endif
1012 }
1013 
1014 #endif
1015 
1016 #ifndef CRYPTOPP_GENERATE_X64_MASM
1017 
1018 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
1019 extern "C" {
1020 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
1021 }
1022 #endif
1023 
1024 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1025 
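// Editor's note: AliasedWithTable() reports whether any byte of [begin, end) shares an offset
// modulo 4096 with the Te table. The SSE2 path below retries its alloca'd workspace until it does
// not alias the table, so stack accesses cannot evict the preloaded table entries from the same
// cache sets.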
1026 static inline bool AliasedWithTable(const byte *begin, const byte *end)
1027 {
1028  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
1029  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
1030  if (t1 > t0)
1031  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
1032  else
1033  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1034 }
1035 
1036 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1037 
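// Editor's note: the AES-NI helpers below keep each block in an XMM register: a whitening XOR with
// subkeys[0], then rounds-1 aesenc/aesdec steps, then a final aesenclast/aesdeclast. For AES-128
// (rounds == 10) that is 9 aesenc plus 1 aesenclast per block; the 4-block variants interleave
// four blocks to keep the pipelined AES units busy. The decryption subkeys were already converted
// with aesimc in UncheckedSetKey().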
1038 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1039 {
1040  block = _mm_xor_si128(block, subkeys[0]);
1041  for (unsigned int i=1; i<rounds-1; i+=2)
1042  {
1043  block = _mm_aesenc_si128(block, subkeys[i]);
1044  block = _mm_aesenc_si128(block, subkeys[i+1]);
1045  }
1046  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1047  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1048 }
1049 
1050 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1051 {
1052  __m128i rk = subkeys[0];
1053  block0 = _mm_xor_si128(block0, rk);
1054  block1 = _mm_xor_si128(block1, rk);
1055  block2 = _mm_xor_si128(block2, rk);
1056  block3 = _mm_xor_si128(block3, rk);
1057  for (unsigned int i=1; i<rounds; i++)
1058  {
1059  rk = subkeys[i];
1060  block0 = _mm_aesenc_si128(block0, rk);
1061  block1 = _mm_aesenc_si128(block1, rk);
1062  block2 = _mm_aesenc_si128(block2, rk);
1063  block3 = _mm_aesenc_si128(block3, rk);
1064  }
1065  rk = subkeys[rounds];
1066  block0 = _mm_aesenclast_si128(block0, rk);
1067  block1 = _mm_aesenclast_si128(block1, rk);
1068  block2 = _mm_aesenclast_si128(block2, rk);
1069  block3 = _mm_aesenclast_si128(block3, rk);
1070 }
1071 
1072 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1073 {
1074  block = _mm_xor_si128(block, subkeys[0]);
1075  for (unsigned int i=1; i<rounds-1; i+=2)
1076  {
1077  block = _mm_aesdec_si128(block, subkeys[i]);
1078  block = _mm_aesdec_si128(block, subkeys[i+1]);
1079  }
1080  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1081  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1082 }
1083 
1084 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1085 {
1086  __m128i rk = subkeys[0];
1087  block0 = _mm_xor_si128(block0, rk);
1088  block1 = _mm_xor_si128(block1, rk);
1089  block2 = _mm_xor_si128(block2, rk);
1090  block3 = _mm_xor_si128(block3, rk);
1091  for (unsigned int i=1; i<rounds; i++)
1092  {
1093  rk = subkeys[i];
1094  block0 = _mm_aesdec_si128(block0, rk);
1095  block1 = _mm_aesdec_si128(block1, rk);
1096  block2 = _mm_aesdec_si128(block2, rk);
1097  block3 = _mm_aesdec_si128(block3, rk);
1098  }
1099  rk = subkeys[rounds];
1100  block0 = _mm_aesdeclast_si128(block0, rk);
1101  block1 = _mm_aesdeclast_si128(block1, rk);
1102  block2 = _mm_aesdeclast_si128(block2, rk);
1103  block3 = _mm_aesdeclast_si128(block3, rk);
1104 }
1105 
1106 static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
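// Editor's note: s_one is the 128-bit constant with a 1 in the last byte of the block (1<<24 in
// the last little-endian word32 lane). Adding it with _mm_add_epi32 steps the low-order byte of a
// big-endian counter block; the CTR-mode caller is responsible for carries beyond that byte.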
1107 
1108 template <typename F1, typename F4>
1109 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1110 {
1111  size_t blockSize = 16;
1112  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1113  size_t xorIncrement = xorBlocks ? blockSize : 0;
1114  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1115 
1116  if (flags & BlockTransformation::BT_ReverseDirection)
1117  {
1118  assert(length % blockSize == 0);
1119  inBlocks += length - blockSize;
1120  xorBlocks += length - blockSize;
1121  outBlocks += length - blockSize;
1122  inIncrement = 0-inIncrement;
1123  xorIncrement = 0-xorIncrement;
1124  outIncrement = 0-outIncrement;
1125  }
1126 
1127  if (flags & BlockTransformation::BT_AllowParallel)
1128  {
1129  while (length >= 4*blockSize)
1130  {
1131  __m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
1132  if (flags & BlockTransformation::BT_InBlockIsCounter)
1133  {
1134  const __m128i be1 = *(const __m128i *)(const void *)s_one;
1135  block1 = _mm_add_epi32(block0, be1);
1136  block2 = _mm_add_epi32(block1, be1);
1137  block3 = _mm_add_epi32(block2, be1);
1138  _mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
1139  }
1140  else
1141  {
1142  inBlocks += inIncrement;
1143  block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1144  inBlocks += inIncrement;
1145  block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1146  inBlocks += inIncrement;
1147  block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1148  inBlocks += inIncrement;
1149  }
1150 
1151  if (flags & BlockTransformation::BT_XorInput)
1152  {
1153  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1154  xorBlocks += xorIncrement;
1155  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1156  xorBlocks += xorIncrement;
1157  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1158  xorBlocks += xorIncrement;
1159  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1160  xorBlocks += xorIncrement;
1161  }
1162 
1163  func4(block0, block1, block2, block3, subkeys, rounds);
1164 
1165  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1166  {
1167  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1168  xorBlocks += xorIncrement;
1169  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1170  xorBlocks += xorIncrement;
1171  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1172  xorBlocks += xorIncrement;
1173  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1174  xorBlocks += xorIncrement;
1175  }
1176 
1177  _mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
1178  outBlocks += outIncrement;
1179  _mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
1180  outBlocks += outIncrement;
1181  _mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
1182  outBlocks += outIncrement;
1183  _mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
1184  outBlocks += outIncrement;
1185 
1186  length -= 4*blockSize;
1187  }
1188  }
1189 
1190  while (length >= blockSize)
1191  {
1192  __m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
1193 
1194  if (flags & BlockTransformation::BT_XorInput)
1195  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1196 
1197  if (flags & BlockTransformation::BT_InBlockIsCounter)
1198  const_cast<byte *>(inBlocks)[15]++;
1199 
1200  func1(block, subkeys, rounds);
1201 
1202  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1203  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
1204 
1205  _mm_storeu_si128((__m128i *)(void *)outBlocks, block);
1206 
1207  inBlocks += inIncrement;
1208  outBlocks += outIncrement;
1209  xorBlocks += xorIncrement;
1210  length -= blockSize;
1211  }
1212 
1213  return length;
1214 }
1215 #endif
1216 
1217 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1218 {
1219 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1220  if (HasAESNI())
1221  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1222 #endif
1223 
1224 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1225  if (HasSSE2())
1226  {
1227  if (length < BLOCKSIZE)
1228  return length;
1229 
1230  struct Locals
1231  {
1232  word32 subkeys[4*12], workspace[8];
1233  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1234  byte *outBlocks;
1235  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1236  size_t regSpill, lengthAndCounterFlag, keysBegin;
1237  };
1238 
1239  const byte* zeros = (byte *)(Te+256);
1240  byte *space = NULL;
1241 
1242  do {
1243 #if (CRYPTOPP_MSC_VERSION >= 1400)
1244  // https://msdn.microsoft.com/en-us/library/5471dc8s.aspx
1245  space = (byte *)_malloca(255+sizeof(Locals));
1246  space += (256-(size_t)space%256)%256;
1247 #else
1248  space = (byte *)alloca(255+sizeof(Locals));
1249  space += (256-(size_t)space%256)%256;
1250 #endif
1251  }
1252  while (AliasedWithTable(space, space+sizeof(Locals)));
1253 
1254  size_t increment = BLOCKSIZE;
1255  if (flags & BT_ReverseDirection)
1256  {
1257  assert(length % BLOCKSIZE == 0);
1258  inBlocks += length - BLOCKSIZE;
1259  xorBlocks += length - BLOCKSIZE;
1260  outBlocks += length - BLOCKSIZE;
1261  increment = 0-increment;
1262  }
1263 
1264  Locals &locals = *(Locals *)(void *)space;
1265 
1266  locals.inBlocks = inBlocks;
1267  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1268  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1269  locals.outBlocks = outBlocks;
1270 
1271  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1272  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1273  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1274  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1275 
1276  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1277  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1278  locals.keysBegin = (12-keysToCopy)*16;
1279 
1280  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1281 
1282 #if (CRYPTOPP_MSC_VERSION >= 1400)
1283  _freea(space);
1284 #endif
1285 
1286  return length % BLOCKSIZE;
1287  }
1288 #endif
1289 
1290  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1291 }
1292 
1293 #endif
1294 
1295 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1296 
1297 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1298 {
1299  if (HasAESNI())
1300  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1301 
1302  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1303 }
1304 
1305 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1306 
1307 NAMESPACE_END
1308 
1309 #endif
1310 #endif