Crypto++ 5.6.3
Free C++ class library of cryptographic schemes
gcm.cpp
1 // gcm.cpp - written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM gcm.cpp" to generate MASM code
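// (/P writes the preprocessed output to a file and /EP suppresses #line
// directives in it; the result is then assembled as MASM code and linked in
// when CRYPTOPP_X64_MASM_AVAILABLE is in effect.)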
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #if CRYPTOPP_MSC_VERSION
9 # pragma warning(disable: 4189)	// local variable is initialized but not referenced
10 #endif
11 
12 #ifndef CRYPTOPP_IMPORTS
13 #ifndef CRYPTOPP_GENERATE_X64_MASM
14 
15 // Clang 3.3's integrated assembler crashes on Linux, so disable inline assembly there
16 #if defined(CRYPTOPP_CLANG_VERSION) && (CRYPTOPP_CLANG_VERSION < 30400)
17 # undef CRYPTOPP_X86_ASM_AVAILABLE
18 # undef CRYPTOPP_X32_ASM_AVAILABLE
19 # undef CRYPTOPP_X64_ASM_AVAILABLE
20 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
21 # undef CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
22 # define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
23 # define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
24 #endif
25 
26 #include "gcm.h"
27 #include "cpu.h"
28 
29 NAMESPACE_BEGIN(CryptoPP)
30 
31 word16 GCM_Base::s_reductionTable[256];
32 volatile bool GCM_Base::s_reductionTableInitialized = false;
33 
34 void GCM_Base::GCTR::IncrementCounterBy256()
35 {
36  IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
37 }
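// The counter block ends in a 32-bit big-endian counter. After 256 blocks the
// low byte is back to its starting value, so only the upper three bytes of
// that word (hence m_counterArray+BlockSize()-4 with length 3) need a carry.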
38 
39 #if 0
40 // preserved for testing
41 void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
42 {
43  word64 Z0=0, Z1=0, V0, V1;
44 
45  typedef BlockGetAndPut<word64, BigEndian> Block;
46  Block::Get(a)(V0)(V1);
47 
48  for (int i=0; i<16; i++)
49  {
50  for (int j=0x80; j!=0; j>>=1)
51  {
52  int x = b[i] & j;
53  Z0 ^= x ? V0 : 0;
54  Z1 ^= x ? V1 : 0;
55  x = (int)V1 & 1;
56  V1 = (V1>>1) | (V0<<63);
57  V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
58  }
59  }
60  Block::Put(NULL, c)(Z0)(Z1);
61 }
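// GHASH reduces modulo x^128 + x^7 + x^2 + x + 1 in bit-reflected coordinates:
// within each byte, coefficient 0 sits in the most significant bit. Reflected,
// the low terms of the reduction polynomial become the byte 0xe1 (1110 0001),
// which is why a bit shifted out of V folds back in as 0xe1 << 56 at the top
// of V0.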
62 
63 __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
64 {
65  word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
66  word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};
67 
68  PolynomialMod2 pa((byte *)A, 8);
69  PolynomialMod2 pb((byte *)B, 8);
70  PolynomialMod2 c = pa*pb;
71 
72  __m128i output;
73  for (int i=0; i<16; i++)
74  ((byte *)&output)[i] = c.GetByte(i);
75  return output;
76 }
77 #endif
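#if 0
// A minimal standalone sanity check for the reference multiplier above, kept
// out of the build like the code it exercises. The helpers below are
// illustrative only (not Crypto++ APIs); they spell out the big-endian
// loads/stores so the snippet compiles with just the standard headers.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t load_be64(const unsigned char *p)
{
	uint64_t v = 0;
	for (int i=0; i<8; i++)
		v = (v << 8) | p[i];
	return v;
}

static void store_be64(unsigned char *p, uint64_t v)
{
	for (int i=7; i>=0; i--)
	{
		p[i] = (unsigned char)(v & 0xff);
		v >>= 8;
	}
}

// Same schoolbook loop as gcm_gf_mult, without the library Block helpers.
static void gf128_mul(const unsigned char *a, const unsigned char *b, unsigned char *c)
{
	uint64_t Z0=0, Z1=0, V0=load_be64(a), V1=load_be64(a+8);
	for (int i=0; i<16; i++)
		for (int j=0x80; j!=0; j>>=1)
		{
			uint64_t m = (b[i] & j) ? ~(uint64_t)0 : 0;
			Z0 ^= V0 & m;
			Z1 ^= V1 & m;
			uint64_t lsb = V1 & 1;
			V1 = (V1>>1) | (V0<<63);
			V0 = (V0>>1) ^ (lsb ? (uint64_t)0xe1 << 56 : 0);
		}
	store_be64(c, Z0);
	store_be64(c+8, Z1);
}

static void gf128_self_test()
{
	// In GCM's reflected representation the multiplicative identity is the
	// block 0x80 00 ... 00, so multiplying by it must return the input.
	unsigned char h[16] = {0x66, 0xe9, 0x4b, 0xd4}, one[16] = {0x80}, out[16];
	gf128_mul(h, one, out);
	assert(memcmp(h, out, 16) == 0);
}
#endif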
78 
79 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
80 inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
81 {
82 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
83  assert(IsAlignedOn(a,GetAlignmentOf<__m128i>()));
84  assert(IsAlignedOn(b,GetAlignmentOf<__m128i>()));
85  assert(IsAlignedOn(c,GetAlignmentOf<__m128i>()));
86  *(__m128i *)(void *)a = _mm_xor_si128(*(__m128i *)(void *)b, *(__m128i *)(void *)c);
87 #else
88  asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0]));
89 #endif
90 }
91 #endif
92 
93 inline static void Xor16(byte *a, const byte *b, const byte *c)
94 {
95  assert(IsAlignedOn(a,GetAlignmentOf<word64>()));
96  assert(IsAlignedOn(b,GetAlignmentOf<word64>()));
97  assert(IsAlignedOn(c,GetAlignmentOf<word64>()));
98  ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
99  ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
100 }
101 
102 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
103 static CRYPTOPP_ALIGN_DATA(16) const word64 s_clmulConstants64[] = {
104  W64LIT(0xe100000000000000), W64LIT(0xc200000000000000),
105  W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
106  W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
107 static const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64;
108 static const unsigned int s_clmulTableSizeInBlocks = 8;
109 
110 inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
111 {
112  /*
113  The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
114 significant half of c0 as a polynomial, which, due to GCM's bit reflection, sits in the
115 rightmost bit positions and at the lowest byte addresses.
116 
117  c1 ^= c0t * 0xc200000000000000
118  c2t ^= c0t
119  t = shift (c1t ^ c0b) left 1 bit
120  c2 ^= t * 0xe100000000000000
121  c2t ^= c1b
122  shift c2 left 1 bit and xor in lowest bit of c1t
123  */
124 #if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
125  c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
126 #else
127  c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
128 #endif
129  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
130  c0 = _mm_srli_si128(c0, 8);
131  c0 = _mm_xor_si128(c0, c1);
132  c0 = _mm_slli_epi64(c0, 1);
133  c0 = _mm_clmulepi64_si128(c0, r, 0);
134  c2 = _mm_xor_si128(c2, c0);
135  c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8));
136  c1 = _mm_unpacklo_epi64(c1, c2);
137  c1 = _mm_srli_epi64(c1, 63);
138  c2 = _mm_slli_epi64(c2, 1);
139  return _mm_xor_si128(c2, c1);
140 }
141 
142 inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
143 {
144  __m128i c0 = _mm_clmulepi64_si128(x,h,0);
145  __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
146  __m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
147 
148  return CLMUL_Reduce(c0, c1, c2, r);
149 }
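// Schoolbook expansion behind the three accumulators: writing x = x1*2^64 ^ x0
// and h = h1*2^64 ^ h0 (carry-less), x*h = x1h1*2^128 ^ (x1h0 ^ x0h1)*2^64 ^ x0h0.
// The PCLMULQDQ immediate selects the halves: bit 0 picks the half of the first
// operand and bit 4 the half of the second, so 0x00 -> x0*h0 (c0), 0x01 and
// 0x10 give the cross terms (c1), and 0x11 -> x1*h1 (c2).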
150 #endif
151 
152 void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
153 {
154  BlockCipher &blockCipher = AccessBlockCipher();
155  blockCipher.SetKey(userKey, keylength, params);
156 
157  if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
158  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
159 
160  int tableSize, i, j, k;
161 
162 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
163  if (HasCLMUL())
164  {
165  // Avoid "parameter not used" error and suppress Coverity finding
166  (void)params.GetIntValue(Name::TableSize(), tableSize);
167  tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
168  }
169  else
170 #endif
171  {
172  if (params.GetIntValue(Name::TableSize(), tableSize))
173  tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
174  else
175  tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
176 
177 #if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400)
178  // VC 2003 workaround: compiler generates bad code for 64K tables
179  tableSize = 2*1024;
180 #endif
181  }
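// Layout of the two table formats built below: the 64K option stores 16
// subtables (one per byte of the hash input) of 256 blocks each, so hashing
// costs one 16-byte lookup per input byte; the 2K option stores 8
// nibble-indexed subtables of 16 blocks each and repairs the resulting 8-bit
// shifts afterwards with s_reductionTable.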
182 
183  m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
184  byte *table = MulTable();
185  byte *hashKey = HashKey();
186  memset(hashKey, 0, REQUIRED_BLOCKSIZE);
187  blockCipher.ProcessBlock(hashKey);
188 
189 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
190  if (HasCLMUL())
191  {
192  const __m128i r = s_clmulConstants[0];
193  __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]);
194  __m128i h = h0;
195 
196  for (i=0; i<tableSize; i+=32)
197  {
198  __m128i h1 = CLMUL_GF_Mul(h, h0, r);
199  _mm_storel_epi64((__m128i *)(void *)(table+i), h);
200  _mm_storeu_si128((__m128i *)(void *)(table+i+16), h1);
201  _mm_storeu_si128((__m128i *)(void *)(table+i+8), h);
202  _mm_storel_epi64((__m128i *)(void *)(table+i+8), h1);
203  h = CLMUL_GF_Mul(h1, h0, r);
204  }
205 
206  return;
207  }
208 #endif
209 
210  word64 V0, V1;
211  typedef BlockGetAndPut<word64, BigEndian> Block;
212  Block::Get(hashKey)(V0)(V1);
213 
214  if (tableSize == 64*1024)
215  {
216  for (i=0; i<128; i++)
217  {
218  k = i%8;
219  Block::Put(NULL, table+(i/8)*256*16+(size_t(1)<<(11-k)))(V0)(V1);
220 
221  int x = (int)V1 & 1;
222  V1 = (V1>>1) | (V0<<63);
223  V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
224  }
225 
226  for (i=0; i<16; i++)
227  {
228  memset(table+i*256*16, 0, 16);
229 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
230  if (HasSSE2())
231  for (j=2; j<=0x80; j*=2)
232  for (k=1; k<j; k++)
233  SSE2_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
234  else
235 #endif
236  for (j=2; j<=0x80; j*=2)
237  for (k=1; k<j; k++)
238  Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
239  }
240  }
241  else
242  {
243  if (!s_reductionTableInitialized)
244  {
245  s_reductionTable[0] = 0;
246  word16 x = 0x01c2;
247  s_reductionTable[1] = ByteReverse(x);
248  for (unsigned int ii=2; ii<=0x80; ii*=2)
249  {
250  x <<= 1;
251  s_reductionTable[ii] = ByteReverse(x);
252  for (unsigned int jj=1; jj<ii; jj++)
253  s_reductionTable[ii+jj] = s_reductionTable[ii] ^ s_reductionTable[jj];
254  }
255  s_reductionTableInitialized = true;
256  }
257 
258  for (i=0; i<128-24; i++)
259  {
260  k = i%32;
261  if (k < 4)
262  Block::Put(NULL, table+1024+(i/32)*256+(size_t(1)<<(7-k)))(V0)(V1);
263  else if (k < 8)
264  Block::Put(NULL, table+(i/32)*256+(size_t(1)<<(11-k)))(V0)(V1);
265 
266  int x = (int)V1 & 1;
267  V1 = (V1>>1) | (V0<<63);
268  V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
269  }
270 
271  for (i=0; i<4; i++)
272  {
273  memset(table+i*256, 0, 16);
274  memset(table+1024+i*256, 0, 16);
275 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
276  if (HasSSE2())
277  for (j=2; j<=8; j*=2)
278  for (k=1; k<j; k++)
279  {
280  SSE2_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
281  SSE2_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
282  }
283  else
284 #endif
285  for (j=2; j<=8; j*=2)
286  for (k=1; k<j; k++)
287  {
288  Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
289  Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
290  }
291  }
292  }
293 }
294 
295 inline void GCM_Base::ReverseHashBufferIfNeeded()
296 {
297 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
298  if (HasCLMUL())
299  {
300  __m128i &x = *(__m128i *)(void *)HashBuffer();
301  x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
302  }
303 #endif
304 }
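// The CLMUL path keeps the running hash byte-reversed (the shuffle with
// s_clmulConstants[1] above), so callers flip it back before the buffer is
// used as a counter block in Resync or encrypted as the tag.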
305 
306 void GCM_Base::Resync(const byte *iv, size_t len)
307 {
308  BlockCipher &cipher = AccessBlockCipher();
309  byte *hashBuffer = HashBuffer();
310 
311  if (len == 12)
312  {
313  memcpy(hashBuffer, iv, len);
314  memset(hashBuffer+len, 0, 3);
315  hashBuffer[len+3] = 1;
316  }
317  else
318  {
319  size_t origLen = len;
320  memset(hashBuffer, 0, HASH_BLOCKSIZE);
321 
322  if (len >= HASH_BLOCKSIZE)
323  {
324  len = GCM_Base::AuthenticateBlocks(iv, len);
325  iv += (origLen - len);
326  }
327 
328  if (len > 0)
329  {
330  memcpy(m_buffer, iv, len);
331  memset(m_buffer+len, 0, HASH_BLOCKSIZE-len);
332  GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
333  }
334 
335  PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8);
336  GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
337 
338  ReverseHashBufferIfNeeded();
339  }
340 
341  if (m_state >= State_IVSet)
342  m_ctr.Resynchronize(hashBuffer, REQUIRED_BLOCKSIZE);
343  else
344  m_ctr.SetCipherWithIV(cipher, hashBuffer);
345 
346  m_ctr.Seek(HASH_BLOCKSIZE);
347 
348  memset(hashBuffer, 0, HASH_BLOCKSIZE);
349 }
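// This follows NIST SP 800-38D: a 96-bit IV becomes IV || 0^31 || 1 directly,
// while any other length is GHASHed together with its 64-bit bit count. The
// Seek(HASH_BLOCKSIZE) skips the first keystream block, E_K(J0), which
// AuthenticateLastFooterBlock later uses to encrypt the hash into the tag.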
350 
351 unsigned int GCM_Base::OptimalDataAlignment() const
352 {
353  return
354 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
355  HasSSE2() ? 16 :
356 #endif
357  GetBlockCipher().OptimalDataAlignment();
358 }
359 
360 #if CRYPTOPP_MSC_VERSION
361 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
362 #endif
363 
364 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
365 
366 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
367 extern "C" {
368 void GCM_AuthenticateBlocks_2K(const byte *data, size_t blocks, word64 *hashBuffer, const word16 *reductionTable);
369 void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuffer);
370 }
371 #endif
372 
373 #ifndef CRYPTOPP_GENERATE_X64_MASM
374 
375 size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
376 {
377 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
378  if (HasCLMUL())
379  {
380  const __m128i *table = (const __m128i *)(const void *)MulTable();
381  __m128i x = _mm_load_si128((__m128i *)(void *)HashBuffer());
382  const __m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2];
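// The loop below hashes up to s_clmulTableSizeInBlocks blocks per iteration
// with a single reduction at the end (aggregated reduction): the key table
// holds successive powers of H, each block is combined with the matching
// power, and only the accumulated 256-bit product goes through CLMUL_Reduce.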
383 
384  while (len >= 16)
385  {
386  size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
387 __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), bswapMask2);
388  __m128i c0 = _mm_setzero_si128();
389  __m128i c1 = _mm_setzero_si128();
390  __m128i c2 = _mm_setzero_si128();
391 
392  while (true)
393  {
394  __m128i h0 = _mm_load_si128(table+i);
395  __m128i h1 = _mm_load_si128(table+i+1);
396  __m128i h01 = _mm_xor_si128(h0, h1);
397 
398  if (++i == s)
399  {
400  d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), bswapMask);
401  d = _mm_xor_si128(d, x);
402  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0));
403  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
404  d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
405  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0));
406  break;
407  }
408 
409  d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), bswapMask2);
410  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
411  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
412  d2 = _mm_xor_si128(d2, d);
413  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1));
414 
415  if (++i == s)
416  {
417  d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), bswapMask);
418  d = _mm_xor_si128(d, x);
419  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
420  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11));
421  d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
422  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
423  break;
424  }
425 
426  d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), bswapMask);
427  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
428  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
429  d = _mm_xor_si128(d, d2);
430  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
431  }
432  data += s*16;
433  len -= s*16;
434 
435  c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
436  x = CLMUL_Reduce(c0, c1, c2, r);
437  }
438 
439  _mm_store_si128((__m128i *)(void *)HashBuffer(), x);
440  return len;
441  }
442 #endif
443 
444 typedef BlockGetAndPut<word64, BigEndian> Block;
445 word64 *hashBuffer = (word64 *)(void *)HashBuffer();
446  assert(IsAlignedOn(hashBuffer,GetAlignmentOf<word64>()));
447 
448  switch (2*(m_buffer.size()>=64*1024)
449 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
450  + HasSSE2()
451 #endif
452  )
453  {
454  case 0: // non-SSE2 and 2K tables
455  {
456  byte *table = MulTable();
457  word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
458 
459  do
460  {
461  word64 y0, y1, a0, a1, b0, b1, c0, c1, d0, d1;
462  Block::Get(data)(y0)(y1);
463  x0 ^= y0;
464  x1 ^= y1;
465 
466  data += HASH_BLOCKSIZE;
467  len -= HASH_BLOCKSIZE;
468 
469  #define READ_TABLE_WORD64_COMMON(a, b, c, d) *(word64 *)(void *)(table+(a*1024)+(b*256)+c+d*8)
470 
471  #ifdef IS_LITTLE_ENDIAN
472  #if CRYPTOPP_BOOL_SLOW_WORD64
473  word32 z0 = (word32)x0;
474  word32 z1 = (word32)(x0>>32);
475  word32 z2 = (word32)x1;
476  word32 z3 = (word32)(x1>>32);
477  #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, (d?(z##c>>((d?d-1:0)*4))&0xf0:(z##c&0xf)<<4), e)
478  #else
479  #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, ((d+8*b)?(x##a>>(((d+8*b)?(d+8*b)-1:1)*4))&0xf0:(x##a&0xf)<<4), e)
480  #endif
481  #define GF_MOST_SIG_8BITS(a) (a##1 >> 7*8)
482  #define GF_SHIFT_8(a) a##1 = (a##1 << 8) ^ (a##0 >> 7*8); a##0 <<= 8;
483  #else
484  #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((1-d%2), c, ((15-d-8*b)?(x##a>>(((15-d-8*b)?(15-d-8*b)-1:0)*4))&0xf0:(x##a&0xf)<<4), e)
485  #define GF_MOST_SIG_8BITS(a) (a##1 & 0xff)
486  #define GF_SHIFT_8(a) a##1 = (a##1 >> 8) ^ (a##0 << 7*8); a##0 >>= 8;
487  #endif
488 
489  #define GF_MUL_32BY128(op, a, b, c) \
490  a0 op READ_TABLE_WORD64(a, b, c, 0, 0) ^ READ_TABLE_WORD64(a, b, c, 1, 0);\
491  a1 op READ_TABLE_WORD64(a, b, c, 0, 1) ^ READ_TABLE_WORD64(a, b, c, 1, 1);\
492  b0 op READ_TABLE_WORD64(a, b, c, 2, 0) ^ READ_TABLE_WORD64(a, b, c, 3, 0);\
493  b1 op READ_TABLE_WORD64(a, b, c, 2, 1) ^ READ_TABLE_WORD64(a, b, c, 3, 1);\
494  c0 op READ_TABLE_WORD64(a, b, c, 4, 0) ^ READ_TABLE_WORD64(a, b, c, 5, 0);\
495  c1 op READ_TABLE_WORD64(a, b, c, 4, 1) ^ READ_TABLE_WORD64(a, b, c, 5, 1);\
496  d0 op READ_TABLE_WORD64(a, b, c, 6, 0) ^ READ_TABLE_WORD64(a, b, c, 7, 0);\
497  d1 op READ_TABLE_WORD64(a, b, c, 6, 1) ^ READ_TABLE_WORD64(a, b, c, 7, 1);\
498 
499  GF_MUL_32BY128(=, 0, 0, 0)
500  GF_MUL_32BY128(^=, 0, 1, 1)
501  GF_MUL_32BY128(^=, 1, 0, 2)
502  GF_MUL_32BY128(^=, 1, 1, 3)
503 
504  word32 r = (word32)s_reductionTable[GF_MOST_SIG_8BITS(d)] << 16;
505  GF_SHIFT_8(d)
506  c0 ^= d0; c1 ^= d1;
507  r ^= (word32)s_reductionTable[GF_MOST_SIG_8BITS(c)] << 8;
508  GF_SHIFT_8(c)
509  b0 ^= c0; b1 ^= c1;
510  r ^= s_reductionTable[GF_MOST_SIG_8BITS(b)];
511  GF_SHIFT_8(b)
512  a0 ^= b0; a1 ^= b1;
513 a0 ^= ConditionalByteReverse<word64>(LITTLE_ENDIAN_ORDER, r);
514 x0 = a0; x1 = a1;
515  }
516  while (len >= HASH_BLOCKSIZE);
517 
518  hashBuffer[0] = x0; hashBuffer[1] = x1;
519  return len;
520  }
521 
522  case 2: // non-SSE2 and 64K tables
523  {
524  byte *table = MulTable();
525  word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
526 
527  do
528  {
529  word64 y0, y1, a0, a1;
530  Block::Get(data)(y0)(y1);
531  x0 ^= y0;
532  x1 ^= y1;
533 
534  data += HASH_BLOCKSIZE;
535  len -= HASH_BLOCKSIZE;
536 
537  #undef READ_TABLE_WORD64_COMMON
538  #undef READ_TABLE_WORD64
539 
540  #define READ_TABLE_WORD64_COMMON(a, c, d) *(word64 *)(void *)(table+(a)*256*16+(c)+(d)*8)
541 
542  #ifdef IS_LITTLE_ENDIAN
543  #if CRYPTOPP_BOOL_SLOW_WORD64
544  word32 z0 = (word32)x0;
545  word32 z1 = (word32)(x0>>32);
546  word32 z2 = (word32)x1;
547  word32 z3 = (word32)(x1>>32);
548  #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, (d?(z##c>>((d?d:1)*8-4))&0xff0:(z##c&0xff)<<4), e)
549  #else
550  #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((d+4*(c%2))?(x##b>>(((d+4*(c%2))?(d+4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
551  #endif
552  #else
553  #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((7-d-4*(c%2))?(x##b>>(((7-d-4*(c%2))?(7-d-4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
554  #endif
555 
556  #define GF_MUL_8BY128(op, b, c, d) \
557  a0 op READ_TABLE_WORD64(b, c, d, 0);\
558  a1 op READ_TABLE_WORD64(b, c, d, 1);\
559 
560  GF_MUL_8BY128(=, 0, 0, 0)
561  GF_MUL_8BY128(^=, 0, 0, 1)
562  GF_MUL_8BY128(^=, 0, 0, 2)
563  GF_MUL_8BY128(^=, 0, 0, 3)
564  GF_MUL_8BY128(^=, 0, 1, 0)
565  GF_MUL_8BY128(^=, 0, 1, 1)
566  GF_MUL_8BY128(^=, 0, 1, 2)
567  GF_MUL_8BY128(^=, 0, 1, 3)
568  GF_MUL_8BY128(^=, 1, 2, 0)
569  GF_MUL_8BY128(^=, 1, 2, 1)
570  GF_MUL_8BY128(^=, 1, 2, 2)
571  GF_MUL_8BY128(^=, 1, 2, 3)
572  GF_MUL_8BY128(^=, 1, 3, 0)
573  GF_MUL_8BY128(^=, 1, 3, 1)
574  GF_MUL_8BY128(^=, 1, 3, 2)
575  GF_MUL_8BY128(^=, 1, 3, 3)
576 
577  x0 = a0; x1 = a1;
578  }
579  while (len >= HASH_BLOCKSIZE);
580 
581  hashBuffer[0] = x0; hashBuffer[1] = x1;
582  return len;
583  }
584 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
585 
586 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
587  case 1: // SSE2 and 2K tables
588  GCM_AuthenticateBlocks_2K(data, len/16, hashBuffer, s_reductionTable);
589  return len % 16;
590  case 3: // SSE2 and 64K tables
591  GCM_AuthenticateBlocks_64K(data, len/16, hashBuffer);
592  return len % 16;
593 #endif
594 
595 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
596  case 1: // SSE2 and 2K tables
597  {
598  #ifdef __GNUC__
599  __asm__ __volatile__
600  (
601  INTEL_NOPREFIX
602  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
603  ALIGN 8
604  GCM_AuthenticateBlocks_2K PROC FRAME
605  rex_push_reg rsi
606  push_reg rdi
607  push_reg rbx
608  .endprolog
609  mov rsi, r8
610  mov r11, r9
611  #else
612  AS2( mov WORD_REG(cx), data )
613  AS2( mov WORD_REG(dx), len )
614  AS2( mov WORD_REG(si), hashBuffer )
615  AS2( shr WORD_REG(dx), 4 )
616  #endif
617 
618  #if CRYPTOPP_BOOL_X32
619  AS1(push rbx)
620  AS1(push rbp)
621  #else
622  AS_PUSH_IF86( bx)
623  AS_PUSH_IF86( bp)
624  #endif
625 
626  #ifdef __GNUC__
627  AS2( mov AS_REG_7, WORD_REG(di))
628  #elif CRYPTOPP_BOOL_X86
629  AS2( lea AS_REG_7, s_reductionTable)
630  #endif
631 
632  AS2( movdqa xmm0, [WORD_REG(si)] )
633 
634  #define MUL_TABLE_0 WORD_REG(si) + 32
635  #define MUL_TABLE_1 WORD_REG(si) + 32 + 1024
636  #define RED_TABLE AS_REG_7
637 
638  ASL(0)
639  AS2( movdqu xmm4, [WORD_REG(cx)] )
640  AS2( pxor xmm0, xmm4 )
641 
642  AS2( movd ebx, xmm0 )
643  AS2( mov eax, AS_HEX(f0f0f0f0) )
644  AS2( and eax, ebx )
645  AS2( shl ebx, 4 )
646  AS2( and ebx, AS_HEX(f0f0f0f0) )
647  AS2( movzx edi, ah )
648  AS2( movdqa xmm5, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
649  AS2( movzx edi, al )
650  AS2( movdqa xmm4, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
651  AS2( shr eax, 16 )
652  AS2( movzx edi, ah )
653  AS2( movdqa xmm3, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
654  AS2( movzx edi, al )
655  AS2( movdqa xmm2, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
656 
657  #define SSE2_MUL_32BITS(i) \
658  AS2( psrldq xmm0, 4 )\
659  AS2( movd eax, xmm0 )\
660  AS2( and eax, AS_HEX(f0f0f0f0) )\
661  AS2( movzx edi, bh )\
662  AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
663  AS2( movzx edi, bl )\
664  AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
665  AS2( shr ebx, 16 )\
666  AS2( movzx edi, bh )\
667  AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
668  AS2( movzx edi, bl )\
669  AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
670  AS2( movd ebx, xmm0 )\
671  AS2( shl ebx, 4 )\
672  AS2( and ebx, AS_HEX(f0f0f0f0) )\
673  AS2( movzx edi, ah )\
674  AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
675  AS2( movzx edi, al )\
676  AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
677  AS2( shr eax, 16 )\
678  AS2( movzx edi, ah )\
679  AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
680  AS2( movzx edi, al )\
681  AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
682 
683  SSE2_MUL_32BITS(1)
684  SSE2_MUL_32BITS(2)
685  SSE2_MUL_32BITS(3)
686 
687  AS2( movzx edi, bh )
688  AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
689  AS2( movzx edi, bl )
690  AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
691  AS2( shr ebx, 16 )
692  AS2( movzx edi, bh )
693  AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
694  AS2( movzx edi, bl )
695  AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
696 
697  AS2( movdqa xmm0, xmm3 )
698  AS2( pslldq xmm3, 1 )
699  AS2( pxor xmm2, xmm3 )
700  AS2( movdqa xmm1, xmm2 )
701  AS2( pslldq xmm2, 1 )
702  AS2( pxor xmm5, xmm2 )
703 
704  AS2( psrldq xmm0, 15 )
705 #if (CRYPTOPP_CLANG_VERSION >= 30600) || (CRYPTOPP_APPLE_CLANG_VERSION >= 70000)
706  AS2( movd edi, xmm0 )
707 #elif (defined(CRYPTOPP_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION)) && defined(CRYPTOPP_X64_ASM_AVAILABLE)
708  AS2( mov WORD_REG(di), xmm0 )
709 #else // GNU Assembler
710  AS2( movd WORD_REG(di), xmm0 )
711 #endif
712  AS2( movzx eax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
713  AS2( shl eax, 8 )
714 
715  AS2( movdqa xmm0, xmm5 )
716  AS2( pslldq xmm5, 1 )
717  AS2( pxor xmm4, xmm5 )
718 
719  AS2( psrldq xmm1, 15 )
720 #if (CRYPTOPP_CLANG_VERSION >= 30600) || (CRYPTOPP_APPLE_CLANG_VERSION >= 70000)
721  AS2( movd edi, xmm1 )
722 #elif (defined(CRYPTOPP_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION)) && defined(CRYPTOPP_X64_ASM_AVAILABLE)
723  AS2( mov WORD_REG(di), xmm1 )
724 #else
725  AS2( movd WORD_REG(di), xmm1 )
726 #endif
727  AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
728  AS2( shl eax, 8 )
729 
730  AS2( psrldq xmm0, 15 )
731 #if (CRYPTOPP_CLANG_VERSION >= 30600) || (CRYPTOPP_APPLE_CLANG_VERSION >= 70000)
732  AS2( movd edi, xmm0 )
733 #elif (defined(CRYPTOPP_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION)) && defined(CRYPTOPP_X64_ASM_AVAILABLE)
734  AS2( mov WORD_REG(di), xmm0 )
735 #else
736  AS2( movd WORD_REG(di), xmm0 )
737 #endif
738  AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
739 
740  AS2( movd xmm0, eax )
741  AS2( pxor xmm0, xmm4 )
742 
743  AS2( add WORD_REG(cx), 16 )
744  AS2( sub WORD_REG(dx), 1 )
745  ATT_NOPREFIX
746  ASJ( jnz, 0, b )
747  INTEL_NOPREFIX
748  AS2( movdqa [WORD_REG(si)], xmm0 )
749 
750  #if CRYPTOPP_BOOL_X32
751  AS1(pop rbp)
752  AS1(pop rbx)
753  #else
754  AS_POP_IF86( bp)
755  AS_POP_IF86( bx)
756  #endif
757 
758  #ifdef __GNUC__
759  ATT_PREFIX
760  :
761  : "c" (data), "d" (len/16), "S" (hashBuffer), "D" (s_reductionTable)
762  : "memory", "cc", "%eax"
763  #if CRYPTOPP_BOOL_X64
764  , "%ebx", "%r11"
765  #endif
766  );
767  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
768  pop rbx
769  pop rdi
770  pop rsi
771  ret
772  GCM_AuthenticateBlocks_2K ENDP
773  #endif
774 
775  return len%16;
776  }
777  case 3: // SSE2 and 64K tables
778  {
779  #ifdef __GNUC__
780  __asm__ __volatile__
781  (
782  INTEL_NOPREFIX
783  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
784  ALIGN 8
785  GCM_AuthenticateBlocks_64K PROC FRAME
786  rex_push_reg rsi
787  push_reg rdi
788  .endprolog
789  mov rsi, r8
790  #else
791  AS2( mov WORD_REG(cx), data )
792  AS2( mov WORD_REG(dx), len )
793  AS2( mov WORD_REG(si), hashBuffer )
794  AS2( shr WORD_REG(dx), 4 )
795  #endif
796 
797  AS2( movdqa xmm0, [WORD_REG(si)] )
798 
799  #undef MUL_TABLE
800  #define MUL_TABLE(i,j) WORD_REG(si) + 32 + (i*4+j)*256*16
801 
802  ASL(1)
803  AS2( movdqu xmm1, [WORD_REG(cx)] )
804  AS2( pxor xmm1, xmm0 )
805  AS2( pxor xmm0, xmm0 )
806 
807  #undef SSE2_MUL_32BITS
808  #define SSE2_MUL_32BITS(i) \
809  AS2( movd eax, xmm1 )\
810  AS2( psrldq xmm1, 4 )\
811  AS2( movzx edi, al )\
812  AS2( add WORD_REG(di), WORD_REG(di) )\
813  AS2( pxor xmm0, [MUL_TABLE(i,0) + WORD_REG(di)*8] )\
814  AS2( movzx edi, ah )\
815  AS2( add WORD_REG(di), WORD_REG(di) )\
816  AS2( pxor xmm0, [MUL_TABLE(i,1) + WORD_REG(di)*8] )\
817  AS2( shr eax, 16 )\
818  AS2( movzx edi, al )\
819  AS2( add WORD_REG(di), WORD_REG(di) )\
820  AS2( pxor xmm0, [MUL_TABLE(i,2) + WORD_REG(di)*8] )\
821  AS2( movzx edi, ah )\
822  AS2( add WORD_REG(di), WORD_REG(di) )\
823  AS2( pxor xmm0, [MUL_TABLE(i,3) + WORD_REG(di)*8] )\
824 
825  SSE2_MUL_32BITS(0)
826  SSE2_MUL_32BITS(1)
827  SSE2_MUL_32BITS(2)
828  SSE2_MUL_32BITS(3)
829 
830  AS2( add WORD_REG(cx), 16 )
831  AS2( sub WORD_REG(dx), 1 )
832  ATT_NOPREFIX
833  ASJ( jnz, 1, b )
834  INTEL_NOPREFIX
835  AS2( movdqa [WORD_REG(si)], xmm0 )
836 
837  #ifdef __GNUC__
838  ATT_PREFIX
839  :
840  : "c" (data), "d" (len/16), "S" (hashBuffer)
841  : "memory", "cc", "%edi", "%eax"
842  );
843  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
844  pop rdi
845  pop rsi
846  ret
847  GCM_AuthenticateBlocks_64K ENDP
848  #endif
849 
850  return len%16;
851  }
852 #endif
853 #ifndef CRYPTOPP_GENERATE_X64_MASM
854  }
855 
856  return len%16;
857 }
858 
859 void GCM_Base::AuthenticateLastHeaderBlock()
860 {
861  if (m_bufferedDataLength > 0)
862  {
863  memset(m_buffer+m_bufferedDataLength, 0, HASH_BLOCKSIZE-m_bufferedDataLength);
864  m_bufferedDataLength = 0;
865  GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
866  }
867 }
868 
869 void GCM_Base::AuthenticateLastConfidentialBlock()
870 {
871  GCM_Base::AuthenticateLastHeaderBlock();
872  PutBlock<word64, BigEndian, true>(NULL, m_buffer)(m_totalHeaderLength*8)(m_totalMessageLength*8);
873  GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
874 }
875 
876 void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
877 {
878  m_ctr.Seek(0);
879  ReverseHashBufferIfNeeded();
880  m_ctr.ProcessData(mac, HashBuffer(), macSize);
881 }
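// For reference, application code normally drives this mode through the
// public GCM template and the filter classes (a minimal sketch; key, iv,
// plain, and cipher are placeholders):
//
//   GCM<AES>::Encryption enc;
//   enc.SetKeyWithIV(key, 16, iv, 12);
//   StringSource(plain, true,
//       new AuthenticatedEncryptionFilter(enc, new StringSink(cipher)));
//
// Decryption mirrors this with GCM<AES>::Decryption and
// AuthenticatedDecryptionFilter, which throws on a bad tag.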
882 
883 NAMESPACE_END
884 
885 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
886 #endif