Crypto++  5.6.4
Free C++ class library of cryptographic schemes
gcm.cpp
1 // gcm.cpp - written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM gcm.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #if CRYPTOPP_MSC_VERSION
9 # pragma warning(disable: 4189)
10 #endif
11 
12 #ifndef CRYPTOPP_IMPORTS
13 #ifndef CRYPTOPP_GENERATE_X64_MASM
14 
15 // Clang 3.3 integrated assembler crash on Linux.
16 #if (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION < 30400))
17 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
18 #endif
19 
20 // SunCC 5.13 and below crash with AES-NI/CLMUL and C++{03|11}. Disable one or the other.
21 // Also see http://github.com/weidai11/cryptopp/issues/226
22 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x513)
23 # undef CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
24 #endif
25 
26 #include "gcm.h"
27 #include "cpu.h"
28 
29 NAMESPACE_BEGIN(CryptoPP)
30 
31 // Different assemblers accept different mnemonics: 'movd eax, xmm0' vs 'movd rax, xmm0' vs 'mov eax, xmm0' vs 'mov rax, xmm0'
32 #if (CRYPTOPP_LLVM_CLANG_VERSION >= 30600) || (CRYPTOPP_APPLE_CLANG_VERSION >= 70000) || defined(CRYPTOPP_CLANG_INTEGRATED_ASSEMBLER)
33 // 'movd eax, xmm0' only. REG_WORD() macro not used.
34 # define USE_MOVD_REG32 1
35 #elif (defined(CRYPTOPP_LLVM_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION)) && defined(CRYPTOPP_X64_ASM_AVAILABLE)
36 // 'movd eax, xmm0' or 'movd rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
37 # define USE_MOVD_REG32_OR_REG64 1
38 #elif defined(__GNUC__) || defined(_MSC_VER)
39 // 'movd eax, xmm0' or 'movd rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
40 # define USE_MOVD_REG32_OR_REG64 1
41 #else
42 // 'mov eax, xmm0' or 'mov rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
43 # define USE_MOV_REG32_OR_REG64 1
44 #endif
45 
// Shared 2K-table reduction lookup (512 bytes), built lazily in
// SetKeyWithoutResync(). 'volatile' on the flag guards against reordered
// reads of the initialized state; the table itself is write-once.
word16 GCM_Base::s_reductionTable[256];
volatile bool GCM_Base::s_reductionTableInitialized = false;
48 
// Bump the 32-bit big-endian counter in the last 4 bytes of the counter
// block by 256 (i.e. increment starting at the second-to-last byte, 3 bytes).
void GCM_Base::GCTR::IncrementCounterBy256()
{
	IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
}
53 
54 #if 0
55 // preserved for testing
56 void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
57 {
58  word64 Z0=0, Z1=0, V0, V1;
59 
61  Block::Get(a)(V0)(V1);
62 
63  for (int i=0; i<16; i++)
64  {
65  for (int j=0x80; j!=0; j>>=1)
66  {
67  int x = b[i] & j;
68  Z0 ^= x ? V0 : 0;
69  Z1 ^= x ? V1 : 0;
70  x = (int)V1 & 1;
71  V1 = (V1>>1) | (V0<<63);
72  V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
73  }
74  }
75  Block::Put(NULL, c)(Z0)(Z1);
76 }
77 
// Portable emulation of the PCLMULQDQ intrinsic, preserved under #if 0 for
// testing only. The immediate 'i' selects one 64-bit half of each operand
// (bit 0 picks the half of 'a', bit 4 picks the half of 'b'); the halves are
// carryless-multiplied via PolynomialMod2.
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
{
	word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
	word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};

	PolynomialMod2 pa((byte *)A, 8);
	PolynomialMod2 pb((byte *)B, 8);
	PolynomialMod2 c = pa*pb;

	__m128i output;
	// NOTE: this loop's 'i' shadows the parameter 'i'; the parameter was
	// fully consumed above, so the shadowing is benign
	for (int i=0; i<16; i++)
		((byte *)&output)[i] = c.GetByte(i);
	return output;
}
92 #endif
93 
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
// 16-byte XOR using SSE2: a = b ^ c. All three pointers must be 16-byte
// aligned (the intrinsic/movdqa forms below perform aligned accesses).
inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
{
// SunCC 5.14 crash (bewildering since asserts are not in effect in release builds)
// Also see http://github.com/weidai11/cryptopp/issues/226 and http://github.com/weidai11/cryptopp/issues/284
# if __SUNPRO_CC
	*(__m128i *)(void *)a = _mm_xor_si128(*(__m128i *)(void *)b, *(__m128i *)(void *)c);
# elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
	CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<__m128i>()));
	CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<__m128i>()));
	CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<__m128i>()));
	*(__m128i *)(void *)a = _mm_xor_si128(*(__m128i *)(void *)b, *(__m128i *)(void *)c);
# else
	// GNU inline-asm fallback when SSE2 intrinsics are not available
	asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0]));
# endif
}
#endif
111 
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
// 16-byte XOR using NEON: a = b ^ c. Pointers must be suitably aligned for
// uint64x2_t accesses (asserted below).
inline static void NEON_Xor16(byte *a, const byte *b, const byte *c)
{
	CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
	CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
	CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
	*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
}
#endif
121 
122 inline static void Xor16(byte *a, const byte *b, const byte *c)
123 {
124  CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
125  CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
126  CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
127  ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
128  ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
129 }
130 
131 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
CRYPTOPP_ALIGN_DATA(16)
static const word64 s_clmulConstants64[] = {
	W64LIT(0xe100000000000000), W64LIT(0xc200000000000000),  // GCM reduction polynomial constants
	W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),  // byte-reversal mask for _mm_shuffle_epi8
	W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; // second shuffle mask (used in AuthenticateBlocks)

// Same storage viewed as __m128i: [0] = polynomial pair, [1],[2] = shuffle masks
static const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64;
// Number of precomputed power-of-H blocks kept in MulTable() on the CLMUL path
static const unsigned int s_clmulTableSizeInBlocks = 8;
140 
// Reduce a 256-bit carryless product (c0*x^128 + c1*x^64 + c2) modulo the GCM
// polynomial, yielding a 128-bit result. 'r' holds the polynomial constants
// from s_clmulConstants[0]. The statement order below is significant.
inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
{
	/*
	The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
	significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the
	rightmost bit positions, and the lowest byte addresses.

	c1 ^= c0t * 0xc200000000000000
	c2t ^= c0t
	t = shift (c1t ^ c0b) left 1 bit
	c2 ^= t * 0xe100000000000000
	c2t ^= c1b
	shift c2 left 1 bit and xor in lowest bit of c1t
	*/
#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
	c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
#else
	c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
#endif
	c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
	c0 = _mm_srli_si128(c0, 8);
	c0 = _mm_xor_si128(c0, c1);
	c0 = _mm_slli_epi64(c0, 1);
	c0 = _mm_clmulepi64_si128(c0, r, 0);
	c2 = _mm_xor_si128(c2, c0);
	c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8));
	c1 = _mm_unpacklo_epi64(c1, c2);
	c1 = _mm_srli_epi64(c1, 63);
	c2 = _mm_slli_epi64(c2, 1);
	return _mm_xor_si128(c2, c1);
}
172 
173 inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
174 {
175  const __m128i c0 = _mm_clmulepi64_si128(x,h,0);
176  const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
177  const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
178 
179  return CLMUL_Reduce(c0, c1, c2, r);
180 }
181 #endif
182 
183 #if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
184 
CRYPTOPP_ALIGN_DATA(16)
static const word64 s_clmulConstants64[] = {
	W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients
	W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8
	W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8
};

// Same storage viewed as uint64x2_t; only element [0] (the polynomial pair)
// is consumed by the PMULL code paths below
static const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64;
// Number of precomputed power-of-H blocks kept in MulTable() on the PMULL path
static const unsigned int s_clmulTableSizeInBlocks = 8;
194 
// NEON/PMULL analogue of CLMUL_Reduce: fold the 256-bit carryless product
// (c0*x^128 + c1*x^64 + c2) modulo the GCM polynomial held in 'r'.
// Each line mirrors the corresponding x86 intrinsic (vextq_u8 with a zero
// operand plays the role of _mm_slli_si128/_mm_srli_si128).
inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r)
{
	// See comments for CLMUL_Reduce

	c1 = veorq_u64(c1, (uint64x2_t)vextq_u8(vdupq_n_u8(0), (uint8x16_t)c0, 8));
	c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 1)));
	c0 = (uint64x2_t)vextq_u8((uint8x16_t)c0, vdupq_n_u8(0), 8);
	c0 = veorq_u64(c0, c1);
	c0 = vshlq_n_u64(c0, 1);
	c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 0));
	c2 = veorq_u64(c2, c0);
	c2 = veorq_u64(c2, (uint64x2_t)vextq_u8((uint8x16_t)c1, vdupq_n_u8(0), 8));
	c1 = vcombine_u64(vget_low_u64(c1), vget_low_u64(c2));
	c1 = vshrq_n_u64(c1, 63);
	c2 = vshlq_n_u64(c2, 1);

	return veorq_u64(c2, c1);
}
213 
// NEON/PMULL analogue of CLMUL_GF_Mul: schoolbook 64x64 carryless multiplies
// (vmull_p64/vmull_high_p64) build the 256-bit product, then PMULL_Reduce
// folds it modulo the GCM polynomial in 'r'.
inline uint64x2_t PMULL_GF_Mul(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r)
{
	const uint64x2_t c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 0));  // lo * lo
	const uint64x2_t c1 = veorq_u64((uint64x2_t)vmull_p64(vgetq_lane_u64(x, 1), vgetq_lane_u64(h,0)),
		(uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 1)));                   // cross terms
	const uint64x2_t c2 = (uint64x2_t)vmull_high_p64((poly64x2_t)x, (poly64x2_t)h);           // hi * hi

	return PMULL_Reduce(c0, c1, c2, r);
}
223 #endif
224 
// Key the underlying cipher, derive the GHASH key H = E_K(0^128), and
// precompute the multiplication tables consumed by AuthenticateBlocks().
// Three table layouts exist: CLMUL/PMULL (s_clmulTableSizeInBlocks powers of
// H), 64K tables (16 x 256 x 16 bytes), or 2K tables (shared reduction table
// plus 8 x 256-byte strips).
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
{
	BlockCipher &blockCipher = AccessBlockCipher();
	blockCipher.SetKey(userKey, keylength, params);

	// GCM is defined only for 128-bit block ciphers
	if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
		throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");

	int tableSize, i, j, k;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasCLMUL())
	{
		// Avoid "parameter not used" error and suppress Coverity finding
		(void)params.GetIntValue(Name::TableSize(), tableSize);
		tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
	}
	else
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
	if (HasPMULL())
	{
		// Avoid "parameter not used" error and suppress Coverity finding
		(void)params.GetIntValue(Name::TableSize(), tableSize);
		tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
	}
	else
#endif
	{
		// Software path: caller may request a table size; anything >= 64K
		// selects the large tables, otherwise the 2K layout is used
		if (params.GetIntValue(Name::TableSize(), tableSize))
			tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
		else
			tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;

#if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400)
		// VC 2003 workaround: compiler generates bad code for 64K tables
		tableSize = 2*1024;
#endif
	}

	// m_buffer holds 3 scratch blocks followed by the multiplication table
	m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
	byte *table = MulTable();
	byte *hashKey = HashKey();
	// H = E_K(0^128)
	memset(hashKey, 0, REQUIRED_BLOCKSIZE);
	blockCipher.ProcessBlock(hashKey);

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (HasCLMUL())
	{
		const __m128i r = s_clmulConstants[0];
		// byte-reverse H into the bit-reflected form used by the CLMUL path
		__m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]);
		__m128i h = h0;

		// Store consecutive powers of H; each 32-byte stride holds two
		// blocks in the interleaved layout AuthenticateBlocks() expects
		for (i=0; i<tableSize; i+=32)
		{
			__m128i h1 = CLMUL_GF_Mul(h, h0, r);
			_mm_storel_epi64((__m128i *)(void *)(table+i), h);
			_mm_storeu_si128((__m128i *)(void *)(table+i+16), h1);
			_mm_storeu_si128((__m128i *)(void *)(table+i+8), h);
			_mm_storel_epi64((__m128i *)(void *)(table+i+8), h1);
			h = CLMUL_GF_Mul(h1, h0, r);
		}

		return;
	}
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
	if (HasPMULL())
	{
		const uint64x2_t r = s_clmulConstants[0];
		const uint64x2_t t = vld1q_u64((uint64_t *)hashKey);
		// swap halves and reverse bytes to reach the reflected representation
		const uint64x2_t h0 = (uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t), vget_low_u64(t)));

		uint64x2_t h = h0;
		// Same interleaved power-of-H layout as the CLMUL path; the last
		// 32-byte stride is emitted after the loop to avoid writing past
		// the end of the table
		for (i=0; i<tableSize-32; i+=32)
		{
			const uint64x2_t h1 = PMULL_GF_Mul(h, h0, r);
			vst1_u64((uint64_t *)(table+i), vget_low_u64(h));
			vst1q_u64((uint64_t *)(table+i+16), h1);
			vst1q_u64((uint64_t *)(table+i+8), h);
			vst1_u64((uint64_t *)(table+i+8), vget_low_u64(h1));
			h = PMULL_GF_Mul(h1, h0, r);
		}

		// final stride
		const uint64x2_t h1 = PMULL_GF_Mul(h, h0, r);
		vst1_u64((uint64_t *)(table+i), vget_low_u64(h));
		vst1q_u64((uint64_t *)(table+i+16), h1);
		vst1q_u64((uint64_t *)(table+i+8), h);
		vst1_u64((uint64_t *)(table+i+8), vget_low_u64(h1));

		return;
	}
#endif

	// Software table construction: load H as two big-endian 64-bit words
	word64 V0, V1;
	typedef BlockGetAndPut<word64, BigEndian> Block;
	Block::Get(hashKey)(V0)(V1);

	if (tableSize == 64*1024)
	{
		// 64K layout: one 4K strip (256 entries x 16 bytes) per key byte.
		// First deposit H * x^i at the power-of-two slots...
		for (i=0; i<128; i++)
		{
			k = i%8;
			Block::Put(NULL, table+(i/8)*256*16+(size_t(1)<<(11-k)))(V0)(V1);

			// multiply V by x (right shift in reflected form, fold 0xe1<<56)
			int x = (int)V1 & 1;
			V1 = (V1>>1) | (V0<<63);
			V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
		}

		// ...then fill the remaining entries by XOR-combining the
		// power-of-two entries (table[j+k] = table[j] ^ table[k])
		for (i=0; i<16; i++)
		{
			memset(table+i*256*16, 0, 16);
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
			if (HasSSE2())
				for (j=2; j<=0x80; j*=2)
					for (k=1; k<j; k++)
						SSE2_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
			else
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
			if (HasNEON())
				for (j=2; j<=0x80; j*=2)
					for (k=1; k<j; k++)
						NEON_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
			else
#endif
			for (j=2; j<=0x80; j*=2)
				for (k=1; k<j; k++)
					Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
		}
	}
	else
	{
		// 2K layout: build the shared reduction table once (process-wide)
		if (!s_reductionTableInitialized)
		{
			s_reductionTable[0] = 0;
			word16 x = 0x01c2;
			s_reductionTable[1] = ByteReverse(x);
			for (unsigned int ii=2; ii<=0x80; ii*=2)
			{
				x <<= 1;
				s_reductionTable[ii] = ByteReverse(x);
				for (unsigned int jj=1; jj<ii; jj++)
					s_reductionTable[ii+jj] = s_reductionTable[ii] ^ s_reductionTable[jj];
			}
			s_reductionTableInitialized = true;
		}

		// Deposit selected powers of H into the two 1K halves of the table
		for (i=0; i<128-24; i++)
		{
			k = i%32;
			if (k < 4)
				Block::Put(NULL, table+1024+(i/32)*256+(size_t(1)<<(7-k)))(V0)(V1);
			else if (k < 8)
				Block::Put(NULL, table+(i/32)*256+(size_t(1)<<(11-k)))(V0)(V1);

			// multiply V by x (reflected-form right shift with reduction)
			int x = (int)V1 & 1;
			V1 = (V1>>1) | (V0<<63);
			V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
		}

		// Complete each 256-byte strip in both halves by XOR combination
		for (i=0; i<4; i++)
		{
			memset(table+i*256, 0, 16);
			memset(table+1024+i*256, 0, 16);
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
			if (HasSSE2())
				for (j=2; j<=8; j*=2)
					for (k=1; k<j; k++)
					{
						SSE2_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
						SSE2_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
					}
			else
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
			if (HasNEON())
				for (j=2; j<=8; j*=2)
					for (k=1; k<j; k++)
					{
						NEON_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
						NEON_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
					}
			else
#endif
			for (j=2; j<=8; j*=2)
				for (k=1; k<j; k++)
				{
					Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
					Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
				}
		}
	}
}
416 
417 inline void GCM_Base::ReverseHashBufferIfNeeded()
418 {
419 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
420  if (HasCLMUL())
421  {
422  __m128i &x = *(__m128i *)(void *)HashBuffer();
423  x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
424  }
425 #elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
426  if (HasPMULL())
427  {
429  {
430  const uint8x16_t x = vrev64q_u8(vld1q_u8(HashBuffer()));
431  vst1q_u8(HashBuffer(), (uint8x16_t)vcombine_u64(vget_high_u64((uint64x2_t)x), vget_low_u64((uint64x2_t)x)));
432  }
433  }
434 #endif
435 }
436 
// Derive the pre-counter block J0 from the IV and (re)start CTR mode.
// Per the GCM specification: a 96-bit IV becomes IV || 0^31 || 1; any other
// length is run through GHASH together with its bit length.
void GCM_Base::Resync(const byte *iv, size_t len)
{
	BlockCipher &cipher = AccessBlockCipher();
	byte *hashBuffer = HashBuffer();

	if (len == 12)
	{
		// 96-bit IV fast path: J0 = IV || 0x00000001
		memcpy(hashBuffer, iv, len);
		memset(hashBuffer+len, 0, 3);
		hashBuffer[len+3] = 1;
	}
	else
	{
		// General path: J0 = GHASH(IV padded to a block boundary || 0^64 || [len in bits]_64)
		size_t origLen = len;
		memset(hashBuffer, 0, HASH_BLOCKSIZE);

		// hash all whole blocks of the IV directly
		if (len >= HASH_BLOCKSIZE)
		{
			len = GCM_Base::AuthenticateBlocks(iv, len);
			iv += (origLen - len);
		}

		// zero-pad and hash the final partial block, if any
		if (len > 0)
		{
			memcpy(m_buffer, iv, len);
			memset(m_buffer+len, 0, HASH_BLOCKSIZE-len);
			GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
		}

		// hash the length block: 64 zero bits followed by the IV bit length
		PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8);
		GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);

		// undo any representation change made by the accelerated GHASH path
		ReverseHashBufferIfNeeded();
	}

	// (Re)key or resynchronize the CTR engine with J0 as the counter block
	if (m_state >= State_IVSet)
		m_ctr.Resynchronize(hashBuffer, REQUIRED_BLOCKSIZE);
	else
		m_ctr.SetCipherWithIV(cipher, hashBuffer);

	// skip the first keystream block: it is reserved for the final tag mask
	m_ctr.Seek(HASH_BLOCKSIZE);

	// reset the GHASH accumulator for the upcoming message
	memset(hashBuffer, 0, HASH_BLOCKSIZE);
}
481 
// Preferred input alignment: 16 bytes when a SIMD GHASH path (SSE2 asm/MASM
// or NEON) will be used, otherwise defer to the underlying block cipher.
// Note the #if splices a '?:' arm into the single return expression.
unsigned int GCM_Base::OptimalDataAlignment() const
{
	return
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
		HasSSE2() ? 16 :
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
		HasNEON() ? 16 :
#endif
		GetBlockCipher().OptimalDataAlignment();
}
492 
493 #if CRYPTOPP_MSC_VERSION
494 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
495 #endif
496 
497 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
498 
499 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
500 extern "C" {
501 void GCM_AuthenticateBlocks_2K(const byte *data, size_t blocks, word64 *hashBuffer, const word16 *reductionTable);
502 void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuffer);
503 }
504 #endif
505 
506 #ifndef CRYPTOPP_GENERATE_X64_MASM
507 
508 size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
509 {
510 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
511  if (HasCLMUL())
512  {
513  const __m128i *table = (const __m128i *)(const void *)MulTable();
514  __m128i x = _mm_load_si128((__m128i *)(void *)HashBuffer());
515  const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
516 
517  while (len >= 16)
518  {
519  size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
520  __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2);;
521  __m128i c0 = _mm_setzero_si128();
522  __m128i c1 = _mm_setzero_si128();
523  __m128i c2 = _mm_setzero_si128();
524 
525  while (true)
526  {
527  __m128i h0 = _mm_load_si128(table+i);
528  __m128i h1 = _mm_load_si128(table+i+1);
529  __m128i h2 = _mm_xor_si128(h0, h1);
530 
531  if (++i == s)
532  {
533  d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
534  d = _mm_xor_si128(d, x);
535  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0));
536  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
537  d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
538  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0));
539  break;
540  }
541 
542  d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2);
543  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
544  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
545  d2 = _mm_xor_si128(d2, d);
546  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1));
547 
548  if (++i == s)
549  {
550  d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
551  d = _mm_xor_si128(d, x);
552  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
553  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11));
554  d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
555  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0x10));
556  break;
557  }
558 
559  d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1);
560  c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
561  c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
562  d = _mm_xor_si128(d, d2);
563  c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0x10));
564  }
565  data += s*16;
566  len -= s*16;
567 
568  c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
569  x = CLMUL_Reduce(c0, c1, c2, r);
570  }
571 
572  _mm_store_si128((__m128i *)(void *)HashBuffer(), x);
573  return len;
574  }
575 #elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
576  if (HasPMULL())
577  {
578  const uint64x2_t *table = (const uint64x2_t *)MulTable();
579  uint64x2_t x = vld1q_u64((const uint64_t*)HashBuffer());
580  const uint64x2_t r = s_clmulConstants[0];
581 
582  while (len >= 16)
583  {
584  size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
585  uint64x2_t d, d2 = (uint64x2_t)vrev64q_u8((uint8x16_t)vld1q_u64((const uint64_t *)(data+(s-1)*16)));
586  uint64x2_t c0 = vdupq_n_u64(0);
587  uint64x2_t c1 = vdupq_n_u64(0);
588  uint64x2_t c2 = vdupq_n_u64(0);
589 
590  while (true)
591  {
592  const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i));
593  const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1));
594  const uint64x2_t h2 = veorq_u64(h0, h1);
595 
596  if (++i == s)
597  {
598  const uint64x2_t t1 = vld1q_u64((const uint64_t *)data);
599  d = veorq_u64((uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t1), vget_low_u64(t1))), x);
600  c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 0)));
601  c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 1), vgetq_lane_u64(h1, 0)));
602  d = veorq_u64(d, (uint64x2_t)vcombine_u32(vget_high_u32((uint32x4_t)d), vget_low_u32((uint32x4_t)d)));
603  c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 0)));
604 
605  break;
606  }
607 
608  d = (uint64x2_t)vrev64q_u8((uint8x16_t)vld1q_u64((const uint64_t *)(data+(s-i)*16-8)));
609  c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 1), vgetq_lane_u64(h0, 0)));
610  c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 1), vgetq_lane_u64(h1, 0)));
611  d2 = veorq_u64(d2, d);
612  c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 1), vgetq_lane_u64(h2, 0)));
613 
614  if (++i == s)
615  {
616 
617  const uint64x2_t t2 = vld1q_u64((const uint64_t *)data);
618  d = veorq_u64((uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t2), vget_low_u64(t2))), x);
619  c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 1)));
620  c2 = veorq_u64(c2, (uint64x2_t)vmull_high_p64((poly64x2_t)d, (poly64x2_t)h1));
621  d = veorq_u64(d, (uint64x2_t)vcombine_u32(vget_high_u32((uint32x4_t)d), vget_low_u32((uint32x4_t)d)));
622  c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 1)));
623 
624  break;
625  }
626 
627  const uint64x2_t t3 = vld1q_u64((uint64_t *)(data+(s-i)*16-8));
628  d2 = (uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t3), vget_low_u64(t3)));
629  c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 1)));
630  c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 0), vgetq_lane_u64(h1, 1)));
631  d = veorq_u64(d, d2);
632  c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 1)));
633  }
634  data += s*16;
635  len -= s*16;
636 
637  c1 = veorq_u64(veorq_u64(c1, c0), c2);
638  x = PMULL_Reduce(c0, c1, c2, r);
639  }
640 
641  vst1q_u64((uint64_t *)HashBuffer(), x);
642  return len;
643 }
644 #endif
645 
647  word64 *hashBuffer = (word64 *)(void *)HashBuffer();
648  CRYPTOPP_ASSERT(IsAlignedOn(hashBuffer,GetAlignmentOf<word64>()));
649 
650  switch (2*(m_buffer.size()>=64*1024)
651 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
652  + HasSSE2()
653 //#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
654 // + HasNEON()
655 #endif
656  )
657  {
658  case 0: // non-SSE2 and 2K tables
659  {
660  byte *table = MulTable();
661  word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
662 
663  do
664  {
665  word64 y0, y1, a0, a1, b0, b1, c0, c1, d0, d1;
666  Block::Get(data)(y0)(y1);
667  x0 ^= y0;
668  x1 ^= y1;
669 
670  data += HASH_BLOCKSIZE;
671  len -= HASH_BLOCKSIZE;
672 
673  #define READ_TABLE_WORD64_COMMON(a, b, c, d) *(word64 *)(void *)(table+(a*1024)+(b*256)+c+d*8)
674 
675  #ifdef IS_LITTLE_ENDIAN
676  #if CRYPTOPP_BOOL_SLOW_WORD64
677  word32 z0 = (word32)x0;
678  word32 z1 = (word32)(x0>>32);
679  word32 z2 = (word32)x1;
680  word32 z3 = (word32)(x1>>32);
681  #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, (d?(z##c>>((d?d-1:0)*4))&0xf0:(z##c&0xf)<<4), e)
682  #else
683  #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, ((d+8*b)?(x##a>>(((d+8*b)?(d+8*b)-1:1)*4))&0xf0:(x##a&0xf)<<4), e)
684  #endif
685  #define GF_MOST_SIG_8BITS(a) (a##1 >> 7*8)
686  #define GF_SHIFT_8(a) a##1 = (a##1 << 8) ^ (a##0 >> 7*8); a##0 <<= 8;
687  #else
688  #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((1-d%2), c, ((15-d-8*b)?(x##a>>(((15-d-8*b)?(15-d-8*b)-1:0)*4))&0xf0:(x##a&0xf)<<4), e)
689  #define GF_MOST_SIG_8BITS(a) (a##1 & 0xff)
690  #define GF_SHIFT_8(a) a##1 = (a##1 >> 8) ^ (a##0 << 7*8); a##0 >>= 8;
691  #endif
692 
693  #define GF_MUL_32BY128(op, a, b, c) \
694  a0 op READ_TABLE_WORD64(a, b, c, 0, 0) ^ READ_TABLE_WORD64(a, b, c, 1, 0);\
695  a1 op READ_TABLE_WORD64(a, b, c, 0, 1) ^ READ_TABLE_WORD64(a, b, c, 1, 1);\
696  b0 op READ_TABLE_WORD64(a, b, c, 2, 0) ^ READ_TABLE_WORD64(a, b, c, 3, 0);\
697  b1 op READ_TABLE_WORD64(a, b, c, 2, 1) ^ READ_TABLE_WORD64(a, b, c, 3, 1);\
698  c0 op READ_TABLE_WORD64(a, b, c, 4, 0) ^ READ_TABLE_WORD64(a, b, c, 5, 0);\
699  c1 op READ_TABLE_WORD64(a, b, c, 4, 1) ^ READ_TABLE_WORD64(a, b, c, 5, 1);\
700  d0 op READ_TABLE_WORD64(a, b, c, 6, 0) ^ READ_TABLE_WORD64(a, b, c, 7, 0);\
701  d1 op READ_TABLE_WORD64(a, b, c, 6, 1) ^ READ_TABLE_WORD64(a, b, c, 7, 1);\
702 
703  GF_MUL_32BY128(=, 0, 0, 0)
704  GF_MUL_32BY128(^=, 0, 1, 1)
705  GF_MUL_32BY128(^=, 1, 0, 2)
706  GF_MUL_32BY128(^=, 1, 1, 3)
707 
708  word32 r = (word32)s_reductionTable[GF_MOST_SIG_8BITS(d)] << 16;
709  GF_SHIFT_8(d)
710  c0 ^= d0; c1 ^= d1;
711  r ^= (word32)s_reductionTable[GF_MOST_SIG_8BITS(c)] << 8;
712  GF_SHIFT_8(c)
713  b0 ^= c0; b1 ^= c1;
714  r ^= s_reductionTable[GF_MOST_SIG_8BITS(b)];
715  GF_SHIFT_8(b)
716  a0 ^= b0; a1 ^= b1;
718  x0 = a0; x1 = a1;
719  }
720  while (len >= HASH_BLOCKSIZE);
721 
722  hashBuffer[0] = x0; hashBuffer[1] = x1;
723  return len;
724  }
725 
726  case 2: // non-SSE2 and 64K tables
727  {
728  byte *table = MulTable();
729  word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
730 
731  do
732  {
733  word64 y0, y1, a0, a1;
734  Block::Get(data)(y0)(y1);
735  x0 ^= y0;
736  x1 ^= y1;
737 
738  data += HASH_BLOCKSIZE;
739  len -= HASH_BLOCKSIZE;
740 
741  #undef READ_TABLE_WORD64_COMMON
742  #undef READ_TABLE_WORD64
743 
744  #define READ_TABLE_WORD64_COMMON(a, c, d) *(word64 *)(void *)(table+(a)*256*16+(c)+(d)*8)
745 
746  #ifdef IS_LITTLE_ENDIAN
747  #if CRYPTOPP_BOOL_SLOW_WORD64
748  word32 z0 = (word32)x0;
749  word32 z1 = (word32)(x0>>32);
750  word32 z2 = (word32)x1;
751  word32 z3 = (word32)(x1>>32);
752  #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, (d?(z##c>>((d?d:1)*8-4))&0xff0:(z##c&0xff)<<4), e)
753  #else
754  #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((d+4*(c%2))?(x##b>>(((d+4*(c%2))?(d+4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
755  #endif
756  #else
757  #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((7-d-4*(c%2))?(x##b>>(((7-d-4*(c%2))?(7-d-4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
758  #endif
759 
760  #define GF_MUL_8BY128(op, b, c, d) \
761  a0 op READ_TABLE_WORD64(b, c, d, 0);\
762  a1 op READ_TABLE_WORD64(b, c, d, 1);\
763 
764  GF_MUL_8BY128(=, 0, 0, 0)
765  GF_MUL_8BY128(^=, 0, 0, 1)
766  GF_MUL_8BY128(^=, 0, 0, 2)
767  GF_MUL_8BY128(^=, 0, 0, 3)
768  GF_MUL_8BY128(^=, 0, 1, 0)
769  GF_MUL_8BY128(^=, 0, 1, 1)
770  GF_MUL_8BY128(^=, 0, 1, 2)
771  GF_MUL_8BY128(^=, 0, 1, 3)
772  GF_MUL_8BY128(^=, 1, 2, 0)
773  GF_MUL_8BY128(^=, 1, 2, 1)
774  GF_MUL_8BY128(^=, 1, 2, 2)
775  GF_MUL_8BY128(^=, 1, 2, 3)
776  GF_MUL_8BY128(^=, 1, 3, 0)
777  GF_MUL_8BY128(^=, 1, 3, 1)
778  GF_MUL_8BY128(^=, 1, 3, 2)
779  GF_MUL_8BY128(^=, 1, 3, 3)
780 
781  x0 = a0; x1 = a1;
782  }
783  while (len >= HASH_BLOCKSIZE);
784 
785  hashBuffer[0] = x0; hashBuffer[1] = x1;
786  return len;
787  }
788 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
789 
790 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
791  case 1: // SSE2 and 2K tables
792  GCM_AuthenticateBlocks_2K(data, len/16, hashBuffer, s_reductionTable);
793  return len % 16;
794  case 3: // SSE2 and 64K tables
795  GCM_AuthenticateBlocks_64K(data, len/16, hashBuffer);
796  return len % 16;
797 #endif
798 
799 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
800  case 1: // SSE2 and 2K tables
801  {
802  #ifdef __GNUC__
803  __asm__ __volatile__
804  (
805  INTEL_NOPREFIX
806  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
807  ALIGN 8
808  GCM_AuthenticateBlocks_2K PROC FRAME
809  rex_push_reg rsi
810  push_reg rdi
811  push_reg rbx
812  .endprolog
813  mov rsi, r8
814  mov r11, r9
815  #else
816  AS2( mov WORD_REG(cx), data )
817  AS2( mov WORD_REG(dx), len )
818  AS2( mov WORD_REG(si), hashBuffer )
819  AS2( shr WORD_REG(dx), 4 )
820  #endif
821 
822  #if CRYPTOPP_BOOL_X32
823  AS1(push rbx)
824  AS1(push rbp)
825  #else
826  AS_PUSH_IF86( bx)
827  AS_PUSH_IF86( bp)
828  #endif
829 
830  #ifdef __GNUC__
831  AS2( mov AS_REG_7, WORD_REG(di))
832  #elif CRYPTOPP_BOOL_X86
833  AS2( lea AS_REG_7, s_reductionTable)
834  #endif
835 
836  AS2( movdqa xmm0, [WORD_REG(si)] )
837 
838  #define MUL_TABLE_0 WORD_REG(si) + 32
839  #define MUL_TABLE_1 WORD_REG(si) + 32 + 1024
840  #define RED_TABLE AS_REG_7
841 
842  ASL(0)
843  AS2( movdqu xmm4, [WORD_REG(cx)] )
844  AS2( pxor xmm0, xmm4 )
845 
846  AS2( movd ebx, xmm0 )
847  AS2( mov eax, AS_HEX(f0f0f0f0) )
848  AS2( and eax, ebx )
849  AS2( shl ebx, 4 )
850  AS2( and ebx, AS_HEX(f0f0f0f0) )
851  AS2( movzx edi, ah )
852  AS2( movdqa xmm5, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
853  AS2( movzx edi, al )
854  AS2( movdqa xmm4, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
855  AS2( shr eax, 16 )
856  AS2( movzx edi, ah )
857  AS2( movdqa xmm3, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
858  AS2( movzx edi, al )
859  AS2( movdqa xmm2, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
860 
861  #define SSE2_MUL_32BITS(i) \
862  AS2( psrldq xmm0, 4 )\
863  AS2( movd eax, xmm0 )\
864  AS2( and eax, AS_HEX(f0f0f0f0) )\
865  AS2( movzx edi, bh )\
866  AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
867  AS2( movzx edi, bl )\
868  AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
869  AS2( shr ebx, 16 )\
870  AS2( movzx edi, bh )\
871  AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
872  AS2( movzx edi, bl )\
873  AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
874  AS2( movd ebx, xmm0 )\
875  AS2( shl ebx, 4 )\
876  AS2( and ebx, AS_HEX(f0f0f0f0) )\
877  AS2( movzx edi, ah )\
878  AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
879  AS2( movzx edi, al )\
880  AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
881  AS2( shr eax, 16 )\
882  AS2( movzx edi, ah )\
883  AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
884  AS2( movzx edi, al )\
885  AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
886 
887  SSE2_MUL_32BITS(1)
888  SSE2_MUL_32BITS(2)
889  SSE2_MUL_32BITS(3)
890 
891  AS2( movzx edi, bh )
892  AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
893  AS2( movzx edi, bl )
894  AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
895  AS2( shr ebx, 16 )
896  AS2( movzx edi, bh )
897  AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
898  AS2( movzx edi, bl )
899  AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
900 
901  AS2( movdqa xmm0, xmm3 )
902  AS2( pslldq xmm3, 1 )
903  AS2( pxor xmm2, xmm3 )
904  AS2( movdqa xmm1, xmm2 )
905  AS2( pslldq xmm2, 1 )
906  AS2( pxor xmm5, xmm2 )
907 
908  AS2( psrldq xmm0, 15 )
909 #if USE_MOVD_REG32
910  AS2( movd edi, xmm0 )
911 #elif USE_MOV_REG32_OR_REG64
912  AS2( mov WORD_REG(di), xmm0 )
913 #else // GNU Assembler
914  AS2( movd WORD_REG(di), xmm0 )
915 #endif
916  AS2( movzx eax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
917  AS2( shl eax, 8 )
918 
919  AS2( movdqa xmm0, xmm5 )
920  AS2( pslldq xmm5, 1 )
921  AS2( pxor xmm4, xmm5 )
922 
923  AS2( psrldq xmm1, 15 )
924 #if USE_MOVD_REG32
925  AS2( movd edi, xmm1 )
926 #elif USE_MOV_REG32_OR_REG64
927  AS2( mov WORD_REG(di), xmm1 )
928 #else
929  AS2( movd WORD_REG(di), xmm1 )
930 #endif
931  AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
932  AS2( shl eax, 8 )
933 
934  AS2( psrldq xmm0, 15 )
935 #if USE_MOVD_REG32
936  AS2( movd edi, xmm0 )
937 #elif USE_MOV_REG32_OR_REG64
938  AS2( mov WORD_REG(di), xmm0 )
939 #else
940  AS2( movd WORD_REG(di), xmm0 )
941 #endif
942  AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
943 
944  AS2( movd xmm0, eax )
945  AS2( pxor xmm0, xmm4 )
946 
947  AS2( add WORD_REG(cx), 16 )
948  AS2( sub WORD_REG(dx), 1 )
949  ATT_NOPREFIX
950  ASJ( jnz, 0, b )
951  INTEL_NOPREFIX
952  AS2( movdqa [WORD_REG(si)], xmm0 )
953 
954  #if CRYPTOPP_BOOL_X32
955  AS1(pop rbp)
956  AS1(pop rbx)
957  #else
958  AS_POP_IF86( bp)
959  AS_POP_IF86( bx)
960  #endif
961 
962  #ifdef __GNUC__
963  ATT_PREFIX
964  :
965  : "c" (data), "d" (len/16), "S" (hashBuffer), "D" (s_reductionTable)
966  : "memory", "cc", "%eax"
967  #if CRYPTOPP_BOOL_X64
968  , "%ebx", "%r11"
969  #endif
970  );
971  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
972  pop rbx
973  pop rdi
974  pop rsi
975  ret
976  GCM_AuthenticateBlocks_2K ENDP
977  #endif
978 
979  return len%16;
980  }
981  case 3: // SSE2 and 64K tables
982  {
983  #ifdef __GNUC__
984  __asm__ __volatile__
985  (
986  INTEL_NOPREFIX
987  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
988  ALIGN 8
989  GCM_AuthenticateBlocks_64K PROC FRAME
990  rex_push_reg rsi
991  push_reg rdi
992  .endprolog
993  mov rsi, r8
994  #else
995  AS2( mov WORD_REG(cx), data )
996  AS2( mov WORD_REG(dx), len )
997  AS2( mov WORD_REG(si), hashBuffer )
998  AS2( shr WORD_REG(dx), 4 )
999  #endif
1000 
1001  AS2( movdqa xmm0, [WORD_REG(si)] )
1002 
1003  #undef MUL_TABLE
1004  #define MUL_TABLE(i,j) WORD_REG(si) + 32 + (i*4+j)*256*16
1005 
1006  ASL(1)
1007  AS2( movdqu xmm1, [WORD_REG(cx)] )
1008  AS2( pxor xmm1, xmm0 )
1009  AS2( pxor xmm0, xmm0 )
1010 
1011  #undef SSE2_MUL_32BITS
1012  #define SSE2_MUL_32BITS(i) \
1013  AS2( movd eax, xmm1 )\
1014  AS2( psrldq xmm1, 4 )\
1015  AS2( movzx edi, al )\
1016  AS2( add WORD_REG(di), WORD_REG(di) )\
1017  AS2( pxor xmm0, [MUL_TABLE(i,0) + WORD_REG(di)*8] )\
1018  AS2( movzx edi, ah )\
1019  AS2( add WORD_REG(di), WORD_REG(di) )\
1020  AS2( pxor xmm0, [MUL_TABLE(i,1) + WORD_REG(di)*8] )\
1021  AS2( shr eax, 16 )\
1022  AS2( movzx edi, al )\
1023  AS2( add WORD_REG(di), WORD_REG(di) )\
1024  AS2( pxor xmm0, [MUL_TABLE(i,2) + WORD_REG(di)*8] )\
1025  AS2( movzx edi, ah )\
1026  AS2( add WORD_REG(di), WORD_REG(di) )\
1027  AS2( pxor xmm0, [MUL_TABLE(i,3) + WORD_REG(di)*8] )\
1028 
1029  SSE2_MUL_32BITS(0)
1030  SSE2_MUL_32BITS(1)
1031  SSE2_MUL_32BITS(2)
1032  SSE2_MUL_32BITS(3)
1033 
1034  AS2( add WORD_REG(cx), 16 )
1035  AS2( sub WORD_REG(dx), 1 )
1036  ATT_NOPREFIX
1037  ASJ( jnz, 1, b )
1038  INTEL_NOPREFIX
1039  AS2( movdqa [WORD_REG(si)], xmm0 )
1040 
1041  #ifdef __GNUC__
1042  ATT_PREFIX
1043  :
1044  : "c" (data), "d" (len/16), "S" (hashBuffer)
1045  : "memory", "cc", "%edi", "%eax"
1046  );
1047  #elif defined(CRYPTOPP_GENERATE_X64_MASM)
1048  pop rdi
1049  pop rsi
1050  ret
1051  GCM_AuthenticateBlocks_64K ENDP
1052  #endif
1053 
1054  return len%16;
1055  }
1056 #endif
1057 #ifndef CRYPTOPP_GENERATE_X64_MASM
1058  }
1059 
1060  return len%16;
1061 }
1062 
1063 void GCM_Base::AuthenticateLastHeaderBlock()
1064 {
1065  if (m_bufferedDataLength > 0)
1066  {
1067  memset(m_buffer+m_bufferedDataLength, 0, HASH_BLOCKSIZE-m_bufferedDataLength);
1068  m_bufferedDataLength = 0;
1069  GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
1070  }
1071 }
1072 
1073 void GCM_Base::AuthenticateLastConfidentialBlock()
1074 {
1075  GCM_Base::AuthenticateLastHeaderBlock();
1076  PutBlock<word64, BigEndian, true>(NULL, m_buffer)(m_totalHeaderLength*8)(m_totalMessageLength*8);
1077  GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
1078 }
1079 
// Produce the authentication tag: encrypt the final GHASH value with the
// counter-mode keystream and copy the first macSize bytes into mac.
void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
{
	// Rewind the CTR keystream to its start so the tag is formed with the
	// initial counter block.
	m_ctr.Seek(0);
	// NOTE(review): the hash buffer's byte order presumably differs between
	// the table-driven and CLMUL GHASH paths; this normalizes it before the
	// encryption — confirm against ReverseHashBufferIfNeeded's definition.
	ReverseHashBufferIfNeeded();
	// Tag is truncated to macSize bytes as requested by the caller.
	m_ctr.ProcessData(mac, HashBuffer(), macSize);
}
1086 
1087 NAMESPACE_END
1088 
1089 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
1090 #endif
An invalid argument was detected.
Definition: cryptlib.h:187
virtual void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
Definition: cryptlib.cpp:97
void IncrementCounterByOne(byte *inout, unsigned int size)
Performs an addition with carry on a block of bytes.
Definition: misc.h:1022
virtual unsigned int BlockSize() const =0
Provides the block size of the cipher.
Library configuration file.
byte GetByte(size_t n) const
return the n-th byte
Definition: gf2n.cpp:77
Access a block of memory.
Definition: misc.h:2255
bool GetIntValue(const char *name, int &value) const
Get a named value with type int.
Definition: cryptlib.h:376
byte order is little-endian
Definition: cryptlib.h:130
Polynomial with Coefficients in GF(2)
Definition: gf2n.h:18
Interface for one direction (encryption or decryption) of a block cipher.
Definition: cryptlib.h:1098
Use a table with 64K entries.
Definition: gcm.h:21
bool IsAlignedOn(const void *ptr, unsigned int alignment)
Determines whether ptr is aligned to a minimum value.
Definition: misc.h:916
const char * TableSize()
int, in bytes
Definition: argnames.h:80
unsigned int OptimalDataAlignment() const
Provides input and output data alignment for optimal performance.
Definition: gcm.cpp:482
void ProcessBlock(const byte *inBlock, byte *outBlock) const
Encrypt or decrypt a block.
Definition: cryptlib.h:757
bool HasCLMUL()
Determines Carryless Multiply availability.
Definition: cpu.h:205
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianness.
Definition: misc.h:1858
const T1 UnsignedMin(const T1 &a, const T2 &b)
Safe comparison of values that could be negative and incorrectly promoted.
Definition: misc.h:503
byte order is big-endian
Definition: cryptlib.h:132
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:62
const char * BlockSize()
int, in bytes
Definition: argnames.h:26
Functions for CPU features and intrinsics.
virtual unsigned int OptimalDataAlignment() const
Provides input and output data alignment for optimal performance.
Definition: cryptlib.cpp:229
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:160
std::string AlgorithmName() const
Provides the name of this algorithm.
Definition: gcm.h:31
GCM block cipher mode of operation.
ByteOrder GetNativeByteOrder()
Returns NativeByteOrder as an enumerated ByteOrder value.
Definition: misc.h:949
Access a block of memory.
Definition: misc.h:2217
Crypto++ library namespace.
byte ByteReverse(byte value)
Reverses bytes in an 8-bit value.
Definition: misc.h:1714
Interface for retrieving values given their names.
Definition: cryptlib.h:282