Crypto++  5.6.5
Free C++ class library of cryptographic schemes
sha.cpp
1 // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
2 
3 // Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
4 // implemented Intel SHA extensions based on Intel articles and code by
5 // Sean Gulley. All code is in the public domain.
6 
7 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
8 
9 #include "pch.h"
10 #include "config.h"
11 
12 #if CRYPTOPP_MSC_VERSION
13 # pragma warning(disable: 4100 4731)
14 #endif
15 
16 #ifndef CRYPTOPP_IMPORTS
17 #ifndef CRYPTOPP_GENERATE_X64_MASM
18 
19 #include "secblock.h"
20 #include "sha.h"
21 #include "misc.h"
22 #include "cpu.h"
23 
24 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
25 # undef CRYPTOPP_X86_ASM_AVAILABLE
26 # undef CRYPTOPP_X32_ASM_AVAILABLE
27 # undef CRYPTOPP_X64_ASM_AVAILABLE
28 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
29 #endif
30 
31 NAMESPACE_BEGIN(CryptoPP)
32 
33 // Function pointer for specific SHA1 or SHA256 Transform function
34 typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
35 typedef void (CRYPTOPP_FASTCALL *pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
36 
37 ////////////////////////////////
38 // start of Steve Reid's code //
39 ////////////////////////////////
40 
// SHA-1 message schedule helpers. blk0 copies word i of the input block into
// the 16-word working array W; blk1 computes the rolling schedule
// W[t] = rotl1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) in place (FIPS 180-4, 6.1.2).
#define blk0(i) (W[i] = data[i])
#define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))

// Round logical functions: f1 = Ch, f2 and f4 = Parity, f3 = Maj.
#define f1(x,y,z) (z^(x&(y^z)))
#define f2(x,y,z) (x^y^z)
#define f3(x,y,z) ((x&y)|(z&(x|y)))
#define f4(x,y,z) (x^y^z)

/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
// Each performs one round: z += f(w,x,y) + W_t + K + rotl(v,5); w = rotl(w,30).
// R0 uses freshly-loaded words (rounds 0-15); R1-R4 use the rolling schedule.
#define R0(v,w,x,y,z,i) z+=f1(w,x,y)+blk0(i)+0x5A827999+rotlFixed(v,5);w=rotlFixed(w,30);
#define R1(v,w,x,y,z,i) z+=f1(w,x,y)+blk1(i)+0x5A827999+rotlFixed(v,5);w=rotlFixed(w,30);
#define R2(v,w,x,y,z,i) z+=f2(w,x,y)+blk1(i)+0x6ED9EBA1+rotlFixed(v,5);w=rotlFixed(w,30);
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
55 
// Portable C++ SHA-1 compression function: folds one 64-byte (16-word) block
// in `data` into the five-word chaining value `state`. `data` is expected to
// be already endian-corrected by the caller; the R0-R4 macros above supply the
// round logic and read/write the local schedule W.
static void SHA1_CXX_Transform(word32 *state, const word32 *data)
{
 word32 W[16];
 /* Copy context->state[] to working vars */
 word32 a = state[0];
 word32 b = state[1];
 word32 c = state[2];
 word32 d = state[3];
 word32 e = state[4];
 /* 4 rounds of 20 operations each. Loop unrolled. */
 R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
 R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
 R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
 R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
 R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
 R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
 R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
 R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
 R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
 R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
 R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
 R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
 R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
 R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
 R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
 R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
 R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
 R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
 R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
 R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
 /* Add the working vars back into context.state[] */
 state[0] += a;
 state[1] += b;
 state[2] += c;
 state[3] += d;
 state[4] += e;
}
93 
94 //////////////////////////////
95 // end of Steve Reid's code //
96 //////////////////////////////
97 
98 ///////////////////////////////////
99 // start of Walton/Gulley's code //
100 ///////////////////////////////////
101 
102 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
103 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
// SHA-1 compression using the Intel SHA extensions (SHA-NI). Hashes one
// 64-byte block from `data` into `state`. The immediate argument of
// _mm_sha1rnds4_epu32 (0..3) selects the 20-round group, i.e. the round
// constant and boolean function, per Intel's SHA extensions reference.
// NOTE(review): this routine byte-reverses `data` IN PLACE through a
// const_cast (see comment below), so the caller's buffer is modified.
static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
{
 __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
 __m128i MASK, MSG0, MSG1, MSG2, MSG3;

 // IteratedHashBase<T> has code to perform this step before HashEndianCorrectedBlock()
 // is called, but the design does not lend itself to optional hardware components
 // where SHA1 needs reversing, but SHA256 does not.
 word32* dataBuf = const_cast<word32*>(data);
 ByteReverse(dataBuf, dataBuf, 64);

 // Load initial values
 ABCD = _mm_loadu_si128((__m128i*) state);
 E0 = _mm_set_epi32(state[4], 0, 0, 0);
 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
 MASK = _mm_set_epi64x(W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f));

 // Save current hash
 ABCD_SAVE = ABCD;
 E0_SAVE = E0;

 // Rounds 0-3
 MSG0 = _mm_loadu_si128((__m128i*) data+0);
 MSG0 = _mm_shuffle_epi8(MSG0, MASK);
 E0 = _mm_add_epi32(E0, MSG0);
 E1 = ABCD;
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

 // Rounds 4-7
 MSG1 = _mm_loadu_si128((__m128i*) (data+4));
 MSG1 = _mm_shuffle_epi8(MSG1, MASK);
 E1 = _mm_sha1nexte_epu32(E1, MSG1);
 E0 = ABCD;
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

 // Rounds 8-11
 MSG2 = _mm_loadu_si128((__m128i*) (data+8));
 MSG2 = _mm_shuffle_epi8(MSG2, MASK);
 E0 = _mm_sha1nexte_epu32(E0, MSG2);
 E1 = ABCD;
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
 MSG0 = _mm_xor_si128(MSG0, MSG2);

 // Rounds 12-15
 MSG3 = _mm_loadu_si128((__m128i*) (data+12));
 MSG3 = _mm_shuffle_epi8(MSG3, MASK);
 E1 = _mm_sha1nexte_epu32(E1, MSG3);
 E0 = ABCD;
 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
 MSG1 = _mm_xor_si128(MSG1, MSG3);

 // Rounds 16-19
 E0 = _mm_sha1nexte_epu32(E0, MSG0);
 E1 = ABCD;
 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
 MSG2 = _mm_xor_si128(MSG2, MSG0);

 // Rounds 20-23
 E1 = _mm_sha1nexte_epu32(E1, MSG1);
 E0 = ABCD;
 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
 MSG3 = _mm_xor_si128(MSG3, MSG1);

 // Rounds 24-27
 E0 = _mm_sha1nexte_epu32(E0, MSG2);
 E1 = ABCD;
 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
 MSG0 = _mm_xor_si128(MSG0, MSG2);

 // Rounds 28-31
 E1 = _mm_sha1nexte_epu32(E1, MSG3);
 E0 = ABCD;
 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
 MSG1 = _mm_xor_si128(MSG1, MSG3);

 // Rounds 32-35
 E0 = _mm_sha1nexte_epu32(E0, MSG0);
 E1 = ABCD;
 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
 MSG2 = _mm_xor_si128(MSG2, MSG0);

 // Rounds 36-39
 E1 = _mm_sha1nexte_epu32(E1, MSG1);
 E0 = ABCD;
 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
 MSG3 = _mm_xor_si128(MSG3, MSG1);

 // Rounds 40-43
 E0 = _mm_sha1nexte_epu32(E0, MSG2);
 E1 = ABCD;
 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
 MSG0 = _mm_xor_si128(MSG0, MSG2);

 // Rounds 44-47
 E1 = _mm_sha1nexte_epu32(E1, MSG3);
 E0 = ABCD;
 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
 MSG1 = _mm_xor_si128(MSG1, MSG3);

 // Rounds 48-51
 E0 = _mm_sha1nexte_epu32(E0, MSG0);
 E1 = ABCD;
 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
 MSG2 = _mm_xor_si128(MSG2, MSG0);

 // Rounds 52-55
 E1 = _mm_sha1nexte_epu32(E1, MSG1);
 E0 = ABCD;
 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
 MSG3 = _mm_xor_si128(MSG3, MSG1);

 // Rounds 56-59
 E0 = _mm_sha1nexte_epu32(E0, MSG2);
 E1 = ABCD;
 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
 MSG0 = _mm_xor_si128(MSG0, MSG2);

 // Rounds 60-63
 E1 = _mm_sha1nexte_epu32(E1, MSG3);
 E0 = ABCD;
 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
 MSG1 = _mm_xor_si128(MSG1, MSG3);

 // Rounds 64-67
 E0 = _mm_sha1nexte_epu32(E0, MSG0);
 E1 = ABCD;
 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
 MSG2 = _mm_xor_si128(MSG2, MSG0);

 // Rounds 68-71
 E1 = _mm_sha1nexte_epu32(E1, MSG1);
 E0 = ABCD;
 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
 MSG3 = _mm_xor_si128(MSG3, MSG1);

 // Rounds 72-75
 E0 = _mm_sha1nexte_epu32(E0, MSG2);
 E1 = ABCD;
 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

 // Rounds 76-79
 E1 = _mm_sha1nexte_epu32(E1, MSG3);
 E0 = ABCD;
 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

 // Add values back to state
 E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
 ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

 // Save state
 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
 _mm_storeu_si128((__m128i*) state, ABCD);
 state[4] = _mm_extract_epi32(E0, 3);
}
290 #endif
291 
292 /////////////////////////////////
293 // end of Walton/Gulley's code //
294 /////////////////////////////////
295 
296 pfnSHATransform InitializeSHA1Transform()
297 {
298 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
299  if (HasSHA())
300  return &SHA1_SSE_SHA_Transform;
301  else
302 #endif
303 
304  return &SHA1_CXX_Transform;
305 }
306 
307 void SHA1::InitState(HashWordType *state)
308 {
309  state[0] = 0x67452301L;
310  state[1] = 0xEFCDAB89L;
311  state[2] = 0x98BADCFEL;
312  state[3] = 0x10325476L;
313  state[4] = 0xC3D2E1F0L;
314 }
315 
void SHA1::Transform(word32 *state, const word32 *data)
{
 // Resolve the best transform once; the function-local static runs
 // InitializeSHA1Transform() on first use and caches the result.
 static const pfnSHATransform s_pfn = InitializeSHA1Transform();
 s_pfn(state, data);
}
321 
322 // *************************************************************
323 
324 void SHA224::InitState(HashWordType *state)
325 {
326  static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
327  memcpy(state, s, sizeof(s));
328 }
329 
330 void SHA256::InitState(HashWordType *state)
331 {
332  static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
333  memcpy(state, s, sizeof(s));
334 }
335 
// SHA-256 round constants K[0..63] (FIPS 180-4). The table gets 16-byte
// alignment when the SSE2 assembly path is compiled in; the assembly in
// X86_SHA256_HashBlocks addresses it through an index register (Kt(i)).
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
CRYPTOPP_ALIGN_DATA(16) extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
#else
extern const word32 SHA256_K[64] = {
#endif
 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
358 
359 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
360 
361 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM))
362 
363 static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len)
364 {
365  #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
366  #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
367  #define G(i) H(i+1)
368  #define F(i) H(i+2)
369  #define E(i) H(i+3)
370  #define D(i) H(i+4)
371  #define C(i) H(i+5)
372  #define B(i) H(i+6)
373  #define A(i) H(i+7)
374  #define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4
375  #define Wt_2(i) Wt((i)-2)
376  #define Wt_15(i) Wt((i)-15)
377  #define Wt_7(i) Wt((i)-7)
378  #define K_END [BASE+8*4+16*4+0*WORD_SZ]
379  #define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ]
380  #define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ]
381  #define DATA_END [BASE+8*4+16*4+3*WORD_SZ]
382  #define Kt(i) WORD_REG(si)+(i)*4
383 #if CRYPTOPP_BOOL_X32
384  #define BASE esp+8
385 #elif CRYPTOPP_BOOL_X86
386  #define BASE esp+4
387 #elif defined(__GNUC__)
388  #define BASE r8
389 #else
390  #define BASE rsp
391 #endif
392 
393 #define RA0(i, edx, edi) \
394  AS2( add edx, [Kt(i)] )\
395  AS2( add edx, [Wt(i)] )\
396  AS2( add edx, H(i) )\
397 
398 #define RA1(i, edx, edi)
399 
400 #define RB0(i, edx, edi)
401 
402 #define RB1(i, edx, edi) \
403  AS2( mov AS_REG_7d, [Wt_2(i)] )\
404  AS2( mov edi, [Wt_15(i)])\
405  AS2( mov ebx, AS_REG_7d )\
406  AS2( shr AS_REG_7d, 10 )\
407  AS2( ror ebx, 17 )\
408  AS2( xor AS_REG_7d, ebx )\
409  AS2( ror ebx, 2 )\
410  AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\
411  AS2( add ebx, [Wt_7(i)])\
412  AS2( mov AS_REG_7d, edi )\
413  AS2( shr AS_REG_7d, 3 )\
414  AS2( ror edi, 7 )\
415  AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\
416  AS2( xor AS_REG_7d, edi )\
417  AS2( add edx, [Kt(i)])\
418  AS2( ror edi, 11 )\
419  AS2( add edx, H(i) )\
420  AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\
421  AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\
422  AS2( mov [Wt(i)], AS_REG_7d)\
423  AS2( add edx, AS_REG_7d )\
424 
425 #define ROUND(i, r, eax, ecx, edi, edx)\
426  /* in: edi = E */\
427  /* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\
428  AS2( mov edx, F(i) )\
429  AS2( xor edx, G(i) )\
430  AS2( and edx, edi )\
431  AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\
432  AS2( mov AS_REG_7d, edi )\
433  AS2( ror edi, 6 )\
434  AS2( ror AS_REG_7d, 25 )\
435  RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
436  AS2( xor AS_REG_7d, edi )\
437  AS2( ror edi, 5 )\
438  AS2( xor AS_REG_7d, edi )/* S1(E) */\
439  AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\
440  RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
441  /* in: ecx = A, eax = B^C, edx = T1 */\
442  /* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\
443  AS2( mov ebx, ecx )\
444  AS2( xor ecx, B(i) )/* A^B */\
445  AS2( and eax, ecx )\
446  AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\
447  AS2( mov AS_REG_7d, ebx )\
448  AS2( ror ebx, 2 )\
449  AS2( add eax, edx )/* T1 + Maj(A,B,C) */\
450  AS2( add edx, D(i) )\
451  AS2( mov D(i), edx )\
452  AS2( ror AS_REG_7d, 22 )\
453  AS2( xor AS_REG_7d, ebx )\
454  AS2( ror ebx, 11 )\
455  AS2( xor AS_REG_7d, ebx )\
456  AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\
457  AS2( mov H(i), eax )\
458 
459 // Unroll the use of CRYPTOPP_BOOL_X64 in assembler math. The GAS assembler on X32 (version 2.25)
460 // complains "Error: invalid operands (*ABS* and *UND* sections) for `*` and `-`"
461 #if CRYPTOPP_BOOL_X64
462 #define SWAP_COPY(i) \
463  AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
464  AS1( bswap WORD_REG(bx))\
465  AS2( mov [Wt(i*2+1)], WORD_REG(bx))
466 #else // X86 and X32
467 #define SWAP_COPY(i) \
468  AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
469  AS1( bswap WORD_REG(bx))\
470  AS2( mov [Wt(i)], WORD_REG(bx))
471 #endif
472 
473 #if defined(__GNUC__)
474  #if CRYPTOPP_BOOL_X64
476  #endif
477  __asm__ __volatile__
478  (
479  #if CRYPTOPP_BOOL_X64
480  "lea %4, %%r8;"
481  #endif
482  INTEL_NOPREFIX
483 #elif defined(CRYPTOPP_GENERATE_X64_MASM)
484  ALIGN 8
485  X86_SHA256_HashBlocks PROC FRAME
486  rex_push_reg rsi
487  push_reg rdi
488  push_reg rbx
489  push_reg rbp
490  alloc_stack(LOCALS_SIZE+8)
491  .endprolog
492  mov rdi, r8
493  lea rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4]
494 #endif
495 
496 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
497  #ifndef __GNUC__
498  AS2( mov edi, [len])
499  AS2( lea WORD_REG(si), [SHA256_K+48*4])
500  #endif
501  #if !defined(_MSC_VER) || (_MSC_VER < 1400)
502  AS_PUSH_IF86(bx)
503  #endif
504 
505  AS_PUSH_IF86(bp)
506  AS2( mov ebx, esp)
507  AS2( and esp, -16)
508  AS2( sub WORD_REG(sp), LOCALS_SIZE)
509  AS_PUSH_IF86(bx)
510 #endif
511  AS2( mov STATE_SAVE, WORD_REG(cx))
512  AS2( mov DATA_SAVE, WORD_REG(dx))
513  AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)])
514  AS2( mov DATA_END, WORD_REG(ax))
515  AS2( mov K_END, WORD_REG(si))
516 
517 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
518 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
519  AS2( test edi, 1)
520  ASJ( jnz, 2, f)
521  AS1( dec DWORD PTR K_END)
522 #endif
523  AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16])
524  AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16])
525 #endif
526 
527 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
528 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
529  ASJ( jmp, 0, f)
530 #endif
531  ASL(2) // non-SSE2
532  AS2( mov esi, ecx)
533  AS2( lea edi, A(0))
534  AS2( mov ecx, 8)
535 ATT_NOPREFIX
536  AS1( rep movsd)
537 INTEL_NOPREFIX
538  AS2( mov esi, K_END)
539  ASJ( jmp, 3, f)
540 #endif
541 
542 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
543  ASL(0)
544  AS2( movdqa E(0), xmm1)
545  AS2( movdqa A(0), xmm0)
546 #endif
547 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
548  ASL(3)
549 #endif
550  AS2( sub WORD_REG(si), 48*4)
551  SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3)
552  SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7)
553 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
554  SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11)
555  SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15)
556 #endif
557  AS2( mov edi, E(0)) // E
558  AS2( mov eax, B(0)) // B
559  AS2( xor eax, C(0)) // B^C
560  AS2( mov ecx, A(0)) // A
561 
562  ROUND(0, 0, eax, ecx, edi, edx)
563  ROUND(1, 0, ecx, eax, edx, edi)
564  ROUND(2, 0, eax, ecx, edi, edx)
565  ROUND(3, 0, ecx, eax, edx, edi)
566  ROUND(4, 0, eax, ecx, edi, edx)
567  ROUND(5, 0, ecx, eax, edx, edi)
568  ROUND(6, 0, eax, ecx, edi, edx)
569  ROUND(7, 0, ecx, eax, edx, edi)
570  ROUND(8, 0, eax, ecx, edi, edx)
571  ROUND(9, 0, ecx, eax, edx, edi)
572  ROUND(10, 0, eax, ecx, edi, edx)
573  ROUND(11, 0, ecx, eax, edx, edi)
574  ROUND(12, 0, eax, ecx, edi, edx)
575  ROUND(13, 0, ecx, eax, edx, edi)
576  ROUND(14, 0, eax, ecx, edi, edx)
577  ROUND(15, 0, ecx, eax, edx, edi)
578 
579  ASL(1)
580  AS2(add WORD_REG(si), 4*16)
581  ROUND(0, 1, eax, ecx, edi, edx)
582  ROUND(1, 1, ecx, eax, edx, edi)
583  ROUND(2, 1, eax, ecx, edi, edx)
584  ROUND(3, 1, ecx, eax, edx, edi)
585  ROUND(4, 1, eax, ecx, edi, edx)
586  ROUND(5, 1, ecx, eax, edx, edi)
587  ROUND(6, 1, eax, ecx, edi, edx)
588  ROUND(7, 1, ecx, eax, edx, edi)
589  ROUND(8, 1, eax, ecx, edi, edx)
590  ROUND(9, 1, ecx, eax, edx, edi)
591  ROUND(10, 1, eax, ecx, edi, edx)
592  ROUND(11, 1, ecx, eax, edx, edi)
593  ROUND(12, 1, eax, ecx, edi, edx)
594  ROUND(13, 1, ecx, eax, edx, edi)
595  ROUND(14, 1, eax, ecx, edi, edx)
596  ROUND(15, 1, ecx, eax, edx, edi)
597  AS2( cmp WORD_REG(si), K_END)
598  ATT_NOPREFIX
599  ASJ( jb, 1, b)
600  INTEL_NOPREFIX
601 
602  AS2( mov WORD_REG(dx), DATA_SAVE)
603  AS2( add WORD_REG(dx), 64)
604  AS2( mov AS_REG_7, STATE_SAVE)
605  AS2( mov DATA_SAVE, WORD_REG(dx))
606 
607 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
608 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
609  AS2( test DWORD PTR K_END, 1)
610  ASJ( jz, 4, f)
611 #endif
612  AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16])
613  AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16])
614  AS2( paddd xmm1, E(0))
615  AS2( paddd xmm0, A(0))
616  AS2( movdqa [AS_REG_7+1*16], xmm1)
617  AS2( movdqa [AS_REG_7+0*16], xmm0)
618  AS2( cmp WORD_REG(dx), DATA_END)
619  ATT_NOPREFIX
620  ASJ( jb, 0, b)
621  INTEL_NOPREFIX
622 #endif
623 
624 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
625 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
626  ASJ( jmp, 5, f)
627  ASL(4) // non-SSE2
628 #endif
629  AS2( add [AS_REG_7+0*4], ecx) // A
630  AS2( add [AS_REG_7+4*4], edi) // E
631  AS2( mov eax, B(0))
632  AS2( mov ebx, C(0))
633  AS2( mov ecx, D(0))
634  AS2( add [AS_REG_7+1*4], eax)
635  AS2( add [AS_REG_7+2*4], ebx)
636  AS2( add [AS_REG_7+3*4], ecx)
637  AS2( mov eax, F(0))
638  AS2( mov ebx, G(0))
639  AS2( mov ecx, H(0))
640  AS2( add [AS_REG_7+5*4], eax)
641  AS2( add [AS_REG_7+6*4], ebx)
642  AS2( add [AS_REG_7+7*4], ecx)
643  AS2( mov ecx, AS_REG_7d)
644  AS2( cmp WORD_REG(dx), DATA_END)
645  ASJ( jb, 2, b)
646 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
647  ASL(5)
648 #endif
649 #endif
650 
651  AS_POP_IF86(sp)
652  AS_POP_IF86(bp)
653  #if !defined(_MSC_VER) || (_MSC_VER < 1400)
654  AS_POP_IF86(bx)
655  #endif
656 
657 #ifdef CRYPTOPP_GENERATE_X64_MASM
658  add rsp, LOCALS_SIZE+8
659  pop rbp
660  pop rbx
661  pop rdi
662  pop rsi
663  ret
664  X86_SHA256_HashBlocks ENDP
665 #endif
666 
667 #ifdef __GNUC__
668  ATT_PREFIX
669  :
670  : "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len)
671  #if CRYPTOPP_BOOL_X64
672  , "m" (workspace[0])
673  #endif
674  : "memory", "cc", "%eax"
675  #if CRYPTOPP_BOOL_X64
676  , "%rbx", "%r8", "%r10"
677  #endif
678  );
679 #endif
680 }
681 
682 #endif // (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM))
683 
684 #ifndef CRYPTOPP_GENERATE_X64_MASM
685 
686 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
687 extern "C" {
688 void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len);
689 }
690 #endif
691 
692 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
693 static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
694 #endif
695 
696 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
697 
698 pfnSHAHashBlocks InitializeSHA256HashBlocks()
699 {
700 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
701  if (HasSHA())
702  return &SHA256_SSE_SHA_HashBlocks;
703  else
704 #endif
705 
706  return &X86_SHA256_HashBlocks;
707 }
708 
size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
{
 // Resolve the fastest block-hashing routine once on first call.
 static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
 // length & -BLOCKSIZE rounds down to whole blocks (assumes BLOCKSIZE is a
 // power of two). "- !HasSSE2()" sets the length's low bit on non-SSE2
 // machines; X86_SHA256_HashBlocks tests that bit ("test edi, 1") to select
 // its non-SSE2 code path.
 s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
 return length % BLOCKSIZE;
}
715 
size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
{
 // SHA-224 shares the SHA-256 compression function; see
 // SHA256::HashMultipleBlocks for the length-low-bit encoding.
 static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
 s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
 return length % BLOCKSIZE;
}
722 #endif
723 
// SHA-256 rolling 16-word message schedule, updated in place in W.
#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))

// Round logical functions Ch and Maj (FIPS 180-4, section 4.1.2).
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) (y^((x^y)&(y^z)))

// Working-variable accessors: T[] holds a..h as a rotating window indexed by
// the round number i, so no explicit variable rotation is needed per round.
#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]

// One SHA-256 round; j is the 16-round batch offset supplied by the caller,
// and blk0/blk2 load or extend the schedule for the first/later batches.
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
 d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))

// for SHA256
#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
746 
747 #if defined(__OPTIMIZE_SIZE__)
748 // Smaller but slower
// Size-optimized SHA-256 compression of one 64-byte block from `data` into
// `state`. Instead of rotating eight named variables each round, the working
// state lives in T[20] and the pointer `t` slides downward one slot per round
// (jumping back up 8 every 8 rounds); each round writes its results at two
// mirrored offsets (t[3]/t[3+8], t[-1]/t[7]) so the window stays consistent.
void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
 word32 W[32], T[20];
 unsigned int i = 0, j = 0;
 word32 *t = T+8;

 memcpy(t, state, 8*4);
 word32 e = t[4], a = t[0];

 // Rounds 0-15: schedule words come straight from the input block.
 do
 {
 word32 w = data[j];
 W[j] = w;
 w += SHA256_K[j];
 w += t[7];
 w += S1(e);
 w += Ch(e, t[5], t[6]);
 e = t[3] + w;
 t[3] = t[3+8] = e;
 w += S0(t[0]);
 a = w + Maj(a, t[1], t[2]);
 t[-1] = t[7] = a;
 --t;
 ++j;
 if (j%8 == 0)
 t += 8;
 } while (j<16);

 // Rounds 16-63, two rounds per iteration; W is extended on the fly and
 // mirrored at W[i] and W[i+16] so reads never need index wrap-around.
 do
 {
 i = j&0xf;
 word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
 W[i+16] = W[i] = w;
 w += SHA256_K[j];
 w += t[7];
 w += S1(e);
 w += Ch(e, t[5], t[6]);
 e = t[3] + w;
 t[3] = t[3+8] = e;
 w += S0(t[0]);
 a = w + Maj(a, t[1], t[2]);
 t[-1] = t[7] = a;

 w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
 W[(i+1)+16] = W[(i+1)] = w;
 w += SHA256_K[j+1];
 w += (t-1)[7];
 w += S1(e);
 w += Ch(e, (t-1)[5], (t-1)[6]);
 e = (t-1)[3] + w;
 (t-1)[3] = (t-1)[3+8] = e;
 w += S0((t-1)[0]);
 a = w + Maj(a, (t-1)[1], (t-1)[2]);
 (t-1)[-1] = (t-1)[7] = a;

 t-=2;
 j+=2;
 if (j%8 == 0)
 t += 8;
 } while (j<64);

 // Fold the final working window back into the chaining value.
 state[0] += a;
 state[1] += t[1];
 state[2] += t[2];
 state[3] += t[3];
 state[4] += e;
 state[5] += t[5];
 state[6] += t[6];
 state[7] += t[7];
}
819 #else
820 // Bigger but faster
// Speed-optimized SHA-256 compression of one 64-byte block from `data` into
// `state`. The R macro above performs one round using the rotating-window
// accessors a(i)..h(i) over T[8]; the outer loop runs four 16-round batches.
void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
 word32 W[16], T[8];
 /* Copy context->state[] to working vars */
 memcpy(T, state, sizeof(T));
 /* 64 operations, partially loop unrolled */
 for (unsigned int j=0; j<64; j+=16)
 {
 R( 0); R( 1); R( 2); R( 3);
 R( 4); R( 5); R( 6); R( 7);
 R( 8); R( 9); R(10); R(11);
 R(12); R(13); R(14); R(15);
 }
 /* Add the working vars back into context.state[] */
 state[0] += a(0);
 state[1] += b(0);
 state[2] += c(0);
 state[3] += d(0);
 state[4] += e(0);
 state[5] += f(0);
 state[6] += g(0);
 state[7] += h(0);
}
844 #endif // __OPTIMIZE_SIZE__
845 
846 #undef S0
847 #undef S1
848 #undef s0
849 #undef s1
850 #undef R
851 
852 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
// Single-block wrapper over the assembly routine, used when the caller's
// buffer must not be modified (the copy into W keeps `data` intact).
static void SHA256_SSE2_Transform(word32 *state, const word32 *data)
{
 // this byte reverse is a waste of time, but this function is only called by MDC
 word32 W[16];
 ByteReverse(W, data, SHA256::BLOCKSIZE);
 // "- !HasSSE2()" sets the length's low bit on non-SSE2 machines, steering
 // X86_SHA256_HashBlocks onto its non-SSE2 code path.
 X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
}
860 #endif // CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
861 
862 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
863 static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
864 {
865  return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
866 }
867 #endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
868 
869 ///////////////////////////////////
870 // start of Walton/Gulley's code //
871 ///////////////////////////////////
872 
873 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
874 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
875 static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
876 {
877  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
878  CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
879 
880  __m128i STATE0, STATE1;
881  __m128i MSG, TMP, MASK;
882  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
883  __m128i ABEF_SAVE, CDGH_SAVE;
884 
885  // Load initial values
886  TMP = _mm_loadu_si128((__m128i*) &state[0]);
887  STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
888  MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
889 
890  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
891  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
892  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
893  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
894 
895  while (length)
896  {
897  // Save current hash
898  ABEF_SAVE = STATE0;
899  CDGH_SAVE = STATE1;
900 
901  // Rounds 0-3
902  MSG = _mm_loadu_si128((__m128i*) data+0);
903  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
904  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
905  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
906  MSG = _mm_shuffle_epi32(MSG, 0x0E);
907  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
908 
909  // Rounds 4-7
910  TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
911  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
912  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
913  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
914  MSG = _mm_shuffle_epi32(MSG, 0x0E);
915  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
916  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
917 
918  // Rounds 8-11
919  TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
920  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
921  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
922  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
923  MSG = _mm_shuffle_epi32(MSG, 0x0E);
924  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
925  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
926 
927  // Rounds 12-15
928  TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
929  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
930  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
931  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
932  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
933  TMSG0 = _mm_add_epi32(TMSG0, TMP);
934  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
935  MSG = _mm_shuffle_epi32(MSG, 0x0E);
936  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
937  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
938 
939  // Rounds 16-19
940  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
941  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
942  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
943  TMSG1 = _mm_add_epi32(TMSG1, TMP);
944  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
945  MSG = _mm_shuffle_epi32(MSG, 0x0E);
946  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
947  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
948 
949  // Rounds 20-23
950  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
951  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
952  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
953  TMSG2 = _mm_add_epi32(TMSG2, TMP);
954  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
955  MSG = _mm_shuffle_epi32(MSG, 0x0E);
956  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
957  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
958 
959  // Rounds 24-27
960  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
961  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
962  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
963  TMSG3 = _mm_add_epi32(TMSG3, TMP);
964  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
965  MSG = _mm_shuffle_epi32(MSG, 0x0E);
966  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
967  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
968 
969  // Rounds 28-31
970  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
971  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
972  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
973  TMSG0 = _mm_add_epi32(TMSG0, TMP);
974  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
975  MSG = _mm_shuffle_epi32(MSG, 0x0E);
976  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
977  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
978 
979  // Rounds 32-35
980  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
981  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
982  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
983  TMSG1 = _mm_add_epi32(TMSG1, TMP);
984  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
985  MSG = _mm_shuffle_epi32(MSG, 0x0E);
986  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
987  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
988 
989  // Rounds 36-39
990  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
991  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
992  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
993  TMSG2 = _mm_add_epi32(TMSG2, TMP);
994  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
995  MSG = _mm_shuffle_epi32(MSG, 0x0E);
996  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
997  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
998 
999  // Rounds 40-43
1000  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
1001  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1002  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
1003  TMSG3 = _mm_add_epi32(TMSG3, TMP);
1004  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
1005  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1006  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1007  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
1008 
1009  // Rounds 44-47
1010  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
1011  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1012  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
1013  TMSG0 = _mm_add_epi32(TMSG0, TMP);
1014  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
1015  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1016  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1017  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
1018 
1019  // Rounds 48-51
1020  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
1021  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1022  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
1023  TMSG1 = _mm_add_epi32(TMSG1, TMP);
1024  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
1025  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1026  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1027  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
1028 
1029  // Rounds 52-55
1030  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
1031  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1032  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
1033  TMSG2 = _mm_add_epi32(TMSG2, TMP);
1034  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
1035  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1036  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1037 
1038  // Rounds 56-59
1039  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
1040  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1041  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
1042  TMSG3 = _mm_add_epi32(TMSG3, TMP);
1043  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
1044  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1045  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1046 
1047  // Rounds 60-63
1048  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
1049  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1050  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1051  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1052 
1053  // Add values back to state
1054  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
1055  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
1056 
1057  data += 16;
1058  length -= SHA256::BLOCKSIZE;
1059  }
1060 
1061  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
1062  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
1063  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
1064  STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
1065 
1066  // Save state
1067  _mm_storeu_si128((__m128i*) &state[0], STATE0);
1068  _mm_storeu_si128((__m128i*) &state[4], STATE1);
1069 }
1070 #endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
1071 
1072 /////////////////////////////////
1073 // end of Walton/Gulley's code //
1074 /////////////////////////////////
1075 
1076 pfnSHATransform InitializeSHA256Transform()
1077 {
1078 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
1079  if (HasSHA())
1080  return &SHA256_SSE_SHA_Transform;
1081  else
1082 #endif
1083 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
1084  if (HasSSE2())
1085  return &SHA256_SSE2_Transform;
1086  else
1087 #endif
1088 
1089  return &SHA256_CXX_Transform;
1090 }
1091 
// Compress one SHA-256 block: mix data[] into state[] using the
// CPU-appropriate implementation. The dispatch pointer is initialized on
// first call (function-local static) and reused for the process lifetime.
void SHA256::Transform(word32 *state, const word32 *data)
{
	static const pfnSHATransform s_pfn = InitializeSHA256Transform();
	s_pfn(state, data);
}
1097 
1098 // *************************************************************
1099 
1100 void SHA384::InitState(HashWordType *state)
1101 {
1102  static const word64 s[8] = {
1103  W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507),
1104  W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939),
1105  W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511),
1106  W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)};
1107  memcpy(state, s, sizeof(s));
1108 }
1109 
1110 void SHA512::InitState(HashWordType *state)
1111 {
1112  static const word64 s[8] = {
1113  W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
1114  W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
1115  W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
1116  W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)};
1117  memcpy(state, s, sizeof(s));
1118 }
1119 
1120 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
1121 CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
1122 #else
1123 CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
1124 #endif
1125  W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
1126  W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
1127  W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
1128  W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
1129  W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
1130  W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
1131  W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
1132  W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
1133  W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
1134  W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
1135  W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
1136  W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
1137  W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
1138  W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
1139  W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
1140  W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
1141  W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
1142  W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
1143  W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
1144  W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
1145  W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
1146  W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
1147  W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
1148  W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
1149  W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
1150  W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
1151  W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
1152  W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
1153  W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
1154  W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
1155  W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
1156  W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
1157  W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
1158  W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
1159  W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
1160  W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
1161  W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
1162  W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
1163  W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
1164  W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
1165 };
1166 
1167 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
1168 // put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
// SHA-512 compression of one block in hand-written MMX/SSE2 assembly (32-bit x86).
// Register roles established below: ecx = state, edx = data (FASTCALL / the GCC
// "c" and "d" constraints), ebx = SHA512_K, edi = working-state window on the
// stack (decremented each round so the a..h rotation needs no copying),
// esi = expanded message schedule on the stack, eax = round counter.
CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
{
#ifdef __GNUC__
	__asm__ __volatile__
	(
	INTEL_NOPREFIX
	AS_PUSH_IF86( bx)
	AS2( mov ebx, eax) // eax carries SHA512_K via the "a" input constraint below
#else
	AS1( push ebx)
	AS1( push esi)
	AS1( push edi)
	AS2( lea ebx, SHA512_K)
#endif

	// Carve out a 16-byte aligned scratch area; the original esp is saved below it
	AS2( mov eax, esp)
	AS2( and esp, 0xfffffff0)
	AS2( sub esp, 27*16) // 17*16 for expanded data, 20*8 for state
	AS_PUSH_IF86( ax)
	AS2( xor eax, eax)

#if CRYPTOPP_BOOL_X32
	AS2( lea edi, [esp+8+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying
	AS2( lea esi, [esp+8+20*8+8]) // 16-byte alignment, then add 8
#else
	AS2( lea edi, [esp+4+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying
	AS2( lea esi, [esp+4+20*8+8]) // 16-byte alignment, then add 8
#endif

	// Copy the eight state words (64 bytes) into the stack window;
	// keep a in mm4 and e in mm5 cached in MMX registers.
	AS2( movdqa xmm0, [ecx+0*16])
	AS2( movdq2q mm4, xmm0)
	AS2( movdqa [edi+0*16], xmm0)
	AS2( movdqa xmm0, [ecx+1*16])
	AS2( movdqa [edi+1*16], xmm0)
	AS2( movdqa xmm0, [ecx+2*16])
	AS2( movdq2q mm5, xmm0)
	AS2( movdqa [edi+2*16], xmm0)
	AS2( movdqa xmm0, [ecx+3*16])
	AS2( movdqa [edi+3*16], xmm0)
	ASJ( jmp, 0, f)

// Big-sigma on one 64-bit MMX lane: r = rotr(r,a) ^ rotr(r,b) ^ rotr(r,c),
// built from shifts since MMX has no 64-bit rotate. Clobbers mm6/mm7.
#define SSE2_S0_S1(r, a, b, c) \
	AS2( movq mm6, r)\
	AS2( psrlq r, a)\
	AS2( movq mm7, r)\
	AS2( psllq mm6, 64-c)\
	AS2( pxor mm7, mm6)\
	AS2( psrlq r, b-a)\
	AS2( pxor mm7, r)\
	AS2( psllq mm6, c-b)\
	AS2( pxor mm7, mm6)\
	AS2( psrlq r, c-b)\
	AS2( pxor r, mm7)\
	AS2( psllq mm6, b-a)\
	AS2( pxor r, mm6)

// Two-lane (SSE2) small-sigma: r = rotr(r,a) ^ shr(r,b) ^ rotr(r,c).
// Invoked as SSE2_s0(x,1,7,8) = sigma0 of FIPS 180-4. Clobbers xmm6/xmm7.
#define SSE2_s0(r, a, b, c) \
	AS2( movdqa xmm6, r)\
	AS2( psrlq r, a)\
	AS2( movdqa xmm7, r)\
	AS2( psllq xmm6, 64-c)\
	AS2( pxor xmm7, xmm6)\
	AS2( psrlq r, b-a)\
	AS2( pxor xmm7, r)\
	AS2( psrlq r, c-b)\
	AS2( pxor r, xmm7)\
	AS2( psllq xmm6, c-a)\
	AS2( pxor r, xmm6)

// Two-lane (SSE2) small-sigma: r = shr(r,a) ^ rotr(r,b) ^ rotr(r,c).
// Invoked as SSE2_s1(x,6,19,61) = sigma1 of FIPS 180-4. Clobbers xmm6/xmm7.
#define SSE2_s1(r, a, b, c) \
	AS2( movdqa xmm6, r)\
	AS2( psrlq r, a)\
	AS2( movdqa xmm7, r)\
	AS2( psllq xmm6, 64-c)\
	AS2( pxor xmm7, xmm6)\
	AS2( psrlq r, b-a)\
	AS2( pxor xmm7, r)\
	AS2( psllq xmm6, c-b)\
	AS2( pxor xmm7, xmm6)\
	AS2( psrlq r, c-b)\
	AS2( pxor r, xmm7)

	// Subroutine: one SHA-512 round, called once per round below.
	ASL(SHA512_Round)
	// k + w is in mm0, a is in mm4, e is in mm5
	AS2( paddq mm0, [edi+7*8]) // h
	AS2( movq mm2, [edi+5*8]) // f
	AS2( movq mm3, [edi+6*8]) // g
	AS2( pxor mm2, mm3)
	AS2( pand mm2, mm5)
	SSE2_S0_S1(mm5,14,18,41)
	AS2( pxor mm2, mm3)
	AS2( paddq mm0, mm2) // h += Ch(e,f,g)
	AS2( paddq mm5, mm0) // h += S1(e)
	AS2( movq mm2, [edi+1*8]) // b
	AS2( movq mm1, mm2)
	AS2( por mm2, mm4)
	AS2( pand mm2, [edi+2*8]) // c
	AS2( pand mm1, mm4)
	AS2( por mm1, mm2)
	AS2( paddq mm1, mm5) // temp = h + Maj(a,b,c)
	AS2( paddq mm5, [edi+3*8]) // e = d + h
	AS2( movq [edi+3*8], mm5)
	AS2( movq [edi+11*8], mm5) // second copy 8 words up, for the sliding window
	SSE2_S0_S1(mm4,28,34,39) // S0(a)
	AS2( paddq mm4, mm1) // a = temp + S0(a)
	AS2( movq [edi-8], mm4)
	AS2( movq [edi+7*8], mm4)
	AS1( ret)

	// first 16 rounds
	ASL(0)
	AS2( movq mm0, [edx+eax*8])
	AS2( movq [esi+eax*8], mm0)
	AS2( movq [esi+eax*8+16*8], mm0) // W stored twice, 16 words apart
	AS2( paddq mm0, [ebx+eax*8])
	ASC( call, SHA512_Round)
	AS1( inc eax)
	AS2( sub edi, 8)
	AS2( test eax, 7)
	ASJ( jnz, 0, b)
	AS2( add edi, 8*8) // rewind the state window every 8 rounds
	AS2( cmp eax, 16)
	ASJ( jne, 0, b)

	// rest of the rounds
	AS2( movdqu xmm0, [esi+(16-2)*8])
	ASL(1)
	// data expansion, W[i-2] already in xmm0
	AS2( movdqu xmm3, [esi])
	AS2( paddq xmm3, [esi+(16-7)*8])
	AS2( movdqa xmm2, [esi+(16-15)*8])
	SSE2_s1(xmm0, 6, 19, 61)
	AS2( paddq xmm0, xmm3)
	SSE2_s0(xmm2, 1, 7, 8)
	AS2( paddq xmm0, xmm2)
	AS2( movdq2q mm0, xmm0)
	AS2( movhlps xmm1, xmm0)
	AS2( paddq mm0, [ebx+eax*8])
	AS2( movlps [esi], xmm0)
	AS2( movlps [esi+8], xmm1)
	AS2( movlps [esi+8*16], xmm0) // keep the duplicate copy in sync
	AS2( movlps [esi+8*17], xmm1)
	// 2 rounds
	ASC( call, SHA512_Round)
	AS2( sub edi, 8)
	AS2( movdq2q mm0, xmm1)
	AS2( paddq mm0, [ebx+eax*8+8])
	ASC( call, SHA512_Round)
	// update indices and loop
	AS2( add esi, 16)
	AS2( add eax, 2)
	AS2( sub edi, 8)
	AS2( test eax, 7)
	ASJ( jnz, 1, b)
	// do housekeeping every 8 rounds
	AS2( mov esi, 0xf)
	AS2( and esi, eax)
#if CRYPTOPP_BOOL_X32
	AS2( lea esi, [esp+8+20*8+8+esi*8])
#else
	AS2( lea esi, [esp+4+20*8+8+esi*8])
#endif
	AS2( add edi, 8*8)
	AS2( cmp eax, 80)
	ASJ( jne, 1, b)

// Fold the working state back into state[]: two word64 lanes per paddq.
#define SSE2_CombineState(i) \
	AS2( movdqa xmm0, [edi+i*16])\
	AS2( paddq xmm0, [ecx+i*16])\
	AS2( movdqa [ecx+i*16], xmm0)

	SSE2_CombineState(0)
	SSE2_CombineState(1)
	SSE2_CombineState(2)
	SSE2_CombineState(3)

	AS_POP_IF86( sp) // restore the original esp saved before alignment
	AS1( emms) // exit MMX state so x87 FP code works again

#if defined(__GNUC__)
	AS_POP_IF86( bx)
	ATT_PREFIX
	:
	: "a" (SHA512_K), "c" (state), "d" (data)
	: "%esi", "%edi", "memory", "cc"
	);
#else
	AS1( pop edi)
	AS1( pop esi)
	AS1( pop ebx)
	AS1( ret)
#endif
}
1362 #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
1363 
// Compress one SHA-512 block: mix the 16-word message block data[0..15]
// into the 8-word chaining state state[0..7] (FIPS 180-4).
void SHA512::Transform(word64 *state, const word64 *data)
{
	CRYPTOPP_ASSERT(IsAlignedOn(state, GetAlignmentOf<word64>()));
	CRYPTOPP_ASSERT(IsAlignedOn(data, GetAlignmentOf<word64>()));

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
	// Prefer the hand-written SSE2/MMX routine on 32-bit x86
	if (HasSSE2())
	{
		SHA512_SSE2_Transform(state, data);
		return;
	}
#endif

// Big-sigma (S0/S1) and small-sigma (s0/s1) functions of FIPS 180-4, sec. 4.1.3
#define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
#define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
#define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
#define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))

// One SHA-512 round. a(i)..h(i), Ch, Maj, blk0 and blk2 are macros defined
// earlier in this file (outside this excerpt); j==0 loads message words via
// blk0, later iterations expand the schedule via blk2 — TODO confirm blk2
// implements the W[t] recurrence, as it is not visible here.
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\
	d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))

	word64 W[16];   // rolling 16-entry message schedule window
	word64 T[8];    // working variables a..h (accessed through the a..h macros)
	/* Copy context->state[] to working vars */
	memcpy(T, state, sizeof(T));
	/* 80 operations, partially loop unrolled */
	for (unsigned int j=0; j<80; j+=16)
	{
		R( 0); R( 1); R( 2); R( 3);
		R( 4); R( 5); R( 6); R( 7);
		R( 8); R( 9); R(10); R(11);
		R(12); R(13); R(14); R(15);
	}
	/* Add the working vars back into context.state[] */
	state[0] += a(0);
	state[1] += b(0);
	state[2] += c(0);
	state[3] += d(0);
	state[4] += e(0);
	state[5] += f(0);
	state[6] += g(0);
	state[7] += h(0);
}
1407 
1408 NAMESPACE_END
1409 
1410 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
1411 #endif // #ifndef CRYPTOPP_IMPORTS
/* NOTE(review): the lines below are Doxygen cross-reference residue captured
 * from the generated documentation page; they are not part of sha.cpp proper.
 * Preserved verbatim inside a comment so they cannot be parsed as code:
 *
 * bool HasSHA()
 * Determines SHA availability.
 * Definition: cpu.h:220
 * Utility functions for the Crypto++ library.
 * Library configuration file.
 * Classes and functions for secure memory allocations.
 * bool IsAlignedOn(const void *ptr, unsigned int alignment)
 * Determines whether ptr is aligned to a minimum value.
 * Definition: misc.h:954
 * Fixed size stack-based SecBlock with 16-byte alignment.
 * Definition: secblock.h:766
 * #define CRYPTOPP_ASSERT(exp)
 * Debugging and diagnostic assertion.
 * Definition: trap.h:62
 * Functions for CPU features and intrinsics.
 * Classes for SHA-1 and SHA-2 family of message digests.
 * bool HasSSE2()
 * Determines SSE2 availability.
 * Definition: cpu.h:165
 * Crypto++ library namespace.
 * byte ByteReverse(byte value)
 * Reverses bytes in an 8-bit value.
 * Definition: misc.h:1663
 */