Crypto++ 5.6.5
Free C++ class library of cryptographic schemes
sha.cpp
1 // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
2 
3 // Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
4 // implemented Intel SHA extensions based on Intel articles and code by
5 // Sean Gulley. All code is in the public domain.
6 
7 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
8 
9 #include "pch.h"
10 #include "config.h"
11 
12 #if CRYPTOPP_MSC_VERSION
13 # pragma warning(disable: 4100 4731)
14 #endif
15 
16 #ifndef CRYPTOPP_IMPORTS
17 #ifndef CRYPTOPP_GENERATE_X64_MASM
18 
19 #include "secblock.h"
20 #include "sha.h"
21 #include "misc.h"
22 #include "cpu.h"
23 
24 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
25 # undef CRYPTOPP_X86_ASM_AVAILABLE
26 # undef CRYPTOPP_X32_ASM_AVAILABLE
27 # undef CRYPTOPP_X64_ASM_AVAILABLE
28 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
29 #endif
30 
31 NAMESPACE_BEGIN(CryptoPP)
32 
33 // Function pointer types for the runtime-selected SHA1/SHA256 Transform and HashBlocks implementations
34 typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
35 typedef void (CRYPTOPP_FASTCALL *pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
36 
37 ////////////////////////////////
38 // start of Steve Reid's code //
39 ////////////////////////////////
40 
41 #define blk0(i) (W[i] = data[i])
42 #define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))
43 
44 #define f1(x,y,z) (z^(x&(y^z)))
45 #define f2(x,y,z) (x^y^z)
46 #define f3(x,y,z) ((x&y)|(z&(x|y)))
47 #define f4(x,y,z) (x^y^z)
48 
49 /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
50 #define R0(v,w,x,y,z,i) z+=f1(w,x,y)+blk0(i)+0x5A827999+rotlFixed(v,5);w=rotlFixed(w,30);
51 #define R1(v,w,x,y,z,i) z+=f1(w,x,y)+blk1(i)+0x5A827999+rotlFixed(v,5);w=rotlFixed(w,30);
52 #define R2(v,w,x,y,z,i) z+=f2(w,x,y)+blk1(i)+0x6ED9EBA1+rotlFixed(v,5);w=rotlFixed(w,30);
53 #define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
54 #define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
55 
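// Editorial note (not in the original source): with the rotating argument
// order used in SHA1_CXX_Transform below, R0(a,b,c,d,e,0) expands to
//
//   e += f1(b,c,d) + (W[0] = data[0]) + 0x5A827999 + rotlFixed(a,5);
//   b = rotlFixed(b,30);
//
// so each round updates one working variable in place, and blk1() keeps only
// a 16-word circular window of the 80-word message schedule.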
56 static void SHA1_CXX_Transform(word32 *state, const word32 *data)
57 {
58  word32 W[16];
59  /* Copy context->state[] to working vars */
60  word32 a = state[0];
61  word32 b = state[1];
62  word32 c = state[2];
63  word32 d = state[3];
64  word32 e = state[4];
65  /* 4 rounds of 20 operations each. Loop unrolled. */
66  R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
67  R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
68  R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
69  R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
70  R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
71  R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
72  R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
73  R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
74  R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
75  R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
76  R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
77  R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
78  R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
79  R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
80  R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
81  R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
82  R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
83  R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
84  R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
85  R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
86  /* Add the working vars back into context.state[] */
87  state[0] += a;
88  state[1] += b;
89  state[2] += c;
90  state[3] += d;
91  state[4] += e;
92 }
93 
94 //////////////////////////////
95 // end of Steve Reid's code //
96 //////////////////////////////
97 
98 ///////////////////////////////////
99 // start of Walton/Gulley's code //
100 ///////////////////////////////////
101 
102 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
103 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
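// The immediate argument of _mm_sha1rnds4_epu32 selects the round group:
// 0 applies Ch with K = 0x5A827999 (rounds 0-19), 1 Parity with 0x6ED9EBA1
// (rounds 20-39), 2 Maj with 0x8F1BBCDC (rounds 40-59), and 3 Parity with
// 0xCA62C1D6 (rounds 60-79).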
104 static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
105 {
106  __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
107  __m128i MASK, MSG0, MSG1, MSG2, MSG3;
108 
109  word32 T[16];
110  ByteReverse(T, data, 64);
111 
112  // Load initial values
113  ABCD = _mm_loadu_si128((__m128i*) state);
114  E0 = _mm_set_epi32(state[4], 0, 0, 0);
115  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
116  MASK = _mm_set_epi64x(W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f));
117 
118  // Save current hash
119  ABCD_SAVE = ABCD;
120  E0_SAVE = E0;
121 
122  // Rounds 0-3
123  MSG0 = _mm_loadu_si128((__m128i*) T+0);
124  MSG0 = _mm_shuffle_epi8(MSG0, MASK);
125  E0 = _mm_add_epi32(E0, MSG0);
126  E1 = ABCD;
127  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
128 
129  // Rounds 4-7
130  MSG1 = _mm_loadu_si128((__m128i*) (T+4));
131  MSG1 = _mm_shuffle_epi8(MSG1, MASK);
132  E1 = _mm_sha1nexte_epu32(E1, MSG1);
133  E0 = ABCD;
134  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
135  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
136 
137  // Rounds 8-11
138  MSG2 = _mm_loadu_si128((__m128i*) (T+8));
139  MSG2 = _mm_shuffle_epi8(MSG2, MASK);
140  E0 = _mm_sha1nexte_epu32(E0, MSG2);
141  E1 = ABCD;
142  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
143  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
144  MSG0 = _mm_xor_si128(MSG0, MSG2);
145 
146  // Rounds 12-15
147  MSG3 = _mm_loadu_si128((__m128i*) (T+12));
148  MSG3 = _mm_shuffle_epi8(MSG3, MASK);
149  E1 = _mm_sha1nexte_epu32(E1, MSG3);
150  E0 = ABCD;
151  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
152  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
153  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
154  MSG1 = _mm_xor_si128(MSG1, MSG3);
155 
156  // Rounds 16-19
157  E0 = _mm_sha1nexte_epu32(E0, MSG0);
158  E1 = ABCD;
159  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
160  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
161  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
162  MSG2 = _mm_xor_si128(MSG2, MSG0);
163 
164  // Rounds 20-23
165  E1 = _mm_sha1nexte_epu32(E1, MSG1);
166  E0 = ABCD;
167  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
168  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
169  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
170  MSG3 = _mm_xor_si128(MSG3, MSG1);
171 
172  // Rounds 24-27
173  E0 = _mm_sha1nexte_epu32(E0, MSG2);
174  E1 = ABCD;
175  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
176  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
177  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
178  MSG0 = _mm_xor_si128(MSG0, MSG2);
179 
180  // Rounds 28-31
181  E1 = _mm_sha1nexte_epu32(E1, MSG3);
182  E0 = ABCD;
183  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
184  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
185  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
186  MSG1 = _mm_xor_si128(MSG1, MSG3);
187 
188  // Rounds 32-35
189  E0 = _mm_sha1nexte_epu32(E0, MSG0);
190  E1 = ABCD;
191  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
192  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
193  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
194  MSG2 = _mm_xor_si128(MSG2, MSG0);
195 
196  // Rounds 36-39
197  E1 = _mm_sha1nexte_epu32(E1, MSG1);
198  E0 = ABCD;
199  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
200  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
201  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
202  MSG3 = _mm_xor_si128(MSG3, MSG1);
203 
204  // Rounds 40-43
205  E0 = _mm_sha1nexte_epu32(E0, MSG2);
206  E1 = ABCD;
207  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
208  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
209  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
210  MSG0 = _mm_xor_si128(MSG0, MSG2);
211 
212  // Rounds 44-47
213  E1 = _mm_sha1nexte_epu32(E1, MSG3);
214  E0 = ABCD;
215  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
216  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
217  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
218  MSG1 = _mm_xor_si128(MSG1, MSG3);
219 
220  // Rounds 48-51
221  E0 = _mm_sha1nexte_epu32(E0, MSG0);
222  E1 = ABCD;
223  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
224  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
225  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
226  MSG2 = _mm_xor_si128(MSG2, MSG0);
227 
228  // Rounds 52-55
229  E1 = _mm_sha1nexte_epu32(E1, MSG1);
230  E0 = ABCD;
231  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
232  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
233  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
234  MSG3 = _mm_xor_si128(MSG3, MSG1);
235 
236  // Rounds 56-59
237  E0 = _mm_sha1nexte_epu32(E0, MSG2);
238  E1 = ABCD;
239  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
240  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
241  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
242  MSG0 = _mm_xor_si128(MSG0, MSG2);
243 
244  // Rounds 60-63
245  E1 = _mm_sha1nexte_epu32(E1, MSG3);
246  E0 = ABCD;
247  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
248  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
249  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
250  MSG1 = _mm_xor_si128(MSG1, MSG3);
251 
252  // Rounds 64-67
253  E0 = _mm_sha1nexte_epu32(E0, MSG0);
254  E1 = ABCD;
255  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
256  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
257  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
258  MSG2 = _mm_xor_si128(MSG2, MSG0);
259 
260  // Rounds 68-71
261  E1 = _mm_sha1nexte_epu32(E1, MSG1);
262  E0 = ABCD;
263  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
264  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
265  MSG3 = _mm_xor_si128(MSG3, MSG1);
266 
267  // Rounds 72-75
268  E0 = _mm_sha1nexte_epu32(E0, MSG2);
269  E1 = ABCD;
270  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
271  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
272 
273  // Rounds 76-79
274  E1 = _mm_sha1nexte_epu32(E1, MSG3);
275  E0 = ABCD;
276  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
277 
278  // Add values back to state
279  E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
280  ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
281 
282  // Save state
283  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
284  _mm_storeu_si128((__m128i*) state, ABCD);
285  state[4] = _mm_extract_epi32(E0, 3);
286 }
287 #endif
288 
289 /////////////////////////////////
290 // end of Walton/Gulley's code //
291 /////////////////////////////////
292 
293 pfnSHATransform InitializeSHA1Transform()
294 {
295 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
296  if (HasSHA())
297  return &SHA1_SSE_SHA_Transform;
298  else
299 #endif
300 
301  return &SHA1_CXX_Transform;
302 }
303 
304 void SHA1::InitState(HashWordType *state)
305 {
306  state[0] = 0x67452301L;
307  state[1] = 0xEFCDAB89L;
308  state[2] = 0x98BADCFEL;
309  state[3] = 0x10325476L;
310  state[4] = 0xC3D2E1F0L;
311 }
312 
313 void SHA1::Transform(word32 *state, const word32 *data)
314 {
315  static const pfnSHATransform s_pfn = InitializeSHA1Transform();
316  s_pfn(state, data);
317 }
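// A minimal editorial sketch (not part of the original source): a self-test of
// the block transform. The input words hold the block's big-endian values, so
// one transform of the padded empty message from the IV must reproduce the
// well-known digest da39a3ee5e6b4b0d3255bfef95601890afd80709. The guard macro
// is hypothetical and never defined, so the sample does not affect the build.
#if defined(CRYPTOPP_SHA_EDITORIAL_SAMPLES)
static bool SHA1_EmptyMessageSample()
{
 word32 st[5], blk[16] = {0x80000000}; // 0x80 pad byte, zero bit length
 SHA1::InitState(st);
 SHA1::Transform(st, blk);
 return st[0] == 0xda39a3ee; // first digest word
}
#endif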
318 
319 // *************************************************************
320 
321 void SHA224::InitState(HashWordType *state)
322 {
323  static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
324  memcpy(state, s, sizeof(s));
325 }
326 
327 void SHA256::InitState(HashWordType *state)
328 {
329  static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
330  memcpy(state, s, sizeof(s));
331 }
332 
333 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
334 CRYPTOPP_ALIGN_DATA(16) extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
335 #else
336 extern const word32 SHA256_K[64] = {
337 #endif
338  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
339  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
340  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
341  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
342  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
343  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
344  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
345  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
346  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
347  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
348  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
349  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
350  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
351  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
352  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
353  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
354 };
355 
356 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
357 
358 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM))
359 
360 static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len)
361 {
362  #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
363  #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
364  #define G(i) H(i+1)
365  #define F(i) H(i+2)
366  #define E(i) H(i+3)
367  #define D(i) H(i+4)
368  #define C(i) H(i+5)
369  #define B(i) H(i+6)
370  #define A(i) H(i+7)
371  #define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4
372  #define Wt_2(i) Wt((i)-2)
373  #define Wt_15(i) Wt((i)-15)
374  #define Wt_7(i) Wt((i)-7)
375  #define K_END [BASE+8*4+16*4+0*WORD_SZ]
376  #define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ]
377  #define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ]
378  #define DATA_END [BASE+8*4+16*4+3*WORD_SZ]
379  #define Kt(i) WORD_REG(si)+(i)*4
380 #if CRYPTOPP_BOOL_X32
381  #define BASE esp+8
382 #elif CRYPTOPP_BOOL_X86
383  #define BASE esp+4
384 #elif defined(__GNUC__)
385  #define BASE r8
386 #else
387  #define BASE rsp
388 #endif
389 
390 #define RA0(i, edx, edi) \
391  AS2( add edx, [Kt(i)] )\
392  AS2( add edx, [Wt(i)] )\
393  AS2( add edx, H(i) )\
394 
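// RA1 and RB0 intentionally expand to nothing: ROUND(i, r, ...) pastes both
// RA##r and RB##r, so round type 0 adds Kt, Wt and H up front via RA0, while
// round type 1 folds the message schedule into RB1 instead.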
395 #define RA1(i, edx, edi)
396 
397 #define RB0(i, edx, edi)
398 
399 #define RB1(i, edx, edi) \
400  AS2( mov AS_REG_7d, [Wt_2(i)] )\
401  AS2( mov edi, [Wt_15(i)])\
402  AS2( mov ebx, AS_REG_7d )\
403  AS2( shr AS_REG_7d, 10 )\
404  AS2( ror ebx, 17 )\
405  AS2( xor AS_REG_7d, ebx )\
406  AS2( ror ebx, 2 )\
407  AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\
408  AS2( add ebx, [Wt_7(i)])\
409  AS2( mov AS_REG_7d, edi )\
410  AS2( shr AS_REG_7d, 3 )\
411  AS2( ror edi, 7 )\
412  AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\
413  AS2( xor AS_REG_7d, edi )\
414  AS2( add edx, [Kt(i)])\
415  AS2( ror edi, 11 )\
416  AS2( add edx, H(i) )\
417  AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\
418  AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) + W_t-16 */\
419  AS2( mov [Wt(i)], AS_REG_7d)\
420  AS2( add edx, AS_REG_7d )\
421 
422 #define ROUND(i, r, eax, ecx, edi, edx)\
423  /* in: edi = E */\
424  /* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\
425  AS2( mov edx, F(i) )\
426  AS2( xor edx, G(i) )\
427  AS2( and edx, edi )\
428  AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\
429  AS2( mov AS_REG_7d, edi )\
430  AS2( ror edi, 6 )\
431  AS2( ror AS_REG_7d, 25 )\
432  RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
433  AS2( xor AS_REG_7d, edi )\
434  AS2( ror edi, 5 )\
435  AS2( xor AS_REG_7d, edi )/* S1(E) */\
436  AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\
437  RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
438  /* in: ecx = A, eax = B^C, edx = T1 */\
439  /* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\
440  AS2( mov ebx, ecx )\
441  AS2( xor ecx, B(i) )/* A^B */\
442  AS2( and eax, ecx )\
443  AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C)) */\
444  AS2( mov AS_REG_7d, ebx )\
445  AS2( ror ebx, 2 )\
446  AS2( add eax, edx )/* T1 + Maj(A,B,C) */\
447  AS2( add edx, D(i) )\
448  AS2( mov D(i), edx )\
449  AS2( ror AS_REG_7d, 22 )\
450  AS2( xor AS_REG_7d, ebx )\
451  AS2( ror ebx, 11 )\
452  AS2( xor AS_REG_7d, ebx )\
453  AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\
454  AS2( mov H(i), eax )\
455 
456 // Unroll the use of CRYPTOPP_BOOL_X64 in assembler math. The GAS assembler on X32 (version 2.25)
457 // complains "Error: invalid operands (*ABS* and *UND* sections) for `*` and `-`"
458 #if CRYPTOPP_BOOL_X64
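// On X64, WORD_SZ is 8: each mov/bswap below handles two adjacent 32-bit words
// and the 64-bit bswap also exchanges them, so the pair is stored at Wt(i*2+1).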
459 #define SWAP_COPY(i) \
460  AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
461  AS1( bswap WORD_REG(bx))\
462  AS2( mov [Wt(i*2+1)], WORD_REG(bx))
463 #else // X86 and X32
464 #define SWAP_COPY(i) \
465  AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
466  AS1( bswap WORD_REG(bx))\
467  AS2( mov [Wt(i)], WORD_REG(bx))
468 #endif
469 
470 #if defined(__GNUC__)
471  #if CRYPTOPP_BOOL_X64
472  FixedSizeAlignedSecBlock<byte, LOCALS_SIZE> workspace;
473  #endif
474  __asm__ __volatile__
475  (
476  #if CRYPTOPP_BOOL_X64
477  "lea %4, %%r8;"
478  #endif
479  INTEL_NOPREFIX
480 #elif defined(CRYPTOPP_GENERATE_X64_MASM)
481  ALIGN 8
482  X86_SHA256_HashBlocks PROC FRAME
483  rex_push_reg rsi
484  push_reg rdi
485  push_reg rbx
486  push_reg rbp
487  alloc_stack(LOCALS_SIZE+8)
488  .endprolog
489  mov rdi, r8
490  lea rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4]
491 #endif
492 
493 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
494  #ifndef __GNUC__
495  AS2( mov edi, [len])
496  AS2( lea WORD_REG(si), [SHA256_K+48*4])
497  #endif
498  #if !defined(_MSC_VER) || (_MSC_VER < 1400)
499  AS_PUSH_IF86(bx)
500  #endif
501 
502  AS_PUSH_IF86(bp)
503  AS2( mov ebx, esp)
504  AS2( and esp, -16)
505  AS2( sub WORD_REG(sp), LOCALS_SIZE)
506  AS_PUSH_IF86(bx)
507 #endif
508  AS2( mov STATE_SAVE, WORD_REG(cx))
509  AS2( mov DATA_SAVE, WORD_REG(dx))
510  AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)])
511  AS2( mov DATA_END, WORD_REG(ax))
512  AS2( mov K_END, WORD_REG(si))
513 
514 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
515 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
516  AS2( test edi, 1)
517  ASJ( jnz, 2, f)
518  AS1( dec DWORD PTR K_END)
519 #endif
520  AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16])
521  AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16])
522 #endif
523 
524 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
525 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
526  ASJ( jmp, 0, f)
527 #endif
528  ASL(2) // non-SSE2
529  AS2( mov esi, ecx)
530  AS2( lea edi, A(0))
531  AS2( mov ecx, 8)
532 ATT_NOPREFIX
533  AS1( rep movsd)
534 INTEL_NOPREFIX
535  AS2( mov esi, K_END)
536  ASJ( jmp, 3, f)
537 #endif
538 
539 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
540  ASL(0)
541  AS2( movdqa E(0), xmm1)
542  AS2( movdqa A(0), xmm0)
543 #endif
544 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
545  ASL(3)
546 #endif
547  AS2( sub WORD_REG(si), 48*4)
548  SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3)
549  SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7)
550 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
551  SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11)
552  SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15)
553 #endif
554  AS2( mov edi, E(0)) // E
555  AS2( mov eax, B(0)) // B
556  AS2( xor eax, C(0)) // B^C
557  AS2( mov ecx, A(0)) // A
558 
559  ROUND(0, 0, eax, ecx, edi, edx)
560  ROUND(1, 0, ecx, eax, edx, edi)
561  ROUND(2, 0, eax, ecx, edi, edx)
562  ROUND(3, 0, ecx, eax, edx, edi)
563  ROUND(4, 0, eax, ecx, edi, edx)
564  ROUND(5, 0, ecx, eax, edx, edi)
565  ROUND(6, 0, eax, ecx, edi, edx)
566  ROUND(7, 0, ecx, eax, edx, edi)
567  ROUND(8, 0, eax, ecx, edi, edx)
568  ROUND(9, 0, ecx, eax, edx, edi)
569  ROUND(10, 0, eax, ecx, edi, edx)
570  ROUND(11, 0, ecx, eax, edx, edi)
571  ROUND(12, 0, eax, ecx, edi, edx)
572  ROUND(13, 0, ecx, eax, edx, edi)
573  ROUND(14, 0, eax, ecx, edi, edx)
574  ROUND(15, 0, ecx, eax, edx, edi)
575 
576  ASL(1)
577  AS2(add WORD_REG(si), 4*16)
578  ROUND(0, 1, eax, ecx, edi, edx)
579  ROUND(1, 1, ecx, eax, edx, edi)
580  ROUND(2, 1, eax, ecx, edi, edx)
581  ROUND(3, 1, ecx, eax, edx, edi)
582  ROUND(4, 1, eax, ecx, edi, edx)
583  ROUND(5, 1, ecx, eax, edx, edi)
584  ROUND(6, 1, eax, ecx, edi, edx)
585  ROUND(7, 1, ecx, eax, edx, edi)
586  ROUND(8, 1, eax, ecx, edi, edx)
587  ROUND(9, 1, ecx, eax, edx, edi)
588  ROUND(10, 1, eax, ecx, edi, edx)
589  ROUND(11, 1, ecx, eax, edx, edi)
590  ROUND(12, 1, eax, ecx, edi, edx)
591  ROUND(13, 1, ecx, eax, edx, edi)
592  ROUND(14, 1, eax, ecx, edi, edx)
593  ROUND(15, 1, ecx, eax, edx, edi)
594  AS2( cmp WORD_REG(si), K_END)
595  ATT_NOPREFIX
596  ASJ( jb, 1, b)
597  INTEL_NOPREFIX
598 
599  AS2( mov WORD_REG(dx), DATA_SAVE)
600  AS2( add WORD_REG(dx), 64)
601  AS2( mov AS_REG_7, STATE_SAVE)
602  AS2( mov DATA_SAVE, WORD_REG(dx))
603 
604 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
605 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
606  AS2( test DWORD PTR K_END, 1)
607  ASJ( jz, 4, f)
608 #endif
609  AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16])
610  AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16])
611  AS2( paddd xmm1, E(0))
612  AS2( paddd xmm0, A(0))
613  AS2( movdqa [AS_REG_7+1*16], xmm1)
614  AS2( movdqa [AS_REG_7+0*16], xmm0)
615  AS2( cmp WORD_REG(dx), DATA_END)
616  ATT_NOPREFIX
617  ASJ( jb, 0, b)
618  INTEL_NOPREFIX
619 #endif
620 
621 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
622 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
623  ASJ( jmp, 5, f)
624  ASL(4) // non-SSE2
625 #endif
626  AS2( add [AS_REG_7+0*4], ecx) // A
627  AS2( add [AS_REG_7+4*4], edi) // E
628  AS2( mov eax, B(0))
629  AS2( mov ebx, C(0))
630  AS2( mov ecx, D(0))
631  AS2( add [AS_REG_7+1*4], eax)
632  AS2( add [AS_REG_7+2*4], ebx)
633  AS2( add [AS_REG_7+3*4], ecx)
634  AS2( mov eax, F(0))
635  AS2( mov ebx, G(0))
636  AS2( mov ecx, H(0))
637  AS2( add [AS_REG_7+5*4], eax)
638  AS2( add [AS_REG_7+6*4], ebx)
639  AS2( add [AS_REG_7+7*4], ecx)
640  AS2( mov ecx, AS_REG_7d)
641  AS2( cmp WORD_REG(dx), DATA_END)
642  ASJ( jb, 2, b)
643 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
644  ASL(5)
645 #endif
646 #endif
647 
648  AS_POP_IF86(sp)
649  AS_POP_IF86(bp)
650  #if !defined(_MSC_VER) || (_MSC_VER < 1400)
651  AS_POP_IF86(bx)
652  #endif
653 
654 #ifdef CRYPTOPP_GENERATE_X64_MASM
655  add rsp, LOCALS_SIZE+8
656  pop rbp
657  pop rbx
658  pop rdi
659  pop rsi
660  ret
661  X86_SHA256_HashBlocks ENDP
662 #endif
663 
664 #ifdef __GNUC__
665  ATT_PREFIX
666  :
667  : "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len)
668  #if CRYPTOPP_BOOL_X64
669  , "m" (workspace[0])
670  #endif
671  : "memory", "cc", "%eax"
672  #if CRYPTOPP_BOOL_X64
673  , "%rbx", "%r8", "%r10"
674  #endif
675  );
676 #endif
677 }
678 
679 #endif // (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM))
680 
681 #ifndef CRYPTOPP_GENERATE_X64_MASM
682 
683 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
684 extern "C" {
685 void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len);
686 }
687 #endif
688 
689 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
690 static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
691 #endif
692 
693 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
694 
695 pfnSHAHashBlocks InitializeSHA256HashBlocks()
696 {
697 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
698  if (HasSHA())
699  return &SHA256_SSE_SHA_HashBlocks;
700  else
701 #endif
702 
703  return &X86_SHA256_HashBlocks;
704 }
705 
706 size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
707 {
708  static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
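 // The byte count is rounded down to a whole number of blocks; subtracting 1
 // when SSE2 is unavailable sets the low bit, which the assembly tests
 // ("test edi, 1") to pick its non-SSE2 path. SHA224 below does the same.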
709  s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
710  return length % BLOCKSIZE;
711 }
712 
713 size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
714 {
715  static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
716  s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
717  return length % BLOCKSIZE;
718 }
719 #endif
720 
721 #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
722 
723 #define Ch(x,y,z) (z^(x&(y^z)))
724 #define Maj(x,y,z) (y^((x^y)&(y^z)))
725 
726 #define a(i) T[(0-i)&7]
727 #define b(i) T[(1-i)&7]
728 #define c(i) T[(2-i)&7]
729 #define d(i) T[(3-i)&7]
730 #define e(i) T[(4-i)&7]
731 #define f(i) T[(5-i)&7]
732 #define g(i) T[(6-i)&7]
733 #define h(i) T[(7-i)&7]
734 
735 #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
736  d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
737 
738 // for SHA256
739 #define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
740 #define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
741 #define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
742 #define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
743 
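// These are the FIPS 180-4 functions Sigma0/Sigma1 (S0/S1, applied to working
// variables) and sigma0/sigma1 (s0/s1, applied to the message schedule). As a
// worked editorial example, the first scheduled word is
//
//   W[16] = s1(W[14]) + W[9] + s0(W[1]) + W[0];
//
// which blk2(0) computes in place as W[0] during the second 16-round group.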
744 #if defined(__OPTIMIZE_SIZE__)
745 // Smaller but slower
746 void SHA256_CXX_Transform(word32 *state, const word32 *data)
747 {
748  word32 W[32], T[20];
749  unsigned int i = 0, j = 0;
750  word32 *t = T+8;
751 
752  memcpy(t, state, 8*4);
753  word32 e = t[4], a = t[0];
754 
755  do
756  {
757  word32 w = data[j];
758  W[j] = w;
759  w += SHA256_K[j];
760  w += t[7];
761  w += S1(e);
762  w += Ch(e, t[5], t[6]);
763  e = t[3] + w;
764  t[3] = t[3+8] = e;
765  w += S0(t[0]);
766  a = w + Maj(a, t[1], t[2]);
767  t[-1] = t[7] = a;
768  --t;
769  ++j;
770  if (j%8 == 0)
771  t += 8;
772  } while (j<16);
773 
774  do
775  {
776  i = j&0xf;
777  word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
778  W[i+16] = W[i] = w;
779  w += SHA256_K[j];
780  w += t[7];
781  w += S1(e);
782  w += Ch(e, t[5], t[6]);
783  e = t[3] + w;
784  t[3] = t[3+8] = e;
785  w += S0(t[0]);
786  a = w + Maj(a, t[1], t[2]);
787  t[-1] = t[7] = a;
788 
789  w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
790  W[(i+1)+16] = W[(i+1)] = w;
791  w += SHA256_K[j+1];
792  w += (t-1)[7];
793  w += S1(e);
794  w += Ch(e, (t-1)[5], (t-1)[6]);
795  e = (t-1)[3] + w;
796  (t-1)[3] = (t-1)[3+8] = e;
797  w += S0((t-1)[0]);
798  a = w + Maj(a, (t-1)[1], (t-1)[2]);
799  (t-1)[-1] = (t-1)[7] = a;
800 
801  t-=2;
802  j+=2;
803  if (j%8 == 0)
804  t += 8;
805  } while (j<64);
806 
807  state[0] += a;
808  state[1] += t[1];
809  state[2] += t[2];
810  state[3] += t[3];
811  state[4] += e;
812  state[5] += t[5];
813  state[6] += t[6];
814  state[7] += t[7];
815 }
816 #else
817 // Bigger but faster
818 void SHA256_CXX_Transform(word32 *state, const word32 *data)
819 {
820  word32 W[16], T[8];
821  /* Copy context->state[] to working vars */
822  memcpy(T, state, sizeof(T));
823  /* 64 operations, partially loop unrolled */
824  for (unsigned int j=0; j<64; j+=16)
825  {
826  R( 0); R( 1); R( 2); R( 3);
827  R( 4); R( 5); R( 6); R( 7);
828  R( 8); R( 9); R(10); R(11);
829  R(12); R(13); R(14); R(15);
830  }
831  /* Add the working vars back into context.state[] */
832  state[0] += a(0);
833  state[1] += b(0);
834  state[2] += c(0);
835  state[3] += d(0);
836  state[4] += e(0);
837  state[5] += f(0);
838  state[6] += g(0);
839  state[7] += h(0);
840 }
841 #endif // __OPTIMIZE_SIZE__
842 
843 #undef S0
844 #undef S1
845 #undef s0
846 #undef s1
847 #undef R
848 
849 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
850 static void SHA256_SSE2_Transform(word32 *state, const word32 *data)
851 {
852  // this byte reverse is a waste of time, but this function is only called by MDC
853  word32 W[16];
854  ByteReverse(W, data, SHA256::BLOCKSIZE);
855  X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
856 }
857 #endif // CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
858 
859 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
860 static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
861 {
 // Undo the caller's word conversion first, as SHA1_SSE_SHA_Transform and
 // SHA256_SSE2_Transform do, so the SHA-NI code sees the bytes in big-endian order
 word32 W[16];
 ByteReverse(W, data, SHA256::BLOCKSIZE);
862  return SHA256_SSE_SHA_HashBlocks(state, W, SHA256::BLOCKSIZE);
863 }
864 #endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
865 
866 ///////////////////////////////////
867 // start of Walton/Gulley's code //
868 ///////////////////////////////////
869 
870 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
871 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
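// Each _mm_sha256rnds2_epu32 performs two rounds using the two K+W values in
// the low 64 bits of its message operand, so every four-round group below runs
// it twice, moving the high qword of MSG down with _mm_shuffle_epi32(MSG, 0x0E)
// between the two calls.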
872 static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
873 {
874  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
875  CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
876 
877  __m128i STATE0, STATE1;
878  __m128i MSG, TMP, MASK;
879  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
880  __m128i ABEF_SAVE, CDGH_SAVE;
881 
882  // Load initial hash values
883  TMP = _mm_loadu_si128((__m128i*) &state[0]);
884  STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
885  MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
886 
887  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
888  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
889  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
890  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
891 
892  while (length)
893  {
894  // Save hash values for addition after rounds
895  ABEF_SAVE = STATE0;
896  CDGH_SAVE = STATE1;
897 
898  // Rounds 0-3
899  MSG = _mm_loadu_si128((__m128i*) data+0);
900  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
901  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
902  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
903  MSG = _mm_shuffle_epi32(MSG, 0x0E);
904  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
905 
906  // Rounds 4-7
907  TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
908  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
909  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
910  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
911  MSG = _mm_shuffle_epi32(MSG, 0x0E);
912  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
913  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
914 
915  // Rounds 8-11
916  TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
917  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
918  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
919  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
920  MSG = _mm_shuffle_epi32(MSG, 0x0E);
921  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
922  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
923 
924  // Rounds 12-15
925  TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
926  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
927  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
928  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
929  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
930  TMSG0 = _mm_add_epi32(TMSG0, TMP);
931  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
932  MSG = _mm_shuffle_epi32(MSG, 0x0E);
933  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
934  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
935 
936  // Rounds 16-19
937  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
938  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
939  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
940  TMSG1 = _mm_add_epi32(TMSG1, TMP);
941  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
942  MSG = _mm_shuffle_epi32(MSG, 0x0E);
943  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
944  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
945 
946  // Rounds 20-23
947  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
948  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
949  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
950  TMSG2 = _mm_add_epi32(TMSG2, TMP);
951  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
952  MSG = _mm_shuffle_epi32(MSG, 0x0E);
953  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
954  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
955 
956  // Rounds 24-27
957  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
958  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
959  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
960  TMSG3 = _mm_add_epi32(TMSG3, TMP);
961  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
962  MSG = _mm_shuffle_epi32(MSG, 0x0E);
963  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
964  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
965 
966  // Rounds 28-31
967  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
968  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
969  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
970  TMSG0 = _mm_add_epi32(TMSG0, TMP);
971  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
972  MSG = _mm_shuffle_epi32(MSG, 0x0E);
973  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
974  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
975 
976  // Rounds 32-35
977  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
978  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
979  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
980  TMSG1 = _mm_add_epi32(TMSG1, TMP);
981  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
982  MSG = _mm_shuffle_epi32(MSG, 0x0E);
983  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
984  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
985 
986  // Rounds 36-39
987  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
988  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
989  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
990  TMSG2 = _mm_add_epi32(TMSG2, TMP);
991  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
992  MSG = _mm_shuffle_epi32(MSG, 0x0E);
993  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
994  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
995 
996  // Rounds 40-43
997  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
998  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
999  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
1000  TMSG3 = _mm_add_epi32(TMSG3, TMP);
1001  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
1002  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1003  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1004  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
1005 
1006  // Rounds 44-47
1007  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
1008  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1009  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
1010  TMSG0 = _mm_add_epi32(TMSG0, TMP);
1011  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
1012  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1013  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1014  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
1015 
1016  // Rounds 48-51
1017  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
1018  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1019  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
1020  TMSG1 = _mm_add_epi32(TMSG1, TMP);
1021  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
1022  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1023  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1024  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
1025 
1026  // Rounds 52-55
1027  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
1028  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1029  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
1030  TMSG2 = _mm_add_epi32(TMSG2, TMP);
1031  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
1032  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1033  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1034 
1035  // Rounds 56-59
1036  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
1037  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1038  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
1039  TMSG3 = _mm_add_epi32(TMSG3, TMP);
1040  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
1041  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1042  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1043 
1044  // Rounds 60-63
1045  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
1046  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
1047  MSG = _mm_shuffle_epi32(MSG, 0x0E);
1048  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
1049 
1050  // Add the current hash values to the previously saved values
1051  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
1052  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
1053 
1054  data += 16;
1055  length -= SHA256::BLOCKSIZE;
1056  }
1057 
1058  // Write hash values back in the correct order
1059  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
1060  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
1061  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
1062  STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
1063 
1064  _mm_storeu_si128((__m128i*) &state[0], STATE0);
1065  _mm_storeu_si128((__m128i*) &state[4], STATE1);
1066 }
1067 #endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
1068 
1069 /////////////////////////////////
1070 // end of Walton/Gulley's code //
1071 /////////////////////////////////
1072 
1073 pfnSHATransform InitializeSHA256Transform()
1074 {
1075 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
1076  if (HasSHA())
1077  return &SHA256_SSE_SHA_Transform;
1078  else
1079 #endif
1080 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
1081  if (HasSSE2())
1082  return &SHA256_SSE2_Transform;
1083  else
1084 #endif
1085 
1086  return &SHA256_CXX_Transform;
1087 }
1088 
1089 void SHA256::Transform(word32 *state, const word32 *data)
1090 {
1091  static const pfnSHATransform s_pfn = InitializeSHA256Transform();
1092  s_pfn(state, data);
1093 }
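// As with the SHA-1 sample above (editorial sketch, hypothetical guard macro):
// one transform of the padded empty message must give the standard digest
// e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855.
#if defined(CRYPTOPP_SHA_EDITORIAL_SAMPLES)
static bool SHA256_EmptyMessageSample()
{
 word32 st[8], blk[16] = {0x80000000};
 SHA256::InitState(st);
 SHA256::Transform(st, blk);
 return st[0] == 0xe3b0c442; // first digest word
}
#endif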
1094 
1095 // *************************************************************
1096 
1097 void SHA384::InitState(HashWordType *state)
1098 {
1099  static const word64 s[8] = {
1100  W64LIT(0xcbbb9d5dc1059ed8), W64LIT(0x629a292a367cd507),
1101  W64LIT(0x9159015a3070dd17), W64LIT(0x152fecd8f70e5939),
1102  W64LIT(0x67332667ffc00b31), W64LIT(0x8eb44a8768581511),
1103  W64LIT(0xdb0c2e0d64f98fa7), W64LIT(0x47b5481dbefa4fa4)};
1104  memcpy(state, s, sizeof(s));
1105 }
1106 
1107 void SHA512::InitState(HashWordType *state)
1108 {
1109  static const word64 s[8] = {
1110  W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
1111  W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
1112  W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
1113  W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)};
1114  memcpy(state, s, sizeof(s));
1115 }
1116 
1117 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
1118 CRYPTOPP_ALIGN_DATA(16) static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
1119 #else
1120 static const word64 SHA512_K[80] = {
1121 #endif
1122  W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
1123  W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
1124  W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
1125  W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
1126  W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
1127  W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
1128  W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
1129  W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
1130  W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
1131  W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
1132  W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
1133  W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
1134  W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
1135  W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
1136  W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
1137  W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
1138  W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
1139  W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
1140  W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
1141  W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
1142  W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
1143  W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
1144  W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
1145  W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
1146  W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
1147  W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
1148  W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
1149  W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
1150  W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
1151  W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
1152  W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
1153  W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
1154  W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
1155  W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
1156  W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
1157  W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
1158  W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
1159  W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
1160  W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
1161  W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
1162 };
1163 
1164 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
1165 // put assembly version in separate function, otherwise MSVC 2005 SP1 doesn't generate correct code for the non-assembly version
1166 CRYPTOPP_NAKED static void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
1167 {
1168 #ifdef __GNUC__
1169  __asm__ __volatile__
1170  (
1171  INTEL_NOPREFIX
1172  AS_PUSH_IF86( bx)
1173  AS2( mov ebx, eax)
1174 #else
1175  AS1( push ebx)
1176  AS1( push esi)
1177  AS1( push edi)
1178  AS2( lea ebx, SHA512_K)
1179 #endif
1180 
1181  AS2( mov eax, esp)
1182  AS2( and esp, 0xfffffff0)
1183  AS2( sub esp, 27*16) // 17*16 for expanded data, 20*8 for state
1184  AS_PUSH_IF86( ax)
1185  AS2( xor eax, eax)
1186 
1187 #if CRYPTOPP_BOOL_X32
1188  AS2( lea edi, [esp+8+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying
1189  AS2( lea esi, [esp+8+20*8+8]) // 16-byte alignment, then add 8
1190 #else
1191  AS2( lea edi, [esp+4+8*8]) // start at middle of state buffer. will decrement pointer each round to avoid copying
1192  AS2( lea esi, [esp+4+20*8+8]) // 16-byte alignment, then add 8
1193 #endif
1194 
1195  AS2( movdqa xmm0, [ecx+0*16])
1196  AS2( movdq2q mm4, xmm0)
1197  AS2( movdqa [edi+0*16], xmm0)
1198  AS2( movdqa xmm0, [ecx+1*16])
1199  AS2( movdqa [edi+1*16], xmm0)
1200  AS2( movdqa xmm0, [ecx+2*16])
1201  AS2( movdq2q mm5, xmm0)
1202  AS2( movdqa [edi+2*16], xmm0)
1203  AS2( movdqa xmm0, [ecx+3*16])
1204  AS2( movdqa [edi+3*16], xmm0)
1205  ASJ( jmp, 0, f)
1206 
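// MMX and SSE2 have no 64-bit rotate instruction, so SSE2_S0_S1 below builds
// the three rotations of each Sigma function from paired shifts (psrlq/psllq)
// and XORs, sharing intermediate shift results between the rotation amounts.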
1207 #define SSE2_S0_S1(r, a, b, c) \
1208  AS2( movq mm6, r)\
1209  AS2( psrlq r, a)\
1210  AS2( movq mm7, r)\
1211  AS2( psllq mm6, 64-c)\
1212  AS2( pxor mm7, mm6)\
1213  AS2( psrlq r, b-a)\
1214  AS2( pxor mm7, r)\
1215  AS2( psllq mm6, c-b)\
1216  AS2( pxor mm7, mm6)\
1217  AS2( psrlq r, c-b)\
1218  AS2( pxor r, mm7)\
1219  AS2( psllq mm6, b-a)\
1220  AS2( pxor r, mm6)
1221 
1222 #define SSE2_s0(r, a, b, c) \
1223  AS2( movdqa xmm6, r)\
1224  AS2( psrlq r, a)\
1225  AS2( movdqa xmm7, r)\
1226  AS2( psllq xmm6, 64-c)\
1227  AS2( pxor xmm7, xmm6)\
1228  AS2( psrlq r, b-a)\
1229  AS2( pxor xmm7, r)\
1230  AS2( psrlq r, c-b)\
1231  AS2( pxor r, xmm7)\
1232  AS2( psllq xmm6, c-a)\
1233  AS2( pxor r, xmm6)
1234 
1235 #define SSE2_s1(r, a, b, c) \
1236  AS2( movdqa xmm6, r)\
1237  AS2( psrlq r, a)\
1238  AS2( movdqa xmm7, r)\
1239  AS2( psllq xmm6, 64-c)\
1240  AS2( pxor xmm7, xmm6)\
1241  AS2( psrlq r, b-a)\
1242  AS2( pxor xmm7, r)\
1243  AS2( psllq xmm6, c-b)\
1244  AS2( pxor xmm7, xmm6)\
1245  AS2( psrlq r, c-b)\
1246  AS2( pxor r, xmm7)
1247 
1248  ASL(SHA512_Round)
1249  // k + w is in mm0, a is in mm4, e is in mm5
1250  AS2( paddq mm0, [edi+7*8]) // h
1251  AS2( movq mm2, [edi+5*8]) // f
1252  AS2( movq mm3, [edi+6*8]) // g
1253  AS2( pxor mm2, mm3)
1254  AS2( pand mm2, mm5)
1255  SSE2_S0_S1(mm5,14,18,41)
1256  AS2( pxor mm2, mm3)
1257  AS2( paddq mm0, mm2) // h += Ch(e,f,g)
1258  AS2( paddq mm5, mm0) // h += S1(e)
1259  AS2( movq mm2, [edi+1*8]) // b
1260  AS2( movq mm1, mm2)
1261  AS2( por mm2, mm4)
1262  AS2( pand mm2, [edi+2*8]) // c
1263  AS2( pand mm1, mm4)
1264  AS2( por mm1, mm2)
1265  AS2( paddq mm1, mm5) // temp = h + Maj(a,b,c)
1266  AS2( paddq mm5, [edi+3*8]) // e = d + h
1267  AS2( movq [edi+3*8], mm5)
1268  AS2( movq [edi+11*8], mm5)
1269  SSE2_S0_S1(mm4,28,34,39) // S0(a)
1270  AS2( paddq mm4, mm1) // a = temp + S0(a)
1271  AS2( movq [edi-8], mm4)
1272  AS2( movq [edi+7*8], mm4)
1273  AS1( ret)
1274 
1275  // first 16 rounds
1276  ASL(0)
1277  AS2( movq mm0, [edx+eax*8])
1278  AS2( movq [esi+eax*8], mm0)
1279  AS2( movq [esi+eax*8+16*8], mm0)
1280  AS2( paddq mm0, [ebx+eax*8])
1281  ASC( call, SHA512_Round)
1282  AS1( inc eax)
1283  AS2( sub edi, 8)
1284  AS2( test eax, 7)
1285  ASJ( jnz, 0, b)
1286  AS2( add edi, 8*8)
1287  AS2( cmp eax, 16)
1288  ASJ( jne, 0, b)
1289 
1290  // rest of the rounds
1291  AS2( movdqu xmm0, [esi+(16-2)*8])
1292  ASL(1)
1293  // data expansion, W[i-2] already in xmm0
1294  AS2( movdqu xmm3, [esi])
1295  AS2( paddq xmm3, [esi+(16-7)*8])
1296  AS2( movdqa xmm2, [esi+(16-15)*8])
1297  SSE2_s1(xmm0, 6, 19, 61)
1298  AS2( paddq xmm0, xmm3)
1299  SSE2_s0(xmm2, 1, 7, 8)
1300  AS2( paddq xmm0, xmm2)
1301  AS2( movdq2q mm0, xmm0)
1302  AS2( movhlps xmm1, xmm0)
1303  AS2( paddq mm0, [ebx+eax*8])
1304  AS2( movlps [esi], xmm0)
1305  AS2( movlps [esi+8], xmm1)
1306  AS2( movlps [esi+8*16], xmm0)
1307  AS2( movlps [esi+8*17], xmm1)
1308  // 2 rounds
1309  ASC( call, SHA512_Round)
1310  AS2( sub edi, 8)
1311  AS2( movdq2q mm0, xmm1)
1312  AS2( paddq mm0, [ebx+eax*8+8])
1313  ASC( call, SHA512_Round)
1314  // update indices and loop
1315  AS2( add esi, 16)
1316  AS2( add eax, 2)
1317  AS2( sub edi, 8)
1318  AS2( test eax, 7)
1319  ASJ( jnz, 1, b)
1320  // do housekeeping every 8 rounds
1321  AS2( mov esi, 0xf)
1322  AS2( and esi, eax)
1323 #if CRYPTOPP_BOOL_X32
1324  AS2( lea esi, [esp+8+20*8+8+esi*8])
1325 #else
1326  AS2( lea esi, [esp+4+20*8+8+esi*8])
1327 #endif
1328  AS2( add edi, 8*8)
1329  AS2( cmp eax, 80)
1330  ASJ( jne, 1, b)
1331 
1332 #define SSE2_CombineState(i) \
1333  AS2( movdqa xmm0, [edi+i*16])\
1334  AS2( paddq xmm0, [ecx+i*16])\
1335  AS2( movdqa [ecx+i*16], xmm0)
1336 
1337  SSE2_CombineState(0)
1338  SSE2_CombineState(1)
1339  SSE2_CombineState(2)
1340  SSE2_CombineState(3)
1341 
1342  AS_POP_IF86( sp)
1343  AS1( emms)
1344 
1345 #if defined(__GNUC__)
1346  AS_POP_IF86( bx)
1347  ATT_PREFIX
1348  :
1349  : "a" (SHA512_K), "c" (state), "d" (data)
1350  : "%esi", "%edi", "memory", "cc"
1351  );
1352 #else
1353  AS1( pop edi)
1354  AS1( pop esi)
1355  AS1( pop ebx)
1356  AS1( ret)
1357 #endif
1358 }
1359 #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
1360 
1361 void SHA512::Transform(word64 *state, const word64 *data)
1362 {
1363  CRYPTOPP_ASSERT(IsAlignedOn(state, GetAlignmentOf<word64>()));
1364  CRYPTOPP_ASSERT(IsAlignedOn(data, GetAlignmentOf<word64>()));
1365 
1366 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
1367  if (HasSSE2())
1368  {
1369  SHA512_SSE2_Transform(state, data);
1370  return;
1371  }
1372 #endif
1373 
1374 #define S0(x) (rotrFixed(x,28)^rotrFixed(x,34)^rotrFixed(x,39))
1375 #define S1(x) (rotrFixed(x,14)^rotrFixed(x,18)^rotrFixed(x,41))
1376 #define s0(x) (rotrFixed(x,1)^rotrFixed(x,8)^(x>>7))
1377 #define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
1378 
1379 #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+(j?blk2(i):blk0(i));\
1380  d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
1381 
1382  word64 W[16];
1383  word64 T[8];
1384  /* Copy context->state[] to working vars */
1385  memcpy(T, state, sizeof(T));
1386  /* 80 operations, partially loop unrolled */
1387  for (unsigned int j=0; j<80; j+=16)
1388  {
1389  R( 0); R( 1); R( 2); R( 3);
1390  R( 4); R( 5); R( 6); R( 7);
1391  R( 8); R( 9); R(10); R(11);
1392  R(12); R(13); R(14); R(15);
1393  }
1394  /* Add the working vars back into context.state[] */
1395  state[0] += a(0);
1396  state[1] += b(0);
1397  state[2] += c(0);
1398  state[3] += d(0);
1399  state[4] += e(0);
1400  state[5] += f(0);
1401  state[6] += g(0);
1402  state[7] += h(0);
1403 }
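// The same editorial one-block check for SHA-512 (hypothetical guard macro):
// the padded empty message must give the standard digest beginning cf83e135....
#if defined(CRYPTOPP_SHA_EDITORIAL_SAMPLES)
static bool SHA512_EmptyMessageSample()
{
 word64 st[8], blk[16] = {W64LIT(0x8000000000000000)};
 SHA512::InitState(st);
 SHA512::Transform(st, blk);
 return st[0] == W64LIT(0xcf83e1357eefb8bd); // first digest word
}
#endif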
1404 
1405 NAMESPACE_END
1406 
1407 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
1408 #endif // #ifndef CRYPTOPP_IMPORTS