Crypto++  5.6.5
Free C++ class library of cryptographic schemes
salsa.cpp
1 // salsa.cpp - originally written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #ifndef CRYPTOPP_GENERATE_X64_MASM
9 
10 #include "salsa.h"
11 #include "argnames.h"
12 #include "misc.h"
13 #include "cpu.h"
14 
15 #if CRYPTOPP_MSC_VERSION
16 # pragma warning(disable: 4702 4740)
17 #endif
18 
19 // CRYPTOPP_DISABLE_SALSA_ASM turns off the ASM code paths below; this is needed for
19 // Clang due to "Inline assembly operands don't work with .intel_syntax"
20 // https://llvm.org/bugs/show_bug.cgi?id=24232
21 #if defined(CRYPTOPP_DISABLE_SALSA_ASM)
22 # undef CRYPTOPP_X86_ASM_AVAILABLE
23 # undef CRYPTOPP_X32_ASM_AVAILABLE
24 # undef CRYPTOPP_X64_ASM_AVAILABLE
25 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
26 # undef CRYPTOPP_SSSE3_ASM_AVAILABLE
27 #endif
28 
29 NAMESPACE_BEGIN(CryptoPP)
30 
31 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
// Debug-only helper that is never meant to do useful work at run time: declaring one
// object of each cipher forces the compiler to instantiate both class templates, so
// template errors surface when this file is built.
// (Listing lines 34-35 were missing from the body and are restored here.)
32 void Salsa20_TestInstantiations()
33 {
34  Salsa20::Encryption x1;
35  XSalsa20::Encryption x2;
36 }
37 #endif
38 
// Keys the Salsa20 state.
//   params - may carry Name::Rounds(); 20 is used when it is absent
//   key    - key bytes; the first 16 and the last 16 bytes are loaded, which for a
//            16-byte key are the same 16 bytes
//   length - key length in bytes; selects the "16-byte" or the "32-byte" constant
// Throws InvalidRounds unless the round count is 8, 12 or 20.
39 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
40 {
41  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
42 
43  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
44  throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
45 
46  // m_state is reordered for SSE2
    // get1 reads little-endian word32 values starting at the first key byte
    // (this declaration was missing, leaving get1 undeclared)
47  GetBlock<word32, LittleEndian> get1(key);
48  get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
    // get2 reads the last 16 bytes of the key
49  GetBlock<word32, LittleEndian> get2(key + length - 16);
50  get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
51 
52  // "expand 16-byte k" or "expand 32-byte k"
53  m_state[0] = 0x61707865;
54  m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
55  m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
56  m_state[3] = 0x6b206574;
57 }
58 
// Installs a new IV and rewinds the block counter.
//   keystreamBuffer - unused
//   IV              - 8 bytes, read as two little-endian word32 values
//   length          - must be 8 (checked by CRYPTOPP_ASSERT only)
59 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
60 {
61  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
62  CRYPTOPP_ASSERT(length==8);
63 
    // IV words go to state slots 14 and 11
    // (this declaration was missing, leaving get undeclared)
64  GetBlock<word32, LittleEndian> get(IV);
65  get(m_state[14])(m_state[11]);
    // slots 8 and 5 hold the 64-bit block counter; restart it at zero
66  m_state[8] = m_state[5] = 0;
67 }
68 
// Sets the block counter to iterationCount: the upper 32 bits are stored in
// m_state[5] and the lower 32 bits in m_state[8].
69 void Salsa20_Policy::SeekToIteration(lword iterationCount)
70 {
71  m_state[5] = static_cast<word32>(SafeRightShift<32>(iterationCount));
72  m_state[8] = static_cast<word32>(iterationCount);
73 }
74 
75 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
// Reports the data alignment to use: 16 when the SSE2 assembly is compiled in and
// HasSSE2() is true, otherwise the alignment of word32.
76 unsigned int Salsa20_Policy::GetAlignment() const
77 {
78 #if CRYPTOPP_SSE2_ASM_AVAILABLE
79  const bool useSSE2 = HasSSE2();
80 #else
81  const bool useSSE2 = false;
82 #endif
83  return useSSE2 ? 16 : GetAlignmentOf<word32>();
84 }
85 
// Reports the preferred number of bytes per call: four iterations' worth when the
// SSE2 assembly is compiled in and HasSSE2() is true, otherwise one iteration's worth.
86 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
87 {
88 #if CRYPTOPP_SSE2_ASM_AVAILABLE
89  const bool useSSE2 = HasSSE2();
90 #else
91  const bool useSSE2 = false;
92 #endif
93  return useSSE2 ? 4*BYTES_PER_ITERATION : BYTES_PER_ITERATION;
94 }
95 #endif
96 
97 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
// C-linkage declaration of the stand-alone keystream routine. The same name is emitted
// as a MASM PROC when this file is preprocessed with CRYPTOPP_GENERATE_X64_MASM, and
// Salsa20_Policy::OperateKeystream forwards to it when CRYPTOPP_X64_MASM_AVAILABLE is set.
98 extern "C" {
99 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
100 }
101 #endif
102 
103 #if CRYPTOPP_MSC_VERSION
104 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
105 #endif
106 
// Produces iterationCount iterations of keystream and advances the 64-bit block counter
// kept in state words 8 (low half) and 5 (high half).
//
// The body is shared by three build configurations:
//   1. CRYPTOPP_X64_MASM_AVAILABLE: forward to the external Salsa20_OperateKeystream.
//   2. CRYPTOPP_SSE2_ASM_AVAILABLE: SSE2 assembly. With CRYPTOPP_GENERATE_X64_MASM the
//      same instruction stream is emitted as a MASM procedure; otherwise it is inline
//      assembly guarded by a run-time HasSSE2() check.
//   3. Portable C++ loop, used when the assembly is not compiled in or HasSSE2() is false.
//
// How output/input/operation are combined is decided inside AS_XMM_OUTPUT4 and
// CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH, which are defined outside this file.
107 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
108 {
109 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
110 
// Configuration 1: hand everything to the separately assembled routine.
111 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
112  Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
113  return;
114 #endif
115 
// Configuration 2: SSE2 assembly (MASM procedure or inline assembly).
116 #if CRYPTOPP_SSE2_ASM_AVAILABLE
117 #ifdef CRYPTOPP_GENERATE_X64_MASM
118  ALIGN 8
119  Salsa20_OperateKeystream PROC FRAME
120  mov r10, [rsp + 5*8] ; state
121  alloc_stack(10*16 + 32*16 + 8)
122  save_xmm128 xmm6, 0200h
123  save_xmm128 xmm7, 0210h
124  save_xmm128 xmm8, 0220h
125  save_xmm128 xmm9, 0230h
126  save_xmm128 xmm10, 0240h
127  save_xmm128 xmm11, 0250h
128  save_xmm128 xmm12, 0260h
129  save_xmm128 xmm13, 0270h
130  save_xmm128 xmm14, 0280h
131  save_xmm128 xmm15, 0290h
132  .endprolog
133 
134  #define REG_output rcx
135  #define REG_input rdx
136  #define REG_iterationCount r8
137  #define REG_state r10
138  #define REG_rounds e9d
139  #define REG_roundsLeft eax
140  #define REG_temp32 r11d
141  #define REG_temp r11
142  #define SSE2_WORKSPACE rsp
143 #else
    // Inline-assembly build: this path is taken only when HasSSE2() is true at run time.
144  if (HasSSE2())
145  {
146  #if CRYPTOPP_BOOL_X64
    // x64 GCC-style asm: the %N names refer to the operand list at the end of the asm statement.
147  #define REG_output %1
148  #define REG_input %0
149  #define REG_iterationCount %2
150  #define REG_state %4 /* constant */
151  #define REG_rounds %3 /* constant */
152  #define REG_roundsLeft eax
153  #define REG_temp32 edx
154  #define REG_temp rdx
155  #define SSE2_WORKSPACE %5 /* constant */
156 
157  CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
158  #else
    // 32-bit x86: fixed registers; the workspace is carved out of the stack further down.
159  #define REG_output edi
160  #define REG_input eax
161  #define REG_iterationCount ecx
162  #define REG_state esi
163  #define REG_rounds edx
164  #define REG_roundsLeft ebx
165  #define REG_temp32 ebp
166  #define REG_temp ebp
167  #define SSE2_WORKSPACE esp + WORD_SZ
168  #endif
169 
170  #ifdef __GNUC__
171  __asm__ __volatile__
172  (
173  INTEL_NOPREFIX
174  AS_PUSH_IF86( bx)
175  #else
    // Non-GNU inline assembler: load the fixed registers from the C++ variables by hand.
176  void *s = m_state.data();
177  word32 r = m_rounds;
178 
179  AS2( mov REG_iterationCount, iterationCount)
180  AS2( mov REG_input, input)
181  AS2( mov REG_output, output)
182  AS2( mov REG_state, s)
183  AS2( mov REG_rounds, r)
184  #endif
185 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
186 
187  AS_PUSH_IF86( bp)
    // Fewer than 4 iterations requested: skip the 4-at-a-time code and go to label 5.
188  AS2( cmp REG_iterationCount, 4)
189  ASJ( jl, 5, f)
190 
    // x86 only: remember esp in ebx, align esp down to 16 bytes, reserve a 32*16-byte
    // workspace and push the remembered esp on top of it.
191 #if CRYPTOPP_BOOL_X86
192  AS2( mov ebx, esp)
193  AS2( and esp, -16)
194  AS2( sub esp, 32*16)
195  AS1( push ebx)
196 #endif
197 
    // SSE2_EXPAND_S(i, j): copy word j of xmm<i> into all four lanes of xmm4 and store it
    // in slot (i*4+j) of the upper half (offset +256) of the workspace.
198 #define SSE2_EXPAND_S(i, j) \
199  ASS( pshufd xmm4, xmm##i, j, j, j, j) \
200  AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
201 
202  AS2( movdqa xmm0, [REG_state + 0*16])
203  AS2( movdqa xmm1, [REG_state + 1*16])
204  AS2( movdqa xmm2, [REG_state + 2*16])
205  AS2( movdqa xmm3, [REG_state + 3*16])
    // (1, 1) and (2, 0), i.e. slots 5 and 8, are absent from this list: those slots are
    // written lane by lane by SSE2_EXPAND_S85 below.
206  SSE2_EXPAND_S(0, 0)
207  SSE2_EXPAND_S(0, 1)
208  SSE2_EXPAND_S(0, 2)
209  SSE2_EXPAND_S(0, 3)
210  SSE2_EXPAND_S(1, 0)
211  SSE2_EXPAND_S(1, 2)
212  SSE2_EXPAND_S(1, 3)
213  SSE2_EXPAND_S(2, 1)
214  SSE2_EXPAND_S(2, 2)
215  SSE2_EXPAND_S(2, 3)
216  SSE2_EXPAND_S(3, 0)
217  SSE2_EXPAND_S(3, 1)
218  SSE2_EXPAND_S(3, 2)
219  SSE2_EXPAND_S(3, 3)
220 
    // SSE2_EXPAND_S85(i): write the current counter (low word in REG_roundsLeft, high word
    // in REG_temp32) into lane i of slots 8 and 5, then add 1 to it with carry.
221 #define SSE2_EXPAND_S85(i) \
222  AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
223  AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
224  AS2( add REG_roundsLeft, 1) \
225  AS2( adc REG_temp32, 0)
226 
    // Label 1: top of the loop that handles 4 iterations per pass. REG_roundsLeft and
    // REG_temp32 are borrowed here to hold the counter words from the state.
227  ASL(1)
228  AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
229  AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
230  SSE2_EXPAND_S85(0)
231  SSE2_EXPAND_S85(1)
232  SSE2_EXPAND_S85(2)
233  SSE2_EXPAND_S85(3)
    // Store the counter, now advanced by 4, back into the state.
234  AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
235  AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
236 
    // SSE2_QUARTER_ROUND(a, b, d, i): xmm<b> ^= rotate-left(xmm<a> + xmm<d>, i) in every
    // 32-bit lane; xmm4 and xmm5 are scratch. Used by the one-iteration path (label 0).
237 #define SSE2_QUARTER_ROUND(a, b, d, i) \
238  AS2( movdqa xmm4, xmm##d) \
239  AS2( paddd xmm4, xmm##a) \
240  AS2( movdqa xmm5, xmm4) \
241  AS2( pslld xmm4, i) \
242  AS2( psrld xmm5, 32-i) \
243  AS2( pxor xmm##b, xmm4) \
244  AS2( pxor xmm##b, xmm5)
245 
    // L01..L32 are the 32 instructions of one full quarter round over workspace slots
    // a, b, c, d using registers xmm<A>..xmm<D>. The inputs are read from offset i*256
    // (i=1: the expanded copy built above, i=0: the working copy); results are always
    // stored to the working copy at offset 0. Each instruction is its own macro so the
    // X8/X16 macros below can interleave independent quarter rounds.
246 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
247 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
248 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
249 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
250 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
251 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
252 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
253 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
254 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
255 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
256 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
257 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
258 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
259 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
260 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
261 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
262 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
263 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
264 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
265 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
266 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
267 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
268 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
269 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
270 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
271 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
272 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
273 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
274 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
275 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
276 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
277 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
278 
    // Two quarter rounds interleaved instruction by instruction (xmm0-3 and xmm4-7).
279 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
280  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
281  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
282  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
283  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
284  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
285  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
286  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
287  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
288  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
289  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
290  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
291  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
292  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
293  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
294  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
295  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
296  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
297  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
298  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
299  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
300  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
301  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
302  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
303  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
304  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
305  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
306  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
307  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
308  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
309  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
310  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
311  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
312 
    // Four quarter rounds interleaved (xmm0-15); only invoked in CRYPTOPP_BOOL_X64 builds.
313 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
314  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
315  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
316  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
317  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
318  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
319  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
320  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
321  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
322  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
323  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
324  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
325  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
326  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
327  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
328  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
329  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
330  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
331  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
332  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
333  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
334  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
335  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
336  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
337  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
338  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
339  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
340  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
341  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
342  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
343  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
344  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
345  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
346 
    // First half of the first double round: i=1, so the inputs come from the expanded
    // copy at +256 and the results start the working copy at offset 0.
347 #if CRYPTOPP_BOOL_X64
348  SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
349 #else
350  SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
351  SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
352 #endif
    // Load the round counter and enter the round loop in the middle (label 2), since the
    // first half of the first double round was just done above.
353  AS2( mov REG_roundsLeft, REG_rounds)
354  ASJ( jmp, 2, f)
355 
    // Local subroutine, reached through ASC( call, ...) in SSE2_OUTPUT_4: transposes the
    // 4x4 dwords held in xmm4..xmm7 into xmm4, xmm2, xmm0, xmm6 and passes those registers
    // to AS_XMM_OUTPUT4.
356  ASL(SSE2_Salsa_Output)
357  AS2( movdqa xmm0, xmm4)
358  AS2( punpckldq xmm4, xmm5)
359  AS2( movdqa xmm1, xmm6)
360  AS2( punpckldq xmm6, xmm7)
361  AS2( movdqa xmm2, xmm4)
362  AS2( punpcklqdq xmm4, xmm6) // e
363  AS2( punpckhqdq xmm2, xmm6) // f
364  AS2( punpckhdq xmm0, xmm5)
365  AS2( punpckhdq xmm1, xmm7)
366  AS2( movdqa xmm6, xmm0)
367  AS2( punpcklqdq xmm0, xmm1) // g
368  AS2( punpckhqdq xmm6, xmm1) // h
369  AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
370  AS1( ret)
371 
    // Label 6: round loop. Each pass runs two rounds on the working copy (i=0) and
    // subtracts 2 from REG_roundsLeft; label 2 is the mid-loop entry used on the first pass.
372  ASL(6)
373 #if CRYPTOPP_BOOL_X64
374  SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
375  ASL(2)
376  SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
377 #else
378  SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
379  SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
380  ASL(2)
381  SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
382  SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
383 #endif
384  AS2( sub REG_roundsLeft, 2)
385  ASJ( jnz, 6, b)
386 
    // SSE2_OUTPUT_4(a, b, c, d): for four slots, add the working copy (offset 0) to the
    // expanded input copy (+256), then call the output subroutine above.
387 #define SSE2_OUTPUT_4(a, b, c, d) \
388  AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
389  AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
390  AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
391  AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
392  AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
393  AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
394  AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
395  AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
396  ASC( call, SSE2_Salsa_Output)
397 
398  SSE2_OUTPUT_4(0, 13, 10, 7)
399  SSE2_OUTPUT_4(4, 1, 14, 11)
400  SSE2_OUTPUT_4(8, 5, 2, 15)
401  SSE2_OUTPUT_4(12, 9, 6, 3)
    // Advance input (only when it is non-null) and output by 12*16 bytes, take 4 off the
    // iteration count and loop back to label 1 while at least 4 iterations remain.
402  AS2( test REG_input, REG_input)
403  ASJ( jz, 9, f)
404  AS2( add REG_input, 12*16)
405  ASL(9)
406  AS2( add REG_output, 12*16)
407  AS2( sub REG_iterationCount, 4)
408  AS2( cmp REG_iterationCount, 4)
409  ASJ( jge, 1, b)
    // NOTE(review): presumably pops the esp value pushed after the x86 stack alignment
    // above; AS_POP_IF86 is defined elsewhere - confirm.
410  AS_POP_IF86( sp)
411 
    // Label 5: one iteration per pass; leaves through label 4 once the count drops below 0.
412  ASL(5)
413  AS2( sub REG_iterationCount, 1)
414  ASJ( jl, 4, f)
415  AS2( movdqa xmm0, [REG_state + 0*16])
416  AS2( movdqa xmm1, [REG_state + 1*16])
417  AS2( movdqa xmm2, [REG_state + 2*16])
418  AS2( movdqa xmm3, [REG_state + 3*16])
419  AS2( mov REG_roundsLeft, REG_rounds)
420 
    // Label 0: round loop of the one-iteration path, two rounds per pass. The pshufd
    // groups permute the lanes of xmm1..xmm3 between the two halves and back afterwards.
421  ASL(0)
422  SSE2_QUARTER_ROUND(0, 1, 3, 7)
423  SSE2_QUARTER_ROUND(1, 2, 0, 9)
424  SSE2_QUARTER_ROUND(2, 3, 1, 13)
425  SSE2_QUARTER_ROUND(3, 0, 2, 18)
426  ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
427  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
428  ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
429  SSE2_QUARTER_ROUND(0, 3, 1, 7)
430  SSE2_QUARTER_ROUND(3, 2, 0, 9)
431  SSE2_QUARTER_ROUND(2, 1, 3, 13)
432  SSE2_QUARTER_ROUND(1, 0, 2, 18)
433  ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
434  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
435  ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
436  AS2( sub REG_roundsLeft, 2)
437  ASJ( jnz, 0, b)
438 
    // Add the original state words to the result of the rounds.
439  AS2( paddd xmm0, [REG_state + 0*16])
440  AS2( paddd xmm1, [REG_state + 1*16])
441  AS2( paddd xmm2, [REG_state + 2*16])
442  AS2( paddd xmm3, [REG_state + 3*16])
443 
    // Increment the 64-bit block counter in state words 8 (low) and 5 (high).
444  AS2( add dword ptr [REG_state + 8*4], 1)
445  AS2( adc dword ptr [REG_state + 5*4], 0)
446 
    // Regroup the words with masks and shufpd so that xmm4, xmm0, xmm1 and xmm2 end up
    // holding the word indices named in the trailing comments.
447  AS2( pcmpeqb xmm6, xmm6) // all ones
448  AS2( psrlq xmm6, 32) // lo32 mask
449  ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
450  AS2( movdqa xmm4, xmm0)
451  AS2( movdqa xmm5, xmm3)
452  AS2( pand xmm0, xmm7)
453  AS2( pand xmm4, xmm6)
454  AS2( pand xmm3, xmm6)
455  AS2( pand xmm5, xmm7)
456  AS2( por xmm4, xmm5) // 0,13,2,15
457  AS2( movdqa xmm5, xmm1)
458  AS2( pand xmm1, xmm7)
459  AS2( pand xmm5, xmm6)
460  AS2( por xmm0, xmm5) // 4,1,6,3
461  AS2( pand xmm6, xmm2)
462  AS2( pand xmm2, xmm7)
463  AS2( por xmm1, xmm6) // 8,5,10,7
464  AS2( por xmm2, xmm3) // 12,9,14,11
465 
466  AS2( movdqa xmm5, xmm4)
467  AS2( movdqa xmm6, xmm0)
468  AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
469  AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
470  AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
471  AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
472 
473  // output keystream
474  AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
475  ASJ( jmp, 5, b)
476  ASL(4)
477 
478  AS_POP_IF86( bp)
479 #ifdef __GNUC__
480  AS_POP_IF86( bx)
481  ATT_PREFIX
    // x64 operand order, matching the REG_* macros above:
    // %0 input, %1 output, %2 iterationCount, %3 m_rounds, %4 state, %5 workspace.
482  #if CRYPTOPP_BOOL_X64
483  : "+r" (input), "+r" (output), "+r" (iterationCount)
484  : "r" (m_rounds), "r" (m_state.begin()), "r" (workspace)
485  : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
486  #else
487  : "+a" (input), "+D" (output), "+c" (iterationCount)
488  : "d" (m_rounds), "S" (m_state.begin())
489  : "memory", "cc"
490  #endif
491  );
492 #endif
493 #ifdef CRYPTOPP_GENERATE_X64_MASM
494  movdqa xmm6, [rsp + 0200h]
495  movdqa xmm7, [rsp + 0210h]
496  movdqa xmm8, [rsp + 0220h]
497  movdqa xmm9, [rsp + 0230h]
498  movdqa xmm10, [rsp + 0240h]
499  movdqa xmm11, [rsp + 0250h]
500  movdqa xmm12, [rsp + 0260h]
501  movdqa xmm13, [rsp + 0270h]
502  movdqa xmm14, [rsp + 0280h]
503  movdqa xmm15, [rsp + 0290h]
504  add rsp, 10*16 + 32*16 + 8
505  ret
506 Salsa20_OperateKeystream ENDP
507 #else
508  }
509  else
510 #endif
511 #endif
// Configuration 3: portable C++ implementation, one iteration per loop pass.
512 #ifndef CRYPTOPP_GENERATE_X64_MASM
513  {
514  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
515 
516  while (iterationCount--)
517  {
    // Work on a copy so m_state is still available for the final addition.
518  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
519  x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
520  x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
521  x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
522 
    // Two rounds per pass, so m_rounds/2 passes.
523  for (int i=m_rounds; i>0; i-=2)
524  {
    // QUARTER_ROUND(a, b, c, d): four add-rotate-xor steps with rotations 7, 9, 13, 18.
    // The macro stays defined after this function and is used again by
    // XSalsa20_Policy::CipherResynchronize.
525  #define QUARTER_ROUND(a, b, c, d) \
526  b = b ^ rotlConstant<7>(a + d); \
527  c = c ^ rotlConstant<9>(b + a); \
528  d = d ^ rotlConstant<13>(c + b); \
529  a = a ^ rotlConstant<18>(d + c);
530 
531  QUARTER_ROUND(x0, x4, x8, x12)
532  QUARTER_ROUND(x1, x5, x9, x13)
533  QUARTER_ROUND(x2, x6, x10, x14)
534  QUARTER_ROUND(x3, x7, x11, x15)
535 
536  QUARTER_ROUND(x0, x13, x10, x7)
537  QUARTER_ROUND(x1, x14, x11, x4)
538  QUARTER_ROUND(x2, x15, x8, x5)
539  QUARTER_ROUND(x3, x12, x9, x6)
540  }
541 
    // SALSA_OUTPUT(x): emit the 16 words (working value + state word) in the order
    // 0,13,10,7, 4,1,14,11, 8,5,2,15, 12,9,6,3, which undoes the reordered state layout.
542  #define SALSA_OUTPUT(x) {\
543  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
544  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
545  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
546  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
547  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
548  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
549  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
550  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
551  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
552  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
553  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
554  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
555  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
556  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
557  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
558  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
559 
560 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
561  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
562 #endif
563 
    // 64-bit counter increment: carry into m_state[5] when m_state[8] wraps to 0.
564  if (++m_state[8] == 0)
565  ++m_state[5];
566  }
567  }
568 } // see comment above if an internal compiler error occurs here
569 
// Stores the user key for XSalsa20 and sets the constant state words.
//   params - may carry Name::Rounds(); 20 is used when it is absent
//   key    - key bytes, copied into m_key via GetUserKey in little-endian order
//   length - key length in bytes; for a 16-byte key the first 16 bytes of m_key are
//            duplicated into the following 16 bytes
// Throws InvalidRounds unless the round count is 8, 12 or 20.
570 void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
571 {
572  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
573 
574  if (m_rounds != 8 && m_rounds != 12 && m_rounds != 20)
575  throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
576 
577  GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
578  if (16 == length)
579  memcpy(m_key.begin()+4, m_key.begin(), 16);
580 
581  // "expand 32-byte k"
582  m_state[3] = 0x6b206574;
583  m_state[2] = 0x79622d32;
584  m_state[1] = 0x3320646e;
585  m_state[0] = 0x61707865;
586 }
587 
// Installs a new 24-byte IV: mixes the stored key with the first 16 IV bytes to derive
// the key words of the working state, places the last 8 IV bytes in state slots 14 and
// 11, and rewinds the block counter.
// NOTE(review): the derivation looks like the HSalsa20 step of XSalsa20 - confirm
// against the specification.
//   keystreamBuffer - unused
//   IV              - 24 bytes, read as six little-endian word32 values
//   length          - must be 24 (checked by CRYPTOPP_ASSERT only)
// Relies on the QUARTER_ROUND macro defined inside Salsa20_Policy::OperateKeystream.
588 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
589 {
590  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
591  CRYPTOPP_ASSERT(length==24);
592 
593  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
594 
    // First four IV words feed the derivation, the last two go straight into the state
    // (this declaration was missing, leaving get undeclared)
595  GetBlock<word32, LittleEndian> get(IV);
596  get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
597 
    // Key words and constants, using the same slot assignment as Salsa20_Policy::CipherSetKey
598  x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
599  x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
600  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
601 
    // Two rounds per pass, so m_rounds/2 passes
602  for (int i=m_rounds; i>0; i-=2)
603  {
604  QUARTER_ROUND(x0, x4, x8, x12)
605  QUARTER_ROUND(x1, x5, x9, x13)
606  QUARTER_ROUND(x2, x6, x10, x14)
607  QUARTER_ROUND(x3, x7, x11, x15)
608 
609  QUARTER_ROUND(x0, x13, x10, x7)
610  QUARTER_ROUND(x1, x14, x11, x4)
611  QUARTER_ROUND(x2, x15, x8, x5)
612  QUARTER_ROUND(x3, x12, x9, x6)
613  }
614 
    // The eight derived words become the key slots of the state; unlike OperateKeystream,
    // the input words are not added back
615  m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
616  m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
    // Restart the 64-bit block counter
617  m_state[8] = m_state[5] = 0;
618 }
619 
620 NAMESPACE_END
621 
622 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
int GetIntValueWithDefault(const char *name, int defaultValue) const
Get a named value with type int, with default.
Definition: cryptlib.h:393
Standard names for retrieving values by name when working with NameValuePairs.
Utility functions for the Crypto++ library.
const char * Rounds()
int
Definition: argnames.h:24
Library configuration file.
virtual unsigned int GetOptimalBlockSize() const
Provides number of ideal bytes to process.
Definition: strciphr.h:122
unsigned int GetAlignment() const
Provides data alignment requirements.
Definition: strciphr.h:191
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y)
Helper macro to implement OperateKeystream.
Definition: strciphr.h:230
byte order is little-endian
Definition: cryptlib.h:143
void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
Key the cipher.
Definition: salsa.cpp:570
Exception thrown when an invalid number of rounds is encountered.
Definition: simple.h:59
A::pointer data()
Provides a pointer to the first element in the memory block.
Definition: secblock.h:553
void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
Resynchronize the cipher.
Definition: salsa.cpp:588
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:60
Functions for CPU features and intrinsics.
Classes for Salsa and Salsa20 stream ciphers.
iterator begin()
Provides an iterator pointing to the first element in the memory block.
Definition: secblock.h:536
const char * IV()
ConstByteArrayParameter, also accepts const byte * for backwards compatibility.
Definition: argnames.h:21
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:114
Access a block of memory.
Definition: misc.h:2320
KeystreamOperation
Keystream operation flags.
Definition: strciphr.h:88
Crypto++ library namespace.
SymmetricCipher implementation.
Definition: strciphr.h:571
Interface for retrieving values given their names.
Definition: cryptlib.h:291