Crypto++  5.6.3
Free C++ class library of cryptographic schemes
salsa.cpp
1 // salsa.cpp - written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #ifndef CRYPTOPP_GENERATE_X64_MASM
9 
10 #include "salsa.h"
11 #include "argnames.h"
12 #include "misc.h"
13 #include "cpu.h"
14 
15 #if CRYPTOPP_MSC_VERSION
16 # pragma warning(disable: 4702 4740)
17 #endif
18 
19 // TODO: work around a GCC 4.8+ issue with the SSE2 ASM until the exact details are known
20 // and a fix is released. Reproduce it with "valgrind ./cryptest.exe tv salsa".
21 // The ASM is also disabled for Clang due to "Inline assembly operands don't work with .intel_syntax",
22 // see https://llvm.org/bugs/show_bug.cgi?id=24232
23 #if defined(CRYPTOPP_DISABLE_SALSA_ASM)
24 # undef CRYPTOPP_X86_ASM_AVAILABLE
25 # undef CRYPTOPP_X32_ASM_AVAILABLE
26 # undef CRYPTOPP_X64_ASM_AVAILABLE
27 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
28 # undef CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
29 # define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
30 # define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
31 #endif
32 
33 NAMESPACE_BEGIN(CryptoPP)
34 
35 #if !defined(NDEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
36 void Salsa20_TestInstantiations()
37 {
39 }
40 #endif
41 
42 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
43 {
44  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
45 
46  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
47  throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
48 
49  // m_state is reordered for SSE2
51  get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
52  GetBlock<word32, LittleEndian> get2(key + length - 16);
53  get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
54 
55  // "expand 16-byte k" or "expand 32-byte k"
56  m_state[0] = 0x61707865;
57  m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
58  m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
59  m_state[3] = 0x6b206574;
60 }
61 
62 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
63 {
64  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
65  assert(length==8);
66 
68  get(m_state[14])(m_state[11]);
69  m_state[8] = m_state[5] = 0;
70 }
71 
72 void Salsa20_Policy::SeekToIteration(lword iterationCount)
73 {
74  m_state[8] = (word32)iterationCount;
75  m_state[5] = (word32)SafeRightShift<32>(iterationCount);
76 }
77 
78 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SALSA_ASM)
79 unsigned int Salsa20_Policy::GetAlignment() const
80 {
81 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
82  if (HasSSE2())
83  return 16;
84  else
85 #endif
86  return GetAlignmentOf<word32>();
87 }
88 
89 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
90 {
91 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
92  if (HasSSE2())
93  return 4*BYTES_PER_ITERATION;
94  else
95 #endif
96  return BYTES_PER_ITERATION;
97 }
98 #endif
99 
100 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
101 extern "C" {
102 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
103 }
104 #endif
105 
106 #if CRYPTOPP_MSC_VERSION
107 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
108 #endif
109 
110 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
111 {
112 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
113 
114 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
115  Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
116  return;
117 #endif
118 
119 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
120 #ifdef CRYPTOPP_GENERATE_X64_MASM
121  ALIGN 8
122  Salsa20_OperateKeystream PROC FRAME
123  mov r10, [rsp + 5*8] ; state
124  alloc_stack(10*16 + 32*16 + 8)
125  save_xmm128 xmm6, 0200h
126  save_xmm128 xmm7, 0210h
127  save_xmm128 xmm8, 0220h
128  save_xmm128 xmm9, 0230h
129  save_xmm128 xmm10, 0240h
130  save_xmm128 xmm11, 0250h
131  save_xmm128 xmm12, 0260h
132  save_xmm128 xmm13, 0270h
133  save_xmm128 xmm14, 0280h
134  save_xmm128 xmm15, 0290h
135  .endprolog
136 
137  #define REG_output rcx
138  #define REG_input rdx
139  #define REG_iterationCount r8
140  #define REG_state r10
141  #define REG_rounds e9d
142  #define REG_roundsLeft eax
143  #define REG_temp32 r11d
144  #define REG_temp r11
145  #define SSE2_WORKSPACE rsp
146 #else
147  if (HasSSE2())
148  {
149  #if CRYPTOPP_BOOL_X64
150  #define REG_output %1
151  #define REG_input %0
152  #define REG_iterationCount %2
153  #define REG_state %4 /* constant */
154  #define REG_rounds %3 /* constant */
155  #define REG_roundsLeft eax
156  #define REG_temp32 edx
157  #define REG_temp rdx
158  #define SSE2_WORKSPACE %5 /* constant */
159 
160  CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
161  #else
162  #define REG_output edi
163  #define REG_input eax
164  #define REG_iterationCount ecx
165  #define REG_state esi
166  #define REG_rounds edx
167  #define REG_roundsLeft ebx
168  #define REG_temp32 ebp
169  #define REG_temp ebp
170  #define SSE2_WORKSPACE esp + WORD_SZ
171  #endif
172 
173  #ifdef __GNUC__
174  __asm__ __volatile__
175  (
176  INTEL_NOPREFIX
177  AS_PUSH_IF86( bx)
178  #else
179  void *s = m_state.data();
180  word32 r = m_rounds;
181 
182  AS2( mov REG_iterationCount, iterationCount)
183  AS2( mov REG_input, input)
184  AS2( mov REG_output, output)
185  AS2( mov REG_state, s)
186  AS2( mov REG_rounds, r)
187  #endif
188 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
189 
190  AS_PUSH_IF86( bp)
191  AS2( cmp REG_iterationCount, 4)
192  ASJ( jl, 5, f)
193 
194 #if CRYPTOPP_BOOL_X86
195  AS2( mov ebx, esp)
196  AS2( and esp, -16)
197  AS2( sub esp, 32*16)
198  AS1( push ebx)
199 #endif
200 
201 #define SSE2_EXPAND_S(i, j) \
202  ASS( pshufd xmm4, xmm##i, j, j, j, j) \
203  AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
204 
205  AS2( movdqa xmm0, [REG_state + 0*16])
206  AS2( movdqa xmm1, [REG_state + 1*16])
207  AS2( movdqa xmm2, [REG_state + 2*16])
208  AS2( movdqa xmm3, [REG_state + 3*16])
209  SSE2_EXPAND_S(0, 0)
210  SSE2_EXPAND_S(0, 1)
211  SSE2_EXPAND_S(0, 2)
212  SSE2_EXPAND_S(0, 3)
213  SSE2_EXPAND_S(1, 0)
214  SSE2_EXPAND_S(1, 2)
215  SSE2_EXPAND_S(1, 3)
216  SSE2_EXPAND_S(2, 1)
217  SSE2_EXPAND_S(2, 2)
218  SSE2_EXPAND_S(2, 3)
219  SSE2_EXPAND_S(3, 0)
220  SSE2_EXPAND_S(3, 1)
221  SSE2_EXPAND_S(3, 2)
222  SSE2_EXPAND_S(3, 3)
223 
224 #define SSE2_EXPAND_S85(i) \
225  AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
226  AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
227  AS2( add REG_roundsLeft, 1) \
228  AS2( adc REG_temp32, 0)
229 
230  ASL(1)
231  AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
232  AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
233  SSE2_EXPAND_S85(0)
234  SSE2_EXPAND_S85(1)
235  SSE2_EXPAND_S85(2)
236  SSE2_EXPAND_S85(3)
237  AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
238  AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
239 
240 #define SSE2_QUARTER_ROUND(a, b, d, i) \
241  AS2( movdqa xmm4, xmm##d) \
242  AS2( paddd xmm4, xmm##a) \
243  AS2( movdqa xmm5, xmm4) \
244  AS2( pslld xmm4, i) \
245  AS2( psrld xmm5, 32-i) \
246  AS2( pxor xmm##b, xmm4) \
247  AS2( pxor xmm##b, xmm5)
248 
249 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
250 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
251 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
252 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
253 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
254 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
255 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
256 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
257 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
258 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
259 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
260 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
261 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
262 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
263 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
264 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
265 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
266 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
267 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
268 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
269 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
270 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
271 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
272 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
273 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
274 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
275 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
276 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
277 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
278 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
279 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
280 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
281 
282 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
283  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
284  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
285  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
286  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
287  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
288  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
289  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
290  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
291  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
292  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
293  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
294  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
295  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
296  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
297  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
298  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
299  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
300  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
301  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
302  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
303  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
304  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
305  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
306  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
307  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
308  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
309  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
310  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
311  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
312  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
313  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
314  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
315 
316 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
317  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
318  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
319  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
320  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
321  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
322  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
323  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
324  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
325  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
326  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
327  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
328  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
329  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
330  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
331  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
332  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
333  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
334  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
335  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
336  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
337  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
338  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
339  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
340  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
341  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
342  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
343  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
344  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
345  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
346  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
347  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
348  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
349 
350 #if CRYPTOPP_BOOL_X64
351  SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
352 #else
353  SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
354  SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
355 #endif
356  AS2( mov REG_roundsLeft, REG_rounds)
357  ASJ( jmp, 2, f)
358 
359  ASL(SSE2_Salsa_Output)
360  AS2( movdqa xmm0, xmm4)
361  AS2( punpckldq xmm4, xmm5)
362  AS2( movdqa xmm1, xmm6)
363  AS2( punpckldq xmm6, xmm7)
364  AS2( movdqa xmm2, xmm4)
365  AS2( punpcklqdq xmm4, xmm6) // e
366  AS2( punpckhqdq xmm2, xmm6) // f
367  AS2( punpckhdq xmm0, xmm5)
368  AS2( punpckhdq xmm1, xmm7)
369  AS2( movdqa xmm6, xmm0)
370  AS2( punpcklqdq xmm0, xmm1) // g
371  AS2( punpckhqdq xmm6, xmm1) // h
372  AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
373  AS1( ret)
374 
375  ASL(6)
376 #if CRYPTOPP_BOOL_X64
377  SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
378  ASL(2)
379  SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
380 #else
381  SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
382  SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
383  ASL(2)
384  SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
385  SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
386 #endif
387  AS2( sub REG_roundsLeft, 2)
388  ASJ( jnz, 6, b)
389 
390 #define SSE2_OUTPUT_4(a, b, c, d) \
391  AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
392  AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
393  AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
394  AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
395  AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
396  AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
397  AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
398  AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
399  ASC( call, SSE2_Salsa_Output)
400 
401  SSE2_OUTPUT_4(0, 13, 10, 7)
402  SSE2_OUTPUT_4(4, 1, 14, 11)
403  SSE2_OUTPUT_4(8, 5, 2, 15)
404  SSE2_OUTPUT_4(12, 9, 6, 3)
405  AS2( test REG_input, REG_input)
406  ASJ( jz, 9, f)
407  AS2( add REG_input, 12*16)
408  ASL(9)
409  AS2( add REG_output, 12*16)
410  AS2( sub REG_iterationCount, 4)
411  AS2( cmp REG_iterationCount, 4)
412  ASJ( jge, 1, b)
413  AS_POP_IF86( sp)
414 
415  ASL(5)
416  AS2( sub REG_iterationCount, 1)
417  ASJ( jl, 4, f)
418  AS2( movdqa xmm0, [REG_state + 0*16])
419  AS2( movdqa xmm1, [REG_state + 1*16])
420  AS2( movdqa xmm2, [REG_state + 2*16])
421  AS2( movdqa xmm3, [REG_state + 3*16])
422  AS2( mov REG_roundsLeft, REG_rounds)
423 
424  ASL(0)
425  SSE2_QUARTER_ROUND(0, 1, 3, 7)
426  SSE2_QUARTER_ROUND(1, 2, 0, 9)
427  SSE2_QUARTER_ROUND(2, 3, 1, 13)
428  SSE2_QUARTER_ROUND(3, 0, 2, 18)
429  ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
430  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
431  ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
432  SSE2_QUARTER_ROUND(0, 3, 1, 7)
433  SSE2_QUARTER_ROUND(3, 2, 0, 9)
434  SSE2_QUARTER_ROUND(2, 1, 3, 13)
435  SSE2_QUARTER_ROUND(1, 0, 2, 18)
436  ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
437  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
438  ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
439  AS2( sub REG_roundsLeft, 2)
440  ASJ( jnz, 0, b)
441 
442  AS2( paddd xmm0, [REG_state + 0*16])
443  AS2( paddd xmm1, [REG_state + 1*16])
444  AS2( paddd xmm2, [REG_state + 2*16])
445  AS2( paddd xmm3, [REG_state + 3*16])
446 
447  AS2( add dword ptr [REG_state + 8*4], 1)
448  AS2( adc dword ptr [REG_state + 5*4], 0)
449 
450  AS2( pcmpeqb xmm6, xmm6) // all ones
451  AS2( psrlq xmm6, 32) // lo32 mask
452  ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
453  AS2( movdqa xmm4, xmm0)
454  AS2( movdqa xmm5, xmm3)
455  AS2( pand xmm0, xmm7)
456  AS2( pand xmm4, xmm6)
457  AS2( pand xmm3, xmm6)
458  AS2( pand xmm5, xmm7)
459  AS2( por xmm4, xmm5) // 0,13,2,15
460  AS2( movdqa xmm5, xmm1)
461  AS2( pand xmm1, xmm7)
462  AS2( pand xmm5, xmm6)
463  AS2( por xmm0, xmm5) // 4,1,6,3
464  AS2( pand xmm6, xmm2)
465  AS2( pand xmm2, xmm7)
466  AS2( por xmm1, xmm6) // 8,5,10,7
467  AS2( por xmm2, xmm3) // 12,9,14,11
468 
469  AS2( movdqa xmm5, xmm4)
470  AS2( movdqa xmm6, xmm0)
471  AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
472  AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
473  AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
474  AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
475 
476  // output keystream
477  AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
478  ASJ( jmp, 5, b)
479  ASL(4)
480 
481  AS_POP_IF86( bp)
482 #ifdef __GNUC__
483  AS_POP_IF86( bx)
484  ATT_PREFIX
485  #if CRYPTOPP_BOOL_X64
486  : "+r" (input), "+r" (output), "+r" (iterationCount)
487  : "r" (m_rounds), "r" (m_state.m_ptr), "r" (workspace)
488  : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
489  #else
490  : "+a" (input), "+D" (output), "+c" (iterationCount)
491  : "d" (m_rounds), "S" (m_state.m_ptr)
492  : "memory", "cc"
493  #endif
494  );
495 #endif
496 #ifdef CRYPTOPP_GENERATE_X64_MASM
497  movdqa xmm6, [rsp + 0200h]
498  movdqa xmm7, [rsp + 0210h]
499  movdqa xmm8, [rsp + 0220h]
500  movdqa xmm9, [rsp + 0230h]
501  movdqa xmm10, [rsp + 0240h]
502  movdqa xmm11, [rsp + 0250h]
503  movdqa xmm12, [rsp + 0260h]
504  movdqa xmm13, [rsp + 0270h]
505  movdqa xmm14, [rsp + 0280h]
506  movdqa xmm15, [rsp + 0290h]
507  add rsp, 10*16 + 32*16 + 8
508  ret
509 Salsa20_OperateKeystream ENDP
510 #else
511  }
512  else
513 #endif
514 #endif
515 #ifndef CRYPTOPP_GENERATE_X64_MASM
516  {
517  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
518 
519  while (iterationCount--)
520  {
521  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
522  x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
523  x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
524  x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
525 
526  for (int i=m_rounds; i>0; i-=2)
527  {
528  #define QUARTER_ROUND(a, b, c, d) \
529  b = b ^ rotlFixed(a + d, 7); \
530  c = c ^ rotlFixed(b + a, 9); \
531  d = d ^ rotlFixed(c + b, 13); \
532  a = a ^ rotlFixed(d + c, 18);
533 
534  QUARTER_ROUND(x0, x4, x8, x12)
535  QUARTER_ROUND(x1, x5, x9, x13)
536  QUARTER_ROUND(x2, x6, x10, x14)
537  QUARTER_ROUND(x3, x7, x11, x15)
538 
539  QUARTER_ROUND(x0, x13, x10, x7)
540  QUARTER_ROUND(x1, x14, x11, x4)
541  QUARTER_ROUND(x2, x15, x8, x5)
542  QUARTER_ROUND(x3, x12, x9, x6)
543  }
544 
545  #define SALSA_OUTPUT(x) {\
546  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
547  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
548  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
549  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
550  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
551  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
552  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
553  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
554  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
555  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
556  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
557  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
558  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
559  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
560  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
561  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
562 
563 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
564  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
565 #endif
566 
567  if (++m_state[8] == 0)
568  ++m_state[5];
569  }
570  }
571 } // see comment above if an internal compiler error occurs here
572 
573 void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
574 {
575  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
576 
577  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
578  throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
579 
580  GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
581  if (length == 16)
582  memcpy(m_key.begin()+4, m_key.begin(), 16);
583 
584  // "expand 32-byte k"
585  m_state[0] = 0x61707865;
586  m_state[1] = 0x3320646e;
587  m_state[2] = 0x79622d32;
588  m_state[3] = 0x6b206574;
589 }
590 
591 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
592 {
593  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
594  assert(length==24);
595 
596  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
597 
599  get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
600 
601  x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
602  x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
603  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
604 
605  for (int i=m_rounds; i>0; i-=2)
606  {
607  QUARTER_ROUND(x0, x4, x8, x12)
608  QUARTER_ROUND(x1, x5, x9, x13)
609  QUARTER_ROUND(x2, x6, x10, x14)
610  QUARTER_ROUND(x3, x7, x11, x15)
611 
612  QUARTER_ROUND(x0, x13, x10, x7)
613  QUARTER_ROUND(x1, x14, x11, x4)
614  QUARTER_ROUND(x2, x15, x8, x5)
615  QUARTER_ROUND(x3, x12, x9, x6)
616  }
617 
618  m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
619  m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
620  m_state[8] = m_state[5] = 0;
621 }
622 
623 NAMESPACE_END
624 
625 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
Standard names for retrieving values by name when working with NameValuePairs.
virtual unsigned int GetOptimalBlockSize() const
Provides number of ideal bytes to process.
Definition: strciphr.h:127
Utility functions for the Crypto++ library.
const char * Rounds()
int
Definition: argnames.h:23
unsigned int GetAlignment() const
Provides data alignment requirements.
Definition: strciphr.h:198
Library configuration file.
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y)
Helper macro to implement OperateKeystream.
Definition: strciphr.h:237
byte order is little-endian
Definition: cryptlib.h:125
void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
Key the cipher.
Definition: salsa.cpp:573
int GetIntValueWithDefault(const char *name, int defaultValue) const
Get a named value with type int, with default.
Definition: cryptlib.h:380
Exception thrown when an invalid number of rounds is encountered.
Definition: simple.h:55
A::pointer data()
Provides a pointer to the first element in the memory block.
Definition: secblock.h:513
void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
Resynchronize the cipher.
Definition: salsa.cpp:591
Safely right shift values when undefined behavior could occur.
Functions for CPU features and intrinsics.
Classes for Salsa and Salsa20 stream ciphers.
const char * IV()
ConstByteArrayParameter, also accepts const byte * for backwards compatibility.
Definition: argnames.h:21
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:205
Access a block of memory.
Definition: misc.h:2129
KeystreamOperation
Keystream operation flags.
Definition: strciphr.h:92
Crypto++ library namespace.
SymmetricCipher implementation.
Definition: strciphr.h:582
Interface for retrieving values given their names.
Definition: cryptlib.h:277