Crypto++  5.6.4
Free C++ class library of cryptographic schemes
salsa.cpp
1 // salsa.cpp - written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #ifndef CRYPTOPP_GENERATE_X64_MASM
9 
10 #include "salsa.h"
11 #include "argnames.h"
12 #include "misc.h"
13 #include "cpu.h"
14 
15 #if CRYPTOPP_MSC_VERSION
16 # pragma warning(disable: 4702 4740)
17 #endif
18 
// TODO: work around the GCC 4.8+ issue with the SSE2 ASM until the exact details are
// known and a fix is released. Reproduce it with "valgrind ./cryptest.exe tv salsa".
// Clang is also excluded, due to "Inline assembly operands don't work with .intel_syntax":
// https://llvm.org/bugs/show_bug.cgi?id=24232
23 #if defined(CRYPTOPP_DISABLE_SALSA_ASM)
24 # undef CRYPTOPP_X86_ASM_AVAILABLE
25 # undef CRYPTOPP_X32_ASM_AVAILABLE
26 # undef CRYPTOPP_X64_ASM_AVAILABLE
27 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
28 # undef CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
29 # define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
30 # define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
31 #endif
32 
33 NAMESPACE_BEGIN(CryptoPP)
34 
35 #if CRYPTOPP_DEBUG && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
36 void Salsa20_TestInstantiations()
37 {
40 }
41 #endif
42 
43 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
44 {
45  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
46 
47  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
48  throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
49 
50  // m_state is reordered for SSE2
52  get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
53  GetBlock<word32, LittleEndian> get2(key + length - 16);
54  get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
55 
56  // "expand 16-byte k" or "expand 32-byte k"
57  m_state[0] = 0x61707865;
58  m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
59  m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
60  m_state[3] = 0x6b206574;
61 }
62 
63 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
64 {
65  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
66  CRYPTOPP_ASSERT(length==8);
67 
69  get(m_state[14])(m_state[11]);
70  m_state[8] = m_state[5] = 0;
71 }
72 
73 void Salsa20_Policy::SeekToIteration(lword iterationCount)
74 {
75  m_state[8] = (word32)iterationCount;
76  m_state[5] = (word32)SafeRightShift<32>(iterationCount);
77 }
78 
79 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SALSA_ASM)
80 unsigned int Salsa20_Policy::GetAlignment() const
81 {
82 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
83  if (HasSSE2())
84  return 16;
85  else
86 #endif
87  return GetAlignmentOf<word32>();
88 }
89 
90 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
91 {
92 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
93  if (HasSSE2())
94  return 4*BYTES_PER_ITERATION;
95  else
96 #endif
97  return BYTES_PER_ITERATION;
98 }
99 #endif
100 
101 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
102 extern "C" {
103 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
104 }
105 #endif
106 
107 #if CRYPTOPP_MSC_VERSION
108 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
109 #endif
110 
111 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
112 {
113 #endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
114 
115 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
116  Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
117  return;
118 #endif
119 
120 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
121 #ifdef CRYPTOPP_GENERATE_X64_MASM
122  ALIGN 8
123  Salsa20_OperateKeystream PROC FRAME
124  mov r10, [rsp + 5*8] ; state
125  alloc_stack(10*16 + 32*16 + 8)
126  save_xmm128 xmm6, 0200h
127  save_xmm128 xmm7, 0210h
128  save_xmm128 xmm8, 0220h
129  save_xmm128 xmm9, 0230h
130  save_xmm128 xmm10, 0240h
131  save_xmm128 xmm11, 0250h
132  save_xmm128 xmm12, 0260h
133  save_xmm128 xmm13, 0270h
134  save_xmm128 xmm14, 0280h
135  save_xmm128 xmm15, 0290h
136  .endprolog
137 
138  #define REG_output rcx
139  #define REG_input rdx
140  #define REG_iterationCount r8
141  #define REG_state r10
142  #define REG_rounds e9d
143  #define REG_roundsLeft eax
144  #define REG_temp32 r11d
145  #define REG_temp r11
146  #define SSE2_WORKSPACE rsp
147 #else
148  if (HasSSE2())
149  {
150  #if CRYPTOPP_BOOL_X64
151  #define REG_output %1
152  #define REG_input %0
153  #define REG_iterationCount %2
154  #define REG_state %4 /* constant */
155  #define REG_rounds %3 /* constant */
156  #define REG_roundsLeft eax
157  #define REG_temp32 edx
158  #define REG_temp rdx
159  #define SSE2_WORKSPACE %5 /* constant */
160 
161  CRYPTOPP_ALIGN_DATA(16) byte workspace[16*32];
162  #else
163  #define REG_output edi
164  #define REG_input eax
165  #define REG_iterationCount ecx
166  #define REG_state esi
167  #define REG_rounds edx
168  #define REG_roundsLeft ebx
169  #define REG_temp32 ebp
170  #define REG_temp ebp
171  #define SSE2_WORKSPACE esp + WORD_SZ
172  #endif
173 
174  #ifdef __GNUC__
175  __asm__ __volatile__
176  (
177  INTEL_NOPREFIX
178  AS_PUSH_IF86( bx)
179  #else
180  void *s = m_state.data();
181  word32 r = m_rounds;
182 
183  AS2( mov REG_iterationCount, iterationCount)
184  AS2( mov REG_input, input)
185  AS2( mov REG_output, output)
186  AS2( mov REG_state, s)
187  AS2( mov REG_rounds, r)
188  #endif
189 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
190 
191  AS_PUSH_IF86( bp)
192  AS2( cmp REG_iterationCount, 4)
193  ASJ( jl, 5, f)
194 
195 #if CRYPTOPP_BOOL_X86
196  AS2( mov ebx, esp)
197  AS2( and esp, -16)
198  AS2( sub esp, 32*16)
199  AS1( push ebx)
200 #endif
201 
202 #define SSE2_EXPAND_S(i, j) \
203  ASS( pshufd xmm4, xmm##i, j, j, j, j) \
204  AS2( movdqa [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
205 
206  AS2( movdqa xmm0, [REG_state + 0*16])
207  AS2( movdqa xmm1, [REG_state + 1*16])
208  AS2( movdqa xmm2, [REG_state + 2*16])
209  AS2( movdqa xmm3, [REG_state + 3*16])
210  SSE2_EXPAND_S(0, 0)
211  SSE2_EXPAND_S(0, 1)
212  SSE2_EXPAND_S(0, 2)
213  SSE2_EXPAND_S(0, 3)
214  SSE2_EXPAND_S(1, 0)
215  SSE2_EXPAND_S(1, 2)
216  SSE2_EXPAND_S(1, 3)
217  SSE2_EXPAND_S(2, 1)
218  SSE2_EXPAND_S(2, 2)
219  SSE2_EXPAND_S(2, 3)
220  SSE2_EXPAND_S(3, 0)
221  SSE2_EXPAND_S(3, 1)
222  SSE2_EXPAND_S(3, 2)
223  SSE2_EXPAND_S(3, 3)
224 
225 #define SSE2_EXPAND_S85(i) \
226  AS2( mov dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft) \
227  AS2( mov dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32) \
228  AS2( add REG_roundsLeft, 1) \
229  AS2( adc REG_temp32, 0)
230 
231  ASL(1)
232  AS2( mov REG_roundsLeft, dword ptr [REG_state + 8*4])
233  AS2( mov REG_temp32, dword ptr [REG_state + 5*4])
234  SSE2_EXPAND_S85(0)
235  SSE2_EXPAND_S85(1)
236  SSE2_EXPAND_S85(2)
237  SSE2_EXPAND_S85(3)
238  AS2( mov dword ptr [REG_state + 8*4], REG_roundsLeft)
239  AS2( mov dword ptr [REG_state + 5*4], REG_temp32)
240 
241 #define SSE2_QUARTER_ROUND(a, b, d, i) \
242  AS2( movdqa xmm4, xmm##d) \
243  AS2( paddd xmm4, xmm##a) \
244  AS2( movdqa xmm5, xmm4) \
245  AS2( pslld xmm4, i) \
246  AS2( psrld xmm5, 32-i) \
247  AS2( pxor xmm##b, xmm4) \
248  AS2( pxor xmm##b, xmm5)
249 
250 #define L01(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##A, [SSE2_WORKSPACE + d*16 + i*256]) /* y3 */
251 #define L02(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##C, [SSE2_WORKSPACE + a*16 + i*256]) /* y0 */
252 #define L03(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* y0+y3 */
253 #define L04(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
254 #define L05(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 7)
255 #define L06(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-7)
256 #define L07(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
257 #define L08(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z1 */
258 #define L09(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + b*16], xmm##A)
259 #define L10(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
260 #define L11(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##C) /* z1+y0 */
261 #define L12(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
262 #define L13(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 9)
263 #define L14(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-9)
264 #define L15(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
265 #define L16(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z2 */
266 #define L17(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + c*16], xmm##A)
267 #define L18(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
268 #define L19(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##B) /* z2+z1 */
269 #define L20(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##B, xmm##A)
270 #define L21(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 13)
271 #define L22(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##B, 32-13)
272 #define L23(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
273 #define L24(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##B) /* z3 */
274 #define L25(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + d*16], xmm##A)
275 #define L26(A,B,C,D,a,b,c,d,i) AS2( paddd xmm##A, xmm##D) /* z3+z2 */
276 #define L27(A,B,C,D,a,b,c,d,i) AS2( movdqa xmm##D, xmm##A)
277 #define L28(A,B,C,D,a,b,c,d,i) AS2( pslld xmm##A, 18)
278 #define L29(A,B,C,D,a,b,c,d,i) AS2( psrld xmm##D, 32-18)
279 #define L30(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##C) /* xor y0 */
280 #define L31(A,B,C,D,a,b,c,d,i) AS2( pxor xmm##A, xmm##D) /* z0 */
281 #define L32(A,B,C,D,a,b,c,d,i) AS2( movdqa [SSE2_WORKSPACE + a*16], xmm##A)
282 
283 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h) \
284  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) \
285  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) \
286  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) \
287  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) \
288  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) \
289  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) \
290  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) \
291  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) \
292  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) \
293  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) \
294  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) \
295  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) \
296  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) \
297  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) \
298  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) \
299  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) \
300  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) \
301  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) \
302  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) \
303  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) \
304  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) \
305  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) \
306  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) \
307  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) \
308  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) \
309  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) \
310  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) \
311  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) \
312  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) \
313  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) \
314  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) \
315  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i)
316 
317 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H) \
318  L01(0,1,2,3, a,b,c,d, i) L01(4,5,6,7, e,f,g,h, i) L01(8,9,10,11, A,B,C,D, i) L01(12,13,14,15, E,F,G,H, i) \
319  L02(0,1,2,3, a,b,c,d, i) L02(4,5,6,7, e,f,g,h, i) L02(8,9,10,11, A,B,C,D, i) L02(12,13,14,15, E,F,G,H, i) \
320  L03(0,1,2,3, a,b,c,d, i) L03(4,5,6,7, e,f,g,h, i) L03(8,9,10,11, A,B,C,D, i) L03(12,13,14,15, E,F,G,H, i) \
321  L04(0,1,2,3, a,b,c,d, i) L04(4,5,6,7, e,f,g,h, i) L04(8,9,10,11, A,B,C,D, i) L04(12,13,14,15, E,F,G,H, i) \
322  L05(0,1,2,3, a,b,c,d, i) L05(4,5,6,7, e,f,g,h, i) L05(8,9,10,11, A,B,C,D, i) L05(12,13,14,15, E,F,G,H, i) \
323  L06(0,1,2,3, a,b,c,d, i) L06(4,5,6,7, e,f,g,h, i) L06(8,9,10,11, A,B,C,D, i) L06(12,13,14,15, E,F,G,H, i) \
324  L07(0,1,2,3, a,b,c,d, i) L07(4,5,6,7, e,f,g,h, i) L07(8,9,10,11, A,B,C,D, i) L07(12,13,14,15, E,F,G,H, i) \
325  L08(0,1,2,3, a,b,c,d, i) L08(4,5,6,7, e,f,g,h, i) L08(8,9,10,11, A,B,C,D, i) L08(12,13,14,15, E,F,G,H, i) \
326  L09(0,1,2,3, a,b,c,d, i) L09(4,5,6,7, e,f,g,h, i) L09(8,9,10,11, A,B,C,D, i) L09(12,13,14,15, E,F,G,H, i) \
327  L10(0,1,2,3, a,b,c,d, i) L10(4,5,6,7, e,f,g,h, i) L10(8,9,10,11, A,B,C,D, i) L10(12,13,14,15, E,F,G,H, i) \
328  L11(0,1,2,3, a,b,c,d, i) L11(4,5,6,7, e,f,g,h, i) L11(8,9,10,11, A,B,C,D, i) L11(12,13,14,15, E,F,G,H, i) \
329  L12(0,1,2,3, a,b,c,d, i) L12(4,5,6,7, e,f,g,h, i) L12(8,9,10,11, A,B,C,D, i) L12(12,13,14,15, E,F,G,H, i) \
330  L13(0,1,2,3, a,b,c,d, i) L13(4,5,6,7, e,f,g,h, i) L13(8,9,10,11, A,B,C,D, i) L13(12,13,14,15, E,F,G,H, i) \
331  L14(0,1,2,3, a,b,c,d, i) L14(4,5,6,7, e,f,g,h, i) L14(8,9,10,11, A,B,C,D, i) L14(12,13,14,15, E,F,G,H, i) \
332  L15(0,1,2,3, a,b,c,d, i) L15(4,5,6,7, e,f,g,h, i) L15(8,9,10,11, A,B,C,D, i) L15(12,13,14,15, E,F,G,H, i) \
333  L16(0,1,2,3, a,b,c,d, i) L16(4,5,6,7, e,f,g,h, i) L16(8,9,10,11, A,B,C,D, i) L16(12,13,14,15, E,F,G,H, i) \
334  L17(0,1,2,3, a,b,c,d, i) L17(4,5,6,7, e,f,g,h, i) L17(8,9,10,11, A,B,C,D, i) L17(12,13,14,15, E,F,G,H, i) \
335  L18(0,1,2,3, a,b,c,d, i) L18(4,5,6,7, e,f,g,h, i) L18(8,9,10,11, A,B,C,D, i) L18(12,13,14,15, E,F,G,H, i) \
336  L19(0,1,2,3, a,b,c,d, i) L19(4,5,6,7, e,f,g,h, i) L19(8,9,10,11, A,B,C,D, i) L19(12,13,14,15, E,F,G,H, i) \
337  L20(0,1,2,3, a,b,c,d, i) L20(4,5,6,7, e,f,g,h, i) L20(8,9,10,11, A,B,C,D, i) L20(12,13,14,15, E,F,G,H, i) \
338  L21(0,1,2,3, a,b,c,d, i) L21(4,5,6,7, e,f,g,h, i) L21(8,9,10,11, A,B,C,D, i) L21(12,13,14,15, E,F,G,H, i) \
339  L22(0,1,2,3, a,b,c,d, i) L22(4,5,6,7, e,f,g,h, i) L22(8,9,10,11, A,B,C,D, i) L22(12,13,14,15, E,F,G,H, i) \
340  L23(0,1,2,3, a,b,c,d, i) L23(4,5,6,7, e,f,g,h, i) L23(8,9,10,11, A,B,C,D, i) L23(12,13,14,15, E,F,G,H, i) \
341  L24(0,1,2,3, a,b,c,d, i) L24(4,5,6,7, e,f,g,h, i) L24(8,9,10,11, A,B,C,D, i) L24(12,13,14,15, E,F,G,H, i) \
342  L25(0,1,2,3, a,b,c,d, i) L25(4,5,6,7, e,f,g,h, i) L25(8,9,10,11, A,B,C,D, i) L25(12,13,14,15, E,F,G,H, i) \
343  L26(0,1,2,3, a,b,c,d, i) L26(4,5,6,7, e,f,g,h, i) L26(8,9,10,11, A,B,C,D, i) L26(12,13,14,15, E,F,G,H, i) \
344  L27(0,1,2,3, a,b,c,d, i) L27(4,5,6,7, e,f,g,h, i) L27(8,9,10,11, A,B,C,D, i) L27(12,13,14,15, E,F,G,H, i) \
345  L28(0,1,2,3, a,b,c,d, i) L28(4,5,6,7, e,f,g,h, i) L28(8,9,10,11, A,B,C,D, i) L28(12,13,14,15, E,F,G,H, i) \
346  L29(0,1,2,3, a,b,c,d, i) L29(4,5,6,7, e,f,g,h, i) L29(8,9,10,11, A,B,C,D, i) L29(12,13,14,15, E,F,G,H, i) \
347  L30(0,1,2,3, a,b,c,d, i) L30(4,5,6,7, e,f,g,h, i) L30(8,9,10,11, A,B,C,D, i) L30(12,13,14,15, E,F,G,H, i) \
348  L31(0,1,2,3, a,b,c,d, i) L31(4,5,6,7, e,f,g,h, i) L31(8,9,10,11, A,B,C,D, i) L31(12,13,14,15, E,F,G,H, i) \
349  L32(0,1,2,3, a,b,c,d, i) L32(4,5,6,7, e,f,g,h, i) L32(8,9,10,11, A,B,C,D, i) L32(12,13,14,15, E,F,G,H, i)
350 
351 #if CRYPTOPP_BOOL_X64
352  SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
353 #else
354  SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
355  SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
356 #endif
357  AS2( mov REG_roundsLeft, REG_rounds)
358  ASJ( jmp, 2, f)
359 
360  ASL(SSE2_Salsa_Output)
361  AS2( movdqa xmm0, xmm4)
362  AS2( punpckldq xmm4, xmm5)
363  AS2( movdqa xmm1, xmm6)
364  AS2( punpckldq xmm6, xmm7)
365  AS2( movdqa xmm2, xmm4)
366  AS2( punpcklqdq xmm4, xmm6) // e
367  AS2( punpckhqdq xmm2, xmm6) // f
368  AS2( punpckhdq xmm0, xmm5)
369  AS2( punpckhdq xmm1, xmm7)
370  AS2( movdqa xmm6, xmm0)
371  AS2( punpcklqdq xmm0, xmm1) // g
372  AS2( punpckhqdq xmm6, xmm1) // h
373  AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
374  AS1( ret)
375 
376  ASL(6)
377 #if CRYPTOPP_BOOL_X64
378  SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
379  ASL(2)
380  SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
381 #else
382  SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
383  SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
384  ASL(2)
385  SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
386  SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
387 #endif
388  AS2( sub REG_roundsLeft, 2)
389  ASJ( jnz, 6, b)
390 
391 #define SSE2_OUTPUT_4(a, b, c, d) \
392  AS2( movdqa xmm4, [SSE2_WORKSPACE + a*16 + 256])\
393  AS2( paddd xmm4, [SSE2_WORKSPACE + a*16])\
394  AS2( movdqa xmm5, [SSE2_WORKSPACE + b*16 + 256])\
395  AS2( paddd xmm5, [SSE2_WORKSPACE + b*16])\
396  AS2( movdqa xmm6, [SSE2_WORKSPACE + c*16 + 256])\
397  AS2( paddd xmm6, [SSE2_WORKSPACE + c*16])\
398  AS2( movdqa xmm7, [SSE2_WORKSPACE + d*16 + 256])\
399  AS2( paddd xmm7, [SSE2_WORKSPACE + d*16])\
400  ASC( call, SSE2_Salsa_Output)
401 
402  SSE2_OUTPUT_4(0, 13, 10, 7)
403  SSE2_OUTPUT_4(4, 1, 14, 11)
404  SSE2_OUTPUT_4(8, 5, 2, 15)
405  SSE2_OUTPUT_4(12, 9, 6, 3)
406  AS2( test REG_input, REG_input)
407  ASJ( jz, 9, f)
408  AS2( add REG_input, 12*16)
409  ASL(9)
410  AS2( add REG_output, 12*16)
411  AS2( sub REG_iterationCount, 4)
412  AS2( cmp REG_iterationCount, 4)
413  ASJ( jge, 1, b)
414  AS_POP_IF86( sp)
415 
416  ASL(5)
417  AS2( sub REG_iterationCount, 1)
418  ASJ( jl, 4, f)
419  AS2( movdqa xmm0, [REG_state + 0*16])
420  AS2( movdqa xmm1, [REG_state + 1*16])
421  AS2( movdqa xmm2, [REG_state + 2*16])
422  AS2( movdqa xmm3, [REG_state + 3*16])
423  AS2( mov REG_roundsLeft, REG_rounds)
424 
425  ASL(0)
426  SSE2_QUARTER_ROUND(0, 1, 3, 7)
427  SSE2_QUARTER_ROUND(1, 2, 0, 9)
428  SSE2_QUARTER_ROUND(2, 3, 1, 13)
429  SSE2_QUARTER_ROUND(3, 0, 2, 18)
430  ASS( pshufd xmm1, xmm1, 2, 1, 0, 3)
431  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
432  ASS( pshufd xmm3, xmm3, 0, 3, 2, 1)
433  SSE2_QUARTER_ROUND(0, 3, 1, 7)
434  SSE2_QUARTER_ROUND(3, 2, 0, 9)
435  SSE2_QUARTER_ROUND(2, 1, 3, 13)
436  SSE2_QUARTER_ROUND(1, 0, 2, 18)
437  ASS( pshufd xmm1, xmm1, 0, 3, 2, 1)
438  ASS( pshufd xmm2, xmm2, 1, 0, 3, 2)
439  ASS( pshufd xmm3, xmm3, 2, 1, 0, 3)
440  AS2( sub REG_roundsLeft, 2)
441  ASJ( jnz, 0, b)
442 
443  AS2( paddd xmm0, [REG_state + 0*16])
444  AS2( paddd xmm1, [REG_state + 1*16])
445  AS2( paddd xmm2, [REG_state + 2*16])
446  AS2( paddd xmm3, [REG_state + 3*16])
447 
448  AS2( add dword ptr [REG_state + 8*4], 1)
449  AS2( adc dword ptr [REG_state + 5*4], 0)
450 
451  AS2( pcmpeqb xmm6, xmm6) // all ones
452  AS2( psrlq xmm6, 32) // lo32 mask
453  ASS( pshufd xmm7, xmm6, 0, 1, 2, 3) // hi32 mask
454  AS2( movdqa xmm4, xmm0)
455  AS2( movdqa xmm5, xmm3)
456  AS2( pand xmm0, xmm7)
457  AS2( pand xmm4, xmm6)
458  AS2( pand xmm3, xmm6)
459  AS2( pand xmm5, xmm7)
460  AS2( por xmm4, xmm5) // 0,13,2,15
461  AS2( movdqa xmm5, xmm1)
462  AS2( pand xmm1, xmm7)
463  AS2( pand xmm5, xmm6)
464  AS2( por xmm0, xmm5) // 4,1,6,3
465  AS2( pand xmm6, xmm2)
466  AS2( pand xmm2, xmm7)
467  AS2( por xmm1, xmm6) // 8,5,10,7
468  AS2( por xmm2, xmm3) // 12,9,14,11
469 
470  AS2( movdqa xmm5, xmm4)
471  AS2( movdqa xmm6, xmm0)
472  AS3( shufpd xmm4, xmm1, 2) // 0,13,10,7
473  AS3( shufpd xmm0, xmm2, 2) // 4,1,14,11
474  AS3( shufpd xmm1, xmm5, 2) // 8,5,2,15
475  AS3( shufpd xmm2, xmm6, 2) // 12,9,6,3
476 
477  // output keystream
478  AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
479  ASJ( jmp, 5, b)
480  ASL(4)
481 
482  AS_POP_IF86( bp)
483 #ifdef __GNUC__
484  AS_POP_IF86( bx)
485  ATT_PREFIX
486  #if CRYPTOPP_BOOL_X64
487  : "+r" (input), "+r" (output), "+r" (iterationCount)
488  : "r" (m_rounds), "r" (m_state.m_ptr), "r" (workspace)
489  : "%eax", "%rdx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
490  #else
491  : "+a" (input), "+D" (output), "+c" (iterationCount)
492  : "d" (m_rounds), "S" (m_state.m_ptr)
493  : "memory", "cc"
494  #endif
495  );
496 #endif
497 #ifdef CRYPTOPP_GENERATE_X64_MASM
498  movdqa xmm6, [rsp + 0200h]
499  movdqa xmm7, [rsp + 0210h]
500  movdqa xmm8, [rsp + 0220h]
501  movdqa xmm9, [rsp + 0230h]
502  movdqa xmm10, [rsp + 0240h]
503  movdqa xmm11, [rsp + 0250h]
504  movdqa xmm12, [rsp + 0260h]
505  movdqa xmm13, [rsp + 0270h]
506  movdqa xmm14, [rsp + 0280h]
507  movdqa xmm15, [rsp + 0290h]
508  add rsp, 10*16 + 32*16 + 8
509  ret
510 Salsa20_OperateKeystream ENDP
511 #else
512  }
513  else
514 #endif
515 #endif
516 #ifndef CRYPTOPP_GENERATE_X64_MASM
517  {
518  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
519 
520  while (iterationCount--)
521  {
522  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
523  x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
524  x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
525  x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
526 
527  for (int i=m_rounds; i>0; i-=2)
528  {
529  #define QUARTER_ROUND(a, b, c, d) \
530  b = b ^ rotlFixed(a + d, 7); \
531  c = c ^ rotlFixed(b + a, 9); \
532  d = d ^ rotlFixed(c + b, 13); \
533  a = a ^ rotlFixed(d + c, 18);
534 
535  QUARTER_ROUND(x0, x4, x8, x12)
536  QUARTER_ROUND(x1, x5, x9, x13)
537  QUARTER_ROUND(x2, x6, x10, x14)
538  QUARTER_ROUND(x3, x7, x11, x15)
539 
540  QUARTER_ROUND(x0, x13, x10, x7)
541  QUARTER_ROUND(x1, x14, x11, x4)
542  QUARTER_ROUND(x2, x15, x8, x5)
543  QUARTER_ROUND(x3, x12, x9, x6)
544  }
545 
546  #define SALSA_OUTPUT(x) {\
547  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
548  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
549  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
550  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
551  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
552  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
553  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
554  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
555  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
556  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
557  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
558  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
559  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
560  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
561  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
562  CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
563 
564 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
565  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
566 #endif
567 
568  if (++m_state[8] == 0)
569  ++m_state[5];
570  }
571  }
572 } // see comment above if an internal compiler error occurs here
573 
574 void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
575 {
576  m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
577 
578  if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
579  throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
580 
581  GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
582  if (length == 16)
583  memcpy(m_key.begin()+4, m_key.begin(), 16);
584 
585  // "expand 32-byte k"
586  m_state[0] = 0x61707865;
587  m_state[1] = 0x3320646e;
588  m_state[2] = 0x79622d32;
589  m_state[3] = 0x6b206574;
590 }
591 
592 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
593 {
594  CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
595  CRYPTOPP_ASSERT(length==24);
596 
597  word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
598 
600  get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
601 
602  x13 = m_key[0]; x10 = m_key[1]; x7 = m_key[2]; x4 = m_key[3];
603  x15 = m_key[4]; x12 = m_key[5]; x9 = m_key[6]; x6 = m_key[7];
604  x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
605 
606  for (int i=m_rounds; i>0; i-=2)
607  {
608  QUARTER_ROUND(x0, x4, x8, x12)
609  QUARTER_ROUND(x1, x5, x9, x13)
610  QUARTER_ROUND(x2, x6, x10, x14)
611  QUARTER_ROUND(x3, x7, x11, x15)
612 
613  QUARTER_ROUND(x0, x13, x10, x7)
614  QUARTER_ROUND(x1, x14, x11, x4)
615  QUARTER_ROUND(x2, x15, x8, x5)
616  QUARTER_ROUND(x3, x12, x9, x6)
617  }
618 
619  m_state[13] = x0; m_state[10] = x1; m_state[7] = x2; m_state[4] = x3;
620  m_state[15] = x14; m_state[12] = x11; m_state[9] = x8; m_state[6] = x5;
621  m_state[8] = m_state[5] = 0;
622 }
623 
624 NAMESPACE_END
625 
626 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
Standard names for retrieving values by name when working with NameValuePairs.
virtual unsigned int GetOptimalBlockSize() const
Provides number of ideal bytes to process.
Definition: strciphr.h:127
Utility functions for the Crypto++ library.
const char * Rounds()
int
Definition: argnames.h:23
unsigned int GetAlignment() const
Provides data alignment requirements.
Definition: strciphr.h:198
Library configuration file.
#define CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(x, y)
Helper macro to implement OperateKeystream.
Definition: strciphr.h:237
byte order is little-endian
Definition: cryptlib.h:130
void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
Key the cipher.
Definition: salsa.cpp:574
int GetIntValueWithDefault(const char *name, int defaultValue) const
Get a named value with type int, with default.
Definition: cryptlib.h:385
Exception thrown when an invalid number of rounds is encountered.
Definition: simple.h:55
A::pointer data()
Provides a pointer to the first element in the memory block.
Definition: secblock.h:516
void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
Resynchronize the cipher.
Definition: salsa.cpp:592
Safely right shift values when undefined behavior could occur.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:62
Functions for CPU features and intrinsics.
Classes for Salsa and Salsa20 stream ciphers.
const char * IV()
ConstByteArrayParameter, also accepts const byte * for backwards compatibility.
Definition: argnames.h:21
bool HasSSE2()
Determines SSE2 availability.
Definition: cpu.h:160
Access a block of memory.
Definition: misc.h:2175
KeystreamOperation
Keystream operation flags.
Definition: strciphr.h:92
Crypto++ library namespace.
SymmetricCipher implementation.
Definition: strciphr.h:584
Interface for retrieving values given their names.
Definition: cryptlib.h:282