• Main Page
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

salsa.cpp

00001 // salsa.cpp - written and placed in the public domain by Wei Dai
00002 
00003 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
00004 
00005 #include "pch.h"
00006 
00007 #ifndef CRYPTOPP_GENERATE_X64_MASM
00008 
00009 #include "salsa.h"
00010 #include "misc.h"
00011 #include "argnames.h"
00012 #include "cpu.h"
00013 
00014 NAMESPACE_BEGIN(CryptoPP)
00015 
00016 void Salsa20_TestInstantiations()       // builds and immediately destroys one default Salsa20::Encryption; NOTE(review): presumably only here to force the template to instantiate -- confirm
00017 {
00018         Salsa20::Encryption instantiationProbe;
00019 }
00020 
00021 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
00022 {
00023         // Rounds is read from params with 20 supplied as the default; only 8, 12 and 20 pass the check below.
00024         m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00025         if (m_rounds != 8 && m_rounds != 12 && m_rounds != 20)
00026                 throw InvalidRounds(Salsa20::StaticAlgorithmName(), m_rounds);
00027         // m_state is reordered for SSE2: words read from the start of key go to 13,10,7,4; words read from key + length - 16 go to 15,12,9,6
00028         GetBlock<word32, LittleEndian> leadingWords(key);
00029         leadingWords(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
00030         GetBlock<word32, LittleEndian> trailingWords(key + length - 16);        // same address as key when length == 16
00031         trailingWords(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
00032 
00033         // "expand 16-byte k" or "expand 32-byte k"
00034         const bool isShortKey = (length == 16);
00035         m_state[0] = 0x61707865;
00036         m_state[1] = isShortKey ? 0x3120646e : 0x3320646e;
00037         m_state[2] = isShortKey ? 0x79622d36 : 0x79622d32;
00038         m_state[3] = 0x6b206574;
00039 }
00040 
00041 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
00042 {
00043         assert(length==8);      // debug-build check only; keystreamBuffer is not touched in this function
00044         GetBlock<word32, LittleEndian> ivWords(IV);
00045         ivWords(m_state[14])(m_state[11]);      // the IV is loaded into state words 14 and 11
00046         m_state[5] = m_state[8] = 0;            // words 8 and 5 are the ones SeekToIteration fills with the low/high iteration count; restart them at zero
00047 }
00048 
00049 void Salsa20_Policy::SeekToIteration(lword iterationCount)
00050 {
00051         m_state[5] = static_cast<word32>(SafeRightShift<32>(iterationCount));   // iterationCount shifted right by 32: the upper half of the counter
00052         m_state[8] = static_cast<word32>(iterationCount);                       // truncation keeps the low 32 bits
00053 }
00054 
00055 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
00056 unsigned int Salsa20_Policy::GetAlignment() const
00057 {
00058         // 16 when the SSE2 assembly is compiled in and HasSSE2() reports support; otherwise the alignment of word32.
00059 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00060         if (HasSSE2())
00061                 return 16;
00062 #endif
00063         return GetAlignmentOf<word32>();
00064 }
00065 
00066 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
00067 {
00068         // Four iterations' worth of bytes when the SSE2 assembly is compiled in and usable (it handles 4 iterations per pass); one otherwise.
00069 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00070         if (HasSSE2())
00071                 return 4*BYTES_PER_ITERATION;
00072 #endif
00073         return BYTES_PER_ITERATION;
00074 }
00075 #endif
00076 
00077 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00078 // Prototype for the keystream routine; the PROC of the same name further down is emitted only when this file is run through the MASM-generation step.
00079 // Declared with C linkage so the call in OperateKeystream refers to the plain symbol name.
00080 extern "C" void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
00081 #endif
00082 
00083 #pragma warning(disable: 4731)  // frame pointer register 'ebp' modified by inline assembly code
00084 
00085 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00086 {
00087 #endif  // #ifndef CRYPTOPP_GENERATE_X64_MASM
00088 
00089 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00090         Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
00091         return;
00092 #endif
00093 
00094 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00095 #ifdef CRYPTOPP_GENERATE_X64_MASM
00096                 ALIGN   8
00097         Salsa20_OperateKeystream        PROC FRAME
00098                 mov             r10, [rsp + 5*8]                        ; state
00099                 alloc_stack(10*16 + 32*16 + 8)
00100                 save_xmm128 xmm6, 0200h
00101                 save_xmm128 xmm7, 0210h
00102                 save_xmm128 xmm8, 0220h
00103                 save_xmm128 xmm9, 0230h
00104                 save_xmm128 xmm10, 0240h
00105                 save_xmm128 xmm11, 0250h
00106                 save_xmm128 xmm12, 0260h
00107                 save_xmm128 xmm13, 0270h
00108                 save_xmm128 xmm14, 0280h
00109                 save_xmm128 xmm15, 0290h
00110                 .endprolog
00111 
00112         #define REG_output                      rcx
00113         #define REG_input                       rdx
00114         #define REG_iterationCount      r8
00115         #define REG_state                       r10
00116         #define REG_rounds                      r9d     // rounds is the 4th argument of Salsa20_OperateKeystream (after rcx, rdx, r8), so it arrives in r9; e9d names no x64 register
00117         #define REG_roundsLeft          eax
00118         #define REG_temp32                      r11d
00119         #define REG_temp                        r11
00120         #define SSE2_WORKSPACE          rsp
00121 #else
00122         if (HasSSE2())
00123         {
00124         #if CRYPTOPP_BOOL_X64
00125                 #define REG_output                      %4
00126                 #define REG_input                       %1
00127                 #define REG_iterationCount      %2
00128                 #define REG_state                       %3
00129                 #define REG_rounds                      %0
00130                 #define REG_roundsLeft          eax
00131                 #define REG_temp32                      edx
00132                 #define REG_temp                        rdx
00133                 #define SSE2_WORKSPACE          %5
00134 
00135                 FixedSizeAlignedSecBlock<byte, 32*16> workspace;
00136         #else
00137                 #define REG_output                      edi
00138                 #define REG_input                       eax
00139                 #define REG_iterationCount      ecx
00140                 #define REG_state                       esi
00141                 #define REG_rounds                      edx
00142                 #define REG_roundsLeft          ebx
00143                 #define REG_temp32                      ebp
00144                 #define REG_temp                        ebp
00145                 #define SSE2_WORKSPACE          esp + WORD_SZ
00146         #endif
00147 
00148         #ifdef __GNUC__
00149                 __asm__ __volatile__
00150                 (
00151                         ".intel_syntax noprefix;"
00152                         AS_PUSH_IF86(   bx)
00153         #else
00154                 void *s = m_state.data();
00155                 word32 r = m_rounds;
00156 
00157                 AS2(    mov             REG_iterationCount, iterationCount)
00158                 AS2(    mov             REG_input, input)
00159                 AS2(    mov             REG_output, output)
00160                 AS2(    mov             REG_state, s)
00161                 AS2(    mov             REG_rounds, r)
00162         #endif
00163 #endif  // #ifndef CRYPTOPP_GENERATE_X64_MASM
00164 
00165                 AS_PUSH_IF86(   bp)
00166                 AS2(    cmp             REG_iterationCount, 4)
00167                 ASJ(    jl,             5, f)
00168 
00169 #if CRYPTOPP_BOOL_X86
00170                 AS2(    mov             ebx, esp)
00171                 AS2(    and             esp, -16)
00172                 AS2(    sub             esp, 32*16)
00173                 AS1(    push    ebx)
00174 #endif
00175 
00176 #define SSE2_EXPAND_S(i, j)             \
00177         ASS(    pshufd  xmm4, xmm##i, j, j, j, j)       \
00178         AS2(    movdqa  [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
00179 
00180                 AS2(    movdqa  xmm0, [REG_state + 0*16])
00181                 AS2(    movdqa  xmm1, [REG_state + 1*16])
00182                 AS2(    movdqa  xmm2, [REG_state + 2*16])
00183                 AS2(    movdqa  xmm3, [REG_state + 3*16])
00184                 SSE2_EXPAND_S(0, 0)
00185                 SSE2_EXPAND_S(0, 1)
00186                 SSE2_EXPAND_S(0, 2)
00187                 SSE2_EXPAND_S(0, 3)
00188                 SSE2_EXPAND_S(1, 0)
00189                 SSE2_EXPAND_S(1, 2)
00190                 SSE2_EXPAND_S(1, 3)
00191                 SSE2_EXPAND_S(2, 1)
00192                 SSE2_EXPAND_S(2, 2)
00193                 SSE2_EXPAND_S(2, 3)
00194                 SSE2_EXPAND_S(3, 0)
00195                 SSE2_EXPAND_S(3, 1)
00196                 SSE2_EXPAND_S(3, 2)
00197                 SSE2_EXPAND_S(3, 3)
00198 
00199 #define SSE2_EXPAND_S85(i)              \
00200                 AS2(    mov             dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft)  \
00201                 AS2(    mov             dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32)      \
00202                 AS2(    add             REG_roundsLeft, 1)      \
00203                 AS2(    adc             REG_temp32, 0)
00204 
00205                 ASL(1)
00206                 AS2(    mov             REG_roundsLeft, dword ptr [REG_state + 8*4])
00207                 AS2(    mov             REG_temp32, dword ptr [REG_state + 5*4])
00208                 SSE2_EXPAND_S85(0)
00209                 SSE2_EXPAND_S85(1)
00210                 SSE2_EXPAND_S85(2)
00211                 SSE2_EXPAND_S85(3)
00212                 AS2(    mov             dword ptr [REG_state + 8*4], REG_roundsLeft)
00213                 AS2(    mov             dword ptr [REG_state + 5*4], REG_temp32)
00214 
00215 #define SSE2_QUARTER_ROUND(a, b, d, i)          \
00216         AS2(    movdqa  xmm4, xmm##d)                   \
00217         AS2(    paddd   xmm4, xmm##a)                   \
00218         AS2(    movdqa  xmm5, xmm4)                             \
00219         AS2(    pslld   xmm4, i)                                \
00220         AS2(    psrld   xmm5, 32-i)                             \
00221         AS2(    pxor    xmm##b, xmm4)                   \
00222         AS2(    pxor    xmm##b, xmm5)
00223 
00224 #define L01(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##A, [SSE2_WORKSPACE + d*16 + i*256])        /* y3 */
00225 #define L02(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##C, [SSE2_WORKSPACE + a*16 + i*256])        /* y0 */        
00226 #define L03(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##C)         /* y0+y3 */                                                     
00227 #define L04(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##B, xmm##A)                                                                                 
00228 #define L05(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 7)                                                                                      
00229 #define L06(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##B, 32-7)                                                                                   
00230 #define L07(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, [SSE2_WORKSPACE + b*16 + i*256])                                
00231 #define L08(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##B)         /* z1 */                                                        
00232 #define L09(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + b*16], xmm##A)                                
00233 #define L10(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##B, xmm##A)                                                                                 
00234 #define L11(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##C)         /* z1+y0 */                                                     
00235 #define L12(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##D, xmm##A)                                                                                 
00236 #define L13(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 9)                                                                                      
00237 #define L14(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##D, 32-9)                                                                                   
00238 #define L15(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, [SSE2_WORKSPACE + c*16 + i*256])                                
00239 #define L16(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##D)         /* z2 */                                                        
00240 #define L17(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + c*16], xmm##A)                                
00241 #define L18(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##D, xmm##A)                                                                                 
00242 #define L19(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##B)         /* z2+z1 */                                                     
00243 #define L20(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##B, xmm##A)                                                                                 
00244 #define L21(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 13)                                                                                     
00245 #define L22(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##B, 32-13)                                                                          
00246 #define L23(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, [SSE2_WORKSPACE + d*16 + i*256])                                
00247 #define L24(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##B)         /* z3 */                                                        
00248 #define L25(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + d*16], xmm##A)                                
00249 #define L26(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##D)         /* z3+z2 */                                                     
00250 #define L27(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##D, xmm##A)                                                                                 
00251 #define L28(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 18)                                                                                     
00252 #define L29(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##D, 32-18)                                                                          
00253 #define L30(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##C)         /* xor y0 */                                            
00254 #define L31(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##D)         /* z0 */                                                        
00255 #define L32(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + a*16], xmm##A)                                
00256 
00257 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h)        \
00258         L01(0,1,2,3, a,b,c,d, i)        L01(4,5,6,7, e,f,g,h, i)        \
00259         L02(0,1,2,3, a,b,c,d, i)        L02(4,5,6,7, e,f,g,h, i)        \
00260         L03(0,1,2,3, a,b,c,d, i)        L03(4,5,6,7, e,f,g,h, i)        \
00261         L04(0,1,2,3, a,b,c,d, i)        L04(4,5,6,7, e,f,g,h, i)        \
00262         L05(0,1,2,3, a,b,c,d, i)        L05(4,5,6,7, e,f,g,h, i)        \
00263         L06(0,1,2,3, a,b,c,d, i)        L06(4,5,6,7, e,f,g,h, i)        \
00264         L07(0,1,2,3, a,b,c,d, i)        L07(4,5,6,7, e,f,g,h, i)        \
00265         L08(0,1,2,3, a,b,c,d, i)        L08(4,5,6,7, e,f,g,h, i)        \
00266         L09(0,1,2,3, a,b,c,d, i)        L09(4,5,6,7, e,f,g,h, i)        \
00267         L10(0,1,2,3, a,b,c,d, i)        L10(4,5,6,7, e,f,g,h, i)        \
00268         L11(0,1,2,3, a,b,c,d, i)        L11(4,5,6,7, e,f,g,h, i)        \
00269         L12(0,1,2,3, a,b,c,d, i)        L12(4,5,6,7, e,f,g,h, i)        \
00270         L13(0,1,2,3, a,b,c,d, i)        L13(4,5,6,7, e,f,g,h, i)        \
00271         L14(0,1,2,3, a,b,c,d, i)        L14(4,5,6,7, e,f,g,h, i)        \
00272         L15(0,1,2,3, a,b,c,d, i)        L15(4,5,6,7, e,f,g,h, i)        \
00273         L16(0,1,2,3, a,b,c,d, i)        L16(4,5,6,7, e,f,g,h, i)        \
00274         L17(0,1,2,3, a,b,c,d, i)        L17(4,5,6,7, e,f,g,h, i)        \
00275         L18(0,1,2,3, a,b,c,d, i)        L18(4,5,6,7, e,f,g,h, i)        \
00276         L19(0,1,2,3, a,b,c,d, i)        L19(4,5,6,7, e,f,g,h, i)        \
00277         L20(0,1,2,3, a,b,c,d, i)        L20(4,5,6,7, e,f,g,h, i)        \
00278         L21(0,1,2,3, a,b,c,d, i)        L21(4,5,6,7, e,f,g,h, i)        \
00279         L22(0,1,2,3, a,b,c,d, i)        L22(4,5,6,7, e,f,g,h, i)        \
00280         L23(0,1,2,3, a,b,c,d, i)        L23(4,5,6,7, e,f,g,h, i)        \
00281         L24(0,1,2,3, a,b,c,d, i)        L24(4,5,6,7, e,f,g,h, i)        \
00282         L25(0,1,2,3, a,b,c,d, i)        L25(4,5,6,7, e,f,g,h, i)        \
00283         L26(0,1,2,3, a,b,c,d, i)        L26(4,5,6,7, e,f,g,h, i)        \
00284         L27(0,1,2,3, a,b,c,d, i)        L27(4,5,6,7, e,f,g,h, i)        \
00285         L28(0,1,2,3, a,b,c,d, i)        L28(4,5,6,7, e,f,g,h, i)        \
00286         L29(0,1,2,3, a,b,c,d, i)        L29(4,5,6,7, e,f,g,h, i)        \
00287         L30(0,1,2,3, a,b,c,d, i)        L30(4,5,6,7, e,f,g,h, i)        \
00288         L31(0,1,2,3, a,b,c,d, i)        L31(4,5,6,7, e,f,g,h, i)        \
00289         L32(0,1,2,3, a,b,c,d, i)        L32(4,5,6,7, e,f,g,h, i)
00290 
00291 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H)       \
00292         L01(0,1,2,3, a,b,c,d, i)        L01(4,5,6,7, e,f,g,h, i)        L01(8,9,10,11, A,B,C,D, i)      L01(12,13,14,15, E,F,G,H, i)    \
00293         L02(0,1,2,3, a,b,c,d, i)        L02(4,5,6,7, e,f,g,h, i)        L02(8,9,10,11, A,B,C,D, i)      L02(12,13,14,15, E,F,G,H, i)    \
00294         L03(0,1,2,3, a,b,c,d, i)        L03(4,5,6,7, e,f,g,h, i)        L03(8,9,10,11, A,B,C,D, i)      L03(12,13,14,15, E,F,G,H, i)    \
00295         L04(0,1,2,3, a,b,c,d, i)        L04(4,5,6,7, e,f,g,h, i)        L04(8,9,10,11, A,B,C,D, i)      L04(12,13,14,15, E,F,G,H, i)    \
00296         L05(0,1,2,3, a,b,c,d, i)        L05(4,5,6,7, e,f,g,h, i)        L05(8,9,10,11, A,B,C,D, i)      L05(12,13,14,15, E,F,G,H, i)    \
00297         L06(0,1,2,3, a,b,c,d, i)        L06(4,5,6,7, e,f,g,h, i)        L06(8,9,10,11, A,B,C,D, i)      L06(12,13,14,15, E,F,G,H, i)    \
00298         L07(0,1,2,3, a,b,c,d, i)        L07(4,5,6,7, e,f,g,h, i)        L07(8,9,10,11, A,B,C,D, i)      L07(12,13,14,15, E,F,G,H, i)    \
00299         L08(0,1,2,3, a,b,c,d, i)        L08(4,5,6,7, e,f,g,h, i)        L08(8,9,10,11, A,B,C,D, i)      L08(12,13,14,15, E,F,G,H, i)    \
00300         L09(0,1,2,3, a,b,c,d, i)        L09(4,5,6,7, e,f,g,h, i)        L09(8,9,10,11, A,B,C,D, i)      L09(12,13,14,15, E,F,G,H, i)    \
00301         L10(0,1,2,3, a,b,c,d, i)        L10(4,5,6,7, e,f,g,h, i)        L10(8,9,10,11, A,B,C,D, i)      L10(12,13,14,15, E,F,G,H, i)    \
00302         L11(0,1,2,3, a,b,c,d, i)        L11(4,5,6,7, e,f,g,h, i)        L11(8,9,10,11, A,B,C,D, i)      L11(12,13,14,15, E,F,G,H, i)    \
00303         L12(0,1,2,3, a,b,c,d, i)        L12(4,5,6,7, e,f,g,h, i)        L12(8,9,10,11, A,B,C,D, i)      L12(12,13,14,15, E,F,G,H, i)    \
00304         L13(0,1,2,3, a,b,c,d, i)        L13(4,5,6,7, e,f,g,h, i)        L13(8,9,10,11, A,B,C,D, i)      L13(12,13,14,15, E,F,G,H, i)    \
00305         L14(0,1,2,3, a,b,c,d, i)        L14(4,5,6,7, e,f,g,h, i)        L14(8,9,10,11, A,B,C,D, i)      L14(12,13,14,15, E,F,G,H, i)    \
00306         L15(0,1,2,3, a,b,c,d, i)        L15(4,5,6,7, e,f,g,h, i)        L15(8,9,10,11, A,B,C,D, i)      L15(12,13,14,15, E,F,G,H, i)    \
00307         L16(0,1,2,3, a,b,c,d, i)        L16(4,5,6,7, e,f,g,h, i)        L16(8,9,10,11, A,B,C,D, i)      L16(12,13,14,15, E,F,G,H, i)    \
00308         L17(0,1,2,3, a,b,c,d, i)        L17(4,5,6,7, e,f,g,h, i)        L17(8,9,10,11, A,B,C,D, i)      L17(12,13,14,15, E,F,G,H, i)    \
00309         L18(0,1,2,3, a,b,c,d, i)        L18(4,5,6,7, e,f,g,h, i)        L18(8,9,10,11, A,B,C,D, i)      L18(12,13,14,15, E,F,G,H, i)    \
00310         L19(0,1,2,3, a,b,c,d, i)        L19(4,5,6,7, e,f,g,h, i)        L19(8,9,10,11, A,B,C,D, i)      L19(12,13,14,15, E,F,G,H, i)    \
00311         L20(0,1,2,3, a,b,c,d, i)        L20(4,5,6,7, e,f,g,h, i)        L20(8,9,10,11, A,B,C,D, i)      L20(12,13,14,15, E,F,G,H, i)    \
00312         L21(0,1,2,3, a,b,c,d, i)        L21(4,5,6,7, e,f,g,h, i)        L21(8,9,10,11, A,B,C,D, i)      L21(12,13,14,15, E,F,G,H, i)    \
00313         L22(0,1,2,3, a,b,c,d, i)        L22(4,5,6,7, e,f,g,h, i)        L22(8,9,10,11, A,B,C,D, i)      L22(12,13,14,15, E,F,G,H, i)    \
00314         L23(0,1,2,3, a,b,c,d, i)        L23(4,5,6,7, e,f,g,h, i)        L23(8,9,10,11, A,B,C,D, i)      L23(12,13,14,15, E,F,G,H, i)    \
00315         L24(0,1,2,3, a,b,c,d, i)        L24(4,5,6,7, e,f,g,h, i)        L24(8,9,10,11, A,B,C,D, i)      L24(12,13,14,15, E,F,G,H, i)    \
00316         L25(0,1,2,3, a,b,c,d, i)        L25(4,5,6,7, e,f,g,h, i)        L25(8,9,10,11, A,B,C,D, i)      L25(12,13,14,15, E,F,G,H, i)    \
00317         L26(0,1,2,3, a,b,c,d, i)        L26(4,5,6,7, e,f,g,h, i)        L26(8,9,10,11, A,B,C,D, i)      L26(12,13,14,15, E,F,G,H, i)    \
00318         L27(0,1,2,3, a,b,c,d, i)        L27(4,5,6,7, e,f,g,h, i)        L27(8,9,10,11, A,B,C,D, i)      L27(12,13,14,15, E,F,G,H, i)    \
00319         L28(0,1,2,3, a,b,c,d, i)        L28(4,5,6,7, e,f,g,h, i)        L28(8,9,10,11, A,B,C,D, i)      L28(12,13,14,15, E,F,G,H, i)    \
00320         L29(0,1,2,3, a,b,c,d, i)        L29(4,5,6,7, e,f,g,h, i)        L29(8,9,10,11, A,B,C,D, i)      L29(12,13,14,15, E,F,G,H, i)    \
00321         L30(0,1,2,3, a,b,c,d, i)        L30(4,5,6,7, e,f,g,h, i)        L30(8,9,10,11, A,B,C,D, i)      L30(12,13,14,15, E,F,G,H, i)    \
00322         L31(0,1,2,3, a,b,c,d, i)        L31(4,5,6,7, e,f,g,h, i)        L31(8,9,10,11, A,B,C,D, i)      L31(12,13,14,15, E,F,G,H, i)    \
00323         L32(0,1,2,3, a,b,c,d, i)        L32(4,5,6,7, e,f,g,h, i)        L32(8,9,10,11, A,B,C,D, i)      L32(12,13,14,15, E,F,G,H, i)
00324 
00325 #if CRYPTOPP_BOOL_X64
00326                 SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00327 #else
00328                 SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
00329                 SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
00330 #endif
00331                 AS2(    mov             REG_roundsLeft, REG_rounds)
00332                 ASJ(    jmp,    2, f)
00333 
00334                 ASL(SSE2_Salsa_Output)
00335                 AS2(    movdqa          xmm0, xmm4)
00336                 AS2(    punpckldq       xmm4, xmm5)
00337                 AS2(    movdqa          xmm1, xmm6)
00338                 AS2(    punpckldq       xmm6, xmm7)
00339                 AS2(    movdqa          xmm2, xmm4)
00340                 AS2(    punpcklqdq      xmm4, xmm6)     // e
00341                 AS2(    punpckhqdq      xmm2, xmm6)     // f
00342                 AS2(    punpckhdq       xmm0, xmm5)
00343                 AS2(    punpckhdq       xmm1, xmm7)
00344                 AS2(    movdqa          xmm6, xmm0)
00345                 AS2(    punpcklqdq      xmm0, xmm1)     // g
00346                 AS2(    punpckhqdq      xmm6, xmm1)     // h
00347                 AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
00348                 AS1(    ret)
00349 
00350                 ASL(6)
00351 #if CRYPTOPP_BOOL_X64
00352                 SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00353                 ASL(2)
00354                 SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
00355 #else
00356                 SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
00357                 SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
00358                 ASL(2)
00359                 SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
00360                 SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
00361 #endif
00362                 AS2(    sub             REG_roundsLeft, 2)
00363                 ASJ(    jnz,    6, b)
00364 
00365 #define SSE2_OUTPUT_4(a, b, c, d)       \
00366         AS2(    movdqa          xmm4, [SSE2_WORKSPACE + a*16 + 256])\
00367         AS2(    paddd           xmm4, [SSE2_WORKSPACE + a*16])\
00368         AS2(    movdqa          xmm5, [SSE2_WORKSPACE + b*16 + 256])\
00369         AS2(    paddd           xmm5, [SSE2_WORKSPACE + b*16])\
00370         AS2(    movdqa          xmm6, [SSE2_WORKSPACE + c*16 + 256])\
00371         AS2(    paddd           xmm6, [SSE2_WORKSPACE + c*16])\
00372         AS2(    movdqa          xmm7, [SSE2_WORKSPACE + d*16 + 256])\
00373         AS2(    paddd           xmm7, [SSE2_WORKSPACE + d*16])\
00374         ASC(    call,           SSE2_Salsa_Output)
00375 
00376                 SSE2_OUTPUT_4(0, 13, 10, 7)
00377                 SSE2_OUTPUT_4(4, 1, 14, 11)
00378                 SSE2_OUTPUT_4(8, 5, 2, 15)
00379                 SSE2_OUTPUT_4(12, 9, 6, 3)
00380                 AS2(    test    REG_input, REG_input)
00381                 ASJ(    jz,             9, f)
00382                 AS2(    add             REG_input, 12*16)
00383                 ASL(9)
00384                 AS2(    add             REG_output, 12*16)
00385                 AS2(    sub             REG_iterationCount, 4)
00386                 AS2(    cmp             REG_iterationCount, 4)
00387                 ASJ(    jge,    1, b)
00388                 AS_POP_IF86(    sp)
00389 
00390                 ASL(5)
00391                 AS2(    sub             REG_iterationCount, 1)
00392                 ASJ(    jl,             4, f)
00393                 AS2(    movdqa  xmm0, [REG_state + 0*16])
00394                 AS2(    movdqa  xmm1, [REG_state + 1*16])
00395                 AS2(    movdqa  xmm2, [REG_state + 2*16])
00396                 AS2(    movdqa  xmm3, [REG_state + 3*16])
00397                 AS2(    mov             REG_roundsLeft, REG_rounds)
00398 
00399                 ASL(0)
00400                 SSE2_QUARTER_ROUND(0, 1, 3, 7)
00401                 SSE2_QUARTER_ROUND(1, 2, 0, 9)
00402                 SSE2_QUARTER_ROUND(2, 3, 1, 13)
00403                 SSE2_QUARTER_ROUND(3, 0, 2, 18)
00404                 ASS(    pshufd  xmm1, xmm1, 2, 1, 0, 3)
00405                 ASS(    pshufd  xmm2, xmm2, 1, 0, 3, 2)
00406                 ASS(    pshufd  xmm3, xmm3, 0, 3, 2, 1)
00407                 SSE2_QUARTER_ROUND(0, 3, 1, 7)
00408                 SSE2_QUARTER_ROUND(3, 2, 0, 9)
00409                 SSE2_QUARTER_ROUND(2, 1, 3, 13)
00410                 SSE2_QUARTER_ROUND(1, 0, 2, 18)
00411                 ASS(    pshufd  xmm1, xmm1, 0, 3, 2, 1)
00412                 ASS(    pshufd  xmm2, xmm2, 1, 0, 3, 2)
00413                 ASS(    pshufd  xmm3, xmm3, 2, 1, 0, 3)
00414                 AS2(    sub             REG_roundsLeft, 2)
00415                 ASJ(    jnz,    0, b)
00416 
00417                 AS2(    paddd   xmm0, [REG_state + 0*16])
00418                 AS2(    paddd   xmm1, [REG_state + 1*16])
00419                 AS2(    paddd   xmm2, [REG_state + 2*16])
00420                 AS2(    paddd   xmm3, [REG_state + 3*16])
00421 
00422                 AS2(    add             dword ptr [REG_state + 8*4], 1)
00423                 AS2(    adc             dword ptr [REG_state + 5*4], 0)
00424 
00425                 AS2(    pcmpeqb xmm6, xmm6)                     // all ones
00426                 AS2(    psrlq   xmm6, 32)                       // lo32 mask
00427                 ASS(    pshufd  xmm7, xmm6, 0, 1, 2, 3)         // hi32 mask
00428                 AS2(    movdqa  xmm4, xmm0)
00429                 AS2(    movdqa  xmm5, xmm3)
00430                 AS2(    pand    xmm0, xmm7)
00431                 AS2(    pand    xmm4, xmm6)
00432                 AS2(    pand    xmm3, xmm6)
00433                 AS2(    pand    xmm5, xmm7)
00434                 AS2(    por             xmm4, xmm5)                     // 0,13,2,15
00435                 AS2(    movdqa  xmm5, xmm1)
00436                 AS2(    pand    xmm1, xmm7)
00437                 AS2(    pand    xmm5, xmm6)
00438                 AS2(    por             xmm0, xmm5)                     // 4,1,6,3
00439                 AS2(    pand    xmm6, xmm2)
00440                 AS2(    pand    xmm2, xmm7)
00441                 AS2(    por             xmm1, xmm6)                     // 8,5,10,7
00442                 AS2(    por             xmm2, xmm3)                     // 12,9,14,11
00443 
00444                 AS2(    movdqa  xmm5, xmm4)
00445                 AS2(    movdqa  xmm6, xmm0)
00446                 AS3(    shufpd  xmm4, xmm1, 2)          // 0,13,10,7
00447                 AS3(    shufpd  xmm0, xmm2, 2)          // 4,1,14,11
00448                 AS3(    shufpd  xmm1, xmm5, 2)          // 8,5,2,15
00449                 AS3(    shufpd  xmm2, xmm6, 2)          // 12,9,6,3
00450 
00451                 // output keystream
00452                 AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
00453                 ASJ(    jmp,    5, b)
00454                 ASL(4)
00455 
00456                 AS_POP_IF86(    bp)
00457 #ifdef __GNUC__
00458                 AS_POP_IF86(    bx)
00459                 ".att_syntax prefix;"
00460                         : 
00461         #if CRYPTOPP_BOOL_X64
00462                         : "r" (m_rounds), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace.m_ptr)
00463                         : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
00464         #else
00465                         : "d" (m_rounds), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
00466                         : "memory", "cc"
00467         #endif
00468                 );
00469 #endif
00470 #ifdef CRYPTOPP_GENERATE_X64_MASM
00471         movdqa  xmm6, [rsp + 0200h]
00472         movdqa  xmm7, [rsp + 0210h]
00473         movdqa  xmm8, [rsp + 0220h]
00474         movdqa  xmm9, [rsp + 0230h]
00475         movdqa  xmm10, [rsp + 0240h]
00476         movdqa  xmm11, [rsp + 0250h]
00477         movdqa  xmm12, [rsp + 0260h]
00478         movdqa  xmm13, [rsp + 0270h]
00479         movdqa  xmm14, [rsp + 0280h]
00480         movdqa  xmm15, [rsp + 0290h]
00481         add             rsp, 10*16 + 32*16 + 8
00482         ret
00483 Salsa20_OperateKeystream ENDP
00484 #else
00485         }
00486         else
00487 #endif
00488 #endif
00489 #ifndef CRYPTOPP_GENERATE_X64_MASM
00490         {
00491                 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00492 
00493                 while (iterationCount--)
00494                 {
00495                         x0 = m_state[0];        x1 = m_state[1];        x2 = m_state[2];        x3 = m_state[3];
00496                         x4 = m_state[4];        x5 = m_state[5];        x6 = m_state[6];        x7 = m_state[7];
00497                         x8 = m_state[8];        x9 = m_state[9];        x10 = m_state[10];      x11 = m_state[11];
00498                         x12 = m_state[12];      x13 = m_state[13];      x14 = m_state[14];      x15 = m_state[15];
00499 
00500                         for (int i=m_rounds; i>0; i-=2)
00501                         {
00502                                 #define QUARTER_ROUND(a, b, c, d)       \
00503                                         b = b ^ rotlFixed(a + d, 7);    \
00504                                         c = c ^ rotlFixed(b + a, 9);    \
00505                                         d = d ^ rotlFixed(c + b, 13);   \
00506                                         a = a ^ rotlFixed(d + c, 18);
00507 
00508                                 QUARTER_ROUND(x0, x4, x8, x12)
00509                                 QUARTER_ROUND(x1, x5, x9, x13)
00510                                 QUARTER_ROUND(x2, x6, x10, x14)
00511                                 QUARTER_ROUND(x3, x7, x11, x15)
00512 
00513                                 QUARTER_ROUND(x0, x13, x10, x7)
00514                                 QUARTER_ROUND(x1, x14, x11, x4)
00515                                 QUARTER_ROUND(x2, x15, x8, x5)
00516                                 QUARTER_ROUND(x3, x12, x9, x6)
00517                         }
00518 
00519                         #define SALSA_OUTPUT(x) {\
00520                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
00521                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
00522                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
00523                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
00524                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
00525                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
00526                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
00527                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
00528                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
00529                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
00530                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
00531                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
00532                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
00533                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
00534                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
00535                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
00536 
00537 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
00538                         CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
00539 #endif
00540 
00541                         if (++m_state[8] == 0)
00542                                 ++m_state[5];
00543                 }
00544         }
00545 }       // see comment above if an internal compiler error occurs here
00546 
00547 void XSalsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
00548 {
00549         m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00550 
00551         if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00552                 throw InvalidRounds(XSalsa20::StaticAlgorithmName(), m_rounds);
00553 
00554         GetUserKey(LITTLE_ENDIAN_ORDER, m_key.begin(), m_key.size(), key, length);
00555         if (length == 16)
00556                 memcpy(m_key.begin()+4, m_key.begin(), 16);
00557 
00558         // "expand 32-byte k"
00559         m_state[0] = 0x61707865;
00560         m_state[1] = 0x3320646e;
00561         m_state[2] = 0x79622d32;
00562         m_state[3] = 0x6b206574;
00563 }
00564 
00565 void XSalsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
00566 {
00567         assert(length==24);
00568 
00569         word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00570 
00571         GetBlock<word32, LittleEndian> get(IV);
00572         get(x14)(x11)(x8)(x5)(m_state[14])(m_state[11]);
00573 
00574         x13 = m_key[0];         x10 = m_key[1];         x7 = m_key[2];          x4 = m_key[3];
00575         x15 = m_key[4];         x12 = m_key[5];         x9 = m_key[6];          x6 = m_key[7];
00576         x0 = m_state[0];        x1 = m_state[1];        x2 = m_state[2];        x3 = m_state[3];
00577 
00578         for (int i=m_rounds; i>0; i-=2)
00579         {
00580                 QUARTER_ROUND(x0, x4, x8, x12)
00581                 QUARTER_ROUND(x1, x5, x9, x13)
00582                 QUARTER_ROUND(x2, x6, x10, x14)
00583                 QUARTER_ROUND(x3, x7, x11, x15)
00584 
00585                 QUARTER_ROUND(x0, x13, x10, x7)
00586                 QUARTER_ROUND(x1, x14, x11, x4)
00587                 QUARTER_ROUND(x2, x15, x8, x5)
00588                 QUARTER_ROUND(x3, x12, x9, x6)
00589         }
00590 
00591         m_state[13] = x0;       m_state[10] = x1;       m_state[7] = x2;        m_state[4] = x3;
00592         m_state[15] = x14;      m_state[12] = x11;      m_state[9] = x8;        m_state[6] = x5;
00593         m_state[8] = m_state[5] = 0;
00594 }
00595 
00596 NAMESPACE_END
00597 
00598 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

Generated on Mon Aug 9 2010 15:56:36 for Crypto++ by  doxygen 1.7.1