salsa.cpp

00001 // salsa.cpp - written and placed in the public domain by Wei Dai
00002 
00003 #include "pch.h"
00004 #include "salsa.h"
00005 #include "misc.h"
00006 #include "argnames.h"
00007 #include "cpu.h"
00008 
00009 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
00010 #include <emmintrin.h>
00011 #endif
00012 
00013 NAMESPACE_BEGIN(CryptoPP)
00014 
00015 void Salsa20_TestInstantiations()
00016 {
00017         Salsa20::Encryption x;
00018 }
00019 
00020 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
00021 {
00022         m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00023 
00024         if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
00025                 throw InvalidRounds(StaticAlgorithmName(), m_rounds);
00026 
00027         // m_state is reordered for SSE2
00028         GetBlock<word32, LittleEndian, false> get1(key);
00029         get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
00030         GetBlock<word32, LittleEndian, false> get2(key + length - 16);
00031         get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
00032 
00033         // "expand 16-byte k" or "expand 32-byte k"
00034         m_state[0] = 0x61707865;
00035         m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
00036         m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
00037         m_state[3] = 0x6b206574;
00038 }
00039 
00040 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
00041 {
00042         GetBlock<word32, LittleEndian, false> get(IV);
00043         get(m_state[14])(m_state[11]);
00044         m_state[8] = m_state[5] = 0;
00045 }
00046 
00047 void Salsa20_Policy::SeekToIteration(lword iterationCount)
00048 {
00049         m_state[8] = (word32)iterationCount;
00050         m_state[5] = (word32)SafeRightShift<32>(iterationCount);
00051 }
00052 
00053 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
00054 unsigned int Salsa20_Policy::GetAlignment() const
00055 {
00056 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
00057         if (HasSSE2())
00058                 return 16;
00059         else
00060 #endif
00061                 return 1;
00062 }
00063 
00064 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
00065 {
00066 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
00067         if (HasSSE2())
00068                 return 4*BYTES_PER_ITERATION;
00069         else
00070 #endif
00071                 return BYTES_PER_ITERATION;
00072 }
00073 #endif
00074 
00075 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
// Selects the low 32 bits of each 64-bit half: lanes {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}.
static const __m128i s_maskLo32 = _mm_set_epi32(0, -1, 0, -1);
// Selects the high 32 bits of each 64-bit half: lanes {0, 0xFFFFFFFF, 0, 0xFFFFFFFF}.
static const __m128i s_maskHi32 = _mm_set_epi32(-1, 0, -1, 0);
00078 #endif
00079 
// Produces iterationCount blocks of Salsa20 keystream. Every finished block
// is handed to the CRYPTOPP_KEYSTREAM_OUTPUT_* macros, which are defined
// outside this file and presumably combine it with `input` and/or write it to
// `output` as selected by `operation`.
// Side effect: the block counter kept in m_state[8] (low word) and m_state[5]
// (high word) is advanced by one for every block generated.
// Three code paths follow: an SSE2 path that computes four blocks at a time,
// an SSE2 path that computes one block at a time (skipped when IsP4() is
// true), and a portable scalar loop that handles whatever is still left.
00080 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00081 {
00082         int i;
00083 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
              // b ^= rotate_left(a + d, i) in every 32-bit lane. The rotate is built
              // from a left shift and a right shift that are XORed into b separately.
00084         #define SSE2_QUARTER_ROUND(a, b, d, i)                          {\
00085                 __m128i t = _mm_add_epi32(a, d);                                \
00086                 b = _mm_xor_si128(b, _mm_slli_epi32(t, i));             \
00087                 b = _mm_xor_si128(b, _mm_srli_epi32(t, 32-i));}
00088
00089         if (HasSSE2())
00090         {
                      // View the 16-word state as four 128-bit vectors.
00091                 __m128i *s = (__m128i *)m_state.data();
00092
00093 #if _MSC_VER > 1400 || (defined(_MSC_VER) && CRYPTOPP_BOOL_X86) || (CRYPTOPP_GCC_VERSION >= 40000 && CRYPTOPP_BOOL_X86)
00094                 // This code triggers an internal compiler error on MSVC 2005 when compiling 
00095                 // for x64 with optimizations on. hopefully it will get fixed in the next release.
00096                 // A bug report has been submitted at http://connect.microsoft.com/VisualStudio/feedback/ViewFeedback.aspx?FeedbackID=274123
00097                 // Also, GCC 3.4.4 generates incorrect code for x86 at -O2.
00098                 // GCC 4.1.1 generates incorrect code for x64 at -O2
                      // Four-blocks-at-a-time path; only entered when at least four iterations remain.
00099                 if (iterationCount >= 4)
00100                 {
                              // ss[k] holds state word k broadcast to all four lanes, so lane j of
                              // every register belongs to block j of the batch. ss[5] and ss[8]
                              // (the counter words) are not set here; they are filled per batch below.
00101                         __m128i ss[16];
00102                         ss[0] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(0, 0, 0, 0));
00103                         ss[1] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(1, 1, 1, 1));
00104                         ss[2] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(2, 2, 2, 2));
00105                         ss[3] = _mm_shuffle_epi32(s[0], _MM_SHUFFLE(3, 3, 3, 3));
00106                         ss[4] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(0, 0, 0, 0));
00107                         ss[6] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(2, 2, 2, 2));
00108                         ss[7] = _mm_shuffle_epi32(s[1], _MM_SHUFFLE(3, 3, 3, 3));
00109                         ss[9] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(1, 1, 1, 1));
00110                         ss[10] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(2, 2, 2, 2));
00111                         ss[11] = _mm_shuffle_epi32(s[2], _MM_SHUFFLE(3, 3, 3, 3));
00112                         ss[12] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(0, 0, 0, 0));
00113                         ss[13] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(1, 1, 1, 1));
00114                         ss[14] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(2, 2, 2, 2));
00115                         ss[15] = _mm_shuffle_epi32(s[3], _MM_SHUFFLE(3, 3, 3, 3));
00116
00117                         do
00118                         {
                                      // Give each lane its own counter value (four consecutive counters)
                                      // and advance the stored counter past them, carrying into the
                                      // high word when the low word wraps to zero.
00119                                 word32 *countersLo = (word32*)&(ss[8]), *countersHi = (word32*)&(ss[5]);
00120                                 for (i=0; i<4; i++)
00121                                 {
00122                                         countersLo[i] = m_state[8];
00123                                         countersHi[i] = m_state[5];
00124                                         if (++m_state[8] == 0)
00125                                                 ++m_state[5];
00126                                 }
00127
                                      // Working copies; ss[] is kept intact for the final addition.
00128                                 __m128i x0 = ss[0];
00129                                 __m128i x1 = ss[1];
00130                                 __m128i x2 = ss[2];
00131                                 __m128i x3 = ss[3];
00132                                 __m128i x4 = ss[4];
00133                                 __m128i x5 = ss[5];
00134                                 __m128i x6 = ss[6];
00135                                 __m128i x7 = ss[7];
00136                                 __m128i x8 = ss[8];
00137                                 __m128i x9 = ss[9];
00138                                 __m128i x10 = ss[10];
00139                                 __m128i x11 = ss[11];
00140                                 __m128i x12 = ss[12];
00141                                 __m128i x13 = ss[13];
00142                                 __m128i x14 = ss[14];
00143                                 __m128i x15 = ss[15];
00144
                                      // Each pass accounts for two of the m_rounds rounds (eight quarter rounds).
00145                                 for (i=m_rounds; i>0; i-=2)
00146                                 {
00147                                         #define QUARTER_ROUND(a, b, c, d)       \
00148                                                 SSE2_QUARTER_ROUND(a, b, d, 7)  \
00149                                                 SSE2_QUARTER_ROUND(b, c, a, 9)  \
00150                                                 SSE2_QUARTER_ROUND(c, d, b, 13) \
00151                                                 SSE2_QUARTER_ROUND(d, a, c, 18) 
00152
00153                                         QUARTER_ROUND(x0, x4, x8, x12)
00154                                         QUARTER_ROUND(x1, x5, x9, x13)
00155                                         QUARTER_ROUND(x2, x6, x10, x14)
00156                                         QUARTER_ROUND(x3, x7, x11, x15)
00157
00158                                         QUARTER_ROUND(x0, x13, x10, x7)
00159                                         QUARTER_ROUND(x1, x14, x11, x4)
00160                                         QUARTER_ROUND(x2, x15, x8, x5)
00161                                         QUARTER_ROUND(x3, x12, x9, x6)
00162
00163                                         #undef QUARTER_ROUND
00164                                 }
00165
                                      // Add the input state (including the per-lane counters) back in.
00166                                 x0 = _mm_add_epi32(x0, ss[0]);
00167                                 x1 = _mm_add_epi32(x1, ss[1]);
00168                                 x2 = _mm_add_epi32(x2, ss[2]);
00169                                 x3 = _mm_add_epi32(x3, ss[3]);
00170                                 x4 = _mm_add_epi32(x4, ss[4]);
00171                                 x5 = _mm_add_epi32(x5, ss[5]);
00172                                 x6 = _mm_add_epi32(x6, ss[6]);
00173                                 x7 = _mm_add_epi32(x7, ss[7]);
00174                                 x8 = _mm_add_epi32(x8, ss[8]);
00175                                 x9 = _mm_add_epi32(x9, ss[9]);
00176                                 x10 = _mm_add_epi32(x10, ss[10]);
00177                                 x11 = _mm_add_epi32(x11, ss[11]);
00178                                 x12 = _mm_add_epi32(x12, ss[12]);
00179                                 x13 = _mm_add_epi32(x13, ss[13]);
00180                                 x14 = _mm_add_epi32(x14, ss[14]);
00181                                 x15 = _mm_add_epi32(x15, ss[15]);
00182
                                      // Transposes registers a, b, c, d (one state word each, one block per
                                      // lane): lane j of the four registers is gathered into one 128-bit
                                      // value, and lanes 0..3 are emitted with indices e, f, g, h.
00183                                 #define OUTPUT_4(x, a, b, c, d, e, f, g, h)     {\
00184                                         __m128i t0 = _mm_unpacklo_epi32(a, b);\
00185                                         __m128i t1 = _mm_unpacklo_epi32(c, d);\
00186                                         __m128i t2 = _mm_unpacklo_epi64(t0, t1);\
00187                                         CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, e, t2)\
00188                                         t2 = _mm_unpackhi_epi64(t0, t1);\
00189                                         CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, f, t2)\
00190                                         t0 = _mm_unpackhi_epi32(a, b);\
00191                                         t1 = _mm_unpackhi_epi32(c, d);\
00192                                         t2 = _mm_unpacklo_epi64(t0, t1);\
00193                                         CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, g, t2)\
00194                                         t2 = _mm_unpackhi_epi64(t0, t1);\
00195                                         CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, h, t2)}
00196
                                      // The register order x0, x13, x10, x7, x4, ... is the same word order
                                      // the scalar SALSA_OUTPUT further down uses. Indices 4j..4j+3 receive
                                      // the data of lane j (presumably 16-byte slots of block j - confirm
                                      // against the macro definition).
00197                                 #define SALSA_OUTPUT(x)         \
00198                                         OUTPUT_4(x, x0, x13, x10, x7, 0, 4, 8, 12)\
00199                                         OUTPUT_4(x, x4, x1, x14, x11, 1, 5, 9, 13)\
00200                                         OUTPUT_4(x, x8, x5, x2, x15, 2, 6, 10, 14)\
00201                                         OUTPUT_4(x, x12, x9, x6, x3, 3, 7, 11, 15)
00202
00203                                 CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, 4*BYTES_PER_ITERATION)
00204
00205                                 #undef SALSA_OUTPUT
                              // Four iterations consumed; go again while a full batch of four remains.
00206                         } while ((iterationCount-=4) >= 4);
00207                 }
00208 #endif
00209
                      // One block per pass with SSE2. When IsP4() is true this loop is skipped
                      // and the remaining iterations are left to the scalar loop below.
00210                 if (!IsP4()) while (iterationCount)
00211                 {
00212                         --iterationCount;
00213                         __m128i x0 = s[0];
00214                         __m128i x1 = s[1];
00215                         __m128i x2 = s[2];
00216                         __m128i x3 = s[3];
00217
00218                         for (i=m_rounds; i>0; i-=2)
00219                         {
00220                                 SSE2_QUARTER_ROUND(x0, x1, x3, 7)
00221                                 SSE2_QUARTER_ROUND(x1, x2, x0, 9)
00222                                 SSE2_QUARTER_ROUND(x2, x3, x1, 13)
00223                                 SSE2_QUARTER_ROUND(x3, x0, x2, 18)
00224
                                      // Rotate the lanes of x1..x3 so the next four quarter-round steps
                                      // act on a different grouping of words.
00225                                 x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2, 1, 0, 3));
00226                                 x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
00227                                 x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(0, 3, 2, 1));
00228
00229                                 SSE2_QUARTER_ROUND(x0, x3, x1, 7)
00230                                 SSE2_QUARTER_ROUND(x3, x2, x0, 9)
00231                                 SSE2_QUARTER_ROUND(x2, x1, x3, 13)
00232                                 SSE2_QUARTER_ROUND(x1, x0, x2, 18)
00233
                                      // Undo the lane rotation applied above.
00234                                 x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0, 3, 2, 1));
00235                                 x2 = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1, 0, 3, 2));
00236                                 x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(2, 1, 0, 3));
00237                         }
00238
                              // Add the input state back in.
00239                         x0 = _mm_add_epi32(x0, s[0]);
00240                         x1 = _mm_add_epi32(x1, s[1]);
00241                         x2 = _mm_add_epi32(x2, s[2]);
00242                         x3 = _mm_add_epi32(x3, s[3]);
00243
                              // The additions above have already read the old counter through s[],
                              // so the stored counter can be advanced (with carry) now.
00244                         if (++m_state[8] == 0)
00245                                 ++m_state[5];
00246
                              // Regroup the words of x0..x3 into k0..k3, the four 16-byte pieces
                              // handed to the output macro.
                              // NOTE(review): presumably yields the same output word order as the
                              // scalar path below - confirm.
00247                         __m128i k02 = _mm_or_si128(_mm_slli_epi64(x0, 32), _mm_srli_epi64(x3, 32));
00248                         k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
00249                         __m128i k13 = _mm_or_si128(_mm_slli_epi64(x1, 32), _mm_srli_epi64(x0, 32));
00250                         k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
00251                         __m128i k20 = _mm_or_si128(_mm_and_si128(x2, s_maskLo32), _mm_and_si128(x1, s_maskHi32));
00252                         __m128i k31 = _mm_or_si128(_mm_and_si128(x3, s_maskLo32), _mm_and_si128(x2, s_maskHi32));
00253
00254                         __m128i k0 = _mm_unpackhi_epi64(k02, k20);
00255                         __m128i k1 = _mm_unpackhi_epi64(k13, k31);
00256                         __m128i k2 = _mm_unpacklo_epi64(k20, k02);
00257                         __m128i k3 = _mm_unpacklo_epi64(k31, k13);
00258
00259                         #define SSE2_OUTPUT(x)  {\
00260                                 CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 0, k0)\
00261                                 CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 1, k1)\
00262                                 CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 2, k2)\
00263                                 CRYPTOPP_KEYSTREAM_OUTPUT_XMM(x, 3, k3)}
00264
00265                         CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SSE2_OUTPUT, BYTES_PER_ITERATION);
00266                 }
00267         }
00268 #endif
00269
              // Portable scalar path. Also finishes any iterations the SSE2 code above
              // did not consume (iterationCount is already zero if it consumed them all).
00270         word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00271
00272         while (iterationCount--)
00273         {
00274                 x0 = m_state[0];
00275                 x1 = m_state[1];
00276                 x2 = m_state[2];
00277                 x3 = m_state[3];
00278                 x4 = m_state[4];
00279                 x5 = m_state[5];
00280                 x6 = m_state[6];
00281                 x7 = m_state[7];
00282                 x8 = m_state[8];
00283                 x9 = m_state[9];
00284                 x10 = m_state[10];
00285                 x11 = m_state[11];
00286                 x12 = m_state[12];
00287                 x13 = m_state[13];
00288                 x14 = m_state[14];
00289                 x15 = m_state[15];
00290
                      // Each pass accounts for two of the m_rounds rounds.
00291                 for (i=m_rounds; i>0; i-=2)
00292                 {
                              // Scalar quarter round on words a, b, c, d (rotations by 7, 9, 13, 18).
00293                         #define QUARTER_ROUND(a, b, c, d)       \
00294                                 b = b ^ rotlFixed(a + d, 7);    \
00295                                 c = c ^ rotlFixed(b + a, 9);    \
00296                                 d = d ^ rotlFixed(c + b, 13);   \
00297                                 a = a ^ rotlFixed(d + c, 18);
00298
00299                         QUARTER_ROUND(x0, x4, x8, x12)
00300                         QUARTER_ROUND(x1, x5, x9, x13)
00301                         QUARTER_ROUND(x2, x6, x10, x14)
00302                         QUARTER_ROUND(x3, x7, x11, x15)
00303
00304                         QUARTER_ROUND(x0, x13, x10, x7)
00305                         QUARTER_ROUND(x1, x14, x11, x4)
00306                         QUARTER_ROUND(x2, x15, x8, x5)
00307                         QUARTER_ROUND(x3, x12, x9, x6)
00308                 }
00309
                      // Emit the 16 output words (worked state plus input state) in little-endian
                      // order. The x index sequence 0, 13, 10, 7, ... follows from m_state being
                      // stored in an SSE2-friendly order rather than in output order.
00310                 #define SALSA_OUTPUT(x) {\
00311                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
00312                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
00313                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
00314                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
00315                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
00316                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
00317                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
00318                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
00319                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
00320                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
00321                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
00322                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
00323                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
00324                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
00325                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
00326                         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
00327
00328 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
00329                 CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
00330 #endif
00331
                      // Advance the 64-bit block counter, carrying into the high word on wrap.
00332                 if (++m_state[8] == 0)
00333                         ++m_state[5];
00334         }
00335 }       // see comment above if an internal compiler error occurs here
00336 
00337 NAMESPACE_END

Generated on Fri Jun 1 11:11:24 2007 for Crypto++ by  doxygen 1.5.2