panama.cpp

00001 // panama.cpp - written and placed in the public domain by Wei Dai
00002 
00003 #include "pch.h"
00004 #include "panama.h"
00005 #include "misc.h"
00006 #include "cpu.h"
00007 
00008 NAMESPACE_BEGIN(CryptoPP)
00009 
00010 template <class B>
00011 void Panama<B>::Reset()
00012 {
00013         memset(m_state, 0, m_state.SizeInBytes());
00014 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
00015         m_state[17] = HasSSSE3();
00016 #endif
00017 }
00018 
00019 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00020 
00021 #pragma warning(disable: 4731)  // frame pointer register 'ebp' modified by inline assembly code
00022 
      // Runs `count` iterations of the Panama round on `state` using SSE2 inline
      // assembly (with an SSSE3 palignr variant selected by bit 0 of state[17]).
      //
      // Register use, as set up by the mov list (non-GCC) or the constraint list (GCC):
      //   count -> cx   iteration count; scaled by 32 below, nothing is done when it is 0
      //   state -> si   words 0..15 are loaded into xmm0..xmm3, word 16 into eax,
      //                 word 17 into ebx (buffer offset), the buffer is addressed from byte 20*4.
      //                 movdqa is used on it throughout, so it is expected to be 16-byte aligned.
      //   z     -> di   keystream destination; output is skipped when NULL; aligned or unaligned
      //   y     -> dx   data XORed into the keystream; no XOR when NULL; aligned or unaligned
      //
      // NOTE(review): the advanced value of ebx is never stored back to state[17] in this
      // routine, whereas Panama<B>::Iterate writes bstart back. For count == 32 the masked
      // offset is unchanged; confirm whether other counts are expected to carry over.
00023 void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
00024 {
00025 #ifdef __GNUC__
00026         __asm__ __volatile__
00027         (
00028                 ".intel_syntax noprefix;"
              // bx is saved here and restored just before the asm statement ends
00029         AS_PUSH(                bx)
00030 #else
00031         AS2(    mov             WORD_REG(cx), count)
00032         AS2(    mov             WORD_REG(si), state)
00033         AS2(    mov             WORD_REG(di), z)
00034         AS2(    mov             WORD_REG(dx), y)
00035 #endif
              // cx = count*32; a zero count jumps straight to the exit label 5
00036         AS2(    shl             WORD_REG(cx), 5)
00037         ASJ(    jz,             5, f)
              // ebx = state[17]; cx becomes the value ebx will have after the last iteration
00038         AS2(    mov             ebx, [WORD_REG(si)+4*17])
00039         AS2(    add             WORD_REG(cx), WORD_REG(bx))
00040 
              // bp is used as a scratch register below; the loop-end value stays at [sp]
00041         AS_PUSH(                bp)
00042         AS_PUSH(                cx)
00043 
              // load state words 0..15 into xmm0..xmm3 and word 16 into eax
00044         AS2(    movdqa  xmm0, [WORD_REG(si)+0*16])
00045         AS2(    movdqa  xmm1, [WORD_REG(si)+1*16])
00046         AS2(    movdqa  xmm2, [WORD_REG(si)+2*16])
00047         AS2(    movdqa  xmm3, [WORD_REG(si)+3*16])
00048         AS2(    mov             eax, [WORD_REG(si)+4*16])
00049 
              // label 4: top of the per-iteration loop (target of "jne 4, b" below)
00050         ASL(4)
00051         // gamma and pi
00052 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
              // bit 0 of bx set: take the palignr variant at label 6
00053         AS2(    test    WORD_REG(bx), 1)
00054         ASJ(    jnz,    6, f)
00055 #endif
00056         AS2(    movdqa  xmm6, xmm2)
00057         AS2(    movss   xmm6, xmm3)
00058         ASS(    pshufd  xmm5, xmm6, 0, 3, 2, 1)
00059         AS2(    movd    xmm6, eax)
00060         AS2(    movdqa  xmm7, xmm3)
00061         AS2(    movss   xmm7, xmm6)
00062         ASS(    pshufd  xmm6, xmm7, 0, 3, 2, 1)
00063 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
00064         ASJ(    jmp,    7, f)
00065         ASL(6)
00066         AS2(    movdqa  xmm5, xmm3)
00067         AS3(    palignr xmm5, xmm2, 4)
00068         AS2(    movd    xmm6, eax)
00069         AS3(    palignr xmm6, xmm3, 4)
00070         ASL(7)
00071 #endif
00072 
              // eax ^= (~low dword of xmm2) | low dword of xmm3
00073         AS2(    movd    ecx, xmm2)
00074         AS1(    not             ecx)
00075         AS2(    movd    ebp, xmm3)
00076         AS2(    or              ecx, ebp)
00077         AS2(    xor             eax, ecx)
00078 
      // SSE2_Index uses the same (i*13+16) mod 17 mapping as the a(i)/c(i) macros in Panama<B>::Iterate
00079 #define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
00080 
      // pi(i): rotate the low dword of xmm7 left by the same amount formula as GP() in
      // Panama<B>::Iterate and store it to the state slot SSE2_Index(5*i mod 17)
00081 #define pi(i)   \
00082         AS2(    movd    ecx, xmm7)\
00083         AS2(    rol             ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
00084         AS2(    mov             [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
00085 
      // pi4: xmm7 = (~x | y) ^ z, then pi() is applied four times; each pi() reads the low
      // dword of xmm7, and the shuffles in between presumably bring the next dword there
00086 #define pi4(x, y, z, a, b, c, d)        \
00087         AS2(    pcmpeqb xmm7, xmm7)\
00088         AS2(    pxor    xmm7, x)\
00089         AS2(    por             xmm7, y)\
00090         AS2(    pxor    xmm7, z)\
00091         pi(a)\
00092         ASS(    pshuflw xmm7, xmm7, 1, 0, 3, 2)\
00093         pi(b)\
00094         AS2(    punpckhqdq      xmm7, xmm7)\
00095         pi(c)\
00096         ASS(    pshuflw xmm7, xmm7, 1, 0, 3, 2)\
00097         pi(d)
00098 
00099         pi4(xmm1, xmm2, xmm3, 1, 5, 9, 13)
00100         pi4(xmm0, xmm1, xmm2, 2, 6, 10, 14)
00101         pi4(xmm6, xmm0, xmm1, 3, 7, 11, 15)
00102         pi4(xmm5, xmm6, xmm0, 4, 8, 12, 16)
00103 
00104         // output keystream and update buffer here to hide partial memory stalls between pi and theta
00105         AS2(    movdqa  xmm4, xmm3)
00106         AS2(    punpcklqdq      xmm3, xmm2)             // 1 5 2 6
00107         AS2(    punpckhdq       xmm4, xmm2)             // 9 10 13 14
00108         AS2(    movdqa  xmm2, xmm1)
00109         AS2(    punpcklqdq      xmm1, xmm0)             // 3 7 4 8
00110         AS2(    punpckhdq       xmm2, xmm0)             // 11 12 15 16
00111 
00112         // keystream
              // z == NULL: skip the whole output step (label 0)
00113         AS2(    test    WORD_REG(di), WORD_REG(di))
00114         ASJ(    jz,             0, f)
00115         AS2(    movdqa  xmm6, xmm4)
00116         AS2(    punpcklqdq      xmm4, xmm2)
00117         AS2(    punpckhqdq      xmm6, xmm2)
              // y not 16-byte aligned: use the movdqu loads at label 2
00118         AS2(    test    WORD_REG(dx), 0xf)
00119         ASJ(    jnz,    2, f)
              // y == NULL: no XOR, go straight to the store at label 1
00120         AS2(    test    WORD_REG(dx), WORD_REG(dx))
00121         ASJ(    jz,             1, f)
00122         AS2(    pxor    xmm4, [WORD_REG(dx)])
00123         AS2(    pxor    xmm6, [WORD_REG(dx)+16])
00124         AS2(    add             WORD_REG(dx), 32)
00125         ASJ(    jmp,    1, f)
00126         ASL(2)
00127         AS2(    movdqu  xmm0, [WORD_REG(dx)])
00128         AS2(    movdqu  xmm2, [WORD_REG(dx)+16])
00129         AS2(    pxor    xmm4, xmm0)
00130         AS2(    pxor    xmm6, xmm2)
00131         AS2(    add             WORD_REG(dx), 32)
00132         ASL(1)
              // z not 16-byte aligned: use the movdqu stores at label 3
00133         AS2(    test    WORD_REG(di), 0xf)
00134         ASJ(    jnz,    3, f)
00135         AS2(    movdqa  [WORD_REG(di)], xmm4)
00136         AS2(    movdqa  [WORD_REG(di)+16], xmm6)
00137         AS2(    add             WORD_REG(di), 32)
00138         ASJ(    jmp,    0, f)
00139         ASL(3)
00140         AS2(    movdqu  [WORD_REG(di)], xmm4)
00141         AS2(    movdqu  [WORD_REG(di)+16], xmm6)
00142         AS2(    add             WORD_REG(di), 32)
00143         ASL(0)
00144 
00145         // buffer update
              // cx and bp are byte offsets into the buffer, wrapped with the 31*32 mask; they
              // match the offsets of b0 and b25 in Panama<B>::Iterate (bx+32 and bx+8*32)
00146         AS2(    lea             WORD_REG(cx), [WORD_REG(bx) + 32])
00147         AS2(    and             WORD_REG(cx), 31*32)
00148         AS2(    lea             WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
00149         AS2(    and             WORD_REG(bp), 31*32)
00150 
00151         AS2(    movdqa  xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
00152         AS2(    pxor    xmm3, xmm0)
00153         ASS(    pshufd  xmm0, xmm0, 2, 3, 0, 1)
00154         AS2(    movdqa  [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
00155         AS2(    pxor    xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
00156         AS2(    movdqa  [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
00157 
00158         AS2(    movdqa  xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
00159         AS2(    pxor    xmm1, xmm4)
00160         AS2(    movdqa  [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
00161         AS2(    pxor    xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
00162         AS2(    movdqa  [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
00163 
00164         // theta
              // reload words 0..15 from the state memory that the pi() stores above wrote
00165         AS2(    movdqa  xmm3, [WORD_REG(si)+3*16])
00166         AS2(    movdqa  xmm2, [WORD_REG(si)+2*16])
00167         AS2(    movdqa  xmm1, [WORD_REG(si)+1*16])
00168         AS2(    movdqa  xmm0, [WORD_REG(si)+0*16])
00169 
00170 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
              // same bit-0 dispatch as in gamma/pi: label 8 is the palignr variant
00171         AS2(    test    WORD_REG(bx), 1)
00172         ASJ(    jnz,    8, f)
00173 #endif
00174         AS2(    movd    xmm6, eax)
00175         AS2(    movdqa  xmm7, xmm3)
00176         AS2(    movss   xmm7, xmm6)
00177         AS2(    movdqa  xmm6, xmm2)
00178         AS2(    movss   xmm6, xmm3)
00179         AS2(    movdqa  xmm5, xmm1)
00180         AS2(    movss   xmm5, xmm2)
00181         AS2(    movdqa  xmm4, xmm0)
00182         AS2(    movss   xmm4, xmm1)
00183         ASS(    pshufd  xmm7, xmm7, 0, 3, 2, 1)
00184         ASS(    pshufd  xmm6, xmm6, 0, 3, 2, 1)
00185         ASS(    pshufd  xmm5, xmm5, 0, 3, 2, 1)
00186         ASS(    pshufd  xmm4, xmm4, 0, 3, 2, 1)
00187 #if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
00188         ASJ(    jmp,    9, f)
00189         ASL(8)
00190         AS2(    movd    xmm7, eax)
00191         AS3(    palignr xmm7, xmm3, 4)
00192         AS2(    movq    xmm6, xmm3)
00193         AS3(    palignr xmm6, xmm2, 4)
00194         AS2(    movq    xmm5, xmm2)
00195         AS3(    palignr xmm5, xmm1, 4)
00196         AS2(    movq    xmm4, xmm1)
00197         AS3(    palignr xmm4, xmm0, 4)
00198         ASL(9)
00199 #endif
00200 
              // eax ^= 1 ^ low dword of xmm0 ^ low dword of xmm3
00201         AS2(    xor             eax, 1)
00202         AS2(    movd    ecx, xmm0)
00203         AS2(    xor             eax, ecx)
00204         AS2(    movd    ecx, xmm3)
00205         AS2(    xor             eax, ecx)
00206 
00207         AS2(    pxor    xmm3, xmm2)
00208         AS2(    pxor    xmm2, xmm1)
00209         AS2(    pxor    xmm1, xmm0)
00210         AS2(    pxor    xmm0, xmm7)
00211         AS2(    pxor    xmm3, xmm7)
00212         AS2(    pxor    xmm2, xmm6)
00213         AS2(    pxor    xmm1, xmm5)
00214         AS2(    pxor    xmm0, xmm4)
00215 
00216         // sigma
              // cx and bp match the offsets of b4 and b16 in Panama<B>::Iterate
00217         AS2(    lea             WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
00218         AS2(    and             WORD_REG(cx), 31*32)
00219         AS2(    lea             WORD_REG(bp), [WORD_REG(bx) + 16*32])
00220         AS2(    and             WORD_REG(bp), 31*32)
00221 
00222         AS2(    movdqa  xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
00223         AS2(    movdqa  xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
00224         AS2(    movdqa  xmm6, xmm4)
00225         AS2(    punpcklqdq      xmm4, xmm5)
00226         AS2(    punpckhqdq      xmm6, xmm5)
00227         AS2(    pxor    xmm3, xmm4)
00228         AS2(    pxor    xmm2, xmm6)
00229 
00230         AS2(    movdqa  xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
00231         AS2(    movdqa  xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
00232         AS2(    movdqa  xmm6, xmm4)
00233         AS2(    punpcklqdq      xmm4, xmm5)
00234         AS2(    punpckhqdq      xmm6, xmm5)
00235         AS2(    pxor    xmm1, xmm4)
00236         AS2(    pxor    xmm0, xmm6)
00237 
00238         // loop
              // advance the buffer offset by 32 and repeat until it reaches the value saved at [sp]
00239         AS2(    add             WORD_REG(bx), 32)
00240         AS2(    cmp             WORD_REG(bx), [WORD_REG(sp)])
00241         ASJ(    jne,    4, b)
00242 
00243         // save state
              // discard the saved loop-end value, restore bp, then write words 16 and 0..15 back
00244         AS2(    add             WORD_REG(sp), WORD_SZ)
00245         AS_POP(                 bp)
00246         AS2(    mov             [WORD_REG(si)+4*16], eax)
00247         AS2(    movdqa  [WORD_REG(si)+3*16], xmm3)
00248         AS2(    movdqa  [WORD_REG(si)+2*16], xmm2)
00249         AS2(    movdqa  [WORD_REG(si)+1*16], xmm1)
00250         AS2(    movdqa  [WORD_REG(si)+0*16], xmm0)
              // label 5: exit point, also reached directly when count is 0
00251         ASL(5)
00252 
00253 #ifdef __GNUC__
00254         AS_POP(                 bx)
00255         ".att_syntax prefix;"
                      // no outputs; inputs bound to cx/si/di/dx; declared clobbers are eax, memory and flags.
                      // NOTE(review): cx, dx and di are modified above but are listed only as inputs,
                      // and the xmm registers are not in the clobber list - confirm this is intended.
00256                 :
00257                 : "c" (count), "S" (state), "D" (z), "d" (y)
00258                 : "%eax", "memory", "cc"
00259         );
00260 #endif
00261 }
00262 
00263 #endif
00264 
// Portable C++ implementation of the Panama round; runs `count` iterations.
//   p - when non-NULL, 8 words are read from it per iteration (US / TS1S macros) and it is
//       advanced by 8; when NULL, the UL / TS1L macros use the state and buffer instead.
//   z - when non-NULL, 8 output words are written per iteration and it is advanced by 8.
//   y - only consulted when z is non-NULL; when non-NULL it is XORed into the output (OX)
//       and advanced by 8, otherwise the state words are written unmodified (OA).
// m_state[17] is read as the buffer byte offset (bstart), advanced by 32 per iteration and
// written back on exit; every use masks it with 31*32, so its low 5 bits do not affect addressing.
// The helper macros defined inside this function are not #undef'd within it.
00265 template <class B>
00266 void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)
00267 {
00268         word32 bstart = m_state[17];
00269         word32 *const aPtr = m_state;
              // scratch copy of the 17-word state produced by GP() and consumed by T()
00270         word32 cPtr[17];
00271 
      // byte pointer to the buffer, which starts 20 words past the start of the state
00272 #define bPtr ((byte *)(aPtr+20))
00273 
00274 // reorder the state for SSE2
00275 // a and c: 4 8 12 16 | 3 7 11 15 | 2 6 10 14 | 1 5 9 13 | 0
00276 //                      xmm0            xmm1            xmm2            xmm3            eax
00277 #define a(i) aPtr[((i)*13+16) % 17]             // 13 is inverse of 4 mod 17
00278 #define c(i) cPtr[((i)*13+16) % 17]
00279 // b: 0 4 | 1 5 | 2 6 | 3 7
00280 #define b(i, j) b##i[(j)*2%8 + (j)/4]
00281 
00282 // output
      // OA writes a(i+9) as is; OX additionally XORs in y[i]
00283 #define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a(i+9))
00284 #define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a(i+9))
00285 // buffer update
      // US takes the new b0 word from p[i]; UL takes it from a(i+1); both fold the old word into b25
00286 #define US(i) {word32 t=b(0,i); b(0,i)=ConditionalByteReverse(B::ToEnum(), p[i])^t; b(25,(i+6)%8)^=t;}
00287 #define UL(i) {word32 t=b(0,i); b(0,i)=a(i+1)^t; b(25,(i+6)%8)^=t;}
00288 // gamma and pi
00289 #define GP(i) c(5*i%17) = rotlFixed(a(i) ^ (a((i+1)%17) | ~a((i+2)%17)), ((5*i%17)*((5*i%17)+1)/2)%32)
00290 // theta and sigma
      // TS1S injects p[i], TS1L injects b4, TS2 injects b16
00291 #define T(i,x) a(i) = c(i) ^ c((i+1)%17) ^ c((i+4)%17) ^ x
00292 #define TS1S(i) T(i+1, ConditionalByteReverse(B::ToEnum(), p[i]))
00293 #define TS1L(i) T(i+1, b(4,i))
00294 #define TS2(i) T(i+9, b(16,i))
00295 
00296         while (count--)
00297         {
                      // emit 8 output words before the state is updated
00298                 if (z)
00299                 {
00300                         if (y)
00301                         {
00302                                 OX(0); OX(1); OX(2); OX(3); OX(4); OX(5); OX(6); OX(7);
00303                                 y += 8;
00304                         }
00305                         else
00306                         {
00307                                 OA(0); OA(1); OA(2); OA(3); OA(4); OA(5); OA(6); OA(7);
00308                         }
00309                         z += 8;
00310                 }
00311 
                      // b16 and b4 are located with the old offset, b0 and b25 with the advanced one
00312                 word32 *const b16 = (word32 *)(bPtr+((bstart+16*32) & 31*32));
00313                 word32 *const b4 = (word32 *)(bPtr+((bstart+(32-4)*32) & 31*32));
00314         bstart += 32;
00315                 word32 *const b0 = (word32 *)(bPtr+((bstart) & 31*32));
00316                 word32 *const b25 = (word32 *)(bPtr+((bstart+(32-25)*32) & 31*32));
00317 
00318                 if (p)
00319                 {
00320                         US(0); US(1); US(2); US(3); US(4); US(5); US(6); US(7);
00321                 }
00322                 else
00323                 {
00324                         UL(0); UL(1); UL(2); UL(3); UL(4); UL(5); UL(6); UL(7);
00325                 }
00326 
                      // fill all 17 words of c from a
00327                 GP(0); 
00328                 GP(1); 
00329                 GP(2); 
00330                 GP(3); 
00331                 GP(4); 
00332                 GP(5); 
00333                 GP(6); 
00334                 GP(7);
00335                 GP(8); 
00336                 GP(9); 
00337                 GP(10); 
00338                 GP(11); 
00339                 GP(12); 
00340                 GP(13); 
00341                 GP(14); 
00342                 GP(15); 
00343                 GP(16);
00344 
                      // rebuild a from c: word 0 gets the constant 1, words 1..8 get p or b4, words 9..16 get b16
00345                 T(0,1);
00346 
00347                 if (p)
00348                 {
00349                         TS1S(0); TS1S(1); TS1S(2); TS1S(3); TS1S(4); TS1S(5); TS1S(6); TS1S(7);
00350                         p += 8;
00351                 }
00352                 else
00353                 {
00354                         TS1L(0); TS1L(1); TS1L(2); TS1L(3); TS1L(4); TS1L(5); TS1L(6); TS1L(7);
00355                 }
00356 
00357                 TS2(0); TS2(1); TS2(2); TS2(3); TS2(4); TS2(5); TS2(6); TS2(7);
00358         }
              // persist the advanced buffer offset for the next call
00359         m_state[17] = bstart;
00360 }
00361 
00362 namespace Weak {
00363 template <class B>
00364 size_t PanamaHash<B>::HashMultipleBlocks(const word32 *input, size_t length)
00365 {
00366         this->Iterate(length / this->BLOCKSIZE, input);
00367         return length % this->BLOCKSIZE;
00368 }
00369 
00370 template <class B>
00371 void PanamaHash<B>::TruncatedFinal(byte *hash, size_t size)
00372 {
00373         this->ThrowIfInvalidTruncatedSize(size);
00374 
00375         PadLastBlock(this->BLOCKSIZE, 0x01);
00376         
00377         HashEndianCorrectedBlock(this->m_data);
00378 
00379         this->Iterate(32);      // pull
00380 
00381         FixedSizeSecBlock<word32, 8> buf;
00382         this->Iterate(1, NULL, buf, NULL);
00383 
00384         memcpy(hash, buf, size);
00385 
00386         this->Restart();                // reinit for next use
00387 }
00388 }
00389 
00390 template <class B>
00391 void PanamaCipherPolicy<B>::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
00392 {
00393         assert(length==32);
00394         memcpy(m_key, key, 32);
00395 }
00396 
00397 template <class B>
00398 void PanamaCipherPolicy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
00399 {
00400         this->Reset();
00401         this->Iterate(1, m_key);
00402         if (iv && IsAligned<word32>(iv))
00403                 this->Iterate(1, (const word32 *)iv);
00404         else
00405         {
00406                 FixedSizeSecBlock<word32, 8> buf;
00407                 if (iv)
00408                         memcpy(buf, iv, 32);
00409                 else
00410                         memset(buf, 0, 32);
00411                 this->Iterate(1, buf);
00412         }
00413 
00414 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00415         if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
00416                 Panama_SSE2_Pull(32, this->m_state, NULL, NULL);
00417         else
00418 #endif
00419                 this->Iterate(32);
00420 }
00421 
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
// Reports the buffer alignment for this policy: 16 when the SSE2 assembly path
// will be taken (little-endian instantiation and HasSSE2() is true), 1 otherwise.
template <class B>
unsigned int PanamaCipherPolicy<B>::GetAlignment() const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
	const bool sse2Path = (B::ToEnum() == LITTLE_ENDIAN_ORDER) && HasSSE2();
	return sse2Path ? 16 : 1;
#else
	return 1;
#endif
}
#endif
00434 
00435 template <class B>
00436 void PanamaCipherPolicy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00437 {
00438 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00439         if (B::ToEnum() == LITTLE_ENDIAN_ORDER && HasSSE2())
00440                 Panama_SSE2_Pull(iterationCount, this->m_state, (word32 *)output, (const word32 *)input);
00441         else
00442 #endif
00443                 this->Iterate(iterationCount, NULL, (word32 *)output, (const word32 *)input);
00444 }
00445 
// Explicit instantiations: the member templates defined in this file are
// instantiated here for both byte orders of each class template.
00446 template class Panama<BigEndian>;
00447 template class Panama<LittleEndian>;
00448 
00449 template class Weak::PanamaHash<BigEndian>;
00450 template class Weak::PanamaHash<LittleEndian>;
00451 
00452 template class PanamaCipherPolicy<BigEndian>;
00453 template class PanamaCipherPolicy<LittleEndian>;
00454 
00455 NAMESPACE_END

Generated on Fri Jun 1 11:11:23 2007 for Crypto++ by  doxygen 1.5.2