sosemanuk.cpp

00001 // sosemanuk.cpp - written and placed in the public domain by Wei Dai
00002 
00003 #include "pch.h"
00004 #include "sosemanuk.h"
00005 #include "misc.h"
00006 #include "cpu.h"
00007 
00008 #include "serpentp.h"
00009 
00010 NAMESPACE_BEGIN(CryptoPP)
00011 
// Key setup: expands userKey (keylen bytes) into m_key by calling Serpent_KeySchedule with a
// round count of 24. CipherResynchronize later walks m_key through its round-key pointer.
// `params` is not used by this implementation.
00012 void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
00013 {
00014         Serpent_KeySchedule(m_key, 24, userKey, keylen);
00015 }
00016 
// Re-initializes m_state[0..11] from an IV.
//
// The IV words are pushed through three passes of eight key-mix / S-box / linear-transform rounds
// (24 rounds in total), and the working words are captured into m_state after the 12th, 18th and
// final round. `keystreamBuffer` is not used by this implementation.
//
// The beforeS0 / afterSx wrappers and the KX, S0..S7 and LT operations come from serpentp.h (not
// shown here); presumably they refer to the locals a..e and the round-key pointer k by name, so
// those names and the statement order must not be changed -- TODO confirm against serpentp.h.
00017 void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
00018 {
00019         word32 a, b, c, d, e;
00020         
              // Read a, b, c, d from iv through the little-endian word32 block reader.
00021         typedef BlockGetAndPut<word32, LittleEndian> Block;
00022         Block::Get(iv)(a)(b)(c)(d);
00023 
00024         const word32 *k = m_key;
              // Pass counter, 1..3; each pass below runs eight rounds (S0 through S7).
00025         unsigned int i=1;
00026 
00027         do
00028         {
00029                 beforeS0(KX); beforeS0(S0); afterS0(LT);
00030                 afterS0(KX); afterS0(S1); afterS1(LT);
00031                 if (i == 3)     // after 18th round
00032                 {
                              // Two rounds into the third pass: capture state words 4, 5 and registers 10, 11.
00033                         m_state[4] = b;
00034                         m_state[5] = e;
00035                         m_state[10] = c;
00036                         m_state[11] = a;
00037                 }
00038                 afterS1(KX); afterS1(S2); afterS2(LT);
00039                 afterS2(KX); afterS2(S3); afterS3(LT);
00040                 if (i == 2)     // after 12th round
00041                 {
                              // Four rounds into the second pass: capture state words 6..9.
00042                         m_state[6] = c;
00043                         m_state[7] = d;
00044                         m_state[8] = b;
00045                         m_state[9] = e;
00046                 }
00047                 afterS3(KX); afterS3(S4); afterS4(LT);
00048                 afterS4(KX); afterS4(S5); afterS5(LT);
00049                 afterS5(KX); afterS5(S6); afterS6(LT);
00050                 afterS6(KX); afterS6(S7); afterS7(LT);
00051 
                      // The third pass is the last one; leave before the relabelling below.
00052                 if (i == 3)
00053                         break;
00054 
                      // Relabel the working words for the next pass (presumably into the order
                      // beforeS0 expects) and advance the round-key pointer by 32 words.
00055                 ++i;
00056                 c = b;
00057                 b = e;
00058                 e = d;
00059                 d = a;
00060                 a = e;
00061                 k += 32;
00062         }
00063         while (true);
00064 
              // Final key mix after the last linear transform, then capture state words 0..3.
00065         afterS7(KX);
00066 
00067         m_state[0] = a;
00068         m_state[1] = b;
00069         m_state[2] = e;
00070         m_state[3] = d;
00071 
              // XMUX(c, x, y) yields x ^ y when the low bit of c is set and x otherwise.
              // The macro stays defined after this function and is reused by STEP in OperateKeystream.
00072 #define XMUX(c, x, y)   (x ^ (y & (0 - (c & 1))))
              // One update of the two extra registers, in the same form STEP uses for r1/r2.
00073         m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
00074         m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
00075 }
00076 
// Lookup tables for the two table-driven word transforms used by OperateKeystream:
//   entries [0, 256)   are read by MUL_A, and by the assembly as [si + index*4];
//   entries [256, 512) are read by DIV_A, and by the assembly as [si + 1024 + index*4].
// The first half exists in two variants. On x86/x64, MUL_A rotates the word left by 8 and indexes
// with the resulting low byte; on other targets it shifts left by 8 and indexes with the original
// top byte. NOTE(review): the first few x86 entries equal the corresponding generic entry XOR its
// index, which fits the rotate leaving that byte in the low position -- spot-checked only, not
// verified for all 256 entries.
// NOTE(review): the selector below uses bitwise `|` while the platform check before GetAlignment
// uses `||`; in a #if the two are equivalent, this is only a stylistic inconsistency.
00077 static word32 s_mulTables[512] = {
00078 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
00079         0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836, 
00080         0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E, 
00081         0x05A7DC90, 0xE4381382, 0x6E30EBB4, 0x8FAF24A6, 
00082         0xD320B2D8, 0x32BF7DCA, 0xB8B785FC, 0x59284AEE, 
00083         0x0AE71189, 0xEB78DE9B, 0x617026AD, 0x80EFE9BF, 
00084         0xDC607FC1, 0x3DFFB0D3, 0xB7F748E5, 0x566887F7, 
00085         0x0F40CD19, 0xEEDF020B, 0x64D7FA3D, 0x8548352F, 
00086         0xD9C7A351, 0x38586C43, 0xB2509475, 0x53CF5B67, 
00087         0x146722BB, 0xF5F8EDA9, 0x7FF0159F, 0x9E6FDA8D, 
00088         0xC2E04CF3, 0x237F83E1, 0xA9777BD7, 0x48E8B4C5, 
00089         0x11C0FE2B, 0xF05F3139, 0x7A57C90F, 0x9BC8061D, 
00090         0xC7479063, 0x26D85F71, 0xACD0A747, 0x4D4F6855, 
00091         0x1E803332, 0xFF1FFC20, 0x75170416, 0x9488CB04, 
00092         0xC8075D7A, 0x29989268, 0xA3906A5E, 0x420FA54C, 
00093         0x1B27EFA2, 0xFAB820B0, 0x70B0D886, 0x912F1794, 
00094         0xCDA081EA, 0x2C3F4EF8, 0xA637B6CE, 0x47A879DC, 
00095         0x28CE44DF, 0xC9518BCD, 0x435973FB, 0xA2C6BCE9, 
00096         0xFE492A97, 0x1FD6E585, 0x95DE1DB3, 0x7441D2A1, 
00097         0x2D69984F, 0xCCF6575D, 0x46FEAF6B, 0xA7616079, 
00098         0xFBEEF607, 0x1A713915, 0x9079C123, 0x71E60E31, 
00099         0x22295556, 0xC3B69A44, 0x49BE6272, 0xA821AD60, 
00100         0xF4AE3B1E, 0x1531F40C, 0x9F390C3A, 0x7EA6C328, 
00101         0x278E89C6, 0xC61146D4, 0x4C19BEE2, 0xAD8671F0, 
00102         0xF109E78E, 0x1096289C, 0x9A9ED0AA, 0x7B011FB8, 
00103         0x3CA96664, 0xDD36A976, 0x573E5140, 0xB6A19E52, 
00104         0xEA2E082C, 0x0BB1C73E, 0x81B93F08, 0x6026F01A, 
00105         0x390EBAF4, 0xD89175E6, 0x52998DD0, 0xB30642C2, 
00106         0xEF89D4BC, 0x0E161BAE, 0x841EE398, 0x65812C8A, 
00107         0x364E77ED, 0xD7D1B8FF, 0x5DD940C9, 0xBC468FDB, 
00108         0xE0C919A5, 0x0156D6B7, 0x8B5E2E81, 0x6AC1E193, 
00109         0x33E9AB7D, 0xD276646F, 0x587E9C59, 0xB9E1534B, 
00110         0xE56EC535, 0x04F10A27, 0x8EF9F211, 0x6F663D03, 
00111         0x50358817, 0xB1AA4705, 0x3BA2BF33, 0xDA3D7021, 
00112         0x86B2E65F, 0x672D294D, 0xED25D17B, 0x0CBA1E69, 
00113         0x55925487, 0xB40D9B95, 0x3E0563A3, 0xDF9AACB1, 
00114         0x83153ACF, 0x628AF5DD, 0xE8820DEB, 0x091DC2F9, 
00115         0x5AD2999E, 0xBB4D568C, 0x3145AEBA, 0xD0DA61A8, 
00116         0x8C55F7D6, 0x6DCA38C4, 0xE7C2C0F2, 0x065D0FE0, 
00117         0x5F75450E, 0xBEEA8A1C, 0x34E2722A, 0xD57DBD38, 
00118         0x89F22B46, 0x686DE454, 0xE2651C62, 0x03FAD370, 
00119         0x4452AAAC, 0xA5CD65BE, 0x2FC59D88, 0xCE5A529A, 
00120         0x92D5C4E4, 0x734A0BF6, 0xF942F3C0, 0x18DD3CD2, 
00121         0x41F5763C, 0xA06AB92E, 0x2A624118, 0xCBFD8E0A, 
00122         0x97721874, 0x76EDD766, 0xFCE52F50, 0x1D7AE042, 
00123         0x4EB5BB25, 0xAF2A7437, 0x25228C01, 0xC4BD4313, 
00124         0x9832D56D, 0x79AD1A7F, 0xF3A5E249, 0x123A2D5B, 
00125         0x4B1267B5, 0xAA8DA8A7, 0x20855091, 0xC11A9F83, 
00126         0x9D9509FD, 0x7C0AC6EF, 0xF6023ED9, 0x179DF1CB, 
00127         0x78FBCCC8, 0x996403DA, 0x136CFBEC, 0xF2F334FE, 
00128         0xAE7CA280, 0x4FE36D92, 0xC5EB95A4, 0x24745AB6, 
00129         0x7D5C1058, 0x9CC3DF4A, 0x16CB277C, 0xF754E86E, 
00130         0xABDB7E10, 0x4A44B102, 0xC04C4934, 0x21D38626, 
00131         0x721CDD41, 0x93831253, 0x198BEA65, 0xF8142577, 
00132         0xA49BB309, 0x45047C1B, 0xCF0C842D, 0x2E934B3F, 
00133         0x77BB01D1, 0x9624CEC3, 0x1C2C36F5, 0xFDB3F9E7, 
00134         0xA13C6F99, 0x40A3A08B, 0xCAAB58BD, 0x2B3497AF, 
00135         0x6C9CEE73, 0x8D032161, 0x070BD957, 0xE6941645, 
00136         0xBA1B803B, 0x5B844F29, 0xD18CB71F, 0x3013780D, 
00137         0x693B32E3, 0x88A4FDF1, 0x02AC05C7, 0xE333CAD5, 
00138         0xBFBC5CAB, 0x5E2393B9, 0xD42B6B8F, 0x35B4A49D, 
00139         0x667BFFFA, 0x87E430E8, 0x0DECC8DE, 0xEC7307CC, 
00140         0xB0FC91B2, 0x51635EA0, 0xDB6BA696, 0x3AF46984, 
00141         0x63DC236A, 0x8243EC78, 0x084B144E, 0xE9D4DB5C, 
00142         0xB55B4D22, 0x54C48230, 0xDECC7A06, 0x3F53B514,
00143 #else
00144         0x00000000, 0xE19FCF13, 0x6B973726, 0x8A08F835,
00145         0xD6876E4C, 0x3718A15F, 0xBD10596A, 0x5C8F9679,
00146         0x05A7DC98, 0xE438138B, 0x6E30EBBE, 0x8FAF24AD,
00147         0xD320B2D4, 0x32BF7DC7, 0xB8B785F2, 0x59284AE1,
00148         0x0AE71199, 0xEB78DE8A, 0x617026BF, 0x80EFE9AC,
00149         0xDC607FD5, 0x3DFFB0C6, 0xB7F748F3, 0x566887E0,
00150         0x0F40CD01, 0xEEDF0212, 0x64D7FA27, 0x85483534,
00151         0xD9C7A34D, 0x38586C5E, 0xB250946B, 0x53CF5B78,
00152         0x1467229B, 0xF5F8ED88, 0x7FF015BD, 0x9E6FDAAE,
00153         0xC2E04CD7, 0x237F83C4, 0xA9777BF1, 0x48E8B4E2,
00154         0x11C0FE03, 0xF05F3110, 0x7A57C925, 0x9BC80636,
00155         0xC747904F, 0x26D85F5C, 0xACD0A769, 0x4D4F687A,
00156         0x1E803302, 0xFF1FFC11, 0x75170424, 0x9488CB37,
00157         0xC8075D4E, 0x2998925D, 0xA3906A68, 0x420FA57B,
00158         0x1B27EF9A, 0xFAB82089, 0x70B0D8BC, 0x912F17AF,
00159         0xCDA081D6, 0x2C3F4EC5, 0xA637B6F0, 0x47A879E3,
00160         0x28CE449F, 0xC9518B8C, 0x435973B9, 0xA2C6BCAA,
00161         0xFE492AD3, 0x1FD6E5C0, 0x95DE1DF5, 0x7441D2E6,
00162         0x2D699807, 0xCCF65714, 0x46FEAF21, 0xA7616032,
00163         0xFBEEF64B, 0x1A713958, 0x9079C16D, 0x71E60E7E,
00164         0x22295506, 0xC3B69A15, 0x49BE6220, 0xA821AD33,
00165         0xF4AE3B4A, 0x1531F459, 0x9F390C6C, 0x7EA6C37F,
00166         0x278E899E, 0xC611468D, 0x4C19BEB8, 0xAD8671AB,
00167         0xF109E7D2, 0x109628C1, 0x9A9ED0F4, 0x7B011FE7,
00168         0x3CA96604, 0xDD36A917, 0x573E5122, 0xB6A19E31,
00169         0xEA2E0848, 0x0BB1C75B, 0x81B93F6E, 0x6026F07D,
00170         0x390EBA9C, 0xD891758F, 0x52998DBA, 0xB30642A9,
00171         0xEF89D4D0, 0x0E161BC3, 0x841EE3F6, 0x65812CE5,
00172         0x364E779D, 0xD7D1B88E, 0x5DD940BB, 0xBC468FA8,
00173         0xE0C919D1, 0x0156D6C2, 0x8B5E2EF7, 0x6AC1E1E4,
00174         0x33E9AB05, 0xD2766416, 0x587E9C23, 0xB9E15330,
00175         0xE56EC549, 0x04F10A5A, 0x8EF9F26F, 0x6F663D7C,
00176         0x50358897, 0xB1AA4784, 0x3BA2BFB1, 0xDA3D70A2,
00177         0x86B2E6DB, 0x672D29C8, 0xED25D1FD, 0x0CBA1EEE,
00178         0x5592540F, 0xB40D9B1C, 0x3E056329, 0xDF9AAC3A,
00179         0x83153A43, 0x628AF550, 0xE8820D65, 0x091DC276,
00180         0x5AD2990E, 0xBB4D561D, 0x3145AE28, 0xD0DA613B,
00181         0x8C55F742, 0x6DCA3851, 0xE7C2C064, 0x065D0F77,
00182         0x5F754596, 0xBEEA8A85, 0x34E272B0, 0xD57DBDA3,
00183         0x89F22BDA, 0x686DE4C9, 0xE2651CFC, 0x03FAD3EF,
00184         0x4452AA0C, 0xA5CD651F, 0x2FC59D2A, 0xCE5A5239,
00185         0x92D5C440, 0x734A0B53, 0xF942F366, 0x18DD3C75,
00186         0x41F57694, 0xA06AB987, 0x2A6241B2, 0xCBFD8EA1,
00187         0x977218D8, 0x76EDD7CB, 0xFCE52FFE, 0x1D7AE0ED,
00188         0x4EB5BB95, 0xAF2A7486, 0x25228CB3, 0xC4BD43A0,
00189         0x9832D5D9, 0x79AD1ACA, 0xF3A5E2FF, 0x123A2DEC,
00190         0x4B12670D, 0xAA8DA81E, 0x2085502B, 0xC11A9F38,
00191         0x9D950941, 0x7C0AC652, 0xF6023E67, 0x179DF174,
00192         0x78FBCC08, 0x9964031B, 0x136CFB2E, 0xF2F3343D,
00193         0xAE7CA244, 0x4FE36D57, 0xC5EB9562, 0x24745A71,
00194         0x7D5C1090, 0x9CC3DF83, 0x16CB27B6, 0xF754E8A5,
00195         0xABDB7EDC, 0x4A44B1CF, 0xC04C49FA, 0x21D386E9,
00196         0x721CDD91, 0x93831282, 0x198BEAB7, 0xF81425A4,
00197         0xA49BB3DD, 0x45047CCE, 0xCF0C84FB, 0x2E934BE8,
00198         0x77BB0109, 0x9624CE1A, 0x1C2C362F, 0xFDB3F93C,
00199         0xA13C6F45, 0x40A3A056, 0xCAAB5863, 0x2B349770,
00200         0x6C9CEE93, 0x8D032180, 0x070BD9B5, 0xE69416A6,
00201         0xBA1B80DF, 0x5B844FCC, 0xD18CB7F9, 0x301378EA,
00202         0x693B320B, 0x88A4FD18, 0x02AC052D, 0xE333CA3E,
00203         0xBFBC5C47, 0x5E239354, 0xD42B6B61, 0x35B4A472,
00204         0x667BFF0A, 0x87E43019, 0x0DECC82C, 0xEC73073F,
00205         0xB0FC9146, 0x51635E55, 0xDB6BA660, 0x3AF46973,
00206         0x63DC2392, 0x8243EC81, 0x084B14B4, 0xE9D4DBA7,
00207         0xB55B4DDE, 0x54C482CD, 0xDECC7AF8, 0x3F53B5EB,
00208 #endif
      // Second half (index 256 and up): shared by all targets.
00209         0x00000000, 0x180F40CD, 0x301E8033, 0x2811C0FE,
00210         0x603CA966, 0x7833E9AB, 0x50222955, 0x482D6998,
00211         0xC078FBCC, 0xD877BB01, 0xF0667BFF, 0xE8693B32,
00212         0xA04452AA, 0xB84B1267, 0x905AD299, 0x88559254,
00213         0x29F05F31, 0x31FF1FFC, 0x19EEDF02, 0x01E19FCF,
00214         0x49CCF657, 0x51C3B69A, 0x79D27664, 0x61DD36A9,
00215         0xE988A4FD, 0xF187E430, 0xD99624CE, 0xC1996403,
00216         0x89B40D9B, 0x91BB4D56, 0xB9AA8DA8, 0xA1A5CD65,
00217         0x5249BE62, 0x4A46FEAF, 0x62573E51, 0x7A587E9C,
00218         0x32751704, 0x2A7A57C9, 0x026B9737, 0x1A64D7FA,
00219         0x923145AE, 0x8A3E0563, 0xA22FC59D, 0xBA208550,
00220         0xF20DECC8, 0xEA02AC05, 0xC2136CFB, 0xDA1C2C36,
00221         0x7BB9E153, 0x63B6A19E, 0x4BA76160, 0x53A821AD,
00222         0x1B854835, 0x038A08F8, 0x2B9BC806, 0x339488CB,
00223         0xBBC11A9F, 0xA3CE5A52, 0x8BDF9AAC, 0x93D0DA61,
00224         0xDBFDB3F9, 0xC3F2F334, 0xEBE333CA, 0xF3EC7307,
00225         0xA492D5C4, 0xBC9D9509, 0x948C55F7, 0x8C83153A,
00226         0xC4AE7CA2, 0xDCA13C6F, 0xF4B0FC91, 0xECBFBC5C,
00227         0x64EA2E08, 0x7CE56EC5, 0x54F4AE3B, 0x4CFBEEF6,
00228         0x04D6876E, 0x1CD9C7A3, 0x34C8075D, 0x2CC74790,
00229         0x8D628AF5, 0x956DCA38, 0xBD7C0AC6, 0xA5734A0B,
00230         0xED5E2393, 0xF551635E, 0xDD40A3A0, 0xC54FE36D,
00231         0x4D1A7139, 0x551531F4, 0x7D04F10A, 0x650BB1C7,
00232         0x2D26D85F, 0x35299892, 0x1D38586C, 0x053718A1,
00233         0xF6DB6BA6, 0xEED42B6B, 0xC6C5EB95, 0xDECAAB58,
00234         0x96E7C2C0, 0x8EE8820D, 0xA6F942F3, 0xBEF6023E,
00235         0x36A3906A, 0x2EACD0A7, 0x06BD1059, 0x1EB25094,
00236         0x569F390C, 0x4E9079C1, 0x6681B93F, 0x7E8EF9F2,
00237         0xDF2B3497, 0xC724745A, 0xEF35B4A4, 0xF73AF469,
00238         0xBF179DF1, 0xA718DD3C, 0x8F091DC2, 0x97065D0F,
00239         0x1F53CF5B, 0x075C8F96, 0x2F4D4F68, 0x37420FA5,
00240         0x7F6F663D, 0x676026F0, 0x4F71E60E, 0x577EA6C3,
00241         0xE18D0321, 0xF98243EC, 0xD1938312, 0xC99CC3DF,
00242         0x81B1AA47, 0x99BEEA8A, 0xB1AF2A74, 0xA9A06AB9,
00243         0x21F5F8ED, 0x39FAB820, 0x11EB78DE, 0x09E43813,
00244         0x41C9518B, 0x59C61146, 0x71D7D1B8, 0x69D89175,
00245         0xC87D5C10, 0xD0721CDD, 0xF863DC23, 0xE06C9CEE,
00246         0xA841F576, 0xB04EB5BB, 0x985F7545, 0x80503588,
00247         0x0805A7DC, 0x100AE711, 0x381B27EF, 0x20146722,
00248         0x68390EBA, 0x70364E77, 0x58278E89, 0x4028CE44,
00249         0xB3C4BD43, 0xABCBFD8E, 0x83DA3D70, 0x9BD57DBD,
00250         0xD3F81425, 0xCBF754E8, 0xE3E69416, 0xFBE9D4DB,
00251         0x73BC468F, 0x6BB30642, 0x43A2C6BC, 0x5BAD8671,
00252         0x1380EFE9, 0x0B8FAF24, 0x239E6FDA, 0x3B912F17,
00253         0x9A34E272, 0x823BA2BF, 0xAA2A6241, 0xB225228C,
00254         0xFA084B14, 0xE2070BD9, 0xCA16CB27, 0xD2198BEA,
00255         0x5A4C19BE, 0x42435973, 0x6A52998D, 0x725DD940,
00256         0x3A70B0D8, 0x227FF015, 0x0A6E30EB, 0x12617026,
00257         0x451FD6E5, 0x5D109628, 0x750156D6, 0x6D0E161B,
00258         0x25237F83, 0x3D2C3F4E, 0x153DFFB0, 0x0D32BF7D,
00259         0x85672D29, 0x9D686DE4, 0xB579AD1A, 0xAD76EDD7,
00260         0xE55B844F, 0xFD54C482, 0xD545047C, 0xCD4A44B1,
00261         0x6CEF89D4, 0x74E0C919, 0x5CF109E7, 0x44FE492A,
00262         0x0CD320B2, 0x14DC607F, 0x3CCDA081, 0x24C2E04C,
00263         0xAC977218, 0xB49832D5, 0x9C89F22B, 0x8486B2E6,
00264         0xCCABDB7E, 0xD4A49BB3, 0xFCB55B4D, 0xE4BA1B80,
00265         0x17566887, 0x0F59284A, 0x2748E8B4, 0x3F47A879,
00266         0x776AC1E1, 0x6F65812C, 0x477441D2, 0x5F7B011F,
00267         0xD72E934B, 0xCF21D386, 0xE7301378, 0xFF3F53B5,
00268         0xB7123A2D, 0xAF1D7AE0, 0x870CBA1E, 0x9F03FAD3,
00269         0x3EA637B6, 0x26A9777B, 0x0EB8B785, 0x16B7F748,
00270         0x5E9A9ED0, 0x4695DE1D, 0x6E841EE3, 0x768B5E2E,
00271         0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
00272         0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
00273 };
00274 
00275 
00276 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
// Returns 16 when the build provides the SSE2 assembly and the CPU check passes (the same check
// OperateKeystream uses to pick its assembly path); returns 1 in every other case.
00277 unsigned int SosemanukPolicy::GetAlignment() const
00278 {
00279 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00280 #ifdef __INTEL_COMPILER
00281         const bool useSSE2Path = HasSSE2() && !IsP4();  // Intel compiler produces faster code for this algorithm on the P4
00282 #else
00283         const bool useSSE2Path = HasSSE2();
00284 #endif
00285         if (useSSE2Path)
00286                 return 16;
00287 #endif
00288         return 1;
00289 }
00290 
// Returns four times BYTES_PER_ITERATION when the build provides the SSE2 assembly and the CPU
// check passes (the same check OperateKeystream uses); otherwise a single BYTES_PER_ITERATION.
00291 unsigned int SosemanukPolicy::GetOptimalBlockSize() const
00292 {
00293 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00294 #ifdef __INTEL_COMPILER
00295         const bool useSSE2Path = HasSSE2() && !IsP4();  // Intel compiler produces faster code for this algorithm on the P4
00296 #else
00297         const bool useSSE2Path = HasSSE2();
00298 #endif
00299         if (useSSE2Path)
00300                 return 4*BYTES_PER_ITERATION;
00301 #endif
00302         return BYTES_PER_ITERATION;
00303 }
00304 #endif
00305 
#ifdef _MSC_VER
// MSVC-only: OperateKeystream's inline assembly uses ebp as a scratch register, which triggers
// warning C4731. The directive is guarded so other compilers do not see an unknown pragma.
00306 #pragma warning(disable: 4731)  // frame pointer register 'ebp' modified by inline assembly code
#endif
00307 
// Produces keystream and writes it to output, optionally combined with input.
//
// Each iteration yields 20 32-bit words (80 bytes): the portable loop runs five OUTPUT4 groups of
// four words, and the assembly path turns iterationCount into a word count by multiplying by 5 and
// then by 4.
//
// operation      - not referenced by name in this function body; presumably picked up inside
//                  CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH on the portable path (macro defined elsewhere).
//                  The assembly path inspects the pointers directly instead.
// output         - destination buffer; the assembly path chooses aligned or unaligned stores from
//                  the low four address bits.
// input          - data to XOR into the keystream; the assembly path skips the XOR when this is null.
// iterationCount - number of 80-byte iterations. NOTE(review): both paths are do-then-test loops,
//                  so a count of 0 does not look supported -- confirm callers never pass 0.
//
// m_state[0..9] are the ten state words s0..s9 and m_state[10..11] the two extra registers; both
// paths load them on entry and write them back on exit.
00308 void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00309 {
00310 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00311 #ifdef __INTEL_COMPILER
00312         if (HasSSE2() && !IsP4())       // Intel compiler produces faster code for this algorithm on the P4
00313 #else
00314         if (HasSSE2())
00315 #endif
00316         {
                      // Assembly path. Registers expected by the shared code below:
                      //   ax = state pointer, di = output, dx = input, cx = iterationCount.
                      // Under GCC these (plus si = s_mulTables) arrive through the asm input constraints
                      // at the end of the statement; other compilers load them with explicit moves.
00317 #ifdef __GNUC__
00318                 __asm__ __volatile__
00319                 (
00320                 ".intel_syntax noprefix;"
00321                 AS_PUSH(                bx)
00322 #else
00323                 word32 *state = m_state;
00324                 AS2(    mov             WORD_REG(ax), state)
00325                 AS2(    mov             WORD_REG(di), output)
00326                 AS2(    mov             WORD_REG(dx), input)
00327                 AS2(    mov             WORD_REG(cx), iterationCount)
00328 #endif
00329 
                      // Stack frame, addressed from the 16-byte-aligned sp: slot 0 keeps the caller's sp,
                      // slots 1..7 are the named locals below, followed by a 12-word state copy and then
                      // the u/v scratch area.
00330 #define SSE2_output                     WORD_PTR [WORD_REG(sp)+1*WORD_SZ]
00331 #define SSE2_input                      WORD_PTR [WORD_REG(sp)+2*WORD_SZ]
00332 #define SSE2_wordsLeft          WORD_PTR [WORD_REG(sp)+3*WORD_SZ]
00333 #define SSE2_diEnd                      WORD_PTR [WORD_REG(sp)+4*WORD_SZ]
00334 #define SSE2_pMulTables         WORD_PTR [WORD_REG(sp)+5*WORD_SZ]
00335 #define SSE2_state                      WORD_PTR [WORD_REG(sp)+6*WORD_SZ]
00336 #define SSE2_wordsLeft2         WORD_PTR [WORD_REG(sp)+7*WORD_SZ]
00337 #define SSE2_stateCopy          WORD_REG(sp) + 8*WORD_SZ
00338 #define SSE2_uvStart            SSE2_stateCopy + 12*4
00339 
                      // Prologue: save bp, keep the incoming sp in bx, align sp down to 16 bytes, reserve
                      // the frame, and store the incoming sp in slot 0 for the epilogue.
00340                 AS_PUSH(                bp)
00341                 AS2(    mov             WORD_REG(bx), WORD_REG(sp))
00342                 AS2(    and             WORD_REG(sp), -16)
00343                 AS2(    sub             WORD_REG(sp), 80*4*2+12*4+8*WORD_SZ)    // 80 v's, 80 u's, 12 state, 8 locals
00344                 AS2(    mov             [WORD_REG(sp)], WORD_REG(bx))
00345                 AS2(    mov             SSE2_output, WORD_REG(di))
00346                 AS2(    mov             SSE2_input, WORD_REG(dx))
00347                 AS2(    mov             SSE2_state, WORD_REG(ax))
00348 #ifndef _MSC_VER
                      // si still holds the table pointer supplied by the caller-side setup; stash it
                      // because si is reused as a counter below.
00349                 AS2(    mov             SSE2_pMulTables, WORD_REG(si))
00350 #endif
                      // cx = 5*iterationCount, si = 4*cx: total number of 32-bit words to produce.
00351                 AS2(    lea             WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
00352                 AS2(    lea             WORD_REG(si), [4*WORD_REG(cx)])
00353                 AS2(    mov             SSE2_wordsLeft, WORD_REG(si))
00354                 AS2(    movdqa  xmm0, [WORD_REG(ax)+0*16])              // copy state to stack to save a register
00355                 AS2(    movdqa  [SSE2_stateCopy+0*16], xmm0)
00356                 AS2(    movdqa  xmm0, [WORD_REG(ax)+1*16])
00357                 AS2(    movdqa  [SSE2_stateCopy+1*16], xmm0)
00358                 AS2(    movq    xmm0, QWORD PTR [WORD_REG(ax)+2*16])
00359                 AS2(    movq    QWORD PTR [SSE2_stateCopy+2*16], xmm0)
00360                 AS2(    psrlq   xmm0, 32)
00361                 AS2(    movd    ebx, xmm0)                              // s(9)
                      // ecx and edx carry state words 10 and 11 (the two extra registers).
00362                 AS2(    mov             ecx, [WORD_REG(ax)+10*4])
00363                 AS2(    mov             edx, [WORD_REG(ax)+11*4])
00364                 AS2(    pcmpeqb xmm7, xmm7)                             // all ones
00365 
                      // s(i): word (i mod 10) of the state copy.
                      // u(j), v(j): scratch slots relative to di, arranged as 4 rows of 20 words
                      // (row = j mod 4, column = j/4); the v area starts 80 words after the u area.
00366 #define s(i)    SSE2_stateCopy + ASM_MOD(i,10)*4
00367 #define u(j)    WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
00368 #define v(j)    WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
00369 
                      // SSE2_STEP(i, j) pastes j onto r1/r2, so ecx and edx trade roles on alternate steps.
00370 #define r10 ecx
00371 #define r11 edx
00372 #define r20 edx
00373 #define r21 ecx
00374 
                      // One state update, following the same recipe as the portable STEP macro below:
                      //   v(i) = s(i);  u(i) = (ebx + r2) ^ r1, with ebx holding the newest state word;
                      //   r1 += s(i+2) ^ (ebx if the low bit of r2 is set, else 0);
                      //   new s(i) = rol(s(i), 8) ^ table[its low byte] ^ (s(i+3) >> 8)
                      //              ^ table[256 + low byte of s(i+3)] ^ ebx;
                      //   r2 = rol(r2 * 0x54655307, 7).  ebx ends up holding the new s(i).
00375 #define SSE2_STEP(i, j) \
00376         AS2(    mov             eax, [s(i+0)])\
00377         AS2(    mov             [v(i)], eax)\
00378         AS2(    rol             eax, 8)\
00379         AS2(    lea             ebp, [ebx + r2##j])\
00380         AS2(    xor             ebp, r1##j)\
00381         AS2(    mov             [u(i)], ebp)\
00382         AS2(    mov             ebp, 1)\
00383         AS2(    and             ebp, r2##j)\
00384         AS1(    neg             ebp)\
00385         AS2(    and             ebp, ebx)\
00386         AS2(    xor             ebx, eax)\
00387         AS2(    movzx   eax, al)\
00388         AS2(    xor             ebx, [WORD_REG(si)+WORD_REG(ax)*4])\
00389         AS2(    mov             eax, [s(i+3)])\
00390         AS2(    xor             ebp, [s(i+2)])\
00391         AS2(    add             r1##j, ebp)\
00392         AS2(    movzx   ebp, al)\
00393         AS2(    shr             eax, 8)\
00394         AS2(    xor             ebx, [WORD_REG(si)+1024+WORD_REG(bp)*4])\
00395         AS2(    xor             ebx, eax)\
00396         AS2(    imul    r2##j, 0x54655307)\
00397         AS2(    rol             r2##j, 7)\
00398         AS2(    mov             [s(i+0)], ebx)\
00399 
00400                 ASL(2)  // outer loop, each iteration of this processes 80 words
00401                 AS2(    lea             WORD_REG(di), [SSE2_uvStart])   // start of v and u
                      // Handle min(wordsLeft, 80) words in this pass. di advances 20 bytes per 20 words
                      // in the first inner loop, so di + word count is its stop address.
00402                 AS2(    mov             WORD_REG(ax), 80)
00403                 AS2(    cmp             WORD_REG(si), 80)
00404                 AS2(    cmovg   WORD_REG(si), WORD_REG(ax))
00405                 AS2(    mov             SSE2_wordsLeft2, WORD_REG(si))
00406                 AS2(    lea             WORD_REG(si), [WORD_REG(di)+WORD_REG(si)])              // use to end first inner loop
00407                 AS2(    mov             SSE2_diEnd, WORD_REG(si))
                      // From here until the first inner loop ends, si is the base address of s_mulTables.
00408 #ifdef _MSC_VER
00409                 AS2(    lea             WORD_REG(si), s_mulTables)
00410 #else
00411                 AS2(    mov             WORD_REG(si), SSE2_pMulTables)
00412 #endif
00413 
00414                 ASL(0)  // first inner loop, 20 words each, 4 iterations
00415                 SSE2_STEP(0, 0)
00416                 SSE2_STEP(1, 1)
00417                 SSE2_STEP(2, 0)
00418                 SSE2_STEP(3, 1)
00419                 SSE2_STEP(4, 0)
00420                 SSE2_STEP(5, 1)
00421                 SSE2_STEP(6, 0)
00422                 SSE2_STEP(7, 1)
00423                 SSE2_STEP(8, 0)
00424                 SSE2_STEP(9, 1)
00425                 SSE2_STEP(10, 0)
00426                 SSE2_STEP(11, 1)
00427                 SSE2_STEP(12, 0)
00428                 SSE2_STEP(13, 1)
00429                 SSE2_STEP(14, 0)
00430                 SSE2_STEP(15, 1)
00431                 SSE2_STEP(16, 0)
00432                 SSE2_STEP(17, 1)
00433                 SSE2_STEP(18, 0)
00434                 SSE2_STEP(19, 1)
00435                 // loop
                      // Twenty steps fill five columns of each u/v row; move di on by five words.
00436                 AS2(    add             WORD_REG(di), 5*4)
00437                 AS2(    cmp             WORD_REG(di), SSE2_diEnd)
00438                 ASJ(    jne,    0, b)
00439 
                      // Set up the output stage: ax = input, bp = output, di = start of the scratch
                      // area, si = number of words to emit in this pass.
00440                 AS2(    mov             WORD_REG(ax), SSE2_input)
00441                 AS2(    mov             WORD_REG(bp), SSE2_output)
00442                 AS2(    lea             WORD_REG(di), [SSE2_uvStart])           // start of v and u
00443                 AS2(    mov             WORD_REG(si), SSE2_wordsLeft2)
00444 
00445                 ASL(1)  // second inner loop, 16 words each, 5 iterations
                      // Load one 4-word vector from each of the four u rows (rows are 20 words apart).
00446                 AS2(    movdqa  xmm0, [WORD_REG(di)+0*20*4])
00447                 AS2(    movdqa  xmm2, [WORD_REG(di)+2*20*4])
00448                 AS2(    movdqa  xmm3, [WORD_REG(di)+3*20*4])
00449                 AS2(    movdqa  xmm1, [WORD_REG(di)+1*20*4])
00450                 // S2
00451                 AS2(    movdqa  xmm4, xmm0)
00452                 AS2(    pand    xmm0, xmm2)
00453                 AS2(    pxor    xmm0, xmm3)
00454                 AS2(    pxor    xmm2, xmm1)
00455                 AS2(    pxor    xmm2, xmm0)
00456                 AS2(    por             xmm3, xmm4)
00457                 AS2(    pxor    xmm3, xmm1)
00458                 AS2(    pxor    xmm4, xmm2)
00459                 AS2(    movdqa  xmm1, xmm3)
00460                 AS2(    por             xmm3, xmm4)
00461                 AS2(    pxor    xmm3, xmm0)
00462                 AS2(    pand    xmm0, xmm1)
00463                 AS2(    pxor    xmm4, xmm0)
00464                 AS2(    pxor    xmm1, xmm3)
00465                 AS2(    pxor    xmm1, xmm4)
                      // xmm7 is all ones, so this XOR is a bitwise NOT of xmm4.
00466                 AS2(    pxor    xmm4, xmm7)
00467                 // xor with v
00468                 AS2(    pxor    xmm2, [WORD_REG(di)+80*4])
00469                 AS2(    pxor    xmm3, [WORD_REG(di)+80*5])
00470                 AS2(    pxor    xmm1, [WORD_REG(di)+80*6])
00471                 AS2(    pxor    xmm4, [WORD_REG(di)+80*7])
00472                 // exit loop early if less than 16 words left to output
00473                 // this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
00474                 AS2(    cmp             WORD_REG(si), 16)
00475                 ASJ(    jl,             4, f)
00476                 // unpack
                      // 4x4 transpose: turns the four row vectors into four vectors of consecutive
                      // output words, left in xmm2, xmm0, xmm6, xmm3.
00477                 AS2(    movdqa          xmm6, xmm2)
00478                 AS2(    punpckldq       xmm2, xmm3)
00479                 AS2(    movdqa          xmm5, xmm1)
00480                 AS2(    punpckldq       xmm1, xmm4)
00481                 AS2(    movdqa          xmm0, xmm2)
00482                 AS2(    punpcklqdq      xmm2, xmm1)
00483                 AS2(    punpckhqdq      xmm0, xmm1)
00484                 AS2(    punpckhdq       xmm6, xmm3)
00485                 AS2(    punpckhdq       xmm5, xmm4)
00486                 AS2(    movdqa          xmm3, xmm6)
00487                 AS2(    punpcklqdq      xmm6, xmm5)
00488                 AS2(    punpckhqdq      xmm3, xmm5)
00489                 // output keystream
                      // Null input: skip the XOR. Otherwise XOR 64 input bytes, taking the unaligned
                      // variant at label 7 when the input pointer is not 16-byte aligned.
00490                 AS2(    test    WORD_REG(ax), WORD_REG(ax))
00491                 ASJ(    jz,             3, f)
00492                 AS2(    test    eax, 0xf)
00493                 ASJ(    jnz,    7, f)
00494                 AS2(    pxor    xmm2, [WORD_REG(ax)+0*16])
00495                 AS2(    pxor    xmm0, [WORD_REG(ax)+1*16])
00496                 AS2(    pxor    xmm6, [WORD_REG(ax)+2*16])
00497                 AS2(    pxor    xmm3, [WORD_REG(ax)+3*16])
00498                 AS2(    add             WORD_REG(ax), 4*16)
00499                 ASJ(    jmp,    3, f)
00500                 ASL(7)
00501                 AS2(    movdqu  xmm1, [WORD_REG(ax)+0*16])
00502                 AS2(    pxor    xmm2, xmm1)
00503                 AS2(    movdqu  xmm1, [WORD_REG(ax)+1*16])
00504                 AS2(    pxor    xmm0, xmm1)
00505                 AS2(    movdqu  xmm1, [WORD_REG(ax)+2*16])
00506                 AS2(    pxor    xmm6, xmm1)
00507                 AS2(    movdqu  xmm1, [WORD_REG(ax)+3*16])
00508                 AS2(    pxor    xmm3, xmm1)
00509                 AS2(    add             WORD_REG(ax), 4*16)
00510                 ASL(3)
                      // Store 64 bytes; the output pointer's alignment selects movdqa or movdqu (label 8).
00511                 AS2(    test    ebp, 0xf)
00512                 ASJ(    jnz,    8, f)
00513                 AS2(    movdqa  [WORD_REG(bp)+0*16], xmm2)
00514                 AS2(    movdqa  [WORD_REG(bp)+1*16], xmm0)
00515                 AS2(    movdqa  [WORD_REG(bp)+2*16], xmm6)
00516                 AS2(    movdqa  [WORD_REG(bp)+3*16], xmm3)
00517                 ASJ(    jmp,    9, f)
00518                 ASL(8)
00519                 AS2(    movdqu  [WORD_REG(bp)+0*16], xmm2)
00520                 AS2(    movdqu  [WORD_REG(bp)+1*16], xmm0)
00521                 AS2(    movdqu  [WORD_REG(bp)+2*16], xmm6)
00522                 AS2(    movdqu  [WORD_REG(bp)+3*16], xmm3)
00523                 ASL(9)
00524                 // loop
                      // Advance four columns in the scratch area and 64 bytes in the output.
00525                 AS2(    add             WORD_REG(di), 4*4)
00526                 AS2(    add             WORD_REG(bp), 4*16)
00527                 AS2(    sub             WORD_REG(si), 16)
00528                 ASJ(    jnz,    1, b)
00529 
00530                 // outer loop
                      // 80 words done: finish if nothing remains, otherwise save progress and go again.
00531                 AS2(    mov             WORD_REG(si), SSE2_wordsLeft)
00532                 AS2(    sub             WORD_REG(si), 80)
00533                 ASJ(    jz,             6, f)
00534                 AS2(    mov             SSE2_wordsLeft, WORD_REG(si))
00535                 AS2(    mov             SSE2_input, WORD_REG(ax))
00536                 AS2(    mov             SSE2_output, WORD_REG(bp))
00537                 ASJ(    jmp,    2, b)
00538 
00539                 ASL(4)  // final output of less than 16 words
                      // Tail: emit four words per round from the low lanes of xmm2, xmm3, xmm1, xmm4
                      // (no transpose), then shift each register right by one lane for the next round.
00540                 AS2(    test    WORD_REG(ax), WORD_REG(ax))
00541                 ASJ(    jz,             5, f)
00542                 AS2(    movd    xmm0, [WORD_REG(ax)+0*4])
00543                 AS2(    pxor    xmm2, xmm0)
00544                 AS2(    movd    xmm0, [WORD_REG(ax)+1*4])
00545                 AS2(    pxor    xmm3, xmm0)
00546                 AS2(    movd    xmm0, [WORD_REG(ax)+2*4])
00547                 AS2(    pxor    xmm1, xmm0)
00548                 AS2(    movd    xmm0, [WORD_REG(ax)+3*4])
00549                 AS2(    pxor    xmm4, xmm0)
00550                 AS2(    add             WORD_REG(ax), 16)
00551                 ASL(5)
00552                 AS2(    movd    [WORD_REG(bp)+0*4], xmm2)
00553                 AS2(    movd    [WORD_REG(bp)+1*4], xmm3)
00554                 AS2(    movd    [WORD_REG(bp)+2*4], xmm1)
00555                 AS2(    movd    [WORD_REG(bp)+3*4], xmm4)
00556                 AS2(    sub             WORD_REG(si), 4)
00557                 ASJ(    jz,             6, f)
00558                 AS2(    add             WORD_REG(bp), 16)
00559                 AS2(    psrldq  xmm2, 4)
00560                 AS2(    psrldq  xmm3, 4)
00561                 AS2(    psrldq  xmm1, 4)
00562                 AS2(    psrldq  xmm4, 4)
00563                 ASJ(    jmp,    4, b)
00564 
00565                 ASL(6)  // save state
                      // Copy state words 0..9 back from the stack copy; words 10 and 11 come from ecx/edx.
00566                 AS2(    mov             WORD_REG(bx), SSE2_state)
00567                 AS2(    movdqa  xmm0, [SSE2_stateCopy+0*16])
00568                 AS2(    movdqa  [WORD_REG(bx)+0*16], xmm0)
00569                 AS2(    movdqa  xmm0, [SSE2_stateCopy+1*16])
00570                 AS2(    movdqa  [WORD_REG(bx)+1*16], xmm0)
00571                 AS2(    movq    xmm0, QWORD PTR [SSE2_stateCopy+2*16])
00572                 AS2(    movq    QWORD PTR [WORD_REG(bx)+2*16], xmm0)
00573                 AS2(    mov             [WORD_REG(bx)+10*4], ecx)
00574                 AS2(    mov             [WORD_REG(bx)+11*4], edx)
00575 
                      // Epilogue: popping into sp reloads the caller's sp saved in frame slot 0, then
                      // bp is restored.
00576                 AS_POP(                 sp)
00577                 AS_POP(                 bp)
00578 
00579 #ifdef __GNUC__
00580                 AS_POP(                 bx)
00581                 ".att_syntax prefix;"
00582                         :
00583                         : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_mulTables), "D" (output), "d" (input)
00584                         : "memory", "cc"
00585                 );
00586 #endif
00587         }
00588         else
00589 #endif
00590         {
              // Portable path.
              // MUL_A: table-driven transform using entries [0, 256). The x86/x64 form rotates its
              // argument in place (it modifies x) and indexes with the low byte; the generic form
              // shifts and indexes with the top byte.
00591 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
00592 #define MUL_A(x)    (x = rotlFixed(x, 8), x ^ s_mulTables[byte(x)])
00593 #else
00594 #define MUL_A(x)    (((x) << 8) ^ s_mulTables[(x) >> 24])
00595 #endif
00596 
              // DIV_A: table-driven transform using entries [256, 512), indexed by the low byte.
00597 #define DIV_A(x)    (((x) >> 8) ^ s_mulTables[256 + byte(x)])
00598 
              // r1(i) / r2(i) select reg1 or reg2 by the parity of i, so the registers trade roles each step.
00599 #define r1(i) ((i%2) ? reg2 : reg1)
00600 #define r2(i) ((i%2) ? reg1 : reg2)
00601 
              // STEP: u = (s[x9] + r2) ^ r1;  v = old s[x0];
              //       s[x0] = MUL_A(s[x0]) ^ DIV_A(s[x3]) ^ s[x9];
              //       r1 += XMUX(r2, s[x2], s[x9]);  r2 = rotlFixed(r2 * 0x54655307, 7).
              // XMUX is the macro defined inside CipherResynchronize above.
00602 #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u)      \
00603                 u = (s##x9 + r2(x0)) ^ r1(x0);\
00604                 v = s##x0;\
00605                 s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
00606                 r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
00607                 r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
00608 
              // SOSEMANUK_OUTPUT: hands four little-endian words to the output macro, pairing the
              // transformed u values with the saved v values as u2^v0, u3^v1, u1^v2, u4^v3.
00609 #define SOSEMANUK_OUTPUT(x)     \
00610         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
00611         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, u3 ^ v1);\
00612         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, u1 ^ v2);\
00613         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, u4 ^ v3);
00614 
              // OUTPUT4: apply S2 to u0..u4 (macro from serpentp.h, not shown; presumably u4 is its
              // extra working word -- TODO confirm), then emit 4*4 bytes via the output switch.
00615 #define OUTPUT4 \
00616         S2(0, u0, u1, u2, u3, u4);\
00617         CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SOSEMANUK_OUTPUT, 4*4);
00618 
              // Work on local copies of the state; they are written back after the loop.
00619         word32 s0 = m_state[0];
00620         word32 s1 = m_state[1];
00621         word32 s2 = m_state[2];
00622         word32 s3 = m_state[3];
00623         word32 s4 = m_state[4];
00624         word32 s5 = m_state[5];
00625         word32 s6 = m_state[6];
00626         word32 s7 = m_state[7];
00627         word32 s8 = m_state[8];
00628         word32 s9 = m_state[9];
00629         word32 reg1 = m_state[10];
00630         word32 reg2 = m_state[11];
00631         word32 u0, u1, u2, u3, u4, v0, v1, v2, v3;
00632 
              // Each pass runs 20 STEPs in five groups of four, with the state-word roles rotating
              // by one position per step; each group is followed by one 16-byte OUTPUT4.
00633         do
00634         {
00635                 STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v0, u0)
00636                 STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v1, u1)
00637                 STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v2, u2)
00638                 STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v3, u3)
00639                 OUTPUT4
00640                 STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v0, u0)
00641                 STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v1, u1)
00642                 STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v2, u2)
00643                 STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v3, u3)
00644                 OUTPUT4
00645                 STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v0, u0)
00646                 STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v1, u1)
00647                 STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v2, u2)
00648                 STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v3, u3)
00649                 OUTPUT4
00650                 STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v0, u0)
00651                 STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v1, u1)
00652                 STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v2, u2)
00653                 STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v3, u3)
00654                 OUTPUT4
00655                 STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v0, u0)
00656                 STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v1, u1)
00657                 STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v2, u2)
00658                 STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v3, u3)
00659                 OUTPUT4
00660         }
00661         while (--iterationCount);
00662 
              // Write the updated state back for the next call.
00663         m_state[0] = s0;
00664         m_state[1] = s1;
00665         m_state[2] = s2;
00666         m_state[3] = s3;
00667         m_state[4] = s4;
00668         m_state[5] = s5;
00669         m_state[6] = s6;
00670         m_state[7] = s7;
00671         m_state[8] = s8;
00672         m_state[9] = s9;
00673         m_state[10] = reg1;
00674         m_state[11] = reg2;
00675         }
00676 }
00677 
00678 NAMESPACE_END

Generated on Fri Jun 1 11:11:25 2007 for Crypto++ by  doxygen 1.5.2