Crypto++: integer.cpp Source File

00001 // integer.cpp - written and placed in the public domain by Wei Dai
00002 // contains public domain code contributed by Alister Lee and Leonard Janke
00003 
00004 #include "pch.h"
00005 
00006 #ifndef CRYPTOPP_IMPORTS
00007 
00008 #include "integer.h"
00009 #include "modarith.h"
00010 #include "nbtheory.h"
00011 #include "asn.h"
00012 #include "oids.h"
00013 #include "words.h"
00014 #include "algparam.h"
00015 #include "pubkey.h"             // for P1363_KDF2
00016 #include "sha.h"
00017 #include "cpu.h"
00018 
00019 #include <iostream>
00020 
00021 #if _MSC_VER >= 1400
00022         #include <intrin.h>
00023 #endif
00024 
00025 #ifdef __DECCXX
00026         #include <c_asm.h>
00027 #endif
00028 
00029 #ifdef CRYPTOPP_MSVC6_NO_PP
00030         #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
00031 #endif
00032 
00033 #define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
00034 
00035 NAMESPACE_BEGIN(CryptoPP)
00036 
00037 bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
00038 {
00039         if (valueType != typeid(Integer))
00040                 return false;
00041         *reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
00042         return true;
00043 }
00044 
00045 inline static int Compare(const word *A, const word *B, size_t N)
00046 {
00047         while (N--)
00048                 if (A[N] > B[N])
00049                         return 1;
00050                 else if (A[N] < B[N])
00051                         return -1;
00052 
00053         return 0;
00054 }
00055 
00056 inline static int Increment(word *A, size_t N, word B=1)
00057 {
00058         assert(N);
00059         word t = A[0];
00060         A[0] = t+B;
00061         if (A[0] >= t)
00062                 return 0;
00063         for (unsigned i=1; i<N; i++)
00064                 if (++A[i])
00065                         return 0;
00066         return 1;
00067 }
00068 
00069 inline static int Decrement(word *A, size_t N, word B=1)
00070 {
00071         assert(N);
00072         word t = A[0];
00073         A[0] = t-B;
00074         if (A[0] <= t)
00075                 return 0;
00076         for (unsigned i=1; i<N; i++)
00077                 if (A[i]--)
00078                         return 0;
00079         return 1;
00080 }
00081 
00082 static void TwosComplement(word *A, size_t N)
00083 {
00084         Decrement(A, N);
00085         for (unsigned i=0; i<N; i++)
00086                 A[i] = ~A[i];
00087 }
00088 
00089 static word AtomicInverseModPower2(word A)
00090 {
00091         assert(A%2==1);
00092 
00093         word R=A%8;
00094 
00095         for (unsigned i=3; i<WORD_BITS; i*=2)
00096                 R = R*(2-R*A);
00097 
00098         assert(R*A==1);
00099         return R;
00100 }
00101 
00102 // ********************************************************
00103 
00104 #if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || CRYPTOPP_BOOL_X64
00105         #define Declare2Words(x)                        word x##0, x##1;
00106         #define AssignWord(a, b)                        a##0 = b; a##1 = 0;
00107         #define Add2WordsBy1(a, b, c)           a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
00108         #define LowWord(a)                                      a##0
00109         #define HighWord(a)                                     a##1
00110         #ifdef _MSC_VER
00111                 #define MultiplyWords(p, a, b)  p##0 = _umul128(a, b, &p##1);
00112                 #define Double3Words(c, d)              d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
00113         #elif defined(__DECCXX)
00114                 #define MultiplyWords(p, a, b)  p##0 = a*b; p##1 = asm("umulh %a0, %a1, %v0", a, b);
00115         #elif CRYPTOPP_BOOL_X64
00116                 #define MultiplyWords(p, a, b)  asm ("mulq %3" : "=a"(p##0), "=d"(p##1) : "a"(a), "g"(b) : "cc");
00117                 #define MulAcc(c, d, a, b)              asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
00118                 #define Double3Words(c, d)              asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
00119                 #define Acc2WordsBy1(a, b)              asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
00120                 #define Acc2WordsBy2(a, b)              asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
00121                 #define Acc3WordsBy2(c, d, e)   asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
00122         #endif
00123         #ifndef Double3Words
00124                 #define Double3Words(c, d)              d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
00125         #endif
00126         #ifndef Acc2WordsBy2
00127                 #define Acc2WordsBy2(a, b)              a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
00128         #endif
00129         #define AddWithCarry(u, a, b)           {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
00130         #define SubtractWithBorrow(u, a, b)     {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
00131         #define GetCarry(u)                                     u##1
00132         #define GetBorrow(u)                            u##1
00133 #else
00134         #define Declare2Words(x)                        dword x;
00135         #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
00136                 #define MultiplyWords(p, a, b)          p = __emulu(a, b);
00137         #else
00138                 #define MultiplyWords(p, a, b)          p = (dword)a*b;
00139         #endif
00140         #define AssignWord(a, b)                        a = b;
00141         #define Add2WordsBy1(a, b, c)           a = b + c;
00142         #define Acc2WordsBy2(a, b)                      a += b;
00143         #define LowWord(a)                                      word(a)
00144         #define HighWord(a)                                     word(a>>WORD_BITS)
00145         #define Double3Words(c, d)                      d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
00146         #define AddWithCarry(u, a, b)           u = dword(a) + b + GetCarry(u);
00147         #define SubtractWithBorrow(u, a, b)     u = dword(a) - b - GetBorrow(u);
00148         #define GetCarry(u)                                     HighWord(u)
00149         #define GetBorrow(u)                            word(u>>(WORD_BITS*2-1))
00150 #endif
00151 #ifndef MulAcc
00152         #define MulAcc(c, d, a, b)                      MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
00153 #endif
00154 #ifndef Acc2WordsBy1
00155         #define Acc2WordsBy1(a, b)                      Add2WordsBy1(a, a, b)
00156 #endif
00157 #ifndef Acc3WordsBy2
00158         #define Acc3WordsBy2(c, d, e)           Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
00159 #endif
00160 
00161 class DWord
00162 {
00163 public:
00164         DWord() {}
00165 
00166 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00167         explicit DWord(word low)
00168         {
00169                 m_whole = low;
00170         }
00171 #else
00172         explicit DWord(word low)
00173         {
00174                 m_halfs.low = low;
00175                 m_halfs.high = 0;
00176         }
00177 #endif
00178 
00179         DWord(word low, word high)
00180         {
00181                 m_halfs.low = low;
00182                 m_halfs.high = high;
00183         }
00184 
00185         static DWord Multiply(word a, word b)
00186         {
00187                 DWord r;
00188                 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00189                         r.m_whole = (dword)a * b;
00190                 #else
00191                         r.m_halfs.low = _umul128(a, b, &r.m_halfs.high);
00192                 #endif
00193                 return r;
00194         }
00195 
00196         static DWord MultiplyAndAdd(word a, word b, word c)
00197         {
00198                 DWord r = Multiply(a, b);
00199                 return r += c;
00200         }
00201 
00202         DWord & operator+=(word a)
00203         {
00204                 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00205                         m_whole = m_whole + a;
00206                 #else
00207                         m_halfs.low += a;
00208                         m_halfs.high += (m_halfs.low < a);
00209                 #endif
00210                 return *this;
00211         }
00212 
00213         DWord operator+(word a)
00214         {
00215                 DWord r;
00216                 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00217                         r.m_whole = m_whole + a;
00218                 #else
00219                         r.m_halfs.low = m_halfs.low + a;
00220                         r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
00221                 #endif
00222                 return r;
00223         }
00224 
00225         DWord operator-(DWord a)
00226         {
00227                 DWord r;
00228                 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00229                         r.m_whole = m_whole - a.m_whole;
00230                 #else
00231                         r.m_halfs.low = m_halfs.low - a.m_halfs.low;
00232                         r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
00233                 #endif
00234                 return r;
00235         }
00236 
00237         DWord operator-(word a)
00238         {
00239                 DWord r;
00240                 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00241                         r.m_whole = m_whole - a;
00242                 #else
00243                         r.m_halfs.low = m_halfs.low - a;
00244                         r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
00245                 #endif
00246                 return r;
00247         }
00248 
00249         // returns quotient, which must fit in a word
00250         word operator/(word divisor);
00251 
00252         word operator%(word a);
00253 
00254         bool operator!() const
00255         {
00256         #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00257                 return !m_whole;
00258         #else
00259                 return !m_halfs.high && !m_halfs.low;
00260         #endif
00261         }
00262 
00263         word GetLowHalf() const {return m_halfs.low;}
00264         word GetHighHalf() const {return m_halfs.high;}
00265         word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
00266 
00267 private:
00268         union
00269         {
00270         #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00271                 dword m_whole;
00272         #endif
00273                 struct
00274                 {
00275                 #ifdef IS_LITTLE_ENDIAN
00276                         word low;
00277                         word high;
00278                 #else
00279                         word high;
00280                         word low;
00281                 #endif
00282                 } m_halfs;
00283         };
00284 };
00285 
00286 class Word
00287 {
00288 public:
00289         Word() {}
00290 
00291         Word(word value)
00292         {
00293                 m_whole = value;
00294         }
00295 
00296         Word(hword low, hword high)
00297         {
00298                 m_whole = low | (word(high) << (WORD_BITS/2));
00299         }
00300 
00301         static Word Multiply(hword a, hword b)
00302         {
00303                 Word r;
00304                 r.m_whole = (word)a * b;
00305                 return r;
00306         }
00307 
00308         Word operator-(Word a)
00309         {
00310                 Word r;
00311                 r.m_whole = m_whole - a.m_whole;
00312                 return r;
00313         }
00314 
00315         Word operator-(hword a)
00316         {
00317                 Word r;
00318                 r.m_whole = m_whole - a;
00319                 return r;
00320         }
00321 
00322         // returns quotient, which must fit in a word
00323         hword operator/(hword divisor)
00324         {
00325                 return hword(m_whole / divisor);
00326         }
00327 
00328         bool operator!() const
00329         {
00330                 return !m_whole;
00331         }
00332 
00333         word GetWhole() const {return m_whole;}
00334         hword GetLowHalf() const {return hword(m_whole);}
00335         hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
00336         hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
00337         
00338 private:
00339         word m_whole;
00340 };
00341 
00342 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
00343 template <class S, class D>
00344 S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
00345 {
00346         // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
00347         assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
00348 
00349         // estimate the quotient: do a 2 S by 1 S divide
00350         S Q;
00351         if (S(B1+1) == 0)
00352                 Q = A[2];
00353         else
00354                 Q = D(A[1], A[2]) / S(B1+1);
00355 
00356         // now subtract Q*B from A
00357         D p = D::Multiply(B0, Q);
00358         D u = (D) A[0] - p.GetLowHalf();
00359         A[0] = u.GetLowHalf();
00360         u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
00361         A[1] = u.GetLowHalf();
00362         A[2] += u.GetHighHalf();
00363 
00364         // Q <= actual quotient, so fix it
00365         while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
00366         {
00367                 u = (D) A[0] - B0;
00368                 A[0] = u.GetLowHalf();
00369                 u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
00370                 A[1] = u.GetLowHalf();
00371                 A[2] += u.GetHighHalf();
00372                 Q++;
00373                 assert(Q);      // shouldn't overflow
00374         }
00375 
00376         return Q;
00377 }
00378 
00379 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
00380 template <class S, class D>
00381 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
00382 {
00383         if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
00384                 return D(Ah.GetLowHalf(), Ah.GetHighHalf());
00385         else
00386         {
00387                 S Q[2];
00388                 T[0] = Al.GetLowHalf();
00389                 T[1] = Al.GetHighHalf(); 
00390                 T[2] = Ah.GetLowHalf();
00391                 T[3] = Ah.GetHighHalf();
00392                 Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
00393                 Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
00394                 return D(Q[0], Q[1]);
00395         }
00396 }
00397 
00398 // returns quotient, which must fit in a word
00399 inline word DWord::operator/(word a)
00400 {
00401         #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00402                 return word(m_whole / a);
00403         #else
00404                 hword r[4];
00405                 return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
00406         #endif
00407 }
00408 
00409 inline word DWord::operator%(word a)
00410 {
00411         #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
00412                 return word(m_whole % a);
00413         #else
00414                 if (a < (word(1) << (WORD_BITS/2)))
00415                 {
00416                         hword h = hword(a);
00417                         word r = m_halfs.high % h;
00418                         r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
00419                         return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
00420                 }
00421                 else
00422                 {
00423                         hword r[4];
00424                         DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
00425                         return Word(r[0], r[1]).GetWhole();
00426                 }
00427         #endif
00428 }
00429 
00430 // ********************************************************
00431 
00432 // use some tricks to share assembly code between MSVC and GCC
00433 #if defined(__GNUC__)
00434         #define AddPrologue \
00435                 int result;     \
00436                 __asm__ __volatile__ \
00437                 ( \
00438                         ".intel_syntax noprefix;"
00439         #define AddEpilogue \
00440                         ".att_syntax prefix;" \
00441                                         : "=a" (result)\
00442                                         : "d" (C), "a" (A), "D" (B), "c" (N) \
00443                                         : "%esi", "memory", "cc" \
00444                 );\
00445                 return result;
00446         #define MulPrologue \
00447                 __asm__ __volatile__ \
00448                 ( \
00449                         ".intel_syntax noprefix;" \
00450                         AS1(    push    ebx) \
00451                         AS2(    mov             ebx, edx)
00452         #define MulEpilogue \
00453                         AS1(    pop             ebx) \
00454                         ".att_syntax prefix;" \
00455                         : \
00456                         : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
00457                         : "%esi", "memory", "cc" \
00458                 );
00459         #define SquPrologue             MulPrologue
00460         #define SquEpilogue     \
00461                         AS1(    pop             ebx) \
00462                         ".att_syntax prefix;" \
00463                         : \
00464                         : "d" (s_maskLow16), "c" (C), "a" (A) \
00465                         : "%esi", "%edi", "memory", "cc" \
00466                 );
00467         #define TopPrologue             MulPrologue
00468         #define TopEpilogue     \
00469                         AS1(    pop             ebx) \
00470                         ".att_syntax prefix;" \
00471                         : \
00472                         : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
00473                         : "memory", "cc" \
00474                 );
00475 #else
00476         #define AddPrologue \
00477                 __asm   push edi \
00478                 __asm   push esi \
00479                 __asm   mov             eax, [esp+12] \
00480                 __asm   mov             edi, [esp+16]
00481         #define AddEpilogue \
00482                 __asm   pop esi \
00483                 __asm   pop edi \
00484                 __asm   ret 8
00485 #if _MSC_VER < 1300
00486         #define SaveEBX         __asm push ebx
00487         #define RestoreEBX      __asm pop ebx
00488 #else
00489         #define SaveEBX
00490         #define RestoreEBX
00491 #endif
00492         #define SquPrologue                                     \
00493                 AS2(    mov             eax, A)                 \
00494                 AS2(    mov             ecx, C)                 \
00495                 SaveEBX                                                 \
00496                 AS2(    lea             ebx, s_maskLow16)
00497         #define MulPrologue                                     \
00498                 AS2(    mov             eax, A)                 \
00499                 AS2(    mov             edi, B)                 \
00500                 AS2(    mov             ecx, C)                 \
00501                 SaveEBX                                                 \
00502                 AS2(    lea             ebx, s_maskLow16)
00503         #define TopPrologue                                     \
00504                 AS2(    mov             eax, A)                 \
00505                 AS2(    mov             edi, B)                 \
00506                 AS2(    mov             ecx, C)                 \
00507                 AS2(    mov             esi, L)                 \
00508                 SaveEBX                                                 \
00509                 AS2(    lea             ebx, s_maskLow16)
00510         #define SquEpilogue             RestoreEBX
00511         #define MulEpilogue             RestoreEBX
00512         #define TopEpilogue             RestoreEBX
00513 #endif
00514 
00515 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00516 extern "C" {
00517 int Baseline_Add(size_t N, word *C, const word *A, const word *B);
00518 int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
00519 }
00520 #elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__)
00521 int Baseline_Add(size_t N, word *C, const word *A, const word *B)
00522 {
00523         word result;
00524         __asm__ __volatile__
00525         (
00526         ".intel_syntax;"
00527         AS1(    neg             %1)
00528         ASJ(    jz,             1, f)
00529         AS2(    mov             %0,[%3+8*%1])
00530         AS2(    add             %0,[%4+8*%1])
00531         AS2(    mov             [%2+8*%1],%0)
00532         ASL(0)
00533         AS2(    mov             %0,[%3+8*%1+8])
00534         AS2(    adc             %0,[%4+8*%1+8])
00535         AS2(    mov             [%2+8*%1+8],%0)
00536         AS2(    lea             %1,[%1+2])
00537         ASJ(    jrcxz,  1, f)
00538         AS2(    mov             %0,[%3+8*%1])
00539         AS2(    adc             %0,[%4+8*%1])
00540         AS2(    mov             [%2+8*%1],%0)
00541         ASJ(    jmp,    0, b)
00542         ASL(1)
00543         AS2(    mov             %0, 0)
00544         AS2(    adc             %0, %0)
00545         ".att_syntax;"
00546         : "=&r" (result)
00547         : "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
00548         : "memory", "cc"
00549         );
00550         return (int)result;
00551 }
00552 
00553 int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
00554 {
00555         word result;
00556         __asm__ __volatile__
00557         (
00558         ".intel_syntax;"
00559         AS1(    neg             %1)
00560         ASJ(    jz,             1, f)
00561         AS2(    mov             %0,[%3+8*%1])
00562         AS2(    sub             %0,[%4+8*%1])
00563         AS2(    mov             [%2+8*%1],%0)
00564         ASL(0)
00565         AS2(    mov             %0,[%3+8*%1+8])
00566         AS2(    sbb             %0,[%4+8*%1+8])
00567         AS2(    mov             [%2+8*%1+8],%0)
00568         AS2(    lea             %1,[%1+2])
00569         ASJ(    jrcxz,  1, f)
00570         AS2(    mov             %0,[%3+8*%1])
00571         AS2(    sbb             %0,[%4+8*%1])
00572         AS2(    mov             [%2+8*%1],%0)
00573         ASJ(    jmp,    0, b)
00574         ASL(1)
00575         AS2(    mov             %0, 0)
00576         AS2(    adc             %0, %0)
00577         ".att_syntax;"
00578         : "=&r" (result)
00579         : "c" (N), "r" (C+N), "r" (A+N), "r" (B+N)
00580         : "memory", "cc"
00581         );
00582         return (int)result;
00583 }
00584 #elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
00585 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
00586 {
00587         AddPrologue
00588 
00589         // now: eax = A, edi = B, edx = C, ecx = N
00590         AS2(    lea             eax, [eax+4*ecx])
00591         AS2(    lea             edi, [edi+4*ecx])
00592         AS2(    lea             edx, [edx+4*ecx])
00593 
00594         AS1(    neg             ecx)                            // ecx is negative index
00595         AS2(    test    ecx, 2)                         // this clears carry flag
00596         ASJ(    jz,             0, f)
00597         AS2(    sub             ecx, 2)
00598         ASJ(    jmp,    1, f)
00599 
00600         ASL(0)
00601         ASJ(    jecxz,  2, f)                           // loop until ecx overflows and becomes zero
00602         AS2(    mov             esi,[eax+4*ecx])
00603         AS2(    adc             esi,[edi+4*ecx])
00604         AS2(    mov             [edx+4*ecx],esi)
00605         AS2(    mov             esi,[eax+4*ecx+4])
00606         AS2(    adc             esi,[edi+4*ecx+4])
00607         AS2(    mov             [edx+4*ecx+4],esi)
00608         ASL(1)
00609         AS2(    mov             esi,[eax+4*ecx+8])
00610         AS2(    adc             esi,[edi+4*ecx+8])
00611         AS2(    mov             [edx+4*ecx+8],esi)
00612         AS2(    mov             esi,[eax+4*ecx+12])
00613         AS2(    adc             esi,[edi+4*ecx+12])
00614         AS2(    mov             [edx+4*ecx+12],esi)
00615 
00616         AS2(    lea             ecx,[ecx+4])            // advance index, avoid inc which causes slowdown on Intel Core 2
00617         ASJ(    jmp,    0, b)
00618 
00619         ASL(2)
00620         AS2(    mov             eax, 0)
00621         AS1(    setc    al)                                     // store carry into eax (return result register)
00622 
00623         AddEpilogue
00624 }
00625 
00626 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
00627 {
00628         AddPrologue
00629 
00630         // now: eax = A, edi = B, edx = C, ecx = N
00631         AS2(    lea             eax, [eax+4*ecx])
00632         AS2(    lea             edi, [edi+4*ecx])
00633         AS2(    lea             edx, [edx+4*ecx])
00634 
00635         AS1(    neg             ecx)                            // ecx is negative index
00636         AS2(    test    ecx, 2)                         // this clears carry flag
00637         ASJ(    jz,             0, f)
00638         AS2(    sub             ecx, 2)
00639         ASJ(    jmp,    1, f)
00640 
00641         ASL(0)
00642         ASJ(    jecxz,  2, f)                           // loop until ecx overflows and becomes zero
00643         AS2(    mov             esi,[eax+4*ecx])
00644         AS2(    sbb             esi,[edi+4*ecx])
00645         AS2(    mov             [edx+4*ecx],esi)
00646         AS2(    mov             esi,[eax+4*ecx+4])
00647         AS2(    sbb             esi,[edi+4*ecx+4])
00648         AS2(    mov             [edx+4*ecx+4],esi)
00649         ASL(1)
00650         AS2(    mov             esi,[eax+4*ecx+8])
00651         AS2(    sbb             esi,[edi+4*ecx+8])
00652         AS2(    mov             [edx+4*ecx+8],esi)
00653         AS2(    mov             esi,[eax+4*ecx+12])
00654         AS2(    sbb             esi,[edi+4*ecx+12])
00655         AS2(    mov             [edx+4*ecx+12],esi)
00656 
00657         AS2(    lea             ecx,[ecx+4])            // advance index, avoid inc which causes slowdown on Intel Core 2
00658         ASJ(    jmp,    0, b)
00659 
00660         ASL(2)
00661         AS2(    mov             eax, 0)
00662         AS1(    setc    al)                                     // store carry into eax (return result register)
00663 
00664         AddEpilogue
00665 }
00666 
00667 #if CRYPTOPP_INTEGER_SSE2
00668 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
00669 {
00670         AddPrologue
00671 
00672         // now: eax = A, edi = B, edx = C, ecx = N
00673         AS2(    lea             eax, [eax+4*ecx])
00674         AS2(    lea             edi, [edi+4*ecx])
00675         AS2(    lea             edx, [edx+4*ecx])
00676 
00677         AS1(    neg             ecx)                            // ecx is negative index
00678         AS2(    pxor    mm2, mm2)
00679         ASJ(    jz,             2, f)
00680         AS2(    test    ecx, 2)                         // this clears carry flag
00681         ASJ(    jz,             0, f)
00682         AS2(    sub             ecx, 2)
00683         ASJ(    jmp,    1, f)
00684 
00685         ASL(0)
00686         AS2(    movd     mm0, DWORD PTR [eax+4*ecx])
00687         AS2(    movd     mm1, DWORD PTR [edi+4*ecx])
00688         AS2(    paddq    mm0, mm1)
00689         AS2(    paddq    mm2, mm0)
00690         AS2(    movd     DWORD PTR [edx+4*ecx], mm2)
00691         AS2(    psrlq    mm2, 32)
00692 
00693         AS2(    movd     mm0, DWORD PTR [eax+4*ecx+4])
00694         AS2(    movd     mm1, DWORD PTR [edi+4*ecx+4])
00695         AS2(    paddq    mm0, mm1)
00696         AS2(    paddq    mm2, mm0)
00697         AS2(    movd     DWORD PTR [edx+4*ecx+4], mm2)
00698         AS2(    psrlq    mm2, 32)
00699 
00700         ASL(1)
00701         AS2(    movd     mm0, DWORD PTR [eax+4*ecx+8])
00702         AS2(    movd     mm1, DWORD PTR [edi+4*ecx+8])
00703         AS2(    paddq    mm0, mm1)
00704         AS2(    paddq    mm2, mm0)
00705         AS2(    movd     DWORD PTR [edx+4*ecx+8], mm2)
00706         AS2(    psrlq    mm2, 32)
00707 
00708         AS2(    movd     mm0, DWORD PTR [eax+4*ecx+12])
00709         AS2(    movd     mm1, DWORD PTR [edi+4*ecx+12])
00710         AS2(    paddq    mm0, mm1)
00711         AS2(    paddq    mm2, mm0)
00712         AS2(    movd     DWORD PTR [edx+4*ecx+12], mm2)
00713         AS2(    psrlq    mm2, 32)
00714 
00715         AS2(    add             ecx, 4)
00716         ASJ(    jnz,    0, b)
00717 
00718         ASL(2)
00719         AS2(    movd    eax, mm2)
00720         AS1(    emms)
00721 
00722         AddEpilogue
00723 }
00724 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
00725 {
00726         AddPrologue
00727 
00728         // now: eax = A, edi = B, edx = C, ecx = N
00729         AS2(    lea             eax, [eax+4*ecx])
00730         AS2(    lea             edi, [edi+4*ecx])
00731         AS2(    lea             edx, [edx+4*ecx])
00732 
00733         AS1(    neg             ecx)                            // ecx is negative index
00734         AS2(    pxor    mm2, mm2)
00735         ASJ(    jz,             2, f)
00736         AS2(    test    ecx, 2)                         // this clears carry flag
00737         ASJ(    jz,             0, f)
00738         AS2(    sub             ecx, 2)
00739         ASJ(    jmp,    1, f)
00740 
00741         ASL(0)
00742         AS2(    movd     mm0, DWORD PTR [eax+4*ecx])
00743         AS2(    movd     mm1, DWORD PTR [edi+4*ecx])
00744         AS2(    psubq    mm0, mm1)
00745         AS2(    psubq    mm0, mm2)
00746         AS2(    movd     DWORD PTR [edx+4*ecx], mm0)
00747         AS2(    psrlq    mm0, 63)
00748 
00749         AS2(    movd     mm2, DWORD PTR [eax+4*ecx+4])
00750         AS2(    movd     mm1, DWORD PTR [edi+4*ecx+4])
00751         AS2(    psubq    mm2, mm1)
00752         AS2(    psubq    mm2, mm0)
00753         AS2(    movd     DWORD PTR [edx+4*ecx+4], mm2)
00754         AS2(    psrlq    mm2, 63)
00755 
00756         ASL(1)
00757         AS2(    movd     mm0, DWORD PTR [eax+4*ecx+8])
00758         AS2(    movd     mm1, DWORD PTR [edi+4*ecx+8])
00759         AS2(    psubq    mm0, mm1)
00760         AS2(    psubq    mm0, mm2)
00761         AS2(    movd     DWORD PTR [edx+4*ecx+8], mm0)
00762         AS2(    psrlq    mm0, 63)
00763 
00764         AS2(    movd     mm2, DWORD PTR [eax+4*ecx+12])
00765         AS2(    movd     mm1, DWORD PTR [edi+4*ecx+12])
00766         AS2(    psubq    mm2, mm1)
00767         AS2(    psubq    mm2, mm0)
00768         AS2(    movd     DWORD PTR [edx+4*ecx+12], mm2)
00769         AS2(    psrlq    mm2, 63)
00770 
00771         AS2(    add             ecx, 4)
00772         ASJ(    jnz,    0, b)
00773 
00774         ASL(2)
00775         AS2(    movd    eax, mm2)
00776         AS1(    emms)
00777 
00778         AddEpilogue
00779 }
00780 #endif  // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00781 #else
00782 int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
00783 {
00784         assert (N%2 == 0);
00785 
00786         Declare2Words(u);
00787         AssignWord(u, 0);
00788         for (size_t i=0; i<N; i+=2)
00789         {
00790                 AddWithCarry(u, A[i], B[i]);
00791                 C[i] = LowWord(u);
00792                 AddWithCarry(u, A[i+1], B[i+1]);
00793                 C[i+1] = LowWord(u);
00794         }
00795         return int(GetCarry(u));
00796 }
00797 
00798 int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
00799 {
00800         assert (N%2 == 0);
00801 
00802         Declare2Words(u);
00803         AssignWord(u, 0);
00804         for (size_t i=0; i<N; i+=2)
00805         {
00806                 SubtractWithBorrow(u, A[i], B[i]);
00807                 C[i] = LowWord(u);
00808                 SubtractWithBorrow(u, A[i+1], B[i+1]);
00809                 C[i+1] = LowWord(u);
00810         }
00811         return int(GetBorrow(u));
00812 }
00813 #endif
00814 
00815 static word LinearMultiply(word *C, const word *A, word B, size_t N)
00816 {
00817         word carry=0;
00818         for(unsigned i=0; i<N; i++)
00819         {
00820                 Declare2Words(p);
00821                 MultiplyWords(p, A[i], B);
00822                 Acc2WordsBy1(p, carry);
00823                 C[i] = LowWord(p);
00824                 carry = HighWord(p);
00825         }
00826         return carry;
00827 }
00828 
00829 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
00830 
00831 #define Mul_2 \
00832         Mul_Begin(2) \
00833         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00834         Mul_End(1, 1)
00835 
00836 #define Mul_4 \
00837         Mul_Begin(4) \
00838         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00839         Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
00840         Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
00841         Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
00842         Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
00843         Mul_End(5, 3)
00844 
00845 #define Mul_8 \
00846         Mul_Begin(8) \
00847         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00848         Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
00849         Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
00850         Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
00851         Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
00852         Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
00853         Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
00854         Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
00855         Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
00856         Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
00857         Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
00858         Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
00859         Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
00860         Mul_End(13, 7)
00861 
00862 #define Mul_16 \
00863         Mul_Begin(16) \
00864         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00865         Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
00866         Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
00867         Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
00868         Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
00869         Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
00870         Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
00871         Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
00872         Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
00873         Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
00874         Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
00875         Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
00876         Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
00877         Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
00878         Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
00879         Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
00880         Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
00881         Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
00882         Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
00883         Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
00884         Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
00885         Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
00886         Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
00887         Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
00888         Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
00889         Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
00890         Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
00891         Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
00892         Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
00893         Mul_End(29, 15)
00894 
00895 #define Squ_2 \
00896         Squ_Begin(2) \
00897         Squ_End(2)
00898 
00899 #define Squ_4 \
00900         Squ_Begin(4) \
00901         Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
00902         Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
00903         Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
00904         Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
00905         Squ_End(4)
00906 
00907 #define Squ_8 \
00908         Squ_Begin(8) \
00909         Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
00910         Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
00911         Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
00912         Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
00913         Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
00914         Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
00915         Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
00916         Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
00917         Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
00918         Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
00919         Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
00920         Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
00921         Squ_End(8)
00922 
00923 #define Squ_16 \
00924         Squ_Begin(16) \
00925         Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
00926         Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
00927         Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
00928         Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
00929         Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
00930         Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
00931         Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
00932         Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
00933         Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
00934         Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
00935         Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
00936         Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
00937         Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
00938         Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
00939         Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
00940         Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
00941         Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
00942         Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
00943         Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
00944         Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
00945         Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
00946         Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
00947         Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
00948         Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
00949         Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
00950         Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
00951         Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
00952         Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
00953         Squ_End(16)
00954 
00955 #define Bot_2 \
00956         Mul_Begin(2) \
00957         Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
00958         Bot_End(2)
00959 
00960 #define Bot_4 \
00961         Mul_Begin(4) \
00962         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00963         Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
00964         Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
00965         Bot_End(4)
00966 
00967 #define Bot_8 \
00968         Mul_Begin(8) \
00969         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00970         Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
00971         Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
00972         Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
00973         Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
00974         Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
00975         Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
00976         Bot_End(8)
00977 
00978 #define Bot_16 \
00979         Mul_Begin(16) \
00980         Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
00981         Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
00982         Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
00983         Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
00984         Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
00985         Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
00986         Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
00987         Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
00988         Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
00989         Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
00990         Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
00991         Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
00992         Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
00993         Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
00994         Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
00995         Bot_End(16)
00996         
00997 #endif
00998 
00999 #if 0
01000 #define Mul_Begin(n)                            \
01001         Declare2Words(p)                                \
01002         Declare2Words(c)                                \
01003         Declare2Words(d)                                \
01004         MultiplyWords(p, A[0], B[0])    \
01005         AssignWord(c, LowWord(p))               \
01006         AssignWord(d, HighWord(p))
01007 
01008 #define Mul_Acc(i, j)                           \
01009         MultiplyWords(p, A[i], B[j])    \
01010         Acc2WordsBy1(c, LowWord(p))             \
01011         Acc2WordsBy1(d, HighWord(p))
01012 
01013 #define Mul_SaveAcc(k, i, j)            \
01014         R[k] = LowWord(c);                              \
01015         Add2WordsBy1(c, d, HighWord(c)) \
01016         MultiplyWords(p, A[i], B[j])    \
01017         AssignWord(d, HighWord(p))              \
01018         Acc2WordsBy1(c, LowWord(p))
01019 
01020 #define Mul_End(n)                                      \
01021         R[2*n-3] = LowWord(c);                  \
01022         Acc2WordsBy1(d, HighWord(c))    \
01023         MultiplyWords(p, A[n-1], B[n-1])\
01024         Acc2WordsBy2(d, p)                              \
01025         R[2*n-2] = LowWord(d);                  \
01026         R[2*n-1] = HighWord(d);
01027 
01028 #define Bot_SaveAcc(k, i, j)            \
01029         R[k] = LowWord(c);                              \
01030         word e = LowWord(d) + HighWord(c);      \
01031         e += A[i] * B[j];
01032 
01033 #define Bot_Acc(i, j)   \
01034         e += A[i] * B[j];
01035 
01036 #define Bot_End(n)              \
01037         R[n-1] = e;
01038 #else
01039 #define Mul_Begin(n)                            \
01040         Declare2Words(p)                                \
01041         word c; \
01042         Declare2Words(d)                                \
01043         MultiplyWords(p, A[0], B[0])    \
01044         c = LowWord(p);         \
01045         AssignWord(d, HighWord(p))
01046 
01047 #define Mul_Acc(i, j)                           \
01048         MulAcc(c, d, A[i], B[j])
01049 
01050 #define Mul_SaveAcc(k, i, j)            \
01051         R[k] = c;                               \
01052         c = LowWord(d); \
01053         AssignWord(d, HighWord(d))      \
01054         MulAcc(c, d, A[i], B[j])
01055 
01056 #define Mul_End(k, i)                                   \
01057         R[k] = c;                       \
01058         MultiplyWords(p, A[i], B[i])    \
01059         Acc2WordsBy2(p, d)                              \
01060         R[k+1] = LowWord(p);                    \
01061         R[k+2] = HighWord(p);
01062 
01063 #define Bot_SaveAcc(k, i, j)            \
01064         R[k] = c;                               \
01065         c = LowWord(d); \
01066         c += A[i] * B[j];
01067 
01068 #define Bot_Acc(i, j)   \
01069         c += A[i] * B[j];
01070 
01071 #define Bot_End(n)              \
01072         R[n-1] = c;
01073 #endif
01074 
01075 #define Squ_Begin(n)                            \
01076         Declare2Words(p)                                \
01077         word c;                         \
01078         Declare2Words(d)                                \
01079         Declare2Words(e)                                \
01080         MultiplyWords(p, A[0], A[0])    \
01081         R[0] = LowWord(p);                              \
01082         AssignWord(e, HighWord(p))              \
01083         MultiplyWords(p, A[0], A[1])    \
01084         c = LowWord(p);         \
01085         AssignWord(d, HighWord(p))              \
01086         Squ_NonDiag                                             \
01087 
01088 #define Squ_NonDiag                             \
01089         Double3Words(c, d)
01090 
01091 #define Squ_SaveAcc(k, i, j)            \
01092         Acc3WordsBy2(c, d, e)                   \
01093         R[k] = c;                               \
01094         MultiplyWords(p, A[i], A[j])    \
01095         c = LowWord(p);         \
01096         AssignWord(d, HighWord(p))              \
01097 
01098 #define Squ_Acc(i, j)                           \
01099         MulAcc(c, d, A[i], A[j])
01100 
01101 #define Squ_Diag(i)                                     \
01102         Squ_NonDiag                                             \
01103         MulAcc(c, d, A[i], A[i])
01104 
01105 #define Squ_End(n)                                      \
01106         Acc3WordsBy2(c, d, e)                   \
01107         R[2*n-3] = c;                   \
01108         MultiplyWords(p, A[n-1], A[n-1])\
01109         Acc2WordsBy2(p, e)                              \
01110         R[2*n-2] = LowWord(p);                  \
01111         R[2*n-1] = HighWord(p);
01112 
01113 void Baseline_Multiply2(word *R, const word *A, const word *B)
01114 {
01115         Mul_2
01116 }
01117 
01118 void Baseline_Multiply4(word *R, const word *A, const word *B)
01119 {
01120         Mul_4
01121 }
01122 
01123 void Baseline_Multiply8(word *R, const word *A, const word *B)
01124 {
01125         Mul_8
01126 }
01127 
01128 void Baseline_Square2(word *R, const word *A)
01129 {
01130         Squ_2
01131 }
01132 
01133 void Baseline_Square4(word *R, const word *A)
01134 {
01135         Squ_4
01136 }
01137 
01138 void Baseline_Square8(word *R, const word *A)
01139 {
01140         Squ_8
01141 }
01142 
01143 void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
01144 {
01145         Bot_2
01146 }
01147 
01148 void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
01149 {
01150         Bot_4
01151 }
01152 
01153 void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
01154 {
01155         Bot_8
01156 }
01157 
01158 #define Top_Begin(n)                            \
01159         Declare2Words(p)                                \
01160         word c; \
01161         Declare2Words(d)                                \
01162         MultiplyWords(p, A[0], B[n-2]);\
01163         AssignWord(d, HighWord(p));
01164 
01165 #define Top_Acc(i, j)   \
01166         MultiplyWords(p, A[i], B[j]);\
01167         Acc2WordsBy1(d, HighWord(p));
01168 
01169 #define Top_SaveAcc0(i, j)              \
01170         c = LowWord(d); \
01171         AssignWord(d, HighWord(d))      \
01172         MulAcc(c, d, A[i], B[j])
01173 
01174 #define Top_SaveAcc1(i, j)              \
01175         c = L<c; \
01176         Acc2WordsBy1(d, c);     \
01177         c = LowWord(d); \
01178         AssignWord(d, HighWord(d))      \
01179         MulAcc(c, d, A[i], B[j])
01180 
01181 void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
01182 {
01183         word T[4];
01184         Baseline_Multiply2(T, A, B);
01185         R[0] = T[2];
01186         R[1] = T[3];
01187 }
01188 
01189 void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
01190 {
01191         Top_Begin(4)
01192         Top_Acc(1, 1) Top_Acc(2, 0)  \
01193         Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
01194         Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
01195         Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
01196         Mul_End(1, 3)
01197 }
01198 
01199 void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
01200 {
01201         Top_Begin(8)
01202         Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
01203         Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
01204         Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
01205         Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
01206         Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
01207         Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
01208         Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
01209         Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
01210         Mul_End(5, 7)
01211 }
01212 
01213 #if !CRYPTOPP_INTEGER_SSE2      // save memory by not compiling these functions when SSE2 is available
01214 void Baseline_Multiply16(word *R, const word *A, const word *B)
01215 {
01216         Mul_16
01217 }
01218 
01219 void Baseline_Square16(word *R, const word *A)
01220 {
01221         Squ_16
01222 }
01223 
01224 void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
01225 {
01226         Bot_16
01227 }
01228 
01229 void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
01230 {
01231         Top_Begin(16)
01232         Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
01233         Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
01234         Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
01235         Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
01236         Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
01237         Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
01238         Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
01239         Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
01240         Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
01241         Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
01242         Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
01243         Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
01244         Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
01245         Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
01246         Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
01247         Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
01248         Mul_End(13, 15)
01249 }
01250 #endif
01251 
01252 // ********************************************************
01253 
01254 #if CRYPTOPP_INTEGER_SSE2
01255 
01256 CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
01257 
01258 #undef Mul_Begin
01259 #undef Mul_Acc
01260 #undef Top_Begin
01261 #undef Top_Acc
01262 #undef Squ_Acc
01263 #undef Squ_NonDiag
01264 #undef Squ_Diag
01265 #undef Squ_SaveAcc
01266 #undef Squ_Begin
01267 #undef Mul_SaveAcc
01268 #undef Bot_Acc
01269 #undef Bot_SaveAcc
01270 #undef Bot_End
01271 #undef Squ_End
01272 #undef Mul_End
01273 
01274 #define SSE2_FinalSave(k)                       \
01275         AS2(    psllq           xmm5, 16)       \
01276         AS2(    paddq           xmm4, xmm5)     \
01277         AS2(    movq            QWORD PTR [ecx+8*(k)], xmm4)
01278 
01279 #define SSE2_SaveShift(k)                       \
01280         AS2(    movq            xmm0, xmm6)     \
01281         AS2(    punpckhqdq      xmm6, xmm0)     \
01282         AS2(    movq            xmm1, xmm7)     \
01283         AS2(    punpckhqdq      xmm7, xmm1)     \
01284         AS2(    paddd           xmm6, xmm0)     \
01285         AS2(    pslldq          xmm6, 4)        \
01286         AS2(    paddd           xmm7, xmm1)     \
01287         AS2(    paddd           xmm4, xmm6)     \
01288         AS2(    pslldq          xmm7, 4)        \
01289         AS2(    movq            xmm6, xmm4)     \
01290         AS2(    paddd           xmm5, xmm7)     \
01291         AS2(    movq            xmm7, xmm5)     \
01292         AS2(    movd            DWORD PTR [ecx+8*(k)], xmm4)    \
01293         AS2(    psrlq           xmm6, 16)       \
01294         AS2(    paddq           xmm6, xmm7)     \
01295         AS2(    punpckhqdq      xmm4, xmm0)     \
01296         AS2(    punpckhqdq      xmm5, xmm0)     \
01297         AS2(    movq            QWORD PTR [ecx+8*(k)+2], xmm6)  \
01298         AS2(    psrlq           xmm6, 3*16)     \
01299         AS2(    paddd           xmm4, xmm6)     \
01300 
01301 #define Squ_SSE2_SaveShift(k)                   \
01302         AS2(    movq            xmm0, xmm6)     \
01303         AS2(    punpckhqdq      xmm6, xmm0)     \
01304         AS2(    movq            xmm1, xmm7)     \
01305         AS2(    punpckhqdq      xmm7, xmm1)     \
01306         AS2(    paddd           xmm6, xmm0)     \
01307         AS2(    pslldq          xmm6, 4)        \
01308         AS2(    paddd           xmm7, xmm1)     \
01309         AS2(    paddd           xmm4, xmm6)     \
01310         AS2(    pslldq          xmm7, 4)        \
01311         AS2(    movhlps         xmm6, xmm4)     \
01312         AS2(    movd            DWORD PTR [ecx+8*(k)], xmm4)    \
01313         AS2(    paddd           xmm5, xmm7)     \
01314         AS2(    movhps          QWORD PTR [esp+12], xmm5)\
01315         AS2(    psrlq           xmm4, 16)       \
01316         AS2(    paddq           xmm4, xmm5)     \
01317         AS2(    movq            QWORD PTR [ecx+8*(k)+2], xmm4)  \
01318         AS2(    psrlq           xmm4, 3*16)     \
01319         AS2(    paddd           xmm4, xmm6)     \
01320         AS2(    movq            QWORD PTR [esp+4], xmm4)\
01321 
01322 #define SSE2_FirstMultiply(i)                           \
01323         AS2(    movdqa          xmm7, [esi+(i)*16])\
01324         AS2(    movdqa          xmm5, [edi-(i)*16])\
01325         AS2(    pmuludq         xmm5, xmm7)             \
01326         AS2(    movdqa          xmm4, [ebx])\
01327         AS2(    movdqa          xmm6, xmm4)             \
01328         AS2(    pand            xmm4, xmm5)             \
01329         AS2(    psrld           xmm5, 16)               \
01330         AS2(    pmuludq         xmm7, [edx-(i)*16])\
01331         AS2(    pand            xmm6, xmm7)             \
01332         AS2(    psrld           xmm7, 16)
01333 
01334 #define Squ_Begin(n)                                                    \
01335         SquPrologue                                                                     \
01336         AS2(    mov             esi, esp)\
01337         AS2(    and             esp, 0xfffffff0)\
01338         AS2(    lea             edi, [esp-32*n])\
01339         AS2(    sub             esp, 32*n+16)\
01340         AS1(    push    esi)\
01341         AS2(    mov             esi, edi)                                       \
01342         AS2(    xor             edx, edx)                                       \
01343         ASL(1)                                                                          \
01344         ASS(    pshufd  xmm0, [eax+edx], 3,1,2,0)       \
01345         ASS(    pshufd  xmm1, [eax+edx], 2,0,3,1)       \
01346         AS2(    movdqa  [edi+2*edx], xmm0)              \
01347         AS2(    psrlq   xmm0, 32)                                       \
01348         AS2(    movdqa  [edi+2*edx+16], xmm0)   \
01349         AS2(    movdqa  [edi+16*n+2*edx], xmm1)         \
01350         AS2(    psrlq   xmm1, 32)                                       \
01351         AS2(    movdqa  [edi+16*n+2*edx+16], xmm1)      \
01352         AS2(    add             edx, 16)                                        \
01353         AS2(    cmp             edx, 8*(n))                                     \
01354         ASJ(    jne,    1, b)                                           \
01355         AS2(    lea             edx, [edi+16*n])\
01356         SSE2_FirstMultiply(0)                                                   \
01357 
01358 #define Squ_Acc(i)                                                              \
01359         ASL(LSqu##i)                                                            \
01360         AS2(    movdqa          xmm1, [esi+(i)*16])     \
01361         AS2(    movdqa          xmm0, [edi-(i)*16])     \
01362         AS2(    movdqa          xmm2, [ebx])    \
01363         AS2(    pmuludq         xmm0, xmm1)                             \
01364         AS2(    pmuludq         xmm1, [edx-(i)*16])     \
01365         AS2(    movdqa          xmm3, xmm2)                     \
01366         AS2(    pand            xmm2, xmm0)                     \
01367         AS2(    psrld           xmm0, 16)                       \
01368         AS2(    paddd           xmm4, xmm2)                     \
01369         AS2(    paddd           xmm5, xmm0)                     \
01370         AS2(    pand            xmm3, xmm1)                     \
01371         AS2(    psrld           xmm1, 16)                       \
01372         AS2(    paddd           xmm6, xmm3)                     \
01373         AS2(    paddd           xmm7, xmm1)             \
01374 
01375 #define Squ_Acc1(i)             
01376 #define Squ_Acc2(i)             ASC(call, LSqu##i)
01377 #define Squ_Acc3(i)             Squ_Acc2(i)
01378 #define Squ_Acc4(i)             Squ_Acc2(i)
01379 #define Squ_Acc5(i)             Squ_Acc2(i)
01380 #define Squ_Acc6(i)             Squ_Acc2(i)
01381 #define Squ_Acc7(i)             Squ_Acc2(i)
01382 #define Squ_Acc8(i)             Squ_Acc2(i)
01383 
01384 #define SSE2_End(E, n)                                  \
01385         SSE2_SaveShift(2*(n)-3)                 \
01386         AS2(    movdqa          xmm7, [esi+16]) \
01387         AS2(    movdqa          xmm0, [edi])    \
01388         AS2(    pmuludq         xmm0, xmm7)                             \
01389         AS2(    movdqa          xmm2, [ebx])            \
01390         AS2(    pmuludq         xmm7, [edx])    \
01391         AS2(    movdqa          xmm6, xmm2)                             \
01392         AS2(    pand            xmm2, xmm0)                             \
01393         AS2(    psrld           xmm0, 16)                               \
01394         AS2(    paddd           xmm4, xmm2)                             \
01395         AS2(    paddd           xmm5, xmm0)                             \
01396         AS2(    pand            xmm6, xmm7)                             \
01397         AS2(    psrld           xmm7, 16)       \
01398         SSE2_SaveShift(2*(n)-2)                 \
01399         SSE2_FinalSave(2*(n)-1)                 \
01400         AS1(    pop             esp)\
01401         E
01402 
01403 #define Squ_End(n)              SSE2_End(SquEpilogue, n)
01404 #define Mul_End(n)              SSE2_End(MulEpilogue, n)
01405 #define Top_End(n)              SSE2_End(TopEpilogue, n)
01406 
01407 #define Squ_Column1(k, i)       \
01408         Squ_SSE2_SaveShift(k)                                   \
01409         AS2(    add                     esi, 16)        \
01410         SSE2_FirstMultiply(1)\
01411         Squ_Acc##i(i)   \
01412         AS2(    paddd           xmm4, xmm4)             \
01413         AS2(    paddd           xmm5, xmm5)             \
01414         AS2(    movdqa          xmm3, [esi])                            \
01415         AS2(    movq            xmm1, QWORD PTR [esi+8])        \
01416         AS2(    pmuludq         xmm1, xmm3)             \
01417         AS2(    pmuludq         xmm3, xmm3)             \
01418         AS2(    movdqa          xmm0, [ebx])\
01419         AS2(    movdqa          xmm2, xmm0)             \
01420         AS2(    pand            xmm0, xmm1)             \
01421         AS2(    psrld           xmm1, 16)               \
01422         AS2(    paddd           xmm6, xmm0)             \
01423         AS2(    paddd           xmm7, xmm1)             \
01424         AS2(    pand            xmm2, xmm3)             \
01425         AS2(    psrld           xmm3, 16)               \
01426         AS2(    paddd           xmm6, xmm6)             \
01427         AS2(    paddd           xmm7, xmm7)             \
01428         AS2(    paddd           xmm4, xmm2)             \
01429         AS2(    paddd           xmm5, xmm3)             \
01430         AS2(    movq            xmm0, QWORD PTR [esp+4])\
01431         AS2(    movq            xmm1, QWORD PTR [esp+12])\
01432         AS2(    paddd           xmm4, xmm0)\
01433         AS2(    paddd           xmm5, xmm1)\
01434 
01435 #define Squ_Column0(k, i)       \
01436         Squ_SSE2_SaveShift(k)                                   \
01437         AS2(    add                     edi, 16)        \
01438         AS2(    add                     edx, 16)        \
01439         SSE2_FirstMultiply(1)\
01440         Squ_Acc##i(i)   \
01441         AS2(    paddd           xmm6, xmm6)             \
01442         AS2(    paddd           xmm7, xmm7)             \
01443         AS2(    paddd           xmm4, xmm4)             \
01444         AS2(    paddd           xmm5, xmm5)             \
01445         AS2(    movq            xmm0, QWORD PTR [esp+4])\
01446         AS2(    movq            xmm1, QWORD PTR [esp+12])\
01447         AS2(    paddd           xmm4, xmm0)\
01448         AS2(    paddd           xmm5, xmm1)\
01449 
01450 #define SSE2_MulAdd45                                           \
01451         AS2(    movdqa          xmm7, [esi])    \
01452         AS2(    movdqa          xmm0, [edi])    \
01453         AS2(    pmuludq         xmm0, xmm7)                             \
01454         AS2(    movdqa          xmm2, [ebx])            \
01455         AS2(    pmuludq         xmm7, [edx])    \
01456         AS2(    movdqa          xmm6, xmm2)                             \
01457         AS2(    pand            xmm2, xmm0)                             \
01458         AS2(    psrld           xmm0, 16)                               \
01459         AS2(    paddd           xmm4, xmm2)                             \
01460         AS2(    paddd           xmm5, xmm0)                             \
01461         AS2(    pand            xmm6, xmm7)                             \
01462         AS2(    psrld           xmm7, 16)
01463 
01464 #define Mul_Begin(n)                                                    \
01465         MulPrologue                                                                     \
01466         AS2(    mov             esi, esp)\
01467         AS2(    and             esp, 0xfffffff0)\
01468         AS2(    sub             esp, 48*n+16)\
01469         AS1(    push    esi)\
01470         AS2(    xor             edx, edx)                                       \
01471         ASL(1)                                                                          \
01472         ASS(    pshufd  xmm0, [eax+edx], 3,1,2,0)       \
01473         ASS(    pshufd  xmm1, [eax+edx], 2,0,3,1)       \
01474         ASS(    pshufd  xmm2, [edi+edx], 3,1,2,0)       \
01475         AS2(    movdqa  [esp+20+2*edx], xmm0)           \
01476         AS2(    psrlq   xmm0, 32)                                       \
01477         AS2(    movdqa  [esp+20+2*edx+16], xmm0)        \
01478         AS2(    movdqa  [esp+20+16*n+2*edx], xmm1)              \
01479         AS2(    psrlq   xmm1, 32)                                       \
01480         AS2(    movdqa  [esp+20+16*n+2*edx+16], xmm1)   \
01481         AS2(    movdqa  [esp+20+32*n+2*edx], xmm2)              \
01482         AS2(    psrlq   xmm2, 32)                                       \
01483         AS2(    movdqa  [esp+20+32*n+2*edx+16], xmm2)   \
01484         AS2(    add             edx, 16)                                        \
01485         AS2(    cmp             edx, 8*(n))                                     \
01486         ASJ(    jne,    1, b)                                           \
01487         AS2(    lea             edi, [esp+20])\
01488         AS2(    lea             edx, [esp+20+16*n])\
01489         AS2(    lea             esi, [esp+20+32*n])\
01490         SSE2_FirstMultiply(0)                                                   \
01491 
01492 #define Mul_Acc(i)                                                              \
01493         ASL(LMul##i)                                                                            \
01494         AS2(    movdqa          xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])   \
01495         AS2(    movdqa          xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])   \
01496         AS2(    movdqa          xmm2, [ebx])    \
01497         AS2(    pmuludq         xmm0, xmm1)                             \
01498         AS2(    pmuludq         xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])   \
01499         AS2(    movdqa          xmm3, xmm2)                     \
01500         AS2(    pand            xmm2, xmm0)                     \
01501         AS2(    psrld           xmm0, 16)                       \
01502         AS2(    paddd           xmm4, xmm2)                     \
01503         AS2(    paddd           xmm5, xmm0)                     \
01504         AS2(    pand            xmm3, xmm1)                     \
01505         AS2(    psrld           xmm1, 16)                       \
01506         AS2(    paddd           xmm6, xmm3)                     \
01507         AS2(    paddd           xmm7, xmm1)             \
01508 
01509 #define Mul_Acc1(i)             
01510 #define Mul_Acc2(i)             ASC(call, LMul##i)
01511 #define Mul_Acc3(i)             Mul_Acc2(i)
01512 #define Mul_Acc4(i)             Mul_Acc2(i)
01513 #define Mul_Acc5(i)             Mul_Acc2(i)
01514 #define Mul_Acc6(i)             Mul_Acc2(i)
01515 #define Mul_Acc7(i)             Mul_Acc2(i)
01516 #define Mul_Acc8(i)             Mul_Acc2(i)
01517 #define Mul_Acc9(i)             Mul_Acc2(i)
01518 #define Mul_Acc10(i)    Mul_Acc2(i)
01519 #define Mul_Acc11(i)    Mul_Acc2(i)
01520 #define Mul_Acc12(i)    Mul_Acc2(i)
01521 #define Mul_Acc13(i)    Mul_Acc2(i)
01522 #define Mul_Acc14(i)    Mul_Acc2(i)
01523 #define Mul_Acc15(i)    Mul_Acc2(i)
01524 #define Mul_Acc16(i)    Mul_Acc2(i)
01525 
01526 #define Mul_Column1(k, i)       \
01527         SSE2_SaveShift(k)                                       \
01528         AS2(    add                     esi, 16)        \
01529         SSE2_MulAdd45\
01530         Mul_Acc##i(i)   \
01531 
01532 #define Mul_Column0(k, i)       \
01533         SSE2_SaveShift(k)                                       \
01534         AS2(    add                     edi, 16)        \
01535         AS2(    add                     edx, 16)        \
01536         SSE2_MulAdd45\
01537         Mul_Acc##i(i)   \
01538 
01539 #define Bot_Acc(i)                                                      \
01540         AS2(    movdqa          xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])   \
01541         AS2(    movdqa          xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])   \
01542         AS2(    pmuludq         xmm0, xmm1)                             \
01543         AS2(    pmuludq         xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])           \
01544         AS2(    paddq           xmm4, xmm0)                             \
01545         AS2(    paddd           xmm6, xmm1)
01546 
01547 #define Bot_SaveAcc(k)                                  \
01548         SSE2_SaveShift(k)                                                       \
01549         AS2(    add                     edi, 16)        \
01550         AS2(    add                     edx, 16)        \
01551         AS2(    movdqa          xmm6, [esi])    \
01552         AS2(    movdqa          xmm0, [edi])    \
01553         AS2(    pmuludq         xmm0, xmm6)                             \
01554         AS2(    paddq           xmm4, xmm0)                             \
01555         AS2(    psllq           xmm5, 16)                               \
01556         AS2(    paddq           xmm4, xmm5)                             \
01557         AS2(    pmuludq         xmm6, [edx])
01558 
01559 #define Bot_End(n)                                                      \
01560         AS2(    movhlps         xmm7, xmm6)                     \
01561         AS2(    paddd           xmm6, xmm7)                     \
01562         AS2(    psllq           xmm6, 32)                       \
01563         AS2(    paddd           xmm4, xmm6)                     \
01564         AS2(    movq            QWORD PTR [ecx+8*((n)-1)], xmm4)        \
01565         AS1(    pop             esp)\
01566         MulEpilogue
01567 
01568 #define Top_Begin(n)                                                    \
01569         TopPrologue                                                                     \
01570         AS2(    mov             edx, esp)\
01571         AS2(    and             esp, 0xfffffff0)\
01572         AS2(    sub             esp, 48*n+16)\
01573         AS1(    push    edx)\
01574         AS2(    xor             edx, edx)                                       \
01575         ASL(1)                                                                          \
01576         ASS(    pshufd  xmm0, [eax+edx], 3,1,2,0)       \
01577         ASS(    pshufd  xmm1, [eax+edx], 2,0,3,1)       \
01578         ASS(    pshufd  xmm2, [edi+edx], 3,1,2,0)       \
01579         AS2(    movdqa  [esp+20+2*edx], xmm0)           \
01580         AS2(    psrlq   xmm0, 32)                                       \
01581         AS2(    movdqa  [esp+20+2*edx+16], xmm0)        \
01582         AS2(    movdqa  [esp+20+16*n+2*edx], xmm1)              \
01583         AS2(    psrlq   xmm1, 32)                                       \
01584         AS2(    movdqa  [esp+20+16*n+2*edx+16], xmm1)   \
01585         AS2(    movdqa  [esp+20+32*n+2*edx], xmm2)              \
01586         AS2(    psrlq   xmm2, 32)                                       \
01587         AS2(    movdqa  [esp+20+32*n+2*edx+16], xmm2)   \
01588         AS2(    add             edx, 16)                                        \
01589         AS2(    cmp             edx, 8*(n))                                     \
01590         ASJ(    jne,    1, b)                                           \
01591         AS2(    mov             eax, esi)                                       \
01592         AS2(    lea             edi, [esp+20+00*n+16*(n/2-1)])\
01593         AS2(    lea             edx, [esp+20+16*n+16*(n/2-1)])\
01594         AS2(    lea             esi, [esp+20+32*n+16*(n/2-1)])\
01595         AS2(    pxor    xmm4, xmm4)\
01596         AS2(    pxor    xmm5, xmm5)
01597 
01598 #define Top_Acc(i)                                                      \
01599         AS2(    movq            xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])       \
01600         AS2(    pmuludq         xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])   \
01601         AS2(    psrlq           xmm0, 48)                               \
01602         AS2(    paddd           xmm5, xmm0)\
01603 
01604 #define Top_Column0(i)  \
01605         AS2(    psllq           xmm5, 32)                               \
01606         AS2(    add                     edi, 16)        \
01607         AS2(    add                     edx, 16)        \
01608         SSE2_MulAdd45\
01609         Mul_Acc##i(i)   \
01610 
01611 #define Top_Column1(i)  \
01612         SSE2_SaveShift(0)                                       \
01613         AS2(    add                     esi, 16)        \
01614         SSE2_MulAdd45\
01615         Mul_Acc##i(i)   \
01616         AS2(    shr                     eax, 16)        \
01617         AS2(    movd            xmm0, eax)\
01618         AS2(    movd            xmm1, [ecx+4])\
01619         AS2(    psrld           xmm1, 16)\
01620         AS2(    pcmpgtd         xmm1, xmm0)\
01621         AS2(    psrld           xmm1, 31)\
01622         AS2(    paddd           xmm4, xmm1)\
01623 
01624 void SSE2_Square4(word *C, const word *A)
01625 {
01626         Squ_Begin(2)
01627         Squ_Column0(0, 1)
01628         Squ_End(2)
01629 }
01630 
01631 void SSE2_Square8(word *C, const word *A)
01632 {
01633         Squ_Begin(4)
01634 #ifndef __GNUC__
01635         ASJ(    jmp,    0, f)
01636         Squ_Acc(2)
01637         AS1(    ret) ASL(0)
01638 #endif
01639         Squ_Column0(0, 1)
01640         Squ_Column1(1, 1)
01641         Squ_Column0(2, 2)
01642         Squ_Column1(3, 1)
01643         Squ_Column0(4, 1)
01644         Squ_End(4)
01645 }
01646 
01647 void SSE2_Square16(word *C, const word *A)
01648 {
01649         Squ_Begin(8)
01650 #ifndef __GNUC__
01651         ASJ(    jmp,    0, f)
01652         Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
01653         AS1(    ret) ASL(0)
01654 #endif
01655         Squ_Column0(0, 1)
01656         Squ_Column1(1, 1)
01657         Squ_Column0(2, 2)
01658         Squ_Column1(3, 2)
01659         Squ_Column0(4, 3)
01660         Squ_Column1(5, 3)
01661         Squ_Column0(6, 4)
01662         Squ_Column1(7, 3)
01663         Squ_Column0(8, 3)
01664         Squ_Column1(9, 2)
01665         Squ_Column0(10, 2)
01666         Squ_Column1(11, 1)
01667         Squ_Column0(12, 1)
01668         Squ_End(8)
01669 }
01670 
01671 void SSE2_Square32(word *C, const word *A)
01672 {
01673         Squ_Begin(16)
01674         ASJ(    jmp,    0, f)
01675         Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
01676         AS1(    ret) ASL(0)
01677         Squ_Column0(0, 1)
01678         Squ_Column1(1, 1)
01679         Squ_Column0(2, 2)
01680         Squ_Column1(3, 2)
01681         Squ_Column0(4, 3)
01682         Squ_Column1(5, 3)
01683         Squ_Column0(6, 4)
01684         Squ_Column1(7, 4)
01685         Squ_Column0(8, 5)
01686         Squ_Column1(9, 5)
01687         Squ_Column0(10, 6)
01688         Squ_Column1(11, 6)
01689         Squ_Column0(12, 7)
01690         Squ_Column1(13, 7)
01691         Squ_Column0(14, 8)
01692         Squ_Column1(15, 7)
01693         Squ_Column0(16, 7)
01694         Squ_Column1(17, 6)
01695         Squ_Column0(18, 6)
01696         Squ_Column1(19, 5)
01697         Squ_Column0(20, 5)
01698         Squ_Column1(21, 4)
01699         Squ_Column0(22, 4)
01700         Squ_Column1(23, 3)
01701         Squ_Column0(24, 3)
01702         Squ_Column1(25, 2)
01703         Squ_Column0(26, 2)
01704         Squ_Column1(27, 1)
01705         Squ_Column0(28, 1)
01706         Squ_End(16)
01707 }
01708 
01709 void SSE2_Multiply4(word *C, const word *A, const word *B)
01710 {
01711         Mul_Begin(2)
01712 #ifndef __GNUC__
01713         ASJ(    jmp,    0, f)
01714         Mul_Acc(2)
01715         AS1(    ret) ASL(0)
01716 #endif
01717         Mul_Column0(0, 2)
01718         Mul_End(2)
01719 }
01720 
01721 void SSE2_Multiply8(word *C, const word *A, const word *B)
01722 {
01723         Mul_Begin(4)
01724 #ifndef __GNUC__
01725         ASJ(    jmp,    0, f)
01726         Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01727         AS1(    ret) ASL(0)
01728 #endif
01729         Mul_Column0(0, 2)
01730         Mul_Column1(1, 3)
01731         Mul_Column0(2, 4)
01732         Mul_Column1(3, 3)
01733         Mul_Column0(4, 2)
01734         Mul_End(4)
01735 }
01736 
01737 void SSE2_Multiply16(word *C, const word *A, const word *B)
01738 {
01739         Mul_Begin(8)
01740 #ifndef __GNUC__
01741         ASJ(    jmp,    0, f)
01742         Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01743         AS1(    ret) ASL(0)
01744 #endif
01745         Mul_Column0(0, 2)
01746         Mul_Column1(1, 3)
01747         Mul_Column0(2, 4)
01748         Mul_Column1(3, 5)
01749         Mul_Column0(4, 6)
01750         Mul_Column1(5, 7)
01751         Mul_Column0(6, 8)
01752         Mul_Column1(7, 7)
01753         Mul_Column0(8, 6)
01754         Mul_Column1(9, 5)
01755         Mul_Column0(10, 4)
01756         Mul_Column1(11, 3)
01757         Mul_Column0(12, 2)
01758         Mul_End(8)
01759 }
01760 
01761 void SSE2_Multiply32(word *C, const word *A, const word *B)
01762 {
01763         Mul_Begin(16)
01764         ASJ(    jmp,    0, f)
01765         Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01766         AS1(    ret) ASL(0)
01767         Mul_Column0(0, 2)
01768         Mul_Column1(1, 3)
01769         Mul_Column0(2, 4)
01770         Mul_Column1(3, 5)
01771         Mul_Column0(4, 6)
01772         Mul_Column1(5, 7)
01773         Mul_Column0(6, 8)
01774         Mul_Column1(7, 9)
01775         Mul_Column0(8, 10)
01776         Mul_Column1(9, 11)
01777         Mul_Column0(10, 12)
01778         Mul_Column1(11, 13)
01779         Mul_Column0(12, 14)
01780         Mul_Column1(13, 15)
01781         Mul_Column0(14, 16)
01782         Mul_Column1(15, 15)
01783         Mul_Column0(16, 14)
01784         Mul_Column1(17, 13)
01785         Mul_Column0(18, 12)
01786         Mul_Column1(19, 11)
01787         Mul_Column0(20, 10)
01788         Mul_Column1(21, 9)
01789         Mul_Column0(22, 8)
01790         Mul_Column1(23, 7)
01791         Mul_Column0(24, 6)
01792         Mul_Column1(25, 5)
01793         Mul_Column0(26, 4)
01794         Mul_Column1(27, 3)
01795         Mul_Column0(28, 2)
01796         Mul_End(16)
01797 }
01798 
01799 void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
01800 {
01801         Mul_Begin(2)
01802         Bot_SaveAcc(0) Bot_Acc(2)
01803         Bot_End(2)
01804 }
01805 
01806 void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
01807 {
01808         Mul_Begin(4)
01809 #ifndef __GNUC__
01810         ASJ(    jmp,    0, f)
01811         Mul_Acc(3) Mul_Acc(2)
01812         AS1(    ret) ASL(0)
01813 #endif
01814         Mul_Column0(0, 2)
01815         Mul_Column1(1, 3)
01816         Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
01817         Bot_End(4)
01818 }
01819 
01820 void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
01821 {
01822         Mul_Begin(8)
01823 #ifndef __GNUC__
01824         ASJ(    jmp,    0, f)
01825         Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01826         AS1(    ret) ASL(0)
01827 #endif
01828         Mul_Column0(0, 2)
01829         Mul_Column1(1, 3)
01830         Mul_Column0(2, 4)
01831         Mul_Column1(3, 5)
01832         Mul_Column0(4, 6)
01833         Mul_Column1(5, 7)
01834         Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
01835         Bot_End(8)
01836 }
01837 
01838 void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
01839 {
01840         Mul_Begin(16)
01841 #ifndef __GNUC__
01842         ASJ(    jmp,    0, f)
01843         Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01844         AS1(    ret) ASL(0)
01845 #endif
01846         Mul_Column0(0, 2)
01847         Mul_Column1(1, 3)
01848         Mul_Column0(2, 4)
01849         Mul_Column1(3, 5)
01850         Mul_Column0(4, 6)
01851         Mul_Column1(5, 7)
01852         Mul_Column0(6, 8)
01853         Mul_Column1(7, 9)
01854         Mul_Column0(8, 10)
01855         Mul_Column1(9, 11)
01856         Mul_Column0(10, 12)
01857         Mul_Column1(11, 13)
01858         Mul_Column0(12, 14)
01859         Mul_Column1(13, 15)
01860         Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
01861         Bot_End(16)
01862 }
01863 
01864 void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
01865 {
01866         Top_Begin(4)
01867         Top_Acc(3) Top_Acc(2) Top_Acc(1)
01868 #ifndef __GNUC__
01869         ASJ(    jmp,    0, f)
01870         Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01871         AS1(    ret) ASL(0)
01872 #endif
01873         Top_Column0(4)
01874         Top_Column1(3)
01875         Mul_Column0(0, 2)
01876         Top_End(2)
01877 }
01878 
01879 void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
01880 {
01881         Top_Begin(8)
01882         Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
01883 #ifndef __GNUC__
01884         ASJ(    jmp,    0, f)
01885         Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01886         AS1(    ret) ASL(0)
01887 #endif
01888         Top_Column0(8)
01889         Top_Column1(7)
01890         Mul_Column0(0, 6)
01891         Mul_Column1(1, 5)
01892         Mul_Column0(2, 4)
01893         Mul_Column1(3, 3)
01894         Mul_Column0(4, 2)
01895         Top_End(4)
01896 }
01897 
01898 void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
01899 {
01900         Top_Begin(16)
01901         Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
01902 #ifndef __GNUC__
01903         ASJ(    jmp,    0, f)
01904         Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
01905         AS1(    ret) ASL(0)
01906 #endif
01907         Top_Column0(16)
01908         Top_Column1(15)
01909         Mul_Column0(0, 14)
01910         Mul_Column1(1, 13)
01911         Mul_Column0(2, 12)
01912         Mul_Column1(3, 11)
01913         Mul_Column0(4, 10)
01914         Mul_Column1(5, 9)
01915         Mul_Column0(6, 8)
01916         Mul_Column1(7, 7)
01917         Mul_Column0(8, 6)
01918         Mul_Column1(9, 5)
01919         Mul_Column0(10, 4)
01920         Mul_Column1(11, 3)
01921         Mul_Column0(12, 2)
01922         Top_End(8)
01923 }
01924 
01925 #endif  // #if CRYPTOPP_INTEGER_SSE2
01926 
01927 // ********************************************************
01928 
01929 typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
01930 typedef void (* PMul)(word *C, const word *A, const word *B);
01931 typedef void (* PSqu)(word *C, const word *A);
01932 typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
01933 
01934 #if CRYPTOPP_INTEGER_SSE2
01935 static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
01936 static size_t s_recursionLimit = 8;
01937 #else
01938 static const size_t s_recursionLimit = 16;
01939 #endif
01940 
01941 static PMul s_pMul[9], s_pBot[9];
01942 static PSqu s_pSqu[9];
01943 static PMulTop s_pTop[9];
01944 
01945 static void SetFunctionPointers()
01946 {
01947         s_pMul[0] = &Baseline_Multiply2;
01948         s_pBot[0] = &Baseline_MultiplyBottom2;
01949         s_pSqu[0] = &Baseline_Square2;
01950         s_pTop[0] = &Baseline_MultiplyTop2;
01951         s_pTop[1] = &Baseline_MultiplyTop4;
01952 
01953 #if CRYPTOPP_INTEGER_SSE2
01954         if (HasSSE2())
01955         {
01956                 if (IsP4())
01957                 {
01958                         s_pAdd = &SSE2_Add;
01959                         s_pSub = &SSE2_Sub;
01960                 }
01961 
01962                 s_recursionLimit = 32;
01963 
01964                 s_pMul[1] = &SSE2_Multiply4;
01965                 s_pMul[2] = &SSE2_Multiply8;
01966                 s_pMul[4] = &SSE2_Multiply16;
01967                 s_pMul[8] = &SSE2_Multiply32;
01968 
01969                 s_pBot[1] = &SSE2_MultiplyBottom4;
01970                 s_pBot[2] = &SSE2_MultiplyBottom8;
01971                 s_pBot[4] = &SSE2_MultiplyBottom16;
01972                 s_pBot[8] = &SSE2_MultiplyBottom32;
01973 
01974                 s_pSqu[1] = &SSE2_Square4;
01975                 s_pSqu[2] = &SSE2_Square8;
01976                 s_pSqu[4] = &SSE2_Square16;
01977                 s_pSqu[8] = &SSE2_Square32;
01978 
01979                 s_pTop[2] = &SSE2_MultiplyTop8;
01980                 s_pTop[4] = &SSE2_MultiplyTop16;
01981                 s_pTop[8] = &SSE2_MultiplyTop32;
01982         }
01983         else
01984 #endif
01985         {
01986                 s_pMul[1] = &Baseline_Multiply4;
01987                 s_pMul[2] = &Baseline_Multiply8;
01988 
01989                 s_pBot[1] = &Baseline_MultiplyBottom4;
01990                 s_pBot[2] = &Baseline_MultiplyBottom8;
01991 
01992                 s_pSqu[1] = &Baseline_Square4;
01993                 s_pSqu[2] = &Baseline_Square8;
01994 
01995                 s_pTop[2] = &Baseline_MultiplyTop8;
01996 
01997 #if     !CRYPTOPP_INTEGER_SSE2
01998                 s_pMul[4] = &Baseline_Multiply16;
01999                 s_pBot[4] = &Baseline_MultiplyBottom16;
02000                 s_pSqu[4] = &Baseline_Square16;
02001                 s_pTop[4] = &Baseline_MultiplyTop16;
02002 #endif
02003         }
02004 }
02005 
02006 inline int Add(word *C, const word *A, const word *B, size_t N)
02007 {
02008 #if CRYPTOPP_INTEGER_SSE2
02009         return s_pAdd(N, C, A, B);
02010 #else
02011         return Baseline_Add(N, C, A, B);
02012 #endif
02013 }
02014 
02015 inline int Subtract(word *C, const word *A, const word *B, size_t N)
02016 {
02017 #if CRYPTOPP_INTEGER_SSE2
02018         return s_pSub(N, C, A, B);
02019 #else
02020         return Baseline_Sub(N, C, A, B);
02021 #endif
02022 }
02023 
02024 // ********************************************************
02025 
02026 
02027 #define A0              A
02028 #define A1              (A+N2)
02029 #define B0              B
02030 #define B1              (B+N2)
02031 
02032 #define T0              T
02033 #define T1              (T+N2)
02034 #define T2              (T+N)
02035 #define T3              (T+N+N2)
02036 
02037 #define R0              R
02038 #define R1              (R+N2)
02039 #define R2              (R+N)
02040 #define R3              (R+N+N2)
02041 
02042 // R[2*N] - result = A*B
02043 // T[2*N] - temporary work space
02044 // A[N] --- multiplier
02045 // B[N] --- multiplicant
02046 
02047 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
02048 {
02049         assert(N>=2 && N%2==0);
02050 
02051         if (N <= s_recursionLimit)
02052                 s_pMul[N/4](R, A, B);
02053         else
02054         {
02055                 const size_t N2 = N/2;
02056 
02057                 size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
02058                 Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
02059 
02060                 size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
02061                 Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
02062 
02063                 RecursiveMultiply(R2, T2, A1, B1, N2);
02064                 RecursiveMultiply(T0, T2, R0, R1, N2);
02065                 RecursiveMultiply(R0, T2, A0, B0, N2);
02066 
02067                 // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
02068 
02069                 int c2 = Add(R2, R2, R1, N2);
02070                 int c3 = c2;
02071                 c2 += Add(R1, R2, R0, N2);
02072                 c3 += Add(R2, R2, R3, N2);
02073 
02074                 if (AN2 == BN2)
02075                         c3 -= Subtract(R1, R1, T0, N);
02076                 else
02077                         c3 += Add(R1, R1, T0, N);
02078 
02079                 c3 += Increment(R2, N2, c2);
02080                 assert (c3 >= 0 && c3 <= 2);
02081                 Increment(R3, N2, c3);
02082         }
02083 }
02084 
02085 // R[2*N] - result = A*A
02086 // T[2*N] - temporary work space
02087 // A[N] --- number to be squared
02088 
02089 void RecursiveSquare(word *R, word *T, const word *A, size_t N)
02090 {
02091         assert(N && N%2==0);
02092 
02093         if (N <= s_recursionLimit)
02094                 s_pSqu[N/4](R, A);
02095         else
02096         {
02097                 const size_t N2 = N/2;
02098 
02099                 RecursiveSquare(R0, T2, A0, N2);
02100                 RecursiveSquare(R2, T2, A1, N2);
02101                 RecursiveMultiply(T0, T2, A0, A1, N2);
02102 
02103                 int carry = Add(R1, R1, T0, N);
02104                 carry += Add(R1, R1, T0, N);
02105                 Increment(R3, N2, carry);
02106         }
02107 }
02108 
02109 // R[N] - bottom half of A*B
02110 // T[3*N/2] - temporary work space
02111 // A[N] - multiplier
02112 // B[N] - multiplicant
02113 
02114 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
02115 {
02116         assert(N>=2 && N%2==0);
02117 
02118         if (N <= s_recursionLimit)
02119                 s_pBot[N/4](R, A, B);
02120         else
02121         {
02122                 const size_t N2 = N/2;
02123 
02124                 RecursiveMultiply(R, T, A0, B0, N2);
02125                 RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
02126                 Add(R1, R1, T0, N2);
02127                 RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
02128                 Add(R1, R1, T0, N2);
02129         }
02130 }
02131 
02132 // R[N] --- upper half of A*B
02133 // T[2*N] - temporary work space
02134 // L[N] --- lower half of A*B
02135 // A[N] --- multiplier
02136 // B[N] --- multiplicant
02137 
02138 void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
02139 {
02140         assert(N>=2 && N%2==0);
02141 
02142         if (N <= s_recursionLimit)
02143                 s_pTop[N/4](R, A, B, L[N-1]);
02144         else
02145         {
02146                 const size_t N2 = N/2;
02147 
02148                 size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
02149                 Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
02150 
02151                 size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
02152                 Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
02153 
02154                 RecursiveMultiply(T0, T2, R0, R1, N2);
02155                 RecursiveMultiply(R0, T2, A1, B1, N2);
02156 
02157                 // now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
02158 
02159                 int t, c3;
02160                 int c2 = Subtract(T2, L+N2, L, N2);
02161 
02162                 if (AN2 == BN2)
02163                 {
02164                         c2 -= Add(T2, T2, T0, N2);
02165                         t = (Compare(T2, R0, N2) == -1);
02166                         c3 = t - Subtract(T2, T2, T1, N2);
02167                 }
02168                 else
02169                 {
02170                         c2 += Subtract(T2, T2, T0, N2);
02171                         t = (Compare(T2, R0, N2) == -1);
02172                         c3 = t + Add(T2, T2, T1, N2);
02173                 }
02174 
02175                 c2 += t;
02176                 if (c2 >= 0)
02177                         c3 += Increment(T2, N2, c2);
02178                 else
02179                         c3 -= Decrement(T2, N2, -c2);
02180                 c3 += Add(R0, T2, R1, N2);
02181 
02182                 assert (c3 >= 0 && c3 <= 2);
02183                 Increment(R1, N2, c3);
02184         }
02185 }
02186 
02187 inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
02188 {
02189         RecursiveMultiply(R, T, A, B, N);
02190 }
02191 
02192 inline void Square(word *R, word *T, const word *A, size_t N)
02193 {
02194         RecursiveSquare(R, T, A, N);
02195 }
02196 
02197 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
02198 {
02199         RecursiveMultiplyBottom(R, T, A, B, N);
02200 }
02201 
02202 // R[NA+NB] - result = A*B
02203 // T[NA+NB] - temporary work space
02204 // A[NA] ---- multiplier
02205 // B[NB] ---- multiplicant
02206 
02207 void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
02208 {
02209         if (NA == NB)
02210         {
02211                 if (A == B)
02212                         Square(R, T, A, NA);
02213                 else
02214                         Multiply(R, T, A, B, NA);
02215 
02216                 return;
02217         }
02218 
02219         if (NA > NB)
02220         {
02221                 std::swap(A, B);
02222                 std::swap(NA, NB);
02223         }
02224 
02225         assert(NB % NA == 0);
02226 
02227         if (NA==2 && !A[1])
02228         {
02229                 switch (A[0])
02230                 {
02231                 case 0:
02232                         SetWords(R, 0, NB+2);
02233                         return;
02234                 case 1:
02235                         CopyWords(R, B, NB);
02236                         R[NB] = R[NB+1] = 0;
02237                         return;
02238                 default:
02239                         R[NB] = LinearMultiply(R, B, A[0], NB);
02240                         R[NB+1] = 0;
02241                         return;
02242                 }
02243         }
02244 
02245         size_t i;
02246         if ((NB/NA)%2 == 0)
02247         {
02248                 Multiply(R, T, A, B, NA);
02249                 CopyWords(T+2*NA, R+NA, NA);
02250 
02251                 for (i=2*NA; i<NB; i+=2*NA)
02252                         Multiply(T+NA+i, T, A, B+i, NA);
02253                 for (i=NA; i<NB; i+=2*NA)
02254                         Multiply(R+i, T, A, B+i, NA);
02255         }
02256         else
02257         {
02258                 for (i=0; i<NB; i+=2*NA)
02259                         Multiply(R+i, T, A, B+i, NA);
02260                 for (i=NA; i<NB; i+=2*NA)
02261                         Multiply(T+NA+i, T, A, B+i, NA);
02262         }
02263 
02264         if (Add(R+NA, R+NA, T+2*NA, NB-NA))
02265                 Increment(R+NB, NA);
02266 }
02267 
02268 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
02269 // T[3*N/2] - temporary work space
02270 // A[N] ----- an odd number as input
02271 
02272 void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
02273 {
02274         if (N==2)
02275         {
02276                 T[0] = AtomicInverseModPower2(A[0]);
02277                 T[1] = 0;
02278                 s_pBot[0](T+2, T, A);
02279                 TwosComplement(T+2, 2);
02280                 Increment(T+2, 2, 2);
02281                 s_pBot[0](R, T, T+2);
02282         }
02283         else
02284         {
02285                 const size_t N2 = N/2;
02286                 RecursiveInverseModPower2(R0, T0, A0, N2);
02287                 T0[0] = 1;
02288                 SetWords(T0+1, 0, N2-1);
02289                 MultiplyTop(R1, T1, T0, R0, A0, N2);
02290                 MultiplyBottom(T0, T1, R0, A1, N2);
02291                 Add(T0, R1, T0, N2);
02292                 TwosComplement(T0, N2);
02293                 MultiplyBottom(R1, T1, R0, T0, N2);
02294         }
02295 }
02296 
02297 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
02298 // T[3*N] - temporary work space
02299 // X[2*N] - number to be reduced
02300 // M[N] --- modulus
02301 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
02302 
02303 void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
02304 {
02305 #if 1
02306         MultiplyBottom(R, T, X, U, N);
02307         MultiplyTop(T, T+N, X, R, M, N);
02308         word borrow = Subtract(T, X+N, T, N);
02309         // defend against timing attack by doing this Add even when not needed
02310         word carry = Add(T+N, T, M, N);
02311         assert(carry || !borrow);
02312         CopyWords(R, T + (borrow ? N : 0), N);
02313 #elif 0
02314         const word u = 0-U[0];
02315         Declare2Words(p)
02316         for (size_t i=0; i<N; i++)
02317         {
02318                 const word t = u * X[i];
02319                 word c = 0;
02320                 for (size_t j=0; j<N; j+=2)
02321                 {
02322                         MultiplyWords(p, t, M[j]);
02323                         Acc2WordsBy1(p, X[i+j]);
02324                         Acc2WordsBy1(p, c);
02325                         X[i+j] = LowWord(p);
02326                         c = HighWord(p);
02327                         MultiplyWords(p, t, M[j+1]);
02328                         Acc2WordsBy1(p, X[i+j+1]);
02329                         Acc2WordsBy1(p, c);
02330                         X[i+j+1] = LowWord(p);
02331                         c = HighWord(p);
02332                 }
02333 
02334                 if (Increment(X+N+i, N-i, c))
02335                         while (!Subtract(X+N, X+N, M, N)) {}
02336         }
02337 
02338         memcpy(R, X+N, N*WORD_SIZE);
02339 #else
02340         __m64 u = _mm_cvtsi32_si64(0-U[0]), p;
02341         for (size_t i=0; i<N; i++)
02342         {
02343                 __m64 t = _mm_cvtsi32_si64(X[i]);
02344                 t = _mm_mul_su32(t, u);
02345                 __m64 c = _mm_setzero_si64();
02346                 for (size_t j=0; j<N; j+=2)
02347                 {
02348                         p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
02349                         p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
02350                         c = _mm_add_si64(c, p);
02351                         X[i+j] = _mm_cvtsi64_si32(c);
02352                         c = _mm_srli_si64(c, 32);
02353                         p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
02354                         p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
02355                         c = _mm_add_si64(c, p);
02356                         X[i+j+1] = _mm_cvtsi64_si32(c);
02357                         c = _mm_srli_si64(c, 32);
02358                 }
02359 
02360                 if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
02361                         while (!Subtract(X+N, X+N, M, N)) {}
02362         }
02363 
02364         memcpy(R, X+N, N*WORD_SIZE);
02365         _mm_empty();
02366 #endif
02367 }
02368 
02369 // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
02370 // T[2*N] - temporary work space
02371 // X[2*N] - number to be reduced
02372 // M[N] --- modulus
02373 // U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
02374 // V[N] --- 2**(WORD_BITS*3*N/2) mod M
02375 
02376 void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
02377 {
02378         assert(N%2==0 && N>=4);
02379 
02380 #define M0              M
02381 #define M1              (M+N2)
02382 #define V0              V
02383 #define V1              (V+N2)
02384 
02385 #define X0              X
02386 #define X1              (X+N2)
02387 #define X2              (X+N)
02388 #define X3              (X+N+N2)
02389 
02390         const size_t N2 = N/2;
02391         Multiply(T0, T2, V0, X3, N2);
02392         int c2 = Add(T0, T0, X0, N);
02393         MultiplyBottom(T3, T2, T0, U, N2);
02394         MultiplyTop(T2, R, T0, T3, M0, N2);
02395         c2 -= Subtract(T2, T1, T2, N2);
02396         Multiply(T0, R, T3, M1, N2);
02397         c2 -= Subtract(T0, T2, T0, N2);
02398         int c3 = -(int)Subtract(T1, X2, T1, N2);
02399         Multiply(R0, T2, V1, X3, N2);
02400         c3 += Add(R, R, T, N);
02401 
02402         if (c2>0)
02403                 c3 += Increment(R1, N2);
02404         else if (c2<0)
02405                 c3 -= Decrement(R1, N2, -c2);
02406 
02407         assert(c3>=-1 && c3<=1);
02408         if (c3>0)
02409                 Subtract(R, R, M, N);
02410         else if (c3<0)
02411                 Add(R, R, M, N);
02412 
02413 #undef M0
02414 #undef M1
02415 #undef V0
02416 #undef V1
02417 
02418 #undef X0
02419 #undef X1
02420 #undef X2
02421 #undef X3
02422 }
02423 
02424 #undef A0
02425 #undef A1
02426 #undef B0
02427 #undef B1
02428 
02429 #undef T0
02430 #undef T1
02431 #undef T2
02432 #undef T3
02433 
02434 #undef R0
02435 #undef R1
02436 #undef R2
02437 #undef R3
02438 
02439 /*
02440 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
02441 static word SubatomicDivide(word *A, word B0, word B1)
02442 {
02443         // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
02444         assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
02445 
02446         // estimate the quotient: do a 2 word by 1 word divide
02447         word Q;
02448         if (B1+1 == 0)
02449                 Q = A[2];
02450         else
02451                 Q = DWord(A[1], A[2]).DividedBy(B1+1);
02452 
02453         // now subtract Q*B from A
02454         DWord p = DWord::Multiply(B0, Q);
02455         DWord u = (DWord) A[0] - p.GetLowHalf();
02456         A[0] = u.GetLowHalf();
02457         u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
02458         A[1] = u.GetLowHalf();
02459         A[2] += u.GetHighHalf();
02460 
02461         // Q <= actual quotient, so fix it
02462         while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
02463         {
02464                 u = (DWord) A[0] - B0;
02465                 A[0] = u.GetLowHalf();
02466                 u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
02467                 A[1] = u.GetLowHalf();
02468                 A[2] += u.GetHighHalf();
02469                 Q++;
02470                 assert(Q);      // shouldn't overflow
02471         }
02472 
02473         return Q;
02474 }
02475 
02476 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
02477 static inline void AtomicDivide(word *Q, const word *A, const word *B)
02478 {
02479         if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
02480         {
02481                 Q[0] = A[2];
02482                 Q[1] = A[3];
02483         }
02484         else
02485         {
02486                 word T[4];
02487                 T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
02488                 Q[1] = SubatomicDivide(T+1, B[0], B[1]);
02489                 Q[0] = SubatomicDivide(T, B[0], B[1]);
02490 
02491 #ifndef NDEBUG
02492                 // multiply quotient and divisor and add remainder, make sure it equals dividend
02493                 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
02494                 word P[4];
02495                 LowLevel::Multiply2(P, Q, B);
02496                 Add(P, P, T, 4);
02497                 assert(memcmp(P, A, 4*WORD_SIZE)==0);
02498 #endif
02499         }
02500 }
02501 */
02502 
02503 static inline void AtomicDivide(word *Q, const word *A, const word *B)
02504 {
02505         word T[4];
02506         DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
02507         Q[0] = q.GetLowHalf();
02508         Q[1] = q.GetHighHalf();
02509 
02510 #ifndef NDEBUG
02511         if (B[0] || B[1])
02512         {
02513                 // multiply quotient and divisor and add remainder, make sure it equals dividend
02514                 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
02515                 word P[4];
02516                 s_pMul[0](P, Q, B);
02517                 Add(P, P, T, 4);
02518                 assert(memcmp(P, A, 4*WORD_SIZE)==0);
02519         }
02520 #endif
02521 }
02522 
02523 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
02524 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
02525 {
02526         assert(N && N%2==0);
02527 
02528         AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
02529 
02530         word borrow = Subtract(R, R, T, N+2);
02531         assert(!borrow && !R[N+1]);
02532 
02533         while (R[N] || Compare(R, B, N) >= 0)
02534         {
02535                 R[N] -= Subtract(R, R, B, N);
02536                 Q[1] += (++Q[0]==0);
02537                 assert(Q[0] || Q[1]); // no overflow
02538         }
02539 }
02540 
02541 // R[NB] -------- remainder = A%B
02542 // Q[NA-NB+2] --- quotient      = A/B
02543 // T[NA+3*(NB+2)] - temp work space
02544 // A[NA] -------- dividend
02545 // B[NB] -------- divisor
02546 
02547 void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
02548 {
02549         assert(NA && NB && NA%2==0 && NB%2==0);
02550         assert(B[NB-1] || B[NB-2]);
02551         assert(NB <= NA);
02552 
02553         // set up temporary work space
02554         word *const TA=T;
02555         word *const TB=T+NA+2;
02556         word *const TP=T+NA+2+NB;
02557 
02558         // copy B into TB and normalize it so that TB has highest bit set to 1
02559         unsigned shiftWords = (B[NB-1]==0);
02560         TB[0] = TB[NB-1] = 0;
02561         CopyWords(TB+shiftWords, B, NB-shiftWords);
02562         unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
02563         assert(shiftBits < WORD_BITS);
02564         ShiftWordsLeftByBits(TB, NB, shiftBits);
02565 
02566         // copy A into TA and normalize it
02567         TA[0] = TA[NA] = TA[NA+1] = 0;
02568         CopyWords(TA+shiftWords, A, NA);
02569         ShiftWordsLeftByBits(TA, NA+2, shiftBits);
02570 
02571         if (TA[NA+1]==0 && TA[NA] <= 1)
02572         {
02573                 Q[NA-NB+1] = Q[NA-NB] = 0;
02574                 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
02575                 {
02576                         TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
02577                         ++Q[NA-NB];
02578                 }
02579         }
02580         else
02581         {
02582                 NA+=2;
02583                 assert(Compare(TA+NA-NB, TB, NB) < 0);
02584         }
02585 
02586         word BT[2];
02587         BT[0] = TB[NB-2] + 1;
02588         BT[1] = TB[NB-1] + (BT[0]==0);
02589 
02590         // start reducing TA mod TB, 2 words at a time
02591         for (size_t i=NA-2; i>=NB; i-=2)
02592         {
02593                 AtomicDivide(Q+i-NB, TA+i-2, BT);
02594                 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
02595         }
02596 
02597         // copy TA into R, and denormalize it
02598         CopyWords(R, TA+shiftWords, NB);
02599         ShiftWordsRightByBits(R, NB, shiftBits);
02600 }
02601 
02602 static inline size_t EvenWordCount(const word *X, size_t N)
02603 {
02604         while (N && X[N-2]==0 && X[N-1]==0)
02605                 N-=2;
02606         return N;
02607 }
02608 
02609 // return k
02610 // R[N] --- result = A^(-1) * 2^k mod M
02611 // T[4*N] - temporary work space
02612 // A[NA] -- number to take inverse of
02613 // M[N] --- modulus
02614 
02615 unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
02616 {
02617         assert(NA<=N && N && N%2==0);
02618 
02619         word *b = T;
02620         word *c = T+N;
02621         word *f = T+2*N;
02622         word *g = T+3*N;
02623         size_t bcLen=2, fgLen=EvenWordCount(M, N);
02624         unsigned int k=0, s=0;
02625 
02626         SetWords(T, 0, 3*N);
02627         b[0]=1;
02628         CopyWords(f, A, NA);
02629         CopyWords(g, M, N);
02630 
02631         while (1)
02632         {
02633                 word t=f[0];
02634                 while (!t)
02635                 {
02636                         if (EvenWordCount(f, fgLen)==0)
02637                         {
02638                                 SetWords(R, 0, N);
02639                                 return 0;
02640                         }
02641 
02642                         ShiftWordsRightByWords(f, fgLen, 1);
02643                         if (c[bcLen-1]) bcLen+=2;
02644                         assert(bcLen <= N);
02645                         ShiftWordsLeftByWords(c, bcLen, 1);
02646                         k+=WORD_BITS;
02647                         t=f[0];
02648                 }
02649 
02650                 unsigned int i=0;
02651                 while (t%2 == 0)
02652                 {
02653                         t>>=1;
02654                         i++;
02655                 }
02656                 k+=i;
02657 
02658                 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
02659                 {
02660                         if (s%2==0)
02661                                 CopyWords(R, b, N);
02662                         else
02663                                 Subtract(R, M, b, N);
02664                         return k;
02665                 }
02666 
02667                 ShiftWordsRightByBits(f, fgLen, i);
02668                 t=ShiftWordsLeftByBits(c, bcLen, i);
02669                 if (t)
02670                 {
02671                         c[bcLen] = t;
02672                         bcLen+=2;
02673                         assert(bcLen <= N);
02674                 }
02675 
02676                 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
02677                         fgLen-=2;
02678 
02679                 if (Compare(f, g, fgLen)==-1)
02680                 {
02681                         std::swap(f, g);
02682                         std::swap(b, c);
02683                         s++;
02684                 }
02685 
02686                 Subtract(f, f, g, fgLen);
02687 
02688                 if (Add(b, b, c, bcLen))
02689                 {
02690                         b[bcLen] = 1;
02691                         bcLen+=2;
02692                         assert(bcLen <= N);
02693                 }
02694         }
02695 }
02696 
02697 // R[N] - result = A/(2^k) mod M
02698 // A[N] - input
02699 // M[N] - modulus
02700 
02701 void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
02702 {
02703         CopyWords(R, A, N);
02704 
02705         while (k--)
02706         {
02707                 if (R[0]%2==0)
02708                         ShiftWordsRightByBits(R, N, 1);
02709                 else
02710                 {
02711                         word carry = Add(R, R, M, N);
02712                         ShiftWordsRightByBits(R, N, 1);
02713                         R[N-1] += carry<<(WORD_BITS-1);
02714                 }
02715         }
02716 }
02717 
02718 // R[N] - result = A*(2^k) mod M
02719 // A[N] - input
02720 // M[N] - modulus
02721 
02722 void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
02723 {
02724         CopyWords(R, A, N);
02725 
02726         while (k--)
02727                 if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
02728                         Subtract(R, R, M, N);
02729 }
02730 
02731 // ******************************************************************
02732 
02733 InitializeInteger::InitializeInteger()
02734 {
02735         if (!g_pAssignIntToInteger)
02736         {
02737                 SetFunctionPointers();
02738                 g_pAssignIntToInteger = AssignIntToInteger;
02739         }
02740 }
02741 
02742 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
02743 
02744 static inline size_t RoundupSize(size_t n)
02745 {
02746         if (n<=8)
02747                 return RoundupSizeTable[n];
02748         else if (n<=16)
02749                 return 16;
02750         else if (n<=32)
02751                 return 32;
02752         else if (n<=64)
02753                 return 64;
02754         else return size_t(1) << BitPrecision(n-1);
02755 }
02756 
02757 Integer::Integer()
02758         : reg(2), sign(POSITIVE)
02759 {
02760         reg[0] = reg[1] = 0;
02761 }
02762 
02763 Integer::Integer(const Integer& t)
02764         : reg(RoundupSize(t.WordCount())), sign(t.sign)
02765 {
02766         CopyWords(reg, t.reg, reg.size());
02767 }
02768 
02769 Integer::Integer(Sign s, lword value)
02770         : reg(2), sign(s)
02771 {
02772         reg[0] = word(value);
02773         reg[1] = word(SafeRightShift<WORD_BITS>(value));
02774 }
02775 
02776 Integer::Integer(signed long value)
02777         : reg(2)
02778 {
02779         if (value >= 0)
02780                 sign = POSITIVE;
02781         else
02782         {
02783                 sign = NEGATIVE;
02784                 value = -value;
02785         }
02786         reg[0] = word(value);
02787         reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
02788 }
02789 
02790 Integer::Integer(Sign s, word high, word low)
02791         : reg(2), sign(s)
02792 {
02793         reg[0] = low;
02794         reg[1] = high;
02795 }
02796 
02797 bool Integer::IsConvertableToLong() const
02798 {
02799         if (ByteCount() > sizeof(long))
02800                 return false;
02801 
02802         unsigned long value = (unsigned long)reg[0];
02803         value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
02804 
02805         if (sign==POSITIVE)
02806                 return (signed long)value >= 0;
02807         else
02808                 return -(signed long)value < 0;
02809 }
02810 
02811 signed long Integer::ConvertToLong() const
02812 {
02813         assert(IsConvertableToLong());
02814 
02815         unsigned long value = (unsigned long)reg[0];
02816         value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
02817         return sign==POSITIVE ? value : -(signed long)value;
02818 }
02819 
02820 Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)
02821 {
02822         Decode(encodedInteger, byteCount, s);
02823 }
02824 
02825 Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)
02826 {
02827         Decode(encodedInteger, byteCount, s);
02828 }
02829 
02830 Integer::Integer(BufferedTransformation &bt)
02831 {
02832         BERDecode(bt);
02833 }
02834 
02835 Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
02836 {
02837         Randomize(rng, bitcount);
02838 }
02839 
02840 Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
02841 {
02842         if (!Randomize(rng, min, max, rnType, equiv, mod))
02843                 throw Integer::RandomNumberNotFound();
02844 }
02845 
02846 Integer Integer::Power2(size_t e)
02847 {
02848         Integer r((word)0, BitsToWords(e+1));
02849         r.SetBit(e);
02850         return r;
02851 }
02852 
02853 template <long i>
02854 struct NewInteger
02855 {
02856         Integer * operator()() const
02857         {
02858                 return new Integer(i);
02859         }
02860 };
02861 
02862 const Integer &Integer::Zero()
02863 {
02864         return Singleton<Integer>().Ref();
02865 }
02866 
02867 const Integer &Integer::One()
02868 {
02869         return Singleton<Integer, NewInteger<1> >().Ref();
02870 }
02871 
02872 const Integer &Integer::Two()
02873 {
02874         return Singleton<Integer, NewInteger<2> >().Ref();
02875 }
02876 
02877 bool Integer::operator!() const
02878 {
02879         return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
02880 }
02881 
02882 Integer& Integer::operator=(const Integer& t)
02883 {
02884         if (this != &t)
02885         {
02886                 if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
02887                         reg.New(RoundupSize(t.WordCount()));
02888                 CopyWords(reg, t.reg, reg.size());
02889                 sign = t.sign;
02890         }
02891         return *this;
02892 }
02893 
02894 bool Integer::GetBit(size_t n) const
02895 {
02896         if (n/WORD_BITS >= reg.size())
02897                 return 0;
02898         else
02899                 return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
02900 }
02901 
02902 void Integer::SetBit(size_t n, bool value)
02903 {
02904         if (value)
02905         {
02906                 reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
02907                 reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
02908         }
02909         else
02910         {
02911                 if (n/WORD_BITS < reg.size())
02912                         reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
02913         }
02914 }
02915 
02916 byte Integer::GetByte(size_t n) const
02917 {
02918         if (n/WORD_SIZE >= reg.size())
02919                 return 0;
02920         else
02921                 return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
02922 }
02923 
02924 void Integer::SetByte(size_t n, byte value)
02925 {
02926         reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
02927         reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
02928         reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
02929 }
02930 
02931 lword Integer::GetBits(size_t i, size_t n) const
02932 {
02933         lword v = 0;
02934         assert(n <= sizeof(v)*8);
02935         for (unsigned int j=0; j<n; j++)
02936                 v |= lword(GetBit(i+j)) << j;
02937         return v;
02938 }
02939 
02940 Integer Integer::operator-() const
02941 {
02942         Integer result(*this);
02943         result.Negate();
02944         return result;
02945 }
02946 
02947 Integer Integer::AbsoluteValue() const
02948 {
02949         Integer result(*this);
02950         result.sign = POSITIVE;
02951         return result;
02952 }
02953 
02954 void Integer::swap(Integer &a)
02955 {
02956         reg.swap(a.reg);
02957         std::swap(sign, a.sign);
02958 }
02959 
02960 Integer::Integer(word value, size_t length)
02961         : reg(RoundupSize(length)), sign(POSITIVE)
02962 {
02963         reg[0] = value;
02964         SetWords(reg+1, 0, reg.size()-1);
02965 }
02966 
02967 template <class T>
02968 static Integer StringToInteger(const T *str)
02969 {
02970         int radix;
02971         // GCC workaround
02972         // std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
02973         unsigned int length;
02974         for (length = 0; str[length] != 0; length++) {}
02975 
02976         Integer v;
02977 
02978         if (length == 0)
02979                 return v;
02980 
02981         switch (str[length-1])
02982         {
02983         case 'h':
02984         case 'H':
02985                 radix=16;
02986                 break;
02987         case 'o':
02988         case 'O':
02989                 radix=8;
02990                 break;
02991         case 'b':
02992         case 'B':
02993                 radix=2;
02994                 break;
02995         default:
02996                 radix=10;
02997         }
02998 
02999         if (length > 2 && str[0] == '0' && str[1] == 'x')
03000                 radix = 16;
03001 
03002         for (unsigned i=0; i<length; i++)
03003         {
03004                 int digit;
03005 
03006                 if (str[i] >= '0' && str[i] <= '9')
03007                         digit = str[i] - '0';
03008                 else if (str[i] >= 'A' && str[i] <= 'F')
03009                         digit = str[i] - 'A' + 10;
03010                 else if (str[i] >= 'a' && str[i] <= 'f')
03011                         digit = str[i] - 'a' + 10;
03012                 else
03013                         digit = radix;
03014 
03015                 if (digit < radix)
03016                 {
03017                         v *= radix;
03018                         v += digit;
03019                 }
03020         }
03021 
03022         if (str[0] == '-')
03023                 v.Negate();
03024 
03025         return v;
03026 }
03027 
03028 Integer::Integer(const char *str)
03029         : reg(2), sign(POSITIVE)
03030 {
03031         *this = StringToInteger(str);
03032 }
03033 
03034 Integer::Integer(const wchar_t *str)
03035         : reg(2), sign(POSITIVE)
03036 {
03037         *this = StringToInteger(str);
03038 }
03039 
03040 unsigned int Integer::WordCount() const
03041 {
03042         return (unsigned int)CountWords(reg, reg.size());
03043 }
03044 
03045 unsigned int Integer::ByteCount() const
03046 {
03047         unsigned wordCount = WordCount();
03048         if (wordCount)
03049                 return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
03050         else
03051                 return 0;
03052 }
03053 
03054 unsigned int Integer::BitCount() const
03055 {
03056         unsigned wordCount = WordCount();
03057         if (wordCount)
03058                 return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
03059         else
03060                 return 0;
03061 }
03062 
03063 void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
03064 {
03065         StringStore store(input, inputLen);
03066         Decode(store, inputLen, s);
03067 }
03068 
03069 void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
03070 {
03071         assert(bt.MaxRetrievable() >= inputLen);
03072 
03073         byte b;
03074         bt.Peek(b);
03075         sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
03076 
03077         while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
03078         {
03079                 bt.Skip(1);
03080                 inputLen--;
03081                 bt.Peek(b);
03082         }
03083 
03084         reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
03085 
03086         for (size_t i=inputLen; i > 0; i--)
03087         {
03088                 bt.Get(b);
03089                 reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
03090         }
03091 
03092         if (sign == NEGATIVE)
03093         {
03094                 for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
03095                         reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
03096                 TwosComplement(reg, reg.size());
03097         }
03098 }
03099 
03100 size_t Integer::MinEncodedSize(Signedness signedness) const
03101 {
03102         unsigned int outputLen = STDMAX(1U, ByteCount());
03103         if (signedness == UNSIGNED)
03104                 return outputLen;
03105         if (NotNegative() && (GetByte(outputLen-1) & 0x80))
03106                 outputLen++;
03107         if (IsNegative() && *this < -Power2(outputLen*8-1))
03108                 outputLen++;
03109         return outputLen;
03110 }
03111 
03112 void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
03113 {
03114         ArraySink sink(output, outputLen);
03115         Encode(sink, outputLen, signedness);
03116 }
03117 
03118 void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
03119 {
03120         if (signedness == UNSIGNED || NotNegative())
03121         {
03122                 for (size_t i=outputLen; i > 0; i--)
03123                         bt.Put(GetByte(i-1));
03124         }
03125         else
03126         {
03127                 // take two's complement of *this
03128                 Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
03129                 temp.Encode(bt, outputLen, UNSIGNED);
03130         }
03131 }
03132 
03133 void Integer::DEREncode(BufferedTransformation &bt) const
03134 {
03135         DERGeneralEncoder enc(bt, INTEGER);
03136         Encode(enc, MinEncodedSize(SIGNED), SIGNED);
03137         enc.MessageEnd();
03138 }
03139 
03140 void Integer::BERDecode(const byte *input, size_t len)
03141 {
03142         StringStore store(input, len);
03143         BERDecode(store);
03144 }
03145 
03146 void Integer::BERDecode(BufferedTransformation &bt)
03147 {
03148         BERGeneralDecoder dec(bt, INTEGER);
03149         if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
03150                 BERDecodeError();
03151         Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
03152         dec.MessageEnd();
03153 }
03154 
03155 void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
03156 {
03157         DERGeneralEncoder enc(bt, OCTET_STRING);
03158         Encode(enc, length);
03159         enc.MessageEnd();
03160 }
03161 
03162 void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
03163 {
03164         BERGeneralDecoder dec(bt, OCTET_STRING);
03165         if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
03166                 BERDecodeError();
03167         Decode(dec, length);
03168         dec.MessageEnd();
03169 }
03170 
03171 size_t Integer::OpenPGPEncode(byte *output, size_t len) const
03172 {
03173         ArraySink sink(output, len);
03174         return OpenPGPEncode(sink);
03175 }
03176 
03177 size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
03178 {
03179         word16 bitCount = BitCount();
03180         bt.PutWord16(bitCount);
03181         size_t byteCount = BitsToBytes(bitCount);
03182         Encode(bt, byteCount);
03183         return 2 + byteCount;
03184 }
03185 
03186 void Integer::OpenPGPDecode(const byte *input, size_t len)
03187 {
03188         StringStore store(input, len);
03189         OpenPGPDecode(store);
03190 }
03191 
03192 void Integer::OpenPGPDecode(BufferedTransformation &bt)
03193 {
03194         word16 bitCount;
03195         if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
03196                 throw OpenPGPDecodeErr();
03197         Decode(bt, BitsToBytes(bitCount));
03198 }
03199 
03200 void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
03201 {
03202         const size_t nbytes = nbits/8 + 1;
03203         SecByteBlock buf(nbytes);
03204         rng.GenerateBlock(buf, nbytes);
03205         if (nbytes)
03206                 buf[0] = (byte)Crop(buf[0], nbits % 8);
03207         Decode(buf, nbytes, UNSIGNED);
03208 }
03209 
03210 void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
03211 {
03212         if (min > max)
03213                 throw InvalidArgument("Integer: Min must be no greater than Max");
03214 
03215         Integer range = max - min;
03216         const unsigned int nbits = range.BitCount();
03217 
03218         do
03219         {
03220                 Randomize(rng, nbits);
03221         }
03222         while (*this > range);
03223 
03224         *this += min;
03225 }
03226 
03227 bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
03228 {
03229         return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
03230 }
03231 
03232 class KDF2_RNG : public RandomNumberGenerator
03233 {
03234 public:
03235         KDF2_RNG(const byte *seed, size_t seedSize)
03236                 : m_counter(0), m_counterAndSeed(seedSize + 4)
03237         {
03238                 memcpy(m_counterAndSeed + 4, seed, seedSize);
03239         }
03240 
03241         void GenerateBlock(byte *output, size_t size)
03242         {
03243                 PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
03244                 ++m_counter;
03245                 P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
03246         }
03247 
03248 private:
03249         word32 m_counter;
03250         SecByteBlock m_counterAndSeed;
03251 };
03252 
03253 bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
03254 {
03255         Integer min = params.GetValueWithDefault("Min", Integer::Zero());
03256         Integer max;
03257         if (!params.GetValue("Max", max))
03258         {
03259                 int bitLength;
03260                 if (params.GetIntValue("BitLength", bitLength))
03261                         max = Integer::Power2(bitLength);
03262                 else
03263                         throw InvalidArgument("Integer: missing Max argument");
03264         }
03265         if (min > max)
03266                 throw InvalidArgument("Integer: Min must be no greater than Max");
03267 
03268         Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
03269         Integer mod = params.GetValueWithDefault("Mod", Integer::One());
03270 
03271         if (equiv.IsNegative() || equiv >= mod)
03272                 throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
03273 
03274         Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
03275 
03276         member_ptr<KDF2_RNG> kdf2Rng;
03277         ConstByteArrayParameter seed;
03278         if (params.GetValue("Seed", seed))
03279         {
03280                 ByteQueue bq;
03281                 DERSequenceEncoder seq(bq);
03282                 min.DEREncode(seq);
03283                 max.DEREncode(seq);
03284                 equiv.DEREncode(seq);
03285                 mod.DEREncode(seq);
03286                 DEREncodeUnsigned(seq, rnType);
03287                 DEREncodeOctetString(seq, seed.begin(), seed.size());
03288                 seq.MessageEnd();
03289 
03290                 SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
03291                 bq.Get(finalSeed, finalSeed.size());
03292                 kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
03293         }
03294         RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
03295 
03296         switch (rnType)
03297         {
03298                 case ANY:
03299                         if (mod == One())
03300                                 Randomize(rng, min, max);
03301                         else
03302                         {
03303                                 Integer min1 = min + (equiv-min)%mod;
03304                                 if (max < min1)
03305                                         return false;
03306                                 Randomize(rng, Zero(), (max - min1) / mod);
03307                                 *this *= mod;
03308                                 *this += min1;
03309                         }
03310                         return true;
03311 
03312                 case PRIME:
03313                 {
03314                         const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULL);
03315 
03316                         int i;
03317                         i = 0;
03318                         while (1)
03319                         {
03320                                 if (++i==16)
03321                                 {
03322                                         // check if there are any suitable primes in [min, max]
03323                                         Integer first = min;
03324                                         if (FirstPrime(first, max, equiv, mod, pSelector))
03325                                         {
03326                                                 // if there is only one suitable prime, we're done
03327                                                 *this = first;
03328                                                 if (!FirstPrime(first, max, equiv, mod, pSelector))
03329                                                         return true;
03330                                         }
03331                                         else
03332                                                 return false;
03333                                 }
03334 
03335                                 Randomize(rng, min, max);
03336                                 if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
03337                                         return true;
03338                         }
03339                 }
03340 
03341                 default:
03342                         throw InvalidArgument("Integer: invalid RandomNumberType argument");
03343         }
03344 }
03345 
03346 std::istream& operator>>(std::istream& in, Integer &a)
03347 {
03348         char c;
03349         unsigned int length = 0;
03350         SecBlock<char> str(length + 16);
03351 
03352         std::ws(in);
03353 
03354         do
03355         {
03356                 in.read(&c, 1);
03357                 str[length++] = c;
03358                 if (length >= str.size())
03359                         str.Grow(length + 16);
03360         }
03361         while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
03362 
03363         if (in.gcount())
03364                 in.putback(c);
03365         str[length-1] = '\0';
03366         a = Integer(str);
03367 
03368         return in;
03369 }
03370 
03371 std::ostream& operator<<(std::ostream& out, const Integer &a)
03372 {
03373         // Get relevant conversion specifications from ostream.
03374         long f = out.flags() & std::ios::basefield; // Get base digits.
03375         int base, block;
03376         char suffix;
03377         switch(f)
03378         {
03379         case std::ios::oct :
03380                 base = 8;
03381                 block = 8;
03382                 suffix = 'o';
03383                 break;
03384         case std::ios::hex :
03385                 base = 16;
03386                 block = 4;
03387                 suffix = 'h';
03388                 break;
03389         default :
03390                 base = 10;
03391                 block = 3;
03392                 suffix = '.';
03393         }
03394 
03395         SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
03396         Integer temp1=a, temp2;
03397         unsigned i=0;
03398         const char vec[]="0123456789ABCDEF";
03399 
03400         if (a.IsNegative())
03401         {
03402                 out << '-';
03403                 temp1.Negate();
03404         }
03405 
03406         if (!a)
03407                 out << '0';
03408 
03409         while (!!temp1)
03410         {
03411                 word digit;
03412                 Integer::Divide(digit, temp2, temp1, base);
03413                 s[i++]=vec[digit];
03414                 temp1=temp2;
03415         }
03416 
03417         while (i--)
03418         {
03419                 out << s[i];
03420 //              if (i && !(i%block))
03421 //                      out << ",";
03422         }
03423         return out << suffix;
03424 }
03425 
03426 Integer& Integer::operator++()
03427 {
03428         if (NotNegative())
03429         {
03430                 if (Increment(reg, reg.size()))
03431                 {
03432                         reg.CleanGrow(2*reg.size());
03433                         reg[reg.size()/2]=1;
03434                 }
03435         }
03436         else
03437         {
03438                 word borrow = Decrement(reg, reg.size());
03439                 assert(!borrow);
03440                 if (WordCount()==0)
03441                         *this = Zero();
03442         }
03443         return *this;
03444 }
03445 
03446 Integer& Integer::operator--()
03447 {
03448         if (IsNegative())
03449         {
03450                 if (Increment(reg, reg.size()))
03451                 {
03452                         reg.CleanGrow(2*reg.size());
03453                         reg[reg.size()/2]=1;
03454                 }
03455         }
03456         else
03457         {
03458                 if (Decrement(reg, reg.size()))
03459                         *this = -One();
03460         }
03461         return *this;
03462 }
03463 
03464 void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
03465 {
03466         int carry;
03467         if (a.reg.size() == b.reg.size())
03468                 carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
03469         else if (a.reg.size() > b.reg.size())
03470         {
03471                 carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
03472                 CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
03473                 carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
03474         }
03475         else
03476         {
03477                 carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
03478                 CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
03479                 carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
03480         }
03481 
03482         if (carry)
03483         {
03484                 sum.reg.CleanGrow(2*sum.reg.size());
03485                 sum.reg[sum.reg.size()/2] = 1;
03486         }
03487         sum.sign = Integer::POSITIVE;
03488 }
03489 
03490 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
03491 {
03492         unsigned aSize = a.WordCount();
03493         aSize += aSize%2;
03494         unsigned bSize = b.WordCount();
03495         bSize += bSize%2;
03496 
03497         if (aSize == bSize)
03498         {
03499                 if (Compare(a.reg, b.reg, aSize) >= 0)
03500                 {
03501                         Subtract(diff.reg, a.reg, b.reg, aSize);
03502                         diff.sign = Integer::POSITIVE;
03503                 }
03504                 else
03505                 {
03506                         Subtract(diff.reg, b.reg, a.reg, aSize);
03507                         diff.sign = Integer::NEGATIVE;
03508                 }
03509         }
03510         else if (aSize > bSize)
03511         {
03512                 word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
03513                 CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
03514                 borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
03515                 assert(!borrow);
03516                 diff.sign = Integer::POSITIVE;
03517         }
03518         else
03519         {
03520                 word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
03521                 CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
03522                 borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
03523                 assert(!borrow);
03524                 diff.sign = Integer::NEGATIVE;
03525         }
03526 }
03527 
03528 // MSVC .NET 2003 workaround
03529 template <class T> inline const T& STDMAX2(const T& a, const T& b)
03530 {
03531         return a < b ? b : a;
03532 }
03533 
03534 Integer Integer::Plus(const Integer& b) const
03535 {
03536         Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
03537         if (NotNegative())
03538         {
03539                 if (b.NotNegative())
03540                         PositiveAdd(sum, *this, b);
03541                 else
03542                         PositiveSubtract(sum, *this, b);
03543         }
03544         else
03545         {
03546                 if (b.NotNegative())
03547                         PositiveSubtract(sum, b, *this);
03548                 else
03549                 {
03550                         PositiveAdd(sum, *this, b);
03551                         sum.sign = Integer::NEGATIVE;
03552                 }
03553         }
03554         return sum;
03555 }
03556 
03557 Integer& Integer::operator+=(const Integer& t)
03558 {
03559         reg.CleanGrow(t.reg.size());
03560         if (NotNegative())
03561         {
03562                 if (t.NotNegative())
03563                         PositiveAdd(*this, *this, t);
03564                 else
03565                         PositiveSubtract(*this, *this, t);
03566         }
03567         else
03568         {
03569                 if (t.NotNegative())
03570                         PositiveSubtract(*this, t, *this);
03571                 else
03572                 {
03573                         PositiveAdd(*this, *this, t);
03574                         sign = Integer::NEGATIVE;
03575                 }
03576         }
03577         return *this;
03578 }
03579 
03580 Integer Integer::Minus(const Integer& b) const
03581 {
03582         Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
03583         if (NotNegative())
03584         {
03585                 if (b.NotNegative())
03586                         PositiveSubtract(diff, *this, b);
03587                 else
03588                         PositiveAdd(diff, *this, b);
03589         }
03590         else
03591         {
03592                 if (b.NotNegative())
03593                 {
03594                         PositiveAdd(diff, *this, b);
03595                         diff.sign = Integer::NEGATIVE;
03596                 }
03597                 else
03598                         PositiveSubtract(diff, b, *this);
03599         }
03600         return diff;
03601 }
03602 
03603 Integer& Integer::operator-=(const Integer& t)
03604 {
03605         reg.CleanGrow(t.reg.size());
03606         if (NotNegative())
03607         {
03608                 if (t.NotNegative())
03609                         PositiveSubtract(*this, *this, t);
03610                 else
03611                         PositiveAdd(*this, *this, t);
03612         }
03613         else
03614         {
03615                 if (t.NotNegative())
03616                 {
03617                         PositiveAdd(*this, *this, t);
03618                         sign = Integer::NEGATIVE;
03619                 }
03620                 else
03621                         PositiveSubtract(*this, t, *this);
03622         }
03623         return *this;
03624 }
03625 
03626 Integer& Integer::operator<<=(size_t n)
03627 {
03628         const size_t wordCount = WordCount();
03629         const size_t shiftWords = n / WORD_BITS;
03630         const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
03631 
03632         reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
03633         ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
03634         ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
03635         return *this;
03636 }
03637 
03638 Integer& Integer::operator>>=(size_t n)
03639 {
03640         const size_t wordCount = WordCount();
03641         const size_t shiftWords = n / WORD_BITS;
03642         const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
03643 
03644         ShiftWordsRightByWords(reg, wordCount, shiftWords);
03645         if (wordCount > shiftWords)
03646                 ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
03647         if (IsNegative() && WordCount()==0)   // avoid -0
03648                 *this = Zero();
03649         return *this;
03650 }
03651 
03652 void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
03653 {
03654         size_t aSize = RoundupSize(a.WordCount());
03655         size_t bSize = RoundupSize(b.WordCount());
03656 
03657         product.reg.CleanNew(RoundupSize(aSize+bSize));
03658         product.sign = Integer::POSITIVE;
03659 
03660         IntegerSecBlock workspace(aSize + bSize);
03661         AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
03662 }
03663 
03664 void Multiply(Integer &product, const Integer &a, const Integer &b)
03665 {
03666         PositiveMultiply(product, a, b);
03667 
03668         if (a.NotNegative() != b.NotNegative())
03669                 product.Negate();
03670 }
03671 
03672 Integer Integer::Times(const Integer &b) const
03673 {
03674         Integer product;
03675         Multiply(product, *this, b);
03676         return product;
03677 }
03678 
03679 /*
03680 void PositiveDivide(Integer &remainder, Integer &quotient,
03681                                    const Integer &dividend, const Integer &divisor)
03682 {
03683         remainder.reg.CleanNew(divisor.reg.size());
03684         remainder.sign = Integer::POSITIVE;
03685         quotient.reg.New(0);
03686         quotient.sign = Integer::POSITIVE;
03687         unsigned i=dividend.BitCount();
03688         while (i--)
03689         {
03690                 word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
03691                 remainder.reg[0] |= dividend[i];
03692                 if (overflow || remainder >= divisor)
03693                 {
03694                         Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
03695                         quotient.SetBit(i);
03696                 }
03697         }
03698 }
03699 */
03700 
03701 void PositiveDivide(Integer &remainder, Integer &quotient,
03702                                    const Integer &a, const Integer &b)
03703 {
03704         unsigned aSize = a.WordCount();
03705         unsigned bSize = b.WordCount();
03706 
03707         if (!bSize)
03708                 throw Integer::DivideByZero();
03709 
03710         if (a.PositiveCompare(b) == -1)
03711         {
03712                 remainder = a;
03713                 remainder.sign = Integer::POSITIVE;
03714                 quotient = Integer::Zero();
03715                 return;
03716         }
03717 
03718         aSize += aSize%2;       // round up to next even number
03719         bSize += bSize%2;
03720 
03721         remainder.reg.CleanNew(RoundupSize(bSize));
03722         remainder.sign = Integer::POSITIVE;
03723         quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
03724         quotient.sign = Integer::POSITIVE;
03725 
03726         IntegerSecBlock T(aSize+3*(bSize+2));
03727         Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
03728 }
03729 
03730 void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
03731 {
03732         PositiveDivide(remainder, quotient, dividend, divisor);
03733 
03734         if (dividend.IsNegative())
03735         {
03736                 quotient.Negate();
03737                 if (remainder.NotZero())
03738                 {
03739                         --quotient;
03740                         remainder = divisor.AbsoluteValue() - remainder;
03741                 }
03742         }
03743 
03744         if (divisor.IsNegative())
03745                 quotient.Negate();
03746 }
03747 
03748 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
03749 {
03750         q = a;
03751         q >>= n;
03752 
03753         const size_t wordCount = BitsToWords(n);
03754         if (wordCount <= a.WordCount())
03755         {
03756                 r.reg.resize(RoundupSize(wordCount));
03757                 CopyWords(r.reg, a.reg, wordCount);
03758                 SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
03759                 if (n % WORD_BITS != 0)
03760                         r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
03761         }
03762         else
03763         {
03764                 r.reg.resize(RoundupSize(a.WordCount()));
03765                 CopyWords(r.reg, a.reg, r.reg.size());
03766         }
03767         r.sign = POSITIVE;
03768 
03769         if (a.IsNegative() && r.NotZero())
03770         {
03771                 --q;
03772                 r = Power2(n) - r;
03773         }
03774 }
03775 
03776 Integer Integer::DividedBy(const Integer &b) const
03777 {
03778         Integer remainder, quotient;
03779         Integer::Divide(remainder, quotient, *this, b);
03780         return quotient;
03781 }
03782 
03783 Integer Integer::Modulo(const Integer &b) const
03784 {
03785         Integer remainder, quotient;
03786         Integer::Divide(remainder, quotient, *this, b);
03787         return remainder;
03788 }
03789 
03790 void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
03791 {
03792         if (!divisor)
03793                 throw Integer::DivideByZero();
03794 
03795         assert(divisor);
03796 
03797         if ((divisor & (divisor-1)) == 0)       // divisor is a power of 2
03798         {
03799                 quotient = dividend >> (BitPrecision(divisor)-1);
03800                 remainder = dividend.reg[0] & (divisor-1);
03801                 return;
03802         }
03803 
03804         unsigned int i = dividend.WordCount();
03805         quotient.reg.CleanNew(RoundupSize(i));
03806         remainder = 0;
03807         while (i--)
03808         {
03809                 quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
03810                 remainder = DWord(dividend.reg[i], remainder) % divisor;
03811         }
03812 
03813         if (dividend.NotNegative())
03814                 quotient.sign = POSITIVE;
03815         else
03816         {
03817                 quotient.sign = NEGATIVE;
03818                 if (remainder)
03819                 {
03820                         --quotient;
03821                         remainder = divisor - remainder;
03822                 }
03823         }
03824 }
03825 
03826 Integer Integer::DividedBy(word b) const
03827 {
03828         word remainder;
03829         Integer quotient;
03830         Integer::Divide(remainder, quotient, *this, b);
03831         return quotient;
03832 }
03833 
03834 word Integer::Modulo(word divisor) const
03835 {
03836         if (!divisor)
03837                 throw Integer::DivideByZero();
03838 
03839         assert(divisor);
03840 
03841         word remainder;
03842 
03843         if ((divisor & (divisor-1)) == 0)       // divisor is a power of 2
03844                 remainder = reg[0] & (divisor-1);
03845         else
03846         {
03847                 unsigned int i = WordCount();
03848 
03849                 if (divisor <= 5)
03850                 {
03851                         DWord sum(0, 0);
03852                         while (i--)
03853                                 sum += reg[i];
03854                         remainder = sum % divisor;
03855                 }
03856                 else
03857                 {
03858                         remainder = 0;
03859                         while (i--)
03860                                 remainder = DWord(reg[i], remainder) % divisor;
03861                 }
03862         }
03863 
03864         if (IsNegative() && remainder)
03865                 remainder = divisor - remainder;
03866 
03867         return remainder;
03868 }
03869 
03870 void Integer::Negate()
03871 {
03872         if (!!(*this))  // don't flip sign if *this==0
03873                 sign = Sign(1-sign);
03874 }
03875 
03876 int Integer::PositiveCompare(const Integer& t) const
03877 {
03878         unsigned size = WordCount(), tSize = t.WordCount();
03879 
03880         if (size == tSize)
03881                 return CryptoPP::Compare(reg, t.reg, size);
03882         else
03883                 return size > tSize ? 1 : -1;
03884 }
03885 
03886 int Integer::Compare(const Integer& t) const
03887 {
03888         if (NotNegative())
03889         {
03890                 if (t.NotNegative())
03891                         return PositiveCompare(t);
03892                 else
03893                         return 1;
03894         }
03895         else
03896         {
03897                 if (t.NotNegative())
03898                         return -1;
03899                 else
03900                         return -PositiveCompare(t);
03901         }
03902 }
03903 
03904 Integer Integer::SquareRoot() const
03905 {
03906         if (!IsPositive())
03907                 return Zero();
03908 
03909         // overestimate square root
03910         Integer x, y = Power2((BitCount()+1)/2);
03911         assert(y*y >= *this);
03912 
03913         do
03914         {
03915                 x = y;
03916                 y = (x + *this/x) >> 1;
03917         } while (y<x);
03918 
03919         return x;
03920 }
03921 
03922 bool Integer::IsSquare() const
03923 {
03924         Integer r = SquareRoot();
03925         return *this == r.Squared();
03926 }
03927 
03928 bool Integer::IsUnit() const
03929 {
03930         return (WordCount() == 1) && (reg[0] == 1);
03931 }
03932 
03933 Integer Integer::MultiplicativeInverse() const
03934 {
03935         return IsUnit() ? *this : Zero();
03936 }
03937 
03938 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
03939 {
03940         return x*y%m;
03941 }
03942 
03943 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
03944 {
03945         ModularArithmetic mr(m);
03946         return mr.Exponentiate(x, e);
03947 }
03948 
03949 Integer Integer::Gcd(const Integer &a, const Integer &b)
03950 {
03951         return EuclideanDomainOf<Integer>().Gcd(a, b);
03952 }
03953 
03954 Integer Integer::InverseMod(const Integer &m) const
03955 {
03956         assert(m.NotNegative());
03957 
03958         if (IsNegative() || *this>=m)
03959                 return (*this%m).InverseMod(m);
03960 
03961         if (m.IsEven())
03962         {
03963                 if (!m || IsEven())
03964                         return Zero();  // no inverse
03965                 if (*this == One())
03966                         return One();
03967 
03968                 Integer u = m.InverseMod(*this);
03969                 return !u ? Zero() : (m*(*this-u)+1)/(*this);
03970         }
03971 
03972         SecBlock<word> T(m.reg.size() * 4);
03973         Integer r((word)0, m.reg.size());
03974         unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
03975         DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
03976         return r;
03977 }
03978 
03979 word Integer::InverseMod(word mod) const
03980 {
03981         word g0 = mod, g1 = *this % mod;
03982         word v0 = 0, v1 = 1;
03983         word y;
03984 
03985         while (g1)
03986         {
03987                 if (g1 == 1)
03988                         return v1;
03989                 y = g0 / g1;
03990                 g0 = g0 % g1;
03991                 v0 += y * v1;
03992 
03993                 if (!g0)
03994                         break;
03995                 if (g0 == 1)
03996                         return mod-v0;
03997                 y = g1 / g0;
03998                 g1 = g1 % g0;
03999                 v1 += y * v0;
04000         }
04001         return 0;
04002 }
04003 
04004 // ********************************************************
04005 
04006 ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
04007 {
04008         BERSequenceDecoder seq(bt);
04009         OID oid(seq);
04010         if (oid != ASN1::prime_field())
04011                 BERDecodeError();
04012         m_modulus.BERDecode(seq);
04013         seq.MessageEnd();
04014         m_result.reg.resize(m_modulus.reg.size());
04015 }
04016 
04017 void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
04018 {
04019         DERSequenceEncoder seq(bt);
04020         ASN1::prime_field().DEREncode(seq);
04021         m_modulus.DEREncode(seq);
04022         seq.MessageEnd();
04023 }
04024 
04025 void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
04026 {
04027         a.DEREncodeAsOctetString(out, MaxElementByteLength());
04028 }
04029 
04030 void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
04031 {
04032         a.BERDecodeAsOctetString(in, MaxElementByteLength());
04033 }
04034 
04035 const Integer& ModularArithmetic::Half(const Integer &a) const
04036 {
04037         if (a.reg.size()==m_modulus.reg.size())
04038         {
04039                 CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
04040                 return m_result;
04041         }
04042         else
04043                 return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
04044 }
04045 
04046 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
04047 {
04048         if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
04049         {
04050                 if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
04051                         || Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
04052                 {
04053                         CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
04054                 }
04055                 return m_result;
04056         }
04057         else
04058         {
04059                 m_result1 = a+b;
04060                 if (m_result1 >= m_modulus)
04061                         m_result1 -= m_modulus;
04062                 return m_result1;
04063         }
04064 }
04065 
04066 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
04067 {
04068         if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
04069         {
04070                 if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
04071                         || Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
04072                 {
04073                         CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
04074                 }
04075         }
04076         else
04077         {
04078                 a+=b;
04079                 if (a>=m_modulus)
04080                         a-=m_modulus;
04081         }
04082 
04083         return a;
04084 }
04085 
04086 const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
04087 {
04088         if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
04089         {
04090                 if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
04091                         CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
04092                 return m_result;
04093         }
04094         else
04095         {
04096                 m_result1 = a-b;
04097                 if (m_result1.IsNegative())
04098                         m_result1 += m_modulus;
04099                 return m_result1;
04100         }
04101 }
04102 
04103 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
04104 {
04105         if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
04106         {
04107                 if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
04108                         CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
04109         }
04110         else
04111         {
04112                 a-=b;
04113                 if (a.IsNegative())
04114                         a+=m_modulus;
04115         }
04116 
04117         return a;
04118 }
04119 
04120 const Integer& ModularArithmetic::Inverse(const Integer &a) const
04121 {
04122         if (!a)
04123                 return a;
04124 
04125         CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
04126         if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
04127                 Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
04128 
04129         return m_result;
04130 }
04131 
04132 Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
04133 {
04134         if (m_modulus.IsOdd())
04135         {
04136                 MontgomeryRepresentation dr(m_modulus);
04137                 return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
04138         }
04139         else
04140                 return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
04141 }
04142 
04143 void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
04144 {
04145         if (m_modulus.IsOdd())
04146         {
04147                 MontgomeryRepresentation dr(m_modulus);
04148                 dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
04149                 for (unsigned int i=0; i<exponentsCount; i++)
04150                         results[i] = dr.ConvertOut(results[i]);
04151         }
04152         else
04153                 AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
04154 }
04155 
04156 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)    // modulus must be odd
04157         : ModularArithmetic(m),
04158           m_u((word)0, m_modulus.reg.size()),
04159           m_workspace(5*m_modulus.reg.size())
04160 {
04161         if (!m_modulus.IsOdd())
04162                 throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
04163 
04164         RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
04165 }
04166 
04167 const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
04168 {
04169         word *const T = m_workspace.begin();
04170         word *const R = m_result.reg.begin();
04171         const size_t N = m_modulus.reg.size();
04172         assert(a.reg.size()<=N && b.reg.size()<=N);
04173 
04174         AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
04175         SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
04176         MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
04177         return m_result;
04178 }
04179 
04180 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
04181 {
04182         word *const T = m_workspace.begin();
04183         word *const R = m_result.reg.begin();
04184         const size_t N = m_modulus.reg.size();
04185         assert(a.reg.size()<=N);
04186 
04187         CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
04188         SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
04189         MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
04190         return m_result;
04191 }
04192 
04193 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
04194 {
04195         word *const T = m_workspace.begin();
04196         word *const R = m_result.reg.begin();
04197         const size_t N = m_modulus.reg.size();
04198         assert(a.reg.size()<=N);
04199 
04200         CopyWords(T, a.reg, a.reg.size());
04201         SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
04202         MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
04203         return m_result;
04204 }
04205 
04206 const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
04207 {
04208 //        return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
04209         word *const T = m_workspace.begin();
04210         word *const R = m_result.reg.begin();
04211         const size_t N = m_modulus.reg.size();
04212         assert(a.reg.size()<=N);
04213 
04214         CopyWords(T, a.reg, a.reg.size());
04215         SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
04216         MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
04217         unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
04218 
04219 //      cout << "k=" << k << " N*32=" << 32*N << endl;
04220 
04221         if (k>N*WORD_BITS)
04222                 DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
04223         else
04224                 MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
04225 
04226         return m_result;
04227 }
04228 
04229 NAMESPACE_END
04230 
04231 #endif