Crypto++  7.0
Free C++ class library of cryptographic schemes
adv_simd.h
1 // adv_simd.h - written and placed in the public domain by Jeffrey Walton
2 
3 /// \file adv_simd.h
4 /// \brief Template for AdvancedProcessBlocks and SIMD processing
5 
6 // The SIMD-based implementations for ciphers that use SSE, NEON and Power7
7 // have a common pattern. Namely, they have a specialized implementation of
8 // AdvancedProcessBlocks which processes multiple blocks using hardware
9 // acceleration. After several implementations we noticed a lot of copy and
10 // paste occurring. adv_simd.h provides a template to avoid the copy and paste.
11 //
12 // There are 11 templates provided in this file. The number following the
13 // function name, 64 or 128, is the block size. The name following the block
14 // size is the arrangement and acceleration. For example 4x1_SSE means Intel
15 // SSE using two encrypt (or decrypt) functions: one that operates on 4 SIMD
16 // words, and one that operates on 1 SIMD word.
17 //
18 // The distinction between SIMD words and cipher blocks is important
19 // because 64-bit ciphers use one SIMD word for two cipher blocks. For
20 // example, AdvancedProcessBlocks64_6x2_ALTIVEC operates on 6 and 2 SIMD
21 // words, which is 12 and 4 cipher blocks. The function will do the right
22 // thing even if there is only one 64-bit block to encrypt.
23 //
24 // * AdvancedProcessBlocks64_2x1_SSE
25 // * AdvancedProcessBlocks64_4x1_SSE
26 // * AdvancedProcessBlocks128_4x1_SSE
27 // * AdvancedProcessBlocks64_6x2_SSE
28 // * AdvancedProcessBlocks128_6x2_SSE
29 // * AdvancedProcessBlocks64_6x2_NEON
30 // * AdvancedProcessBlocks128_4x1_NEON
31 // * AdvancedProcessBlocks128_6x2_NEON
32 // * AdvancedProcessBlocks64_6x2_ALTIVEC
33 // * AdvancedProcessBlocks128_4x1_ALTIVEC
34 // * AdvancedProcessBlocks128_6x1_ALTIVEC
35 //
36 // If an arrangement ends in 2, like 6x2, then the template will handle the
37 // single block case by padding with 0's and using the two SIMD word
38 // function. This happens at most once when processing multiple blocks.
39 // The extra processing of a zero block is trivial and worth the tradeoff.
40 //
41 // The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
42 // of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
43 // results in a failed link due to the const/non-const mismatch.
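//
// A minimal usage sketch, for orientation only. The cipher name and the two
// block functions below are hypothetical, not part of this header; only the
// template and its parameters are real. A 64-bit block cipher with NEON code
// would supply a 2-word and a 6-word function and forward to the template:
//
//   inline void MyCipher_Enc_2_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//       const word32 *subkeys, unsigned int rounds) { /* ... */ }
//   inline void MyCipher_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
//       uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
//       uint32x4_t &block5, const word32 *subkeys, unsigned int rounds) { /* ... */ }
//
//   size_t MyCipher_Enc_AdvancedProcessBlocks_NEON(const word32 *subKeys,
//       size_t rounds, const byte *inBlocks, const byte *xorBlocks,
//       byte *outBlocks, size_t length, word32 flags)
//   {
//       return AdvancedProcessBlocks64_6x2_NEON(MyCipher_Enc_2_Blocks,
//           MyCipher_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks,
//           outBlocks, length, flags);
//   }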
44 
45 #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
46 #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
47 
48 #include "config.h"
49 #include "misc.h"
50 #include "stdcpp.h"
51 
52 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
53 # include <arm_neon.h>
54 #endif
55 
56 #if (CRYPTOPP_ARM_ACLE_AVAILABLE)
57 # include <stdint.h>
58 # include <arm_acle.h>
59 #endif
60 
61 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
62 # include <emmintrin.h>
63 # include <xmmintrin.h>
64 #endif
65 
66 // SunCC needs CRYPTOPP_SSSE3_AVAILABLE, too
67 #if (CRYPTOPP_SSSE3_AVAILABLE)
68 # include <emmintrin.h>
69 # include <pmmintrin.h>
70 # include <xmmintrin.h>
71 #endif
72 
73 #if defined(__ALTIVEC__)
74 # include "ppc_simd.h"
75 #endif
76 
77 #ifndef CRYPTOPP_INLINE
78 # if defined(CRYPTOPP_DEBUG)
79 # define CRYPTOPP_INLINE static
80 # else
81 # define CRYPTOPP_INLINE inline
82 # endif
83 #endif
84 
85 // ************************ All block ciphers *********************** //
86 
87 ANONYMOUS_NAMESPACE_BEGIN
88 
89 using CryptoPP::BlockTransformation;
90 
91 CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
92 CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
93 CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
94 CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
95 CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)
96 
97 ANONYMOUS_NAMESPACE_END
98 
99 // *************************** ARM NEON ************************** //
100 
101 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
102 
103 NAMESPACE_BEGIN(CryptoPP)
104 
105 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
106 /// \tparam F2 function to process 2 64-bit blocks
107 /// \tparam F6 function to process 6 64-bit blocks
108 /// \tparam W word type of the subkey table
109 /// \details AdvancedProcessBlocks64_6x2_NEON processes 6 and 2 NEON SIMD words
110 /// at a time. For a single block the template uses F2 with a zero block.
111 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
112 /// same word type.
113 template <typename F2, typename F6, typename W>
114 CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
115  const W *subKeys, size_t rounds, const byte *inBlocks,
116  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
117 {
118  CRYPTOPP_ASSERT(subKeys);
119  CRYPTOPP_ASSERT(inBlocks);
120  CRYPTOPP_ASSERT(outBlocks);
121  CRYPTOPP_ASSERT(length >= 8);
122 
123 #if (CRYPTOPP_LITTLE_ENDIAN)
124  const uint32x4_t s_one = {0, 0, 0, 1<<24};
125  const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
126 #else
127  // TODO: verify these constants on ARM-BE
128  const uint32x4_t s_one = {0, 0, 0, 1};
129  const uint32x4_t s_two = {0, 2, 0, 2};
130 #endif
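 // Why 1<<24: the 8-byte big-endian counter is duplicated into the NEON word
 // below, so lane 3 holds bytes 4..7 of the high copy. On little-endian ARM,
 // adding 1<<24 to that lane increments vector byte 15, i.e. counter byte 7,
 // which matches the scalar inBlocks[7]++ used in the single-block tail.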
131 
132  const size_t blockSize = 8;
133  const size_t neonBlockSize = 16;
134 
135  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
136  size_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
137  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;
138 
139  // Clang and Coverity are generating findings using xorBlocks as a flag.
140  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
141  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
142 
143  if (flags & BT_ReverseDirection)
144  {
145  inBlocks = PtrAdd(inBlocks, length - neonBlockSize);
146  xorBlocks = PtrAdd(xorBlocks, length - neonBlockSize);
147  outBlocks = PtrAdd(outBlocks, length - neonBlockSize);
148  inIncrement = 0-inIncrement;
149  xorIncrement = 0-xorIncrement;
150  outIncrement = 0-outIncrement;
151  }
152 
153  if (flags & BT_AllowParallel)
154  {
155  while (length >= 6*neonBlockSize)
156  {
157  uint32x4_t block0, block1, block2, block3, block4, block5;
158  if (flags & BT_InBlockIsCounter)
159  {
160  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
161  // After the dup load we have two counters in the NEON word. Then we need
162  // to increment the low ctr by 0 and the high ctr by 1.
163  const uint8x8_t ctr = vld1_u8(inBlocks);
164  block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
165 
166  // After initial increment of {0,1} remaining counters increment by {2,2}.
167  block1 = vaddq_u32(s_two, block0);
168  block2 = vaddq_u32(s_two, block1);
169  block3 = vaddq_u32(s_two, block2);
170  block4 = vaddq_u32(s_two, block3);
171  block5 = vaddq_u32(s_two, block4);
172 
173  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
174  vreinterpretq_u8_u32(vaddq_u32(s_two, block5))));
175  }
176  else
177  {
178  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
179  inBlocks = PtrAdd(inBlocks, inIncrement);
180  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
181  inBlocks = PtrAdd(inBlocks, inIncrement);
182  block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
183  inBlocks = PtrAdd(inBlocks, inIncrement);
184  block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
185  inBlocks = PtrAdd(inBlocks, inIncrement);
186  block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
187  inBlocks = PtrAdd(inBlocks, inIncrement);
188  block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
189  inBlocks = PtrAdd(inBlocks, inIncrement);
190  }
191 
192  if (xorInput)
193  {
194  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
195  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
196  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
197  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
198  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
199  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
200  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
201  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
202  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
203  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
204  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
205  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
206  }
207 
208  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
209 
210  if (xorOutput)
211  {
212  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
213  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
214  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
215  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
216  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
217  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
218  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
219  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
220  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
221  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
222  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
223  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
224  }
225 
226  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
227  outBlocks = PtrAdd(outBlocks, outIncrement);
228  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
229  outBlocks = PtrAdd(outBlocks, outIncrement);
230  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
231  outBlocks = PtrAdd(outBlocks, outIncrement);
232  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
233  outBlocks = PtrAdd(outBlocks, outIncrement);
234  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
235  outBlocks = PtrAdd(outBlocks, outIncrement);
236  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
237  outBlocks = PtrAdd(outBlocks, outIncrement);
238 
239  length -= 6*neonBlockSize;
240  }
241 
242  while (length >= 2*neonBlockSize)
243  {
244  uint32x4_t block0, block1;
245  if (flags & BT_InBlockIsCounter)
246  {
247  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
248  // After the dup load we have two counters in the NEON word. Then we need
249  // to increment the low ctr by 0 and the high ctr by 1.
250  const uint8x8_t ctr = vld1_u8(inBlocks);
251  block0 = vaddq_u32(s_one, vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
252 
253  // After initial increment of {0,1} remaining counters increment by {2,2}.
254  block1 = vaddq_u32(s_two, block0);
255 
256  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
257  vreinterpretq_u8_u32(vaddq_u32(s_two, block1))));
258  }
259  else
260  {
261  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
262  inBlocks = PtrAdd(inBlocks, inIncrement);
263  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
264  inBlocks = PtrAdd(inBlocks, inIncrement);
265  }
266 
267  if (xorInput)
268  {
269  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
270  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
271  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
272  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
273  }
274 
275  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
276 
277  if (xorOutput)
278  {
279  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
280  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
281  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
282  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
283  }
284 
285  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
286  outBlocks = PtrAdd(outBlocks, outIncrement);
287  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
288  outBlocks = PtrAdd(outBlocks, outIncrement);
289 
290  length -= 2*neonBlockSize;
291  }
292  }
293 
294  if (length)
295  {
296  // Adjust to real block size
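 // The parallel loops above stride by 16-byte SIMD words (two 64-bit blocks
 // at a time); the tail below works one 8-byte block at a time, so non-zero
 // increments are rescaled to the 8-byte stride and, in the reverse
 // direction, the pointers are repositioned to match.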
297  if (flags & BT_ReverseDirection)
298  {
299  inIncrement += inIncrement ? blockSize : 0;
300  xorIncrement += xorIncrement ? blockSize : 0;
301  outIncrement += outIncrement ? blockSize : 0;
302  inBlocks = PtrSub(inBlocks, inIncrement);
303  xorBlocks = PtrSub(xorBlocks, xorIncrement);
304  outBlocks = PtrSub(outBlocks, outIncrement);
305  }
306  else
307  {
308  inIncrement -= inIncrement ? blockSize : 0;
309  xorIncrement -= xorIncrement ? blockSize : 0;
310  outIncrement -= outIncrement ? blockSize : 0;
311  }
312 
313  while (length >= blockSize)
314  {
315  uint32x4_t block, zero = {0};
316 
317  const uint8x8_t v = vld1_u8(inBlocks);
318  block = vreinterpretq_u32_u8(vcombine_u8(v,v));
319 
320  if (xorInput)
321  {
322  const uint8x8_t x = vld1_u8(xorBlocks);
323  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
324  }
325 
326  if (flags & BT_InBlockIsCounter)
327  const_cast<byte *>(inBlocks)[7]++;
328 
329  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
330 
331  if (xorOutput)
332  {
333  const uint8x8_t x = vld1_u8(xorBlocks);
334  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
335  }
336 
337  vst1_u8(const_cast<byte*>(outBlocks),
338  vget_low_u8(vreinterpretq_u8_u32(block)));
339 
340  inBlocks = PtrAdd(inBlocks, inIncrement);
341  outBlocks = PtrAdd(outBlocks, outIncrement);
342  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
343  length -= blockSize;
344  }
345  }
346 
347  return length;
348 }
349 
350 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
351 /// \tparam F1 function to process 1 128-bit block
352 /// \tparam F6 function to process 6 128-bit blocks
353 /// \tparam W word type of the subkey table
354 /// \details AdvancedProcessBlocks128_6x1_NEON processes 6 and 1 NEON SIMD words
355 /// at a time. For a single block the template uses F1.
356 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
357 /// same word type.
358 template <typename F1, typename F6, typename W>
359 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
360  const W *subKeys, size_t rounds, const byte *inBlocks,
361  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
362 {
363  CRYPTOPP_ASSERT(subKeys);
364  CRYPTOPP_ASSERT(inBlocks);
365  CRYPTOPP_ASSERT(outBlocks);
366  CRYPTOPP_ASSERT(length >= 16);
367 
368 #if (CRYPTOPP_LITTLE_ENDIAN)
369  const uint32x4_t s_one = {0, 0, 0, 1<<24};
370  //const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
371 #else
372  // TODO: verify these constants on ARM-BE
373  const uint32x4_t s_one = {0, 0, 0, 1};
374  //const uint32x4_t s_two = {0, 2, 0, 2};
375 #endif
376 
377  const size_t blockSize = 16;
378  // const size_t neonBlockSize = 16;
379 
380  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
381  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
382  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
383 
384  // Clang and Coverity are generating findings using xorBlocks as a flag.
385  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
386  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
387 
388  if (flags & BT_ReverseDirection)
389  {
390  inBlocks = PtrAdd(inBlocks, length - blockSize);
391  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
392  outBlocks = PtrAdd(outBlocks, length - blockSize);
393  inIncrement = 0-inIncrement;
394  xorIncrement = 0-xorIncrement;
395  outIncrement = 0-outIncrement;
396  }
397 
398  if (flags & BT_AllowParallel)
399  {
400  while (length >= 6*blockSize)
401  {
402  uint64x2_t block0, block1, block2, block3, block4, block5;
403  if (flags & BT_InBlockIsCounter)
404  {
405  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
406  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
407  block1 = vaddq_u64(block0, one);
408  block2 = vaddq_u64(block1, one);
409  block3 = vaddq_u64(block2, one);
410  block4 = vaddq_u64(block3, one);
411  block5 = vaddq_u64(block4, one);
412  vst1q_u8(const_cast<byte*>(inBlocks),
413  vreinterpretq_u8_u64(vaddq_u64(block5, one)));
414  }
415  else
416  {
417  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
418  inBlocks = PtrAdd(inBlocks, inIncrement);
419  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
420  inBlocks = PtrAdd(inBlocks, inIncrement);
421  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
422  inBlocks = PtrAdd(inBlocks, inIncrement);
423  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
424  inBlocks = PtrAdd(inBlocks, inIncrement);
425  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
426  inBlocks = PtrAdd(inBlocks, inIncrement);
427  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
428  inBlocks = PtrAdd(inBlocks, inIncrement);
429  }
430 
431  if (xorInput)
432  {
433  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
434  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
435  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
436  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
437  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
438  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
439  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
440  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
441  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
442  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
443  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
444  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
445  }
446 
447  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
448 
449  if (xorOutput)
450  {
451  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
452  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
453  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
454  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
455  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
456  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
457  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
458  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
459  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
460  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
461  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
462  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
463  }
464 
465  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
466  outBlocks = PtrAdd(outBlocks, outIncrement);
467  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
468  outBlocks = PtrAdd(outBlocks, outIncrement);
469  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
470  outBlocks = PtrAdd(outBlocks, outIncrement);
471  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
472  outBlocks = PtrAdd(outBlocks, outIncrement);
473  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
474  outBlocks = PtrAdd(outBlocks, outIncrement);
475  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
476  outBlocks = PtrAdd(outBlocks, outIncrement);
477 
478  length -= 6*blockSize;
479  }
480  }
481 
482  while (length >= blockSize)
483  {
484  uint64x2_t block;
485  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
486 
487  if (xorInput)
488  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
489 
490  if (flags & BT_InBlockIsCounter)
491  const_cast<byte *>(inBlocks)[15]++;
492 
493  func1(block, subKeys, static_cast<unsigned int>(rounds));
494 
495  if (xorOutput)
496  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
497 
498  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
499 
500  inBlocks = PtrAdd(inBlocks, inIncrement);
501  outBlocks = PtrAdd(outBlocks, outIncrement);
502  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
503  length -= blockSize;
504  }
505 
506  return length;
507 }
508 
509 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
510 /// \tparam F1 function to process 1 128-bit block
511 /// \tparam F4 function to process 4 128-bit blocks
512 /// \tparam W word type of the subkey table
513 /// \tparam V vector type of the NEON datatype
514 /// \details AdvancedProcessBlocks128_4x1_NEON processes 4 and 1 NEON SIMD words
515 /// at a time.
516 /// \details The subkey type is usually word32 or word64. V is the vector type and it is
517 /// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
518 /// vector type. The V parameter is used to avoid template argument
519 /// deduction/substitution failures.
520 template <typename F1, typename F4, typename W, typename V>
521 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
522  const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
523  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
524 {
525  CRYPTOPP_ASSERT(subKeys);
526  CRYPTOPP_ASSERT(inBlocks);
527  CRYPTOPP_ASSERT(outBlocks);
528  CRYPTOPP_ASSERT(length >= 16);
529  CRYPTOPP_UNUSED(unused);
530 
531 #if (CRYPTOPP_LITTLE_ENDIAN)
532  const uint32x4_t s_one = {0, 0, 0, 1<<24};
533  //const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
534 #else
535  // TODO: verify these constants on ARM-BE
536  const uint32x4_t s_one = {0, 0, 0, 1};
537  //const uint32x4_t s_two = {0, 2, 0, 2};
538 #endif
539 
540  const size_t blockSize = 16;
541  // const size_t neonBlockSize = 16;
542 
543  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
544  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
545  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
546 
547  // Clang and Coverity are generating findings using xorBlocks as a flag.
548  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
549  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
550 
551  if (flags & BT_ReverseDirection)
552  {
553  inBlocks = PtrAdd(inBlocks, length - blockSize);
554  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
555  outBlocks = PtrAdd(outBlocks, length - blockSize);
556  inIncrement = 0-inIncrement;
557  xorIncrement = 0-xorIncrement;
558  outIncrement = 0-outIncrement;
559  }
560 
561  if (flags & BT_AllowParallel)
562  {
563  while (length >= 4*blockSize)
564  {
565  uint64x2_t block0, block1, block2, block3;
566  if (flags & BT_InBlockIsCounter)
567  {
568  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
569  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
570  block1 = vaddq_u64(block0, one);
571  block2 = vaddq_u64(block1, one);
572  block3 = vaddq_u64(block2, one);
573  vst1q_u8(const_cast<byte*>(inBlocks),
574  vreinterpretq_u8_u64(vaddq_u64(block3, one)));
575  }
576  else
577  {
578  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
579  inBlocks = PtrAdd(inBlocks, inIncrement);
580  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
581  inBlocks = PtrAdd(inBlocks, inIncrement);
582  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
583  inBlocks = PtrAdd(inBlocks, inIncrement);
584  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
585  inBlocks = PtrAdd(inBlocks, inIncrement);
586  }
587 
588  if (xorInput)
589  {
590  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
591  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
592  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
593  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
594  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
595  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
596  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
597  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
598  }
599 
600  func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast<unsigned int>(rounds));
601 
602  if (xorOutput)
603  {
604  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
605  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
606  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
607  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
608  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
609  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
610  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
611  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
612  }
613 
614  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
615  outBlocks = PtrAdd(outBlocks, outIncrement);
616  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
617  outBlocks = PtrAdd(outBlocks, outIncrement);
618  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
619  outBlocks = PtrAdd(outBlocks, outIncrement);
620  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
621  outBlocks = PtrAdd(outBlocks, outIncrement);
622 
623  length -= 4*blockSize;
624  }
625  }
626 
627  while (length >= blockSize)
628  {
629  uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
630 
631  if (xorInput)
632  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
633 
634  if (flags & BT_InBlockIsCounter)
635  const_cast<byte *>(inBlocks)[15]++;
636 
637  func1( (V&)block, subKeys, static_cast<unsigned int>(rounds));
638 
639  if (xorOutput)
640  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
641 
642  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
643 
644  inBlocks = PtrAdd(inBlocks, inIncrement);
645  outBlocks = PtrAdd(outBlocks, outIncrement);
646  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
647  length -= blockSize;
648  }
649 
650  return length;
651 }
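
// Note on the V parameter above: the caller pins the vector type explicitly
// by passing a throwaway value as the third argument, which keeps F1/F4 that
// take uint64x2_t& (or uint32x4_t&) arguments from tripping template argument
// deduction. An illustrative call, with hypothetical cipher functions:
//
//   return AdvancedProcessBlocks128_4x1_NEON(MyCipher_Enc_Block,
//       MyCipher_Enc_4_Blocks, vdupq_n_u64(0), subKeys, rounds,
//       inBlocks, xorBlocks, outBlocks, length, flags);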
652 
653 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
654 /// \tparam F2 function to process 2 128-bit blocks
655 /// \tparam F6 function to process 6 128-bit blocks
656 /// \tparam W word type of the subkey table
657 /// \details AdvancedProcessBlocks128_6x2_NEON processes 6 and 2 NEON SIMD words
658 /// at a time. For a single block the template uses F2 with a zero block.
659 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
660 /// same word type.
661 template <typename F2, typename F6, typename W>
662 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
663  const W *subKeys, size_t rounds, const byte *inBlocks,
664  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
665 {
666  CRYPTOPP_ASSERT(subKeys);
667  CRYPTOPP_ASSERT(inBlocks);
668  CRYPTOPP_ASSERT(outBlocks);
669  CRYPTOPP_ASSERT(length >= 16);
670 
671 #if (CRYPTOPP_LITTLE_ENDIAN)
672  const uint32x4_t s_one = {0, 0, 0, 1<<24};
673  //const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
674 #else
675  // TODO: verify these constants on ARM-BE
676  const uint32x4_t s_one = {0, 0, 0, 1};
677  //const uint32x4_t s_two = {0, 2, 0, 2};
678 #endif
679 
680  const size_t blockSize = 16;
681  // const size_t neonBlockSize = 16;
682 
683  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
684  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
685  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
686 
687  // Clang and Coverity are generating findings using xorBlocks as a flag.
688  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
689  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
690 
691  if (flags & BT_ReverseDirection)
692  {
693  inBlocks = PtrAdd(inBlocks, length - blockSize);
694  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
695  outBlocks = PtrAdd(outBlocks, length - blockSize);
696  inIncrement = 0-inIncrement;
697  xorIncrement = 0-xorIncrement;
698  outIncrement = 0-outIncrement;
699  }
700 
701  if (flags & BT_AllowParallel)
702  {
703  while (length >= 6*blockSize)
704  {
705  uint64x2_t block0, block1, block2, block3, block4, block5;
706  if (flags & BT_InBlockIsCounter)
707  {
708  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
709  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
710  block1 = vaddq_u64(block0, one);
711  block2 = vaddq_u64(block1, one);
712  block3 = vaddq_u64(block2, one);
713  block4 = vaddq_u64(block3, one);
714  block5 = vaddq_u64(block4, one);
715  vst1q_u8(const_cast<byte*>(inBlocks),
716  vreinterpretq_u8_u64(vaddq_u64(block5, one)));
717  }
718  else
719  {
720  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
721  inBlocks = PtrAdd(inBlocks, inIncrement);
722  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
723  inBlocks = PtrAdd(inBlocks, inIncrement);
724  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
725  inBlocks = PtrAdd(inBlocks, inIncrement);
726  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
727  inBlocks = PtrAdd(inBlocks, inIncrement);
728  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
729  inBlocks = PtrAdd(inBlocks, inIncrement);
730  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
731  inBlocks = PtrAdd(inBlocks, inIncrement);
732  }
733 
734  if (xorInput)
735  {
736  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
737  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
738  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
739  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
740  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
741  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
742  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
743  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
744  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
745  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
746  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
747  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
748  }
749 
750  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
751 
752  if (xorOutput)
753  {
754  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
755  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
756  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
757  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
758  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
759  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
760  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
761  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
762  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
763  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
764  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
765  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
766  }
767 
768  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
769  outBlocks = PtrAdd(outBlocks, outIncrement);
770  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
771  outBlocks = PtrAdd(outBlocks, outIncrement);
772  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
773  outBlocks = PtrAdd(outBlocks, outIncrement);
774  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
775  outBlocks = PtrAdd(outBlocks, outIncrement);
776  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
777  outBlocks = PtrAdd(outBlocks, outIncrement);
778  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
779  outBlocks = PtrAdd(outBlocks, outIncrement);
780 
781  length -= 6*blockSize;
782  }
783 
784  while (length >= 2*blockSize)
785  {
786  uint64x2_t block0, block1;
787  if (flags & BT_InBlockIsCounter)
788  {
789  const uint64x2_t one = vreinterpretq_u64_u32(s_one);
790  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
791  block1 = vaddq_u64(block0, one);
792  vst1q_u8(const_cast<byte*>(inBlocks),
793  vreinterpretq_u8_u64(vaddq_u64(block1, one)));
794  }
795  else
796  {
797  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
798  inBlocks = PtrAdd(inBlocks, inIncrement);
799  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
800  inBlocks = PtrAdd(inBlocks, inIncrement);
801  }
802 
803  if (xorInput)
804  {
805  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
806  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
807  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
808  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
809  }
810 
811  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
812 
813  if (xorOutput)
814  {
815  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
816  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
817  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
818  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
819  }
820 
821  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
822  outBlocks = PtrAdd(outBlocks, outIncrement);
823  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
824  outBlocks = PtrAdd(outBlocks, outIncrement);
825 
826  length -= 2*blockSize;
827  }
828  }
829 
830  while (length >= blockSize)
831  {
832  uint64x2_t block, zero = {0,0};
833  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
834 
835  if (xorInput)
836  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
837 
838  if (flags & BT_InBlockIsCounter)
839  const_cast<byte *>(inBlocks)[15]++;
840 
841  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
842 
843  if (xorOutput)
844  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
845 
846  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
847 
848  inBlocks = PtrAdd(inBlocks, inIncrement);
849  outBlocks = PtrAdd(outBlocks, outIncrement);
850  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
851  length -= blockSize;
852  }
853 
854  return length;
855 }
856 
857 NAMESPACE_END // CryptoPP
858 
859 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
860 
861 // *************************** Intel SSE ************************** //
862 
863 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
864 
865 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
866 #if (__SUNPRO_CC >= 0x5130)
867 # define MAYBE_CONST
868 # define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
869 #else
870 # define MAYBE_CONST const
871 # define MAYBE_UNCONST_CAST(T, x) (x)
872 #endif
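
// In effect: under SunCC 5.13+ MAYBE_CONST expands to nothing, so the subkey
// parameter of the SSE templates below is a plain (non-const) pointer, and
// MAYBE_UNCONST_CAST(word32*, p) const_casts a caller's const table to match.
// On other compilers the parameter stays const and the cast is a no-op. An
// illustrative call (cipher function names are placeholders):
//
//   AdvancedProcessBlocks64_2x1_SSE(MyCipher_Enc_Block, MyCipher_Enc_2_Blocks,
//       MAYBE_UNCONST_CAST(word32*, keys), rounds, inBlocks, xorBlocks,
//       outBlocks, length, flags);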
873 
874 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
875 #ifndef M128_CAST
876 # define M128_CAST(x) ((__m128i *)(void *)(x))
877 #endif
878 #ifndef CONST_M128_CAST
879 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
880 #endif
881 
882 NAMESPACE_BEGIN(CryptoPP)
883 
884 /// \brief AdvancedProcessBlocks for 1 and 2 blocks
885 /// \tparam F1 function to process 1 64-bit block
886 /// \tparam F2 function to process 2 64-bit blocks
887 /// \tparam W word type of the subkey table
888 /// \details AdvancedProcessBlocks64_2x1_SSE processes 2 and 1 SSE SIMD words
889 /// at a time.
890 /// \details The subkey type is usually word32 or word64. F1 and F2 must use the
891 /// same word type.
892 template <typename F1, typename F2, typename W>
893 CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
894  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
895  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
896 {
897  CRYPTOPP_ASSERT(subKeys);
898  CRYPTOPP_ASSERT(inBlocks);
899  CRYPTOPP_ASSERT(outBlocks);
900  CRYPTOPP_ASSERT(length >= 8);
901 
902  const size_t blockSize = 8;
903  const size_t xmmBlockSize = 16;
904 
905  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
906  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
907  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
908 
909  // Clang and Coverity are generating findings using xorBlocks as a flag.
910  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
911  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
912 
913  if (flags & BT_ReverseDirection)
914  {
915  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
916  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
917  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
918  inIncrement = 0-inIncrement;
919  xorIncrement = 0-xorIncrement;
920  outIncrement = 0-outIncrement;
921  }
922 
923  if (flags & BT_AllowParallel)
924  {
925  double temp[2];
926  while (length >= 2*xmmBlockSize)
927  {
928  __m128i block0, block1;
929  if (flags & BT_InBlockIsCounter)
930  {
931  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
932  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
933  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
934 
935  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
936  // After the dup load we have two counters in the XMM word. Then we need
937  // to increment the low ctr by 0 and the high ctr by 1.
938  std::memcpy(temp, inBlocks, blockSize);
939  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
940 
941  // After initial increment of {0,1} remaining counters increment by {2,2}.
942  block1 = _mm_add_epi32(s_two, block0);
943 
944  // Store the next counter. When BT_InBlockIsCounter is set then
945  // inBlocks is backed by m_counterArray which is non-const.
946  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
947  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
948  }
949  else
950  {
951  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
952  inBlocks = PtrAdd(inBlocks, inIncrement);
953  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
954  inBlocks = PtrAdd(inBlocks, inIncrement);
955  }
956 
957  if (xorInput)
958  {
959  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
960  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
961  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
962  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
963  }
964 
965  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
966 
967  if (xorOutput)
968  {
969  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
970  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
971  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
972  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
973  }
974 
975  _mm_storeu_si128(M128_CAST(outBlocks), block0);
976  outBlocks = PtrAdd(outBlocks, outIncrement);
977  _mm_storeu_si128(M128_CAST(outBlocks), block1);
978  outBlocks = PtrAdd(outBlocks, outIncrement);
979 
980  length -= 2*xmmBlockSize;
981  }
982  }
983 
984  if (length)
985  {
986  // Adjust to real block size
987  if (flags & BT_ReverseDirection)
988  {
989  inIncrement += inIncrement ? blockSize : 0;
990  xorIncrement += xorIncrement ? blockSize : 0;
991  outIncrement += outIncrement ? blockSize : 0;
992  inBlocks = PtrSub(inBlocks, inIncrement);
993  xorBlocks = PtrSub(xorBlocks, xorIncrement);
994  outBlocks = PtrSub(outBlocks, outIncrement);
995  }
996  else
997  {
998  inIncrement -= inIncrement ? blockSize : 0;
999  xorIncrement -= xorIncrement ? blockSize : 0;
1000  outIncrement -= outIncrement ? blockSize : 0;
1001  }
1002 
1003  while (length >= blockSize)
1004  {
1005  double temp[2];
1006  std::memcpy(temp, inBlocks, blockSize);
1007  __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1008 
1009  if (xorInput)
1010  {
1011  std::memcpy(temp, xorBlocks, blockSize);
1012  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1013  }
1014 
1015  if (flags & BT_InBlockIsCounter)
1016  const_cast<byte *>(inBlocks)[7]++;
1017 
1018  func1(block, subKeys, static_cast<unsigned int>(rounds));
1019 
1020  if (xorOutput)
1021  {
1022  std::memcpy(temp, xorBlocks, blockSize);
1023  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1024  }
1025 
1026  _mm_store_sd(temp, _mm_castsi128_pd(block));
1027  std::memcpy(outBlocks, temp, blockSize);
1028 
1029  inBlocks = PtrAdd(inBlocks, inIncrement);
1030  outBlocks = PtrAdd(outBlocks, outIncrement);
1031  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1032  length -= blockSize;
1033  }
1034  }
1035 
1036  return length;
1037 }
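
// Shapes of F1 and F2 expected by the template above (names are placeholders,
// not part of this header). Each function transforms its block(s) in place:
//
//   inline void MyCipher_Enc_Block(__m128i &block,
//       MAYBE_CONST word32 *subkeys, unsigned int rounds);
//   inline void MyCipher_Enc_2_Blocks(__m128i &block0, __m128i &block1,
//       MAYBE_CONST word32 *subkeys, unsigned int rounds);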
1038 
1039 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1040 /// \tparam F2 function to process 2 64-bit blocks
1041 /// \tparam F6 function to process 6 64-bit blocks
1042 /// \tparam W word type of the subkey table
1043 /// \details AdvancedProcessBlocks64_6x2_SSE processes 6 and 2 SSE SIMD words
1044 /// at a time. For a single block the template uses F2 with a zero block.
1045 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1046 /// same word type.
1047 template <typename F2, typename F6, typename W>
1048 CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
1049  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1050  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1051 {
1052  CRYPTOPP_ASSERT(subKeys);
1053  CRYPTOPP_ASSERT(inBlocks);
1054  CRYPTOPP_ASSERT(outBlocks);
1055  CRYPTOPP_ASSERT(length >= 8);
1056 
1057  const size_t blockSize = 8;
1058  const size_t xmmBlockSize = 16;
1059 
1060  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1061  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1062  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1063 
1064  // Clang and Coverity are generating findings using xorBlocks as a flag.
1065  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1066  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1067 
1068  if (flags & BT_ReverseDirection)
1069  {
1070  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
1071  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
1072  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
1073  inIncrement = 0-inIncrement;
1074  xorIncrement = 0-xorIncrement;
1075  outIncrement = 0-outIncrement;
1076  }
1077 
1078  if (flags & BT_AllowParallel)
1079  {
1080  double temp[2];
1081  while (length >= 6*xmmBlockSize)
1082  {
1083  __m128i block0, block1, block2, block3, block4, block5;
1084  if (flags & BT_InBlockIsCounter)
1085  {
1086  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
1087  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1088  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1089 
1090  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1091  // After the dup load we have two counters in the XMM word. Then we need
1092  // to increment the low ctr by 0 and the high ctr by 1.
1093  std::memcpy(temp, inBlocks, blockSize);
1094  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1095 
1096  // After initial increment of {0,1} remaining counters increment by {2,2}.
1097  block1 = _mm_add_epi32(s_two, block0);
1098  block2 = _mm_add_epi32(s_two, block1);
1099  block3 = _mm_add_epi32(s_two, block2);
1100  block4 = _mm_add_epi32(s_two, block3);
1101  block5 = _mm_add_epi32(s_two, block4);
1102 
1103  // Store the next counter. When BT_InBlockIsCounter is set then
1104  // inBlocks is backed by m_counterArray which is non-const.
1105  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi32(s_two, block5)));
1106  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1107  }
1108  else
1109  {
1110  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1111  inBlocks = PtrAdd(inBlocks, inIncrement);
1112  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1113  inBlocks = PtrAdd(inBlocks, inIncrement);
1114  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1115  inBlocks = PtrAdd(inBlocks, inIncrement);
1116  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1117  inBlocks = PtrAdd(inBlocks, inIncrement);
1118  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1119  inBlocks = PtrAdd(inBlocks, inIncrement);
1120  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1121  inBlocks = PtrAdd(inBlocks, inIncrement);
1122  }
1123 
1124  if (xorInput)
1125  {
1126  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1127  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1128  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1129  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1130  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1131  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1132  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1133  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1134  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1135  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1136  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1137  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1138  }
1139 
1140  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1141 
1142  if (xorOutput)
1143  {
1144  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1145  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1146  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1147  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1148  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1149  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1150  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1151  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1152  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1153  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1154  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1155  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1156  }
1157 
1158  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1159  outBlocks = PtrAdd(outBlocks, outIncrement);
1160  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1161  outBlocks = PtrAdd(outBlocks, outIncrement);
1162  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1163  outBlocks = PtrAdd(outBlocks, outIncrement);
1164  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1165  outBlocks = PtrAdd(outBlocks, outIncrement);
1166  _mm_storeu_si128(M128_CAST(outBlocks), block4);
1167  outBlocks = PtrAdd(outBlocks, outIncrement);
1168  _mm_storeu_si128(M128_CAST(outBlocks), block5);
1169  outBlocks = PtrAdd(outBlocks, outIncrement);
1170 
1171  length -= 6*xmmBlockSize;
1172  }
1173 
1174  while (length >= 2*xmmBlockSize)
1175  {
1176  __m128i block0, block1;
1177  if (flags & BT_InBlockIsCounter)
1178  {
1179  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
1180  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1181  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1182 
1183  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1184  // After the dup load we have two counters in the XMM word. Then we need
1185  // to increment the low ctr by 0 and the high ctr by 1.
1186  std::memcpy(temp, inBlocks, blockSize);
1187  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1188 
1189  // After initial increment of {0,1} remaining counters increment by {2,2}.
1190  block1 = _mm_add_epi32(s_two, block0);
1191 
1192  // Store the next counter. When BT_InBlockIsCounter is set then
1193  // inBlocks is backed by m_counterArray which is non-const.
1194  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block1)));
1195  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1196  }
1197  else
1198  {
1199  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1200  inBlocks = PtrAdd(inBlocks, inIncrement);
1201  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1202  inBlocks = PtrAdd(inBlocks, inIncrement);
1203  }
1204 
1205  if (xorInput)
1206  {
1207  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1208  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1209  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1210  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1211  }
1212 
1213  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1214 
1215  if (xorOutput)
1216  {
1217  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1218  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1219  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1220  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1221  }
1222 
1223  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1224  outBlocks = PtrAdd(outBlocks, outIncrement);
1225  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1226  outBlocks = PtrAdd(outBlocks, outIncrement);
1227 
1228  length -= 2*xmmBlockSize;
1229  }
1230  }
1231 
1232  if (length)
1233  {
1234  // Adjust to real block size
1235  if (flags & BT_ReverseDirection)
1236  {
1237  inIncrement += inIncrement ? blockSize : 0;
1238  xorIncrement += xorIncrement ? blockSize : 0;
1239  outIncrement += outIncrement ? blockSize : 0;
1240  inBlocks = PtrSub(inBlocks, inIncrement);
1241  xorBlocks = PtrSub(xorBlocks, xorIncrement);
1242  outBlocks = PtrSub(outBlocks, outIncrement);
1243  }
1244  else
1245  {
1246  inIncrement -= inIncrement ? blockSize : 0;
1247  xorIncrement -= xorIncrement ? blockSize : 0;
1248  outIncrement -= outIncrement ? blockSize : 0;
1249  }
1250 
1251  while (length >= blockSize)
1252  {
1253  double temp[2];
1254  __m128i block, zero = _mm_setzero_si128();
1255  std::memcpy(temp, inBlocks, blockSize);
1256  block = _mm_castpd_si128(_mm_load_sd(temp));
1257 
1258  if (xorInput)
1259  {
1260  std::memcpy(temp, xorBlocks, blockSize);
1261  block = _mm_xor_si128(block,
1262  _mm_castpd_si128(_mm_load_sd(temp)));
1263  }
1264 
1265  if (flags & BT_InBlockIsCounter)
1266  const_cast<byte *>(inBlocks)[7]++;
1267 
1268  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1269 
1270  if (xorOutput)
1271  {
1272  std::memcpy(temp, xorBlocks, blockSize);
1273  block = _mm_xor_si128(block,
1274  _mm_castpd_si128(_mm_load_sd(temp)));
1275  }
1276 
1277  _mm_store_sd(temp, _mm_castsi128_pd(block));
1278  std::memcpy(outBlocks, temp, blockSize);
1279 
1280  inBlocks = PtrAdd(inBlocks, inIncrement);
1281  outBlocks = PtrAdd(outBlocks, outIncrement);
1282  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1283  length -= blockSize;
1284  }
1285  }
1286 
1287  return length;
1288 }
1289 
1290 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1291 /// \tparam F2 function to process 2 128-bit blocks
1292 /// \tparam F6 function to process 6 128-bit blocks
1293 /// \tparam W word type of the subkey table
1294 /// \details AdvancedProcessBlocks128_6x2_SSE processes 6 and 2 SSE SIMD words
1295 /// at a time. For a single block the template uses F2 with a zero block.
1296 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1297 /// same word type.
1298 template <typename F2, typename F6, typename W>
1299 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
1300  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1301  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1302 {
1303  CRYPTOPP_ASSERT(subKeys);
1304  CRYPTOPP_ASSERT(inBlocks);
1305  CRYPTOPP_ASSERT(outBlocks);
1306  CRYPTOPP_ASSERT(length >= 16);
1307 
1308  const size_t blockSize = 16;
1309  // const size_t xmmBlockSize = 16;
1310 
1311  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1312  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1313  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1314 
1315  // Clang and Coverity are generating findings using xorBlocks as a flag.
1316  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1317  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1318 
1319  if (flags & BT_ReverseDirection)
1320  {
1321  inBlocks = PtrAdd(inBlocks, length - blockSize);
1322  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1323  outBlocks = PtrAdd(outBlocks, length - blockSize);
1324  inIncrement = 0-inIncrement;
1325  xorIncrement = 0-xorIncrement;
1326  outIncrement = 0-outIncrement;
1327  }
1328 
1329  if (flags & BT_AllowParallel)
1330  {
1331  while (length >= 6*blockSize)
1332  {
1333  __m128i block0, block1, block2, block3, block4, block5;
1334  if (flags & BT_InBlockIsCounter)
1335  {
1336  // Increment of 1 in big-endian compatible with the ctr byte array.
1337  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1338  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1339  block1 = _mm_add_epi32(block0, s_one);
1340  block2 = _mm_add_epi32(block1, s_one);
1341  block3 = _mm_add_epi32(block2, s_one);
1342  block4 = _mm_add_epi32(block3, s_one);
1343  block5 = _mm_add_epi32(block4, s_one);
1344  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, s_one));
1345  }
1346  else
1347  {
1348  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1349  inBlocks = PtrAdd(inBlocks, inIncrement);
1350  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1351  inBlocks = PtrAdd(inBlocks, inIncrement);
1352  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1353  inBlocks = PtrAdd(inBlocks, inIncrement);
1354  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1355  inBlocks = PtrAdd(inBlocks, inIncrement);
1356  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1357  inBlocks = PtrAdd(inBlocks, inIncrement);
1358  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1359  inBlocks = PtrAdd(inBlocks, inIncrement);
1360  }
1361 
1362  if (xorInput)
1363  {
1364  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1365  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1366  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1367  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1368  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1369  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1370  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1371  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1372  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1373  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1374  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1375  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1376  }
1377 
1378  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1379 
1380  if (xorOutput)
1381  {
1382  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1383  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1384  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1385  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1386  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1387  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1388  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1389  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1390  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1391  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1392  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1393  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1394  }
1395 
1396  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1397  outBlocks = PtrAdd(outBlocks, outIncrement);
1398  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1399  outBlocks = PtrAdd(outBlocks, outIncrement);
1400  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1401  outBlocks = PtrAdd(outBlocks, outIncrement);
1402  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1403  outBlocks = PtrAdd(outBlocks, outIncrement);
1404  _mm_storeu_si128(M128_CAST(outBlocks), block4);
1405  outBlocks = PtrAdd(outBlocks, outIncrement);
1406  _mm_storeu_si128(M128_CAST(outBlocks), block5);
1407  outBlocks = PtrAdd(outBlocks, outIncrement);
1408 
1409  length -= 6*blockSize;
1410  }
1411 
1412  while (length >= 2*blockSize)
1413  {
1414  __m128i block0, block1;
1415  if (flags & BT_InBlockIsCounter)
1416  {
1417  // Increment of 1 in big-endian compatible with the ctr byte array.
1418  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1419  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1420  block1 = _mm_add_epi32(block0, s_one);
1421  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, s_one));
1422  }
1423  else
1424  {
1425  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1426  inBlocks = PtrAdd(inBlocks, inIncrement);
1427  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1428  inBlocks = PtrAdd(inBlocks, inIncrement);
1429  }
1430 
1431  if (xorInput)
1432  {
1433  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1434  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1435  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1436  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1437  }
1438 
1439  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1440 
1441  if (xorOutput)
1442  {
1443  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1444  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1445  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1446  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1447  }
1448 
1449  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1450  outBlocks = PtrAdd(outBlocks, outIncrement);
1451  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1452  outBlocks = PtrAdd(outBlocks, outIncrement);
1453 
1454  length -= 2*blockSize;
1455  }
1456  }
1457 
1458  while (length >= blockSize)
1459  {
1460  __m128i block, zero = _mm_setzero_si128();
1461  block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1462 
1463  if (xorInput)
1464  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1465 
1466  if (flags & BT_InBlockIsCounter)
1467  const_cast<byte *>(inBlocks)[15]++;
1468 
1469  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1470 
1471  if (xorOutput)
1472  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1473 
1474  _mm_storeu_si128(M128_CAST(outBlocks), block);
1475 
1476  inBlocks = PtrAdd(inBlocks, inIncrement);
1477  outBlocks = PtrAdd(outBlocks, outIncrement);
1478  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1479  length -= blockSize;
1480  }
1481 
1482  return length;
1483 }
1484 
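// Illustrative sketch (editor's example, not part of adv_simd.h): why the
// constant _mm_set_epi32(1<<24, 0, 0, 0) used in the loops above acts as a
// "+1" on the big-endian counter block. After _mm_loadu_si128 the counter's
// last byte, ctr[15], lands in the most significant byte of the register, so
// adding 0x01000000 to the high 32-bit lane bumps exactly that byte. A wrap
// from 0xFF to 0x00 does not carry into ctr[14]; per the comments later in
// this file, CTR_ModePolicy detects the wrap afterwards. ExampleCtrIncrement
// is a hypothetical name.
inline void ExampleCtrIncrement(byte ctr[16])
{
    const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
    __m128i block = _mm_loadu_si128(CONST_M128_CAST(ctr));
    block = _mm_add_epi32(block, s_one);
    _mm_storeu_si128(M128_CAST(ctr), block);   // ctr[15] incremented by one
}
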
1485 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
1486 /// \tparam F1 function to process 1 128-bit block
1487 /// \tparam F4 function to process 4 128-bit blocks
1488 /// \tparam W word type of the subkey table
1489 /// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
1490 /// at a time.
1491 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1492 /// same word type.
1493 template <typename F1, typename F4, typename W>
1494 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1495  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1496  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1497 {
1498  CRYPTOPP_ASSERT(subKeys);
1499  CRYPTOPP_ASSERT(inBlocks);
1500  CRYPTOPP_ASSERT(outBlocks);
1501  CRYPTOPP_ASSERT(length >= 16);
1502 
1503  const size_t blockSize = 16;
1504  // const size_t xmmBlockSize = 16;
1505 
1506  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1507  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1508  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1509 
1510  // Clang and Coverity are generating findings using xorBlocks as a flag.
1511  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1512  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1513 
1514  if (flags & BT_ReverseDirection)
1515  {
1516  inBlocks = PtrAdd(inBlocks, length - blockSize);
1517  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
1518  outBlocks = PtrAdd(outBlocks, length - blockSize);
1519  inIncrement = 0-inIncrement;
1520  xorIncrement = 0-xorIncrement;
1521  outIncrement = 0-outIncrement;
1522  }
1523 
1524  if (flags & BT_AllowParallel)
1525  {
1526  while (length >= 4*blockSize)
1527  {
1528  __m128i block0, block1, block2, block3;
1529  if (flags & BT_InBlockIsCounter)
1530  {
1531  // Increment of 1 in big-endian compatible with the ctr byte array.
1532  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1533  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1534  block1 = _mm_add_epi32(block0, s_one);
1535  block2 = _mm_add_epi32(block1, s_one);
1536  block3 = _mm_add_epi32(block2, s_one);
1537  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, s_one));
1538  }
1539  else
1540  {
1541  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1542  inBlocks = PtrAdd(inBlocks, inIncrement);
1543  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1544  inBlocks = PtrAdd(inBlocks, inIncrement);
1545  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1546  inBlocks = PtrAdd(inBlocks, inIncrement);
1547  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1548  inBlocks = PtrAdd(inBlocks, inIncrement);
1549  }
1550 
1551  if (xorInput)
1552  {
1553  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1554  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1555  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1556  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1557  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1558  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1559  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1560  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1561  }
1562 
1563  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1564 
1565  if (xorOutput)
1566  {
1567  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1568  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1569  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1570  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1571  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1572  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1573  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1574  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1575  }
1576 
1577  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1578  outBlocks = PtrAdd(outBlocks, outIncrement);
1579  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1580  outBlocks = PtrAdd(outBlocks, outIncrement);
1581  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1582  outBlocks = PtrAdd(outBlocks, outIncrement);
1583  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1584  outBlocks = PtrAdd(outBlocks, outIncrement);
1585 
1586  length -= 4*blockSize;
1587  }
1588  }
1589 
1590  while (length >= blockSize)
1591  {
1592  __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1593 
1594  if (xorInput)
1595  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1596 
1597  if (flags & BT_InBlockIsCounter)
1598  const_cast<byte *>(inBlocks)[15]++;
1599 
1600  func1(block, subKeys, static_cast<unsigned int>(rounds));
1601 
1602  if (xorOutput)
1603  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1604 
1605  _mm_storeu_si128(M128_CAST(outBlocks), block);
1606 
1607  inBlocks = PtrAdd(inBlocks, inIncrement);
1608  outBlocks = PtrAdd(outBlocks, outIncrement);
1609  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1610  length -= blockSize;
1611  }
1612 
1613  return length;
1614 }
1615 
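// Illustrative sketch (editor's example, not part of adv_simd.h): how a
// 128-bit block cipher typically wires its SSE code into the template above.
// Everything named Example* is hypothetical; a real cipher performs its
// round function where the XOR with a broadcast subkey appears, and the
// 4-block function usually interleaves the four words for better throughput.
inline void ExampleEnc1_SSE(__m128i &block,
    const word32 *subkeys, unsigned int rounds)
{
    // Dummy rounds; the sketch assumes one subkey word per round.
    for (unsigned int i = 0; i < rounds; ++i)
        block = _mm_xor_si128(block, _mm_set1_epi32(static_cast<int>(subkeys[i])));
}

inline void ExampleEnc4_SSE(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3,
    const word32 *subkeys, unsigned int rounds)
{
    ExampleEnc1_SSE(block0, subkeys, rounds);
    ExampleEnc1_SSE(block1, subkeys, rounds);
    ExampleEnc1_SSE(block2, subkeys, rounds);
    ExampleEnc1_SSE(block3, subkeys, rounds);
}

inline size_t ExampleEnc_AdvancedProcessBlocks_SSE(const word32 *subKeys,
    size_t rounds, const byte *inBlocks, const byte *xorBlocks,
    byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_4x1_SSE(ExampleEnc1_SSE, ExampleEnc4_SSE,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
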
1616 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
1617 /// \tparam F1 function to process 1 64-bit block
1618 /// \tparam F4 function to process 4 64-bit blocks
1619 /// \tparam W word type of the subkey table
1620 /// \details AdvancedProcessBlocks64_4x1_SSE processes 4 and 1 SSE SIMD words
1621 /// at a time.
1622 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
1623 /// same word type.
1624 template <typename F1, typename F4, typename W>
1625 CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_4x1_SSE(F1 func1, F4 func4,
1626  MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
1627  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1628 {
1629  CRYPTOPP_ASSERT(subKeys);
1630  CRYPTOPP_ASSERT(inBlocks);
1631  CRYPTOPP_ASSERT(outBlocks);
1632  CRYPTOPP_ASSERT(length >= 8);
1633 
1634  const size_t blockSize = 8;
1635  const size_t xmmBlockSize = 16;
1636 
1637  size_t inIncrement = (flags & (BT_InBlockIsCounter | BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
1638  size_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
1639  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
1640 
1641  // Clang and Coverity are generating findings using xorBlocks as a flag.
1642  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1643  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1644 
1645  if (flags & BT_ReverseDirection)
1646  {
1647  inBlocks = PtrAdd(inBlocks, length - xmmBlockSize);
1648  xorBlocks = PtrAdd(xorBlocks, length - xmmBlockSize);
1649  outBlocks = PtrAdd(outBlocks, length - xmmBlockSize);
1650  inIncrement = 0 - inIncrement;
1651  xorIncrement = 0 - xorIncrement;
1652  outIncrement = 0 - outIncrement;
1653  }
1654 
1655  if (flags & BT_AllowParallel)
1656  {
1657  while (length >= 4*xmmBlockSize)
1658  {
1659  __m128i block0, block1, block2, block3;
1660  if (flags & BT_InBlockIsCounter)
1661  {
1662  // Increment of 1 and 2 in big-endian compatible with the ctr byte array.
1663  const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
1664  const __m128i s_two = _mm_set_epi32(2<<24, 0, 2<<24, 0);
1665  double temp[2];
1666 
1667  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
1668  // After the dup load we have two counters in the XMM word. Then we need
1669  // to increment the low ctr by 0 and the high ctr by 1.
1670  std::memcpy(temp, inBlocks, blockSize);
1671  block0 = _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
1672 
1673  // After initial increment of {0,1} remaining counters increment by {2,2}.
1674  block1 = _mm_add_epi32(s_two, block0);
1675  block2 = _mm_add_epi32(s_two, block1);
1676  block3 = _mm_add_epi32(s_two, block2);
1677 
1678  // Store the next counter. When BT_InBlockIsCounter is set then
1679  // inBlocks is backed by m_counterArray which is non-const.
1680  _mm_store_sd(temp, _mm_castsi128_pd(_mm_add_epi64(s_two, block3)));
1681  std::memcpy(const_cast<byte*>(inBlocks), temp, blockSize);
1682  }
1683  else
1684  {
1685  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1686  inBlocks = PtrAdd(inBlocks, inIncrement);
1687  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1688  inBlocks = PtrAdd(inBlocks, inIncrement);
1689  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1690  inBlocks = PtrAdd(inBlocks, inIncrement);
1691  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1692  inBlocks = PtrAdd(inBlocks, inIncrement);
1693  }
1694 
1695  if (xorInput)
1696  {
1697  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1698  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1699  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1700  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1701  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1702  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1703  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1704  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1705  }
1706 
1707  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1708 
1709  if (xorOutput)
1710  {
1711  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1712  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1713  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1714  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1715  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1716  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1717  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1718  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1719  }
1720 
1721  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1722  outBlocks = PtrAdd(outBlocks, outIncrement);
1723  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1724  outBlocks = PtrAdd(outBlocks, outIncrement);
1725  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1726  outBlocks = PtrAdd(outBlocks, outIncrement);
1727  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1728  outBlocks = PtrAdd(outBlocks, outIncrement);
1729 
1730  length -= 4*xmmBlockSize;
1731  }
1732  }
1733 
1734  if (length)
1735  {
1736  // Adjust to real block size
1737  if (flags & BT_ReverseDirection)
1738  {
1739  inIncrement += inIncrement ? blockSize : 0;
1740  xorIncrement += xorIncrement ? blockSize : 0;
1741  outIncrement += outIncrement ? blockSize : 0;
1742  inBlocks = PtrSub(inBlocks, inIncrement);
1743  xorBlocks = PtrSub(xorBlocks, xorIncrement);
1744  outBlocks = PtrSub(outBlocks, outIncrement);
1745  }
1746  else
1747  {
1748  inIncrement -= inIncrement ? blockSize : 0;
1749  xorIncrement -= xorIncrement ? blockSize : 0;
1750  outIncrement -= outIncrement ? blockSize : 0;
1751  }
1752 
1753  while (length >= blockSize)
1754  {
1755  double temp[2];
1756  std::memcpy(temp, inBlocks, blockSize);
1757  __m128i block = _mm_castpd_si128(_mm_load_sd(temp));
1758 
1759  if (xorInput)
1760  {
1761  std::memcpy(temp, xorBlocks, blockSize);
1762  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1763  }
1764 
1765  if (flags & BT_InBlockIsCounter)
1766  const_cast<byte *>(inBlocks)[7]++;
1767 
1768  func1(block, subKeys, static_cast<unsigned int>(rounds));
1769 
1770  if (xorOutput)
1771  {
1772  std::memcpy(temp, xorBlocks, blockSize);
1773  block = _mm_xor_si128(block, _mm_castpd_si128(_mm_load_sd(temp)));
1774  }
1775 
1776  _mm_store_sd(temp, _mm_castsi128_pd(block));
1777  std::memcpy(outBlocks, temp, blockSize);
1778 
1779  inBlocks = PtrAdd(inBlocks, inIncrement);
1780  outBlocks = PtrAdd(outBlocks, outIncrement);
1781  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1782  length -= blockSize;
1783  }
1784  }
1785 
1786  return length;
1787 }
1788 
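// Illustrative sketch (editor's example, not part of adv_simd.h): how the
// 64-bit CTR path above packs two consecutive counters into one XMM word.
// The 8-byte big-endian counter is duplicated into both halves with
// _mm_loaddup_pd, then s_one bumps the last byte of the high copy only,
// producing {n, n+1}; the following SIMD words are formed by adding the
// {+2, +2} constant s_two. ExamplePackTwoCounters is a hypothetical name.
inline __m128i ExamplePackTwoCounters(const byte ctr[8])
{
    const __m128i s_one = _mm_set_epi32(1<<24, 0, 0, 0);
    double temp[2];
    std::memcpy(temp, ctr, 8);
    // Low 8 bytes now hold counter n, high 8 bytes hold counter n+1.
    return _mm_add_epi32(s_one, _mm_castpd_si128(_mm_loaddup_pd(temp)));
}
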
1789 NAMESPACE_END // CryptoPP
1790 
1791 #endif // CRYPTOPP_SSSE3_AVAILABLE
1792 
1793 // *********************** Altivec/Power 4 ********************** //
1794 
1795 #if defined(__ALTIVEC__)
1796 
1797 NAMESPACE_BEGIN(CryptoPP)
1798 
1799 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
1800 /// \tparam F2 function to process 2 128-bit blocks
1801 /// \tparam F6 function to process 6 128-bit blocks
1802 /// \tparam W word type of the subkey table
1803 /// \details AdvancedProcessBlocks64_6x2_ALTIVEC processes 6 and 2 Altivec SIMD words
1804 /// at a time. For a single block the template uses F2 with a zero block.
1805 /// \details The subkey type is usually word32 or word64. F2 and F6 must use the
1806 /// same word type.
1807 template <typename F2, typename F6, typename W>
1808 CRYPTOPP_INLINE size_t AdvancedProcessBlocks64_6x2_ALTIVEC(F2 func2, F6 func6,
1809  const W *subKeys, size_t rounds, const byte *inBlocks,
1810  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1811 {
1812  CRYPTOPP_ASSERT(subKeys);
1813  CRYPTOPP_ASSERT(inBlocks);
1814  CRYPTOPP_ASSERT(outBlocks);
1815  CRYPTOPP_ASSERT(length >= 8);
1816 
1817 #if (CRYPTOPP_LITTLE_ENDIAN)
1818  enum {LowOffset=8, HighOffset=0};
1819  const uint32x4_p s_one = {1,0,0,0};
1820  const uint32x4_p s_two = {2,0,2,0};
1821 #else
1822  enum {LowOffset=8, HighOffset=0};
1823  const uint32x4_p s_one = {0,0,0,1};
1824  const uint32x4_p s_two = {0,2,0,2};
1825 #endif
1826 
1827  const size_t blockSize = 8;
1828  const size_t vsxBlockSize = 16;
1829  CRYPTOPP_ALIGN_DATA(16) uint8_t temp[16];
1830 
1831  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : vsxBlockSize;
1832  size_t xorIncrement = (xorBlocks != NULLPTR) ? vsxBlockSize : 0;
1833  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : vsxBlockSize;
1834 
1835  // Clang and Coverity are generating findings using xorBlocks as a flag.
1836  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1837  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1838 
1839  if (flags & BT_ReverseDirection)
1840  {
1841  inBlocks = PtrAdd(inBlocks, length - vsxBlockSize);
1842  xorBlocks = PtrAdd(xorBlocks, length - vsxBlockSize);
1843  outBlocks = PtrAdd(outBlocks, length - vsxBlockSize);
1844  inIncrement = 0-inIncrement;
1845  xorIncrement = 0-xorIncrement;
1846  outIncrement = 0-outIncrement;
1847  }
1848 
1849  if (flags & BT_AllowParallel)
1850  {
1851  while (length >= 6*vsxBlockSize)
1852  {
1853  uint32x4_p block0, block1, block2, block3, block4, block5;
1854  if (flags & BT_InBlockIsCounter)
1855  {
1856  // There is no easy way to load 8 bytes into a vector. It is
1857  // even harder without POWER8 due to lack of 64-bit elements.
1858  std::memcpy(temp+LowOffset, inBlocks, 8);
1859  std::memcpy(temp+HighOffset, inBlocks, 8);
1860  uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1861 
1862  // For 64-bit block ciphers we need to load the CTR block,
1863  // which is 8 bytes. After the dup load we have two counters
1864  // in the Altivec word. Then we need to increment the low ctr
1865  // by 0 and the high ctr by 1.
1866  block0 = VecAdd(s_one, ctr);
1867 
1868  // After initial increment of {0,1} remaining counters
1869  // increment by {2,2}.
1870  block1 = VecAdd(s_two, block0);
1871  block2 = VecAdd(s_two, block1);
1872  block3 = VecAdd(s_two, block2);
1873  block4 = VecAdd(s_two, block3);
1874  block5 = VecAdd(s_two, block4);
1875 
1876  // Update the counter in the caller.
1877  const_cast<byte*>(inBlocks)[7] += 12;
1878  }
1879  else
1880  {
1881  block0 = VecLoadBE(inBlocks);
1882  inBlocks = PtrAdd(inBlocks, inIncrement);
1883  block1 = VecLoadBE(inBlocks);
1884  inBlocks = PtrAdd(inBlocks, inIncrement);
1885  block2 = VecLoadBE(inBlocks);
1886  inBlocks = PtrAdd(inBlocks, inIncrement);
1887  block3 = VecLoadBE(inBlocks);
1888  inBlocks = PtrAdd(inBlocks, inIncrement);
1889  block4 = VecLoadBE(inBlocks);
1890  inBlocks = PtrAdd(inBlocks, inIncrement);
1891  block5 = VecLoadBE(inBlocks);
1892  inBlocks = PtrAdd(inBlocks, inIncrement);
1893  }
1894 
1895  if (xorInput)
1896  {
1897  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1898  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1899  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1900  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1901  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1902  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1903  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1904  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1905  block4 = VecXor(block4, VecLoadBE(xorBlocks));
1906  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1907  block5 = VecXor(block5, VecLoadBE(xorBlocks));
1908  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1909  }
1910 
1911  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1912 
1913  if (xorOutput)
1914  {
1915  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1916  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1917  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1918  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1919  block2 = VecXor(block2, VecLoadBE(xorBlocks));
1920  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1921  block3 = VecXor(block3, VecLoadBE(xorBlocks));
1922  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1923  block4 = VecXor(block4, VecLoadBE(xorBlocks));
1924  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1925  block5 = VecXor(block5, VecLoadBE(xorBlocks));
1926  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1927  }
1928 
1929  VecStoreBE(block0, outBlocks);
1930  outBlocks = PtrAdd(outBlocks, outIncrement);
1931  VecStoreBE(block1, outBlocks);
1932  outBlocks = PtrAdd(outBlocks, outIncrement);
1933  VecStoreBE(block2, outBlocks);
1934  outBlocks = PtrAdd(outBlocks, outIncrement);
1935  VecStoreBE(block3, outBlocks);
1936  outBlocks = PtrAdd(outBlocks, outIncrement);
1937  VecStoreBE(block4, outBlocks);
1938  outBlocks = PtrAdd(outBlocks, outIncrement);
1939  VecStoreBE(block5, outBlocks);
1940  outBlocks = PtrAdd(outBlocks, outIncrement);
1941 
1942  length -= 6*vsxBlockSize;
1943  }
1944 
1945  while (length >= 2*vsxBlockSize)
1946  {
1947  uint32x4_p block0, block1;
1948  if (flags & BT_InBlockIsCounter)
1949  {
1950  // There is no easy way to load 8 bytes into a vector. It is
1951  // even harder without POWER8 due to lack of 64-bit elements.
1952  std::memcpy(temp+LowOffset, inBlocks, 8);
1953  std::memcpy(temp+HighOffset, inBlocks, 8);
1954  uint32x4_p ctr = (uint32x4_p)VecLoadBE(temp);
1955 
1956  // For 64-bit block ciphers we need to load the CTR block,
1957  // which is 8 bytes. After the dup load we have two counters
1958  // in the Altivec word. Then we need to increment the low ctr
1959  // by 0 and the high ctr by 1.
1960  block0 = VecAdd(s_one, ctr);
1961 
1962  // After initial increment of {0,1} remaining counters
1963  // increment by {2,2}.
1964  block1 = VecAdd(s_two, block0);
1965 
1966  // Update the counter in the caller.
1967  const_cast<byte*>(inBlocks)[7] += 4;
1968  }
1969  else
1970  {
1971  block0 = VecLoadBE(inBlocks);
1972  inBlocks = PtrAdd(inBlocks, inIncrement);
1973  block1 = VecLoadBE(inBlocks);
1974  inBlocks = PtrAdd(inBlocks, inIncrement);
1975  }
1976 
1977  if (xorInput)
1978  {
1979  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1980  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1981  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1982  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1983  }
1984 
1985  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1986 
1987  if (xorOutput)
1988  {
1989  block0 = VecXor(block0, VecLoadBE(xorBlocks));
1990  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1991  block1 = VecXor(block1, VecLoadBE(xorBlocks));
1992  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
1993  }
1994 
1995  VecStoreBE(block0, outBlocks);
1996  outBlocks = PtrAdd(outBlocks, outIncrement);
1997  VecStoreBE(block1, outBlocks);
1998  outBlocks = PtrAdd(outBlocks, outIncrement);
1999 
2000  length -= 2*vsxBlockSize;
2001  }
2002  }
2003 
2004  if (length)
2005  {
2006  // Adjust to real block size
2007  if (flags & BT_ReverseDirection)
2008  {
2009  inIncrement += inIncrement ? blockSize : 0;
2010  xorIncrement += xorIncrement ? blockSize : 0;
2011  outIncrement += outIncrement ? blockSize : 0;
2012  inBlocks = PtrSub(inBlocks, inIncrement);
2013  xorBlocks = PtrSub(xorBlocks, xorIncrement);
2014  outBlocks = PtrSub(outBlocks, outIncrement);
2015  }
2016  else
2017  {
2018  inIncrement -= inIncrement ? blockSize : 0;
2019  xorIncrement -= xorIncrement ? blockSize : 0;
2020  outIncrement -= outIncrement ? blockSize : 0;
2021  }
2022 
2023  while (length >= blockSize)
2024  {
2025  uint32x4_p block, zero = {0};
2026 
2027  // There is no easy way to load 8 bytes into a vector. It is
2028  // even harder without POWER8 due to lack of 64-bit elements.
2029  // The high 8 bytes are "don't care" but if we don't
2030  // initialize the block then it generates warnings.
2031  std::memcpy(temp+LowOffset, inBlocks, 8);
2032  std::memcpy(temp+HighOffset, inBlocks, 8); // don't care
2033  block = (uint32x4_p)VecLoadBE(temp);
2034 
2035  if (xorInput)
2036  {
2037  std::memcpy(temp+LowOffset, xorBlocks, 8);
2038  std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2039  uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2040  block = VecXor(block, x);
2041  }
2042 
2043  // Update the counter in the caller.
2044  if (flags & BT_InBlockIsCounter)
2045  const_cast<byte *>(inBlocks)[7]++;
2046 
2047  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
2048 
2049  if (xorOutput)
2050  {
2051  std::memcpy(temp+LowOffset, xorBlocks, 8);
2052  std::memcpy(temp+HighOffset, xorBlocks, 8); // don't care
2053  uint32x4_p x = (uint32x4_p)VecLoadBE(temp);
2054  block = VecXor(block, x);
2055  }
2056 
2057  VecStoreBE(block, temp);
2058  std::memcpy(outBlocks, temp+LowOffset, 8);
2059 
2060  inBlocks = PtrAdd(inBlocks, inIncrement);
2061  outBlocks = PtrAdd(outBlocks, outIncrement);
2062  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2063  length -= blockSize;
2064  }
2065  }
2066 
2067  return length;
2068 }
2069 
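// Illustrative sketch (editor's example, not part of adv_simd.h): the Altivec
// template above is wired up the same way as the SSE templates, except the
// block arguments are uint32x4_p SIMD words, each carrying two 64-bit cipher
// blocks. All Example* names are hypothetical; a real cipher applies its
// round function using the subkeys instead of the fixed mask used here.
inline void ExampleEnc2_ALTIVEC(uint32x4_p &block0, uint32x4_p &block1,
    const word32 *subkeys, unsigned int rounds)
{
    CRYPTOPP_UNUSED(subkeys); CRYPTOPP_UNUSED(rounds);
    // Dummy round: XOR with a constant so the sketch stays self-contained.
    const uint32x4_p m = {0x5a5a5a5a, 0x5a5a5a5a, 0x5a5a5a5a, 0x5a5a5a5a};
    block0 = VecXor(block0, m);
    block1 = VecXor(block1, m);
}

inline void ExampleEnc6_ALTIVEC(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word32 *subkeys, unsigned int rounds)
{
    ExampleEnc2_ALTIVEC(block0, block1, subkeys, rounds);
    ExampleEnc2_ALTIVEC(block2, block3, subkeys, rounds);
    ExampleEnc2_ALTIVEC(block4, block5, subkeys, rounds);
}

inline size_t ExampleEnc_AdvancedProcessBlocks_ALTIVEC(const word32 *subKeys,
    size_t rounds, const byte *inBlocks, const byte *xorBlocks,
    byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_ALTIVEC(ExampleEnc2_ALTIVEC,
        ExampleEnc6_ALTIVEC, subKeys, rounds, inBlocks, xorBlocks,
        outBlocks, length, flags);
}
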
2070 /// \brief AdvancedProcessBlocks for 1 and 4 blocks
2071 /// \tparam F1 function to process 1 128-bit block
2072 /// \tparam F4 function to process 4 128-bit blocks
2073 /// \tparam W word type of the subkey table
2074 /// \details AdvancedProcessBlocks128_4x1_ALTIVEC processes 4 and 1 Altivec SIMD words
2075 /// at a time.
2076 /// \details The subkey type is usually word32 or word64. F1 and F4 must use the
2077 /// same word type.
2078 template <typename F1, typename F4, typename W>
2079 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4,
2080  const W *subKeys, size_t rounds, const byte *inBlocks,
2081  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2082 {
2083  CRYPTOPP_ASSERT(subKeys);
2084  CRYPTOPP_ASSERT(inBlocks);
2085  CRYPTOPP_ASSERT(outBlocks);
2086  CRYPTOPP_ASSERT(length >= 16);
2087 
2088 #if (CRYPTOPP_LITTLE_ENDIAN)
2089  const uint32x4_p s_one = {1,0,0,0};
2090 #else
2091  const uint32x4_p s_one = {0,0,0,1};
2092 #endif
2093 
2094  const size_t blockSize = 16;
2095  // const size_t vsxBlockSize = 16;
2096 
2097  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2098  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2099  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2100 
2101  // Clang and Coverity are generating findings using xorBlocks as a flag.
2102  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2103  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2104 
2105  if (flags & BT_ReverseDirection)
2106  {
2107  inBlocks = PtrAdd(inBlocks, length - blockSize);
2108  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2109  outBlocks = PtrAdd(outBlocks, length - blockSize);
2110  inIncrement = 0-inIncrement;
2111  xorIncrement = 0-xorIncrement;
2112  outIncrement = 0-outIncrement;
2113  }
2114 
2115  if (flags & BT_AllowParallel)
2116  {
2117  while (length >= 4*blockSize)
2118  {
2119  uint32x4_p block0, block1, block2, block3;
2120 
2121  if (flags & BT_InBlockIsCounter)
2122  {
2123  block0 = VecLoadBE(inBlocks);
2124  block1 = VecAdd(block0, s_one);
2125  block2 = VecAdd(block1, s_one);
2126  block3 = VecAdd(block2, s_one);
2127 
2128  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2129  // CTR_ModePolicy::OperateKeystream is wired such that after
2130  // returning from this function CTR_ModePolicy will detect wrap on
2131  // the last counter byte and increment the next to last byte.
2132  // The problem is, with a big-endian load, inBlocks[15] is really
2133  // located at index 15. The vector addition using a 32-bit element
2134  // generates a carry into inBlocks[14] and then CTR_ModePolicy
2135  // increments inBlocks[14] too.
2136  const_cast<byte*>(inBlocks)[15] += 6;
2137  }
2138  else
2139  {
2140  block0 = VecLoadBE(inBlocks);
2141  inBlocks = PtrAdd(inBlocks, inIncrement);
2142  block1 = VecLoadBE(inBlocks);
2143  inBlocks = PtrAdd(inBlocks, inIncrement);
2144  block2 = VecLoadBE(inBlocks);
2145  inBlocks = PtrAdd(inBlocks, inIncrement);
2146  block3 = VecLoadBE(inBlocks);
2147  inBlocks = PtrAdd(inBlocks, inIncrement);
2148  }
2149 
2150  if (xorInput)
2151  {
2152  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2153  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2154  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2155  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2156  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2157  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2158  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2159  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2160  }
2161 
2162  func4(block0, block1, block2, block3, subKeys, rounds);
2163 
2164  if (xorOutput)
2165  {
2166  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2167  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2168  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2169  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2170  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2171  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2172  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2173  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2174  }
2175 
2176  VecStoreBE(block0, outBlocks);
2177  outBlocks = PtrAdd(outBlocks, outIncrement);
2178  VecStoreBE(block1, outBlocks);
2179  outBlocks = PtrAdd(outBlocks, outIncrement);
2180  VecStoreBE(block2, outBlocks);
2181  outBlocks = PtrAdd(outBlocks, outIncrement);
2182  VecStoreBE(block3, outBlocks);
2183  outBlocks = PtrAdd(outBlocks, outIncrement);
2184 
2185  length -= 4*blockSize;
2186  }
2187  }
2188 
2189  while (length >= blockSize)
2190  {
2191  uint32x4_p block = VecLoadBE(inBlocks);
2192 
2193  if (xorInput)
2194  block = VecXor(block, VecLoadBE(xorBlocks));
2195 
2196  if (flags & BT_InBlockIsCounter)
2197  const_cast<byte *>(inBlocks)[15]++;
2198 
2199  func1(block, subKeys, rounds);
2200 
2201  if (xorOutput)
2202  block = VecXor(block, VecLoadBE(xorBlocks));
2203 
2204  VecStoreBE(block, outBlocks);
2205 
2206  inBlocks = PtrAdd(inBlocks, inIncrement);
2207  outBlocks = PtrAdd(outBlocks, outIncrement);
2208  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2209  length -= blockSize;
2210  }
2211 
2212  return length;
2213 }
2214 
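// Illustrative sketch (editor's example, not part of adv_simd.h): these
// templates are normally reached from a cipher's AdvancedProcessBlocks
// override, which selects a hardware path at runtime and leaves whatever the
// template returns (the number of unprocessed bytes) to the portable code.
// All Example* names are hypothetical, including the feature probe; Crypto++
// provides runtime CPU tests elsewhere, but this sketch does not assume a
// particular one.
bool ExampleHasAltivec();   // hypothetical runtime CPU feature probe

void ExampleEnc1_ALTIVEC(uint32x4_p &block,
    const word32 *subkeys, size_t rounds);                // hypothetical
void ExampleEnc4_ALTIVEC(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3,
    const word32 *subkeys, size_t rounds);                // hypothetical

inline size_t ExampleCipher_AdvancedProcessBlocks(const word32 *subKeys,
    size_t rounds, const byte *inBlocks, const byte *xorBlocks,
    byte *outBlocks, size_t length, word32 flags)
{
    if (ExampleHasAltivec())
        return AdvancedProcessBlocks128_4x1_ALTIVEC(ExampleEnc1_ALTIVEC,
            ExampleEnc4_ALTIVEC, subKeys, rounds, inBlocks, xorBlocks,
            outBlocks, length, flags);

    // A real cipher falls back to its portable block-by-block code here.
    return length;
}
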
2215 /// \brief AdvancedProcessBlocks for 1 and 6 blocks
2216 /// \tparam F1 function to process 1 128-bit block
2217 /// \tparam F6 function to process 6 128-bit blocks
2218 /// \tparam W word type of the subkey table
2219 /// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
2220 /// at a time.
2221 /// \details The subkey type is usually word32 or word64. F1 and F6 must use the
2222 /// same word type.
2223 template <typename F1, typename F6, typename W>
2224 CRYPTOPP_INLINE size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
2225  const W *subKeys, size_t rounds, const byte *inBlocks,
2226  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
2227 {
2228  CRYPTOPP_ASSERT(subKeys);
2229  CRYPTOPP_ASSERT(inBlocks);
2230  CRYPTOPP_ASSERT(outBlocks);
2231  CRYPTOPP_ASSERT(length >= 16);
2232 
2233 #if (CRYPTOPP_LITTLE_ENDIAN)
2234  const uint32x4_p s_one = {1,0,0,0};
2235 #else
2236  const uint32x4_p s_one = {0,0,0,1};
2237 #endif
2238 
2239  const size_t blockSize = 16;
2240  // const size_t vsxBlockSize = 16;
2241 
2242  size_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
2243  size_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
2244  size_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
2245 
2246  // Clang and Coverity are generating findings using xorBlocks as a flag.
2247  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
2248  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
2249 
2250  if (flags & BT_ReverseDirection)
2251  {
2252  inBlocks = PtrAdd(inBlocks, length - blockSize);
2253  xorBlocks = PtrAdd(xorBlocks, length - blockSize);
2254  outBlocks = PtrAdd(outBlocks, length - blockSize);
2255  inIncrement = 0-inIncrement;
2256  xorIncrement = 0-xorIncrement;
2257  outIncrement = 0-outIncrement;
2258  }
2259 
2260  if (flags & BT_AllowParallel)
2261  {
2262  while (length >= 6*blockSize)
2263  {
2264  uint32x4_p block0, block1, block2, block3, block4, block5;
2265 
2266  if (flags & BT_InBlockIsCounter)
2267  {
2268  block0 = VecLoadBE(inBlocks);
2269  block1 = VecAdd(block0, s_one);
2270  block2 = VecAdd(block1, s_one);
2271  block3 = VecAdd(block2, s_one);
2272  block4 = VecAdd(block3, s_one);
2273  block5 = VecAdd(block4, s_one);
2274 
2275  // Hack due to big-endian loads used by POWER8 (and maybe ARM-BE).
2276  // CTR_ModePolicy::OperateKeystream is wired such that after
2277  // returning from this function CTR_ModePolicy will detect wrap on
2278  // the last counter byte and increment the next to last byte.
2279  // The problem is, with a big-endian load, inBlocks[15] is really
2280  // located at index 15. The vector addition using a 32-bit element
2281  // generates a carry into inBlocks[14] and then CTR_ModePolicy
2282  // increments inBlocks[14] too.
2283  //
2284  // To find this bug we needed a test case with a ctr of 0xNN...FA.
2285  // The last octet is 0xFA and adding 6 creates the wrap to trigger
2286  // the issue. If the last octet was 0xFC then 4 would trigger it.
2287  // We dumb-lucked into the test with SPECK-128. The test case of
2288  // interest is the one with IV 348ECA9766C09F04 826520DE47A212FA.
2289  uint8x16_p temp = VecAdd((uint8x16_p)block5, (uint8x16_p)s_one);
2290  VecStoreBE(temp, const_cast<byte*>(inBlocks));
2291  }
2292  else
2293  {
2294  block0 = VecLoadBE(inBlocks);
2295  inBlocks = PtrAdd(inBlocks, inIncrement);
2296  block1 = VecLoadBE(inBlocks);
2297  inBlocks = PtrAdd(inBlocks, inIncrement);
2298  block2 = VecLoadBE(inBlocks);
2299  inBlocks = PtrAdd(inBlocks, inIncrement);
2300  block3 = VecLoadBE(inBlocks);
2301  inBlocks = PtrAdd(inBlocks, inIncrement);
2302  block4 = VecLoadBE(inBlocks);
2303  inBlocks = PtrAdd(inBlocks, inIncrement);
2304  block5 = VecLoadBE(inBlocks);
2305  inBlocks = PtrAdd(inBlocks, inIncrement);
2306  }
2307 
2308  if (xorInput)
2309  {
2310  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2311  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2312  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2313  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2314  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2315  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2316  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2317  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2318  block4 = VecXor(block4, VecLoadBE(xorBlocks));
2319  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2320  block5 = VecXor(block5, VecLoadBE(xorBlocks));
2321  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2322  }
2323 
2324  func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
2325 
2326  if (xorOutput)
2327  {
2328  block0 = VecXor(block0, VecLoadBE(xorBlocks));
2329  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2330  block1 = VecXor(block1, VecLoadBE(xorBlocks));
2331  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2332  block2 = VecXor(block2, VecLoadBE(xorBlocks));
2333  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2334  block3 = VecXor(block3, VecLoadBE(xorBlocks));
2335  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2336  block4 = VecXor(block4, VecLoadBE(xorBlocks));
2337  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2338  block5 = VecXor(block5, VecLoadBE(xorBlocks));
2339  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2340  }
2341 
2342  VecStoreBE(block0, outBlocks);
2343  outBlocks = PtrAdd(outBlocks, outIncrement);
2344  VecStoreBE(block1, outBlocks);
2345  outBlocks = PtrAdd(outBlocks, outIncrement);
2346  VecStoreBE(block2, outBlocks);
2347  outBlocks = PtrAdd(outBlocks, outIncrement);
2348  VecStoreBE(block3, outBlocks);
2349  outBlocks = PtrAdd(outBlocks, outIncrement);
2350  VecStoreBE(block4, outBlocks);
2351  outBlocks = PtrAdd(outBlocks, outIncrement);
2352  VecStoreBE(block5, outBlocks);
2353  outBlocks = PtrAdd(outBlocks, outIncrement);
2354 
2355  length -= 6*blockSize;
2356  }
2357  }
2358 
2359  while (length >= blockSize)
2360  {
2361  uint32x4_p block = VecLoadBE(inBlocks);
2362 
2363  if (xorInput)
2364  block = VecXor(block, VecLoadBE(xorBlocks));
2365 
2366  if (flags & BT_InBlockIsCounter)
2367  const_cast<byte *>(inBlocks)[15]++;
2368 
2369  func1(block, subKeys, rounds);
2370 
2371  if (xorOutput)
2372  block = VecXor(block, VecLoadBE(xorBlocks));
2373 
2374  VecStoreBE(block, outBlocks);
2375 
2376  inBlocks = PtrAdd(inBlocks, inIncrement);
2377  outBlocks = PtrAdd(outBlocks, outIncrement);
2378  xorBlocks = PtrAdd(xorBlocks, xorIncrement);
2379  length -= blockSize;
2380  }
2381 
2382  return length;
2383 }
2384 
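// Illustrative sketch (editor's example, not part of adv_simd.h): the
// byte-wise VecAdd used above to write back the next counter sidesteps the
// carry problem described in the comments. Adding s_one with 8-bit lanes
// touches only the lane that VecStoreBE writes to counter[15]; a wrap from
// 0xFF to 0x00 stays in that byte and is fixed up by CTR_ModePolicy, just
// like the scalar inBlocks[15]++ path. ExampleStoreNextCounter is a
// hypothetical name.
inline void ExampleStoreNextCounter(const uint32x4_p last, byte counter[16])
{
#if (CRYPTOPP_LITTLE_ENDIAN)
    const uint32x4_p s_one = {1,0,0,0};
#else
    const uint32x4_p s_one = {0,0,0,1};
#endif
    // 8-bit lane addition: no carry can spill into counter[14].
    const uint8x16_p next = VecAdd((uint8x16_p)last, (uint8x16_p)s_one);
    VecStoreBE(next, counter);
}
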
2385 NAMESPACE_END // CryptoPP
2386 
2387 #endif // __ALTIVEC__
2388 
2389 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES