Crypto++  7.0
Free C++ class library of cryptographic schemes
ppc_simd.h
1 // ppc_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file ppc_simd.h
4 /// \brief Support functions for PowerPC and vector operations
5 /// \details This header provides an agnostic interface into Clang, GCC
6 /// and IBM XL C/C++ compilers modulo their different built-in functions
7 /// for accessing vector instructions.
8 /// \details The abstractions are necessary to support back to GCC 4.8 and
9 /// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10 /// default compiler for GCC112, GCC118 and others on the compile farm.
11 /// Older IBM XL C/C++ compilers also need the abstractions due to lack of
12 /// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13 /// compilers provide the best support and don't need many of the hacks
14 /// below.
15 /// \details The library is tested with the following PowerPC machines and
16 /// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17 /// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18 /// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19 /// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20 /// - GCC110, Linux, POWER7, GCC 4.8.5
21 /// - GCC110, Linux, POWER7, XLC 12.01
22 /// - GCC111, AIX, POWER7, GCC 4.8.1
23 /// - GCC111, AIX, POWER7, XLC 12.01
24 /// - GCC112, Linux, POWER8, GCC 4.8.5
25 /// - GCC112, Linux, POWER8, XLC 13.01
26 /// - GCC112, Linux, POWER8, Clang 7.0
27 /// - GCC119, AIX, POWER8, GCC 7.2.0
28 /// - GCC119, AIX, POWER8, XLC 13.01
29 /// - GCC135, Linux, POWER9, GCC 7.0
30 /// \details 12 machines are used for testing because the three compilers form
31 /// five profiles. The profiles are listed below.
32 /// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33 /// - XLC 13.0 and earlier (all IBM components)
34 /// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35 /// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36 /// - LLVM Clang (traditional Clang compiler)
37 /// \details The LLVM front-end makes it tricky to write portable code because
38 /// LLVM pretends to be other compilers but cannot consume other compilers'
39 /// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
40 /// be GCC, Clang and XLC all at once but it can only consume its own variety
41 /// of builtins.
42 /// \details At Crypto++ 8.0 the various VectorFunc{Name} were renamed to
43 /// VecFunc{Name}. For example, VectorAnd was changed to VecAnd. The name
44 /// change helped consolidate two slightly different implementations.
45 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
46 
47 // Use __ALTIVEC__, _ARCH_PWR7 and _ARCH_PWR8 when detecting actual
48 // availability of the feature for the source file being compiled. The
49 // preprocessor macros depend on compiler options like -maltivec, and
50 // not on compiler versions.
51 
52 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the
53 // spaghetti code tangled in preprocessor macros because XLC 12 generates
54 // bad code in some places. To verify the bad code generation, test on
55 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
56 //
57 // inline uint32x4_p VecLoad(const byte src[16])
58 // {
59 // #if defined(_ARCH_PWR7)
60 // return (uint32x4_p) *(uint8x16_p*)((byte*)src);
61 // #else
62 // return VecLoad_ALTIVEC(src);
63 // #endif
64 // }
65 
66 #ifndef CRYPTOPP_PPC_CRYPTO_H
67 #define CRYPTOPP_PPC_CRYPTO_H
68 
69 #include "config.h"
70 #include "misc.h"
71 
72 #if defined(__ALTIVEC__)
73 # include <altivec.h>
74 # undef vector
75 # undef pixel
76 # undef bool
77 #endif
78 
79 // IBM XLC on AIX does not define __CRYPTO__ like it should with -qarch=pwr8.
80 // Crypto is available in XLC 13.1 and above. More LLVM front-end goodness.
81 #if defined(_AIX) && defined(_ARCH_PWR8) && (__xlC__ >= 0xd01)
82 # undef __CRYPTO__
83 # define __CRYPTO__ 1
84 #endif
85 
86 // Hack to detect early XLC compilers. XLC compilers for POWER7 use
87 // vec_xlw4 and vec_xstw4 (and the xld2/xstd2 variants); not vec_xl
88 // and vec_xst. Some XLC compilers for POWER7 and above use vec_xl
89 // and vec_xst. The way to tell the difference is, XLC compilers
90 // version 13.0 and earlier use vec_xlw4 and vec_xstw4. XLC compilers
91 // 13.1 and later use vec_xl and vec_xst. The open question is how to
92 // handle early Clang compilers for POWER7. We know the latest Clang
93 // compilers support vec_xl and vec_xst. Also see
94 // https://www-01.ibm.com/support/docview.wss?uid=swg21683541.
95 
96 #if defined(__xlc__) && (__xlc__ < 0x0d01)
97 # define __early_xlc__ 1
98 #endif
99 #if defined(__xlC__) && (__xlC__ < 0x0d01)
100 # define __early_xlC__ 1
101 #endif
102 
103 // VecLoad_ALTIVEC and VecStore_ALTIVEC are
104 // too noisy on modern compilers
105 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
106 # pragma GCC diagnostic push
107 # pragma GCC diagnostic ignored "-Wdeprecated"
108 #endif
109 
110 NAMESPACE_BEGIN(CryptoPP)
111 
112 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
113 
114 /// \brief Vector of 8-bit elements
115 /// \par Wraps
116 /// __vector unsigned char
117 /// \since Crypto++ 6.0
118 typedef __vector unsigned char uint8x16_p;
119 /// \brief Vector of 16-bit elements
120 /// \par Wraps
121 /// __vector unsigned short
122 /// \since Crypto++ 6.0
123 typedef __vector unsigned short uint16x8_p;
124 /// \brief Vector of 32-bit elements
125 /// \par Wraps
126 /// __vector unsigned int
127 /// \since Crypto++ 6.0
128 typedef __vector unsigned int uint32x4_p;
129 
130 #if defined(_ARCH_PWR7) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
131 /// \brief Vector of 64-bit elements
132 /// \details uint64x2_p is available on POWER7 and above. Some supporting
133 /// functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>), did not
134 /// arrive until POWER8.
135 /// \par Wraps
136 /// __vector unsigned long long
137 /// \since Crypto++ 6.0
138 typedef __vector unsigned long long uint64x2_p;
139 #endif // _ARCH_PWR7
140 
141 /// \brief The 0 vector
142 /// \returns a 32-bit vector of 0's
143 /// \since Crypto++ 8.0
144 inline uint32x4_p VecZero()
145 {
146  const uint32x4_p v = {0,0,0,0};
147  return v;
148 }
149 
150 /// \brief The 1 vector
151 /// \returns a 32-bit vector of 1's
152 /// \since Crypto++ 8.0
153 inline uint32x4_p VecOne()
154 {
155  const uint32x4_p v = {1,1,1,1};
156  return v;
157 }
158 
159 /// \brief Reverse bytes in a vector
160 /// \tparam T vector type
161 /// \param data the vector
162 /// \returns vector
163 /// \details VecReverse() reverses the bytes in a vector
164 /// \par Wraps
165 /// vec_perm
166 /// \since Crypto++ 6.0
167 template <class T>
168 inline T VecReverse(const T data)
169 {
170 #if (_ARCH_PWR9)
171  return (T)vec_revb((uint8x16_p)data);
172 #else
173  const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
174  return (T)vec_perm(data, data, mask);
175 #endif
176 }
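
// A minimal usage sketch: assuming an Altivec-enabled build, VecReverse()
// can be combined with VecLoad() and VecStore(), declared later in this
// header, to byte-swap a 16-byte block in memory.
//
//   byte block[16] = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
//   uint8x16_p v = (uint8x16_p)VecLoad(block);
//   v = VecReverse(v);       // v is now {15,14,13,...,1,0}
//   VecStore(v, block);      // block is now {15,14,13,...,1,0}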
177 
178 //////////////////////// Loads ////////////////////////
179 
180 /// \brief Loads a vector from a byte array
181 /// \param src the byte array
182 /// \details Loads a vector in native endian format from a byte array.
183 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
184 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
185 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
186 /// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive, so
187 /// you should provide aligned memory addresses.
188 /// \par Wraps
189 /// vec_ld, vec_lvsl, vec_perm
190 /// \since Crypto++ 6.0
191 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
192 {
193  // Avoid IsAlignedOn for convenience.
194  uintptr_t eff = reinterpret_cast<uintptr_t>(src)+0;
195  if (eff % 16 == 0)
196  {
197  return (uint32x4_p)vec_ld(0, src);
198  }
199  else
200  {
201  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
202  const uint8x16_p perm = vec_lvsl(0, src);
203  const uint8x16_p low = vec_ld(0, src);
204  const uint8x16_p high = vec_ld(15, src);
205  return (uint32x4_p)vec_perm(low, high, perm);
206  }
207 }
208 
209 /// \brief Loads a vector from a byte array
210 /// \param src the byte array
211 /// \param off offset into the src byte array
212 /// \details Loads a vector in native endian format from a byte array.
213 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
214 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
215 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
216 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
217 /// relatively expensive, so you should provide aligned memory addresses.
218 /// \par Wraps
219 /// vec_ld, vec_lvsl, vec_perm
220 /// \since Crypto++ 6.0
221 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
222 {
223  // Avoid IsAlignedOn for convenience.
224  uintptr_t eff = reinterpret_cast<uintptr_t>(src)+off;
225  if (eff % 16 == 0)
226  {
227  return (uint32x4_p)vec_ld(off, src);
228  }
229  else
230  {
231  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
232  const uint8x16_p perm = vec_lvsl(off, src);
233  const uint8x16_p low = vec_ld(off, src);
234  const uint8x16_p high = vec_ld(15, src);
235  return (uint32x4_p)vec_perm(low, high, perm);
236  }
237 }
238 
239 /// \brief Loads a vector from a byte array
240 /// \param src the byte array
241 /// \details VecLoad() loads a vector from a byte array.
242 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
243 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
244 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
245 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
246 /// extra instructions are required to fix up unaligned memory
247 /// addresses.
248 /// \par Wraps
249 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
250 /// \since Crypto++ 6.0
251 inline uint32x4_p VecLoad(const byte src[16])
252 {
253 #if defined(_ARCH_PWR7)
254 # if defined(__early_xlc__) || defined(__early_xlC__)
255  return (uint32x4_p)vec_xlw4(0, (byte*)src);
256 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
257  return (uint32x4_p)vec_xl(0, (byte*)src);
258 # else
259  return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
260 # endif
261 #else
262  return VecLoad_ALTIVEC(src);
263 #endif
264 }
265 
266 /// \brief Loads a vector from a byte array
267 /// \param src the byte array
268 /// \param off offset into the byte array
269 /// \details VecLoad() loads a vector from a byte array.
270 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
271 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
272 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
273 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
274 /// extra instructions are required to fix up unaligned memory
275 /// addresses.
276 /// \par Wraps
277 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
278 /// \since Crypto++ 6.0
279 inline uint32x4_p VecLoad(int off, const byte src[16])
280 {
281 #if defined(_ARCH_PWR7)
282 # if defined(__early_xlc__) || defined(__early_xlC__)
283  return (uint32x4_p)vec_xlw4(off, (byte*)src);
284 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
285  return (uint32x4_p)vec_xl(off, (byte*)src);
286 # else
287  return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
288 # endif
289 #else
290  return VecLoad_ALTIVEC(off, src);
291 #endif
292 }
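
// A minimal sketch of the offset overload: assuming an Altivec-enabled
// build, a larger buffer can be processed 16 bytes at a time by passing a
// byte offset rather than advancing the pointer. VecAdd(), VecOne() and
// VecStore() are all declared in this header.
//
//   byte buf[64];
//   for (int i = 0; i < 64; i += 16)
//   {
//       uint32x4_p v = VecLoad(i, buf);
//       v = VecAdd(v, VecOne());   // example transform: add 1 to each word
//       VecStore(v, i, buf);
//   }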
293 
294 /// \brief Loads a vector from a word array
295 /// \param src the word array
296 /// \details VecLoad() loads a vector from a word array.
297 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
298 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
299 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
300 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
301 /// extra instructions are required to fix up unaligned memory
302 /// addresses.
303 /// \par Wraps
304 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
305 /// \since Crypto++ 8.0
306 inline uint32x4_p VecLoad(const word32 src[4])
307 {
308  return VecLoad((const byte*)src);
309 }
310 
311 /// \brief Loads a vector from a word array
312 /// \param src the word array
313 /// \param off offset into the word array
314 /// \details VecLoad() loads a vector from a word array.
315 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
316 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
317 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
318 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
319 /// extra instructions are required to fix up unaligned memory
320 /// addresses.
321 /// \par Wraps
322 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
323 /// \since Crypto++ 8.0
324 inline uint32x4_p VecLoad(int off, const word32 src[4])
325 {
326  return VecLoad(off, (const byte*)src);
327 }
328 
329 #if defined(_ARCH_PWR7) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
330 
331 /// \brief Loads a vector from a word array
332 /// \param src the word array
333 /// \details VecLoad() loads a vector from a word array.
334 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
335 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
336 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
337 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
338 /// extra instructions are required to fix up unaligned memory
339 /// addresses.
340 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
341 /// \par Wraps
342 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
343 /// \since Crypto++ 8.0
344 inline uint64x2_p VecLoad(const word64 src[2])
345 {
346  return (uint64x2_p)VecLoad((const byte*)src);
347 }
348 
349 /// \brief Loads a vector from a word array
350 /// \param src the word array
351 /// \param off offset into the word array
352 /// \details VecLoad() loads a vector from a word array.
353 /// \details VecLoad() uses POWER7's <tt>vec_xl</tt> or
354 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
355 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
356 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
357 /// extra instructions are required to fix up unaligned memory
358 /// addresses.
359 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
360 /// \par Wraps
361 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
362 /// \since Crypto++ 8.0
363 inline uint64x2_p VecLoad(int off, const word64 src[2])
364 {
365  return (uint64x2_p)VecLoad(off, (const byte*)src);
366 }
367 
368 #endif // _ARCH_PWR7
369 
370 /// \brief Loads a vector from an aligned byte array
371 /// \param src the byte array
372 /// \details VecLoadAligned() loads a vector from an aligned byte array.
373 /// \details VecLoadAligned() uses POWER7's <tt>vec_xl</tt> or
374 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
375 /// aligned effective memory addresses. Altivec's <tt>vec_ld</tt> is used
376 /// if POWER7 is not available. The effective address of <tt>src</tt> must
377 /// be aligned.
378 /// \par Wraps
379 /// vec_ld, vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld
380 /// \since Crypto++ 8.0
381 inline uint32x4_p VecLoadAligned(const byte src[16])
382 {
383 #if defined(_ARCH_PWR7)
384 # if defined(__early_xlc__) || defined(__early_xlC__)
385  return (uint32x4_p)vec_xlw4(0, (byte*)src);
386 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
387  return (uint32x4_p)vec_xl(0, (byte*)src);
388 # else
389  return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
390 # endif
391 #else // _ARCH_PWR7
392  CRYPTOPP_ASSERT(((uintptr_t)src) % 16 == 0);
393  return (uint32x4_p)vec_ld(0, (byte*)src);
394 #endif // _ARCH_PWR7
395 }
396 
397 /// \brief Loads a vector from an aligned byte array
398 /// \param src the byte array
399 /// \param off offset into the byte array
400 /// \details VecLoadAligned() loads a vector from an aligned byte array.
401 /// \details VecLoadAligned() uses POWER7's <tt>vec_xl</tt> or
402 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
403 /// aligned effective memory addresses. Altivec's <tt>vec_ld</tt> is used
404 /// if POWER7 is not available. The effective address of <tt>src</tt> must
405 /// be aligned.
406 /// \par Wraps
407 /// vec_ld, vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld
408 /// \since Crypto++ 8.0
409 inline uint32x4_p VecLoadAligned(int off, const byte src[16])
410 {
411 #if defined(_ARCH_PWR7)
412 # if defined(__early_xlc__) || defined(__early_xlC__)
413  return (uint32x4_p)vec_xlw4(off, (byte*)src);
414 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
415  return (uint32x4_p)vec_xl(off, (byte*)src);
416 # else
417  return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
418 # endif
419 #else // _ARCH_PWR7
420  CRYPTOPP_ASSERT((((uintptr_t)src)+off) % 16 == 0);
421  return (uint32x4_p)vec_ld(off, (byte*)src);
422 #endif // _ARCH_PWR7
423 }
424 
425 /// \brief Loads a vector from a byte array
426 /// \param src the byte array
427 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE()
428 /// will reverse all bytes in the array on a little endian system.
429 /// \details VecLoadBE() uses POWER7's <tt>vec_xl</tt> or
430 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
431 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
432 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
433 /// extra instructions are required to fix up unaligned memory
434 /// addresses.
435 /// \par Wraps
436 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
437 /// \since Crypto++ 6.0
438 inline uint32x4_p VecLoadBE(const byte src[16])
439 {
440 #if defined(_ARCH_PWR7)
441 # if defined(__early_xlc__) || defined(__early_xlC__)
442 # if (CRYPTOPP_BIG_ENDIAN)
443  return (uint32x4_p)vec_xlw4(0, (byte*)src);
444 # else
445  return (uint32x4_p)VecReverse(vec_xlw4(0, (byte*)src));
446 # endif
447 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
448  return (uint32x4_p)vec_xl_be(0, (byte*)src);
449 # else
450 # if (CRYPTOPP_BIG_ENDIAN)
451  return (uint32x4_p)vec_vsx_ld(0, (byte*)src);
452 # else
453  return (uint32x4_p)VecReverse(vec_vsx_ld(0, (byte*)src));
454 # endif
455 # endif
456 #else // _ARCH_PWR7
457 # if (CRYPTOPP_BIG_ENDIAN)
458  return (uint32x4_p)VecLoad((const byte*)src);
459 # else
460  return (uint32x4_p)VecReverse(VecLoad((const byte*)src));
461 # endif
462 #endif // _ARCH_PWR7
463 }
464 
465 /// \brief Loads a vector from a byte array
466 /// \param src the byte array
467 /// \param off offset into the src byte array
468 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE()
469 /// will reverse all bytes in the array on a little endian system.
470 /// \details VecLoadBE() uses POWER7's <tt>vec_xl</tt> or
471 /// <tt>vec_vsx_ld</tt> if available. The instructions do not require
472 /// aligned effective memory addresses. VecLoad_ALTIVEC() is used if POWER7
473 /// is not available. VecLoad_ALTIVEC() can be relatively expensive if
474 /// extra instructions are required to fix up unaligned memory
475 /// addresses.
476 /// \par Wraps
477 /// vec_xlw4, vec_xld2, vec_xl, vec_vsx_ld (and Altivec load)
478 /// \since Crypto++ 6.0
479 inline uint32x4_p VecLoadBE(int off, const byte src[16])
480 {
481 #if defined(_ARCH_PWR7)
482 # if defined(__early_xlc__) || defined(__early_xlC__)
483 # if (CRYPTOPP_BIG_ENDIAN)
484  return (uint32x4_p)vec_xlw4(off, (byte*)src);
485 # else
486  return (uint32x4_p)VecReverse(vec_xlw4(off, (byte*)src));
487 # endif
488 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
489  return (uint32x4_p)vec_xl_be(off, (byte*)src);
490 # else
491 # if (CRYPTOPP_BIG_ENDIAN)
492  return (uint32x4_p)vec_vsx_ld(off, (byte*)src);
493 # else
494  return (uint32x4_p)VecReverse(vec_vsx_ld(off, (byte*)src));
495 # endif
496 # endif
497 #else // _ARCH_PWR7
498 # if (CRYPTOPP_BIG_ENDIAN)
499  return (uint32x4_p)VecLoad(off, (const byte*)src);
500 # else
501  return (uint32x4_p)VecReverse(VecLoad(off, (const byte*)src));
502 # endif
503 #endif // _ARCH_PWR7
504 }
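
// A minimal sketch: assuming an Altivec-enabled build, VecLoadBE() yields
// the same 32-bit words on big and little endian systems, which is handy
// for big-endian message schedules such as SHA-256.
//
//   const byte msg[16] = {0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07,
//                         0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f};
//   uint32x4_p w = VecLoadBE(msg);
//   // w = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f} on either endian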
505 
506 //////////////////////// Stores ////////////////////////
507 
508 /// \brief Stores a vector to a byte array
509 /// \tparam T vector type
510 /// \param data the vector
511 /// \param dest the byte array
512 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
513 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
514 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
515 /// <tt>vec_ste</tt> is relatively expensive, so you should provide aligned
516 /// memory addresses.
517 /// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
518 /// and its unaligned loads and stores are not available.
519 /// \par Wraps
520 /// vec_st, vec_ste, vec_lvsr, vec_perm
521 /// \since Crypto++ 8.0
522 template<class T>
523 inline void VecStore_ALTIVEC(const T data, byte dest[16])
524 {
525  // Avoid IsAlignedOn for convenience.
526  uintptr_t eff = reinterpret_cast<uintptr_t>(dest)+0;
527  if (eff % 16 == 0)
528  {
529  vec_st((uint8x16_p)data, 0, dest);
530  }
531  else
532  {
533  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
534  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, dest));
535  vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
536  vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
537  vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
538  vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
539  vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
540  vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
541  vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
542  vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
543  }
544 }
545 
546 /// \brief Stores a vector to a byte array
547 /// \tparam T vector type
548 /// \param data the vector
549 /// \param off the byte offset into the array
550 /// \param dest the byte array
551 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
552 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
553 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
554 /// <tt>vec_ste</tt> is relatively expensive, so you should provide aligned
555 /// memory addresses.
556 /// \details VecStore_ALTIVEC() is used automatically when POWER7 or above
557 /// and its unaligned loads and stores are not available.
558 /// \par Wraps
559 /// vec_st, vec_ste, vec_lvsr, vec_perm
560 /// \since Crypto++ 8.0
561 template<class T>
562 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
563 {
564  // Avoid IsAlignedOn for convenience.
565  uintptr_t eff = reinterpret_cast<uintptr_t>(dest)+off;
566  if (eff % 16 == 0)
567  {
568  vec_st((uint8x16_p)data, off, dest);
569  }
570  else
571  {
572  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
573  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(off, dest));
574  vec_ste((uint8x16_p) perm, 0, (unsigned char*) dest);
575  vec_ste((uint16x8_p) perm, 1, (unsigned short*)dest);
576  vec_ste((uint32x4_p) perm, 3, (unsigned int*) dest);
577  vec_ste((uint32x4_p) perm, 4, (unsigned int*) dest);
578  vec_ste((uint32x4_p) perm, 8, (unsigned int*) dest);
579  vec_ste((uint32x4_p) perm, 12, (unsigned int*) dest);
580  vec_ste((uint16x8_p) perm, 14, (unsigned short*)dest);
581  vec_ste((uint8x16_p) perm, 15, (unsigned char*) dest);
582  }
583 }
584 
585 /// \brief Stores a vector to a byte array
586 /// \tparam T vector type
587 /// \param data the vector
588 /// \param dest the byte array
589 /// \details VecStore() stores a vector to a byte array.
590 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
591 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
592 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
593 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
594 /// extra instructions are required to fix up unaligned memory
595 /// addresses.
596 /// \par Wraps
597 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
598 /// \since Crypto++ 6.0
599 template<class T>
600 inline void VecStore(const T data, byte dest[16])
601 {
602 #if defined(_ARCH_PWR7)
603 # if defined(__early_xlc__) || defined(__early_xlC__)
604  vec_xstw4((uint8x16_p)data, 0, (byte*)dest);
605 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
606  vec_xst((uint8x16_p)data, 0, (byte*)dest);
607 # else
608  vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
609 # endif
610 #else
611  VecStore_ALTIVEC((uint8x16_p)data, 0, (byte*)dest);
612 #endif
613 }
614 
615 /// \brief Stores a vector to a byte array
616 /// \tparam T vector type
617 /// \param data the vector
618 /// \param off the byte offset into the array
619 /// \param dest the byte array
620 /// \details VecStore() stores a vector to a byte array.
621 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
622 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
623 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
624 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
625 /// extra instructions are required to fix up unaligned memory
626 /// addresses.
627 /// \par Wraps
628 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
629 /// \since Crypto++ 6.0
630 template<class T>
631 inline void VecStore(const T data, int off, byte dest[16])
632 {
633 #if defined(_ARCH_PWR7)
634 # if defined(__early_xlc__) || defined(__early_xlC__)
635  vec_xstw4((uint8x16_p)data, off, (byte*)dest);
636 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
637  vec_xst((uint8x16_p)data, off, (byte*)dest);
638 # else
639  vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
640 # endif
641 #else
642  VecStore_ALTIVEC((uint8x16_p)data, off, (byte*)dest);
643 #endif
644 }
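
// A minimal sketch: assuming an Altivec-enabled build, two vectors can be
// written to adjacent 16-byte regions of an output buffer using byte
// offsets.
//
//   const uint32x4_p a = {1,2,3,4}, b = {5,6,7,8};
//   byte out[32];
//   VecStore(a,  0, out);   // bytes 0..15
//   VecStore(b, 16, out);   // bytes 16..31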
645 
646 /// \brief Stores a vector to a word array
647 /// \tparam T vector type
648 /// \param data the vector
649 /// \param dest the word array
650 /// \details VecStore() stores a vector to a word array.
651 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
652 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
653 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
654 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
655 /// extra instructions are required to fix up unaligned memory
656 /// addresses.
657 /// \par Wraps
658 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
659 /// \since Crypto++ 8.0
660 template<class T>
661 inline void VecStore(const T data, word32 dest[4])
662 {
663  VecStore((uint8x16_p)data, 0, (byte*)dest);
664 }
665 
666 /// \brief Stores a vector to a word array
667 /// \tparam T vector type
668 /// \param data the vector
669 /// \param off the byte offset into the array
670 /// \param dest the word array
671 /// \details VecStore() stores a vector to a word array.
672 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
673 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
674 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
675 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
676 /// extra instructions are required to fix up unaligned memory
677 /// addresses.
678 /// \par Wraps
679 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
680 /// \since Crypto++ 8.0
681 template<class T>
682 inline void VecStore(const T data, int off, word32 dest[4])
683 {
684  VecStore((uint8x16_p)data, off, (byte*)dest);
685 }
686 
687 /// \brief Stores a vector to a word array
688 /// \tparam T vector type
689 /// \param data the vector
690 /// \param dest the word array
691 /// \details VecStore() stores a vector to a word array.
692 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
693 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
694 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
695 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
696 /// extra instructions are required to fix up unaligned memory
697 /// addresses.
698 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
699 /// \par Wraps
700 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
701 /// \since Crypto++ 8.0
702 template<class T>
703 inline void VecStore(const T data, word64 dest[2])
704 {
705  VecStore((uint8x16_p)data, 0, (byte*)dest);
706 }
707 
708 /// \brief Stores a vector to a word array
709 /// \tparam T vector type
710 /// \param data the vector
711 /// \param off the byte offset into the array
712 /// \param dest the word array
713 /// \details VecStore() stores a vector to a word array.
714 /// \details VecStore() uses POWER7's <tt>vec_xst</tt> or
715 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
716 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
717 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
718 /// extra instructions are required to fix up unaligned memory
719 /// addresses.
720 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
721 /// \par Wraps
722 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
723 /// \since Crypto++ 8.0
724 template<class T>
725 inline void VecStore(const T data, int off, word64 dest[2])
726 {
727  VecStore((uint8x16_p)data, off, (byte*)dest);
728 }
729 
730 /// \brief Stores a vector to a byte array
731 /// \tparam T vector type
732 /// \param data the vector
733 /// \param dest the byte array
734 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
735 /// will reverse all bytes in the array on a little endian system.
736 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
737 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
738 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
739 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
740 /// extra instructions are required to fix up unaligned memory
741 /// addresses.
742 /// \par Wraps
743 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
744 /// \since Crypto++ 6.0
745 template <class T>
746 inline void VecStoreBE(const T data, byte dest[16])
747 {
748 #if defined(_ARCH_PWR7)
749 # if defined(__early_xlc__) || defined(__early_xlC__)
750 # if (CRYPTOPP_BIG_ENDIAN)
751  vec_xstw4((uint8x16_p)data, 0, (byte*)dest);
752 # else
753  vec_xstw4((uint8x16_p)VecReverse(data), 0, (byte*)dest);
754 # endif
755 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
756  vec_xst_be((uint8x16_p)data, 0, (byte*)dest);
757 # else
758 # if (CRYPTOPP_BIG_ENDIAN)
759  vec_vsx_st((uint8x16_p)data, 0, (byte*)dest);
760 # else
761  vec_vsx_st((uint8x16_p)VecReverse(data), 0, (byte*)dest);
762 # endif
763 # endif
764 #else // _ARCH_PWR7
765 # if (CRYPTOPP_BIG_ENDIAN)
766  VecStore_ALTIVEC((uint8x16_p)data, 0, (byte*)dest);
767 # else
768  VecStore_ALTIVEC((uint8x16_p)VecReverse(data), 0, (byte*)dest);
769 # endif
770 #endif // _ARCH_PWR7
771 }
772 
773 /// \brief Stores a vector to a byte array
774 /// \tparam T vector type
775 /// \param data the vector
776 /// \param off offset into the dest byte array
777 /// \param dest the byte array
778 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
779 /// will reverse all bytes in the array on a little endian system.
780 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
781 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
782 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
783 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
784 /// extra instructions are required to fix up unaligned memory
785 /// addresses.
786 /// \par Wraps
787 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
788 /// \since Crypto++ 6.0
789 template <class T>
790 inline void VecStoreBE(const T data, int off, byte dest[16])
791 {
792 #if defined(_ARCH_PWR7)
793 # if defined(__early_xlc__) || defined(__early_xlC__)
794 # if (CRYPTOPP_BIG_ENDIAN)
795  vec_xstw4((uint8x16_p)data, off, (byte*)dest);
796 # else
797  vec_xstw4((uint8x16_p)VecReverse(data), off, (byte*)dest);
798 # endif
799 # elif defined(__xlc__) || defined(__xlC__) || defined(__clang__)
800  vec_xst_be((uint8x16_p)data, off, (byte*)dest);
801 # else
802 # if (CRYPTOPP_BIG_ENDIAN)
803  vec_vsx_st((uint8x16_p)data, off, (byte*)dest);
804 # else
805  vec_vsx_st((uint8x16_p)VecReverse(data), off, (byte*)dest);
806 # endif
807 # endif
808 #else // _ARCH_PWR7
809 # if (CRYPTOPP_BIG_ENDIAN)
810  VecStore_ALTIVEC((uint8x16_p)data, off, (byte*)dest);
811 # else
812  VecStore_ALTIVEC((uint8x16_p)VecReverse(data), off, (byte*)dest);
813 # endif
814 #endif // _ARCH_PWR7
815 }
816 
817 /// \brief Stores a vector to a word array
818 /// \tparam T vector type
819 /// \param data the vector
820 /// \param dest the word array
821 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
822 /// will reverse all bytes in the array on a little endian system.
823 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
824 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
825 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
826 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
827 /// extra instructions are required to fix up unaligned memory
828 /// addresses.
829 /// \par Wraps
830 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
831 /// \since Crypto++ 8.0
832 template <class T>
833 inline void VecStoreBE(const T data, word32 dest[4])
834 {
835  return VecStoreBE((uint8x16_p)data, (byte*)dest);
836 }
837 
838 /// \brief Stores a vector to a word array
839 /// \tparam T vector type
840 /// \param data the vector
841 /// \param off offset into the dest word array
842 /// \param dest the word array
843 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
844 /// will reverse all bytes in the array on a little endian system.
845 /// \details VecStoreBE() uses POWER7's <tt>vec_xst</tt> or
846 /// <tt>vec_vsx_st</tt> if available. The instructions do not require
847 /// aligned effective memory addresses. VecStore_ALTIVEC() is used if POWER7
848 /// is not available. VecStore_ALTIVEC() can be relatively expensive if
849 /// extra instructions are required to fix up unaligned memory
850 /// addresses.
851 /// \par Wraps
852 /// vec_xstw4, vec_xstld2, vec_xst, vec_vsx_st (and Altivec store)
853 /// \since Crypto++ 8.0
854 template <class T>
855 inline void VecStoreBE(const T data, int off, word32 dest[4])
856 {
857  return VecStoreBE((uint8x16_p)data, off, (byte*)dest);
858 }
859 
860 //////////////////////// Miscellaneous ////////////////////////
861 
862 /// \brief Permutes a vector
863 /// \tparam T1 vector type
864 /// \tparam T2 vector type
865 /// \param vec the vector
866 /// \param mask vector mask
867 /// \returns vector
868 /// \details VecPermute() returns a new vector from vec based on
869 /// mask. mask is an uint8x16_p type vector. The return
870 /// vector is the same type as vec.
871 /// \par Wraps
872 /// vec_perm
873 /// \since Crypto++ 6.0
874 template <class T1, class T2>
875 inline T1 VecPermute(const T1 vec, const T2 mask)
876 {
877  return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
878 }
879 
880 /// \brief Permutes two vectors
881 /// \tparam T1 vector type
882 /// \tparam T2 vector type
883 /// \param vec1 the first vector
884 /// \param vec2 the second vector
885 /// \param mask vector mask
886 /// \returns vector
887 /// \details VecPermute() returns a new vector from vec1 and vec2
888 /// based on mask. mask is an uint8x16_p type vector. The return
889 /// vector is the same type as vec1.
890 /// \par Wraps
891 /// vec_perm
892 /// \since Crypto++ 6.0
893 template <class T1, class T2>
894 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
895 {
896  return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
897 }
898 
899 /// \brief AND two vectors
900 /// \tparam T1 vector type
901 /// \tparam T2 vector type
902 /// \param vec1 the first vector
903 /// \param vec2 the second vector
904 /// \returns vector
905 /// \details VecAnd() returns a new vector from vec1 and vec2. The return
906 /// vector is the same type as vec1.
907 /// \par Wraps
908 /// vec_and
909 /// \since Crypto++ 6.0
910 template <class T1, class T2>
911 inline T1 VecAnd(const T1 vec1, const T2 vec2)
912 {
913  return (T1)vec_and(vec1, (T1)vec2);
914 }
915 
916 /// \brief OR two vectors
917 /// \tparam T1 vector type
918 /// \tparam T2 vector type
919 /// \param vec1 the first vector
920 /// \param vec2 the second vector
921 /// \returns vector
922 /// \details VecOr() returns a new vector from vec1 and vec2. The return
923 /// vector is the same type as vec1.
924 /// \par Wraps
925 /// vec_or
926 /// \since Crypto++ 6.0
927 template <class T1, class T2>
928 inline T1 VecOr(const T1 vec1, const T2 vec2)
929 {
930  return (T1)vec_or(vec1, (T1)vec2);
931 }
932 
933 /// \brief XOR two vectors
934 /// \tparam T1 vector type
935 /// \tparam T2 vector type
936 /// \param vec1 the first vector
937 /// \param vec2 the second vector
938 /// \returns vector
939 /// \details VecXor() returns a new vector from vec1 and vec2. The return
940 /// vector is the same type as vec1.
941 /// \par Wraps
942 /// vec_xor
943 /// \since Crypto++ 6.0
944 template <class T1, class T2>
945 inline T1 VecXor(const T1 vec1, const T2 vec2)
946 {
947  return (T1)vec_xor(vec1, (T1)vec2);
948 }
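
// A minimal sketch: assuming an Altivec-enabled build, VecXor() can apply
// a 16-byte keystream block to a message block, as a stream or CTR mode
// cipher would.
//
//   byte msg[16], keystream[16];
//   // ... fill msg and keystream ...
//   const uint32x4_p m = VecLoad(msg);
//   const uint32x4_p k = VecLoad(keystream);
//   VecStore(VecXor(m, k), msg);   // msg ^= keystream, 16 bytes at a time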
949 
950 /// \brief Add two vectors
951 /// \tparam T1 vector type
952 /// \tparam T2 vector type
953 /// \param vec1 the first vector
954 /// \param vec2 the second vector
955 /// \returns vector
956 /// \details VecAdd() returns a new vector from vec1 and vec2.
957 /// vec2 is cast to the same type as vec1. The return vector
958 /// is the same type as vec1.
959 /// \par Wraps
960 /// vec_add
961 /// \since Crypto++ 6.0
962 template <class T1, class T2>
963 inline T1 VecAdd(const T1 vec1, const T2 vec2)
964 {
965  return (T1)vec_add(vec1, (T1)vec2);
966 }
967 
968 /// \brief Subtract two vectors
969 /// \tparam T1 vector type
970 /// \tparam T2 vector type
971 /// \param vec1 the first vector
972 /// \param vec2 the second vector
973 /// \details VecSub() returns a new vector from vec1 and vec2.
974 /// vec2 is cast to the same type as vec1. The return vector
975 /// is the same type as vec1.
976 /// \par Wraps
977 /// vec_sub
978 /// \since Crypto++ 6.0
979 template <class T1, class T2>
980 inline T1 VecSub(const T1 vec1, const T2 vec2)
981 {
982  return (T1)vec_sub(vec1, (T1)vec2);
983 }
984 
985 /// \brief Add two vectors
986 /// \tparam T1 vector type
987 /// \tparam T2 vector type
988 /// \param vec1 the first vector
989 /// \param vec2 the second vector
990 /// \returns vector
991 /// \details VecAdd64() returns a new vector from vec1 and vec2.
992 /// vec1 and vec2 are added as if uint64x2_p vectors. On POWER7
993 /// and below VecAdd64() manages the carries from two elements in
994 /// a uint32x4_p vector.
995 /// \par Wraps
996 /// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
997 /// \since Crypto++ 8.0
998 inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
999 {
1000  // 64-bit elements available at POWER7, but vaddudm requires POWER8
1001 #if defined(_ARCH_PWR8)
1002  return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
1003 #else
1004  // The carry mask selects carries from elements 1 and 3 and sets remaining
1005  // elements to 0. The mask also shifts the carried values left by 4 bytes
1006  // so the carries are added to elements 0 and 2.
1007  const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
1008  const uint32x4_p zero = {0, 0, 0, 0};
1009 
1010  uint32x4_p cy = vec_addc(vec1, vec2);
1011  cy = vec_perm(cy, zero, cmask);
1012  return vec_add(vec_add(vec1, vec2), cy);
1013 #endif
1014 }
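
// A worked sketch of the carry handling: on a big endian machine each
// 64-bit lane is held as a pair of 32-bit elements with the most
// significant word first, so adding 1 to 0x00000000FFFFFFFF must carry
// into the upper element of the lane.
//
//   const uint32x4_p x = {0x00000000, 0xFFFFFFFF, 0, 0};
//   const uint32x4_p y = {0x00000000, 0x00000001, 0, 0};
//   const uint32x4_p z = VecAdd64(x, y);
//   // On big endian z = {0x00000001, 0x00000000, 0, 0}, i.e. lane 0 holds
//   // 0x0000000100000000. POWER8 performs the same addition with vaddudm.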
1015 
1016 /// \brief Shift a vector left
1017 /// \tparam C shift byte count
1018 /// \tparam T vector type
1019 /// \param vec the vector
1020 /// \returns vector
1021 /// \details VecShiftLeftOctet() returns a new vector after shifting the
1022 /// concatenation of the zero vector and the source vector by the specified
1023 /// number of bytes. The return vector is the same type as vec.
1024 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1025 /// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1026 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1027 /// if on a big endian machine as shown below.
1028 /// <pre>
1029 /// uint8x16_p x = VecLoad(ptr);
1030 /// uint8x16_p y = VecShiftLeftOctet<12>(x);
1031 /// </pre>
1032 /// \par Wraps
1033 /// vec_sld
1034 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1035 /// endian sensitive?</A> on Stack Overflow
1036 /// \since Crypto++ 6.0
1037 template <unsigned int C, class T>
1038 inline T VecShiftLeftOctet(const T vec)
1039 {
1040  const T zero = {0};
1041  if (C >= 16)
1042  {
1043  // Out of range
1044  return zero;
1045  }
1046  else if (C == 0)
1047  {
1048  // Noop
1049  return vec;
1050  }
1051  else
1052  {
1053 #if (CRYPTOPP_BIG_ENDIAN)
1054  enum { R=C&0xf };
1055  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1056 #else
1057  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1058  return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1059 #endif
1060  }
1061 }
1062 
1063 /// \brief Shift a vector right
1064 /// \tparam C shift byte count
1065 /// \tparam T vector type
1066 /// \param vec the vector
1067 /// \returns vector
1068 /// \details VecShiftRightOctet() returns a new vector after shifting the
1069 /// concatenation of the zero vector and the source vector by the specified
1070 /// number of bytes. The return vector is the same type as vec.
1071 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(z, a,
1072 /// 16-c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1073 /// <tt>vec_sld(a, z, c)</tt>. You should always call the function as
1074 /// if on a big endian machine as shown below.
1075 /// <pre>
1076 /// uint8x16_p x = VecLoad(ptr);
1077 /// uint8x16_p y = VecShiftRightOctet<12>(x);
1078 /// </pre>
1079 /// \par Wraps
1080 /// vec_sld
1081 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1082 /// endian sensitive?</A> on Stack Overflow
1083 /// \since Crypto++ 6.0
1084 template <unsigned int C, class T>
1085 inline T VecShiftRightOctet(const T vec)
1086 {
1087  const T zero = {0};
1088  if (C >= 16)
1089  {
1090  // Out of range
1091  return zero;
1092  }
1093  else if (C == 0)
1094  {
1095  // Noop
1096  return vec;
1097  }
1098  else
1099  {
1100 #if (CRYPTOPP_BIG_ENDIAN)
1101  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1102  return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1103 #else
1104  enum { R=C&0xf };
1105  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1106 #endif
1107  }
1108 }
1109 
1110 /// \brief Rotate a vector left
1111 /// \tparam C shift byte count
1112 /// \tparam T vector type
1113 /// \param vec the vector
1114 /// \returns vector
1115 /// \details VecRotateLeftOctet() returns a new vector after rotating the
1116 /// concatenation of the source vector with itself by the specified
1117 /// number of bytes. The return vector is the same type as vec.
1118 /// \par Wraps
1119 /// vec_sld
1120 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1121 /// endian sensitive?</A> on Stack Overflow
1122 /// \since Crypto++ 6.0
1123 template <unsigned int C, class T>
1124 inline T VecRotateLeftOctet(const T vec)
1125 {
1126 #if (CRYPTOPP_BIG_ENDIAN)
1127  enum { R = C&0xf };
1128  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1129 #else
1130  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1131  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1132 #endif
1133 }
1134 
1135 /// \brief Rotate a vector right
1136 /// \tparam C shift byte count
1137 /// \tparam T vector type
1138 /// \param vec the vector
1139 /// \returns vector
1140 /// \details VecRotateRightOctet() returns a new vector after rotating the
1141 /// concatenation of the source vector with itself by the specified
1142 /// number of bytes. The return vector is the same type as vec.
1143 /// \par Wraps
1144 /// vec_sld
1145 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1146 /// endian sensitive?</A> on Stack Overflow
1147 /// \since Crypto++ 6.0
1148 template <unsigned int C, class T>
1149 inline T VecRotateRightOctet(const T vec)
1150 {
1151 #if (CRYPTOPP_BIG_ENDIAN)
1152  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1153  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1154 #else
1155  enum { R = C&0xf };
1156  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1157 #endif
1158 }
1159 
1160 /// \brief Rotate a packed vector left
1161 /// \tparam C shift bit count
1162 /// \param vec the vector
1163 /// \returns vector
1164 /// \details VecRotateLeft() rotates each element in a packed vector by bit count.
1165 /// \par Wraps
1166 /// vec_rl
1167 /// \since Crypto++ 7.0
1168 template<unsigned int C>
1169 inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
1170 {
1171  const uint32x4_p m = {C, C, C, C};
1172  return vec_rl(vec, m);
1173 }
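
// A minimal sketch: assuming an Altivec-enabled build, each 32-bit element
// is rotated independently, as in a ChaCha-style quarter round.
//
//   const uint32x4_p v = {0x80000000, 1, 2, 3};
//   const uint32x4_p r = VecRotateLeft<7>(v);
//   // r = {0x00000040, 0x00000080, 0x00000100, 0x00000180}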
1174 
1175 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1176 
1177 /// \brief Rotate a packed vector left
1178 /// \tparam C shift bit count
1179 /// \param vec the vector
1180 /// \returns vector
1181 /// \details VecRotateLeft() rotates each element in a packed vector by bit count.
1182 /// \details VecRotateLeft() with 64-bit elements is available on POWER8 and above.
1183 /// \par Wraps
1184 /// vec_rl
1185 /// \since Crypto++ 8.0
1186 template<unsigned int C>
1187 inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
1188 {
1189  const uint64x2_p m = {C, C};
1190  return vec_rl(vec, m);
1191 }
1192 
1193 #endif
1194 
1195 /// \brief Rotate a packed vector right
1196 /// \tparam C shift bit count
1197 /// \param vec the vector
1198 /// \returns vector
1199 /// \details VecRotateRight() rotates each element in a packed vector by bit count.
1200 /// \par Wraps
1201 /// vec_rl
1202 /// \since Crypto++ 7.0
1203 template<unsigned int C>
1204 inline uint32x4_p VecRotateRight(const uint32x4_p vec)
1205 {
1206  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1207  return vec_rl(vec, m);
1208 }
1209 
1210 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1211 
1212 /// \brief Rotate a packed vector right
1213 /// \tparam C shift bit count
1214 /// \param vec the vector
1215 /// \returns vector
1216 /// \details VecRotateRight() rotates each element in a packed vector by bit count.
1217 /// \details VecRotateRight() with 64-bit elements is available on POWER8 and above.
1218 /// \par Wraps
1219 /// vec_rl
1220 /// \since Crypto++ 8.0
1221 template<unsigned int C>
1222 inline uint64x2_p VecRotateRight(const uint64x2_p vec)
1223 {
1224  const uint64x2_p m = {64-C, 64-C};
1225  return vec_rl(vec, m);
1226 }
1227 
1228 #endif
1229 
1230 /// \brief Exchange high and low double words
1231 /// \tparam T vector type
1232 /// \param vec the vector
1233 /// \returns vector
1234 /// \par Wraps
1235 /// vec_sld
1236 /// \since Crypto++ 7.0
1237 template <class T>
1238 inline T VecSwapWords(const T vec)
1239 {
1240  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1241 }
1242 
1243 /// \brief Extract a dword from a vector
1244 /// \tparam T vector type
1245 /// \param val the vector
1246 /// \returns vector created from low dword
1247 /// \details VecGetLow() extracts the low dword from a vector. The low dword
1248 /// is composed of the least significant bits and occupies bytes 8 through 15
1249 /// when viewed as a big endian array. The return vector is the same type as
1250 /// the original vector and padded with 0's in the most significant bit positions.
1251 /// \par Wraps
1252 /// vec_sld
1253 /// \since Crypto++ 7.0
1254 template <class T>
1255 inline T VecGetLow(const T val)
1256 {
1257  //const T zero = {0};
1258  //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
1259  //return (T)vec_perm(zero, val, mask);
1260  return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1261 }
1262 
1263 /// \brief Extract a dword from a vector
1264 /// \tparam T vector type
1265 /// \param val the vector
1266 /// \returns vector created from high dword
1267 /// \details VecGetHigh() extracts the high dword from a vector. The high dword
1268 /// is composed of the most significant bits and occupies bytes 0 through 7
1269 /// when viewed as a big endian array. The return vector is the same type as
1270 /// the original vector and padded with 0's in the most significant bit positions.
1271 /// \par Wraps
1272 /// vec_sld
1273 /// \since Crypto++ 7.0
1274 template <class T>
1275 inline T VecGetHigh(const T val)
1276 {
1277  //const T zero = {0};
1278  //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
1279  //return (T)vec_perm(zero, val, mask);
1280  return VecShiftRightOctet<8>(val);
1281 }
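
// A minimal sketch: assuming an Altivec-enabled build, VecGetLow() and
// VecGetHigh() split a vector into its two dwords, each returned in the
// low half of a vector with the other half zeroed (element numbering
// follows the big endian convention used throughout this header).
//
//   const uint8x16_p v  = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
//   const uint8x16_p lo = VecGetLow(v);   // {0,0,0,0, 0,0,0,0, 8,...,15}
//   const uint8x16_p hi = VecGetHigh(v);  // {0,0,0,0, 0,0,0,0, 0,...,7}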
1282 
1283 /// \brief Compare two vectors
1284 /// \tparam T1 vector type
1285 /// \tparam T2 vector type
1286 /// \param vec1 the first vector
1287 /// \param vec2 the second vector
1288 /// \returns true if vec1 equals vec2, false otherwise
1289 /// \details VecEqual() performs a bitwise compare. The vector element types do
1290 /// not matter.
1291 /// \par Wraps
1292 /// vec_all_eq
1293 /// \since Crypto++ 8.0
1294 template <class T1, class T2>
1295 inline bool VecEqual(const T1 vec1, const T2 vec2)
1296 {
1297  return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1298 }
1299 
1300 /// \brief Compare two vectors
1301 /// \tparam T1 vector type
1302 /// \tparam T2 vector type
1303 /// \param vec1 the first vector
1304 /// \param vec2 the second vector
1305 /// \returns true if vec1 does not equal vec2, false otherwise
1306 /// \details VecNotEqual() performs a bitwise compare. The vector element types do
1307 /// not matter.
1308 /// \par Wraps
1309 /// vec_all_eq
1310 /// \since Crypto++ 8.0
1311 template <class T1, class T2>
1312 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1313 {
1314  return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1315 }
1316 
1317 //////////////////////// Power8 Crypto ////////////////////////
1318 
1319 #if defined(__CRYPTO__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1320 
1321 /// \brief One round of AES encryption
1322 /// \tparam T1 vector type
1323 /// \tparam T2 vector type
1324 /// \param state the state vector
1325 /// \param key the subkey vector
1326 /// \details VecEncrypt() performs one round of AES encryption of state
1327 /// using subkey key. The return vector is the same type as state.
1328 /// \details VecEncrypt() is available on POWER8 and above.
1329 /// \par Wraps
1330 /// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
1331 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1332 template <class T1, class T2>
1333 inline T1 VecEncrypt(const T1 state, const T2 key)
1334 {
1335 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1336  return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
1337 #elif defined(__clang__)
1338  return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
1339 #elif defined(__GNUC__)
1340  return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
1341 #else
1342  CRYPTOPP_ASSERT(0);
1343 #endif
1344 }
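
// A hypothetical sketch of a full AES-128 block encryption with the POWER8
// built-ins, assuming a POWER8 build (-qarch=pwr8 or -mcpu=power8) and
// assuming the expanded round keys are already available in the made-up
// array rk[0..10] of uint8x16_p vectors; ptr is a made-up 16-byte buffer.
// VecEncryptLast() is declared below. Whether VecLoadBE() or VecLoad() is
// appropriate depends on how the key schedule was prepared.
//
//   uint8x16_p block = (uint8x16_p)VecLoadBE(ptr);
//   block = VecXor(block, rk[0]);                    // initial AddRoundKey
//   for (unsigned int i = 1; i < 10; ++i)
//       block = VecEncrypt(block, rk[i]);            // middle rounds
//   block = VecEncryptLast(block, rk[10]);           // final round
//   VecStoreBE(block, ptr);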
1345 
1346 /// \brief Final round of AES encryption
1347 /// \tparam T1 vector type
1348 /// \tparam T2 vector type
1349 /// \param state the state vector
1350 /// \param key the subkey vector
1351 /// \details VecEncryptLast() performs the final round of AES encryption
1352 /// of state using subkey key. The return vector is the same type as state.
1353 /// \details VecEncryptLast() is available on POWER8 and above.
1354 /// \par Wraps
1355 /// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
1356 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1357 template <class T1, class T2>
1358 inline T1 VecEncryptLast(const T1 state, const T2 key)
1359 {
1360 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1361  return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
1362 #elif defined(__clang__)
1363  return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
1364 #elif defined(__GNUC__)
1365  return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
1366 #else
1367  CRYPTOPP_ASSERT(0);
1368 #endif
1369 }
1370 
1371 /// \brief One round of AES decryption
1372 /// \tparam T1 vector type
1373 /// \tparam T2 vector type
1374 /// \param state the state vector
1375 /// \param key the subkey vector
1376 /// \details VecDecrypt() performs one round of AES decryption of state
1377 /// using subkey key. The return vector is the same type as state.
1378 /// \details VecDecrypt() is available on POWER8 and above.
1379 /// \par Wraps
1380 /// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
1381 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1382 template <class T1, class T2>
1383 inline T1 VecDecrypt(const T1 state, const T2 key)
1384 {
1385 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1386  return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
1387 #elif defined(__clang__)
1388  return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
1389 #elif defined(__GNUC__)
1390  return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
1391 #else
1392  CRYPTOPP_ASSERT(0);
1393 #endif
1394 }
1395 
1396 /// \brief Final round of AES decryption
1397 /// \tparam T1 vector type
1398 /// \tparam T2 vector type
1399 /// \param state the state vector
1400 /// \param key the subkey vector
1401 /// \details VecDecryptLast() performs the final round of AES decryption
1402 /// of state using subkey key. The return vector is the same type as state.
1403 /// \details VecDecryptLast() is available on POWER8 and above.
1404 /// \par Wraps
1405 /// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
1406 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1407 template <class T1, class T2>
1408 inline T1 VecDecryptLast(const T1 state, const T2 key)
1409 {
1410 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1411  return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
1412 #elif defined(__clang__)
1413  return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
1414 #elif defined(__GNUC__)
1415  return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
1416 #else
1417  CRYPTOPP_ASSERT(0);
1418 #endif
1419 }
1420 
1421 /// \brief SHA256 Sigma functions
1422 /// \tparam func function
1423 /// \tparam fmask function mask
1424 /// \tparam T vector type
1425 /// \param vec the block to transform
1426 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
1427 /// func and fmask. The return vector is the same type as vec.
1428 /// \details VecSHA256() is available on POWER8 and above.
1429 /// \par Wraps
1430 /// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
1431 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1432 template <int func, int fmask, class T>
1433 inline T VecSHA256(const T vec)
1434 {
1435 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1436  return (T)__vshasigmaw((uint32x4_p)vec, func, fmask);
1437 #elif defined(__clang__)
1438  return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)vec, func, fmask);
1439 #elif defined(__GNUC__)
1440  return (T)__builtin_crypto_vshasigmaw((uint32x4_p)vec, func, fmask);
1441 #else
1442  CRYPTOPP_ASSERT(0);
1443 #endif
1444 }
1445 
1446 /// \brief SHA512 Sigma functions
1447 /// \tparam func function
1448 /// \tparam fmask function mask
1449 /// \tparam T vector type
1450 /// \param vec the block to transform
1451 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
1452 /// func and fmask. The return vector is the same type as vec.
1453 /// \details VecSHA512() is available on POWER8 and above.
1454 /// \par Wraps
1455 /// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
1456 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
1457 template <int func, int fmask, class T>
1458 inline T VecSHA512(const T vec)
1459 {
1460 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
1461  return (T)__vshasigmad((uint64x2_p)vec, func, fmask);
1462 #elif defined(__clang__)
1463  return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)vec, func, fmask);
1464 #elif defined(__GNUC__)
1465  return (T)__builtin_crypto_vshasigmad((uint64x2_p)vec, func, fmask);
1466 #else
1467  CRYPTOPP_ASSERT(0);
1468 #endif
1469 }
1470 
1471 #endif // __CRYPTO__
1472 
1473 #endif // __ALTIVEC__
1474 
1475 NAMESPACE_END
1476 
1477 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
1478 # pragma GCC diagnostic pop
1479 #endif
1480 
1481 #endif // CRYPTOPP_PPC_CRYPTO_H