Crypto++  8.2
Free C++ class library of cryptographic schemes
xts.cpp
1 // xts.cpp - written and placed in the public domain by Jeffrey Walton
2 
3 // Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
4 // base architecture. We can use the SIMD code below without an
5 // architecture option. No runtime tests are required. Unfortunately,
6 // we can't use it on Altivec because an architecture switch is required.
7 // The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
8 // 16-byte block sizes.
9 
10 #include "pch.h"
11 
12 #include "xts.h"
13 #include "misc.h"
14 #include "modes.h"
15 #include "cpu.h"
16 
17 #if defined(CRYPTOPP_DEBUG)
18 # include "aes.h"
19 # include "threefish.h"
20 #endif
21 
22 // 0.3 to 0.4 cpb profit
23 #if defined(__SSE2__) || defined(_M_X64)
24 # include <emmintrin.h>
25 // Clang intrinsic casts
26 # define M128_CAST(x) ((__m128i *)(void *)(x))
27 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
28 #endif
29 
30 
31 #if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
32 # if (CRYPTOPP_ARM_NEON_HEADER)
33 # include <arm_neon.h>
34 # endif
35 #endif
36 
37 #if defined(__ALTIVEC__)
38 # include "ppc_simd.h"
39 #endif
40 
41 ANONYMOUS_NAMESPACE_BEGIN
42 
43 using namespace CryptoPP;
44 
45 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
46 
47 using CryptoPP::AES;
48 using CryptoPP::XTS_Mode;
49 using CryptoPP::Threefish512;
50 
51 void Modes_TestInstantiations()
52 {
53  XTS_Mode<AES>::Encryption m0;
54  XTS_Mode<AES>::Decryption m1;
55  XTS_Mode<AES>::Encryption m2;
56  XTS_Mode<AES>::Decryption m3;
57 
58 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
59  XTS_Mode<Threefish512>::Encryption m4;
60  XTS_Mode<Threefish512>::Decryption m5;
61 #endif
62 }
63 #endif // CRYPTOPP_DEBUG
64 
65 inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
66 {
67  CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
68 
69 #if defined(CRYPTOPP_DISABLE_ASM)
70  xorbuf(output, input, mask, count);
71 
72 #elif defined(__SSE2__) || defined(_M_X64)
73  for (size_t i=0; i<count; i+=16)
74  _mm_storeu_si128(M128_CAST(output+i),
75  _mm_xor_si128(
76  _mm_loadu_si128(CONST_M128_CAST(input+i)),
77  _mm_loadu_si128(CONST_M128_CAST(mask+i))));
78 
79 #elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
80  for (size_t i=0; i<count; i+=16)
81  vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
82 
83 #elif defined(__ALTIVEC__)
84  for (size_t i=0; i<count; i+=16)
85  VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);
86 
87 #else
88  xorbuf(output, input, mask, count);
89 #endif
90 }
91 
92 inline void XorBuffer(byte *buf, const byte *mask, size_t count)
93 {
94  XorBuffer(buf, buf, mask, count);
95 }
96 
97 // Borrowed from CMAC, but little-endian representation
98 inline void GF_Double(byte *out, const byte* in, unsigned int len)
99 {
100 #if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
101  word64 carry = 0, x;
102  for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
103  {
104  x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
105  word64 y = (x >> 63); x = (x << 1) + carry;
106  PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
107  carry = y;
108  }
109 #else
110  word32 carry = 0, x;
111  for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
112  {
113  x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
114  word32 y = (x >> 31); x = (x << 1) + carry;
115  PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
116  carry = y;
117  }
118 #endif
119 
120 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
121 
123  CRYPTOPP_ASSERT(len >= 16);
124  CRYPTOPP_ASSERT(len <= 128);
125 
126  byte* k = out;
127  if (carry)
128  {
129  switch (len)
130  {
131  case 16:
132  {
133  const size_t LEIDX = 16-1;
134  k[LEIDX-15] ^= 0x87;
135  break;
136  }
137  case 32:
138  {
139  // https://crypto.stackexchange.com/q/9815/10496
140  // Polynomial x^256 + x^10 + x^5 + x^2 + 1
141  const size_t LEIDX = 32-1;
142  k[LEIDX-30] ^= 4;
143  k[LEIDX-31] ^= 0x25;
144  break;
145  }
146  case 64:
147  {
148  // https://crypto.stackexchange.com/q/9815/10496
149  // Polynomial x^512 + x^8 + x^5 + x^2 + 1
150  const size_t LEIDX = 64-1;
151  k[LEIDX-62] ^= 1;
152  k[LEIDX-63] ^= 0x25;
153  break;
154  }
155  case 128:
156  {
157  // https://crypto.stackexchange.com/q/9815/10496
158  // Polynomial x^1024 + x^19 + x^6 + x + 1
159  const size_t LEIDX = 128-1;
160  k[LEIDX-125] ^= 8;
161  k[LEIDX-126] ^= 0x00;
162  k[LEIDX-127] ^= 0x43;
163  break;
164  }
165  default:
166  CRYPTOPP_ASSERT(0);
167  }
168  }
169 #else
170  CRYPTOPP_ASSERT(len == 16);
171 
172  byte* k = out;
173  if (carry)
174  {
175  k[0] ^= 0x87;
176  return;
177  }
178 #endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
179 }
180 
181 inline void GF_Double(byte *inout, unsigned int len)
182 {
183  GF_Double(inout, inout, len);
184 }
185 
186 ANONYMOUS_NAMESPACE_END
187 
188 NAMESPACE_BEGIN(CryptoPP)
189 
190 void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
191 {
192 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
193  CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
194  if (length < 16 || length > 128 || !IsPowerOf2(length))
195  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
196 #else
197  CRYPTOPP_ASSERT(length == 16);
198  if (length != 16)
199  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
200 #endif
201 }
202 
204 {
205  CRYPTOPP_ASSERT(length % 2 == 0);
206  if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
207  throw InvalidKeyLength(AlgorithmName(), length);
208 }
209 
210 void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
211 {
212  ThrowIfInvalidKeyLength(length);
213  ThrowIfInvalidBlockSize(BlockSize());
214 
215  const size_t klen = length/2;
216  AccessBlockCipher().SetKey(key+0, klen, params);
217  AccessTweakCipher().SetKey(key+klen, klen, params);
218 
219  ResizeBuffers();
220 
221  size_t ivLength;
222  const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
223  Resynchronize(iv, (int)ivLength);
224 }
225 
226 void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
227 {
229  std::memcpy(m_xregister, m_register, ivLength);
230  GetTweakCipher().ProcessBlock(m_xregister);
231 }
232 
233 void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
234 {
235  SecByteBlock iv(GetTweakCipher().BlockSize());
236  PutWord<word64>(false, order, iv, sector);
237  std::memset(iv+8, 0x00, iv.size()-8);
238 
239  BlockOrientedCipherModeBase::Resynchronize(iv, (int)iv.size());
240  std::memcpy(m_xregister, iv, iv.size());
241  GetTweakCipher().ProcessBlock(m_xregister);
242 }
243 
244 void XTS_ModeBase::ResizeBuffers()
245 {
246  BlockOrientedCipherModeBase::ResizeBuffers();
247  m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
248  m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
249 }
250 
251 // ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
252 // selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
253 // can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
254 // of registers. The unneeded code paths should be removed by optimizer.
255 // The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
256 void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
257 {
258  // data unit is multiple of 16 bytes
259  CRYPTOPP_ASSERT(length % BlockSize() == 0);
260 
261  enum { lastParallelBlock = ParallelBlocks-1 };
262  const unsigned int blockSize = GetBlockCipher().BlockSize();
263  const size_t parallelSize = blockSize*ParallelBlocks;
264 
265  // encrypt the data unit, optimal size at a time
266  while (length >= parallelSize)
267  {
268  // m_xregister[0] always points to the next tweak.
269  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
270  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
271  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
272 
273  if (ParallelBlocks > 4)
274  {
275  GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
276  GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
277  GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
278  GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
279  }
280  if (ParallelBlocks > 8)
281  {
282  GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
283  GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
284  GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
285  GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
286  }
287 
288  // merge the tweak into the input block
289  XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
290 
291  // encrypt one block, merge the tweak into the output block
292  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
293  outString, parallelSize, BlockTransformation::BT_AllowParallel);
294 
295  // m_xregister[0] always points to the next tweak.
296  GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
297 
298  inString += parallelSize;
299  outString += parallelSize;
300  length -= parallelSize;
301  }
302 
303  // encrypt the data unit, 4 blocks at a time
304  while (ParallelBlocks == 12 && length >= blockSize*4)
305  {
306  // m_xregister[0] always points to the next tweak.
307  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
308  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
309  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
310 
311  // merge the tweak into the input block
312  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
313 
314  // encrypt one block, merge the tweak into the output block
315  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
316  outString, blockSize*4, BlockTransformation::BT_AllowParallel);
317 
318  // m_xregister[0] always points to the next tweak.
319  GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
320 
321  inString += blockSize*4;
322  outString += blockSize*4;
323  length -= blockSize*4;
324  }
325 
326  // encrypt the data unit, 2 blocks at a time
327  while (ParallelBlocks == 8 && length >= blockSize*2)
328  {
329  // m_xregister[0] always points to the next tweak.
330  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
331 
332  // merge the tweak into the input block
333  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
334 
335  // encrypt one block, merge the tweak into the output block
336  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
337  outString, blockSize*2, BlockTransformation::BT_AllowParallel);
338 
339  // m_xregister[0] always points to the next tweak.
340  GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
341 
342  inString += blockSize*2;
343  outString += blockSize*2;
344  length -= blockSize*2;
345  }
346 
347  // encrypt the data unit, blocksize at a time
348  while (length)
349  {
350  // merge the tweak into the input block
351  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
352 
353  // encrypt one block
354  GetBlockCipher().ProcessBlock(m_xworkspace);
355 
356  // merge the tweak into the output block
357  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
358 
359  // Multiply T by alpha
360  GF_Double(m_xregister, blockSize);
361 
362  inString += blockSize;
363  outString += blockSize;
364  length -= blockSize;
365  }
366 }
367 
368 size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
369 {
370  // need at least a full AES block
371  CRYPTOPP_ASSERT(inLength >= BlockSize());
372 
373  if (inLength < BlockSize())
374  throw InvalidArgument("XTS: message is too short for ciphertext stealing");
375 
376  if (IsForwardTransformation())
377  return ProcessLastPlainBlock(outString, outLength, inString, inLength);
378  else
379  return ProcessLastCipherBlock(outString, outLength, inString, inLength);
380 }
381 
382 size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
383 {
384  // ensure output buffer is large enough
385  CRYPTOPP_ASSERT(outLength >= inLength);
386 
387  const unsigned int blockSize = GetBlockCipher().BlockSize();
388  const size_t blocks = inLength / blockSize;
389  const size_t tail = inLength % blockSize;
390  outLength = inLength;
391 
392  if (tail == 0)
393  {
394  // Allow ProcessData to handle all the full blocks
395  ProcessData(outString, inString, inLength);
396  return inLength;
397  }
398  else if (blocks > 1)
399  {
400  // Allow ProcessData to handle full blocks except one
401  const size_t head = (blocks-1)*blockSize;
402  ProcessData(outString, inString, inLength-head);
403 
404  outString += head;
405  inString += head; inLength -= head;
406  }
407 
408  ///// handle the full block /////
409 
410  // merge the tweak into the input block
411  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
412 
413  // encrypt one block
414  GetBlockCipher().ProcessBlock(m_xworkspace);
415 
416  // merge the tweak into the output block
417  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
418 
419  // Multiply T by alpha
420  GF_Double(m_xregister, blockSize);
421 
422  ///// handle final partial block /////
423 
424  inString += blockSize;
425  outString += blockSize;
426  const size_t len = inLength-blockSize;
427 
428  // copy in the final plaintext bytes
429  std::memcpy(m_xworkspace, inString, len);
430  // and copy out the final ciphertext bytes
431  std::memcpy(outString, outString-blockSize, len);
432  // "steal" ciphertext to complete the block
433  std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
434 
435  // merge the tweak into the input block
436  XorBuffer(m_xworkspace, m_xregister, blockSize);
437 
438  // encrypt one block
439  GetBlockCipher().ProcessBlock(m_xworkspace);
440 
441  // merge the tweak into the previous output block
442  XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
443 
444  return outLength;
445 }
446 
447 size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
448 {
449  // ensure output buffer is large enough
450  CRYPTOPP_ASSERT(outLength >= inLength);
451 
452  const unsigned int blockSize = GetBlockCipher().BlockSize();
453  const size_t blocks = inLength / blockSize;
454  const size_t tail = inLength % blockSize;
455  outLength = inLength;
456 
457  if (tail == 0)
458  {
459  // Allow ProcessData to handle all the full blocks
460  ProcessData(outString, inString, inLength);
461  return inLength;
462  }
463  else if (blocks > 1)
464  {
465  // Allow ProcessData to handle full blocks except one
466  const size_t head = (blocks-1)*blockSize;
467  ProcessData(outString, inString, inLength-head);
468 
469  outString += head;
470  inString += head; inLength -= head;
471  }
472 
473  #define poly1 (m_xregister+0*blockSize)
474  #define poly2 (m_xregister+1*blockSize)
475  GF_Double(poly2, poly1, blockSize);
476 
477  ///// handle final partial block /////
478 
479  inString += blockSize;
480  outString += blockSize;
481  const size_t len = inLength-blockSize;
482 
483  // merge the tweak into the input block
484  XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
485 
486  // encrypt one block
487  GetBlockCipher().ProcessBlock(m_xworkspace);
488 
489  // merge the tweak into the output block
490  XorBuffer(m_xworkspace, poly2, blockSize);
491 
492  // copy in the final plaintext bytes
493  std::memcpy(outString-blockSize, inString, len);
494  // and copy out the final ciphertext bytes
495  std::memcpy(outString, m_xworkspace, len);
496  // "steal" ciphertext to complete the block
497  std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
498 
499  ///// handle the full previous block /////
500 
501  inString -= blockSize;
502  outString -= blockSize;
503 
504  // merge the tweak into the input block
505  XorBuffer(m_xworkspace, outString, poly1, blockSize);
506 
507  // encrypt one block
508  GetBlockCipher().ProcessBlock(m_xworkspace);
509 
510  // merge the tweak into the output block
511  XorBuffer(outString, m_xworkspace, poly1, blockSize);
512 
513  return outLength;
514 }
515 
516 NAMESPACE_END
Allow parallel transformations.
Definition: cryptlib.h:925
An invalid argument was detected.
Definition: cryptlib.h:202
void Resynchronize(const byte *iv, int length=-1)
Resynchronize with an IV.
Definition: modes.h:260
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:614
Utility functions for the Crypto++ library.
ByteOrder
Provides the byte ordering.
Definition: cryptlib.h:143
Classes for block cipher modes of operation.
Exception thrown when an invalid key length is encountered.
Definition: simple.h:55
void ThrowIfInvalidKeyLength(size_t length)
Validates the key length.
Definition: xts.cpp:203
SecBlock<byte> typedef.
Definition: secblock.h:1097
void Resynchronize(const byte *iv, int ivLength=-1)
Resynchronize with an IV.
Definition: xts.cpp:226
byte order is little-endian
Definition: cryptlib.h:145
void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
Definition: xts.cpp:210
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:609
Class file for the AES cipher (Rijndael)
Support functions for PowerPC and vector operations.
Classes for XTS block cipher mode of operation.
Precompiled header file.
Classes for the Threefish block cipher.
bool IsPowerOf2(const T &value)
Tests whether a value is a power of 2.
Definition: misc.h:990
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:891
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68
const char * BlockSize()
int, in bytes
Definition: argnames.h:27
Functions for CPU features and intrinsics.
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1410
void ProcessData(byte *outString, const byte *inString, size_t length)
Encrypt or decrypt an array of bytes.
Definition: xts.cpp:256
CRYPTOPP_DLL void CRYPTOPP_API xorbuf(byte *buf, const byte *mask, size_t count)
Performs an XOR of a buffer with a mask.
Definition: misc.cpp:46
XTS block cipher mode of operation default implementation.
Definition: xts.h:49
Crypto++ library namespace.
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:365
Interface for retrieving values given their names.
Definition: cryptlib.h:321
size_t ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
Encrypt or decrypt the last block of data.
Definition: xts.cpp:368