Crypto++  8.2
Free C++ class library of cryptographic schemes
xts.cpp
1 // xts.cpp - written and placed in the public domain by Jeffrey Walton
2 
3 // Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
4 // base architecture. We can use the SIMD code below without an
5 // architecture option. No runtime tests are required. Unfortunately,
6 // we can't use it on Altivec because an architecture switch is required.
7 // The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
8 // 16-byte block sizes.
9 
10 #include "pch.h"
11 
12 #include "xts.h"
13 #include "misc.h"
14 #include "modes.h"
15 #include "cpu.h"
16 
17 #if defined(CRYPTOPP_DEBUG)
18 # include "aes.h"
19 # include "threefish.h"
20 #endif
21 
22 // 0.3 to 0.4 cpb profit
23 #if defined(__SSE2__) || defined(_M_X64)
24 # include <emmintrin.h>
25 // Clang intrinsic casts
26 # define M128_CAST(x) ((__m128i *)(void *)(x))
27 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
28 #endif
29 
30 
31 #if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
32 # if (CRYPTOPP_ARM_NEON_HEADER)
33 # include <arm_neon.h>
34 # endif
35 #endif
36 
37 #if defined(__ALTIVEC__)
38 # include "ppc_simd.h"
39 #endif
40 
41 ANONYMOUS_NAMESPACE_BEGIN
42 
43 using namespace CryptoPP;
44 
45 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
46 
47 using CryptoPP::AES;
48 using CryptoPP::XTS_Mode;
49 using CryptoPP::Threefish512;
50 
51 void Modes_TestInstantiations()
52 {
53  XTS_Mode<AES>::Encryption m0;
54  XTS_Mode<AES>::Decryption m1;
55  XTS_Mode<AES>::Encryption m2;
56  XTS_Mode<AES>::Decryption m3;
57 
58 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
59  XTS_Mode<Threefish512>::Encryption m4;
60  XTS_Mode<Threefish512>::Decryption m5;
61 #endif
62 }
63 #endif // CRYPTOPP_DEBUG
64 
65 inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
66 {
67  CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
68 
69 #if defined(CRYPTOPP_DISABLE_ASM)
70  xorbuf(output, input, mask, count);
71 
72 #elif defined(__SSE2__) || defined(_M_X64)
73  for (size_t i=0; i<count; i+=16)
74  _mm_storeu_si128(M128_CAST(output+i),
75  _mm_xor_si128(
76  _mm_loadu_si128(CONST_M128_CAST(input+i)),
77  _mm_loadu_si128(CONST_M128_CAST(mask+i))));
78 
79 #elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
80  for (size_t i=0; i<count; i+=16)
81  vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
82 
83 #elif defined(__ALTIVEC__)
84  for (size_t i=0; i<count; i+=16)
85  VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);
86 
87 #else
88  xorbuf(output, input, mask, count);
89 #endif
90 }
91 
92 inline void XorBuffer(byte *buf, const byte *mask, size_t count)
93 {
94  XorBuffer(buf, buf, mask, count);
95 }
96 
97 // Borrowed from CMAC, but little-endian representation
98 inline void GF_Double(byte *out, const byte* in, unsigned int len)
99 {
100 #if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
101  word64 carry = 0, x;
102  for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
103  {
104  x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
105  word64 y = (x >> 63); x = (x << 1) + carry;
106  PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
107  carry = y;
108  }
109 #else
110  word32 carry = 0, x;
111  for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
112  {
113  x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
114  word32 y = (x >> 31); x = (x << 1) + carry;
115  PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
116  carry = y;
117  }
118 #endif
119 
120 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
121 
123  CRYPTOPP_ASSERT(len >= 16);
124  CRYPTOPP_ASSERT(len <= 128);
125 
126  byte* k = out;
127  if (carry)
128  {
129  switch (len)
130  {
131  case 16:
132  {
133  const size_t LEIDX = 16-1;
134  k[LEIDX-15] ^= 0x87;
135  break;
136  }
137  case 32:
138  {
139  // https://crypto.stackexchange.com/q/9815/10496
140  // Polynomial x^256 + x^10 + x^5 + x^2 + 1
141  const size_t LEIDX = 32-1;
142  k[LEIDX-30] ^= 4;
143  k[LEIDX-31] ^= 0x25;
144  break;
145  }
146  case 64:
147  {
148  // https://crypto.stackexchange.com/q/9815/10496
149  // Polynomial x^512 + x^8 + x^5 + x^2 + 1
150  const size_t LEIDX = 64-1;
151  k[LEIDX-62] ^= 1;
152  k[LEIDX-63] ^= 0x25;
153  break;
154  }
155  case 128:
156  {
157  // https://crypto.stackexchange.com/q/9815/10496
158  // Polynomial x^1024 + x^19 + x^6 + x + 1
159  const size_t LEIDX = 128-1;
160  k[LEIDX-125] ^= 8;
161  k[LEIDX-126] ^= 0x00;
162  k[LEIDX-127] ^= 0x43;
163  break;
164  }
165  default:
166  CRYPTOPP_ASSERT(0);
167  }
168  }
169 #else
170  CRYPTOPP_ASSERT(len == 16);
171 
172  byte* k = out;
173  if (carry)
174  {
175  k[0] ^= 0x87;
176  return;
177  }
178 #endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
179 }
180 
181 inline void GF_Double(byte *inout, unsigned int len)
182 {
183  GF_Double(inout, inout, len);
184 }
185 
186 ANONYMOUS_NAMESPACE_END
187 
188 NAMESPACE_BEGIN(CryptoPP)
189 
190 void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
191 {
192 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
193  CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
194  if (length < 16 || length > 128 || !IsPowerOf2(length))
195  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
196 #else
197  CRYPTOPP_ASSERT(length == 16);
198  if (length != 16)
199  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
200 #endif
201 }
202 
204 {
205  CRYPTOPP_ASSERT(length % 2 == 0);
206  if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
207  throw InvalidKeyLength(AlgorithmName(), length);
208 }
209 
210 void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
211 {
212  ThrowIfInvalidKeyLength(length);
213  ThrowIfInvalidBlockSize(BlockSize());
214 
215  const size_t klen = length/2;
216  AccessBlockCipher().SetKey(key+0, klen, params);
217  AccessTweakCipher().SetKey(key+klen, klen, params);
218 
219  ResizeBuffers();
220 
221  size_t ivLength;
222  const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
223  Resynchronize(iv, (int)ivLength);
224 }
225 
226 void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
227 {
229  std::memcpy(m_xregister, m_register, ivLength);
230  GetTweakCipher().ProcessBlock(m_xregister);
231 }
232 
233 void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
234 {
235  SecByteBlock iv(GetTweakCipher().BlockSize());
236  PutWord<word64>(false, order, iv, sector);
237  std::memset(iv+8, 0x00, iv.size()-8);
238 
240  std::memcpy(m_xregister, iv, iv.size());
241  GetTweakCipher().ProcessBlock(m_xregister);
242 }
243 
244 void XTS_ModeBase::ResizeBuffers()
245 {
246  BlockOrientedCipherModeBase::ResizeBuffers();
247  m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
248  m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
249 }
250 
251 // ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
252 // selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
253 // can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
254 // of registers. The unneeded code paths should be removed by optimizer.
255 // The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
256 void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
257 {
258  // data unit is multiple of 16 bytes
259  CRYPTOPP_ASSERT(length % BlockSize() == 0);
260 
261  enum { lastParallelBlock = ParallelBlocks-1 };
262  const unsigned int blockSize = GetBlockCipher().BlockSize();
263  const size_t parallelSize = blockSize*ParallelBlocks;
264 
265  // encrypt the data unit, optimal size at a time
266  while (length >= parallelSize)
267  {
268  // m_xregister[0] always points to the next tweak.
269  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
270  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
271  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
272 
273  if (ParallelBlocks > 4)
274  {
275  GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
276  GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
277  GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
278  GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
279  }
280  if (ParallelBlocks > 8)
281  {
282  GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
283  GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
284  GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
285  GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
286  }
287 
288  // merge the tweak into the input block
289  XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
290 
291  // encrypt one block, merge the tweak into the output block
292  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
293  outString, parallelSize, BlockTransformation::BT_AllowParallel);
294 
295  // m_xregister[0] always points to the next tweak.
296  GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
297 
298  inString += parallelSize;
299  outString += parallelSize;
300  length -= parallelSize;
301  }
302 
303  // encrypt the data unit, 4 blocks at a time
304  while (ParallelBlocks == 12 && length >= blockSize*4)
305  {
306  // m_xregister[0] always points to the next tweak.
307  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
308  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
309  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
310 
311  // merge the tweak into the input block
312  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
313 
314  // encrypt one block, merge the tweak into the output block
315  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
316  outString, blockSize*4, BlockTransformation::BT_AllowParallel);
317 
318  // m_xregister[0] always points to the next tweak.
319  GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
320 
321  inString += blockSize*4;
322  outString += blockSize*4;
323  length -= blockSize*4;
324  }
325 
326  // encrypt the data unit, 2 blocks at a time
327  while (ParallelBlocks == 8 && length >= blockSize*2)
328  {
329  // m_xregister[0] always points to the next tweak.
330  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
331 
332  // merge the tweak into the input block
333  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
334 
335  // encrypt one block, merge the tweak into the output block
336  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
337  outString, blockSize*2, BlockTransformation::BT_AllowParallel);
338 
339  // m_xregister[0] always points to the next tweak.
340  GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
341 
342  inString += blockSize*2;
343  outString += blockSize*2;
344  length -= blockSize*2;
345  }
346 
347  // encrypt the data unit, blocksize at a time
348  while (length)
349  {
350  // merge the tweak into the input block
351  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
352 
353  // encrypt one block
354  GetBlockCipher().ProcessBlock(m_xworkspace);
355 
356  // merge the tweak into the output block
357  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
358 
359  // Multiply T by alpha
360  GF_Double(m_xregister, blockSize);
361 
362  inString += blockSize;
363  outString += blockSize;
364  length -= blockSize;
365  }
366 }
367 
368 size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
369 {
370  // need at least a full AES block
371  CRYPTOPP_ASSERT(inLength >= BlockSize());
372 
373  if (inLength < BlockSize())
374  throw InvalidArgument("XTS: message is too short for ciphertext stealing");
375 
376  if (IsForwardTransformation())
377  return ProcessLastPlainBlock(outString, outLength, inString, inLength);
378  else
379  return ProcessLastCipherBlock(outString, outLength, inString, inLength);
380 }
381 
382 size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
383 {
384  // ensure output buffer is large enough
385  CRYPTOPP_ASSERT(outLength >= inLength);
386 
387  const unsigned int blockSize = GetBlockCipher().BlockSize();
388  const size_t blocks = inLength / blockSize;
389  const size_t tail = inLength % blockSize;
390  outLength = inLength;
391 
392  if (tail == 0)
393  {
394  // Allow ProcessData to handle all the full blocks
395  ProcessData(outString, inString, inLength);
396  return inLength;
397  }
398  else if (blocks > 1)
399  {
400  // Allow ProcessData to handle full blocks except one
401  const size_t head = (blocks-1)*blockSize;
402  ProcessData(outString, inString, inLength-head);
403 
404  outString += head;
405  inString += head; inLength -= head;
406  }
407 
408  ///// handle the full block /////
409 
410  // merge the tweak into the input block
411  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
412 
413  // encrypt one block
414  GetBlockCipher().ProcessBlock(m_xworkspace);
415 
416  // merge the tweak into the output block
417  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
418 
419  // Multiply T by alpha
420  GF_Double(m_xregister, blockSize);
421 
422  ///// handle final partial block /////
423 
424  inString += blockSize;
425  outString += blockSize;
426  const size_t len = inLength-blockSize;
427 
428  // copy in the final plaintext bytes
429  std::memcpy(m_xworkspace, inString, len);
430  // and copy out the final ciphertext bytes
431  std::memcpy(outString, outString-blockSize, len);
432  // "steal" ciphertext to complete the block
433  std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
434 
435  // merge the tweak into the input block
436  XorBuffer(m_xworkspace, m_xregister, blockSize);
437 
438  // encrypt one block
439  GetBlockCipher().ProcessBlock(m_xworkspace);
440 
441  // merge the tweak into the previous output block
442  XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
443 
444  return outLength;
445 }
446 
447 size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
448 {
449  // ensure output buffer is large enough
450  CRYPTOPP_ASSERT(outLength >= inLength);
451 
452  const unsigned int blockSize = GetBlockCipher().BlockSize();
453  const size_t blocks = inLength / blockSize;
454  const size_t tail = inLength % blockSize;
455  outLength = inLength;
456 
457  if (tail == 0)
458  {
459  // Allow ProcessData to handle all the full blocks
460  ProcessData(outString, inString, inLength);
461  return inLength;
462  }
463  else if (blocks > 1)
464  {
465  // Allow ProcessData to handle full blocks except one
466  const size_t head = (blocks-1)*blockSize;
467  ProcessData(outString, inString, inLength-head);
468 
469  outString += head;
470  inString += head; inLength -= head;
471  }
472 
473  #define poly1 (m_xregister+0*blockSize)
474  #define poly2 (m_xregister+1*blockSize)
475  GF_Double(poly2, poly1, blockSize);
476 
477  ///// handle final partial block /////
478 
479  inString += blockSize;
480  outString += blockSize;
481  const size_t len = inLength-blockSize;
482 
483  // merge the tweak into the input block
484  XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
485 
486  // encrypt one block
487  GetBlockCipher().ProcessBlock(m_xworkspace);
488 
489  // merge the tweak into the output block
490  XorBuffer(m_xworkspace, poly2, blockSize);
491 
492  // copy in the final plaintext bytes
493  std::memcpy(outString-blockSize, inString, len);
494  // and copy out the final ciphertext bytes
495  std::memcpy(outString, m_xworkspace, len);
496  // "steal" ciphertext to complete the block
497  std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
498 
499  ///// handle the full previous block /////
500 
501  inString -= blockSize;
502  outString -= blockSize;
503 
504  // merge the tweak into the input block
505  XorBuffer(m_xworkspace, outString, poly1, blockSize);
506 
507  // encrypt one block
508  GetBlockCipher().ProcessBlock(m_xworkspace);
509 
510  // merge the tweak into the output block
511  XorBuffer(outString, m_xworkspace, poly1, blockSize);
512 
513  return outLength;
514 }
515 
516 NAMESPACE_END
XTS_ModeBase::Resynchronize
void Resynchronize(const byte *iv, int ivLength=-1)
Resynchronize with an IV.
Definition: xts.cpp:226
modes.h
Classes for block cipher modes of operation.
M128_CAST
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:856
SecByteBlock
SecBlock<byte> typedef.
Definition: secblock.h:1091
CRYPTOPP_ASSERT
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:69
VecStore
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:891
LITTLE_ENDIAN_ORDER
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:145
IsPowerOf2
bool IsPowerOf2(const T &value)
Tests whether a value is a power of 2.
Definition: misc.h:987
BlockOrientedCipherModeBase::Resynchronize
void Resynchronize(const byte *iv, int length=-1)
Resynchronize with an IV.
Definition: modes.h:260
XTS_ModeBase::ThrowIfInvalidKeyLength
void ThrowIfInvalidKeyLength(size_t length)
Validates the key length.
Definition: xts.cpp:203
xts.h
Classes for XTS block cipher mode of operation.
pch.h
Precompiled header file.
XTS_ModeBase
XTS block cipher mode of operation default implementation.
Definition: xts.h:49
aes.h
Class file for the AES cipher (Rijndael)
misc.h
Utility functions for the Crypto++ library.
xorbuf
CRYPTOPP_DLL void CRYPTOPP_API xorbuf(byte *buf, const byte *mask, size_t count)
Performs an XOR of a buffer with a mask.
Definition: misc.cpp:46
XTS_ModeBase::SetKey
void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
Definition: xts.cpp:210
ByteOrder
ByteOrder
Provides the byte ordering.
Definition: cryptlib.h:143
cpu.h
Functions for CPU features and intrinsics.
VecLoad
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:365
XTS_ModeBase::ProcessLastBlock
size_t ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
Encrypt or decrypt the last block of data.
Definition: xts.cpp:368
SecBlock::size
size_type size() const
Provides the count of elements in the SecBlock.
Definition: secblock.h:829
InvalidArgument
An invalid argument was detected.
Definition: cryptlib.h:202
VecXor
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1410
CryptoPP
Crypto++ library namespace.
CONST_M128_CAST
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:861
ppc_simd.h
Support functions for PowerPC and vector operations.
InvalidKeyLength
Exception thrown when an invalid key length is encountered.
Definition: simple.h:55
XTS_ModeBase::ProcessData
void ProcessData(byte *outString, const byte *inString, size_t length)
Encrypt or decrypt an array of bytes.
Definition: xts.cpp:256
NameValuePairs
Interface for retrieving values given their names.
Definition: cryptlib.h:321
threefish.h
Classes for the Threefish block cipher.
BlockTransformation::BT_AllowParallel
@ BT_AllowParallel
Allow parallel transformations.
Definition: cryptlib.h:925