Crypto++  8.6
Free C++ class library of cryptographic schemes
xts.cpp
1 // xts.cpp - written and placed in the public domain by Jeffrey Walton
2 
3 // Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
4 // base architecture. We can use the SIMD code below without an
5 // architecture option. No runtime tests are required. Unfortunately,
6 // we can't use it on Altivec because an architecture switch is required.
7 // The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
8 // 16-byte block sizes.
9 
10 #include "pch.h"
11 
12 #include "xts.h"
13 #include "misc.h"
14 #include "modes.h"
15 #include "cpu.h"
16 
17 #if defined(CRYPTOPP_DEBUG)
18 # include "aes.h"
19 # include "threefish.h"
20 #endif
21 
22 // 0.3 to 0.4 cpb profit
23 #if defined(__SSE2__) || defined(_M_X64)
24 # include <emmintrin.h>
25 #endif
26 
27 #if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
28 # if (CRYPTOPP_ARM_NEON_HEADER)
29 # include <arm_neon.h>
30 # endif
31 #endif
32 
33 #if defined(__ALTIVEC__)
34 # include "ppc_simd.h"
35 #endif
36 
37 ANONYMOUS_NAMESPACE_BEGIN
38 
39 using namespace CryptoPP;
40 
41 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
42 
43 using CryptoPP::AES;
44 using CryptoPP::XTS_Mode;
45 using CryptoPP::Threefish512;
46 
47 void Modes_TestInstantiations()
48 {
49  XTS_Mode<AES>::Encryption m0;
50  XTS_Mode<AES>::Decryption m1;
51  XTS_Mode<AES>::Encryption m2;
52  XTS_Mode<AES>::Decryption m3;
53 
54 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
55  XTS_Mode<Threefish512>::Encryption m4;
56  XTS_Mode<Threefish512>::Decryption m5;
57 #endif
58 }
59 #endif // CRYPTOPP_DEBUG
60 
61 inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
62 {
63  CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
64 
65 #if defined(CRYPTOPP_DISABLE_ASM)
66  xorbuf(output, input, mask, count);
67 
68 #elif defined(__SSE2__) || defined(_M_X64)
69  for (size_t i=0; i<count; i+=16)
70  _mm_storeu_si128(M128_CAST(output+i),
71  _mm_xor_si128(
72  _mm_loadu_si128(CONST_M128_CAST(input+i)),
73  _mm_loadu_si128(CONST_M128_CAST(mask+i))));
74 
75 #elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
76  for (size_t i=0; i<count; i+=16)
77  vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
78 
79 #elif defined(__ALTIVEC__)
80  for (size_t i=0; i<count; i+=16)
81  VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);
82 
83 #else
84  xorbuf(output, input, mask, count);
85 #endif
86 }
87 
88 inline void XorBuffer(byte *buf, const byte *mask, size_t count)
89 {
90  XorBuffer(buf, buf, mask, count);
91 }
92 
93 // Borrowed from CMAC, but little-endian representation
94 inline void GF_Double(byte *out, const byte* in, unsigned int len)
95 {
96 #if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
97  word64 carry = 0, x;
98  for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
99  {
100  x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
101  word64 y = (x >> 63); x = (x << 1) + carry;
102  PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
103  carry = y;
104  }
105 #else
106  word32 carry = 0, x;
107  for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
108  {
109  x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
110  word32 y = (x >> 31); x = (x << 1) + carry;
111  PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
112  carry = y;
113  }
114 #endif
115 
116 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
117 
119  CRYPTOPP_ASSERT(len >= 16);
120  CRYPTOPP_ASSERT(len <= 128);
121 
122  byte* k = out;
123  if (carry)
124  {
125  switch (len)
126  {
127  case 16:
128  {
129  const size_t LEIDX = 16-1;
130  k[LEIDX-15] ^= 0x87;
131  break;
132  }
133  case 32:
134  {
135  // https://crypto.stackexchange.com/q/9815/10496
136  // Polynomial x^256 + x^10 + x^5 + x^2 + 1
137  const size_t LEIDX = 32-1;
138  k[LEIDX-30] ^= 4;
139  k[LEIDX-31] ^= 0x25;
140  break;
141  }
142  case 64:
143  {
144  // https://crypto.stackexchange.com/q/9815/10496
145  // Polynomial x^512 + x^8 + x^5 + x^2 + 1
146  const size_t LEIDX = 64-1;
147  k[LEIDX-62] ^= 1;
148  k[LEIDX-63] ^= 0x25;
149  break;
150  }
151  case 128:
152  {
153  // https://crypto.stackexchange.com/q/9815/10496
154  // Polynomial x^1024 + x^19 + x^6 + x + 1
155  const size_t LEIDX = 128-1;
156  k[LEIDX-125] ^= 8;
157  k[LEIDX-126] ^= 0x00;
158  k[LEIDX-127] ^= 0x43;
159  break;
160  }
161  default:
162  CRYPTOPP_ASSERT(0);
163  }
164  }
165 #else
166  CRYPTOPP_ASSERT(len == 16);
167 
168  byte* k = out;
169  if (carry)
170  {
171  k[0] ^= 0x87;
172  return;
173  }
174 #endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
175 }
176 
177 inline void GF_Double(byte *inout, unsigned int len)
178 {
179  GF_Double(inout, inout, len);
180 }
181 
182 ANONYMOUS_NAMESPACE_END
183 
184 NAMESPACE_BEGIN(CryptoPP)
185 
186 void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
187 {
188 #if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
189  CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
190  if (length < 16 || length > 128 || !IsPowerOf2(length))
191  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
192 #else
193  CRYPTOPP_ASSERT(length == 16);
194  if (length != 16)
195  throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
196 #endif
197 }
198 
200 {
201  CRYPTOPP_ASSERT(length % 2 == 0);
202  if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
203  throw InvalidKeyLength(AlgorithmName(), length);
204 }
205 
206 void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
207 {
208  ThrowIfInvalidKeyLength(length);
209  ThrowIfInvalidBlockSize(BlockSize());
210 
211  const size_t klen = length/2;
212  AccessBlockCipher().SetKey(key+0, klen, params);
213  AccessTweakCipher().SetKey(key+klen, klen, params);
214 
215  ResizeBuffers();
216 
217  size_t ivLength;
218  const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
219  Resynchronize(iv, (int)ivLength);
220 }
221 
222 void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
223 {
225  std::memcpy(m_xregister, m_register, ivLength);
226  GetTweakCipher().ProcessBlock(m_xregister);
227 }
228 
230 {
231  SecByteBlock iv(GetTweakCipher().BlockSize());
232  PutWord<word64>(false, order, iv, sector);
233  std::memset(iv+8, 0x00, iv.size()-8);
234 
236  std::memcpy(m_xregister, iv, iv.size());
237  GetTweakCipher().ProcessBlock(m_xregister);
238 }
239 
240 void XTS_ModeBase::ResizeBuffers()
241 {
242  BlockOrientedCipherModeBase::ResizeBuffers();
243  m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
244  m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
245 }
246 
247 // ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
248 // selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
249 // can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
250 // of registers. The unneeded code paths should be removed by optimizer.
251 // The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
252 void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
253 {
254  // data unit is multiple of 16 bytes
255  CRYPTOPP_ASSERT(length % BlockSize() == 0);
256 
257  enum { lastParallelBlock = ParallelBlocks-1 };
258  const unsigned int blockSize = GetBlockCipher().BlockSize();
259  const size_t parallelSize = blockSize*ParallelBlocks;
260 
261  // encrypt the data unit, optimal size at a time
262  while (length >= parallelSize)
263  {
264  // m_xregister[0] always points to the next tweak.
265  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
266  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
267  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
268 
269  if (ParallelBlocks > 4)
270  {
271  GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
272  GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
273  GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
274  GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
275  }
276  if (ParallelBlocks > 8)
277  {
278  GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
279  GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
280  GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
281  GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
282  }
283 
284  // merge the tweak into the input block
285  XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
286 
287  // encrypt one block, merge the tweak into the output block
288  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
289  outString, parallelSize, BlockTransformation::BT_AllowParallel);
290 
291  // m_xregister[0] always points to the next tweak.
292  GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
293 
294  inString += parallelSize;
295  outString += parallelSize;
296  length -= parallelSize;
297  }
298 
299  // encrypt the data unit, 4 blocks at a time
300  while (ParallelBlocks == 12 && length >= blockSize*4)
301  {
302  // m_xregister[0] always points to the next tweak.
303  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
304  GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
305  GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
306 
307  // merge the tweak into the input block
308  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
309 
310  // encrypt one block, merge the tweak into the output block
311  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
312  outString, blockSize*4, BlockTransformation::BT_AllowParallel);
313 
314  // m_xregister[0] always points to the next tweak.
315  GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
316 
317  inString += blockSize*4;
318  outString += blockSize*4;
319  length -= blockSize*4;
320  }
321 
322  // encrypt the data unit, 2 blocks at a time
323  while (ParallelBlocks == 8 && length >= blockSize*2)
324  {
325  // m_xregister[0] always points to the next tweak.
326  GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
327 
328  // merge the tweak into the input block
329  XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
330 
331  // encrypt one block, merge the tweak into the output block
332  GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
333  outString, blockSize*2, BlockTransformation::BT_AllowParallel);
334 
335  // m_xregister[0] always points to the next tweak.
336  GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
337 
338  inString += blockSize*2;
339  outString += blockSize*2;
340  length -= blockSize*2;
341  }
342 
343  // encrypt the data unit, blocksize at a time
344  while (length)
345  {
346  // merge the tweak into the input block
347  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
348 
349  // encrypt one block
350  GetBlockCipher().ProcessBlock(m_xworkspace);
351 
352  // merge the tweak into the output block
353  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
354 
355  // Multiply T by alpha
356  GF_Double(m_xregister, blockSize);
357 
358  inString += blockSize;
359  outString += blockSize;
360  length -= blockSize;
361  }
362 }
363 
364 size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
365 {
366  // need at least a full AES block
367  CRYPTOPP_ASSERT(inLength >= BlockSize());
368 
369  if (inLength < BlockSize())
370  throw InvalidArgument("XTS: message is too short for ciphertext stealing");
371 
372  if (IsForwardTransformation())
373  return ProcessLastPlainBlock(outString, outLength, inString, inLength);
374  else
375  return ProcessLastCipherBlock(outString, outLength, inString, inLength);
376 }
377 
378 size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
379 {
380  // ensure output buffer is large enough
381  CRYPTOPP_ASSERT(outLength >= inLength);
382 
383  const unsigned int blockSize = GetBlockCipher().BlockSize();
384  const size_t blocks = inLength / blockSize;
385  const size_t tail = inLength % blockSize;
386  outLength = inLength;
387 
388  if (tail == 0)
389  {
390  // Allow ProcessData to handle all the full blocks
391  ProcessData(outString, inString, inLength);
392  return inLength;
393  }
394  else if (blocks > 1)
395  {
396  // Allow ProcessData to handle full blocks except one
397  const size_t head = (blocks-1)*blockSize;
398  ProcessData(outString, inString, inLength-head);
399 
400  outString += head;
401  inString += head; inLength -= head;
402  }
403 
404  ///// handle the full block /////
405 
406  // merge the tweak into the input block
407  XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
408 
409  // encrypt one block
410  GetBlockCipher().ProcessBlock(m_xworkspace);
411 
412  // merge the tweak into the output block
413  XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
414 
415  // Multiply T by alpha
416  GF_Double(m_xregister, blockSize);
417 
418  ///// handle final partial block /////
419 
420  inString += blockSize;
421  outString += blockSize;
422  const size_t len = inLength-blockSize;
423 
424  // copy in the final plaintext bytes
425  std::memcpy(m_xworkspace, inString, len);
426  // and copy out the final ciphertext bytes
427  std::memcpy(outString, outString-blockSize, len);
428  // "steal" ciphertext to complete the block
429  std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
430 
431  // merge the tweak into the input block
432  XorBuffer(m_xworkspace, m_xregister, blockSize);
433 
434  // encrypt one block
435  GetBlockCipher().ProcessBlock(m_xworkspace);
436 
437  // merge the tweak into the previous output block
438  XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
439 
440  return outLength;
441 }
442 
443 size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
444 {
445  // ensure output buffer is large enough
446  CRYPTOPP_ASSERT(outLength >= inLength);
447 
448  const unsigned int blockSize = GetBlockCipher().BlockSize();
449  const size_t blocks = inLength / blockSize;
450  const size_t tail = inLength % blockSize;
451  outLength = inLength;
452 
453  if (tail == 0)
454  {
455  // Allow ProcessData to handle all the full blocks
456  ProcessData(outString, inString, inLength);
457  return inLength;
458  }
459  else if (blocks > 1)
460  {
461  // Allow ProcessData to handle full blocks except one
462  const size_t head = (blocks-1)*blockSize;
463  ProcessData(outString, inString, inLength-head);
464 
465  outString += head;
466  inString += head; inLength -= head;
467  }
468 
469  #define poly1 (m_xregister+0*blockSize)
470  #define poly2 (m_xregister+1*blockSize)
471  GF_Double(poly2, poly1, blockSize);
472 
473  ///// handle final partial block /////
474 
475  inString += blockSize;
476  outString += blockSize;
477  const size_t len = inLength-blockSize;
478 
479  // merge the tweak into the input block
480  XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
481 
482  // encrypt one block
483  GetBlockCipher().ProcessBlock(m_xworkspace);
484 
485  // merge the tweak into the output block
486  XorBuffer(m_xworkspace, poly2, blockSize);
487 
488  // copy in the final plaintext bytes
489  std::memcpy(outString-blockSize, inString, len);
490  // and copy out the final ciphertext bytes
491  std::memcpy(outString, m_xworkspace, len);
492  // "steal" ciphertext to complete the block
493  std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
494 
495  ///// handle the full previous block /////
496 
497  inString -= blockSize;
498  outString -= blockSize;
499 
500  // merge the tweak into the input block
501  XorBuffer(m_xworkspace, outString, poly1, blockSize);
502 
503  // encrypt one block
504  GetBlockCipher().ProcessBlock(m_xworkspace);
505 
506  // merge the tweak into the output block
507  XorBuffer(outString, m_xworkspace, poly1, blockSize);
508 
509  return outLength;
510 }
511 
512 NAMESPACE_END
XTS_ModeBase::Resynchronize
void Resynchronize(const byte *iv, int ivLength=-1)
Resynchronize with an IV.
Definition: xts.cpp:222
modes.h
Classes for block cipher modes of operation.
M128_CAST
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:609
SecByteBlock
SecBlock<byte> typedef.
Definition: secblock.h:1226
CRYPTOPP_ASSERT
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68
VecStore
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:895
LITTLE_ENDIAN_ORDER
@ LITTLE_ENDIAN_ORDER
byte order is little-endian
Definition: cryptlib.h:145
IsPowerOf2
bool IsPowerOf2(const T &value)
Tests whether a value is a power of 2.
Definition: misc.h:1010
word64
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:91
BlockOrientedCipherModeBase::Resynchronize
void Resynchronize(const byte *iv, int length=-1)
Resynchronize with an IV.
Definition: modes.h:260
XTS_ModeBase::ThrowIfInvalidKeyLength
void ThrowIfInvalidKeyLength(size_t length)
Validates the key length.
Definition: xts.cpp:199
xts.h
Classes for XTS block cipher mode of operation.
pch.h
Precompiled header file.
word32
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:62
XTS_ModeBase
XTS block cipher mode of operation default implementation.
Definition: xts.h:49
aes.h
Class file for the AES cipher (Rijndael)
misc.h
Utility functions for the Crypto++ library.
XTS_ModeBase::SetKey
void SetKey(const byte *key, size_t length, const NameValuePairs &params=g_nullNameValuePairs)
Sets or reset the key of this object.
Definition: xts.cpp:206
ByteOrder
ByteOrder
Provides the byte ordering.
Definition: cryptlib.h:143
cpu.h
Functions for CPU features and intrinsics.
VecLoad
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:369
XTS_ModeBase::ProcessLastBlock
size_t ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
Encrypt or decrypt the last block of data.
Definition: xts.cpp:364
SecBlock::size
size_type size() const
Provides the count of elements in the SecBlock.
Definition: secblock.h:867
InvalidArgument
An invalid argument was detected.
Definition: cryptlib.h:202
VecXor
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1414
CryptoPP
Crypto++ library namespace.
xorbuf
CRYPTOPP_DLL void xorbuf(byte *buf, const byte *mask, size_t count)
Performs an XOR of a buffer with a mask.
CONST_M128_CAST
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:614
ppc_simd.h
Support functions for PowerPC and vector operations.
InvalidKeyLength
Exception thrown when an invalid key length is encountered.
Definition: simple.h:55
XTS_ModeBase::ProcessData
void ProcessData(byte *outString, const byte *inString, size_t length)
Encrypt or decrypt an array of bytes.
Definition: xts.cpp:252
NameValuePairs
Interface for retrieving values given their names.
Definition: cryptlib.h:321
threefish.h
Classes for the Threefish block cipher.
BlockTransformation::BT_AllowParallel
@ BT_AllowParallel
Allow parallel transformations.
Definition: cryptlib.h:925