diff --git a/doc/crypto.dox b/doc/crypto.dox index b053ab84..2c709fc2 100644 --- a/doc/crypto.dox +++ b/doc/crypto.dox @@ -47,6 +47,11 @@ constant-time, and much more secure. AES128, AES192, and AES256 are provided for use in applications where compatibility with other systems is desirable. +If code size is an issue for your application (for example on very low-end +Arduino variants), then Speck on AVR is less than half the code size of +ChaCha, at the cost of more data memory for the state and longer key +setup times. + BLAKE2s and BLAKE2b are variations on the ChaCha stream cipher, designed for hashing, with 256-bit and 512-bit hash outputs respectively. They are intended as high performance replacements for SHA256 and SHA512 for when @@ -71,9 +76,9 @@ Ardunino Mega 2560 running at 16 MHz are similar: ChaCha (20 rounds)14.87us14.88us43.74us132 ChaCha (12 rounds)10.38us10.38us43.74us132 ChaCha (8 rounds)8.13us8.14us43.74us132 -Speck (128-bit key)N.NNusN.NNusN.NNus275 -Speck (192-bit key)N.NNusN.NNusN.NNus275 -Speck (256-bit key)N.NNusN.NNusN.NNus275 +Speck (128-bit key, ECB mode)10.72us11.09us304.56us275 +Speck (192-bit key, ECB mode)11.03us11.42us316.32us275 +Speck (256-bit key, ECB mode)11.35us11.74us328.33us275 AEAD AlgorithmEncryption (per byte)Decryption (per byte)Key SetupState Size (bytes) ChaChaPoly41.23us41.23us902.55us255 @@ -122,6 +127,9 @@ All figures are for the Arduino Due running at 84 MHz: ChaCha (20 rounds)0.87us0.88us4.96us136 ChaCha (12 rounds)0.70us0.71us4.96us136 ChaCha (8 rounds)0.62us0.62us4.96us136 +Speck (128-bit key, ECB mode)0.88us1.17us37.54us288 +Speck (192-bit key, ECB mode)0.90us1.20us38.92us288 +Speck (256-bit key, ECB mode)0.93us1.23us40.10us288 AEAD AlgorithmEncryption (per byte)Decryption (per byte)Key SetupState Size (bytes) ChaChaPoly1.66us1.66us45.02us280 diff --git a/libraries/Crypto/Speck.cpp b/libraries/Crypto/Speck.cpp index 7bc4b9a5..c305bf68 100644 --- a/libraries/Crypto/Speck.cpp +++ b/libraries/Crypto/Speck.cpp @@ 
-37,7 +37,7 @@ * This class implements the Speck family that uses 128-bit block sizes * with 128-bit, 192-bit, or 256-bit key sizes. Other Speck families support * smaller block sizes of 32, 48, 64, or 96 bits but such block sizes are - * really too small for use in modern cryptosystems. + * too small for use in modern cryptosystems. * * \note Current crytoanalysis (up until 2015) has not revealed any obvious * weaknesses in the full-round version of Speck. But if you are wary of @@ -47,6 +47,16 @@ * http://eprint.iacr.org/2013/404 */ +// The "avr-gcc" compiler doesn't do a very good job of compiling +// code involving 64-bit values. So we have to use inline assembly. +// It also helps to break the state up into 32-bit quantities +// because "asm" supports register names like %A0, %B0, %C0, %D0 +// for the bytes in a 32-bit quantity, but it does not support +// %E0, %F0, %G0, %H0 for the high bytes of a 64-bit quantity. +#if defined(__AVR__) +#define USE_AVR_INLINE_ASM 1 +#endif + /** * \brief Constructs a Speck block cipher with no initial key. * @@ -74,33 +84,7 @@ size_t Speck::keySize() const return 32; } -// Pack/unpack big-endian 64-bit quantities. -#if defined(__AVR__) -#define pack64(data, value) \ - do { \ - const uint8_t *src = (const uint8_t *)&(value); \ - (data)[0] = src[7]; \ - (data)[1] = src[6]; \ - (data)[2] = src[5]; \ - (data)[3] = src[4]; \ - (data)[4] = src[3]; \ - (data)[5] = src[2]; \ - (data)[6] = src[1]; \ - (data)[7] = src[0]; \ - } while (0) -#define unpack64(value, data) \ - do { \ - uint8_t *dest = (uint8_t *)&(value); \ - dest[0] = (data)[7]; \ - dest[1] = (data)[6]; \ - dest[2] = (data)[5]; \ - dest[3] = (data)[4]; \ - dest[4] = (data)[3]; \ - dest[5] = (data)[2]; \ - dest[6] = (data)[1]; \ - dest[7] = (data)[0]; \ - } while (0) -#else +// Pack/unpack byte-aligned big-endian 64-bit quantities. 
#define pack64(data, value) \ do { \ uint64_t v = htobe64((value)); \ @@ -111,10 +95,182 @@ size_t Speck::keySize() const memcpy(&(value), (data), sizeof(uint64_t)); \ (value) = be64toh((value)); \ } while (0) -#endif bool Speck::setKey(const uint8_t *key, size_t len) { +#if USE_AVR_INLINE_ASM + uint64_t l[4]; + uint8_t m, mb; + if (len == 32) { + m = 4; + mb = 3 * 8; + } else if (len == 24) { + m = 3; + mb = 2 * 8; + } else if (len == 16) { + m = 2; + mb = 8; + } else { + return false; + } + rounds = 30 + m; + + // Copy the first (m - 1) * 8 bytes of the key into the "l" array + // in reverse order to convert big-endian into little-endian. + __asm__ __volatile__ ( + "1:\n" + "ld __tmp_reg__,-Z\n" + "st X+,__tmp_reg__\n" + "dec %2\n" + "brne 1b\n" + : : "x"(l), "z"(key + len - 8), "r"(mb) + ); + + // Copy the final 8 bytes of the key into k[0] in reverse order. + __asm__ __volatile__ ( + "1:\n" + "ld __tmp_reg__,-Z\n" + "st X+,__tmp_reg__\n" + "dec %2\n" + "brne 1b\n" + : : "x"(k), "z"(key + len), "r"(8) + ); + + // Expand the key to the full key schedule. NOTE(review): the asm statements in this function write memory through X and Z and modify their input operands, yet declare no outputs and no "memory" clobber - confirm avr-gcc cannot cache l[] or k[] across them. 
+ __asm__ __volatile__ ( + "1:\n" + // l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i; + "add %A1,%2\n" // X = &(l[li_in]) + "adc %B1,__zero_reg__\n" + "ld r8,X+\n" // x = l[li_in] + "ld r9,X+\n" + "ld r10,X+\n" + "ld r11,X+\n" + "ld r12,X+\n" + "ld r13,X+\n" + "ld r14,X+\n" + "ld r15,X+\n" + + "mov __tmp_reg__,r8\n" // x = rightRotate8_64(l[li_in]) + "mov r8,r9\n" + "mov r9,r10\n" + "mov r10,r11\n" + "mov r11,r12\n" + "mov r12,r13\n" + "mov r13,r14\n" + "mov r14,r15\n" + "mov r15,__tmp_reg__\n" + + "ld r16,Z+\n" // y = k[i] + "ld r17,Z+\n" + "ld r18,Z+\n" + "ld r19,Z+\n" + "ld r20,Z+\n" + "ld r21,Z+\n" + "ld r22,Z+\n" + "ld r23,Z+\n" + + "add r8,r16\n" // x += y + "adc r9,r17\n" + "adc r10,r18\n" + "adc r11,r19\n" + "adc r12,r20\n" + "adc r13,r21\n" + "adc r14,r22\n" + "adc r15,r23\n" + + "eor r8,%4\n" // x ^= i + + // X = X - li_in + li_out + "ldi r24,8\n" // li_in = li_in + 1 (held as a byte offset, 8 bytes per entry) + "add %2,r24\n" + "sub %A1,%2\n" // return X to its initial value + "sbc %B1,__zero_reg__\n" + "ldi r25,0x1f\n" + "and %2,r25\n" // li_in = li_in % 4 (byte offset masked with 0x1f) + "add %A1,%3\n" // X = &(l[li_out]) + "adc %B1,__zero_reg__\n" + + "st X+,r8\n" // l[li_out] = x + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + + "add %3,r24\n" // li_out = li_out + 1 (held as a byte offset, 8 bytes per entry) + "sub %A1,%3\n" // return X to its initial value + "sbc %B1,__zero_reg__\n" + "and %3,r25\n" // li_out = li_out % 4 (byte offset masked with 0x1f) + + // k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out]; + "lsl r16\n" // y = leftRotate1_64(y) + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16,__zero_reg__\n" + + "lsl r16\n" // y = leftRotate1_64(y) + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16,__zero_reg__\n" + + "lsl r16\n" // y = leftRotate1_64(y) + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16,__zero_reg__\n" + + "eor r16,r8\n" // y ^= x + "eor 
r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + + "st Z,r16\n" // k[i + 1] = y + "std Z+1,r17\n" + "std Z+2,r18\n" + "std Z+3,r19\n" + "std Z+4,r20\n" + "std Z+5,r21\n" + "std Z+6,r22\n" + "std Z+7,r23\n" + + // Loop + "inc %4\n" // ++i + "dec %5\n" // --remaining iterations + "breq 2f\n" + "rjmp 1b\n" + "2:\n" + + : : "z"(k), "x"(l), + "r"((uint8_t)0), // initial value of li_in + "r"((uint8_t)((m - 1) * 8)), // initial value of li_out + "r"(0), // initial value of i + "r"(rounds - 1) // number of loop iterations + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", + "r24", "r25" + ); +#else uint64_t l[4]; uint8_t m; if (len == 32) { @@ -146,12 +302,149 @@ bool Speck::setKey(const uint8_t *key, size_t len) if ((++li_out) >= m) li_out = 0; } +#endif clean(l); return true; } void Speck::encryptBlock(uint8_t *output, const uint8_t *input) { +#if USE_AVR_INLINE_ASM + uint32_t xlow, xhigh, ylow, yhigh; + + // Unpack the input into the x and y variables, converting + // from big-endian into little-endian in the process. + __asm__ __volatile__ ( + "ld %D1,Z\n" + "ldd %C1,Z+1\n" + "ldd %B1,Z+2\n" + "ldd %A1,Z+3\n" + "ldd %D0,Z+4\n" + "ldd %C0,Z+5\n" + "ldd %B0,Z+6\n" + "ldd %A0,Z+7\n" + "ldd %D3,Z+8\n" + "ldd %C3,Z+9\n" + "ldd %B3,Z+10\n" + "ldd %A3,Z+11\n" + "ldd %D2,Z+12\n" + "ldd %C2,Z+13\n" + "ldd %B2,Z+14\n" + "ldd %A2,Z+15\n" + : "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh) + : "z"(input) + ); + + // Perform all encryption rounds. Z points to the key schedule. 
+ __asm__ __volatile__ ( + "1:\n" + // x = (rightRotate8_64(x) + y) ^ *s++; + "mov __tmp_reg__,%A0\n" // x = rightRotate8_64(x) + "mov %A0,%B0\n" + "mov %B0,%C0\n" + "mov %C0,%D0\n" + "mov %D0,%A1\n" + "mov %A1,%B1\n" + "mov %B1,%C1\n" + "mov %C1,%D1\n" + "mov %D1,__tmp_reg__\n" + + "add %A0,%A2\n" // x += y + "adc %B0,%B2\n" + "adc %C0,%C2\n" + "adc %D0,%D2\n" + "adc %A1,%A3\n" + "adc %B1,%B3\n" + "adc %C1,%C3\n" + "adc %D1,%D3\n" + + "ld __tmp_reg__,Z+\n" // x ^= *s++ + "eor %A0,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %B0,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %C0,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %D0,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %A1,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %B1,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %C1,__tmp_reg__\n" + "ld __tmp_reg__,Z+\n" + "eor %D1,__tmp_reg__\n" + + // y = leftRotate3_64(y) ^ x; + "lsl %A2\n" // y = leftRotate1_64(y) + "rol %B2\n" + "rol %C2\n" + "rol %D2\n" + "rol %A3\n" + "rol %B3\n" + "rol %C3\n" + "rol %D3\n" + "adc %A2,__zero_reg__\n" + + "lsl %A2\n" // y = leftRotate1_64(y) + "rol %B2\n" + "rol %C2\n" + "rol %D2\n" + "rol %A3\n" + "rol %B3\n" + "rol %C3\n" + "rol %D3\n" + + "adc %A2,__zero_reg__\n" + "lsl %A2\n" // y = leftRotate1_64(y) + "rol %B2\n" + "rol %C2\n" + "rol %D2\n" + "rol %A3\n" + "rol %B3\n" + "rol %C3\n" + "rol %D3\n" + "adc %A2,__zero_reg__\n" + + "eor %A2,%A0\n" // y ^= x + "eor %B2,%B0\n" + "eor %C2,%C0\n" + "eor %D2,%D0\n" + "eor %A3,%A1\n" + "eor %B3,%B1\n" + "eor %C3,%C1\n" + "eor %D3,%D1\n" + + // Loop. NOTE(review): Z and %5 are declared as input-only operands but are modified by this asm - confirm against the GCC extended asm rules. + "dec %5\n" // --round + "breq 2f\n" + "rjmp 1b\n" + "2:\n" + : "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh) + : "z"(k), "r"(rounds) + ); + + // Pack the results into the output and convert back to big-endian. 
+ __asm__ __volatile__ ( + "st Z,%D1\n" + "std Z+1,%C1\n" + "std Z+2,%B1\n" + "std Z+3,%A1\n" + "std Z+4,%D0\n" + "std Z+5,%C0\n" + "std Z+6,%B0\n" + "std Z+7,%A0\n" + "std Z+8,%D3\n" + "std Z+9,%C3\n" + "std Z+10,%B3\n" + "std Z+11,%A3\n" + "std Z+12,%D2\n" + "std Z+13,%C2\n" + "std Z+14,%B2\n" + "std Z+15,%A2\n" + : : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output) + ); +#else uint64_t x, y; const uint64_t *s = k; unpack64(x, input); @@ -162,10 +455,150 @@ void Speck::encryptBlock(uint8_t *output, const uint8_t *input) } pack64(output, x); pack64(output + 8, y); +#endif } void Speck::decryptBlock(uint8_t *output, const uint8_t *input) { +#if USE_AVR_INLINE_ASM + uint32_t xlow, xhigh, ylow, yhigh; + + // Unpack the input into the x and y variables, converting + // from big-endian into little-endian in the process. + __asm__ __volatile__ ( + "ld %D1,Z\n" + "ldd %C1,Z+1\n" + "ldd %B1,Z+2\n" + "ldd %A1,Z+3\n" + "ldd %D0,Z+4\n" + "ldd %C0,Z+5\n" + "ldd %B0,Z+6\n" + "ldd %A0,Z+7\n" + "ldd %D3,Z+8\n" + "ldd %C3,Z+9\n" + "ldd %B3,Z+10\n" + "ldd %A3,Z+11\n" + "ldd %D2,Z+12\n" + "ldd %C2,Z+13\n" + "ldd %B2,Z+14\n" + "ldd %A2,Z+15\n" + : "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh) + : "z"(input) + ); + + // Perform all decryption rounds. Z points to the end of key schedule. 
+ __asm__ __volatile__ ( + "1:\n" + // y = rightRotate3_64(x ^ y); + "eor %A2,%A0\n" // y ^= x + "eor %B2,%B0\n" + "eor %C2,%C0\n" + "eor %D2,%D0\n" + "eor %A3,%A1\n" + "eor %B3,%B1\n" + "eor %C3,%C1\n" + "eor %D3,%D1\n" + + "bst %A2,0\n" // y = rightRotate1_64(y) + "ror %D3\n" + "ror %C3\n" + "ror %B3\n" + "ror %A3\n" + "ror %D2\n" + "ror %C2\n" + "ror %B2\n" + "ror %A2\n" + "bld %D3,7\n" + + "bst %A2,0\n" // y = rightRotate1_64(y) + "ror %D3\n" + "ror %C3\n" + "ror %B3\n" + "ror %A3\n" + "ror %D2\n" + "ror %C2\n" + "ror %B2\n" + "ror %A2\n" + "bld %D3,7\n" + + "bst %A2,0\n" // y = rightRotate1_64(y) + "ror %D3\n" + "ror %C3\n" + "ror %B3\n" + "ror %A3\n" + "ror %D2\n" + "ror %C2\n" + "ror %B2\n" + "ror %A2\n" + "bld %D3,7\n" + + // x = leftRotate8_64((x ^ *s--) - y); + "ld __tmp_reg__,-Z\n" // x ^= *s-- + "eor %D1,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %C1,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %B1,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %A1,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %D0,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %C0,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %B0,__tmp_reg__\n" + "ld __tmp_reg__,-Z\n" + "eor %A0,__tmp_reg__\n" + + "sub %A0,%A2\n" // x -= y + "sbc %B0,%B2\n" + "sbc %C0,%C2\n" + "sbc %D0,%D2\n" + "sbc %A1,%A3\n" + "sbc %B1,%B3\n" + "sbc %C1,%C3\n" + "sbc %D1,%D3\n" + + "mov __tmp_reg__,%D1\n" // x = leftRotate8_64(x) + "mov %D1,%C1\n" + "mov %C1,%B1\n" + "mov %B1,%A1\n" + "mov %A1,%D0\n" + "mov %D0,%C0\n" + "mov %C0,%B0\n" + "mov %B0,%A0\n" + "mov %A0,__tmp_reg__\n" + + // Loop. NOTE(review): Z and %5 are declared as input-only operands but are modified by this asm - confirm against the GCC extended asm rules. + "dec %5\n" // --round + "breq 2f\n" + "rjmp 1b\n" + "2:\n" + : "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh) + : "z"(k + rounds), "r"(rounds) + ); + + // Pack the results into the output and convert back to big-endian. 
+ __asm__ __volatile__ ( + "st Z,%D1\n" + "std Z+1,%C1\n" + "std Z+2,%B1\n" + "std Z+3,%A1\n" + "std Z+4,%D0\n" + "std Z+5,%C0\n" + "std Z+6,%B0\n" + "std Z+7,%A0\n" + "std Z+8,%D3\n" + "std Z+9,%C3\n" + "std Z+10,%B3\n" + "std Z+11,%A3\n" + "std Z+12,%D2\n" + "std Z+13,%C2\n" + "std Z+14,%B2\n" + "std Z+15,%A2\n" + : : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output) + ); +#else uint64_t x, y; const uint64_t *s = k + rounds - 1; unpack64(x, input); @@ -176,6 +609,7 @@ void Speck::decryptBlock(uint8_t *output, const uint8_t *input) } pack64(output, x); pack64(output + 8, y); +#endif } void Speck::clear()