1
0
mirror of https://github.com/taigrr/arduinolibs synced 2025-01-18 04:33:12 -08:00

AVR inline assembly version of Speck

This commit is contained in:
Rhys Weatherley 2015-12-05 14:13:11 +10:00
parent ee43158b64
commit 32d3d59cfb
2 changed files with 474 additions and 32 deletions

View File

@ -47,6 +47,11 @@ constant-time, and much more secure. AES128, AES192, and AES256 are
provided for use in applications where compatibility with other systems
is desirable.
If code size is an issue for your application (for example on very low-end
Arduino variants), then Speck on AVR is less than half the code size of
ChaCha, at the cost of more data memory for the state and longer key
setup times.
BLAKE2s and BLAKE2b are variations on the ChaCha stream cipher, designed for
hashing, with 256-bit and 512-bit hash outputs respectively. They are
intended as high performance replacements for SHA256 and SHA512 for when
@ -71,9 +76,9 @@ Arduino Mega 2560 running at 16 MHz are similar:
<tr><td>ChaCha (20 rounds)</td><td align="right">14.87us</td><td align="right">14.88us</td><td align="right">43.74us</td><td align="right">132</td></tr>
<tr><td>ChaCha (12 rounds)</td><td align="right">10.38us</td><td align="right">10.38us</td><td align="right">43.74us</td><td align="right">132</td></tr>
<tr><td>ChaCha (8 rounds)</td><td align="right">8.13us</td><td align="right">8.14us</td><td align="right">43.74us</td><td align="right">132</td></tr>
<tr><td>Speck (128-bit key)</td><td align="right">N.NNus</td><td align="right">N.NNus</td><td align="right">N.NNus</td><td align="right">275</td></tr>
<tr><td>Speck (192-bit key)</td><td align="right">N.NNus</td><td align="right">N.NNus</td><td align="right">N.NNus</td><td align="right">275</td></tr>
<tr><td>Speck (256-bit key)</td><td align="right">N.NNus</td><td align="right">N.NNus</td><td align="right">N.NNus</td><td align="right">275</td></tr>
<tr><td>Speck (128-bit key, ECB mode)</td><td align="right">10.72us</td><td align="right">11.09us</td><td align="right">304.56us</td><td align="right">275</td></tr>
<tr><td>Speck (192-bit key, ECB mode)</td><td align="right">11.03us</td><td align="right">11.42us</td><td align="right">316.32us</td><td align="right">275</td></tr>
<tr><td>Speck (256-bit key, ECB mode)</td><td align="right">11.35us</td><td align="right">11.74us</td><td align="right">328.33us</td><td align="right">275</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">41.23us</td><td align="right">41.23us</td><td align="right">902.55us</td><td align="right">255</td></tr>
@ -122,6 +127,9 @@ All figures are for the Arduino Due running at 84 MHz:
<tr><td>ChaCha (20 rounds)</td><td align="right">0.87us</td><td align="right">0.88us</td><td align="right">4.96us</td><td align="right">136</td></tr>
<tr><td>ChaCha (12 rounds)</td><td align="right">0.70us</td><td align="right">0.71us</td><td align="right">4.96us</td><td align="right">136</td></tr>
<tr><td>ChaCha (8 rounds)</td><td align="right">0.62us</td><td align="right">0.62us</td><td align="right">4.96us</td><td align="right">136</td></tr>
<tr><td>Speck (128-bit key, ECB mode)</td><td align="right">0.88us</td><td align="right">1.17us</td><td align="right">37.54us</td><td align="right">288</td></tr>
<tr><td>Speck (192-bit key, ECB mode)</td><td align="right">0.90us</td><td align="right">1.20us</td><td align="right">38.92us</td><td align="right">288</td></tr>
<tr><td>Speck (256-bit key, ECB mode)</td><td align="right">0.93us</td><td align="right">1.23us</td><td align="right">40.10us</td><td align="right">288</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">1.66us</td><td align="right">1.66us</td><td align="right">45.02us</td><td align="right">280</td></tr>

View File

@ -37,7 +37,7 @@
* This class implements the Speck family that uses 128-bit block sizes
* with 128-bit, 192-bit, or 256-bit key sizes. Other Speck families support
* smaller block sizes of 32, 48, 64, or 96 bits but such block sizes are
* really too small for use in modern cryptosystems.
* too small for use in modern cryptosystems.
*
* \note Current cryptanalysis (up until 2015) has not revealed any obvious
* weaknesses in the full-round version of Speck. But if you are wary of
@ -47,6 +47,16 @@
* http://eprint.iacr.org/2013/404
*/
// The "avr-gcc" compiler doesn't do a very good job of compiling
// code involving 64-bit values. So we have to use inline assembly.
// It also helps to break the state up into 32-bit quantities
// because "asm" supports register names like %A0, %B0, %C0, %D0
// for the bytes in a 32-bit quantity, but it does not support
// %E0, %F0, %G0, %H0 for the high bytes of a 64-bit quantity.
#if defined(__AVR__)
#define USE_AVR_INLINE_ASM 1
#endif
/**
* \brief Constructs a Speck block cipher with no initial key.
*
@ -74,33 +84,7 @@ size_t Speck::keySize() const
return 32;
}
// Pack/unpack big-endian 64-bit quantities.
#if defined(__AVR__)
#define pack64(data, value) \
do { \
const uint8_t *src = (const uint8_t *)&(value); \
(data)[0] = src[7]; \
(data)[1] = src[6]; \
(data)[2] = src[5]; \
(data)[3] = src[4]; \
(data)[4] = src[3]; \
(data)[5] = src[2]; \
(data)[6] = src[1]; \
(data)[7] = src[0]; \
} while (0)
#define unpack64(value, data) \
do { \
uint8_t *dest = (uint8_t *)&(value); \
dest[0] = (data)[7]; \
dest[1] = (data)[6]; \
dest[2] = (data)[5]; \
dest[3] = (data)[4]; \
dest[4] = (data)[3]; \
dest[5] = (data)[2]; \
dest[6] = (data)[1]; \
dest[7] = (data)[0]; \
} while (0)
#else
// Pack/unpack byte-aligned big-endian 64-bit quantities.
#define pack64(data, value) \
do { \
uint64_t v = htobe64((value)); \
@ -111,10 +95,182 @@ size_t Speck::keySize() const
memcpy(&(value), (data), sizeof(uint64_t)); \
(value) = be64toh((value)); \
} while (0)
#endif
bool Speck::setKey(const uint8_t *key, size_t len)
{
#if USE_AVR_INLINE_ASM
uint64_t l[4];
uint8_t m, mb;
if (len == 32) {
m = 4;
mb = 3 * 8;
} else if (len == 24) {
m = 3;
mb = 2 * 8;
} else if (len == 16) {
m = 2;
mb = 8;
} else {
return false;
}
rounds = 30 + m;
// Copy the first (m - 1) * 8 bytes of the key into the "l" array
// in reverse order to convert big endian into little-endian.
__asm__ __volatile__ (
"1:\n"
"ld __tmp_reg__,-Z\n"
"st X+,__tmp_reg__\n"
"dec %2\n"
"brne 1b\n"
: : "x"(l), "z"(key + len - 8), "r"(mb)
);
// Copy the final 8 bytes of the key into k[0] in reverse order.
__asm__ __volatile__ (
"1:\n"
"ld __tmp_reg__,-Z\n"
"st X+,__tmp_reg__\n"
"dec %2\n"
"brne 1b\n"
: : "x"(k), "z"(key + len), "r"(8)
);
// Expand the key to the full key schedule.
__asm__ __volatile__ (
"1:\n"
// l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i;
"add %A1,%2\n" // X = &(l[li_in])
"adc %B1,__zero_reg__\n"
"ld r8,X+\n" // x = l[li_in]
"ld r9,X+\n"
"ld r10,X+\n"
"ld r11,X+\n"
"ld r12,X+\n"
"ld r13,X+\n"
"ld r14,X+\n"
"ld r15,X+\n"
"mov __tmp_reg__,r8\n" // x = rightRotate8_64(l[li_in])
"mov r8,r9\n"
"mov r9,r10\n"
"mov r10,r11\n"
"mov r11,r12\n"
"mov r12,r13\n"
"mov r13,r14\n"
"mov r14,r15\n"
"mov r15,__tmp_reg__\n"
"ld r16,Z+\n" // y = k[i]
"ld r17,Z+\n"
"ld r18,Z+\n"
"ld r19,Z+\n"
"ld r20,Z+\n"
"ld r21,Z+\n"
"ld r22,Z+\n"
"ld r23,Z+\n"
"add r8,r16\n" // x += y
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,%4\n" // x ^= i
// X = X - li_in + li_out
"ldi r24,8\n" // li_in = li_in + 1
"add %2,r24\n"
"sub %A1,%2\n" // return X to its initial value
"sbc %B1,__zero_reg__\n"
"ldi r25,0x1f\n"
"and %2,r25\n" // li_in = li_in % 4
"add %A1,%3\n" // X = &(l[li_out])
"adc %B1,__zero_reg__\n"
"st X+,r8\n" // l[li_out] = x
"st X+,r9\n"
"st X+,r10\n"
"st X+,r11\n"
"st X+,r12\n"
"st X+,r13\n"
"st X+,r14\n"
"st X+,r15\n"
"add %3,r24\n" // li_out = li_out + 1
"sub %A1,%3\n" // return X to its initial value
"sbc %B1,__zero_reg__\n"
"and %3,r25\n" // li_out = li_out % 4
// k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"eor r16,r8\n" // y ^= x
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z,r16\n" // k[i + 1] = y
"std Z+1,r17\n"
"std Z+2,r18\n"
"std Z+3,r19\n"
"std Z+4,r20\n"
"std Z+5,r21\n"
"std Z+6,r22\n"
"std Z+7,r23\n"
// Loop
"inc %4\n" // ++i
"dec %5\n" // --rounds
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: : "z"(k), "x"(l),
"r"((uint8_t)0), // initial value of li_in
"r"((uint8_t)((m - 1) * 8)), // initial value of li_out
"r"(0), // initial value of i
"r"(rounds - 1)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25"
);
#else
uint64_t l[4];
uint8_t m;
if (len == 32) {
@ -146,12 +302,149 @@ bool Speck::setKey(const uint8_t *key, size_t len)
if ((++li_out) >= m)
li_out = 0;
}
#endif
clean(l);
return true;
}
void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint32_t xlow, xhigh, ylow, yhigh;
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all encryption rounds. Z points to the key schedule.
__asm__ __volatile__ (
"1:\n"
// x = (rightRotate8_64(x) + y) ^ *s++;
"mov __tmp_reg__,%A0\n" // x = rightRotate8_64(x)
"mov %A0,%B0\n"
"mov %B0,%C0\n"
"mov %C0,%D0\n"
"mov %D0,%A1\n"
"mov %A1,%B1\n"
"mov %B1,%C1\n"
"mov %C1,%D1\n"
"mov %D1,__tmp_reg__\n"
"add %A0,%A2\n" // x += y
"adc %B0,%B2\n"
"adc %C0,%C2\n"
"adc %D0,%D2\n"
"adc %A1,%A3\n"
"adc %B1,%B3\n"
"adc %C1,%C3\n"
"adc %D1,%D3\n"
"ld __tmp_reg__,Z+\n" // x ^= *s++
"eor %A0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %B0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %C0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %D0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %A1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %B1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %C1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"eor %D1,__tmp_reg__\n"
// y = leftRotate3_64(y) ^ x;
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
// Loop
"dec %5\n" // --round
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
: "z"(k), "r"(rounds)
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
);
#else
uint64_t x, y;
const uint64_t *s = k;
unpack64(x, input);
@ -162,10 +455,150 @@ void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
}
pack64(output, x);
pack64(output + 8, y);
#endif
}
void Speck::decryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint32_t xlow, xhigh, ylow, yhigh;
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all decryption rounds. Z points to the end of key schedule.
__asm__ __volatile__ (
"1:\n"
// y = rightRotate3_64(x ^ y);
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
// x = leftRotate8_64((x ^ *s--) - y);
"ld __tmp_reg__,-Z\n" // x ^= *s--
"eor %D1,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %C1,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %B1,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %A1,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %D0,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %C0,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %B0,__tmp_reg__\n"
"ld __tmp_reg__,-Z\n"
"eor %A0,__tmp_reg__\n"
"sub %A0,%A2\n" // x -= y
"sbc %B0,%B2\n"
"sbc %C0,%C2\n"
"sbc %D0,%D2\n"
"sbc %A1,%A3\n"
"sbc %B1,%B3\n"
"sbc %C1,%C3\n"
"sbc %D1,%D3\n"
"mov __tmp_reg__,%D1\n" // x = lefRotate8_64(x)
"mov %D1,%C1\n"
"mov %C1,%B1\n"
"mov %B1,%A1\n"
"mov %A1,%D0\n"
"mov %D0,%C0\n"
"mov %C0,%B0\n"
"mov %B0,%A0\n"
"mov %A0,__tmp_reg__\n"
// Loop
"dec %5\n" // --round
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
: "z"(k + rounds), "r"(rounds)
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
);
#else
uint64_t x, y;
const uint64_t *s = k + rounds - 1;
unpack64(x, input);
@ -176,6 +609,7 @@ void Speck::decryptBlock(uint8_t *output, const uint8_t *input)
}
pack64(output, x);
pack64(output + 8, y);
#endif
}
void Speck::clear()