1
0
mirror of https://github.com/taigrr/arduinolibs synced 2025-01-18 04:33:12 -08:00

Speed up Speck by using a custom AVR code generator

This also fixes the remaining asm issues with newer versions of gcc.
This commit is contained in:
Rhys Weatherley
2017-11-03 10:46:39 +10:00
parent b53f57225d
commit 277a0b63c9
5 changed files with 1765 additions and 1043 deletions

View File

@@ -105,6 +105,7 @@ size_t Speck::keySize() const
bool Speck::setKey(const uint8_t *key, size_t len)
{
#if USE_AVR_INLINE_ASM
// Automatically generated by the genspeck tool.
uint64_t l[4];
uint8_t m, mb;
if (len == 32) {
@@ -120,134 +121,148 @@ bool Speck::setKey(const uint8_t *key, size_t len)
return false;
}
rounds = 30 + m;
// Copy the first (m - 1) * 8 bytes of the key into the "l" array
// in reverse order to convert big endian into little-endian.
uint8_t r = rounds - 1;
__asm__ __volatile__ (
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"sbiw r30,8\n"
"movw r10,r30\n"
"movw r30,%A2\n"
"ldd r8,%3\n"
"1:\n"
"ld __tmp_reg__,-Z\n"
"st X+,__tmp_reg__\n"
"dec %2\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"dec r8\n"
"brne 1b\n"
: : "x"(l), "z"(key + len - 8), "r"(mb)
"movw r26,%A2\n"
"movw r30,r10\n"
"clr %A2\n"
"ldd %B2,%3\n"
"clr r25\n"
"ld r16,Z+\n"
"ld r17,Z+\n"
"ld r18,Z+\n"
"ld r19,Z+\n"
"ld r20,Z+\n"
"ld r21,Z+\n"
"ld r22,Z+\n"
"ld r23,Z+\n"
"2:\n"
"add r26,%A2\n"
"adc r27,__zero_reg__\n"
"ld r15,X+\n"
"ld r8,X+\n"
"ld r9,X+\n"
"ld r10,X+\n"
"ld r11,X+\n"
"ld r12,X+\n"
"ld r13,X+\n"
"ld r14,X+\n"
"sub r26,%A2\n"
"sbc r27,__zero_reg__\n"
"sbiw r26,8\n"
"add r8,r16\n"
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,r25\n"
"add r26,%B2\n"
"adc r27,__zero_reg__\n"
"st X+,r8\n"
"st X+,r9\n"
"st X+,r10\n"
"st X+,r11\n"
"st X+,r12\n"
"st X+,r13\n"
"st X+,r14\n"
"st X+,r15\n"
"sub r26,%B2\n"
"sbc r27,__zero_reg__\n"
"sbiw r26,8\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z+,r16\n"
"st Z+,r17\n"
"st Z+,r18\n"
"st Z+,r19\n"
"st Z+,r20\n"
"st Z+,r21\n"
"st Z+,r22\n"
"st Z+,r23\n"
"ldi r24,8\n"
"add %A2,r24\n"
"add %B2,r24\n"
"ldi r24,0x1F\n"
"and %A2,r24\n"
"and %B2,r24\n"
"ldd r8,%4\n"
"inc r25\n"
"cp r25,r8\n"
"breq 3f\n"
"rjmp 2b\n"
"3:\n"
"ldi r24,32\n"
"4:\n"
"st X+,__zero_reg__\n"
"dec r24\n"
"brne 4b\n"
: : "z"(k), "x"(key + len), "r"(l), "Q"(mb), "Q"(r)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
, "r24", "r25"
);
// Copy the final 8 bytes of the key into k[0] in reverse order.
__asm__ __volatile__ (
"1:\n"
"ld __tmp_reg__,-Z\n"
"st X+,__tmp_reg__\n"
"dec %2\n"
"brne 1b\n"
: : "x"(k), "z"(key + len), "r"(8)
);
// Expand the key to the full key schedule.
uint8_t li_in = 0;
uint8_t li_out = m - 1;
for (uint8_t i = 0; i < (rounds - 1); ++i) {
__asm__ __volatile__ (
// l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i;
"ld r15,X+\n" // x = rightRotate8_64(l[li_in])
"ld r8,X+\n"
"ld r9,X+\n"
"ld r10,X+\n"
"ld r11,X+\n"
"ld r12,X+\n"
"ld r13,X+\n"
"ld r14,X+\n"
"ld r16,Z+\n" // y = k[i]
"ld r17,Z+\n"
"ld r18,Z+\n"
"ld r19,Z+\n"
"ld r20,Z+\n"
"ld r21,Z+\n"
"ld r22,Z+\n"
"ld r23,Z+\n"
"add r8,r16\n" // x += y
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,%3\n" // x ^= i
// k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
"movw r26,%A2\n" // l[li_out] = x
"st X+,r8\n"
"st X+,r9\n"
"st X+,r10\n"
"st X+,r11\n"
"st X+,r12\n"
"st X+,r13\n"
"st X+,r14\n"
"st X+,r15\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"eor r16,r8\n" // y ^= x
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z+,r16\n" // k[i + 1] = y
"st Z+,r17\n"
"st Z+,r18\n"
"st Z+,r19\n"
"st Z+,r20\n"
"st Z+,r21\n"
"st Z+,r22\n"
"st Z+,r23\n"
: : "z"(&(k[i])), "x"(&(l[li_in])),
"r"(&(l[li_out])),
"r"(i)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25"
);
if ((++li_in) >= m)
li_in = 0;
if ((++li_out) >= m)
li_out = 0;
}
return true;
#else
uint64_t l[4];
uint8_t m;
@@ -280,138 +295,118 @@ bool Speck::setKey(const uint8_t *key, size_t len)
if ((++li_out) >= m)
li_out = 0;
}
#endif
clean(l);
return true;
#endif
}
void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint32_t xlow, xhigh, ylow, yhigh;
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all encryption rounds. Z points to the key schedule.
// Automatically generated by the genspeck tool.
__asm__ __volatile__ (
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"1:\n"
// x = (rightRotate8_64(x) + y) ^ *s++;
"add %B0,%A2\n" // x = rightRotate8_64(x), x += y
"adc %C0,%B2\n" // Note: right rotate is implicit.
"adc %D0,%C2\n"
"adc %A1,%D2\n"
"adc %B1,%A3\n"
"adc %C1,%B3\n"
"adc %D1,%C3\n"
"adc %A0,%D3\n"
"ld __tmp_reg__,Z+\n" // x ^= *s++
"eor __tmp_reg__,%B0\n" // Also fully apply the right rotate.
"ld %B0,Z+\n"
"eor %B0,%C0\n"
"ld %C0,Z+\n"
"eor %C0,%D0\n"
"ld %D0,Z+\n"
"eor %D0,%A1\n"
"ld %A1,Z+\n"
"eor %A1,%B1\n"
"ld %B1,Z+\n"
"eor %B1,%C1\n"
"ld %C1,Z+\n"
"eor %C1,%D1\n"
"ld %D1,Z+\n"
"eor %D1,%A0\n"
"mov %A0,__tmp_reg__\n"
// y = leftRotate3_64(y) ^ x;
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
// Loop
"dec %5\n" // --round
"add r9,r16\n"
"adc r10,r17\n"
"adc r11,r18\n"
"adc r12,r19\n"
"adc r13,r20\n"
"adc r14,r21\n"
"adc r15,r22\n"
"adc r8,r23\n"
"ld __tmp_reg__,Z+\n"
"eor __tmp_reg__,r9\n"
"ld r9,Z+\n"
"eor r9,r10\n"
"ld r10,Z+\n"
"eor r10,r11\n"
"ld r11,Z+\n"
"eor r11,r12\n"
"ld r12,Z+\n"
"eor r12,r13\n"
"ld r13,Z+\n"
"eor r13,r14\n"
"ld r14,Z+\n"
"eor r14,r15\n"
"ld r15,Z+\n"
"eor r15,r8\n"
"mov r8,__tmp_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"dec %2\n"
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
: "z"(k), "r"(rounds)
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(input), "z"(k), "r"(rounds), "Q"(output)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
);
#else
uint64_t x, y;
@@ -430,133 +425,113 @@ void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
void Speck::decryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint32_t xlow, xhigh, ylow, yhigh;
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all decryption rounds. Z points to the end of key schedule.
// Automatically generated by the genspeck tool.
__asm__ __volatile__ (
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"1:\n"
// y = rightRotate3_64(x ^ y);
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
// x = leftRotate8_64((x ^ *s--) - y);
"ld __tmp_reg__,-Z\n" // x ^= *s--
"eor __tmp_reg__,%D1\n" // Note: also implicitly left-rotates regs
"ld %D1,-Z\n"
"eor %D1,%C1\n"
"ld %C1,-Z\n"
"eor %C1,%B1\n"
"ld %B1,-Z\n"
"eor %B1,%A1\n"
"ld %A1,-Z\n"
"eor %A1,%D0\n"
"ld %D0,-Z\n"
"eor %D0,%C0\n"
"ld %C0,-Z\n"
"eor %C0,%B0\n"
"ld %B0,-Z\n"
"eor %B0,%A0\n"
"mov %A0,__tmp_reg__\n"
"sub %B0,%A2\n" // x -= y
"sbc %C0,%B2\n" // Note: regs are already left-rotated
"sbc %D0,%C2\n"
"sbc %A1,%D2\n"
"sbc %B1,%A3\n"
"sbc %C1,%B3\n"
"sbc %D1,%C3\n"
"sbc %A0,%D3\n"
// Loop
"dec %5\n" // --round
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"ld __tmp_reg__,-Z\n"
"eor __tmp_reg__,r15\n"
"ld r15,-Z\n"
"eor r15,r14\n"
"ld r14,-Z\n"
"eor r14,r13\n"
"ld r13,-Z\n"
"eor r13,r12\n"
"ld r12,-Z\n"
"eor r12,r11\n"
"ld r11,-Z\n"
"eor r11,r10\n"
"ld r10,-Z\n"
"eor r10,r9\n"
"ld r9,-Z\n"
"eor r9,r8\n"
"mov r8,__tmp_reg__\n"
"sub r9,r16\n"
"sbc r10,r17\n"
"sbc r11,r18\n"
"sbc r12,r19\n"
"sbc r13,r20\n"
"sbc r14,r21\n"
"sbc r15,r22\n"
"sbc r8,r23\n"
"dec %2\n"
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
: "z"(k + rounds), "r"(rounds)
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(input), "z"(k + rounds), "r"(rounds), "Q"(output)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
);
#else
uint64_t x, y;

View File

@@ -261,349 +261,283 @@ bool SpeckSmall::setKey(const uint8_t *key, size_t len)
void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint64_t l[4];
uint32_t xlow, xhigh, ylow, yhigh;
uint32_t slow, shigh;
uint8_t li_in = (rounds + 3) & 0x03;
uint8_t li_out = (((rounds - 31) + li_in) & 0x03) * 8;
li_in *= 8;
// Prepare to expand the key schedule.
// Automatically generated by the genspeck tool.
uint64_t l[5];
uint8_t r = rounds;
uint8_t li_in = ((r + 3) & 0x03) * 8;
uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F;
__asm__ __volatile__ (
"add r30,%4\n" // Z = &(this->l[li_out])
"adc r31,__zero_reg__\n"
"ld __tmp_reg__,Z\n" // s = this->l[li_out]
"std %A0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+1\n"
"std %B0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+2\n"
"std %C0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+3\n"
"std %D0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+4\n"
"std %A1,__tmp_reg__\n"
"ldd __tmp_reg__,Z+5\n"
"std %B1,__tmp_reg__\n"
"ldd __tmp_reg__,Z+6\n"
"std %C1,__tmp_reg__\n"
"ldd __tmp_reg__,Z+7\n"
"std %D1,__tmp_reg__\n"
"sub r30,%4\n" // Point Z back to the start of this->l.
"sbc r31,__zero_reg__\n"
"ldi r25,32\n" // Copy the entire this->l array into l.
"ldd r25,%4\n"
"ldi r24,32\n"
"1:\n"
"ld __tmp_reg__,Z+\n"
"st X+,__tmp_reg__\n"
"dec r25\n"
"ld __tmp_reg__,X+\n"
"st Z+,__tmp_reg__\n"
"dec r24\n"
"brne 1b\n"
: "=Q"(slow), "=Q"(shigh)
: "z"(this->l), "x"(l), "r"(li_out)
: "r25"
);
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all decryption rounds while expanding the key schedule in-place.
__asm__ __volatile__ (
"mov r23,%9\n" // i = rounds - 1
"dec r23\n"
"1:\n"
// Adjust x and y for this round using the key schedule word s.
// y = rightRotate3_64(x ^ y);
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
// x = leftRotate8_64((x ^ s) - y);
"ldd __tmp_reg__,%A4\n" // x ^= s
"eor %A0,__tmp_reg__\n"
"ldd __tmp_reg__,%B4\n"
"eor %B0,__tmp_reg__\n"
"ldd __tmp_reg__,%C4\n"
"eor %C0,__tmp_reg__\n"
"ldd __tmp_reg__,%D4\n"
"eor %D0,__tmp_reg__\n"
"ldd __tmp_reg__,%A5\n"
"eor %A1,__tmp_reg__\n"
"ldd __tmp_reg__,%B5\n"
"eor %B1,__tmp_reg__\n"
"ldd __tmp_reg__,%C5\n"
"eor %C1,__tmp_reg__\n"
"ldd __tmp_reg__,%D5\n"
"eor %D1,__tmp_reg__\n"
"sub %A0,%A2\n" // x -= y
"sbc %B0,%B2\n"
"sbc %C0,%C2\n"
"sbc %D0,%D2\n"
"sbc %A1,%A3\n"
"sbc %B1,%B3\n"
"sbc %C1,%C3\n"
"sbc %D1,%D3\n"
"mov __tmp_reg__,%D1\n" // x = leftRotate8_64(x)
"mov %D1,%C1\n"
"mov %C1,%B1\n"
"mov %B1,%A1\n"
"mov %A1,%D0\n"
"mov %D0,%C0\n"
"mov %C0,%B0\n"
"mov %B0,%A0\n"
"mov %A0,__tmp_reg__\n"
// On the last round we don't need to compute s so we
// can exit early here if i == 0.
"or r23,r23\n" // if (i == 0)
"brne 2f\n"
"rjmp 3f\n"
"movw r26,r30\n"
"sbiw r30,32\n"
"add r30,r25\n"
"adc r31,__zero_reg__\n"
"ld __tmp_reg__,Z\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+1\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+2\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+3\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+4\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+5\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+6\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+7\n"
"st X+,__tmp_reg__\n"
"sub r30,r25\n"
"sbc r31,__zero_reg__\n"
"movw r26,%A2\n"
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"ldd %A2,%6\n"
"mov %B2,r25\n"
"ldd r25,%5\n"
"dec r25\n"
"movw r26,r30\n"
"adiw r26,40\n"
"2:\n"
"dec r23\n" // --i
// Save x and y on the stack so we can reuse registers for t and s.
"push %A0\n"
"push %B0\n"
"push %C0\n"
"push %D0\n"
"push %A1\n"
"push %B1\n"
"push %C1\n"
"push %D1\n"
"push %A2\n"
"push %B2\n"
"push %C2\n"
"push %D2\n"
"push %A3\n"
"push %B3\n"
"push %C3\n"
"push %D3\n"
// Compute the key schedule word s for the next round.
// li_out = (li_out + 3) & 0x03;
"ldd r24,%7\n"
"ldi r25,24\n"
"add r24,r25\n"
"andi r24,0x1f\n"
"std %7,r24\n"
// s = rightRotate3_64(s ^ l[li_out]);
"add %A8,r24\n" // Z = &(l[li_out])
"adc %B8,__zero_reg__\n"
"ld %A0,Z\n" // t = l[li_out]
"ldd %B0,Z+1\n"
"ldd %C0,Z+2\n"
"ldd %D0,Z+3\n"
"ldd %A1,Z+4\n"
"ldd %B1,Z+5\n"
"ldd %C1,Z+6\n"
"ldd %D1,Z+7\n"
"ldd %A2,%A4\n" // load s
"ldd %B2,%B4\n"
"ldd %C2,%C4\n"
"ldd %D2,%D4\n"
"ldd %A3,%A5\n"
"ldd %B3,%B5\n"
"ldd %C3,%C5\n"
"ldd %D3,%D5\n"
"eor %A2,%A0\n" // s ^= t
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // s = rightRotate1_64(s)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // s = rightRotate1_64(s)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // s = rightRotate1_64(s)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"sub %A8,r24\n" // Z -= li_out
"sbc %B8,__zero_reg__\n"
// li_in = (li_in + 3) & 0x03;
"ldd r24,%6\n"
"add r24,r25\n"
"andi r24,0x1f\n"
"std %6,r24\n"
// l[li_in] = leftRotate8_64((l[li_out] ^ i) - s);
"add %A8,r24\n" // Z = &(l[li_in])
"adc %B8,__zero_reg__\n"
"eor %A0,r23\n" // t ^= i
"sub %A0,%A2\n" // t -= s
"sbc %B0,%B2\n"
"sbc %C0,%C2\n"
"sbc %D0,%D2\n"
"sbc %A1,%A3\n"
"sbc %B1,%B3\n"
"sbc %C1,%C3\n"
"sbc %D1,%D3\n"
"st Z,%D1\n" // l[li_in] = leftRotate8_64(t)
"std Z+1,%A0\n"
"std Z+2,%B0\n"
"std Z+3,%C0\n"
"std Z+4,%D0\n"
"std Z+5,%A1\n"
"std Z+6,%B1\n"
"std Z+7,%C1\n"
"sub %A8,r24\n" // Z -= li_in
"sbc %B8,__zero_reg__\n"
"std %A4,%A2\n" // store s
"std %B4,%B2\n"
"std %C4,%C2\n"
"std %D4,%D2\n"
"std %A5,%A3\n"
"std %B5,%B3\n"
"std %C5,%C3\n"
"std %D5,%D3\n"
// Pop registers from the stack to recover the x and y values.
"pop %D3\n"
"pop %C3\n"
"pop %B3\n"
"pop %A3\n"
"pop %D2\n"
"pop %C2\n"
"pop %B2\n"
"pop %A2\n"
"pop %D1\n"
"pop %C1\n"
"pop %B1\n"
"pop %A1\n"
"pop %D0\n"
"pop %C0\n"
"pop %B0\n"
"pop %A0\n"
// Bottom of the loop.
"rjmp 1b\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"ld __tmp_reg__,-X\n"
"eor __tmp_reg__,r15\n"
"ld r15,-X\n"
"eor r15,r14\n"
"ld r14,-X\n"
"eor r14,r13\n"
"ld r13,-X\n"
"eor r13,r12\n"
"ld r12,-X\n"
"eor r12,r11\n"
"ld r11,-X\n"
"eor r11,r10\n"
"ld r10,-X\n"
"eor r10,r9\n"
"ld r9,-X\n"
"eor r9,r8\n"
"mov r8,__tmp_reg__\n"
"sub r9,r16\n"
"sbc r10,r17\n"
"sbc r11,r18\n"
"sbc r12,r19\n"
"sbc r13,r20\n"
"sbc r14,r21\n"
"sbc r15,r22\n"
"sbc r8,r23\n"
"or r25,r25\n"
"brne 3f\n"
"rjmp 4f\n"
"3:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh),
"+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out)
: "z"(l), "r"(rounds)
: "r23", "r24", "r25"
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"dec r25\n"
"push r8\n"
"push r9\n"
"push r10\n"
"push r11\n"
"push r12\n"
"push r13\n"
"push r14\n"
"push r15\n"
"push r16\n"
"push r17\n"
"push r18\n"
"push r19\n"
"push r20\n"
"push r21\n"
"push r22\n"
"push r23\n"
"ldi r24,24\n"
"add %A2,r24\n"
"add %B2,r24\n"
"ldi r24,0x1F\n"
"and %A2,r24\n"
"and %B2,r24\n"
"ld r16,X+\n"
"ld r17,X+\n"
"ld r18,X+\n"
"ld r19,X+\n"
"ld r20,X+\n"
"ld r21,X+\n"
"ld r22,X+\n"
"ld r23,X+\n"
"add r30,%B2\n"
"adc r31,__zero_reg__\n"
"ld r8,Z\n"
"ldd r9,Z+1\n"
"ldd r10,Z+2\n"
"ldd r11,Z+3\n"
"ldd r12,Z+4\n"
"ldd r13,Z+5\n"
"ldd r14,Z+6\n"
"ldd r15,Z+7\n"
"sub r30,%B2\n"
"sbc r31,__zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"st -X,r23\n"
"st -X,r22\n"
"st -X,r21\n"
"st -X,r20\n"
"st -X,r19\n"
"st -X,r18\n"
"st -X,r17\n"
"st -X,r16\n"
"adiw r26,8\n"
"eor r8,r25\n"
"sub r8,r16\n"
"sbc r9,r17\n"
"sbc r10,r18\n"
"sbc r11,r19\n"
"sbc r12,r20\n"
"sbc r13,r21\n"
"sbc r14,r22\n"
"sbc r15,r23\n"
"add r30,%A2\n"
"adc r31,__zero_reg__\n"
"st Z,r15\n"
"std Z+1,r8\n"
"std Z+2,r9\n"
"std Z+3,r10\n"
"std Z+4,r11\n"
"std Z+5,r12\n"
"std Z+6,r13\n"
"std Z+7,r14\n"
"sub r30,%A2\n"
"sbc r31,__zero_reg__\n"
"pop r23\n"
"pop r22\n"
"pop r21\n"
"pop r20\n"
"pop r19\n"
"pop r18\n"
"pop r17\n"
"pop r16\n"
"pop r15\n"
"pop r14\n"
"pop r13\n"
"pop r12\n"
"pop r11\n"
"pop r10\n"
"pop r9\n"
"pop r8\n"
"rjmp 2b\n"
"4:\n"
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(this->l), "z"(l), "r"(input), "Q"(output), "Q"(li_out), "Q"(r), "Q"(li_in)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
, "r24", "r25"
);
#else
uint64_t l[4];

View File

@@ -156,336 +156,257 @@ bool SpeckTiny::setKey(const uint8_t *key, size_t len)
void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint64_t l[4];
uint32_t xlow, xhigh, ylow, yhigh;
uint32_t slow, shigh;
uint8_t li_in = 0;
uint8_t li_out = (rounds - 31) * 8;
// Copy the "k" array into "s" and the "l" array.
// Automatically generated by the genspeck tool.
uint64_t l[5];
uint8_t r = rounds;
uint8_t mb = (r - 31) * 8;
__asm__ __volatile__ (
"ldd r25,%4\n" // r25 = li_out
"ld __tmp_reg__,Z+\n"
"std %A0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %B0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %C0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %D0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %A1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %B1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %C1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %D1,__tmp_reg__\n"
"1:\n" // l[0..] = k[1..]
"ld __tmp_reg__,Z+\n"
"st X+,__tmp_reg__\n"
"dec r25\n"
"brne 1b\n"
: "=Q"(slow), "=Q"(shigh)
: "z"(k), "x"(l), "Q"(li_out)
: "r25"
);
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all encryption rounds while expanding the key schedule in-place.
__asm__ __volatile__ (
"mov r23,__zero_reg__\n" // i = 0
"movw r8,r30\n"
"ldd r16,%4\n"
"ldi r24,8\n"
"add r16,r24\n"
"1:\n"
// Adjust x and y for this round using the key schedule word s.
// x = (rightRotate8_64(x) + y) ^ s;
"mov __tmp_reg__,%A0\n" // x = rightRotate8_64(x)
"mov %A0,%B0\n"
"mov %B0,%C0\n"
"mov %C0,%D0\n"
"mov %D0,%A1\n"
"mov %A1,%B1\n"
"mov %B1,%C1\n"
"mov %C1,%D1\n"
"mov %D1,__tmp_reg__\n"
"add %A0,%A2\n" // x += y
"adc %B0,%B2\n"
"adc %C0,%C2\n"
"adc %D0,%D2\n"
"adc %A1,%A3\n"
"adc %B1,%B3\n"
"adc %C1,%C3\n"
"adc %D1,%D3\n"
"ldd __tmp_reg__,%A4\n" // x ^= s
"eor %A0,__tmp_reg__\n"
"ldd __tmp_reg__,%B4\n"
"eor %B0,__tmp_reg__\n"
"ldd __tmp_reg__,%C4\n"
"eor %C0,__tmp_reg__\n"
"ldd __tmp_reg__,%D4\n"
"eor %D0,__tmp_reg__\n"
"ldd __tmp_reg__,%A5\n"
"eor %A1,__tmp_reg__\n"
"ldd __tmp_reg__,%B5\n"
"eor %B1,__tmp_reg__\n"
"ldd __tmp_reg__,%C5\n"
"eor %C1,__tmp_reg__\n"
"ldd __tmp_reg__,%D5\n"
"eor %D1,__tmp_reg__\n"
// y = leftRotate3_64(y) ^ x;
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
// On the last round we don't need to compute s so we
// can exit early here if (i + 1) == rounds.
"mov __tmp_reg__,r23\n" // temp = i + 1
"inc __tmp_reg__\n"
"cp __tmp_reg__,%9\n" // if (temp == rounds) ...
"brne 2f\n"
"rjmp 3f\n"
"ld __tmp_reg__,X+\n"
"st Z+,__tmp_reg__\n"
"dec r16\n"
"brne 1b\n"
"movw r30,r8\n"
"movw r26,%A2\n"
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"clr %A2\n"
"ldd %B2,%4\n"
"clr r25\n"
"2:\n"
// Save x and y on the stack so we can reuse registers for t and s.
"push %A0\n"
"push %B0\n"
"push %C0\n"
"push %D0\n"
"push %A1\n"
"push %B1\n"
"push %C1\n"
"push %D1\n"
"push %A2\n"
"push %B2\n"
"push %C2\n"
"push %D2\n"
"push %A3\n"
"push %B3\n"
"push %C3\n"
"push %D3\n"
// Compute the key schedule word s for the next round.
// l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i;
"ldd r24,%6\n" // Z = &(l[li_in])
"add %A8,r24\n"
"adc %B8,__zero_reg__\n"
"ld %D1,Z+\n" // t = rightRotate8_64(l[li_in])
"ld %A0,Z+\n"
"ld %B0,Z+\n"
"ld %C0,Z+\n"
"ld %D0,Z+\n"
"ld %A1,Z+\n"
"ld %B1,Z+\n"
"ld %C1,Z+\n"
"ldd %A2,%A4\n" // load s
"ldd %B2,%B4\n"
"ldd %C2,%C4\n"
"ldd %D2,%D4\n"
"ldd %A3,%A5\n"
"ldd %B3,%B5\n"
"ldd %C3,%C5\n"
"ldd %D3,%D5\n"
"add %A0,%A2\n" // t += s
"adc %B0,%B2\n"
"adc %C0,%C2\n"
"adc %D0,%D2\n"
"adc %A1,%A3\n"
"adc %B1,%B3\n"
"adc %C1,%C3\n"
"adc %D1,%D3\n"
"eor %A0,r23\n" // t ^= i
// Z = Z - li_in + li_out
"ldi r25,8\n" // li_in = li_in + 1
"add r24,r25\n"
"sub %A8,r24\n" // return Z to its initial value
"sbc %B8,__zero_reg__\n"
"andi r24,0x1f\n" // li_in = li_in % 4
"std %6,r24\n"
"ldd r24,%7\n" // Z = &(l[li_out])
"add %A8,r24\n"
"adc %B8,__zero_reg__\n"
"st Z+,%A0\n" // l[li_out] = t
"st Z+,%B0\n"
"st Z+,%C0\n"
"st Z+,%D0\n"
"st Z+,%A1\n"
"st Z+,%B1\n"
"st Z+,%C1\n"
"st Z+,%D1\n"
"add r24,r25\n" // li_out = li_out + 1
"sub %A8,r24\n" // return Z to its initial value
"sbc %B8,__zero_reg__\n"
"andi r24,0x1f\n" // li_out = li_out % 4
"std %7,r24\n"
// s = leftRotate3_64(s) ^ l[li_out];
"lsl %A2\n" // s = leftRotate1_64(s)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // s = leftRotate1_64(s)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // s = leftRotate1_64(s)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // s ^= l[li_out]
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"std %A4,%A2\n" // store s
"std %B4,%B2\n"
"std %C4,%C2\n"
"std %D4,%D2\n"
"std %A5,%A3\n"
"std %B5,%B3\n"
"std %C5,%C3\n"
"std %D5,%D3\n"
// Pop registers from the stack to recover the x and y values.
"pop %D3\n"
"pop %C3\n"
"pop %B3\n"
"pop %A3\n"
"pop %D2\n"
"pop %C2\n"
"pop %B2\n"
"pop %A2\n"
"pop %D1\n"
"pop %C1\n"
"pop %B1\n"
"pop %A1\n"
"pop %D0\n"
"pop %C0\n"
"pop %B0\n"
"pop %A0\n"
// Bottom of the loop.
"inc r23\n"
"rjmp 1b\n"
"add r9,r16\n"
"adc r10,r17\n"
"adc r11,r18\n"
"adc r12,r19\n"
"adc r13,r20\n"
"adc r14,r21\n"
"adc r15,r22\n"
"adc r8,r23\n"
"ld __tmp_reg__,Z+\n"
"eor __tmp_reg__,r9\n"
"ld r9,Z+\n"
"eor r9,r10\n"
"ld r10,Z+\n"
"eor r10,r11\n"
"ld r11,Z+\n"
"eor r11,r12\n"
"ld r12,Z+\n"
"eor r12,r13\n"
"ld r13,Z+\n"
"eor r13,r14\n"
"ld r14,Z+\n"
"eor r14,r15\n"
"ld r15,Z+\n"
"eor r15,r8\n"
"mov r8,__tmp_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"mov __tmp_reg__,r25\n"
"inc __tmp_reg__\n"
"ldd r24,%5\n"
"cp __tmp_reg__,r24\n"
"brne 3f\n"
"rjmp 4f\n"
"3:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh),
"+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out)
: "z"(l), "r"(rounds)
: "r23", "r24", "r25"
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"push r8\n"
"push r9\n"
"push r10\n"
"push r11\n"
"push r12\n"
"push r13\n"
"push r14\n"
"push r15\n"
"push r16\n"
"push r17\n"
"push r18\n"
"push r19\n"
"push r20\n"
"push r21\n"
"push r22\n"
"push r23\n"
"sbiw r30,8\n"
"ld r16,Z\n"
"ldd r17,Z+1\n"
"ldd r18,Z+2\n"
"ldd r19,Z+3\n"
"ldd r20,Z+4\n"
"ldd r21,Z+5\n"
"ldd r22,Z+6\n"
"ldd r23,Z+7\n"
"add r30,%A2\n"
"adc r31,__zero_reg__\n"
"ldd r15,Z+8\n"
"ldd r8,Z+9\n"
"ldd r9,Z+10\n"
"ldd r10,Z+11\n"
"ldd r11,Z+12\n"
"ldd r12,Z+13\n"
"ldd r13,Z+14\n"
"ldd r14,Z+15\n"
"add r8,r16\n"
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,r25\n"
"sub r30,%A2\n"
"sbc r31,__zero_reg__\n"
"add r30,%B2\n"
"adc r31,__zero_reg__\n"
"std Z+8,r8\n"
"std Z+9,r9\n"
"std Z+10,r10\n"
"std Z+11,r11\n"
"std Z+12,r12\n"
"std Z+13,r13\n"
"std Z+14,r14\n"
"std Z+15,r15\n"
"sub r30,%B2\n"
"sbc r31,__zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z,r16\n"
"std Z+1,r17\n"
"std Z+2,r18\n"
"std Z+3,r19\n"
"std Z+4,r20\n"
"std Z+5,r21\n"
"std Z+6,r22\n"
"std Z+7,r23\n"
"ldi r24,8\n"
"add %A2,r24\n"
"add %B2,r24\n"
"ldi r24,0x1F\n"
"and %A2,r24\n"
"and %B2,r24\n"
"pop r23\n"
"pop r22\n"
"pop r21\n"
"pop r20\n"
"pop r19\n"
"pop r18\n"
"pop r17\n"
"pop r16\n"
"pop r15\n"
"pop r14\n"
"pop r13\n"
"pop r12\n"
"pop r11\n"
"pop r10\n"
"pop r9\n"
"pop r8\n"
"inc r25\n"
"rjmp 2b\n"
"4:\n"
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(k), "z"(l), "r"(input), "Q"(output), "Q"(mb), "Q"(r)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
, "r24", "r25"
);
#else
uint64_t l[4];