diff --git a/doc/crypto.dox b/doc/crypto.dox
index a42d6a9e..42fd61a7 100644
--- a/doc/crypto.dox
+++ b/doc/crypto.dox
@@ -81,27 +81,27 @@ Arduino Mega 2560 running at 16 MHz are similar:
 ChaCha (20 rounds)14.87us14.88us43.74us132
 ChaCha (12 rounds)10.38us10.38us43.74us132
 ChaCha (8 rounds)8.13us8.14us43.74us132
-Speck (128-bit key, ECB mode)10.72us11.09us287.02us275
-Speck (192-bit key, ECB mode)11.03us11.42us298.21us275
-Speck (256-bit key, ECB mode)11.35us11.74us309.66us275
-SpeckSmall (128-bit key, ECB mode)35.25us36.46us207.66us67
-SpeckSmall (192-bit key, ECB mode)36.56us37.56us220.55us67
-SpeckSmall (256-bit key, ECB mode)37.87us38.67us233.32us67
-SpeckTiny (128-bit key, ECB mode)35.25us 10.22us35
-SpeckTiny (192-bit key, ECB mode)36.56us 13.62us35
-SpeckTiny (256-bit key, ECB mode)37.87us 16.89us35
+Speck (128-bit key, ECB mode)9.74us10.12us253.94us275
+Speck (192-bit key, ECB mode)10.03us10.41us264.63us275
+Speck (256-bit key, ECB mode)10.31us10.71us275.26us275
+SpeckSmall (128-bit key, ECB mode)33.93us34.82us207.66us67
+SpeckSmall (192-bit key, ECB mode)35.20us35.88us220.55us67
+SpeckSmall (256-bit key, ECB mode)36.46us36.93us233.32us67
+SpeckTiny (128-bit key, ECB mode)33.93us 10.22us35
+SpeckTiny (192-bit key, ECB mode)35.20us 13.62us35
+SpeckTiny (256-bit key, ECB mode)36.46us 16.89us35
 AEAD AlgorithmEncryption (per byte)Decryption (per byte)Key SetupState Size (bytes)
 ChaChaPoly41.20us41.19us902.36us221
 GCM<AES128>109.71us109.26us1265.69us284
 GCM<AES192>116.38us115.92us1485.56us316
 GCM<AES256>123.04us122.59us1760.28us348
-GCM<Speck> (256-bit key)87.78us87.32us714.41us378
-GCM<SpeckTiny> (256-bit key)114.30us113.84us1270.32us138
+GCM<Speck> (256-bit key)86.74us86.29us646.88us378
+GCM<SpeckTiny> (256-bit key)112.90us112.44us1225.48us138
 EAX<AES128>71.14us71.14us1311.97us268
 EAX<AES256>97.80us97.80us1806.57us332
-EAX<Speck> (256-bit key)27.27us27.26us760.74us362
-EAX<SpeckTiny> (256-bit key)80.31us80.31us1316.60us122
+EAX<Speck> (256-bit key)25.89us25.88us690.63us362
+EAX<SpeckTiny> (256-bit key)78.20us78.20us1269.19us122
 Hash AlgorithmHashing (per byte)Finalization State Size (bytes)
 SHA25643.85us2841.04us 107
diff --git a/gen/genspeck.c b/gen/genspeck.c
new file mode 100644
index 00000000..91a37333
--- /dev/null
+++ b/gen/genspeck.c
@@ -0,0 +1,892 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+// Special-purpose compiler that generates the AVR version of Speck*.
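+//
+// For orientation: each generated encryption round implements, in the
+// reference formulation quoted throughout the comments below,
+//
+//     x = (rightRotate8_64(x) + y) ^ *s++;
+//     y = leftRotate3_64(y) ^ x;
+//
+// with the matching inverse round in the decryption paths.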
+
+#include <stdio.h>
+#include <stdarg.h>
+
+static int indent = 4;
+
+static int t1_reg = 8; // Temporary 64-bit value (any reg).
+static int t2_reg = 16; // Temporary 64-bit value (any reg).
+
+static int x_reg = 26;
+//static int y_reg = 28;
+static int z_reg = 30;
+
+static int const_reg = 24; // For temporary constants (must be a high reg).
+
+static int temp_reg = 25; // Spare temporary register.
+
+// Information about a set of registers storing a 64-bit quantity.
+typedef struct
+{
+    int first; // First register in the set.
+    int offset; // Offset for multiple of 8 rotations.
+
+} Reg64;
+
+// Indent the code and print a string.
+void indent_printf(const char *format, ...)
+{
+    va_list va;
+    int posn;
+    va_start(va, format);
+    for (posn = 0; posn < indent; ++posn)
+        putc(' ', stdout);
+    vfprintf(stdout, format, va);
+    va_end(va);
+}
+
+// Print an assembler instruction within quotes.
+void insn_printf(const char *format, ...)
+{
+    va_list va;
+    int posn;
+    va_start(va, format);
+    for (posn = 0; posn < indent; ++posn)
+        putc(' ', stdout);
+    putc('"', stdout);
+    vfprintf(stdout, format, va);
+    putc('\\', stdout);
+    putc('n', stdout);
+    putc('"', stdout);
+    putc('\n', stdout);
+    va_end(va);
+}
+
+#define REGn(reg, n) ((reg)->first + ((n) + (reg)->offset) % 8)
+
+void leftRotate1(const Reg64 *reg)
+{
+    insn_printf("lsl r%d", REGn(reg, 0));
+    insn_printf("rol r%d", REGn(reg, 1));
+    insn_printf("rol r%d", REGn(reg, 2));
+    insn_printf("rol r%d", REGn(reg, 3));
+    insn_printf("rol r%d", REGn(reg, 4));
+    insn_printf("rol r%d", REGn(reg, 5));
+    insn_printf("rol r%d", REGn(reg, 6));
+    insn_printf("rol r%d", REGn(reg, 7));
+    insn_printf("adc r%d, __zero_reg__", REGn(reg, 0));
+}
+
+void leftRotate3(const Reg64 *reg)
+{
+    leftRotate1(reg);
+    leftRotate1(reg);
+    leftRotate1(reg);
+}
+
+void leftRotate8(Reg64 *reg)
+{
+    reg->offset = (reg->offset + 7) % 8;
+}
+
+void rightRotate1(const Reg64 *reg)
+{
+    insn_printf("bst r%d,0", REGn(reg, 0));
+    insn_printf("ror r%d", REGn(reg, 7));
+    insn_printf("ror r%d", REGn(reg, 6));
+    insn_printf("ror r%d", REGn(reg, 5));
+    insn_printf("ror r%d", REGn(reg, 4));
+    insn_printf("ror r%d", REGn(reg, 3));
+    insn_printf("ror r%d", REGn(reg, 2));
+    insn_printf("ror r%d", REGn(reg, 1));
+    insn_printf("ror r%d", REGn(reg, 0));
+    insn_printf("bld r%d,7", REGn(reg, 7));
+}
+
+void rightRotate3(const Reg64 *reg)
+{
+    rightRotate1(reg);
+    rightRotate1(reg);
+    rightRotate1(reg);
+}
+
+void rightRotate8(Reg64 *reg)
+{
+    reg->offset = (reg->offset + 1) % 8;
+}
+
+void add64(const Reg64 *dst, const Reg64 *src)
+{
+    insn_printf("add r%d,r%d", REGn(dst, 0), REGn(src, 0));
+    insn_printf("adc r%d,r%d", REGn(dst, 1), REGn(src, 1));
+    insn_printf("adc r%d,r%d", REGn(dst, 2), REGn(src, 2));
+    insn_printf("adc r%d,r%d", REGn(dst, 3), REGn(src, 3));
+    insn_printf("adc r%d,r%d", REGn(dst, 4), REGn(src, 4));
+    insn_printf("adc r%d,r%d", REGn(dst, 5), REGn(src, 5));
+    insn_printf("adc r%d,r%d", REGn(dst, 6), REGn(src, 6));
+    insn_printf("adc r%d,r%d", REGn(dst, 7), REGn(src, 7));
+}
+
+void sub64(const Reg64 *dst, const Reg64 *src)
+{
+    insn_printf("sub r%d,r%d", REGn(dst, 0), REGn(src, 0));
+    insn_printf("sbc r%d,r%d", REGn(dst, 1), REGn(src, 1));
+    insn_printf("sbc r%d,r%d", REGn(dst, 2), REGn(src, 2));
+    insn_printf("sbc r%d,r%d", REGn(dst, 3), REGn(src, 3));
+    insn_printf("sbc r%d,r%d", REGn(dst, 4), REGn(src, 4));
+    insn_printf("sbc r%d,r%d", REGn(dst, 5), REGn(src, 5));
+    insn_printf("sbc r%d,r%d", REGn(dst, 6), REGn(src, 6));
+    insn_printf("sbc r%d,r%d", REGn(dst, 7), REGn(src, 7));
+}
+
+void eor64(const Reg64 *dst, const Reg64 *src)
+{
+    insn_printf("eor r%d,r%d", REGn(dst, 0), REGn(src, 0));
+    insn_printf("eor r%d,r%d", REGn(dst, 1), REGn(src, 1));
+    insn_printf("eor r%d,r%d", REGn(dst, 2), REGn(src, 2));
+    insn_printf("eor r%d,r%d", REGn(dst, 3), REGn(src, 3));
+    insn_printf("eor r%d,r%d", REGn(dst, 4), REGn(src, 4));
+    insn_printf("eor r%d,r%d", REGn(dst, 5), REGn(src, 5));
+    insn_printf("eor r%d,r%d", REGn(dst, 6), REGn(src, 6));
+    insn_printf("eor r%d,r%d", REGn(dst, 7), REGn(src, 7));
+}
+
+void eor64Schedule(Reg64 *reg)
+{
+    // XOR with the schedule.
+    insn_printf("ld __tmp_reg__,Z+");
+    insn_printf("eor __tmp_reg__,r%d", REGn(reg, 0));
+    insn_printf("ld r%d,Z+", REGn(reg, 0));
+    insn_printf("eor r%d,r%d", REGn(reg, 0), REGn(reg, 1));
+    insn_printf("ld r%d,Z+", REGn(reg, 1));
+    insn_printf("eor r%d,r%d", REGn(reg, 1), REGn(reg, 2));
+    insn_printf("ld r%d,Z+", REGn(reg, 2));
+    insn_printf("eor r%d,r%d", REGn(reg, 2), REGn(reg, 3));
+    insn_printf("ld r%d,Z+", REGn(reg, 3));
+    insn_printf("eor r%d,r%d", REGn(reg, 3), REGn(reg, 4));
+    insn_printf("ld r%d,Z+", REGn(reg, 4));
+    insn_printf("eor r%d,r%d", REGn(reg, 4), REGn(reg, 5));
+    insn_printf("ld r%d,Z+", REGn(reg, 5));
+    insn_printf("eor r%d,r%d", REGn(reg, 5), REGn(reg, 6));
+    insn_printf("ld r%d,Z+", REGn(reg, 6));
+    insn_printf("eor r%d,r%d", REGn(reg, 6), REGn(reg, 7));
+    insn_printf("mov r%d,__tmp_reg__", REGn(reg, 7));
+
+    // The above operations also implicitly perform a right-rotation.
+    // Undo it by left-shifting back into the correct position.
+    leftRotate8(reg);
+}
+
+void eor64ScheduleReversePtr(Reg64 *reg, const char *ptrReg)
+{
+    // XOR with the schedule.
+    insn_printf("ld __tmp_reg__,-%s", ptrReg);
+    insn_printf("eor __tmp_reg__,r%d", REGn(reg, 7));
+    insn_printf("ld r%d,-%s", REGn(reg, 7), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 7), REGn(reg, 6));
+    insn_printf("ld r%d,-%s", REGn(reg, 6), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 6), REGn(reg, 5));
+    insn_printf("ld r%d,-%s", REGn(reg, 5), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 5), REGn(reg, 4));
+    insn_printf("ld r%d,-%s", REGn(reg, 4), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 4), REGn(reg, 3));
+    insn_printf("ld r%d,-%s", REGn(reg, 3), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 3), REGn(reg, 2));
+    insn_printf("ld r%d,-%s", REGn(reg, 2), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 2), REGn(reg, 1));
+    insn_printf("ld r%d,-%s", REGn(reg, 1), ptrReg);
+    insn_printf("eor r%d,r%d", REGn(reg, 1), REGn(reg, 0));
+    insn_printf("mov r%d,__tmp_reg__", REGn(reg, 0));
+
+    // The above operations also implicitly perform a left-rotation.
+    // Undo it by right-shifting back into the correct position.
+    // We have to do this twice because the following step will
+    // apply a left-rotation to put everything back where it belongs.
+    rightRotate8(reg);
+    rightRotate8(reg);
+}
+
+void eor64ScheduleReverse(Reg64 *reg)
+{
+    eor64ScheduleReversePtr(reg, "Z");
+}
+
+void eor64ScheduleReverseX(Reg64 *reg)
+{
+    eor64ScheduleReversePtr(reg, "X");
+}
+
+// Unpack the input block and convert from big-endian to little-endian.
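+// Speck is specified big-endian: the 16-byte block is read most significant
+// byte first into two 64-bit halves, x (bytes 0..7) and y (bytes 8..15).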
+static void unpack_input(void)
+{
+    Reg64 xreg = {t1_reg, 0};
+    Reg64 yreg = {t2_reg, 0};
+
+    insn_printf("ld r%d,X+", REGn(&xreg, 7));
+    insn_printf("ld r%d,X+", REGn(&xreg, 6));
+    insn_printf("ld r%d,X+", REGn(&xreg, 5));
+    insn_printf("ld r%d,X+", REGn(&xreg, 4));
+    insn_printf("ld r%d,X+", REGn(&xreg, 3));
+    insn_printf("ld r%d,X+", REGn(&xreg, 2));
+    insn_printf("ld r%d,X+", REGn(&xreg, 1));
+    insn_printf("ld r%d,X+", REGn(&xreg, 0));
+
+    insn_printf("ld r%d,X+", REGn(&yreg, 7));
+    insn_printf("ld r%d,X+", REGn(&yreg, 6));
+    insn_printf("ld r%d,X+", REGn(&yreg, 5));
+    insn_printf("ld r%d,X+", REGn(&yreg, 4));
+    insn_printf("ld r%d,X+", REGn(&yreg, 3));
+    insn_printf("ld r%d,X+", REGn(&yreg, 2));
+    insn_printf("ld r%d,X+", REGn(&yreg, 1));
+    insn_printf("ld r%d,X", REGn(&yreg, 0));
+}
+
+static void load_from_x(Reg64 *reg)
+{
+    insn_printf("ld r%d,X+", REGn(reg, 0));
+    insn_printf("ld r%d,X+", REGn(reg, 1));
+    insn_printf("ld r%d,X+", REGn(reg, 2));
+    insn_printf("ld r%d,X+", REGn(reg, 3));
+    insn_printf("ld r%d,X+", REGn(reg, 4));
+    insn_printf("ld r%d,X+", REGn(reg, 5));
+    insn_printf("ld r%d,X+", REGn(reg, 6));
+    insn_printf("ld r%d,X+", REGn(reg, 7));
+}
+
+static void store_to_x(Reg64 *reg)
+{
+    insn_printf("st X+,r%d", REGn(reg, 0));
+    insn_printf("st X+,r%d", REGn(reg, 1));
+    insn_printf("st X+,r%d", REGn(reg, 2));
+    insn_printf("st X+,r%d", REGn(reg, 3));
+    insn_printf("st X+,r%d", REGn(reg, 4));
+    insn_printf("st X+,r%d", REGn(reg, 5));
+    insn_printf("st X+,r%d", REGn(reg, 6));
+    insn_printf("st X+,r%d", REGn(reg, 7));
+}
+
+static void load_from_z(Reg64 *reg)
+{
+    insn_printf("ld r%d,Z+", REGn(reg, 0));
+    insn_printf("ld r%d,Z+", REGn(reg, 1));
+    insn_printf("ld r%d,Z+", REGn(reg, 2));
+    insn_printf("ld r%d,Z+", REGn(reg, 3));
+    insn_printf("ld r%d,Z+", REGn(reg, 4));
+    insn_printf("ld r%d,Z+", REGn(reg, 5));
+    insn_printf("ld r%d,Z+", REGn(reg, 6));
+    insn_printf("ld r%d,Z+", REGn(reg, 7));
+}
+
+static void store_to_z(Reg64 *reg)
+{
+    insn_printf("st Z+,r%d", REGn(reg, 0));
+    insn_printf("st Z+,r%d", REGn(reg, 1));
+    insn_printf("st Z+,r%d", REGn(reg, 2));
+    insn_printf("st Z+,r%d", REGn(reg, 3));
+    insn_printf("st Z+,r%d", REGn(reg, 4));
+    insn_printf("st Z+,r%d", REGn(reg, 5));
+    insn_printf("st Z+,r%d", REGn(reg, 6));
+    insn_printf("st Z+,r%d", REGn(reg, 7));
+}
+
+static void push64(Reg64 *reg)
+{
+    reg->offset = 0;
+    insn_printf("push r%d", REGn(reg, 0));
+    insn_printf("push r%d", REGn(reg, 1));
+    insn_printf("push r%d", REGn(reg, 2));
+    insn_printf("push r%d", REGn(reg, 3));
+    insn_printf("push r%d", REGn(reg, 4));
+    insn_printf("push r%d", REGn(reg, 5));
+    insn_printf("push r%d", REGn(reg, 6));
+    insn_printf("push r%d", REGn(reg, 7));
+}
+
+static void pop64(Reg64 *reg)
+{
+    reg->offset = 0;
+    insn_printf("pop r%d", REGn(reg, 7));
+    insn_printf("pop r%d", REGn(reg, 6));
+    insn_printf("pop r%d", REGn(reg, 5));
+    insn_printf("pop r%d", REGn(reg, 4));
+    insn_printf("pop r%d", REGn(reg, 3));
+    insn_printf("pop r%d", REGn(reg, 2));
+    insn_printf("pop r%d", REGn(reg, 1));
+    insn_printf("pop r%d", REGn(reg, 0));
+}
+
+// Main loop for Speck::encryptBlock().
+static void full_enc_main_loop(void)
+{
+    Reg64 xreg = {t1_reg, 0};
+    Reg64 yreg = {t2_reg, 0};
+
+    // Top of the main loop.
+    insn_printf("1:");
+
+    // x = (rightRotate8_64(x) + y) ^ *s++;
+    rightRotate8(&xreg);
+    add64(&xreg, &yreg);
+    eor64Schedule(&xreg);
+
+    // y = leftRotate3_64(y) ^ x;
+    leftRotate3(&yreg);
+    eor64(&yreg, &xreg);
+
+    // Bottom of the main loop.
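+    // (The loop body is at or beyond the +/-64 word reach of a relative
+    // branch, so breq skips over an rjmp instead of using brne directly.)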
+ insn_printf("dec %%2"); + insn_printf("breq 2f"); + insn_printf("rjmp 1b"); + insn_printf("2:"); +} + +// Main loop for Speck::decryptBlock(). +static void full_dec_main_loop(void) +{ + Reg64 xreg = {t1_reg, 0}; + Reg64 yreg = {t2_reg, 0}; + + // Top of the main loop. + insn_printf("1:"); + + // y = rightRotate3_64(x ^ y); + eor64(&yreg, &xreg); + rightRotate3(&yreg); + + // x = leftRotate8_64((x ^ *s--) - y); + eor64ScheduleReverse(&xreg); + leftRotate8(&xreg); + sub64(&xreg, &yreg); + + // Bottom of the main loop. + insn_printf("dec %%2"); + insn_printf("breq 2f"); + insn_printf("rjmp 1b"); + insn_printf("2:"); +} + +// Pack the output block and convert from little-endian to big-endian. +static void pack_output(void) +{ + Reg64 xreg = {t1_reg, 0}; + Reg64 yreg = {t2_reg, 0}; + + insn_printf("ldd r%d,%%A3", x_reg); + insn_printf("ldd r%d,%%B3", x_reg + 1); + + insn_printf("st X+,r%d", REGn(&xreg, 7)); + insn_printf("st X+,r%d", REGn(&xreg, 6)); + insn_printf("st X+,r%d", REGn(&xreg, 5)); + insn_printf("st X+,r%d", REGn(&xreg, 4)); + insn_printf("st X+,r%d", REGn(&xreg, 3)); + insn_printf("st X+,r%d", REGn(&xreg, 2)); + insn_printf("st X+,r%d", REGn(&xreg, 1)); + insn_printf("st X+,r%d", REGn(&xreg, 0)); + + insn_printf("st X+,r%d", REGn(&yreg, 7)); + insn_printf("st X+,r%d", REGn(&yreg, 6)); + insn_printf("st X+,r%d", REGn(&yreg, 5)); + insn_printf("st X+,r%d", REGn(&yreg, 4)); + insn_printf("st X+,r%d", REGn(&yreg, 3)); + insn_printf("st X+,r%d", REGn(&yreg, 2)); + insn_printf("st X+,r%d", REGn(&yreg, 1)); + insn_printf("st X,r%d", REGn(&yreg, 0)); +} + +static void temp_regs(void) +{ + indent_printf(": \"r%d\", \"r%d\", \"r%d\", \"r%d\", " + "\"r%d\", \"r%d\", \"r%d\", \"r%d\",\n", + t1_reg, t1_reg + 1, t1_reg + 2, t1_reg + 3, + t1_reg + 4, t1_reg + 5, t1_reg + 6, t1_reg + 7); + indent_printf(" \"r%d\", \"r%d\", \"r%d\", \"r%d\", " + "\"r%d\", \"r%d\", \"r%d\", \"r%d\", \"memory\"\n", + t2_reg, t2_reg + 1, t2_reg + 2, t2_reg + 3, + t2_reg + 4, t2_reg + 5, t2_reg + 6, t2_reg + 7); +} + +static void full_setkey(void) +{ + Reg64 xreg = {t1_reg, 0}; + Reg64 yreg = {t2_reg, 0}; + + printf("void Speck::setKey(const uint8_t *key, size_t len)\n"); + printf("{\n"); + indent_printf("// Automatically generated by the genspeck tool.\n"); + + // Validate the key length. + indent_printf("uint64_t l[4];\n"); + indent_printf("uint8_t m, mb;\n"); + indent_printf("if (len == 32) {\n"); + indent_printf(" m = 4;\n"); + indent_printf(" mb = 3 * 8;\n"); + indent_printf("} else if (len == 24) {\n"); + indent_printf(" m = 3;\n"); + indent_printf(" mb = 2 * 8;\n"); + indent_printf("} else if (len == 16) {\n"); + indent_printf(" m = 2;\n"); + indent_printf(" mb = 8;\n"); + indent_printf("} else {\n"); + indent_printf(" return false;\n"); + indent_printf("}\n"); + indent_printf("rounds = 30 + m;\n"); + indent_printf("uint8_t r = rounds - 1;\n"); + indent_printf("__asm__ __volatile__ (\n"); + indent += 4; + + // Copy the key into k[0] and l while converting endianness. 
+ insn_printf("ld __tmp_reg__,-X"); // k[0] = last 8 bytes of the key + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("ld __tmp_reg__,-X"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("sbiw r%d,8", z_reg); // Set Z back to beginning of k + insn_printf("movw r%d,r%d", t1_reg + 2, z_reg); // Save Z + insn_printf("movw r%d,%%A2", z_reg); // Z = l + insn_printf("ldd r%d,%%3", t1_reg); + insn_printf("1:"); + insn_printf("ld __tmp_reg__,-X"); // Copy first mb bytes from key + insn_printf("st Z+,__tmp_reg__"); + insn_printf("dec r%d", t1_reg); + insn_printf("brne 1b"); + insn_printf("movw r%d,%%A2", x_reg); // X = l + insn_printf("movw r%d,r%d", z_reg, t1_reg + 2); // Z = k + + // Expand the key to the full key schedule. + // Note: We can use %A2 and %B2 as spare temporary registers now. + insn_printf("clr %%A2"); // %A2 = li_in = 0 + insn_printf("ldd %%B2,%%3"); // %B2 = li_out = mb (= (m - 1) * 8) + insn_printf("clr r%d", temp_reg); // i = 0 + load_from_z(&yreg); // y = k[i] + insn_printf("2:"); + + // l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i + insn_printf("add r%d,%%A2", x_reg); // x = rightRotate8_64(l[li_in]) + insn_printf("adc r%d,__zero_reg__", x_reg + 1); + xreg.offset = 7; + load_from_x(&xreg); + xreg.offset = 0; + insn_printf("sub r%d,%%A2", x_reg); // restore X to point at base of l + insn_printf("sbc r%d,__zero_reg__", x_reg + 1); + insn_printf("sbiw r%d,8", x_reg); + add64(&xreg, &yreg); // x += y + insn_printf("eor r%d,r%d", REGn(&xreg, 0), temp_reg); // x ^= i + insn_printf("add r%d,%%B2", x_reg); // l[li_out] = x + insn_printf("adc r%d,__zero_reg__", x_reg + 1); + store_to_x(&xreg); + insn_printf("sub r%d,%%B2", x_reg); // restore X to point at base of l + insn_printf("sbc r%d,__zero_reg__", x_reg + 1); + insn_printf("sbiw r%d,8", x_reg); + + // k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out]; + leftRotate3(&yreg); // y = leftRotate3(y) + eor64(&yreg, &xreg); // y ^= x + store_to_z(&yreg); // k[i + 1] = y + + // Advance li_in and li_out, wrapping around at the end of l. + insn_printf("ldi r%d,8", const_reg); + insn_printf("add %%A2,r%d", const_reg); + insn_printf("add %%B2,r%d", const_reg); + insn_printf("ldi r%d,0x1F", const_reg); + insn_printf("and %%A2,r%d", const_reg); + insn_printf("and %%B2,r%d", const_reg); + + // Bottom of the loop. + insn_printf("ldd r%d,%%4", t1_reg); // r8 = rounds - 1 + insn_printf("inc r%d", temp_reg); // ++i + insn_printf("cp r%d,r%d", temp_reg, t1_reg); + insn_printf("breq 3f"); + insn_printf("rjmp 2b"); + insn_printf("3:"); + + // Clean the l array. X register should still be pointing to it. + insn_printf("ldi r%d,32", const_reg); + insn_printf("4:"); + insn_printf("st X+,__zero_reg__"); + insn_printf("dec r%d", const_reg); + insn_printf("brne 4b"); + + // Declare the registers that we need. + indent_printf(": : \"z\"(k), \"x\"(key + len), \"r\"(l), \"Q\"(mb), \"Q\"(r)\n"); + temp_regs(); + indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg); + indent -= 4; + indent_printf(");\n"); + + // End of function. 
+ indent_printf("return true;\n"); + printf("}\n\n"); +} + +static void full_enc(void) +{ + printf("void Speck::encryptBlock(uint8_t *output, const uint8_t *input)\n"); + printf("{\n"); + indent_printf("// Automatically generated by the genspeck tool.\n"); + indent_printf("__asm__ __volatile__ (\n"); + indent += 4; + unpack_input(); + full_enc_main_loop(); + pack_output(); + indent_printf(": : \"x\"(input), \"z\"(k), \"r\"(rounds), \"Q\"(output)\n"); + temp_regs(); + indent -= 4; + indent_printf(");\n"); + printf("}\n\n"); +} + +static void full_dec(void) +{ + printf("void Speck::decryptBlock(uint8_t *output, const uint8_t *input)\n"); + printf("{\n"); + indent_printf("// Automatically generated by the genspeck tool.\n"); + indent_printf("__asm__ __volatile__ (\n"); + indent += 4; + unpack_input(); + full_dec_main_loop(); + pack_output(); + indent_printf(": : \"x\"(input), \"z\"(k + rounds), \"r\"(rounds), \"Q\"(output)\n"); + temp_regs(); + indent -= 4; + indent_printf(");\n"); + printf("}\n\n"); +} + +static void tiny_enc(void) +{ + Reg64 xreg = {t1_reg, 0}; + Reg64 yreg = {t2_reg, 0}; + + printf("void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input)\n"); + printf("{\n"); + indent_printf("// Automatically generated by the genspeck tool.\n"); + indent_printf("uint64_t l[5];\n"); + indent_printf("uint8_t r = rounds;\n"); + indent_printf("uint8_t mb = (r - 31) * 8;\n"); + + // Copy the "k" array into the "l" array. The first element is "s" + // and the rest of the elements make up the normal l[0..3] values. + indent_printf("__asm__ __volatile__ (\n"); + indent += 4; + insn_printf("movw r%d,r%d", t1_reg, z_reg); // Save Z + insn_printf("ldd r%d,%%4", t2_reg); + insn_printf("ldi r%d,8", const_reg); + insn_printf("add r%d,r%d", t2_reg, const_reg); + insn_printf("1:"); + insn_printf("ld __tmp_reg__,X+"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("dec r%d", t2_reg); + insn_printf("brne 1b"); + insn_printf("movw r%d,r%d", z_reg, t1_reg); // Restore Z to point at l + + // Unpack the input. %A2 and %B2 are free temporary registers after this. + insn_printf("movw r%d,%%A2", x_reg); + unpack_input(); + + // Top of the loop. + insn_printf("clr %%A2"); // %A2 = li_in = 0 + insn_printf("ldd %%B2,%%4"); // %B2 = li_out = mb + insn_printf("clr r%d", temp_reg); // i = 0 + insn_printf("2:"); + + // Adjust x and y for this round using the key schedule word s (in l[0]). + // x = (rightRotate8_64(x) + y) ^ s; + rightRotate8(&xreg); + add64(&xreg, &yreg); + eor64Schedule(&xreg); + // y = leftRotate3_64(y) ^ x; + leftRotate3(&yreg); + eor64(&yreg, &xreg); + // At this point, Z has been incremented to point at l[1] which + // is the start of the actual l[0] from the original formulation. + + // If this is the last round, then we are done. There is no + // point calculating another key schedule element. + insn_printf("mov __tmp_reg__,r%d", temp_reg); + insn_printf("inc __tmp_reg__"); + insn_printf("ldd r%d,%%5", const_reg); + insn_printf("cp __tmp_reg__,r%d", const_reg); + insn_printf("brne 3f"); + insn_printf("rjmp 4f"); + insn_printf("3:"); + + // Save x and y on the stack - we need the registers to + // help us compute the next key schedule element. + push64(&xreg); + push64(&yreg); + + // Compute the key schedule word s for the next round. 
+ insn_printf("sbiw r%d,8", z_reg); // Point Z back at l[0] + // l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i; + insn_printf("ld r%d,Z", REGn(&yreg, 0)); // y = s + insn_printf("ldd r%d,Z+1", REGn(&yreg, 1)); + insn_printf("ldd r%d,Z+2", REGn(&yreg, 2)); + insn_printf("ldd r%d,Z+3", REGn(&yreg, 3)); + insn_printf("ldd r%d,Z+4", REGn(&yreg, 4)); + insn_printf("ldd r%d,Z+5", REGn(&yreg, 5)); + insn_printf("ldd r%d,Z+6", REGn(&yreg, 6)); + insn_printf("ldd r%d,Z+7", REGn(&yreg, 7)); + insn_printf("add r%d,%%A2", z_reg); // Z = &(l[li_in]) - 8 + insn_printf("adc r%d,__zero_reg__", z_reg + 1); + leftRotate8(&xreg); // x = rightRotate8(l[li_in]) + insn_printf("ldd r%d,Z+8", REGn(&xreg, 0)); + insn_printf("ldd r%d,Z+9", REGn(&xreg, 1)); + insn_printf("ldd r%d,Z+10", REGn(&xreg, 2)); + insn_printf("ldd r%d,Z+11", REGn(&xreg, 3)); + insn_printf("ldd r%d,Z+12", REGn(&xreg, 4)); + insn_printf("ldd r%d,Z+13", REGn(&xreg, 5)); + insn_printf("ldd r%d,Z+14", REGn(&xreg, 6)); + insn_printf("ldd r%d,Z+15", REGn(&xreg, 7)); + rightRotate8(&xreg); + add64(&xreg, &yreg); // x += y + insn_printf("eor r%d,r%d", REGn(&xreg, 0), temp_reg); // x ^= i + insn_printf("sub r%d,%%A2", z_reg); // Z = &(l[li_out]) - 8 + insn_printf("sbc r%d,__zero_reg__", z_reg + 1); + insn_printf("add r%d,%%B2", z_reg); + insn_printf("adc r%d,__zero_reg__", z_reg + 1); + insn_printf("std Z+8,r%d", REGn(&xreg, 0)); // l[li_out] = x + insn_printf("std Z+9,r%d", REGn(&xreg, 1)); + insn_printf("std Z+10,r%d", REGn(&xreg, 2)); + insn_printf("std Z+11,r%d", REGn(&xreg, 3)); + insn_printf("std Z+12,r%d", REGn(&xreg, 4)); + insn_printf("std Z+13,r%d", REGn(&xreg, 5)); + insn_printf("std Z+14,r%d", REGn(&xreg, 6)); + insn_printf("std Z+15,r%d", REGn(&xreg, 7)); + insn_printf("sub r%d,%%B2", z_reg); // Restore Z to base of l array + insn_printf("sbc r%d,__zero_reg__", z_reg + 1); + // s = leftRotate3_64(s) ^ l[li_out]; + leftRotate3(&yreg); + eor64(&yreg, &xreg); + insn_printf("st Z,r%d", REGn(&yreg, 0)); + insn_printf("std Z+1,r%d", REGn(&yreg, 1)); + insn_printf("std Z+2,r%d", REGn(&yreg, 2)); + insn_printf("std Z+3,r%d", REGn(&yreg, 3)); + insn_printf("std Z+4,r%d", REGn(&yreg, 4)); + insn_printf("std Z+5,r%d", REGn(&yreg, 5)); + insn_printf("std Z+6,r%d", REGn(&yreg, 6)); + insn_printf("std Z+7,r%d", REGn(&yreg, 7)); + + // Advance li_in and li_out, wrapping around at the end of l. + insn_printf("ldi r%d,8", const_reg); + insn_printf("add %%A2,r%d", const_reg); + insn_printf("add %%B2,r%d", const_reg); + insn_printf("ldi r%d,0x1F", const_reg); + insn_printf("and %%A2,r%d", const_reg); + insn_printf("and %%B2,r%d", const_reg); + + // Restore the original x and y. + pop64(&yreg); + pop64(&xreg); + + // Bottom of the loop. + insn_printf("inc r%d", temp_reg); // i++ + insn_printf("rjmp 2b"); + insn_printf("4:"); + + // Pack the results into the output buffer. + pack_output(); + + // Declare the registers that we need. 
+ indent_printf(": : \"x\"(k), \"z\"(l), \"r\"(input), \"Q\"(output), \"Q\"(mb), \"Q\"(r)\n"); + temp_regs(); + indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg); + indent -= 4; + indent_printf(");\n"); + printf("}\n\n"); +} + +static void small_dec(void) +{ + Reg64 xreg = {t1_reg, 0}; + Reg64 yreg = {t2_reg, 0}; + + printf("void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input)\n"); + printf("{\n"); + indent_printf("// Automatically generated by the genspeck tool.\n"); + indent_printf("uint64_t l[5];\n"); + indent_printf("uint8_t r = rounds;\n"); + indent_printf("uint8_t li_in = ((r + 3) & 0x03) * 8;\n"); + indent_printf("uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F;\n"); + indent_printf("__asm__ __volatile__ (\n"); + indent += 4; + + // Copy the this->l array into the local l array. Then copy + // the "s" value from l[li_out] to l[4]. + insn_printf("ldd r%d,%%4", temp_reg); // r25 = li_out + insn_printf("ldi r%d,32", const_reg); // Copy 32 bytes from this->l. + insn_printf("1:"); + insn_printf("ld __tmp_reg__,X+"); + insn_printf("st Z+,__tmp_reg__"); + insn_printf("dec r%d", const_reg); + insn_printf("brne 1b"); + insn_printf("movw r%d,r%d", x_reg, z_reg); // X = Z + 32 + insn_printf("sbiw r%d,32", z_reg); // Z = &(l[li_out]) + insn_printf("add r%d,r%d", z_reg, temp_reg); + insn_printf("adc r%d,__zero_reg__", z_reg + 1); + insn_printf("ld __tmp_reg__,Z"); // Copy l[li_out] to l[4] + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+1"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+2"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+3"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+4"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+5"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+6"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("ldd __tmp_reg__,Z+7"); + insn_printf("st X+,__tmp_reg__"); + insn_printf("sub r%d,r%d", z_reg, temp_reg); // Z = &(l[0]) + insn_printf("sbc r%d,__zero_reg__", z_reg + 1); + + // Unpack the input. %A2 and %B2 are free temporary registers after this. + insn_printf("movw r%d,%%A2", x_reg); + unpack_input(); + + // Top of the loop. + insn_printf("ldd %%A2,%%6"); // %A2 = li_in + insn_printf("mov %%B2,r%d", temp_reg); // %B2 = li_out + insn_printf("ldd r%d,%%5", temp_reg); // i = rounds - 1 + insn_printf("dec r%d", temp_reg); + insn_printf("movw r%d,r%d", x_reg, z_reg); // X = Z + 40 = &(l[5]) + insn_printf("adiw r%d,40", x_reg); // i.e. point to end of l[4] + insn_printf("2:"); + + // Adjust x and y for this round using the key schedule word s (in l[4]). + // y = rightRotate3_64(x ^ y); + eor64(&yreg, &xreg); + rightRotate3(&yreg); + // x = leftRotate8_64((x ^ s) - y); + eor64ScheduleReverseX(&xreg); + leftRotate8(&xreg); + sub64(&xreg, &yreg); + + // If this is the last round, then we are done. There is no + // point calculating another key schedule element. + insn_printf("or r%d,r%d", temp_reg, temp_reg); // if (i == 0) + insn_printf("brne 3f"); + insn_printf("rjmp 4f"); + insn_printf("3:"); + insn_printf("dec r%d", temp_reg); // --i + + // Save x and y on the stack - we need the registers to + // help us compute the next key schedule element. + push64(&xreg); + push64(&yreg); + + // Move li_in and li_out backwards, wrapping around at the start of l. 
+ insn_printf("ldi r%d,24", const_reg); + insn_printf("add %%A2,r%d", const_reg); + insn_printf("add %%B2,r%d", const_reg); + insn_printf("ldi r%d,0x1F", const_reg); + insn_printf("and %%A2,r%d", const_reg); + insn_printf("and %%B2,r%d", const_reg); + + // Compute the key schedule word s for the next round. + // s = rightRotate3_64(s ^ l[li_out]); + insn_printf("ld r%d,X+", REGn(&yreg, 0)); // y = s = l[4] + insn_printf("ld r%d,X+", REGn(&yreg, 1)); + insn_printf("ld r%d,X+", REGn(&yreg, 2)); + insn_printf("ld r%d,X+", REGn(&yreg, 3)); + insn_printf("ld r%d,X+", REGn(&yreg, 4)); + insn_printf("ld r%d,X+", REGn(&yreg, 5)); + insn_printf("ld r%d,X+", REGn(&yreg, 6)); + insn_printf("ld r%d,X+", REGn(&yreg, 7)); + insn_printf("add r%d,%%B2", z_reg); // Z = &(l[li_out]) + insn_printf("adc r%d,__zero_reg__", z_reg + 1); + insn_printf("ld r%d,Z", REGn(&xreg, 0)); // x = l[li_out] + insn_printf("ldd r%d,Z+1", REGn(&xreg, 1)); + insn_printf("ldd r%d,Z+2", REGn(&xreg, 2)); + insn_printf("ldd r%d,Z+3", REGn(&xreg, 3)); + insn_printf("ldd r%d,Z+4", REGn(&xreg, 4)); + insn_printf("ldd r%d,Z+5", REGn(&xreg, 5)); + insn_printf("ldd r%d,Z+6", REGn(&xreg, 6)); + insn_printf("ldd r%d,Z+7", REGn(&xreg, 7)); + insn_printf("sub r%d,%%B2", z_reg); // Z = &(l[0]) + insn_printf("sbc r%d,__zero_reg__", z_reg + 1); + eor64(&yreg, &xreg); + rightRotate3(&yreg); + insn_printf("st -X,r%d", REGn(&yreg, 7)); // store s back into l[4] + insn_printf("st -X,r%d", REGn(&yreg, 6)); + insn_printf("st -X,r%d", REGn(&yreg, 5)); + insn_printf("st -X,r%d", REGn(&yreg, 4)); + insn_printf("st -X,r%d", REGn(&yreg, 3)); + insn_printf("st -X,r%d", REGn(&yreg, 2)); + insn_printf("st -X,r%d", REGn(&yreg, 1)); + insn_printf("st -X,r%d", REGn(&yreg, 0)); + insn_printf("adiw r%d,8", x_reg); // X = &(l[5]) + // l[li_in] = leftRotate8_64((l[li_out] ^ i) - s); + insn_printf("eor r%d,r%d", t1_reg, temp_reg); // x ^= i + sub64(&xreg, &yreg); // x -= s + leftRotate8(&xreg); // x = leftRotate8(x) + insn_printf("add r%d,%%A2", z_reg); // Z = &(l[li_in]) + insn_printf("adc r%d,__zero_reg__", z_reg + 1); + insn_printf("st Z,r%d", REGn(&xreg, 0)); // l[li_in] = x + insn_printf("std Z+1,r%d", REGn(&xreg, 1)); + insn_printf("std Z+2,r%d", REGn(&xreg, 2)); + insn_printf("std Z+3,r%d", REGn(&xreg, 3)); + insn_printf("std Z+4,r%d", REGn(&xreg, 4)); + insn_printf("std Z+5,r%d", REGn(&xreg, 5)); + insn_printf("std Z+6,r%d", REGn(&xreg, 6)); + insn_printf("std Z+7,r%d", REGn(&xreg, 7)); + insn_printf("sub r%d,%%A2", z_reg); // Z = &(l[0]) + insn_printf("sbc r%d,__zero_reg__", z_reg + 1); + + // Restore the original x and y. + pop64(&yreg); + pop64(&xreg); + + // Bottom of the loop. + insn_printf("rjmp 2b"); + insn_printf("4:"); + + // Pack the results into the output buffer. + pack_output(); + + // Declare the registers that we need. + indent_printf(": : \"x\"(this->l), \"z\"(l), \"r\"(input), \"Q\"(output), \"Q\"(li_out), \"Q\"(r), \"Q\"(li_in)\n"); + temp_regs(); + indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg); + indent -= 4; + indent_printf(");\n"); + printf("}\n\n"); +} + +int main(int argc, char *argv[]) +{ + full_setkey(); + full_enc(); + full_dec(); + + tiny_enc(); + + small_dec(); + return 0; +} diff --git a/libraries/Crypto/Speck.cpp b/libraries/Crypto/Speck.cpp index 5cd035e4..b8eaeed2 100644 --- a/libraries/Crypto/Speck.cpp +++ b/libraries/Crypto/Speck.cpp @@ -105,6 +105,7 @@ size_t Speck::keySize() const bool Speck::setKey(const uint8_t *key, size_t len) { #if USE_AVR_INLINE_ASM + // Automatically generated by the genspeck tool. 
uint64_t l[4]; uint8_t m, mb; if (len == 32) { @@ -120,134 +121,148 @@ bool Speck::setKey(const uint8_t *key, size_t len) return false; } rounds = 30 + m; - - // Copy the first (m - 1) * 8 bytes of the key into the "l" array - // in reverse order to convert big endian into little-endian. + uint8_t r = rounds - 1; __asm__ __volatile__ ( + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "sbiw r30,8\n" + "movw r10,r30\n" + "movw r30,%A2\n" + "ldd r8,%3\n" "1:\n" - "ld __tmp_reg__,-Z\n" - "st X+,__tmp_reg__\n" - "dec %2\n" + "ld __tmp_reg__,-X\n" + "st Z+,__tmp_reg__\n" + "dec r8\n" "brne 1b\n" - : : "x"(l), "z"(key + len - 8), "r"(mb) + "movw r26,%A2\n" + "movw r30,r10\n" + "clr %A2\n" + "ldd %B2,%3\n" + "clr r25\n" + "ld r16,Z+\n" + "ld r17,Z+\n" + "ld r18,Z+\n" + "ld r19,Z+\n" + "ld r20,Z+\n" + "ld r21,Z+\n" + "ld r22,Z+\n" + "ld r23,Z+\n" + "2:\n" + "add r26,%A2\n" + "adc r27,__zero_reg__\n" + "ld r15,X+\n" + "ld r8,X+\n" + "ld r9,X+\n" + "ld r10,X+\n" + "ld r11,X+\n" + "ld r12,X+\n" + "ld r13,X+\n" + "ld r14,X+\n" + "sub r26,%A2\n" + "sbc r27,__zero_reg__\n" + "sbiw r26,8\n" + "add r8,r16\n" + "adc r9,r17\n" + "adc r10,r18\n" + "adc r11,r19\n" + "adc r12,r20\n" + "adc r13,r21\n" + "adc r14,r22\n" + "adc r15,r23\n" + "eor r8,r25\n" + "add r26,%B2\n" + "adc r27,__zero_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "sub r26,%B2\n" + "sbc r27,__zero_reg__\n" + "sbiw r26,8\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "st Z+,r16\n" + "st Z+,r17\n" + "st Z+,r18\n" + "st Z+,r19\n" + "st Z+,r20\n" + "st Z+,r21\n" + "st Z+,r22\n" + "st Z+,r23\n" + "ldi r24,8\n" + "add %A2,r24\n" + "add %B2,r24\n" + "ldi r24,0x1F\n" + "and %A2,r24\n" + "and %B2,r24\n" + "ldd r8,%4\n" + "inc r25\n" + "cp r25,r8\n" + "breq 3f\n" + "rjmp 2b\n" + "3:\n" + "ldi r24,32\n" + "4:\n" + "st X+,__zero_reg__\n" + "dec r24\n" + "brne 4b\n" + : : "z"(k), "x"(key + len), "r"(l), "Q"(mb), "Q"(r) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory" + , "r24", "r25" ); - - // Copy the final 8 bytes of the key into k[0] in reverse order. - __asm__ __volatile__ ( - "1:\n" - "ld __tmp_reg__,-Z\n" - "st X+,__tmp_reg__\n" - "dec %2\n" - "brne 1b\n" - : : "x"(k), "z"(key + len), "r"(8) - ); - - // Expand the key to the full key schedule. 
- uint8_t li_in = 0; - uint8_t li_out = m - 1; - for (uint8_t i = 0; i < (rounds - 1); ++i) { - __asm__ __volatile__ ( - // l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i; - "ld r15,X+\n" // x = rightRotate8_64(l[li_in]) - "ld r8,X+\n" - "ld r9,X+\n" - "ld r10,X+\n" - "ld r11,X+\n" - "ld r12,X+\n" - "ld r13,X+\n" - "ld r14,X+\n" - - "ld r16,Z+\n" // y = k[i] - "ld r17,Z+\n" - "ld r18,Z+\n" - "ld r19,Z+\n" - "ld r20,Z+\n" - "ld r21,Z+\n" - "ld r22,Z+\n" - "ld r23,Z+\n" - - "add r8,r16\n" // x += y - "adc r9,r17\n" - "adc r10,r18\n" - "adc r11,r19\n" - "adc r12,r20\n" - "adc r13,r21\n" - "adc r14,r22\n" - "adc r15,r23\n" - - "eor r8,%3\n" // x ^= i - - // k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out]; - "movw r26,%A2\n" // l[li_out] = x - "st X+,r8\n" - "st X+,r9\n" - "st X+,r10\n" - "st X+,r11\n" - "st X+,r12\n" - "st X+,r13\n" - "st X+,r14\n" - "st X+,r15\n" - - "lsl r16\n" // y = leftRotate1_64(y) - "rol r17\n" - "rol r18\n" - "rol r19\n" - "rol r20\n" - "rol r21\n" - "rol r22\n" - "rol r23\n" - "adc r16,__zero_reg__\n" - - "lsl r16\n" // y = leftRotate1_64(y) - "rol r17\n" - "rol r18\n" - "rol r19\n" - "rol r20\n" - "rol r21\n" - "rol r22\n" - "rol r23\n" - "adc r16,__zero_reg__\n" - - "lsl r16\n" // y = leftRotate1_64(y) - "rol r17\n" - "rol r18\n" - "rol r19\n" - "rol r20\n" - "rol r21\n" - "rol r22\n" - "rol r23\n" - "adc r16,__zero_reg__\n" - - "eor r16,r8\n" // y ^= x - "eor r17,r9\n" - "eor r18,r10\n" - "eor r19,r11\n" - "eor r20,r12\n" - "eor r21,r13\n" - "eor r22,r14\n" - "eor r23,r15\n" - - "st Z+,r16\n" // k[i + 1] = y - "st Z+,r17\n" - "st Z+,r18\n" - "st Z+,r19\n" - "st Z+,r20\n" - "st Z+,r21\n" - "st Z+,r22\n" - "st Z+,r23\n" - - : : "z"(&(k[i])), "x"(&(l[li_in])), - "r"(&(l[li_out])), - "r"(i) - : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", - "r24", "r25" - ); - if ((++li_in) >= m) - li_in = 0; - if ((++li_out) >= m) - li_out = 0; - } + return true; #else uint64_t l[4]; uint8_t m; @@ -280,138 +295,118 @@ bool Speck::setKey(const uint8_t *key, size_t len) if ((++li_out) >= m) li_out = 0; } -#endif clean(l); return true; +#endif } void Speck::encryptBlock(uint8_t *output, const uint8_t *input) { #if USE_AVR_INLINE_ASM - uint32_t xlow, xhigh, ylow, yhigh; - - // Unpack the input into the x and y variables, converting - // from big-endian into little-endian in the process. - __asm__ __volatile__ ( - "ld %D1,Z\n" - "ldd %C1,Z+1\n" - "ldd %B1,Z+2\n" - "ldd %A1,Z+3\n" - "ldd %D0,Z+4\n" - "ldd %C0,Z+5\n" - "ldd %B0,Z+6\n" - "ldd %A0,Z+7\n" - "ldd %D3,Z+8\n" - "ldd %C3,Z+9\n" - "ldd %B3,Z+10\n" - "ldd %A3,Z+11\n" - "ldd %D2,Z+12\n" - "ldd %C2,Z+13\n" - "ldd %B2,Z+14\n" - "ldd %A2,Z+15\n" - : "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh) - : "z"(input) - ); - - // Perform all encryption rounds. Z points to the key schedule. + // Automatically generated by the genspeck tool. __asm__ __volatile__ ( + "ld r15,X+\n" + "ld r14,X+\n" + "ld r13,X+\n" + "ld r12,X+\n" + "ld r11,X+\n" + "ld r10,X+\n" + "ld r9,X+\n" + "ld r8,X+\n" + "ld r23,X+\n" + "ld r22,X+\n" + "ld r21,X+\n" + "ld r20,X+\n" + "ld r19,X+\n" + "ld r18,X+\n" + "ld r17,X+\n" + "ld r16,X\n" "1:\n" - // x = (rightRotate8_64(x) + y) ^ *s++; - "add %B0,%A2\n" // x = rightRotate8_64(x), x += y - "adc %C0,%B2\n" // Note: right rotate is implicit. - "adc %D0,%C2\n" - "adc %A1,%D2\n" - "adc %B1,%A3\n" - "adc %C1,%B3\n" - "adc %D1,%C3\n" - "adc %A0,%D3\n" - - "ld __tmp_reg__,Z+\n" // x ^= *s++ - "eor __tmp_reg__,%B0\n" // Also fully apply the right rotate. 
- "ld %B0,Z+\n" - "eor %B0,%C0\n" - "ld %C0,Z+\n" - "eor %C0,%D0\n" - "ld %D0,Z+\n" - "eor %D0,%A1\n" - "ld %A1,Z+\n" - "eor %A1,%B1\n" - "ld %B1,Z+\n" - "eor %B1,%C1\n" - "ld %C1,Z+\n" - "eor %C1,%D1\n" - "ld %D1,Z+\n" - "eor %D1,%A0\n" - "mov %A0,__tmp_reg__\n" - - // y = leftRotate3_64(y) ^ x; - "lsl %A2\n" // y = leftRotate1_64(y) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "lsl %A2\n" // y = leftRotate1_64(y) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "lsl %A2\n" // y = leftRotate1_64(y) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "eor %A2,%A0\n" // y ^= x - "eor %B2,%B0\n" - "eor %C2,%C0\n" - "eor %D2,%D0\n" - "eor %A3,%A1\n" - "eor %B3,%B1\n" - "eor %C3,%C1\n" - "eor %D3,%D1\n" - - // Loop - "dec %5\n" // --round + "add r9,r16\n" + "adc r10,r17\n" + "adc r11,r18\n" + "adc r12,r19\n" + "adc r13,r20\n" + "adc r14,r21\n" + "adc r15,r22\n" + "adc r8,r23\n" + "ld __tmp_reg__,Z+\n" + "eor __tmp_reg__,r9\n" + "ld r9,Z+\n" + "eor r9,r10\n" + "ld r10,Z+\n" + "eor r10,r11\n" + "ld r11,Z+\n" + "eor r11,r12\n" + "ld r12,Z+\n" + "eor r12,r13\n" + "ld r13,Z+\n" + "eor r13,r14\n" + "ld r14,Z+\n" + "eor r14,r15\n" + "ld r15,Z+\n" + "eor r15,r8\n" + "mov r8,__tmp_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "dec %2\n" "breq 2f\n" "rjmp 1b\n" "2:\n" - : "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh) - : "z"(k), "r"(rounds) - ); - - // Pack the results into the output and convert back to big-endian. - __asm__ __volatile__ ( - "st Z,%D1\n" - "std Z+1,%C1\n" - "std Z+2,%B1\n" - "std Z+3,%A1\n" - "std Z+4,%D0\n" - "std Z+5,%C0\n" - "std Z+6,%B0\n" - "std Z+7,%A0\n" - "std Z+8,%D3\n" - "std Z+9,%C3\n" - "std Z+10,%B3\n" - "std Z+11,%A3\n" - "std Z+12,%D2\n" - "std Z+13,%C2\n" - "std Z+14,%B2\n" - "std Z+15,%A2\n" - : : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output) + "ldd r26,%A3\n" + "ldd r27,%B3\n" + "st X+,r15\n" + "st X+,r14\n" + "st X+,r13\n" + "st X+,r12\n" + "st X+,r11\n" + "st X+,r10\n" + "st X+,r9\n" + "st X+,r8\n" + "st X+,r23\n" + "st X+,r22\n" + "st X+,r21\n" + "st X+,r20\n" + "st X+,r19\n" + "st X+,r18\n" + "st X+,r17\n" + "st X,r16\n" + : : "x"(input), "z"(k), "r"(rounds), "Q"(output) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory" ); #else uint64_t x, y; @@ -430,133 +425,113 @@ void Speck::encryptBlock(uint8_t *output, const uint8_t *input) void Speck::decryptBlock(uint8_t *output, const uint8_t *input) { #if USE_AVR_INLINE_ASM - uint32_t xlow, xhigh, ylow, yhigh; - - // Unpack the input into the x and y variables, converting - // from big-endian into little-endian in the process. 
- __asm__ __volatile__ ( - "ld %D1,Z\n" - "ldd %C1,Z+1\n" - "ldd %B1,Z+2\n" - "ldd %A1,Z+3\n" - "ldd %D0,Z+4\n" - "ldd %C0,Z+5\n" - "ldd %B0,Z+6\n" - "ldd %A0,Z+7\n" - "ldd %D3,Z+8\n" - "ldd %C3,Z+9\n" - "ldd %B3,Z+10\n" - "ldd %A3,Z+11\n" - "ldd %D2,Z+12\n" - "ldd %C2,Z+13\n" - "ldd %B2,Z+14\n" - "ldd %A2,Z+15\n" - : "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh) - : "z"(input) - ); - - // Perform all decryption rounds. Z points to the end of key schedule. + // Automatically generated by the genspeck tool. __asm__ __volatile__ ( + "ld r15,X+\n" + "ld r14,X+\n" + "ld r13,X+\n" + "ld r12,X+\n" + "ld r11,X+\n" + "ld r10,X+\n" + "ld r9,X+\n" + "ld r8,X+\n" + "ld r23,X+\n" + "ld r22,X+\n" + "ld r21,X+\n" + "ld r20,X+\n" + "ld r19,X+\n" + "ld r18,X+\n" + "ld r17,X+\n" + "ld r16,X\n" "1:\n" - // y = rightRotate3_64(x ^ y); - "eor %A2,%A0\n" // y ^= x - "eor %B2,%B0\n" - "eor %C2,%C0\n" - "eor %D2,%D0\n" - "eor %A3,%A1\n" - "eor %B3,%B1\n" - "eor %C3,%C1\n" - "eor %D3,%D1\n" - - "bst %A2,0\n" // y = rightRotate1_64(y) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "bst %A2,0\n" // y = rightRotate1_64(y) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "bst %A2,0\n" // y = rightRotate1_64(y) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - // x = leftRotate8_64((x ^ *s--) - y); - "ld __tmp_reg__,-Z\n" // x ^= *s-- - "eor __tmp_reg__,%D1\n" // Note: also implicitly left-rotates regs - "ld %D1,-Z\n" - "eor %D1,%C1\n" - "ld %C1,-Z\n" - "eor %C1,%B1\n" - "ld %B1,-Z\n" - "eor %B1,%A1\n" - "ld %A1,-Z\n" - "eor %A1,%D0\n" - "ld %D0,-Z\n" - "eor %D0,%C0\n" - "ld %C0,-Z\n" - "eor %C0,%B0\n" - "ld %B0,-Z\n" - "eor %B0,%A0\n" - "mov %A0,__tmp_reg__\n" - - "sub %B0,%A2\n" // x -= y - "sbc %C0,%B2\n" // Note: regs are already left-rotated - "sbc %D0,%C2\n" - "sbc %A1,%D2\n" - "sbc %B1,%A3\n" - "sbc %C1,%B3\n" - "sbc %D1,%C3\n" - "sbc %A0,%D3\n" - - // Loop - "dec %5\n" // --round + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "ld __tmp_reg__,-Z\n" + "eor __tmp_reg__,r15\n" + "ld r15,-Z\n" + "eor r15,r14\n" + "ld r14,-Z\n" + "eor r14,r13\n" + "ld r13,-Z\n" + "eor r13,r12\n" + "ld r12,-Z\n" + "eor r12,r11\n" + "ld r11,-Z\n" + "eor r11,r10\n" + "ld r10,-Z\n" + "eor r10,r9\n" + "ld r9,-Z\n" + "eor r9,r8\n" + "mov r8,__tmp_reg__\n" + "sub r9,r16\n" + "sbc r10,r17\n" + "sbc r11,r18\n" + "sbc r12,r19\n" + "sbc r13,r20\n" + "sbc r14,r21\n" + "sbc r15,r22\n" + "sbc r8,r23\n" + "dec %2\n" "breq 2f\n" "rjmp 1b\n" "2:\n" - : "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh) - : "z"(k + rounds), "r"(rounds) - ); - - // Pack the results into the output and convert back to big-endian. 
- __asm__ __volatile__ ( - "st Z,%D1\n" - "std Z+1,%C1\n" - "std Z+2,%B1\n" - "std Z+3,%A1\n" - "std Z+4,%D0\n" - "std Z+5,%C0\n" - "std Z+6,%B0\n" - "std Z+7,%A0\n" - "std Z+8,%D3\n" - "std Z+9,%C3\n" - "std Z+10,%B3\n" - "std Z+11,%A3\n" - "std Z+12,%D2\n" - "std Z+13,%C2\n" - "std Z+14,%B2\n" - "std Z+15,%A2\n" - : : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output) + "ldd r26,%A3\n" + "ldd r27,%B3\n" + "st X+,r15\n" + "st X+,r14\n" + "st X+,r13\n" + "st X+,r12\n" + "st X+,r11\n" + "st X+,r10\n" + "st X+,r9\n" + "st X+,r8\n" + "st X+,r23\n" + "st X+,r22\n" + "st X+,r21\n" + "st X+,r20\n" + "st X+,r19\n" + "st X+,r18\n" + "st X+,r17\n" + "st X,r16\n" + : : "x"(input), "z"(k + rounds), "r"(rounds), "Q"(output) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory" ); #else uint64_t x, y; diff --git a/libraries/Crypto/SpeckSmall.cpp b/libraries/Crypto/SpeckSmall.cpp index c7ca6ddd..488399fb 100644 --- a/libraries/Crypto/SpeckSmall.cpp +++ b/libraries/Crypto/SpeckSmall.cpp @@ -261,349 +261,283 @@ bool SpeckSmall::setKey(const uint8_t *key, size_t len) void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input) { #if USE_AVR_INLINE_ASM - uint64_t l[4]; - uint32_t xlow, xhigh, ylow, yhigh; - uint32_t slow, shigh; - uint8_t li_in = (rounds + 3) & 0x03; - uint8_t li_out = (((rounds - 31) + li_in) & 0x03) * 8; - li_in *= 8; - - // Prepare to expand the key schedule. + // Automatically generated by the genspeck tool. + uint64_t l[5]; + uint8_t r = rounds; + uint8_t li_in = ((r + 3) & 0x03) * 8; + uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F; __asm__ __volatile__ ( - "add r30,%4\n" // Z = &(this->l[li_out]) - "adc r31,__zero_reg__\n" - "ld __tmp_reg__,Z\n" // s = this->l[li_out] - "std %A0,__tmp_reg__\n" - "ldd __tmp_reg__,Z+1\n" - "std %B0,__tmp_reg__\n" - "ldd __tmp_reg__,Z+2\n" - "std %C0,__tmp_reg__\n" - "ldd __tmp_reg__,Z+3\n" - "std %D0,__tmp_reg__\n" - "ldd __tmp_reg__,Z+4\n" - "std %A1,__tmp_reg__\n" - "ldd __tmp_reg__,Z+5\n" - "std %B1,__tmp_reg__\n" - "ldd __tmp_reg__,Z+6\n" - "std %C1,__tmp_reg__\n" - "ldd __tmp_reg__,Z+7\n" - "std %D1,__tmp_reg__\n" - "sub r30,%4\n" // Point Z back to the start of this->l. - "sbc r31,__zero_reg__\n" - - "ldi r25,32\n" // Copy the entire this->l array into l. + "ldd r25,%4\n" + "ldi r24,32\n" "1:\n" - "ld __tmp_reg__,Z+\n" - "st X+,__tmp_reg__\n" - "dec r25\n" + "ld __tmp_reg__,X+\n" + "st Z+,__tmp_reg__\n" + "dec r24\n" "brne 1b\n" - : "=Q"(slow), "=Q"(shigh) - : "z"(this->l), "x"(l), "r"(li_out) - : "r25" - ); - - // Unpack the input into the x and y variables, converting - // from big-endian into little-endian in the process. - __asm__ __volatile__ ( - "ld %D1,Z\n" - "ldd %C1,Z+1\n" - "ldd %B1,Z+2\n" - "ldd %A1,Z+3\n" - "ldd %D0,Z+4\n" - "ldd %C0,Z+5\n" - "ldd %B0,Z+6\n" - "ldd %A0,Z+7\n" - "ldd %D3,Z+8\n" - "ldd %C3,Z+9\n" - "ldd %B3,Z+10\n" - "ldd %A3,Z+11\n" - "ldd %D2,Z+12\n" - "ldd %C2,Z+13\n" - "ldd %B2,Z+14\n" - "ldd %A2,Z+15\n" - : "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh) - : "z"(input) - ); - - // Perform all decryption rounds while expanding the key schedule in-place. - __asm__ __volatile__ ( - "mov r23,%9\n" // i = rounds - 1 - "dec r23\n" - "1:\n" - - // Adjust x and y for this round using the key schedule word s. 
- - // y = rightRotate3_64(x ^ y); - "eor %A2,%A0\n" // y ^= x - "eor %B2,%B0\n" - "eor %C2,%C0\n" - "eor %D2,%D0\n" - "eor %A3,%A1\n" - "eor %B3,%B1\n" - "eor %C3,%C1\n" - "eor %D3,%D1\n" - - "bst %A2,0\n" // y = rightRotate1_64(y) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "bst %A2,0\n" // y = rightRotate1_64(y) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "bst %A2,0\n" // y = rightRotate1_64(y) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - // x = leftRotate8_64((x ^ s) - y); - "ldd __tmp_reg__,%A4\n" // x ^= s - "eor %A0,__tmp_reg__\n" - "ldd __tmp_reg__,%B4\n" - "eor %B0,__tmp_reg__\n" - "ldd __tmp_reg__,%C4\n" - "eor %C0,__tmp_reg__\n" - "ldd __tmp_reg__,%D4\n" - "eor %D0,__tmp_reg__\n" - "ldd __tmp_reg__,%A5\n" - "eor %A1,__tmp_reg__\n" - "ldd __tmp_reg__,%B5\n" - "eor %B1,__tmp_reg__\n" - "ldd __tmp_reg__,%C5\n" - "eor %C1,__tmp_reg__\n" - "ldd __tmp_reg__,%D5\n" - "eor %D1,__tmp_reg__\n" - - "sub %A0,%A2\n" // x -= y - "sbc %B0,%B2\n" - "sbc %C0,%C2\n" - "sbc %D0,%D2\n" - "sbc %A1,%A3\n" - "sbc %B1,%B3\n" - "sbc %C1,%C3\n" - "sbc %D1,%D3\n" - - "mov __tmp_reg__,%D1\n" // x = lefRotate8_64(x) - "mov %D1,%C1\n" - "mov %C1,%B1\n" - "mov %B1,%A1\n" - "mov %A1,%D0\n" - "mov %D0,%C0\n" - "mov %C0,%B0\n" - "mov %B0,%A0\n" - "mov %A0,__tmp_reg__\n" - - // On the last round we don't need to compute s so we - // can exit early here if i == 0. - "or r23,r23\n" // if (i == 0) - "brne 2f\n" - "rjmp 3f\n" + "movw r26,r30\n" + "sbiw r30,32\n" + "add r30,r25\n" + "adc r31,__zero_reg__\n" + "ld __tmp_reg__,Z\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+1\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+2\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+3\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+4\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+5\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+6\n" + "st X+,__tmp_reg__\n" + "ldd __tmp_reg__,Z+7\n" + "st X+,__tmp_reg__\n" + "sub r30,r25\n" + "sbc r31,__zero_reg__\n" + "movw r26,%A2\n" + "ld r15,X+\n" + "ld r14,X+\n" + "ld r13,X+\n" + "ld r12,X+\n" + "ld r11,X+\n" + "ld r10,X+\n" + "ld r9,X+\n" + "ld r8,X+\n" + "ld r23,X+\n" + "ld r22,X+\n" + "ld r21,X+\n" + "ld r20,X+\n" + "ld r19,X+\n" + "ld r18,X+\n" + "ld r17,X+\n" + "ld r16,X\n" + "ldd %A2,%6\n" + "mov %B2,r25\n" + "ldd r25,%5\n" + "dec r25\n" + "movw r26,r30\n" + "adiw r26,40\n" "2:\n" - "dec r23\n" // --i - - // Save x and y on the stack so we can reuse registers for t and s. - "push %A0\n" - "push %B0\n" - "push %C0\n" - "push %D0\n" - "push %A1\n" - "push %B1\n" - "push %C1\n" - "push %D1\n" - "push %A2\n" - "push %B2\n" - "push %C2\n" - "push %D2\n" - "push %A3\n" - "push %B3\n" - "push %C3\n" - "push %D3\n" - - // Compute the key schedule word s for the next round. 
- - // li_out = (li_out + 3) & 0x03; - "ldd r24,%7\n" - "ldi r25,24\n" - "add r24,r25\n" - "andi r24,0x1f\n" - "std %7,r24\n" - - // s = rightRotate3_64(s ^ l[li_out]); - "add %A8,r24\n" // Z = &(l[li_out]) - "adc %B8,__zero_reg__\n" - - "ld %A0,Z\n" // t = l[li_out] - "ldd %B0,Z+1\n" - "ldd %C0,Z+2\n" - "ldd %D0,Z+3\n" - "ldd %A1,Z+4\n" - "ldd %B1,Z+5\n" - "ldd %C1,Z+6\n" - "ldd %D1,Z+7\n" - - "ldd %A2,%A4\n" // load s - "ldd %B2,%B4\n" - "ldd %C2,%C4\n" - "ldd %D2,%D4\n" - "ldd %A3,%A5\n" - "ldd %B3,%B5\n" - "ldd %C3,%C5\n" - "ldd %D3,%D5\n" - - "eor %A2,%A0\n" // s ^= t - "eor %B2,%B0\n" - "eor %C2,%C0\n" - "eor %D2,%D0\n" - "eor %A3,%A1\n" - "eor %B3,%B1\n" - "eor %C3,%C1\n" - "eor %D3,%D1\n" - - "bst %A2,0\n" // s = rightRotate1_64(s) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "bst %A2,0\n" // s = rightRotate1_64(s) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "bst %A2,0\n" // s = rightRotate1_64(s) - "ror %D3\n" - "ror %C3\n" - "ror %B3\n" - "ror %A3\n" - "ror %D2\n" - "ror %C2\n" - "ror %B2\n" - "ror %A2\n" - "bld %D3,7\n" - - "sub %A8,r24\n" // Z -= li_out - "sbc %B8,__zero_reg__\n" - - // li_in = (li_in + 3) & 0x03; - "ldd r24,%6\n" - "add r24,r25\n" - "andi r24,0x1f\n" - "std %6,r24\n" - - // l[li_in] = leftRotate8_64((l[li_out] ^ i) - s); - "add %A8,r24\n" // Z = &(l[li_in]) - "adc %B8,__zero_reg__\n" - - "eor %A0,r23\n" // t ^= i - - "sub %A0,%A2\n" // t -= s - "sbc %B0,%B2\n" - "sbc %C0,%C2\n" - "sbc %D0,%D2\n" - "sbc %A1,%A3\n" - "sbc %B1,%B3\n" - "sbc %C1,%C3\n" - "sbc %D1,%D3\n" - - "st Z,%D1\n" // l[li_in] = leftRotate8_64(t) - "std Z+1,%A0\n" - "std Z+2,%B0\n" - "std Z+3,%C0\n" - "std Z+4,%D0\n" - "std Z+5,%A1\n" - "std Z+6,%B1\n" - "std Z+7,%C1\n" - - "sub %A8,r24\n" // Z -= li_in - "sbc %B8,__zero_reg__\n" - - "std %A4,%A2\n" // store s - "std %B4,%B2\n" - "std %C4,%C2\n" - "std %D4,%D2\n" - "std %A5,%A3\n" - "std %B5,%B3\n" - "std %C5,%C3\n" - "std %D5,%D3\n" - - // Pop registers from the stack to recover the x and y values. - "pop %D3\n" - "pop %C3\n" - "pop %B3\n" - "pop %A3\n" - "pop %D2\n" - "pop %C2\n" - "pop %B2\n" - "pop %A2\n" - "pop %D1\n" - "pop %C1\n" - "pop %B1\n" - "pop %A1\n" - "pop %D0\n" - "pop %C0\n" - "pop %B0\n" - "pop %A0\n" - - // Bottom of the loop. 
- "rjmp 1b\n" + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "ld __tmp_reg__,-X\n" + "eor __tmp_reg__,r15\n" + "ld r15,-X\n" + "eor r15,r14\n" + "ld r14,-X\n" + "eor r14,r13\n" + "ld r13,-X\n" + "eor r13,r12\n" + "ld r12,-X\n" + "eor r12,r11\n" + "ld r11,-X\n" + "eor r11,r10\n" + "ld r10,-X\n" + "eor r10,r9\n" + "ld r9,-X\n" + "eor r9,r8\n" + "mov r8,__tmp_reg__\n" + "sub r9,r16\n" + "sbc r10,r17\n" + "sbc r11,r18\n" + "sbc r12,r19\n" + "sbc r13,r20\n" + "sbc r14,r21\n" + "sbc r15,r22\n" + "sbc r8,r23\n" + "or r25,r25\n" + "brne 3f\n" + "rjmp 4f\n" "3:\n" - - : "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh), - "+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out) - : "z"(l), "r"(rounds) - : "r23", "r24", "r25" - ); - - // Pack the results into the output and convert back to big-endian. - __asm__ __volatile__ ( - "st Z,%D1\n" - "std Z+1,%C1\n" - "std Z+2,%B1\n" - "std Z+3,%A1\n" - "std Z+4,%D0\n" - "std Z+5,%C0\n" - "std Z+6,%B0\n" - "std Z+7,%A0\n" - "std Z+8,%D3\n" - "std Z+9,%C3\n" - "std Z+10,%B3\n" - "std Z+11,%A3\n" - "std Z+12,%D2\n" - "std Z+13,%C2\n" - "std Z+14,%B2\n" - "std Z+15,%A2\n" - : : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output) + "dec r25\n" + "push r8\n" + "push r9\n" + "push r10\n" + "push r11\n" + "push r12\n" + "push r13\n" + "push r14\n" + "push r15\n" + "push r16\n" + "push r17\n" + "push r18\n" + "push r19\n" + "push r20\n" + "push r21\n" + "push r22\n" + "push r23\n" + "ldi r24,24\n" + "add %A2,r24\n" + "add %B2,r24\n" + "ldi r24,0x1F\n" + "and %A2,r24\n" + "and %B2,r24\n" + "ld r16,X+\n" + "ld r17,X+\n" + "ld r18,X+\n" + "ld r19,X+\n" + "ld r20,X+\n" + "ld r21,X+\n" + "ld r22,X+\n" + "ld r23,X+\n" + "add r30,%B2\n" + "adc r31,__zero_reg__\n" + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "sub r30,%B2\n" + "sbc r31,__zero_reg__\n" + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "bst r16,0\n" + "ror r23\n" + "ror r22\n" + "ror r21\n" + "ror r20\n" + "ror r19\n" + "ror r18\n" + "ror r17\n" + "ror r16\n" + "bld r23,7\n" + "st -X,r23\n" + "st -X,r22\n" + "st -X,r21\n" + "st -X,r20\n" + "st -X,r19\n" + "st -X,r18\n" + "st -X,r17\n" + "st -X,r16\n" + "adiw r26,8\n" + "eor r8,r25\n" + "sub r8,r16\n" + "sbc r9,r17\n" + "sbc r10,r18\n" + "sbc r11,r19\n" + "sbc r12,r20\n" + "sbc r13,r21\n" + "sbc r14,r22\n" + "sbc r15,r23\n" + "add r30,%A2\n" + "adc r31,__zero_reg__\n" + "st Z,r15\n" + "std Z+1,r8\n" + "std Z+2,r9\n" + "std Z+3,r10\n" + "std Z+4,r11\n" + "std Z+5,r12\n" + "std Z+6,r13\n" + "std Z+7,r14\n" + "sub r30,%A2\n" + "sbc r31,__zero_reg__\n" + "pop r23\n" + "pop 
r22\n" + "pop r21\n" + "pop r20\n" + "pop r19\n" + "pop r18\n" + "pop r17\n" + "pop r16\n" + "pop r15\n" + "pop r14\n" + "pop r13\n" + "pop r12\n" + "pop r11\n" + "pop r10\n" + "pop r9\n" + "pop r8\n" + "rjmp 2b\n" + "4:\n" + "ldd r26,%A3\n" + "ldd r27,%B3\n" + "st X+,r15\n" + "st X+,r14\n" + "st X+,r13\n" + "st X+,r12\n" + "st X+,r11\n" + "st X+,r10\n" + "st X+,r9\n" + "st X+,r8\n" + "st X+,r23\n" + "st X+,r22\n" + "st X+,r21\n" + "st X+,r20\n" + "st X+,r19\n" + "st X+,r18\n" + "st X+,r17\n" + "st X,r16\n" + : : "x"(this->l), "z"(l), "r"(input), "Q"(output), "Q"(li_out), "Q"(r), "Q"(li_in) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory" + , "r24", "r25" ); #else uint64_t l[4]; diff --git a/libraries/Crypto/SpeckTiny.cpp b/libraries/Crypto/SpeckTiny.cpp index 85446089..aad12619 100644 --- a/libraries/Crypto/SpeckTiny.cpp +++ b/libraries/Crypto/SpeckTiny.cpp @@ -156,336 +156,257 @@ bool SpeckTiny::setKey(const uint8_t *key, size_t len) void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input) { #if USE_AVR_INLINE_ASM - uint64_t l[4]; - uint32_t xlow, xhigh, ylow, yhigh; - uint32_t slow, shigh; - uint8_t li_in = 0; - uint8_t li_out = (rounds - 31) * 8; - - // Copy the "k" array into "s" and the "l" array. + // Automatically generated by the genspeck tool. + uint64_t l[5]; + uint8_t r = rounds; + uint8_t mb = (r - 31) * 8; __asm__ __volatile__ ( - "ldd r25,%4\n" // r25 = li_out - - "ld __tmp_reg__,Z+\n" - "std %A0,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %B0,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %C0,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %D0,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %A1,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %B1,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %C1,__tmp_reg__\n" - "ld __tmp_reg__,Z+\n" - "std %D1,__tmp_reg__\n" - - "1:\n" // l[0..] = k[1..] - "ld __tmp_reg__,Z+\n" - "st X+,__tmp_reg__\n" - "dec r25\n" - "brne 1b\n" - : "=Q"(slow), "=Q"(shigh) - : "z"(k), "x"(l), "Q"(li_out) - : "r25" - ); - - // Unpack the input into the x and y variables, converting - // from big-endian into little-endian in the process. - __asm__ __volatile__ ( - "ld %D1,Z\n" - "ldd %C1,Z+1\n" - "ldd %B1,Z+2\n" - "ldd %A1,Z+3\n" - "ldd %D0,Z+4\n" - "ldd %C0,Z+5\n" - "ldd %B0,Z+6\n" - "ldd %A0,Z+7\n" - "ldd %D3,Z+8\n" - "ldd %C3,Z+9\n" - "ldd %B3,Z+10\n" - "ldd %A3,Z+11\n" - "ldd %D2,Z+12\n" - "ldd %C2,Z+13\n" - "ldd %B2,Z+14\n" - "ldd %A2,Z+15\n" - : "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh) - : "z"(input) - ); - - // Perform all encryption rounds while expanding the key schedule in-place. - __asm__ __volatile__ ( - "mov r23,__zero_reg__\n" // i = 0 + "movw r8,r30\n" + "ldd r16,%4\n" + "ldi r24,8\n" + "add r16,r24\n" "1:\n" - - // Adjust x and y for this round using the key schedule word s. 
- - // x = (rightRotate8_64(x) + y) ^ s; - "mov __tmp_reg__,%A0\n" // x = rightRotate8_64(x) - "mov %A0,%B0\n" - "mov %B0,%C0\n" - "mov %C0,%D0\n" - "mov %D0,%A1\n" - "mov %A1,%B1\n" - "mov %B1,%C1\n" - "mov %C1,%D1\n" - "mov %D1,__tmp_reg__\n" - - "add %A0,%A2\n" // x += y - "adc %B0,%B2\n" - "adc %C0,%C2\n" - "adc %D0,%D2\n" - "adc %A1,%A3\n" - "adc %B1,%B3\n" - "adc %C1,%C3\n" - "adc %D1,%D3\n" - - "ldd __tmp_reg__,%A4\n" // x ^= s - "eor %A0,__tmp_reg__\n" - "ldd __tmp_reg__,%B4\n" - "eor %B0,__tmp_reg__\n" - "ldd __tmp_reg__,%C4\n" - "eor %C0,__tmp_reg__\n" - "ldd __tmp_reg__,%D4\n" - "eor %D0,__tmp_reg__\n" - "ldd __tmp_reg__,%A5\n" - "eor %A1,__tmp_reg__\n" - "ldd __tmp_reg__,%B5\n" - "eor %B1,__tmp_reg__\n" - "ldd __tmp_reg__,%C5\n" - "eor %C1,__tmp_reg__\n" - "ldd __tmp_reg__,%D5\n" - "eor %D1,__tmp_reg__\n" - - // y = leftRotate3_64(y) ^ x; - "lsl %A2\n" // y = leftRotate1_64(y) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "lsl %A2\n" // y = leftRotate1_64(y) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - - "adc %A2,__zero_reg__\n" - "lsl %A2\n" // y = leftRotate1_64(y) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "eor %A2,%A0\n" // y ^= x - "eor %B2,%B0\n" - "eor %C2,%C0\n" - "eor %D2,%D0\n" - "eor %A3,%A1\n" - "eor %B3,%B1\n" - "eor %C3,%C1\n" - "eor %D3,%D1\n" - - // On the last round we don't need to compute s so we - // can exit early here if (i + 1) == rounds. - "mov __tmp_reg__,r23\n" // temp = i + 1 - "inc __tmp_reg__\n" - "cp __tmp_reg__,%9\n" // if (temp == rounds) ... - "brne 2f\n" - "rjmp 3f\n" + "ld __tmp_reg__,X+\n" + "st Z+,__tmp_reg__\n" + "dec r16\n" + "brne 1b\n" + "movw r30,r8\n" + "movw r26,%A2\n" + "ld r15,X+\n" + "ld r14,X+\n" + "ld r13,X+\n" + "ld r12,X+\n" + "ld r11,X+\n" + "ld r10,X+\n" + "ld r9,X+\n" + "ld r8,X+\n" + "ld r23,X+\n" + "ld r22,X+\n" + "ld r21,X+\n" + "ld r20,X+\n" + "ld r19,X+\n" + "ld r18,X+\n" + "ld r17,X+\n" + "ld r16,X\n" + "clr %A2\n" + "ldd %B2,%4\n" + "clr r25\n" "2:\n" - - // Save x and y on the stack so we can reuse registers for t and s. - "push %A0\n" - "push %B0\n" - "push %C0\n" - "push %D0\n" - "push %A1\n" - "push %B1\n" - "push %C1\n" - "push %D1\n" - "push %A2\n" - "push %B2\n" - "push %C2\n" - "push %D2\n" - "push %A3\n" - "push %B3\n" - "push %C3\n" - "push %D3\n" - - // Compute the key schedule word s for the next round. 
- - // l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i; - "ldd r24,%6\n" // Z = &(l[li_in]) - "add %A8,r24\n" - "adc %B8,__zero_reg__\n" - - "ld %D1,Z+\n" // t = rightRotate8_64(l[li_in]) - "ld %A0,Z+\n" - "ld %B0,Z+\n" - "ld %C0,Z+\n" - "ld %D0,Z+\n" - "ld %A1,Z+\n" - "ld %B1,Z+\n" - "ld %C1,Z+\n" - - "ldd %A2,%A4\n" // load s - "ldd %B2,%B4\n" - "ldd %C2,%C4\n" - "ldd %D2,%D4\n" - "ldd %A3,%A5\n" - "ldd %B3,%B5\n" - "ldd %C3,%C5\n" - "ldd %D3,%D5\n" - - "add %A0,%A2\n" // t += s - "adc %B0,%B2\n" - "adc %C0,%C2\n" - "adc %D0,%D2\n" - "adc %A1,%A3\n" - "adc %B1,%B3\n" - "adc %C1,%C3\n" - "adc %D1,%D3\n" - - "eor %A0,r23\n" // t ^= i - - // Z = Z - li_in + li_out - "ldi r25,8\n" // li_in = li_in + 1 - "add r24,r25\n" - "sub %A8,r24\n" // return Z to its initial value - "sbc %B8,__zero_reg__\n" - "andi r24,0x1f\n" // li_in = li_in % 4 - "std %6,r24\n" - "ldd r24,%7\n" // Z = &(l[li_out]) - "add %A8,r24\n" - "adc %B8,__zero_reg__\n" - - "st Z+,%A0\n" // l[li_out] = t - "st Z+,%B0\n" - "st Z+,%C0\n" - "st Z+,%D0\n" - "st Z+,%A1\n" - "st Z+,%B1\n" - "st Z+,%C1\n" - "st Z+,%D1\n" - - "add r24,r25\n" // li_out = li_out + 1 - "sub %A8,r24\n" // return Z to its initial value - "sbc %B8,__zero_reg__\n" - "andi r24,0x1f\n" // li_out = li_out % 4 - "std %7,r24\n" - - // s = leftRotate3_64(s) ^ l[li_out]; - "lsl %A2\n" // s = leftRotate1_64(s) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "lsl %A2\n" // s = leftRotate1_64(s) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "lsl %A2\n" // s = leftRotate1_64(s) - "rol %B2\n" - "rol %C2\n" - "rol %D2\n" - "rol %A3\n" - "rol %B3\n" - "rol %C3\n" - "rol %D3\n" - "adc %A2,__zero_reg__\n" - - "eor %A2,%A0\n" // s ^= l[li_out] - "eor %B2,%B0\n" - "eor %C2,%C0\n" - "eor %D2,%D0\n" - "eor %A3,%A1\n" - "eor %B3,%B1\n" - "eor %C3,%C1\n" - "eor %D3,%D1\n" - - "std %A4,%A2\n" // store s - "std %B4,%B2\n" - "std %C4,%C2\n" - "std %D4,%D2\n" - "std %A5,%A3\n" - "std %B5,%B3\n" - "std %C5,%C3\n" - "std %D5,%D3\n" - - // Pop registers from the stack to recover the x and y values. - "pop %D3\n" - "pop %C3\n" - "pop %B3\n" - "pop %A3\n" - "pop %D2\n" - "pop %C2\n" - "pop %B2\n" - "pop %A2\n" - "pop %D1\n" - "pop %C1\n" - "pop %B1\n" - "pop %A1\n" - "pop %D0\n" - "pop %C0\n" - "pop %B0\n" - "pop %A0\n" - - // Bottom of the loop. 
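Taken together, the two removed fragments above perform one complete encryption round and then derive the next key schedule word in place. A compact C++ sketch of the same step (rotr64, rotl64 and speckRoundAndExpand are illustrative names, as in the earlier sketch; the real code uses the rightRotate8_64() and leftRotate3_64() inlines named in the comments):

#include <stdint.h>

static inline uint64_t rotr64(uint64_t v, unsigned n) { return (v >> n) | (v << (64 - n)); }
static inline uint64_t rotl64(uint64_t v, unsigned n) { return (v << n) | (v >> (64 - n)); }

// One Speck round plus on-the-fly expansion of the next schedule word.
static void speckRoundAndExpand(uint64_t &x, uint64_t &y, uint64_t &s,
                                uint64_t l[4], uint8_t &li_in,
                                uint8_t &li_out, uint8_t i)
{
    // Round function, as in the comments above.
    x = (rotr64(x, 8) + y) ^ s;
    y = rotl64(y, 3) ^ x;

    // Next key schedule word; the assembly exits early on the final
    // round, since that round's s would never be used.
    l[li_out] = (s + rotr64(l[li_in], 8)) ^ i;
    s = rotl64(s, 3) ^ l[li_out];
    li_in  = (li_in  + 1) & 0x03;
    li_out = (li_out + 1) & 0x03;
}

Holding x, y, s and the scratch word t simultaneously needs more registers than the AVR has free, which is what forces the sixteen push/pop pairs around the schedule computation in the assembly.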
- "inc r23\n" - "rjmp 1b\n" + "add r9,r16\n" + "adc r10,r17\n" + "adc r11,r18\n" + "adc r12,r19\n" + "adc r13,r20\n" + "adc r14,r21\n" + "adc r15,r22\n" + "adc r8,r23\n" + "ld __tmp_reg__,Z+\n" + "eor __tmp_reg__,r9\n" + "ld r9,Z+\n" + "eor r9,r10\n" + "ld r10,Z+\n" + "eor r10,r11\n" + "ld r11,Z+\n" + "eor r11,r12\n" + "ld r12,Z+\n" + "eor r12,r13\n" + "ld r13,Z+\n" + "eor r13,r14\n" + "ld r14,Z+\n" + "eor r14,r15\n" + "ld r15,Z+\n" + "eor r15,r8\n" + "mov r8,__tmp_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "mov __tmp_reg__,r25\n" + "inc __tmp_reg__\n" + "ldd r24,%5\n" + "cp __tmp_reg__,r24\n" + "brne 3f\n" + "rjmp 4f\n" "3:\n" - - : "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh), - "+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out) - : "z"(l), "r"(rounds) - : "r23", "r24", "r25" - ); - - // Pack the results into the output and convert back to big-endian. - __asm__ __volatile__ ( - "st Z,%D1\n" - "std Z+1,%C1\n" - "std Z+2,%B1\n" - "std Z+3,%A1\n" - "std Z+4,%D0\n" - "std Z+5,%C0\n" - "std Z+6,%B0\n" - "std Z+7,%A0\n" - "std Z+8,%D3\n" - "std Z+9,%C3\n" - "std Z+10,%B3\n" - "std Z+11,%A3\n" - "std Z+12,%D2\n" - "std Z+13,%C2\n" - "std Z+14,%B2\n" - "std Z+15,%A2\n" - : : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output) + "push r8\n" + "push r9\n" + "push r10\n" + "push r11\n" + "push r12\n" + "push r13\n" + "push r14\n" + "push r15\n" + "push r16\n" + "push r17\n" + "push r18\n" + "push r19\n" + "push r20\n" + "push r21\n" + "push r22\n" + "push r23\n" + "sbiw r30,8\n" + "ld r16,Z\n" + "ldd r17,Z+1\n" + "ldd r18,Z+2\n" + "ldd r19,Z+3\n" + "ldd r20,Z+4\n" + "ldd r21,Z+5\n" + "ldd r22,Z+6\n" + "ldd r23,Z+7\n" + "add r30,%A2\n" + "adc r31,__zero_reg__\n" + "ldd r15,Z+8\n" + "ldd r8,Z+9\n" + "ldd r9,Z+10\n" + "ldd r10,Z+11\n" + "ldd r11,Z+12\n" + "ldd r12,Z+13\n" + "ldd r13,Z+14\n" + "ldd r14,Z+15\n" + "add r8,r16\n" + "adc r9,r17\n" + "adc r10,r18\n" + "adc r11,r19\n" + "adc r12,r20\n" + "adc r13,r21\n" + "adc r14,r22\n" + "adc r15,r23\n" + "eor r8,r25\n" + "sub r30,%A2\n" + "sbc r31,__zero_reg__\n" + "add r30,%B2\n" + "adc r31,__zero_reg__\n" + "std Z+8,r8\n" + "std Z+9,r9\n" + "std Z+10,r10\n" + "std Z+11,r11\n" + "std Z+12,r12\n" + "std Z+13,r13\n" + "std Z+14,r14\n" + "std Z+15,r15\n" + "sub r30,%B2\n" + "sbc r31,__zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "lsl r16\n" + "rol r17\n" + "rol r18\n" + "rol r19\n" + "rol r20\n" + "rol r21\n" + "rol r22\n" + "rol r23\n" + "adc r16, __zero_reg__\n" + "eor r16,r8\n" + "eor r17,r9\n" + "eor r18,r10\n" + "eor r19,r11\n" + "eor r20,r12\n" + "eor r21,r13\n" + "eor r22,r14\n" + "eor r23,r15\n" + "st Z,r16\n" + "std Z+1,r17\n" + "std Z+2,r18\n" + "std Z+3,r19\n" + "std Z+4,r20\n" + "std Z+5,r21\n" + "std Z+6,r22\n" + "std Z+7,r23\n" + "ldi r24,8\n" + "add %A2,r24\n" + "add 
%B2,r24\n" + "ldi r24,0x1F\n" + "and %A2,r24\n" + "and %B2,r24\n" + "pop r23\n" + "pop r22\n" + "pop r21\n" + "pop r20\n" + "pop r19\n" + "pop r18\n" + "pop r17\n" + "pop r16\n" + "pop r15\n" + "pop r14\n" + "pop r13\n" + "pop r12\n" + "pop r11\n" + "pop r10\n" + "pop r9\n" + "pop r8\n" + "inc r25\n" + "rjmp 2b\n" + "4:\n" + "ldd r26,%A3\n" + "ldd r27,%B3\n" + "st X+,r15\n" + "st X+,r14\n" + "st X+,r13\n" + "st X+,r12\n" + "st X+,r11\n" + "st X+,r10\n" + "st X+,r9\n" + "st X+,r8\n" + "st X+,r23\n" + "st X+,r22\n" + "st X+,r21\n" + "st X+,r20\n" + "st X+,r19\n" + "st X+,r18\n" + "st X+,r17\n" + "st X,r16\n" + : : "x"(k), "z"(l), "r"(input), "Q"(output), "Q"(mb), "Q"(r) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory" + , "r24", "r25" ); #else uint64_t l[4];
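For callers nothing changes: SpeckTiny keeps the same setKey()/encryptBlock() interface shown in the context lines of this hunk. A minimal Arduino-style usage sketch (the key and plaintext bytes are placeholders, and the Crypto.h include follows the library's example sketches):

#include <Crypto.h>
#include <SpeckTiny.h>

SpeckTiny speck;

void setup() {
    uint8_t key[32] = {0};  // placeholder 256-bit key
    uint8_t in[16]  = {0};  // one 16-byte plaintext block
    uint8_t out[16];
    speck.setKey(key, sizeof(key));  // 16-, 24- and 32-byte keys are accepted
    speck.encryptBlock(out, in);     // encrypts a single block in ECB fashion
}

void loop() {}

SpeckTiny is the low-RAM, encrypt-only variant, so it is best suited to modes that only ever run the cipher in the forward direction, such as CTR, EAX or GCM.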