
Speed up Speck by using a custom AVR code generator

This also fixes the remaining asm issues with newer versions of gcc.
Rhys Weatherley 2017-11-03 10:46:39 +10:00
parent b53f57225d
commit 277a0b63c9
5 changed files with 1765 additions and 1043 deletions
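The new gen/genspeck.c below is a small host-side program that prints complete C++ member functions with inline AVR assembly bodies; its output replaces the hand-written assembly in Speck.cpp, SpeckSmall.cpp and SpeckTiny.cpp (each replaced block is marked "Automatically generated by the genspeck tool"). As a minimal sketch of the emission style only (emit() and this main() are illustrative and not part of the commit):

    #include <stdio.h>
    #include <stdarg.h>

    /* Print one quoted AVR instruction, indented and terminated with \n,
       ready to be pasted inside an __asm__ __volatile__ ( ... ) block. */
    static void emit(const char *format, ...)
    {
        va_list va;
        va_start(va, format);
        fputs("        \"", stdout);
        vfprintf(stdout, format, va);
        fputs("\\n\"\n", stdout);
        va_end(va);
    }

    int main(void)
    {
        emit("add r%d,r%d", 8, 16);   /* prints:         "add r8,r16\n" */
        emit("adc r%d,r%d", 9, 17);   /* prints:         "adc r9,r17\n" */
        return 0;
    }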

@@ -81,27 +81,27 @@ Arduino Mega 2560 running at 16 MHz are similar:
<tr><td>ChaCha (20 rounds)</td><td align="right">14.87us</td><td align="right">14.88us</td><td align="right">43.74us</td><td align="right">132</td></tr>
<tr><td>ChaCha (12 rounds)</td><td align="right">10.38us</td><td align="right">10.38us</td><td align="right">43.74us</td><td align="right">132</td></tr>
<tr><td>ChaCha (8 rounds)</td><td align="right">8.13us</td><td align="right">8.14us</td><td align="right">43.74us</td><td align="right">132</td></tr>
<tr><td>Speck (128-bit key, ECB mode)</td><td align="right">10.72us</td><td align="right">11.09us</td><td align="right">287.02us</td><td align="right">275</td></tr>
<tr><td>Speck (192-bit key, ECB mode)</td><td align="right">11.03us</td><td align="right">11.42us</td><td align="right">298.21us</td><td align="right">275</td></tr>
<tr><td>Speck (256-bit key, ECB mode)</td><td align="right">11.35us</td><td align="right">11.74us</td><td align="right">309.66us</td><td align="right">275</td></tr>
<tr><td>SpeckSmall (128-bit key, ECB mode)</td><td align="right">35.25us</td><td align="right">36.46us</td><td align="right">207.66us</td><td align="right">67</td></tr>
<tr><td>SpeckSmall (192-bit key, ECB mode)</td><td align="right">36.56us</td><td align="right">37.56us</td><td align="right">220.55us</td><td align="right">67</td></tr>
<tr><td>SpeckSmall (256-bit key, ECB mode)</td><td align="right">37.87us</td><td align="right">38.67us</td><td align="right">233.32us</td><td align="right">67</td></tr>
<tr><td>SpeckTiny (128-bit key, ECB mode)</td><td align="right">35.25us</td><td align="right"> </td><td align="right">10.22us</td><td align="right">35</td></tr>
<tr><td>SpeckTiny (192-bit key, ECB mode)</td><td align="right">36.56us</td><td align="right"> </td><td align="right">13.62us</td><td align="right">35</td></tr>
<tr><td>SpeckTiny (256-bit key, ECB mode)</td><td align="right">37.87us</td><td align="right"> </td><td align="right">16.89us</td><td align="right">35</td></tr>
<tr><td>Speck (128-bit key, ECB mode)</td><td align="right">9.74us</td><td align="right">10.12us</td><td align="right">253.94us</td><td align="right">275</td></tr>
<tr><td>Speck (192-bit key, ECB mode)</td><td align="right">10.03us</td><td align="right">10.41us</td><td align="right">264.63us</td><td align="right">275</td></tr>
<tr><td>Speck (256-bit key, ECB mode)</td><td align="right">10.31us</td><td align="right">10.71us</td><td align="right">275.26us</td><td align="right">275</td></tr>
<tr><td>SpeckSmall (128-bit key, ECB mode)</td><td align="right">33.93us</td><td align="right">34.82us</td><td align="right">207.66us</td><td align="right">67</td></tr>
<tr><td>SpeckSmall (192-bit key, ECB mode)</td><td align="right">35.20us</td><td align="right">35.88us</td><td align="right">220.55us</td><td align="right">67</td></tr>
<tr><td>SpeckSmall (256-bit key, ECB mode)</td><td align="right">36.46us</td><td align="right">36.93us</td><td align="right">233.32us</td><td align="right">67</td></tr>
<tr><td>SpeckTiny (128-bit key, ECB mode)</td><td align="right">33.93us</td><td align="right"> </td><td align="right">10.22us</td><td align="right">35</td></tr>
<tr><td>SpeckTiny (192-bit key, ECB mode)</td><td align="right">35.20us</td><td align="right"> </td><td align="right">13.62us</td><td align="right">35</td></tr>
<tr><td>SpeckTiny (256-bit key, ECB mode)</td><td align="right">36.46us</td><td align="right"> </td><td align="right">16.89us</td><td align="right">35</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td align="right">Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">41.20us</td><td align="right">41.19us</td><td align="right">902.36us</td><td align="right">221</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">109.71us</td><td align="right">109.26us</td><td align="right">1265.69us</td><td align="right">284</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">116.38us</td><td align="right">115.92us</td><td align="right">1485.56us</td><td align="right">316</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">123.04us</td><td align="right">122.59us</td><td align="right">1760.28us</td><td align="right">348</td></tr>
<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">87.78us</td><td align="right">87.32us</td><td align="right">714.41us</td><td align="right">378</td></tr>
<tr><td>GCM&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">114.30us</td><td align="right">113.84us</td><td align="right">1270.32us</td><td align="right">138</td></tr>
<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">86.74us</td><td align="right">86.29us</td><td align="right">646.88us</td><td align="right">378</td></tr>
<tr><td>GCM&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">112.90us</td><td align="right">112.44us</td><td align="right">1225.48us</td><td align="right">138</td></tr>
<tr><td>EAX&lt;AES128&gt;</td><td align="right">71.14us</td><td align="right">71.14us</td><td align="right">1311.97us</td><td align="right">268</td></tr>
<tr><td>EAX&lt;AES256&gt;</td><td align="right">97.80us</td><td align="right">97.80us</td><td align="right">1806.57us</td><td align="right">332</td></tr>
<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">27.27us</td><td align="right">27.26us</td><td align="right">760.74us</td><td align="right">362</td></tr>
<tr><td>EAX&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">80.31us</td><td align="right">80.31us</td><td align="right">1316.60us</td><td align="right">122</td></tr>
<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">25.89us</td><td align="right">25.88us</td><td align="right">690.63us</td><td align="right">362</td></tr>
<tr><td>EAX&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">78.20us</td><td align="right">78.20us</td><td align="right">1269.19us</td><td align="right">122</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Hash Algorithm</td><td align="right">Hashing (per byte)</td><td align="right">Finalization</td><td> </td><td>State Size (bytes)</td></tr>
<tr><td>SHA256</td><td align="right">43.85us</td><td align="right">2841.04us</td><td align="right"> </td><td align="right">107</td></tr>

gen/genspeck.c Normal file

@@ -0,0 +1,892 @@
/*
* Copyright (C) 2016 Southern Storm Software, Pty Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
// Special-purpose compiler that generates the AVR version of Speck*.
#include <stdio.h>
#include <stdarg.h>
static int indent = 4;
static int t1_reg = 8; // Temporary 64-bit value (any reg).
static int t2_reg = 16; // Temporary 64-bit value (any reg).
static int x_reg = 26;
//static int y_reg = 28;
static int z_reg = 30;
static int const_reg = 24; // For temporary constants (must be a high reg).
static int temp_reg = 25; // Spare temporary register.
// Information about a set of registers storing a 64-bit quantity.
typedef struct
{
int first; // First register in the set.
int offset; // Offset for multiple of 8 rotations.
} Reg64;
// Indent the code and print a string.
void indent_printf(const char *format, ...)
{
va_list va;
int posn;
va_start(va, format);
for (posn = 0; posn < indent; ++posn)
putc(' ', stdout);
vfprintf(stdout, format, va);
va_end(va);
}
// Print an assembler instruction within quotes.
void insn_printf(const char *format, ...)
{
va_list va;
int posn;
va_start(va, format);
for (posn = 0; posn < indent; ++posn)
putc(' ', stdout);
putc('"', stdout);
vfprintf(stdout, format, va);
putc('\\', stdout);
putc('n', stdout);
putc('"', stdout);
putc('\n', stdout);
va_end(va);
}
#define REGn(reg, n) ((reg)->first + ((n) + (reg)->offset) % 8)
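// REGn() maps logical byte n of a 64-bit value onto the register that
// currently holds it, so rotations by a multiple of 8 bits (leftRotate8,
// rightRotate8) cost nothing at runtime: they only adjust reg->offset.
// The helpers below emit genuine single-bit rotations.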
void leftRotate1(const Reg64 *reg)
{
insn_printf("lsl r%d", REGn(reg, 0));
insn_printf("rol r%d", REGn(reg, 1));
insn_printf("rol r%d", REGn(reg, 2));
insn_printf("rol r%d", REGn(reg, 3));
insn_printf("rol r%d", REGn(reg, 4));
insn_printf("rol r%d", REGn(reg, 5));
insn_printf("rol r%d", REGn(reg, 6));
insn_printf("rol r%d", REGn(reg, 7));
insn_printf("adc r%d, __zero_reg__", REGn(reg, 0));
}
void leftRotate3(const Reg64 *reg)
{
leftRotate1(reg);
leftRotate1(reg);
leftRotate1(reg);
}
void leftRotate8(Reg64 *reg)
{
reg->offset = (reg->offset + 7) % 8;
}
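// Emit a 1-bit right rotation of a 64-bit register set; bit 0 is parked
// in the T flag with bst and carried around into bit 63 with bld.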
void rightRotate1(const Reg64 *reg)
{
insn_printf("bst r%d,0", REGn(reg, 0));
insn_printf("ror r%d", REGn(reg, 7));
insn_printf("ror r%d", REGn(reg, 6));
insn_printf("ror r%d", REGn(reg, 5));
insn_printf("ror r%d", REGn(reg, 4));
insn_printf("ror r%d", REGn(reg, 3));
insn_printf("ror r%d", REGn(reg, 2));
insn_printf("ror r%d", REGn(reg, 1));
insn_printf("ror r%d", REGn(reg, 0));
insn_printf("bld r%d,7", REGn(reg, 7));
}
void rightRotate3(const Reg64 *reg)
{
rightRotate1(reg);
rightRotate1(reg);
rightRotate1(reg);
}
void rightRotate8(Reg64 *reg)
{
reg->offset = (reg->offset + 1) % 8;
}
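// The next three helpers emit a 64-bit add, subtract and XOR of one
// register set into another, one byte at a time (with carry/borrow
// propagation for add and subtract).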
void add64(const Reg64 *dst, const Reg64 *src)
{
insn_printf("add r%d,r%d", REGn(dst, 0), REGn(src, 0));
insn_printf("adc r%d,r%d", REGn(dst, 1), REGn(src, 1));
insn_printf("adc r%d,r%d", REGn(dst, 2), REGn(src, 2));
insn_printf("adc r%d,r%d", REGn(dst, 3), REGn(src, 3));
insn_printf("adc r%d,r%d", REGn(dst, 4), REGn(src, 4));
insn_printf("adc r%d,r%d", REGn(dst, 5), REGn(src, 5));
insn_printf("adc r%d,r%d", REGn(dst, 6), REGn(src, 6));
insn_printf("adc r%d,r%d", REGn(dst, 7), REGn(src, 7));
}
void sub64(const Reg64 *dst, const Reg64 *src)
{
insn_printf("sub r%d,r%d", REGn(dst, 0), REGn(src, 0));
insn_printf("sbc r%d,r%d", REGn(dst, 1), REGn(src, 1));
insn_printf("sbc r%d,r%d", REGn(dst, 2), REGn(src, 2));
insn_printf("sbc r%d,r%d", REGn(dst, 3), REGn(src, 3));
insn_printf("sbc r%d,r%d", REGn(dst, 4), REGn(src, 4));
insn_printf("sbc r%d,r%d", REGn(dst, 5), REGn(src, 5));
insn_printf("sbc r%d,r%d", REGn(dst, 6), REGn(src, 6));
insn_printf("sbc r%d,r%d", REGn(dst, 7), REGn(src, 7));
}
void eor64(const Reg64 *dst, const Reg64 *src)
{
insn_printf("eor r%d,r%d", REGn(dst, 0), REGn(src, 0));
insn_printf("eor r%d,r%d", REGn(dst, 1), REGn(src, 1));
insn_printf("eor r%d,r%d", REGn(dst, 2), REGn(src, 2));
insn_printf("eor r%d,r%d", REGn(dst, 3), REGn(src, 3));
insn_printf("eor r%d,r%d", REGn(dst, 4), REGn(src, 4));
insn_printf("eor r%d,r%d", REGn(dst, 5), REGn(src, 5));
insn_printf("eor r%d,r%d", REGn(dst, 6), REGn(src, 6));
insn_printf("eor r%d,r%d", REGn(dst, 7), REGn(src, 7));
}
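// XOR the next 64-bit key schedule word (read from Z with post-increment)
// into the register set; loading each byte into the slot below it makes
// the sequence double as a free rotation, which is compensated for below.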
void eor64Schedule(Reg64 *reg)
{
// XOR with the schedule.
insn_printf("ld __tmp_reg__,Z+");
insn_printf("eor __tmp_reg__,r%d", REGn(reg, 0));
insn_printf("ld r%d,Z+", REGn(reg, 0));
insn_printf("eor r%d,r%d", REGn(reg, 0), REGn(reg, 1));
insn_printf("ld r%d,Z+", REGn(reg, 1));
insn_printf("eor r%d,r%d", REGn(reg, 1), REGn(reg, 2));
insn_printf("ld r%d,Z+", REGn(reg, 2));
insn_printf("eor r%d,r%d", REGn(reg, 2), REGn(reg, 3));
insn_printf("ld r%d,Z+", REGn(reg, 3));
insn_printf("eor r%d,r%d", REGn(reg, 3), REGn(reg, 4));
insn_printf("ld r%d,Z+", REGn(reg, 4));
insn_printf("eor r%d,r%d", REGn(reg, 4), REGn(reg, 5));
insn_printf("ld r%d,Z+", REGn(reg, 5));
insn_printf("eor r%d,r%d", REGn(reg, 5), REGn(reg, 6));
insn_printf("ld r%d,Z+", REGn(reg, 6));
insn_printf("eor r%d,r%d", REGn(reg, 6), REGn(reg, 7));
insn_printf("mov r%d,__tmp_reg__", REGn(reg, 7));
// The above operations also implicitly perform a right-rotation.
// Undo it by left-shifting back into the correct position.
leftRotate8(reg);
}
void eor64ScheduleReversePtr(Reg64 *reg, const char *ptrReg)
{
// XOR with the schedule.
insn_printf("ld __tmp_reg__,-%s", ptrReg);
insn_printf("eor __tmp_reg__,r%d", REGn(reg, 7));
insn_printf("ld r%d,-%s", REGn(reg, 7), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 7), REGn(reg, 6));
insn_printf("ld r%d,-%s", REGn(reg, 6), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 6), REGn(reg, 5));
insn_printf("ld r%d,-%s", REGn(reg, 5), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 5), REGn(reg, 4));
insn_printf("ld r%d,-%s", REGn(reg, 4), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 4), REGn(reg, 3));
insn_printf("ld r%d,-%s", REGn(reg, 3), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 3), REGn(reg, 2));
insn_printf("ld r%d,-%s", REGn(reg, 2), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 2), REGn(reg, 1));
insn_printf("ld r%d,-%s", REGn(reg, 1), ptrReg);
insn_printf("eor r%d,r%d", REGn(reg, 1), REGn(reg, 0));
insn_printf("mov r%d,__tmp_reg__", REGn(reg, 0));
// The above operations also implicitly perform a left-rotation.
// Undo it by right-shifting back into the correct position.
// We have to do this twice because the following step will
// apply a left-rotation to put everything back where it belongs.
rightRotate8(reg);
rightRotate8(reg);
}
void eor64ScheduleReverse(Reg64 *reg)
{
eor64ScheduleReversePtr(reg, "Z");
}
void eor64ScheduleReverseX(Reg64 *reg)
{
eor64ScheduleReversePtr(reg, "X");
}
// Unpack the input block and convert from big-endian to little-endian.
static void unpack_input(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
insn_printf("ld r%d,X+", REGn(&xreg, 7));
insn_printf("ld r%d,X+", REGn(&xreg, 6));
insn_printf("ld r%d,X+", REGn(&xreg, 5));
insn_printf("ld r%d,X+", REGn(&xreg, 4));
insn_printf("ld r%d,X+", REGn(&xreg, 3));
insn_printf("ld r%d,X+", REGn(&xreg, 2));
insn_printf("ld r%d,X+", REGn(&xreg, 1));
insn_printf("ld r%d,X+", REGn(&xreg, 0));
insn_printf("ld r%d,X+", REGn(&yreg, 7));
insn_printf("ld r%d,X+", REGn(&yreg, 6));
insn_printf("ld r%d,X+", REGn(&yreg, 5));
insn_printf("ld r%d,X+", REGn(&yreg, 4));
insn_printf("ld r%d,X+", REGn(&yreg, 3));
insn_printf("ld r%d,X+", REGn(&yreg, 2));
insn_printf("ld r%d,X+", REGn(&yreg, 1));
insn_printf("ld r%d,X", REGn(&yreg, 0));
}
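// Helpers for moving a 64-bit register set to and from memory through the
// X and Z pointers, and for saving/restoring it on the stack.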
static void load_from_x(Reg64 *reg)
{
insn_printf("ld r%d,X+", REGn(reg, 0));
insn_printf("ld r%d,X+", REGn(reg, 1));
insn_printf("ld r%d,X+", REGn(reg, 2));
insn_printf("ld r%d,X+", REGn(reg, 3));
insn_printf("ld r%d,X+", REGn(reg, 4));
insn_printf("ld r%d,X+", REGn(reg, 5));
insn_printf("ld r%d,X+", REGn(reg, 6));
insn_printf("ld r%d,X+", REGn(reg, 7));
}
static void store_to_x(Reg64 *reg)
{
insn_printf("st X+,r%d", REGn(reg, 0));
insn_printf("st X+,r%d", REGn(reg, 1));
insn_printf("st X+,r%d", REGn(reg, 2));
insn_printf("st X+,r%d", REGn(reg, 3));
insn_printf("st X+,r%d", REGn(reg, 4));
insn_printf("st X+,r%d", REGn(reg, 5));
insn_printf("st X+,r%d", REGn(reg, 6));
insn_printf("st X+,r%d", REGn(reg, 7));
}
static void load_from_z(Reg64 *reg)
{
insn_printf("ld r%d,Z+", REGn(reg, 0));
insn_printf("ld r%d,Z+", REGn(reg, 1));
insn_printf("ld r%d,Z+", REGn(reg, 2));
insn_printf("ld r%d,Z+", REGn(reg, 3));
insn_printf("ld r%d,Z+", REGn(reg, 4));
insn_printf("ld r%d,Z+", REGn(reg, 5));
insn_printf("ld r%d,Z+", REGn(reg, 6));
insn_printf("ld r%d,Z+", REGn(reg, 7));
}
static void store_to_z(Reg64 *reg)
{
insn_printf("st Z+,r%d", REGn(reg, 0));
insn_printf("st Z+,r%d", REGn(reg, 1));
insn_printf("st Z+,r%d", REGn(reg, 2));
insn_printf("st Z+,r%d", REGn(reg, 3));
insn_printf("st Z+,r%d", REGn(reg, 4));
insn_printf("st Z+,r%d", REGn(reg, 5));
insn_printf("st Z+,r%d", REGn(reg, 6));
insn_printf("st Z+,r%d", REGn(reg, 7));
}
static void push64(Reg64 *reg)
{
reg->offset = 0;
insn_printf("push r%d", REGn(reg, 0));
insn_printf("push r%d", REGn(reg, 1));
insn_printf("push r%d", REGn(reg, 2));
insn_printf("push r%d", REGn(reg, 3));
insn_printf("push r%d", REGn(reg, 4));
insn_printf("push r%d", REGn(reg, 5));
insn_printf("push r%d", REGn(reg, 6));
insn_printf("push r%d", REGn(reg, 7));
}
static void pop64(Reg64 *reg)
{
reg->offset = 0;
insn_printf("pop r%d", REGn(reg, 7));
insn_printf("pop r%d", REGn(reg, 6));
insn_printf("pop r%d", REGn(reg, 5));
insn_printf("pop r%d", REGn(reg, 4));
insn_printf("pop r%d", REGn(reg, 3));
insn_printf("pop r%d", REGn(reg, 2));
insn_printf("pop r%d", REGn(reg, 1));
insn_printf("pop r%d", REGn(reg, 0));
}
// Main loop for Speck::encryptBlock().
static void full_enc_main_loop(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
// Top of the main loop.
insn_printf("1:");
// x = (rightRotate8_64(x) + y) ^ *s++;
rightRotate8(&xreg);
add64(&xreg, &yreg);
eor64Schedule(&xreg);
// y = leftRotate3_64(y) ^ x;
leftRotate3(&yreg);
eor64(&yreg, &xreg);
// Bottom of the main loop.
insn_printf("dec %%2");
insn_printf("breq 2f");
insn_printf("rjmp 1b");
insn_printf("2:");
}
// Main loop for Speck::decryptBlock().
static void full_dec_main_loop(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
// Top of the main loop.
insn_printf("1:");
// y = rightRotate3_64(x ^ y);
eor64(&yreg, &xreg);
rightRotate3(&yreg);
// x = leftRotate8_64((x ^ *s--) - y);
eor64ScheduleReverse(&xreg);
leftRotate8(&xreg);
sub64(&xreg, &yreg);
// Bottom of the main loop.
insn_printf("dec %%2");
insn_printf("breq 2f");
insn_printf("rjmp 1b");
insn_printf("2:");
}
// Pack the output block and convert from little-endian to big-endian.
static void pack_output(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
insn_printf("ldd r%d,%%A3", x_reg);
insn_printf("ldd r%d,%%B3", x_reg + 1);
insn_printf("st X+,r%d", REGn(&xreg, 7));
insn_printf("st X+,r%d", REGn(&xreg, 6));
insn_printf("st X+,r%d", REGn(&xreg, 5));
insn_printf("st X+,r%d", REGn(&xreg, 4));
insn_printf("st X+,r%d", REGn(&xreg, 3));
insn_printf("st X+,r%d", REGn(&xreg, 2));
insn_printf("st X+,r%d", REGn(&xreg, 1));
insn_printf("st X+,r%d", REGn(&xreg, 0));
insn_printf("st X+,r%d", REGn(&yreg, 7));
insn_printf("st X+,r%d", REGn(&yreg, 6));
insn_printf("st X+,r%d", REGn(&yreg, 5));
insn_printf("st X+,r%d", REGn(&yreg, 4));
insn_printf("st X+,r%d", REGn(&yreg, 3));
insn_printf("st X+,r%d", REGn(&yreg, 2));
insn_printf("st X+,r%d", REGn(&yreg, 1));
insn_printf("st X,r%d", REGn(&yreg, 0));
}
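// Emit the clobber list for the two 64-bit temporary register sets used by
// the generated inline assembly.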
static void temp_regs(void)
{
indent_printf(": \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
"\"r%d\", \"r%d\", \"r%d\", \"r%d\",\n",
t1_reg, t1_reg + 1, t1_reg + 2, t1_reg + 3,
t1_reg + 4, t1_reg + 5, t1_reg + 6, t1_reg + 7);
indent_printf(" \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
"\"r%d\", \"r%d\", \"r%d\", \"r%d\", \"memory\"\n",
t2_reg, t2_reg + 1, t2_reg + 2, t2_reg + 3,
t2_reg + 4, t2_reg + 5, t2_reg + 6, t2_reg + 7);
}
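// Generate Speck::setKey(): copy the key into k[0] and the l[] array with
// an endian swap, expand the full key schedule in-place, then wipe l[].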
static void full_setkey(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
printf("void Speck::setKey(const uint8_t *key, size_t len)\n");
printf("{\n");
indent_printf("// Automatically generated by the genspeck tool.\n");
// Validate the key length.
indent_printf("uint64_t l[4];\n");
indent_printf("uint8_t m, mb;\n");
indent_printf("if (len == 32) {\n");
indent_printf(" m = 4;\n");
indent_printf(" mb = 3 * 8;\n");
indent_printf("} else if (len == 24) {\n");
indent_printf(" m = 3;\n");
indent_printf(" mb = 2 * 8;\n");
indent_printf("} else if (len == 16) {\n");
indent_printf(" m = 2;\n");
indent_printf(" mb = 8;\n");
indent_printf("} else {\n");
indent_printf(" return false;\n");
indent_printf("}\n");
indent_printf("rounds = 30 + m;\n");
indent_printf("uint8_t r = rounds - 1;\n");
indent_printf("__asm__ __volatile__ (\n");
indent += 4;
// Copy the key into k[0] and l while converting endianness.
insn_printf("ld __tmp_reg__,-X"); // k[0] = last 8 bytes of the key
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("ld __tmp_reg__,-X");
insn_printf("st Z+,__tmp_reg__");
insn_printf("sbiw r%d,8", z_reg); // Set Z back to beginning of k
insn_printf("movw r%d,r%d", t1_reg + 2, z_reg); // Save Z
insn_printf("movw r%d,%%A2", z_reg); // Z = l
insn_printf("ldd r%d,%%3", t1_reg);
insn_printf("1:");
insn_printf("ld __tmp_reg__,-X"); // Copy first mb bytes from key
insn_printf("st Z+,__tmp_reg__");
insn_printf("dec r%d", t1_reg);
insn_printf("brne 1b");
insn_printf("movw r%d,%%A2", x_reg); // X = l
insn_printf("movw r%d,r%d", z_reg, t1_reg + 2); // Z = k
// Expand the key to the full key schedule.
// Note: We can use %A2 and %B2 as spare temporary registers now.
insn_printf("clr %%A2"); // %A2 = li_in = 0
insn_printf("ldd %%B2,%%3"); // %B2 = li_out = mb (= (m - 1) * 8)
insn_printf("clr r%d", temp_reg); // i = 0
load_from_z(&yreg); // y = k[i]
insn_printf("2:");
// l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i
insn_printf("add r%d,%%A2", x_reg); // x = rightRotate8_64(l[li_in])
insn_printf("adc r%d,__zero_reg__", x_reg + 1);
xreg.offset = 7;
load_from_x(&xreg);
xreg.offset = 0;
insn_printf("sub r%d,%%A2", x_reg); // restore X to point at base of l
insn_printf("sbc r%d,__zero_reg__", x_reg + 1);
insn_printf("sbiw r%d,8", x_reg);
add64(&xreg, &yreg); // x += y
insn_printf("eor r%d,r%d", REGn(&xreg, 0), temp_reg); // x ^= i
insn_printf("add r%d,%%B2", x_reg); // l[li_out] = x
insn_printf("adc r%d,__zero_reg__", x_reg + 1);
store_to_x(&xreg);
insn_printf("sub r%d,%%B2", x_reg); // restore X to point at base of l
insn_printf("sbc r%d,__zero_reg__", x_reg + 1);
insn_printf("sbiw r%d,8", x_reg);
// k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
leftRotate3(&yreg); // y = leftRotate3(y)
eor64(&yreg, &xreg); // y ^= x
store_to_z(&yreg); // k[i + 1] = y
// Advance li_in and li_out, wrapping around at the end of l.
insn_printf("ldi r%d,8", const_reg);
insn_printf("add %%A2,r%d", const_reg);
insn_printf("add %%B2,r%d", const_reg);
insn_printf("ldi r%d,0x1F", const_reg);
insn_printf("and %%A2,r%d", const_reg);
insn_printf("and %%B2,r%d", const_reg);
// Bottom of the loop.
insn_printf("ldd r%d,%%4", t1_reg); // r8 = rounds - 1
insn_printf("inc r%d", temp_reg); // ++i
insn_printf("cp r%d,r%d", temp_reg, t1_reg);
insn_printf("breq 3f");
insn_printf("rjmp 2b");
insn_printf("3:");
// Clean the l array. X register should still be pointing to it.
insn_printf("ldi r%d,32", const_reg);
insn_printf("4:");
insn_printf("st X+,__zero_reg__");
insn_printf("dec r%d", const_reg);
insn_printf("brne 4b");
// Declare the registers that we need.
indent_printf(": : \"z\"(k), \"x\"(key + len), \"r\"(l), \"Q\"(mb), \"Q\"(r)\n");
temp_regs();
indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg);
indent -= 4;
indent_printf(");\n");
// End of function.
indent_printf("return true;\n");
printf("}\n\n");
}
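// Generate Speck::encryptBlock(), which runs all rounds against the
// precomputed key schedule pointed to by Z.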
static void full_enc(void)
{
printf("void Speck::encryptBlock(uint8_t *output, const uint8_t *input)\n");
printf("{\n");
indent_printf("// Automatically generated by the genspeck tool.\n");
indent_printf("__asm__ __volatile__ (\n");
indent += 4;
unpack_input();
full_enc_main_loop();
pack_output();
indent_printf(": : \"x\"(input), \"z\"(k), \"r\"(rounds), \"Q\"(output)\n");
temp_regs();
indent -= 4;
indent_printf(");\n");
printf("}\n\n");
}
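// Generate Speck::decryptBlock(), which walks the key schedule backwards
// starting from k + rounds.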
static void full_dec(void)
{
printf("void Speck::decryptBlock(uint8_t *output, const uint8_t *input)\n");
printf("{\n");
indent_printf("// Automatically generated by the genspeck tool.\n");
indent_printf("__asm__ __volatile__ (\n");
indent += 4;
unpack_input();
full_dec_main_loop();
pack_output();
indent_printf(": : \"x\"(input), \"z\"(k + rounds), \"r\"(rounds), \"Q\"(output)\n");
temp_regs();
indent -= 4;
indent_printf(");\n");
printf("}\n\n");
}
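// Generate SpeckTiny::encryptBlock(), which expands the key schedule on
// the fly in a small local l[] array instead of storing it all in RAM.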
static void tiny_enc(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
printf("void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input)\n");
printf("{\n");
indent_printf("// Automatically generated by the genspeck tool.\n");
indent_printf("uint64_t l[5];\n");
indent_printf("uint8_t r = rounds;\n");
indent_printf("uint8_t mb = (r - 31) * 8;\n");
// Copy the "k" array into the "l" array. The first element is "s"
// and the rest of the elements make up the normal l[0..3] values.
indent_printf("__asm__ __volatile__ (\n");
indent += 4;
insn_printf("movw r%d,r%d", t1_reg, z_reg); // Save Z
insn_printf("ldd r%d,%%4", t2_reg);
insn_printf("ldi r%d,8", const_reg);
insn_printf("add r%d,r%d", t2_reg, const_reg);
insn_printf("1:");
insn_printf("ld __tmp_reg__,X+");
insn_printf("st Z+,__tmp_reg__");
insn_printf("dec r%d", t2_reg);
insn_printf("brne 1b");
insn_printf("movw r%d,r%d", z_reg, t1_reg); // Restore Z to point at l
// Unpack the input. %A2 and %B2 are free temporary registers after this.
insn_printf("movw r%d,%%A2", x_reg);
unpack_input();
// Top of the loop.
insn_printf("clr %%A2"); // %A2 = li_in = 0
insn_printf("ldd %%B2,%%4"); // %B2 = li_out = mb
insn_printf("clr r%d", temp_reg); // i = 0
insn_printf("2:");
// Adjust x and y for this round using the key schedule word s (in l[0]).
// x = (rightRotate8_64(x) + y) ^ s;
rightRotate8(&xreg);
add64(&xreg, &yreg);
eor64Schedule(&xreg);
// y = leftRotate3_64(y) ^ x;
leftRotate3(&yreg);
eor64(&yreg, &xreg);
// At this point, Z has been incremented to point at l[1] which
// is the start of the actual l[0] from the original formulation.
// If this is the last round, then we are done. There is no
// point calculating another key schedule element.
insn_printf("mov __tmp_reg__,r%d", temp_reg);
insn_printf("inc __tmp_reg__");
insn_printf("ldd r%d,%%5", const_reg);
insn_printf("cp __tmp_reg__,r%d", const_reg);
insn_printf("brne 3f");
insn_printf("rjmp 4f");
insn_printf("3:");
// Save x and y on the stack - we need the registers to
// help us compute the next key schedule element.
push64(&xreg);
push64(&yreg);
// Compute the key schedule word s for the next round.
insn_printf("sbiw r%d,8", z_reg); // Point Z back at l[0]
// l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i;
insn_printf("ld r%d,Z", REGn(&yreg, 0)); // y = s
insn_printf("ldd r%d,Z+1", REGn(&yreg, 1));
insn_printf("ldd r%d,Z+2", REGn(&yreg, 2));
insn_printf("ldd r%d,Z+3", REGn(&yreg, 3));
insn_printf("ldd r%d,Z+4", REGn(&yreg, 4));
insn_printf("ldd r%d,Z+5", REGn(&yreg, 5));
insn_printf("ldd r%d,Z+6", REGn(&yreg, 6));
insn_printf("ldd r%d,Z+7", REGn(&yreg, 7));
insn_printf("add r%d,%%A2", z_reg); // Z = &(l[li_in]) - 8
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
leftRotate8(&xreg); // x = rightRotate8(l[li_in])
insn_printf("ldd r%d,Z+8", REGn(&xreg, 0));
insn_printf("ldd r%d,Z+9", REGn(&xreg, 1));
insn_printf("ldd r%d,Z+10", REGn(&xreg, 2));
insn_printf("ldd r%d,Z+11", REGn(&xreg, 3));
insn_printf("ldd r%d,Z+12", REGn(&xreg, 4));
insn_printf("ldd r%d,Z+13", REGn(&xreg, 5));
insn_printf("ldd r%d,Z+14", REGn(&xreg, 6));
insn_printf("ldd r%d,Z+15", REGn(&xreg, 7));
rightRotate8(&xreg);
add64(&xreg, &yreg); // x += y
insn_printf("eor r%d,r%d", REGn(&xreg, 0), temp_reg); // x ^= i
insn_printf("sub r%d,%%A2", z_reg); // Z = &(l[li_out]) - 8
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
insn_printf("add r%d,%%B2", z_reg);
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
insn_printf("std Z+8,r%d", REGn(&xreg, 0)); // l[li_out] = x
insn_printf("std Z+9,r%d", REGn(&xreg, 1));
insn_printf("std Z+10,r%d", REGn(&xreg, 2));
insn_printf("std Z+11,r%d", REGn(&xreg, 3));
insn_printf("std Z+12,r%d", REGn(&xreg, 4));
insn_printf("std Z+13,r%d", REGn(&xreg, 5));
insn_printf("std Z+14,r%d", REGn(&xreg, 6));
insn_printf("std Z+15,r%d", REGn(&xreg, 7));
insn_printf("sub r%d,%%B2", z_reg); // Restore Z to base of l array
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
// s = leftRotate3_64(s) ^ l[li_out];
leftRotate3(&yreg);
eor64(&yreg, &xreg);
insn_printf("st Z,r%d", REGn(&yreg, 0));
insn_printf("std Z+1,r%d", REGn(&yreg, 1));
insn_printf("std Z+2,r%d", REGn(&yreg, 2));
insn_printf("std Z+3,r%d", REGn(&yreg, 3));
insn_printf("std Z+4,r%d", REGn(&yreg, 4));
insn_printf("std Z+5,r%d", REGn(&yreg, 5));
insn_printf("std Z+6,r%d", REGn(&yreg, 6));
insn_printf("std Z+7,r%d", REGn(&yreg, 7));
// Advance li_in and li_out, wrapping around at the end of l.
insn_printf("ldi r%d,8", const_reg);
insn_printf("add %%A2,r%d", const_reg);
insn_printf("add %%B2,r%d", const_reg);
insn_printf("ldi r%d,0x1F", const_reg);
insn_printf("and %%A2,r%d", const_reg);
insn_printf("and %%B2,r%d", const_reg);
// Restore the original x and y.
pop64(&yreg);
pop64(&xreg);
// Bottom of the loop.
insn_printf("inc r%d", temp_reg); // i++
insn_printf("rjmp 2b");
insn_printf("4:");
// Pack the results into the output buffer.
pack_output();
// Declare the registers that we need.
indent_printf(": : \"x\"(k), \"z\"(l), \"r\"(input), \"Q\"(output), \"Q\"(mb), \"Q\"(r)\n");
temp_regs();
indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg);
indent -= 4;
indent_printf(");\n");
printf("}\n\n");
}
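// Generate SpeckSmall::decryptBlock(), which reconstructs the key schedule
// in reverse on the fly from the saved l[] state.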
static void small_dec(void)
{
Reg64 xreg = {t1_reg, 0};
Reg64 yreg = {t2_reg, 0};
printf("void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input)\n");
printf("{\n");
indent_printf("// Automatically generated by the genspeck tool.\n");
indent_printf("uint64_t l[5];\n");
indent_printf("uint8_t r = rounds;\n");
indent_printf("uint8_t li_in = ((r + 3) & 0x03) * 8;\n");
indent_printf("uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F;\n");
indent_printf("__asm__ __volatile__ (\n");
indent += 4;
// Copy the this->l array into the local l array. Then copy
// the "s" value from l[li_out] to l[4].
insn_printf("ldd r%d,%%4", temp_reg); // r25 = li_out
insn_printf("ldi r%d,32", const_reg); // Copy 32 bytes from this->l.
insn_printf("1:");
insn_printf("ld __tmp_reg__,X+");
insn_printf("st Z+,__tmp_reg__");
insn_printf("dec r%d", const_reg);
insn_printf("brne 1b");
insn_printf("movw r%d,r%d", x_reg, z_reg); // X = Z + 32
insn_printf("sbiw r%d,32", z_reg); // Z = &(l[li_out])
insn_printf("add r%d,r%d", z_reg, temp_reg);
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
insn_printf("ld __tmp_reg__,Z"); // Copy l[li_out] to l[4]
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+1");
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+2");
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+3");
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+4");
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+5");
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+6");
insn_printf("st X+,__tmp_reg__");
insn_printf("ldd __tmp_reg__,Z+7");
insn_printf("st X+,__tmp_reg__");
insn_printf("sub r%d,r%d", z_reg, temp_reg); // Z = &(l[0])
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
// Unpack the input. %A2 and %B2 are free temporary registers after this.
insn_printf("movw r%d,%%A2", x_reg);
unpack_input();
// Top of the loop.
insn_printf("ldd %%A2,%%6"); // %A2 = li_in
insn_printf("mov %%B2,r%d", temp_reg); // %B2 = li_out
insn_printf("ldd r%d,%%5", temp_reg); // i = rounds - 1
insn_printf("dec r%d", temp_reg);
insn_printf("movw r%d,r%d", x_reg, z_reg); // X = Z + 40 = &(l[5])
insn_printf("adiw r%d,40", x_reg); // i.e. point to end of l[4]
insn_printf("2:");
// Adjust x and y for this round using the key schedule word s (in l[4]).
// y = rightRotate3_64(x ^ y);
eor64(&yreg, &xreg);
rightRotate3(&yreg);
// x = leftRotate8_64((x ^ s) - y);
eor64ScheduleReverseX(&xreg);
leftRotate8(&xreg);
sub64(&xreg, &yreg);
// If this is the last round, then we are done. There is no
// point calculating another key schedule element.
insn_printf("or r%d,r%d", temp_reg, temp_reg); // if (i == 0)
insn_printf("brne 3f");
insn_printf("rjmp 4f");
insn_printf("3:");
insn_printf("dec r%d", temp_reg); // --i
// Save x and y on the stack - we need the registers to
// help us compute the next key schedule element.
push64(&xreg);
push64(&yreg);
// Move li_in and li_out backwards, wrapping around at the start of l.
insn_printf("ldi r%d,24", const_reg);
insn_printf("add %%A2,r%d", const_reg);
insn_printf("add %%B2,r%d", const_reg);
insn_printf("ldi r%d,0x1F", const_reg);
insn_printf("and %%A2,r%d", const_reg);
insn_printf("and %%B2,r%d", const_reg);
// Compute the key schedule word s for the next round.
// s = rightRotate3_64(s ^ l[li_out]);
insn_printf("ld r%d,X+", REGn(&yreg, 0)); // y = s = l[4]
insn_printf("ld r%d,X+", REGn(&yreg, 1));
insn_printf("ld r%d,X+", REGn(&yreg, 2));
insn_printf("ld r%d,X+", REGn(&yreg, 3));
insn_printf("ld r%d,X+", REGn(&yreg, 4));
insn_printf("ld r%d,X+", REGn(&yreg, 5));
insn_printf("ld r%d,X+", REGn(&yreg, 6));
insn_printf("ld r%d,X+", REGn(&yreg, 7));
insn_printf("add r%d,%%B2", z_reg); // Z = &(l[li_out])
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
insn_printf("ld r%d,Z", REGn(&xreg, 0)); // x = l[li_out]
insn_printf("ldd r%d,Z+1", REGn(&xreg, 1));
insn_printf("ldd r%d,Z+2", REGn(&xreg, 2));
insn_printf("ldd r%d,Z+3", REGn(&xreg, 3));
insn_printf("ldd r%d,Z+4", REGn(&xreg, 4));
insn_printf("ldd r%d,Z+5", REGn(&xreg, 5));
insn_printf("ldd r%d,Z+6", REGn(&xreg, 6));
insn_printf("ldd r%d,Z+7", REGn(&xreg, 7));
insn_printf("sub r%d,%%B2", z_reg); // Z = &(l[0])
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
eor64(&yreg, &xreg);
rightRotate3(&yreg);
insn_printf("st -X,r%d", REGn(&yreg, 7)); // store s back into l[4]
insn_printf("st -X,r%d", REGn(&yreg, 6));
insn_printf("st -X,r%d", REGn(&yreg, 5));
insn_printf("st -X,r%d", REGn(&yreg, 4));
insn_printf("st -X,r%d", REGn(&yreg, 3));
insn_printf("st -X,r%d", REGn(&yreg, 2));
insn_printf("st -X,r%d", REGn(&yreg, 1));
insn_printf("st -X,r%d", REGn(&yreg, 0));
insn_printf("adiw r%d,8", x_reg); // X = &(l[5])
// l[li_in] = leftRotate8_64((l[li_out] ^ i) - s);
insn_printf("eor r%d,r%d", t1_reg, temp_reg); // x ^= i
sub64(&xreg, &yreg); // x -= s
leftRotate8(&xreg); // x = leftRotate8(x)
insn_printf("add r%d,%%A2", z_reg); // Z = &(l[li_in])
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
insn_printf("st Z,r%d", REGn(&xreg, 0)); // l[li_in] = x
insn_printf("std Z+1,r%d", REGn(&xreg, 1));
insn_printf("std Z+2,r%d", REGn(&xreg, 2));
insn_printf("std Z+3,r%d", REGn(&xreg, 3));
insn_printf("std Z+4,r%d", REGn(&xreg, 4));
insn_printf("std Z+5,r%d", REGn(&xreg, 5));
insn_printf("std Z+6,r%d", REGn(&xreg, 6));
insn_printf("std Z+7,r%d", REGn(&xreg, 7));
insn_printf("sub r%d,%%A2", z_reg); // Z = &(l[0])
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
// Restore the original x and y.
pop64(&yreg);
pop64(&xreg);
// Bottom of the loop.
insn_printf("rjmp 2b");
insn_printf("4:");
// Pack the results into the output buffer.
pack_output();
// Declare the registers that we need.
indent_printf(": : \"x\"(this->l), \"z\"(l), \"r\"(input), \"Q\"(output), \"Q\"(li_out), \"Q\"(r), \"Q\"(li_in)\n");
temp_regs();
indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg);
indent -= 4;
indent_printf(");\n");
printf("}\n\n");
}
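// Emit all five generated functions to stdout.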
int main(int argc, char *argv[])
{
full_setkey();
full_enc();
full_dec();
tiny_enc();
small_dec();
return 0;
}

@@ -105,6 +105,7 @@ size_t Speck::keySize() const
bool Speck::setKey(const uint8_t *key, size_t len)
{
#if USE_AVR_INLINE_ASM
// Automatically generated by the genspeck tool.
uint64_t l[4];
uint8_t m, mb;
if (len == 32) {
@@ -120,134 +121,148 @@ bool Speck::setKey(const uint8_t *key, size_t len)
return false;
}
rounds = 30 + m;
// Copy the first (m - 1) * 8 bytes of the key into the "l" array
// in reverse order to convert big endian into little-endian.
uint8_t r = rounds - 1;
__asm__ __volatile__ (
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"sbiw r30,8\n"
"movw r10,r30\n"
"movw r30,%A2\n"
"ldd r8,%3\n"
"1:\n"
"ld __tmp_reg__,-Z\n"
"st X+,__tmp_reg__\n"
"dec %2\n"
"ld __tmp_reg__,-X\n"
"st Z+,__tmp_reg__\n"
"dec r8\n"
"brne 1b\n"
: : "x"(l), "z"(key + len - 8), "r"(mb)
"movw r26,%A2\n"
"movw r30,r10\n"
"clr %A2\n"
"ldd %B2,%3\n"
"clr r25\n"
"ld r16,Z+\n"
"ld r17,Z+\n"
"ld r18,Z+\n"
"ld r19,Z+\n"
"ld r20,Z+\n"
"ld r21,Z+\n"
"ld r22,Z+\n"
"ld r23,Z+\n"
"2:\n"
"add r26,%A2\n"
"adc r27,__zero_reg__\n"
"ld r15,X+\n"
"ld r8,X+\n"
"ld r9,X+\n"
"ld r10,X+\n"
"ld r11,X+\n"
"ld r12,X+\n"
"ld r13,X+\n"
"ld r14,X+\n"
"sub r26,%A2\n"
"sbc r27,__zero_reg__\n"
"sbiw r26,8\n"
"add r8,r16\n"
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,r25\n"
"add r26,%B2\n"
"adc r27,__zero_reg__\n"
"st X+,r8\n"
"st X+,r9\n"
"st X+,r10\n"
"st X+,r11\n"
"st X+,r12\n"
"st X+,r13\n"
"st X+,r14\n"
"st X+,r15\n"
"sub r26,%B2\n"
"sbc r27,__zero_reg__\n"
"sbiw r26,8\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z+,r16\n"
"st Z+,r17\n"
"st Z+,r18\n"
"st Z+,r19\n"
"st Z+,r20\n"
"st Z+,r21\n"
"st Z+,r22\n"
"st Z+,r23\n"
"ldi r24,8\n"
"add %A2,r24\n"
"add %B2,r24\n"
"ldi r24,0x1F\n"
"and %A2,r24\n"
"and %B2,r24\n"
"ldd r8,%4\n"
"inc r25\n"
"cp r25,r8\n"
"breq 3f\n"
"rjmp 2b\n"
"3:\n"
"ldi r24,32\n"
"4:\n"
"st X+,__zero_reg__\n"
"dec r24\n"
"brne 4b\n"
: : "z"(k), "x"(key + len), "r"(l), "Q"(mb), "Q"(r)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
, "r24", "r25"
);
// Copy the final 8 bytes of the key into k[0] in reverse order.
__asm__ __volatile__ (
"1:\n"
"ld __tmp_reg__,-Z\n"
"st X+,__tmp_reg__\n"
"dec %2\n"
"brne 1b\n"
: : "x"(k), "z"(key + len), "r"(8)
);
// Expand the key to the full key schedule.
uint8_t li_in = 0;
uint8_t li_out = m - 1;
for (uint8_t i = 0; i < (rounds - 1); ++i) {
__asm__ __volatile__ (
// l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i;
"ld r15,X+\n" // x = rightRotate8_64(l[li_in])
"ld r8,X+\n"
"ld r9,X+\n"
"ld r10,X+\n"
"ld r11,X+\n"
"ld r12,X+\n"
"ld r13,X+\n"
"ld r14,X+\n"
"ld r16,Z+\n" // y = k[i]
"ld r17,Z+\n"
"ld r18,Z+\n"
"ld r19,Z+\n"
"ld r20,Z+\n"
"ld r21,Z+\n"
"ld r22,Z+\n"
"ld r23,Z+\n"
"add r8,r16\n" // x += y
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,%3\n" // x ^= i
// k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
"movw r26,%A2\n" // l[li_out] = x
"st X+,r8\n"
"st X+,r9\n"
"st X+,r10\n"
"st X+,r11\n"
"st X+,r12\n"
"st X+,r13\n"
"st X+,r14\n"
"st X+,r15\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"lsl r16\n" // y = leftRotate1_64(y)
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16,__zero_reg__\n"
"eor r16,r8\n" // y ^= x
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z+,r16\n" // k[i + 1] = y
"st Z+,r17\n"
"st Z+,r18\n"
"st Z+,r19\n"
"st Z+,r20\n"
"st Z+,r21\n"
"st Z+,r22\n"
"st Z+,r23\n"
: : "z"(&(k[i])), "x"(&(l[li_in])),
"r"(&(l[li_out])),
"r"(i)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25"
);
if ((++li_in) >= m)
li_in = 0;
if ((++li_out) >= m)
li_out = 0;
}
return true;
#else
uint64_t l[4];
uint8_t m;
@@ -280,138 +295,118 @@ bool Speck::setKey(const uint8_t *key, size_t len)
if ((++li_out) >= m)
li_out = 0;
}
#endif
clean(l);
return true;
#endif
}
void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint32_t xlow, xhigh, ylow, yhigh;
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all encryption rounds. Z points to the key schedule.
// Automatically generated by the genspeck tool.
__asm__ __volatile__ (
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"1:\n"
// x = (rightRotate8_64(x) + y) ^ *s++;
"add %B0,%A2\n" // x = rightRotate8_64(x), x += y
"adc %C0,%B2\n" // Note: right rotate is implicit.
"adc %D0,%C2\n"
"adc %A1,%D2\n"
"adc %B1,%A3\n"
"adc %C1,%B3\n"
"adc %D1,%C3\n"
"adc %A0,%D3\n"
"ld __tmp_reg__,Z+\n" // x ^= *s++
"eor __tmp_reg__,%B0\n" // Also fully apply the right rotate.
"ld %B0,Z+\n"
"eor %B0,%C0\n"
"ld %C0,Z+\n"
"eor %C0,%D0\n"
"ld %D0,Z+\n"
"eor %D0,%A1\n"
"ld %A1,Z+\n"
"eor %A1,%B1\n"
"ld %B1,Z+\n"
"eor %B1,%C1\n"
"ld %C1,Z+\n"
"eor %C1,%D1\n"
"ld %D1,Z+\n"
"eor %D1,%A0\n"
"mov %A0,__tmp_reg__\n"
// y = leftRotate3_64(y) ^ x;
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
// Loop
"dec %5\n" // --round
"add r9,r16\n"
"adc r10,r17\n"
"adc r11,r18\n"
"adc r12,r19\n"
"adc r13,r20\n"
"adc r14,r21\n"
"adc r15,r22\n"
"adc r8,r23\n"
"ld __tmp_reg__,Z+\n"
"eor __tmp_reg__,r9\n"
"ld r9,Z+\n"
"eor r9,r10\n"
"ld r10,Z+\n"
"eor r10,r11\n"
"ld r11,Z+\n"
"eor r11,r12\n"
"ld r12,Z+\n"
"eor r12,r13\n"
"ld r13,Z+\n"
"eor r13,r14\n"
"ld r14,Z+\n"
"eor r14,r15\n"
"ld r15,Z+\n"
"eor r15,r8\n"
"mov r8,__tmp_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"dec %2\n"
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
: "z"(k), "r"(rounds)
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(input), "z"(k), "r"(rounds), "Q"(output)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
);
#else
uint64_t x, y;
@@ -430,133 +425,113 @@ void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
void Speck::decryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint32_t xlow, xhigh, ylow, yhigh;
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all decryption rounds. Z points to the end of key schedule.
// Automatically generated by the genspeck tool.
__asm__ __volatile__ (
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"1:\n"
// y = rightRotate3_64(x ^ y);
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
// x = leftRotate8_64((x ^ *s--) - y);
"ld __tmp_reg__,-Z\n" // x ^= *s--
"eor __tmp_reg__,%D1\n" // Note: also implicitly left-rotates regs
"ld %D1,-Z\n"
"eor %D1,%C1\n"
"ld %C1,-Z\n"
"eor %C1,%B1\n"
"ld %B1,-Z\n"
"eor %B1,%A1\n"
"ld %A1,-Z\n"
"eor %A1,%D0\n"
"ld %D0,-Z\n"
"eor %D0,%C0\n"
"ld %C0,-Z\n"
"eor %C0,%B0\n"
"ld %B0,-Z\n"
"eor %B0,%A0\n"
"mov %A0,__tmp_reg__\n"
"sub %B0,%A2\n" // x -= y
"sbc %C0,%B2\n" // Note: regs are already left-rotated
"sbc %D0,%C2\n"
"sbc %A1,%D2\n"
"sbc %B1,%A3\n"
"sbc %C1,%B3\n"
"sbc %D1,%C3\n"
"sbc %A0,%D3\n"
// Loop
"dec %5\n" // --round
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"ld __tmp_reg__,-Z\n"
"eor __tmp_reg__,r15\n"
"ld r15,-Z\n"
"eor r15,r14\n"
"ld r14,-Z\n"
"eor r14,r13\n"
"ld r13,-Z\n"
"eor r13,r12\n"
"ld r12,-Z\n"
"eor r12,r11\n"
"ld r11,-Z\n"
"eor r11,r10\n"
"ld r10,-Z\n"
"eor r10,r9\n"
"ld r9,-Z\n"
"eor r9,r8\n"
"mov r8,__tmp_reg__\n"
"sub r9,r16\n"
"sbc r10,r17\n"
"sbc r11,r18\n"
"sbc r12,r19\n"
"sbc r13,r20\n"
"sbc r14,r21\n"
"sbc r15,r22\n"
"sbc r8,r23\n"
"dec %2\n"
"breq 2f\n"
"rjmp 1b\n"
"2:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
: "z"(k + rounds), "r"(rounds)
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(input), "z"(k + rounds), "r"(rounds), "Q"(output)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
);
#else
uint64_t x, y;

@@ -261,349 +261,283 @@ bool SpeckSmall::setKey(const uint8_t *key, size_t len)
void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint64_t l[4];
uint32_t xlow, xhigh, ylow, yhigh;
uint32_t slow, shigh;
uint8_t li_in = (rounds + 3) & 0x03;
uint8_t li_out = (((rounds - 31) + li_in) & 0x03) * 8;
li_in *= 8;
// Prepare to expand the key schedule.
// Automatically generated by the genspeck tool.
uint64_t l[5];
uint8_t r = rounds;
uint8_t li_in = ((r + 3) & 0x03) * 8;
uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F;
__asm__ __volatile__ (
"add r30,%4\n" // Z = &(this->l[li_out])
"adc r31,__zero_reg__\n"
"ld __tmp_reg__,Z\n" // s = this->l[li_out]
"std %A0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+1\n"
"std %B0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+2\n"
"std %C0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+3\n"
"std %D0,__tmp_reg__\n"
"ldd __tmp_reg__,Z+4\n"
"std %A1,__tmp_reg__\n"
"ldd __tmp_reg__,Z+5\n"
"std %B1,__tmp_reg__\n"
"ldd __tmp_reg__,Z+6\n"
"std %C1,__tmp_reg__\n"
"ldd __tmp_reg__,Z+7\n"
"std %D1,__tmp_reg__\n"
"sub r30,%4\n" // Point Z back to the start of this->l.
"sbc r31,__zero_reg__\n"
"ldi r25,32\n" // Copy the entire this->l array into l.
"ldd r25,%4\n"
"ldi r24,32\n"
"1:\n"
"ld __tmp_reg__,Z+\n"
"st X+,__tmp_reg__\n"
"dec r25\n"
"ld __tmp_reg__,X+\n"
"st Z+,__tmp_reg__\n"
"dec r24\n"
"brne 1b\n"
: "=Q"(slow), "=Q"(shigh)
: "z"(this->l), "x"(l), "r"(li_out)
: "r25"
);
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all decryption rounds while expanding the key schedule in-place.
__asm__ __volatile__ (
"mov r23,%9\n" // i = rounds - 1
"dec r23\n"
"1:\n"
// Adjust x and y for this round using the key schedule word s.
// y = rightRotate3_64(x ^ y);
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // y = rightRotate1_64(y)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
// x = leftRotate8_64((x ^ s) - y);
"ldd __tmp_reg__,%A4\n" // x ^= s
"eor %A0,__tmp_reg__\n"
"ldd __tmp_reg__,%B4\n"
"eor %B0,__tmp_reg__\n"
"ldd __tmp_reg__,%C4\n"
"eor %C0,__tmp_reg__\n"
"ldd __tmp_reg__,%D4\n"
"eor %D0,__tmp_reg__\n"
"ldd __tmp_reg__,%A5\n"
"eor %A1,__tmp_reg__\n"
"ldd __tmp_reg__,%B5\n"
"eor %B1,__tmp_reg__\n"
"ldd __tmp_reg__,%C5\n"
"eor %C1,__tmp_reg__\n"
"ldd __tmp_reg__,%D5\n"
"eor %D1,__tmp_reg__\n"
"sub %A0,%A2\n" // x -= y
"sbc %B0,%B2\n"
"sbc %C0,%C2\n"
"sbc %D0,%D2\n"
"sbc %A1,%A3\n"
"sbc %B1,%B3\n"
"sbc %C1,%C3\n"
"sbc %D1,%D3\n"
"mov __tmp_reg__,%D1\n" // x = lefRotate8_64(x)
"mov %D1,%C1\n"
"mov %C1,%B1\n"
"mov %B1,%A1\n"
"mov %A1,%D0\n"
"mov %D0,%C0\n"
"mov %C0,%B0\n"
"mov %B0,%A0\n"
"mov %A0,__tmp_reg__\n"
// On the last round we don't need to compute s so we
// can exit early here if i == 0.
"or r23,r23\n" // if (i == 0)
"brne 2f\n"
"rjmp 3f\n"
"movw r26,r30\n"
"sbiw r30,32\n"
"add r30,r25\n"
"adc r31,__zero_reg__\n"
"ld __tmp_reg__,Z\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+1\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+2\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+3\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+4\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+5\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+6\n"
"st X+,__tmp_reg__\n"
"ldd __tmp_reg__,Z+7\n"
"st X+,__tmp_reg__\n"
"sub r30,r25\n"
"sbc r31,__zero_reg__\n"
"movw r26,%A2\n"
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"ldd %A2,%6\n"
"mov %B2,r25\n"
"ldd r25,%5\n"
"dec r25\n"
"movw r26,r30\n"
"adiw r26,40\n"
"2:\n"
"dec r23\n" // --i
// Save x and y on the stack so we can reuse registers for t and s.
"push %A0\n"
"push %B0\n"
"push %C0\n"
"push %D0\n"
"push %A1\n"
"push %B1\n"
"push %C1\n"
"push %D1\n"
"push %A2\n"
"push %B2\n"
"push %C2\n"
"push %D2\n"
"push %A3\n"
"push %B3\n"
"push %C3\n"
"push %D3\n"
// Compute the key schedule word s for the next round.
// li_out = (li_out + 3) & 0x03;
"ldd r24,%7\n"
"ldi r25,24\n"
"add r24,r25\n"
"andi r24,0x1f\n"
"std %7,r24\n"
// s = rightRotate3_64(s ^ l[li_out]);
"add %A8,r24\n" // Z = &(l[li_out])
"adc %B8,__zero_reg__\n"
"ld %A0,Z\n" // t = l[li_out]
"ldd %B0,Z+1\n"
"ldd %C0,Z+2\n"
"ldd %D0,Z+3\n"
"ldd %A1,Z+4\n"
"ldd %B1,Z+5\n"
"ldd %C1,Z+6\n"
"ldd %D1,Z+7\n"
"ldd %A2,%A4\n" // load s
"ldd %B2,%B4\n"
"ldd %C2,%C4\n"
"ldd %D2,%D4\n"
"ldd %A3,%A5\n"
"ldd %B3,%B5\n"
"ldd %C3,%C5\n"
"ldd %D3,%D5\n"
"eor %A2,%A0\n" // s ^= t
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"bst %A2,0\n" // s = rightRotate1_64(s)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // s = rightRotate1_64(s)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"bst %A2,0\n" // s = rightRotate1_64(s)
"ror %D3\n"
"ror %C3\n"
"ror %B3\n"
"ror %A3\n"
"ror %D2\n"
"ror %C2\n"
"ror %B2\n"
"ror %A2\n"
"bld %D3,7\n"
"sub %A8,r24\n" // Z -= li_out
"sbc %B8,__zero_reg__\n"
// li_in = (li_in + 3) & 0x03;
"ldd r24,%6\n"
"add r24,r25\n"
"andi r24,0x1f\n"
"std %6,r24\n"
// l[li_in] = leftRotate8_64((l[li_out] ^ i) - s);
"add %A8,r24\n" // Z = &(l[li_in])
"adc %B8,__zero_reg__\n"
"eor %A0,r23\n" // t ^= i
"sub %A0,%A2\n" // t -= s
"sbc %B0,%B2\n"
"sbc %C0,%C2\n"
"sbc %D0,%D2\n"
"sbc %A1,%A3\n"
"sbc %B1,%B3\n"
"sbc %C1,%C3\n"
"sbc %D1,%D3\n"
"st Z,%D1\n" // l[li_in] = leftRotate8_64(t)
"std Z+1,%A0\n"
"std Z+2,%B0\n"
"std Z+3,%C0\n"
"std Z+4,%D0\n"
"std Z+5,%A1\n"
"std Z+6,%B1\n"
"std Z+7,%C1\n"
"sub %A8,r24\n" // Z -= li_in
"sbc %B8,__zero_reg__\n"
"std %A4,%A2\n" // store s
"std %B4,%B2\n"
"std %C4,%C2\n"
"std %D4,%D2\n"
"std %A5,%A3\n"
"std %B5,%B3\n"
"std %C5,%C3\n"
"std %D5,%D3\n"
// Pop registers from the stack to recover the x and y values.
"pop %D3\n"
"pop %C3\n"
"pop %B3\n"
"pop %A3\n"
"pop %D2\n"
"pop %C2\n"
"pop %B2\n"
"pop %A2\n"
"pop %D1\n"
"pop %C1\n"
"pop %B1\n"
"pop %A1\n"
"pop %D0\n"
"pop %C0\n"
"pop %B0\n"
"pop %A0\n"
// Bottom of the loop.
"rjmp 1b\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"ld __tmp_reg__,-X\n"
"eor __tmp_reg__,r15\n"
"ld r15,-X\n"
"eor r15,r14\n"
"ld r14,-X\n"
"eor r14,r13\n"
"ld r13,-X\n"
"eor r13,r12\n"
"ld r12,-X\n"
"eor r12,r11\n"
"ld r11,-X\n"
"eor r11,r10\n"
"ld r10,-X\n"
"eor r10,r9\n"
"ld r9,-X\n"
"eor r9,r8\n"
"mov r8,__tmp_reg__\n"
"sub r9,r16\n"
"sbc r10,r17\n"
"sbc r11,r18\n"
"sbc r12,r19\n"
"sbc r13,r20\n"
"sbc r14,r21\n"
"sbc r15,r22\n"
"sbc r8,r23\n"
"or r25,r25\n"
"brne 3f\n"
"rjmp 4f\n"
"3:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh),
"+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out)
: "z"(l), "r"(rounds)
: "r23", "r24", "r25"
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"dec r25\n"
"push r8\n"
"push r9\n"
"push r10\n"
"push r11\n"
"push r12\n"
"push r13\n"
"push r14\n"
"push r15\n"
"push r16\n"
"push r17\n"
"push r18\n"
"push r19\n"
"push r20\n"
"push r21\n"
"push r22\n"
"push r23\n"
"ldi r24,24\n"
"add %A2,r24\n"
"add %B2,r24\n"
"ldi r24,0x1F\n"
"and %A2,r24\n"
"and %B2,r24\n"
"ld r16,X+\n"
"ld r17,X+\n"
"ld r18,X+\n"
"ld r19,X+\n"
"ld r20,X+\n"
"ld r21,X+\n"
"ld r22,X+\n"
"ld r23,X+\n"
"add r30,%B2\n"
"adc r31,__zero_reg__\n"
"ld r8,Z\n"
"ldd r9,Z+1\n"
"ldd r10,Z+2\n"
"ldd r11,Z+3\n"
"ldd r12,Z+4\n"
"ldd r13,Z+5\n"
"ldd r14,Z+6\n"
"ldd r15,Z+7\n"
"sub r30,%B2\n"
"sbc r31,__zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"bst r16,0\n"
"ror r23\n"
"ror r22\n"
"ror r21\n"
"ror r20\n"
"ror r19\n"
"ror r18\n"
"ror r17\n"
"ror r16\n"
"bld r23,7\n"
"st -X,r23\n"
"st -X,r22\n"
"st -X,r21\n"
"st -X,r20\n"
"st -X,r19\n"
"st -X,r18\n"
"st -X,r17\n"
"st -X,r16\n"
"adiw r26,8\n"
"eor r8,r25\n"
"sub r8,r16\n"
"sbc r9,r17\n"
"sbc r10,r18\n"
"sbc r11,r19\n"
"sbc r12,r20\n"
"sbc r13,r21\n"
"sbc r14,r22\n"
"sbc r15,r23\n"
"add r30,%A2\n"
"adc r31,__zero_reg__\n"
"st Z,r15\n"
"std Z+1,r8\n"
"std Z+2,r9\n"
"std Z+3,r10\n"
"std Z+4,r11\n"
"std Z+5,r12\n"
"std Z+6,r13\n"
"std Z+7,r14\n"
"sub r30,%A2\n"
"sbc r31,__zero_reg__\n"
"pop r23\n"
"pop r22\n"
"pop r21\n"
"pop r20\n"
"pop r19\n"
"pop r18\n"
"pop r17\n"
"pop r16\n"
"pop r15\n"
"pop r14\n"
"pop r13\n"
"pop r12\n"
"pop r11\n"
"pop r10\n"
"pop r9\n"
"pop r8\n"
"rjmp 2b\n"
"4:\n"
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(this->l), "z"(l), "r"(input), "Q"(output), "Q"(li_out), "Q"(r), "Q"(li_in)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
, "r24", "r25"
);
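// For reference, a C-style sketch of what the generated assembly above
// implements (not compiled here; rotate helpers as used in the C fallback
// below). Inverse round:
//     y = rightRotate3_64(x ^ y);
//     x = leftRotate8_64((x ^ s) - y);
// Reversed key schedule step:
//     li_out = (li_out + 3) & 0x03;
//     s = rightRotate3_64(s ^ l[li_out]);
//     li_in = (li_in + 3) & 0x03;
//     l[li_in] = leftRotate8_64((l[li_out] ^ i) - s);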
#else
uint64_t l[4];


@@ -156,336 +156,257 @@ bool SpeckTiny::setKey(const uint8_t *key, size_t len)
void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input)
{
#if USE_AVR_INLINE_ASM
uint64_t l[4];
uint32_t xlow, xhigh, ylow, yhigh;
uint32_t slow, shigh;
uint8_t li_in = 0;
uint8_t li_out = (rounds - 31) * 8;
// Copy the "k" array into "s" and the "l" array.
// Automatically generated by the genspeck tool.
uint64_t l[5];
uint8_t r = rounds;
uint8_t mb = (r - 31) * 8;
__asm__ __volatile__ (
"ldd r25,%4\n" // r25 = li_out
"ld __tmp_reg__,Z+\n"
"std %A0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %B0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %C0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %D0,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %A1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %B1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %C1,__tmp_reg__\n"
"ld __tmp_reg__,Z+\n"
"std %D1,__tmp_reg__\n"
"1:\n" // l[0..] = k[1..]
"ld __tmp_reg__,Z+\n"
"st X+,__tmp_reg__\n"
"dec r25\n"
"brne 1b\n"
: "=Q"(slow), "=Q"(shigh)
: "z"(k), "x"(l), "Q"(li_out)
: "r25"
);
// Unpack the input into the x and y variables, converting
// from big-endian into little-endian in the process.
__asm__ __volatile__ (
"ld %D1,Z\n"
"ldd %C1,Z+1\n"
"ldd %B1,Z+2\n"
"ldd %A1,Z+3\n"
"ldd %D0,Z+4\n"
"ldd %C0,Z+5\n"
"ldd %B0,Z+6\n"
"ldd %A0,Z+7\n"
"ldd %D3,Z+8\n"
"ldd %C3,Z+9\n"
"ldd %B3,Z+10\n"
"ldd %A3,Z+11\n"
"ldd %D2,Z+12\n"
"ldd %C2,Z+13\n"
"ldd %B2,Z+14\n"
"ldd %A2,Z+15\n"
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
: "z"(input)
);
// Perform all encryption rounds while expanding the key schedule in-place.
__asm__ __volatile__ (
"mov r23,__zero_reg__\n" // i = 0
"movw r8,r30\n"
"ldd r16,%4\n"
"ldi r24,8\n"
"add r16,r24\n"
"1:\n"
// Adjust x and y for this round using the key schedule word s.
// x = (rightRotate8_64(x) + y) ^ s;
"mov __tmp_reg__,%A0\n" // x = rightRotate8_64(x)
"mov %A0,%B0\n"
"mov %B0,%C0\n"
"mov %C0,%D0\n"
"mov %D0,%A1\n"
"mov %A1,%B1\n"
"mov %B1,%C1\n"
"mov %C1,%D1\n"
"mov %D1,__tmp_reg__\n"
"add %A0,%A2\n" // x += y
"adc %B0,%B2\n"
"adc %C0,%C2\n"
"adc %D0,%D2\n"
"adc %A1,%A3\n"
"adc %B1,%B3\n"
"adc %C1,%C3\n"
"adc %D1,%D3\n"
"ldd __tmp_reg__,%A4\n" // x ^= s
"eor %A0,__tmp_reg__\n"
"ldd __tmp_reg__,%B4\n"
"eor %B0,__tmp_reg__\n"
"ldd __tmp_reg__,%C4\n"
"eor %C0,__tmp_reg__\n"
"ldd __tmp_reg__,%D4\n"
"eor %D0,__tmp_reg__\n"
"ldd __tmp_reg__,%A5\n"
"eor %A1,__tmp_reg__\n"
"ldd __tmp_reg__,%B5\n"
"eor %B1,__tmp_reg__\n"
"ldd __tmp_reg__,%C5\n"
"eor %C1,__tmp_reg__\n"
"ldd __tmp_reg__,%D5\n"
"eor %D1,__tmp_reg__\n"
// y = leftRotate3_64(y) ^ x;
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // y = leftRotate1_64(y)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // y ^= x
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
// On the last round we don't need to compute s so we
// can exit early here if (i + 1) == rounds.
"mov __tmp_reg__,r23\n" // temp = i + 1
"inc __tmp_reg__\n"
"cp __tmp_reg__,%9\n" // if (temp == rounds) ...
"brne 2f\n"
"rjmp 3f\n"
"ld __tmp_reg__,X+\n"
"st Z+,__tmp_reg__\n"
"dec r16\n"
"brne 1b\n"
"movw r30,r8\n"
"movw r26,%A2\n"
"ld r15,X+\n"
"ld r14,X+\n"
"ld r13,X+\n"
"ld r12,X+\n"
"ld r11,X+\n"
"ld r10,X+\n"
"ld r9,X+\n"
"ld r8,X+\n"
"ld r23,X+\n"
"ld r22,X+\n"
"ld r21,X+\n"
"ld r20,X+\n"
"ld r19,X+\n"
"ld r18,X+\n"
"ld r17,X+\n"
"ld r16,X\n"
"clr %A2\n"
"ldd %B2,%4\n"
"clr r25\n"
"2:\n"
// Save x and y on the stack so we can reuse registers for t and s.
"push %A0\n"
"push %B0\n"
"push %C0\n"
"push %D0\n"
"push %A1\n"
"push %B1\n"
"push %C1\n"
"push %D1\n"
"push %A2\n"
"push %B2\n"
"push %C2\n"
"push %D2\n"
"push %A3\n"
"push %B3\n"
"push %C3\n"
"push %D3\n"
// Compute the key schedule word s for the next round.
// l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i;
"ldd r24,%6\n" // Z = &(l[li_in])
"add %A8,r24\n"
"adc %B8,__zero_reg__\n"
"ld %D1,Z+\n" // t = rightRotate8_64(l[li_in])
"ld %A0,Z+\n"
"ld %B0,Z+\n"
"ld %C0,Z+\n"
"ld %D0,Z+\n"
"ld %A1,Z+\n"
"ld %B1,Z+\n"
"ld %C1,Z+\n"
"ldd %A2,%A4\n" // load s
"ldd %B2,%B4\n"
"ldd %C2,%C4\n"
"ldd %D2,%D4\n"
"ldd %A3,%A5\n"
"ldd %B3,%B5\n"
"ldd %C3,%C5\n"
"ldd %D3,%D5\n"
"add %A0,%A2\n" // t += s
"adc %B0,%B2\n"
"adc %C0,%C2\n"
"adc %D0,%D2\n"
"adc %A1,%A3\n"
"adc %B1,%B3\n"
"adc %C1,%C3\n"
"adc %D1,%D3\n"
"eor %A0,r23\n" // t ^= i
// Z = Z - li_in + li_out
"ldi r25,8\n" // li_in = li_in + 1
"add r24,r25\n"
"sub %A8,r24\n" // return Z to its initial value
"sbc %B8,__zero_reg__\n"
"andi r24,0x1f\n" // li_in = li_in % 4
"std %6,r24\n"
"ldd r24,%7\n" // Z = &(l[li_out])
"add %A8,r24\n"
"adc %B8,__zero_reg__\n"
"st Z+,%A0\n" // l[li_out] = t
"st Z+,%B0\n"
"st Z+,%C0\n"
"st Z+,%D0\n"
"st Z+,%A1\n"
"st Z+,%B1\n"
"st Z+,%C1\n"
"st Z+,%D1\n"
"add r24,r25\n" // li_out = li_out + 1
"sub %A8,r24\n" // return Z to its initial value
"sbc %B8,__zero_reg__\n"
"andi r24,0x1f\n" // li_out = li_out % 4
"std %7,r24\n"
// s = leftRotate3_64(s) ^ l[li_out];
"lsl %A2\n" // s = leftRotate1_64(s)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // s = leftRotate1_64(s)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"lsl %A2\n" // s = leftRotate1_64(s)
"rol %B2\n"
"rol %C2\n"
"rol %D2\n"
"rol %A3\n"
"rol %B3\n"
"rol %C3\n"
"rol %D3\n"
"adc %A2,__zero_reg__\n"
"eor %A2,%A0\n" // s ^= l[li_out]
"eor %B2,%B0\n"
"eor %C2,%C0\n"
"eor %D2,%D0\n"
"eor %A3,%A1\n"
"eor %B3,%B1\n"
"eor %C3,%C1\n"
"eor %D3,%D1\n"
"std %A4,%A2\n" // store s
"std %B4,%B2\n"
"std %C4,%C2\n"
"std %D4,%D2\n"
"std %A5,%A3\n"
"std %B5,%B3\n"
"std %C5,%C3\n"
"std %D5,%D3\n"
// Pop registers from the stack to recover the x and y values.
"pop %D3\n"
"pop %C3\n"
"pop %B3\n"
"pop %A3\n"
"pop %D2\n"
"pop %C2\n"
"pop %B2\n"
"pop %A2\n"
"pop %D1\n"
"pop %C1\n"
"pop %B1\n"
"pop %A1\n"
"pop %D0\n"
"pop %C0\n"
"pop %B0\n"
"pop %A0\n"
// Bottom of the loop.
"inc r23\n"
"rjmp 1b\n"
"add r9,r16\n"
"adc r10,r17\n"
"adc r11,r18\n"
"adc r12,r19\n"
"adc r13,r20\n"
"adc r14,r21\n"
"adc r15,r22\n"
"adc r8,r23\n"
"ld __tmp_reg__,Z+\n"
"eor __tmp_reg__,r9\n"
"ld r9,Z+\n"
"eor r9,r10\n"
"ld r10,Z+\n"
"eor r10,r11\n"
"ld r11,Z+\n"
"eor r11,r12\n"
"ld r12,Z+\n"
"eor r12,r13\n"
"ld r13,Z+\n"
"eor r13,r14\n"
"ld r14,Z+\n"
"eor r14,r15\n"
"ld r15,Z+\n"
"eor r15,r8\n"
"mov r8,__tmp_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"mov __tmp_reg__,r25\n"
"inc __tmp_reg__\n"
"ldd r24,%5\n"
"cp __tmp_reg__,r24\n"
"brne 3f\n"
"rjmp 4f\n"
"3:\n"
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh),
"+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out)
: "z"(l), "r"(rounds)
: "r23", "r24", "r25"
);
// Pack the results into the output and convert back to big-endian.
__asm__ __volatile__ (
"st Z,%D1\n"
"std Z+1,%C1\n"
"std Z+2,%B1\n"
"std Z+3,%A1\n"
"std Z+4,%D0\n"
"std Z+5,%C0\n"
"std Z+6,%B0\n"
"std Z+7,%A0\n"
"std Z+8,%D3\n"
"std Z+9,%C3\n"
"std Z+10,%B3\n"
"std Z+11,%A3\n"
"std Z+12,%D2\n"
"std Z+13,%C2\n"
"std Z+14,%B2\n"
"std Z+15,%A2\n"
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
"push r8\n"
"push r9\n"
"push r10\n"
"push r11\n"
"push r12\n"
"push r13\n"
"push r14\n"
"push r15\n"
"push r16\n"
"push r17\n"
"push r18\n"
"push r19\n"
"push r20\n"
"push r21\n"
"push r22\n"
"push r23\n"
"sbiw r30,8\n"
"ld r16,Z\n"
"ldd r17,Z+1\n"
"ldd r18,Z+2\n"
"ldd r19,Z+3\n"
"ldd r20,Z+4\n"
"ldd r21,Z+5\n"
"ldd r22,Z+6\n"
"ldd r23,Z+7\n"
"add r30,%A2\n"
"adc r31,__zero_reg__\n"
"ldd r15,Z+8\n"
"ldd r8,Z+9\n"
"ldd r9,Z+10\n"
"ldd r10,Z+11\n"
"ldd r11,Z+12\n"
"ldd r12,Z+13\n"
"ldd r13,Z+14\n"
"ldd r14,Z+15\n"
"add r8,r16\n"
"adc r9,r17\n"
"adc r10,r18\n"
"adc r11,r19\n"
"adc r12,r20\n"
"adc r13,r21\n"
"adc r14,r22\n"
"adc r15,r23\n"
"eor r8,r25\n"
"sub r30,%A2\n"
"sbc r31,__zero_reg__\n"
"add r30,%B2\n"
"adc r31,__zero_reg__\n"
"std Z+8,r8\n"
"std Z+9,r9\n"
"std Z+10,r10\n"
"std Z+11,r11\n"
"std Z+12,r12\n"
"std Z+13,r13\n"
"std Z+14,r14\n"
"std Z+15,r15\n"
"sub r30,%B2\n"
"sbc r31,__zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"lsl r16\n"
"rol r17\n"
"rol r18\n"
"rol r19\n"
"rol r20\n"
"rol r21\n"
"rol r22\n"
"rol r23\n"
"adc r16, __zero_reg__\n"
"eor r16,r8\n"
"eor r17,r9\n"
"eor r18,r10\n"
"eor r19,r11\n"
"eor r20,r12\n"
"eor r21,r13\n"
"eor r22,r14\n"
"eor r23,r15\n"
"st Z,r16\n"
"std Z+1,r17\n"
"std Z+2,r18\n"
"std Z+3,r19\n"
"std Z+4,r20\n"
"std Z+5,r21\n"
"std Z+6,r22\n"
"std Z+7,r23\n"
"ldi r24,8\n"
"add %A2,r24\n"
"add %B2,r24\n"
"ldi r24,0x1F\n"
"and %A2,r24\n"
"and %B2,r24\n"
"pop r23\n"
"pop r22\n"
"pop r21\n"
"pop r20\n"
"pop r19\n"
"pop r18\n"
"pop r17\n"
"pop r16\n"
"pop r15\n"
"pop r14\n"
"pop r13\n"
"pop r12\n"
"pop r11\n"
"pop r10\n"
"pop r9\n"
"pop r8\n"
"inc r25\n"
"rjmp 2b\n"
"4:\n"
"ldd r26,%A3\n"
"ldd r27,%B3\n"
"st X+,r15\n"
"st X+,r14\n"
"st X+,r13\n"
"st X+,r12\n"
"st X+,r11\n"
"st X+,r10\n"
"st X+,r9\n"
"st X+,r8\n"
"st X+,r23\n"
"st X+,r22\n"
"st X+,r21\n"
"st X+,r20\n"
"st X+,r19\n"
"st X+,r18\n"
"st X+,r17\n"
"st X,r16\n"
: : "x"(k), "z"(l), "r"(input), "Q"(output), "Q"(mb), "Q"(r)
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
, "r24", "r25"
);
#else
uint64_t l[4];