mirror of
https://github.com/taigrr/arduinolibs
synced 2025-01-18 04:33:12 -08:00
Speed up Speck by using a custom AVR code generator
This also fixes the remaining asm issues with newer versions of gcc.
This commit is contained in:
parent
b53f57225d
commit
277a0b63c9
@ -81,27 +81,27 @@ Arduino Mega 2560 running at 16 MHz are similar:
|
||||
<tr><td>ChaCha (20 rounds)</td><td align="right">14.87us</td><td align="right">14.88us</td><td align="right">43.74us</td><td align="right">132</td></tr>
|
||||
<tr><td>ChaCha (12 rounds)</td><td align="right">10.38us</td><td align="right">10.38us</td><td align="right">43.74us</td><td align="right">132</td></tr>
|
||||
<tr><td>ChaCha (8 rounds)</td><td align="right">8.13us</td><td align="right">8.14us</td><td align="right">43.74us</td><td align="right">132</td></tr>
|
||||
<tr><td>Speck (128-bit key, ECB mode)</td><td align="right">10.72us</td><td align="right">11.09us</td><td align="right">287.02us</td><td align="right">275</td></tr>
|
||||
<tr><td>Speck (192-bit key, ECB mode)</td><td align="right">11.03us</td><td align="right">11.42us</td><td align="right">298.21us</td><td align="right">275</td></tr>
|
||||
<tr><td>Speck (256-bit key, ECB mode)</td><td align="right">11.35us</td><td align="right">11.74us</td><td align="right">309.66us</td><td align="right">275</td></tr>
|
||||
<tr><td>SpeckSmall (128-bit key, ECB mode)</td><td align="right">35.25us</td><td align="right">36.46us</td><td align="right">207.66us</td><td align="right">67</td></tr>
|
||||
<tr><td>SpeckSmall (192-bit key, ECB mode)</td><td align="right">36.56us</td><td align="right">37.56us</td><td align="right">220.55us</td><td align="right">67</td></tr>
|
||||
<tr><td>SpeckSmall (256-bit key, ECB mode)</td><td align="right">37.87us</td><td align="right">38.67us</td><td align="right">233.32us</td><td align="right">67</td></tr>
|
||||
<tr><td>SpeckTiny (128-bit key, ECB mode)</td><td align="right">35.25us</td><td align="right"> </td><td align="right">10.22us</td><td align="right">35</td></tr>
|
||||
<tr><td>SpeckTiny (192-bit key, ECB mode)</td><td align="right">36.56us</td><td align="right"> </td><td align="right">13.62us</td><td align="right">35</td></tr>
|
||||
<tr><td>SpeckTiny (256-bit key, ECB mode)</td><td align="right">37.87us</td><td align="right"> </td><td align="right">16.89us</td><td align="right">35</td></tr>
|
||||
<tr><td>Speck (128-bit key, ECB mode)</td><td align="right">9.74us</td><td align="right">10.12us</td><td align="right">253.94us</td><td align="right">275</td></tr>
|
||||
<tr><td>Speck (192-bit key, ECB mode)</td><td align="right">10.03us</td><td align="right">10.41us</td><td align="right">264.63us</td><td align="right">275</td></tr>
|
||||
<tr><td>Speck (256-bit key, ECB mode)</td><td align="right">10.31us</td><td align="right">10.71us</td><td align="right">275.26us</td><td align="right">275</td></tr>
|
||||
<tr><td>SpeckSmall (128-bit key, ECB mode)</td><td align="right">33.93us</td><td align="right">34.82us</td><td align="right">207.66us</td><td align="right">67</td></tr>
|
||||
<tr><td>SpeckSmall (192-bit key, ECB mode)</td><td align="right">35.20us</td><td align="right">35.88us</td><td align="right">220.55us</td><td align="right">67</td></tr>
|
||||
<tr><td>SpeckSmall (256-bit key, ECB mode)</td><td align="right">36.46us</td><td align="right">36.93us</td><td align="right">233.32us</td><td align="right">67</td></tr>
|
||||
<tr><td>SpeckTiny (128-bit key, ECB mode)</td><td align="right">33.93us</td><td align="right"> </td><td align="right">10.22us</td><td align="right">35</td></tr>
|
||||
<tr><td>SpeckTiny (192-bit key, ECB mode)</td><td align="right">35.20us</td><td align="right"> </td><td align="right">13.62us</td><td align="right">35</td></tr>
|
||||
<tr><td>SpeckTiny (256-bit key, ECB mode)</td><td align="right">36.46us</td><td align="right"> </td><td align="right">16.89us</td><td align="right">35</td></tr>
|
||||
<tr><td colspan="5"> </td></tr>
|
||||
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td align="right">Key Setup</td><td>State Size (bytes)</td></tr>
|
||||
<tr><td>ChaChaPoly</td><td align="right">41.20us</td><td align="right">41.19us</td><td align="right">902.36us</td><td align="right">221</td></tr>
|
||||
<tr><td>GCM&lt;AES128&gt;</td><td align="right">109.71us</td><td align="right">109.26us</td><td align="right">1265.69us</td><td align="right">284</td></tr>
|
||||
<tr><td>GCM&lt;AES192&gt;</td><td align="right">116.38us</td><td align="right">115.92us</td><td align="right">1485.56us</td><td align="right">316</td></tr>
|
||||
<tr><td>GCM&lt;AES256&gt;</td><td align="right">123.04us</td><td align="right">122.59us</td><td align="right">1760.28us</td><td align="right">348</td></tr>
|
||||
<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">87.78us</td><td align="right">87.32us</td><td align="right">714.41us</td><td align="right">378</td></tr>
|
||||
<tr><td>GCM&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">114.30us</td><td align="right">113.84us</td><td align="right">1270.32us</td><td align="right">138</td></tr>
|
||||
<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">86.74us</td><td align="right">86.29us</td><td align="right">646.88us</td><td align="right">378</td></tr>
|
||||
<tr><td>GCM&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">112.90us</td><td align="right">112.44us</td><td align="right">1225.48us</td><td align="right">138</td></tr>
|
||||
<tr><td>EAX&lt;AES128&gt;</td><td align="right">71.14us</td><td align="right">71.14us</td><td align="right">1311.97us</td><td align="right">268</td></tr>
|
||||
<tr><td>EAX&lt;AES256&gt;</td><td align="right">97.80us</td><td align="right">97.80us</td><td align="right">1806.57us</td><td align="right">332</td></tr>
|
||||
<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">27.27us</td><td align="right">27.26us</td><td align="right">760.74us</td><td align="right">362</td></tr>
|
||||
<tr><td>EAX&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">80.31us</td><td align="right">80.31us</td><td align="right">1316.60us</td><td align="right">122</td></tr>
|
||||
<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">25.89us</td><td align="right">25.88us</td><td align="right">690.63us</td><td align="right">362</td></tr>
|
||||
<tr><td>EAX&lt;SpeckTiny&gt; (256-bit key)</td><td align="right">78.20us</td><td align="right">78.20us</td><td align="right">1269.19us</td><td align="right">122</td></tr>
|
||||
<tr><td colspan="5"> </td></tr>
|
||||
<tr><td>Hash Algorithm</td><td align="right">Hashing (per byte)</td><td align="right">Finalization</td><td> </td><td>State Size (bytes)</td></tr>
|
||||
<tr><td>SHA256</td><td align="right">43.85us</td><td align="right">2841.04us</td><td align="right"> </td><td align="right">107</td></tr>
|
||||
|
892
gen/genspeck.c
Normal file
892
gen/genspeck.c
Normal file
@ -0,0 +1,892 @@
|
||||
/*
|
||||
* Copyright (C) 2016 Southern Storm Software, Pty Ltd.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Special-purpose compiler that generates the AVR version of Speck*.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
// Number of spaces written in front of every emitted line.
static int indent = 4;

// First register of each eight-register bank that holds a temporary
// 64-bit value (any registers may be used for these).
static int t1_reg = 8;
static int t2_reg = 16;

// First register of the X and Z pointer pairs.
static int x_reg = 26;
static int z_reg = 30;
//static int y_reg = 28;

// Register for temporary constants (must be a high reg).
static int const_reg = 24;

// Spare temporary register.
static int temp_reg = 25;
|
||||
|
||||
// Describes the bank of eight registers that stores one 64-bit quantity.
typedef struct {
    int first;      // Number of the first register in the bank.
    int offset;     // Byte offset that accounts for rotations by
                    // multiples of 8 bits.
} Reg64;
|
||||
|
||||
// Indent the code and print a string.
|
||||
void indent_printf(const char *format, ...)
|
||||
{
|
||||
va_list va;
|
||||
int posn;
|
||||
va_start(va, format);
|
||||
for (posn = 0; posn < indent; ++posn)
|
||||
putc(' ', stdout);
|
||||
vfprintf(stdout, format, va);
|
||||
va_end(va);
|
||||
}
|
||||
|
||||
// Print an assembler instruction within quotes.
|
||||
void insn_printf(const char *format, ...)
|
||||
{
|
||||
va_list va;
|
||||
int posn;
|
||||
va_start(va, format);
|
||||
for (posn = 0; posn < indent; ++posn)
|
||||
putc(' ', stdout);
|
||||
putc('"', stdout);
|
||||
vfprintf(stdout, format, va);
|
||||
putc('\\', stdout);
|
||||
putc('n', stdout);
|
||||
putc('"', stdout);
|
||||
putc('\n', stdout);
|
||||
va_end(va);
|
||||
}
|
||||
|
||||
// Number of the register that currently holds byte "n" of the 64-bit value
// described by "reg", taking the bank's rotation offset into account.
#define REGn(reg, n)    ((((reg)->offset + (n)) % 8) + (reg)->first)
|
||||
|
||||
// Emit the instructions that rotate the 64-bit value in "reg" left by 1 bit.
void leftRotate1(const Reg64 *reg)
{
    int byte;

    // Shift byte 0, then rotate the carry up through bytes 1..7.
    insn_printf("lsl r%d", REGn(reg, 0));
    for (byte = 1; byte < 8; ++byte)
        insn_printf("rol r%d", REGn(reg, byte));

    // Add the carry that left byte 7 back into byte 0.
    insn_printf("adc r%d, __zero_reg__", REGn(reg, 0));
}
|
||||
|
||||
// Emit a left rotation by 3 bits as three consecutive 1-bit rotations.
void leftRotate3(const Reg64 *reg)
{
    int count;

    for (count = 0; count < 3; ++count)
        leftRotate1(reg);
}
|
||||
|
||||
// Rotate left by 8 bits without emitting any code: only the byte offset
// recorded in "reg" is moved back by one position (modulo 8).
void leftRotate8(Reg64 *reg)
{
    int moved = reg->offset + 7;

    reg->offset = moved % 8;
}
|
||||
|
||||
// Emit the instructions that rotate the 64-bit value in "reg" right by 1 bit.
void rightRotate1(const Reg64 *reg)
{
    int byte;

    // Remember bit 0 of byte 0 in the T flag before it is shifted out.
    insn_printf("bst r%d,0", REGn(reg, 0));

    // Rotate every byte right, from byte 7 down to byte 0.
    for (byte = 7; byte >= 0; --byte)
        insn_printf("ror r%d", REGn(reg, byte));

    // Place the remembered bit into bit 7 of byte 7.
    insn_printf("bld r%d,7", REGn(reg, 7));
}
|
||||
|
||||
// Emit a right rotation by 3 bits as three consecutive 1-bit rotations.
void rightRotate3(const Reg64 *reg)
{
    int count;

    for (count = 0; count < 3; ++count)
        rightRotate1(reg);
}
|
||||
|
||||
// Rotate right by 8 bits without emitting any code: only the byte offset
// recorded in "reg" is advanced by one position (modulo 8).
void rightRotate8(Reg64 *reg)
{
    int moved = reg->offset + 1;

    reg->offset = moved % 8;
}
|
||||
|
||||
// Emit a 64-bit addition: dst += src.
void add64(const Reg64 *dst, const Reg64 *src)
{
    int byte;

    // Byte 0 uses "add"; bytes 1..7 use "adc" to carry between bytes.
    for (byte = 0; byte < 8; ++byte) {
        insn_printf("%s r%d,r%d", byte == 0 ? "add" : "adc",
                    REGn(dst, byte), REGn(src, byte));
    }
}
|
||||
|
||||
// Emit a 64-bit subtraction: dst -= src.
void sub64(const Reg64 *dst, const Reg64 *src)
{
    int byte;

    // Byte 0 uses "sub"; bytes 1..7 use "sbc" to borrow between bytes.
    for (byte = 0; byte < 8; ++byte) {
        insn_printf("%s r%d,r%d", byte == 0 ? "sub" : "sbc",
                    REGn(dst, byte), REGn(src, byte));
    }
}
|
||||
|
||||
// Emit a 64-bit exclusive-or: dst ^= src, one byte at a time.
void eor64(const Reg64 *dst, const Reg64 *src)
{
    int byte = 0;

    while (byte < 8) {
        insn_printf("eor r%d,r%d", REGn(dst, byte), REGn(src, byte));
        ++byte;
    }
}
|
||||
|
||||
// Emit code that XORs the value in "reg" with the next eight bytes read
// through Z (post-increment), i.e. with the next key schedule word.
void eor64Schedule(Reg64 *reg)
{
    int byte;

    // Byte 0 is combined in __tmp_reg__ because its own register is about
    // to be reused as the load target for the following schedule byte.
    insn_printf("ld __tmp_reg__,Z+");
    insn_printf("eor __tmp_reg__,r%d", REGn(reg, 0));

    // Each remaining schedule byte is loaded into the register one position
    // lower and XORed with the register one position higher.
    for (byte = 0; byte < 7; ++byte) {
        insn_printf("ld r%d,Z+", REGn(reg, byte));
        insn_printf("eor r%d,r%d", REGn(reg, byte), REGn(reg, byte + 1));
    }

    // The result for byte 0 ends up in the top register.
    insn_printf("mov r%d,__tmp_reg__", REGn(reg, 7));

    // The above operations also implicitly perform a right-rotation.
    // Undo it by left-rotating the offset back into the correct position.
    leftRotate8(reg);
}
|
||||
|
||||
// Emit code that XORs the value in "reg" with the eight bytes read
// backwards (pre-decrement) through the pointer register named "ptrReg".
void eor64ScheduleReversePtr(Reg64 *reg, const char *ptrReg)
{
    int byte;

    // Byte 7 is combined in __tmp_reg__ because its own register is about
    // to be reused as the load target for the following schedule byte.
    insn_printf("ld __tmp_reg__,-%s", ptrReg);
    insn_printf("eor __tmp_reg__,r%d", REGn(reg, 7));

    // Each remaining schedule byte is loaded into the register one position
    // higher and XORed with the register one position lower.
    for (byte = 7; byte > 0; --byte) {
        insn_printf("ld r%d,-%s", REGn(reg, byte), ptrReg);
        insn_printf("eor r%d,r%d", REGn(reg, byte), REGn(reg, byte - 1));
    }

    // The result for byte 7 ends up in the bottom register.
    insn_printf("mov r%d,__tmp_reg__", REGn(reg, 0));

    // The above operations also implicitly perform a left-rotation.
    // Undo it by right-rotating the offset back into the correct position.
    // We have to do this twice because the following step will apply
    // a left-rotation to put everything back where it belongs.
    for (byte = 0; byte < 2; ++byte)
        rightRotate8(reg);
}
|
||||
|
||||
// Reverse schedule XOR that reads the schedule through the Z pointer.
void eor64ScheduleReverse(Reg64 *reg)
{
    const char *pointer_name = "Z";

    eor64ScheduleReversePtr(reg, pointer_name);
}
|
||||
|
||||
// Reverse schedule XOR that reads the schedule through the X pointer.
void eor64ScheduleReverseX(Reg64 *reg)
{
    const char *pointer_name = "X";

    eor64ScheduleReversePtr(reg, pointer_name);
}
|
||||
|
||||
// Unpack the input block and convert from big-endian to little-endian.
|
||||
static void unpack_input(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 7));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 6));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 5));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 4));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 3));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 2));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 1));
|
||||
insn_printf("ld r%d,X+", REGn(&xreg, 0));
|
||||
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 7));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 6));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 5));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 4));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 3));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 2));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 1));
|
||||
insn_printf("ld r%d,X", REGn(&yreg, 0));
|
||||
}
|
||||
|
||||
// Emit eight post-increment loads through X into bytes 0..7 of "reg".
static void load_from_x(Reg64 *reg)
{
    int byte = 0;

    while (byte < 8) {
        insn_printf("ld r%d,X+", REGn(reg, byte));
        ++byte;
    }
}
|
||||
|
||||
// Emit eight post-increment stores through X of bytes 0..7 of "reg".
static void store_to_x(Reg64 *reg)
{
    int byte = 0;

    while (byte < 8) {
        insn_printf("st X+,r%d", REGn(reg, byte));
        ++byte;
    }
}
|
||||
|
||||
// Emit eight post-increment loads through Z into bytes 0..7 of "reg".
static void load_from_z(Reg64 *reg)
{
    int byte = 0;

    while (byte < 8) {
        insn_printf("ld r%d,Z+", REGn(reg, byte));
        ++byte;
    }
}
|
||||
|
||||
// Emit eight post-increment stores through Z of bytes 0..7 of "reg".
static void store_to_z(Reg64 *reg)
{
    int byte = 0;

    while (byte < 8) {
        insn_printf("st Z+,r%d", REGn(reg, byte));
        ++byte;
    }
}
|
||||
|
||||
// Emit pushes of the eight registers of "reg", lowest register first.
// The rotation offset of "reg" is reset to 0 before anything is emitted.
static void push64(Reg64 *reg)
{
    int byte;

    reg->offset = 0;
    for (byte = 0; byte < 8; ++byte)
        insn_printf("push r%d", REGn(reg, byte));
}
|
||||
|
||||
// Emit pops of the eight registers of "reg", highest register first
// (the mirror image of push64). The rotation offset of "reg" is reset
// to 0 before anything is emitted.
static void pop64(Reg64 *reg)
{
    int byte;

    reg->offset = 0;
    for (byte = 7; byte >= 0; --byte)
        insn_printf("pop r%d", REGn(reg, byte));
}
|
||||
|
||||
// Main loop for Speck::encryptBlock().
|
||||
static void full_enc_main_loop(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
// Top of the main loop.
|
||||
insn_printf("1:");
|
||||
|
||||
// x = (rightRotate8_64(x) + y) ^ *s++;
|
||||
rightRotate8(&xreg);
|
||||
add64(&xreg, &yreg);
|
||||
eor64Schedule(&xreg);
|
||||
|
||||
// y = leftRotate3_64(y) ^ x;
|
||||
leftRotate3(&yreg);
|
||||
eor64(&yreg, &xreg);
|
||||
|
||||
// Bottom of the main loop.
|
||||
insn_printf("dec %%2");
|
||||
insn_printf("breq 2f");
|
||||
insn_printf("rjmp 1b");
|
||||
insn_printf("2:");
|
||||
}
|
||||
|
||||
// Main loop for Speck::decryptBlock().
|
||||
static void full_dec_main_loop(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
// Top of the main loop.
|
||||
insn_printf("1:");
|
||||
|
||||
// y = rightRotate3_64(x ^ y);
|
||||
eor64(&yreg, &xreg);
|
||||
rightRotate3(&yreg);
|
||||
|
||||
// x = leftRotate8_64((x ^ *s--) - y);
|
||||
eor64ScheduleReverse(&xreg);
|
||||
leftRotate8(&xreg);
|
||||
sub64(&xreg, &yreg);
|
||||
|
||||
// Bottom of the main loop.
|
||||
insn_printf("dec %%2");
|
||||
insn_printf("breq 2f");
|
||||
insn_printf("rjmp 1b");
|
||||
insn_printf("2:");
|
||||
}
|
||||
|
||||
// Pack the output block and convert from little-endian to big-endian.
|
||||
static void pack_output(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
insn_printf("ldd r%d,%%A3", x_reg);
|
||||
insn_printf("ldd r%d,%%B3", x_reg + 1);
|
||||
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 7));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 6));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 5));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 4));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 3));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 2));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 1));
|
||||
insn_printf("st X+,r%d", REGn(&xreg, 0));
|
||||
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 7));
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 6));
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 5));
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 4));
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 3));
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 2));
|
||||
insn_printf("st X+,r%d", REGn(&yreg, 1));
|
||||
insn_printf("st X,r%d", REGn(&yreg, 0));
|
||||
}
|
||||
|
||||
static void temp_regs(void)
|
||||
{
|
||||
indent_printf(": \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
|
||||
"\"r%d\", \"r%d\", \"r%d\", \"r%d\",\n",
|
||||
t1_reg, t1_reg + 1, t1_reg + 2, t1_reg + 3,
|
||||
t1_reg + 4, t1_reg + 5, t1_reg + 6, t1_reg + 7);
|
||||
indent_printf(" \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
|
||||
"\"r%d\", \"r%d\", \"r%d\", \"r%d\", \"memory\"\n",
|
||||
t2_reg, t2_reg + 1, t2_reg + 2, t2_reg + 3,
|
||||
t2_reg + 4, t2_reg + 5, t2_reg + 6, t2_reg + 7);
|
||||
}
|
||||
|
||||
static void full_setkey(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
printf("void Speck::setKey(const uint8_t *key, size_t len)\n");
|
||||
printf("{\n");
|
||||
indent_printf("// Automatically generated by the genspeck tool.\n");
|
||||
|
||||
// Validate the key length.
|
||||
indent_printf("uint64_t l[4];\n");
|
||||
indent_printf("uint8_t m, mb;\n");
|
||||
indent_printf("if (len == 32) {\n");
|
||||
indent_printf(" m = 4;\n");
|
||||
indent_printf(" mb = 3 * 8;\n");
|
||||
indent_printf("} else if (len == 24) {\n");
|
||||
indent_printf(" m = 3;\n");
|
||||
indent_printf(" mb = 2 * 8;\n");
|
||||
indent_printf("} else if (len == 16) {\n");
|
||||
indent_printf(" m = 2;\n");
|
||||
indent_printf(" mb = 8;\n");
|
||||
indent_printf("} else {\n");
|
||||
indent_printf(" return false;\n");
|
||||
indent_printf("}\n");
|
||||
indent_printf("rounds = 30 + m;\n");
|
||||
indent_printf("uint8_t r = rounds - 1;\n");
|
||||
indent_printf("__asm__ __volatile__ (\n");
|
||||
indent += 4;
|
||||
|
||||
// Copy the key into k[0] and l while converting endianness.
|
||||
insn_printf("ld __tmp_reg__,-X"); // k[0] = last 8 bytes of the key
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("ld __tmp_reg__,-X");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("sbiw r%d,8", z_reg); // Set Z back to beginning of k
|
||||
insn_printf("movw r%d,r%d", t1_reg + 2, z_reg); // Save Z
|
||||
insn_printf("movw r%d,%%A2", z_reg); // Z = l
|
||||
insn_printf("ldd r%d,%%3", t1_reg);
|
||||
insn_printf("1:");
|
||||
insn_printf("ld __tmp_reg__,-X"); // Copy first mb bytes from key
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("dec r%d", t1_reg);
|
||||
insn_printf("brne 1b");
|
||||
insn_printf("movw r%d,%%A2", x_reg); // X = l
|
||||
insn_printf("movw r%d,r%d", z_reg, t1_reg + 2); // Z = k
|
||||
|
||||
// Expand the key to the full key schedule.
|
||||
// Note: We can use %A2 and %B2 as spare temporary registers now.
|
||||
insn_printf("clr %%A2"); // %A2 = li_in = 0
|
||||
insn_printf("ldd %%B2,%%3"); // %B2 = li_out = mb (= (m - 1) * 8)
|
||||
insn_printf("clr r%d", temp_reg); // i = 0
|
||||
load_from_z(&yreg); // y = k[i]
|
||||
insn_printf("2:");
|
||||
|
||||
// l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i
|
||||
insn_printf("add r%d,%%A2", x_reg); // x = rightRotate8_64(l[li_in])
|
||||
insn_printf("adc r%d,__zero_reg__", x_reg + 1);
|
||||
xreg.offset = 7;
|
||||
load_from_x(&xreg);
|
||||
xreg.offset = 0;
|
||||
insn_printf("sub r%d,%%A2", x_reg); // restore X to point at base of l
|
||||
insn_printf("sbc r%d,__zero_reg__", x_reg + 1);
|
||||
insn_printf("sbiw r%d,8", x_reg);
|
||||
add64(&xreg, &yreg); // x += y
|
||||
insn_printf("eor r%d,r%d", REGn(&xreg, 0), temp_reg); // x ^= i
|
||||
insn_printf("add r%d,%%B2", x_reg); // l[li_out] = x
|
||||
insn_printf("adc r%d,__zero_reg__", x_reg + 1);
|
||||
store_to_x(&xreg);
|
||||
insn_printf("sub r%d,%%B2", x_reg); // restore X to point at base of l
|
||||
insn_printf("sbc r%d,__zero_reg__", x_reg + 1);
|
||||
insn_printf("sbiw r%d,8", x_reg);
|
||||
|
||||
// k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
|
||||
leftRotate3(&yreg); // y = leftRotate3(y)
|
||||
eor64(&yreg, &xreg); // y ^= x
|
||||
store_to_z(&yreg); // k[i + 1] = y
|
||||
|
||||
// Advance li_in and li_out, wrapping around at the end of l.
|
||||
insn_printf("ldi r%d,8", const_reg);
|
||||
insn_printf("add %%A2,r%d", const_reg);
|
||||
insn_printf("add %%B2,r%d", const_reg);
|
||||
insn_printf("ldi r%d,0x1F", const_reg);
|
||||
insn_printf("and %%A2,r%d", const_reg);
|
||||
insn_printf("and %%B2,r%d", const_reg);
|
||||
|
||||
// Bottom of the loop.
|
||||
insn_printf("ldd r%d,%%4", t1_reg); // r8 = rounds - 1
|
||||
insn_printf("inc r%d", temp_reg); // ++i
|
||||
insn_printf("cp r%d,r%d", temp_reg, t1_reg);
|
||||
insn_printf("breq 3f");
|
||||
insn_printf("rjmp 2b");
|
||||
insn_printf("3:");
|
||||
|
||||
// Clean the l array. X register should still be pointing to it.
|
||||
insn_printf("ldi r%d,32", const_reg);
|
||||
insn_printf("4:");
|
||||
insn_printf("st X+,__zero_reg__");
|
||||
insn_printf("dec r%d", const_reg);
|
||||
insn_printf("brne 4b");
|
||||
|
||||
// Declare the registers that we need.
|
||||
indent_printf(": : \"z\"(k), \"x\"(key + len), \"r\"(l), \"Q\"(mb), \"Q\"(r)\n");
|
||||
temp_regs();
|
||||
indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg);
|
||||
indent -= 4;
|
||||
indent_printf(");\n");
|
||||
|
||||
// End of function.
|
||||
indent_printf("return true;\n");
|
||||
printf("}\n\n");
|
||||
}
|
||||
|
||||
static void full_enc(void)
|
||||
{
|
||||
printf("void Speck::encryptBlock(uint8_t *output, const uint8_t *input)\n");
|
||||
printf("{\n");
|
||||
indent_printf("// Automatically generated by the genspeck tool.\n");
|
||||
indent_printf("__asm__ __volatile__ (\n");
|
||||
indent += 4;
|
||||
unpack_input();
|
||||
full_enc_main_loop();
|
||||
pack_output();
|
||||
indent_printf(": : \"x\"(input), \"z\"(k), \"r\"(rounds), \"Q\"(output)\n");
|
||||
temp_regs();
|
||||
indent -= 4;
|
||||
indent_printf(");\n");
|
||||
printf("}\n\n");
|
||||
}
|
||||
|
||||
static void full_dec(void)
|
||||
{
|
||||
printf("void Speck::decryptBlock(uint8_t *output, const uint8_t *input)\n");
|
||||
printf("{\n");
|
||||
indent_printf("// Automatically generated by the genspeck tool.\n");
|
||||
indent_printf("__asm__ __volatile__ (\n");
|
||||
indent += 4;
|
||||
unpack_input();
|
||||
full_dec_main_loop();
|
||||
pack_output();
|
||||
indent_printf(": : \"x\"(input), \"z\"(k + rounds), \"r\"(rounds), \"Q\"(output)\n");
|
||||
temp_regs();
|
||||
indent -= 4;
|
||||
indent_printf(");\n");
|
||||
printf("}\n\n");
|
||||
}
|
||||
|
||||
static void tiny_enc(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
printf("void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input)\n");
|
||||
printf("{\n");
|
||||
indent_printf("// Automatically generated by the genspeck tool.\n");
|
||||
indent_printf("uint64_t l[5];\n");
|
||||
indent_printf("uint8_t r = rounds;\n");
|
||||
indent_printf("uint8_t mb = (r - 31) * 8;\n");
|
||||
|
||||
// Copy the "k" array into the "l" array. The first element is "s"
|
||||
// and the rest of the elements make up the normal l[0..3] values.
|
||||
indent_printf("__asm__ __volatile__ (\n");
|
||||
indent += 4;
|
||||
insn_printf("movw r%d,r%d", t1_reg, z_reg); // Save Z
|
||||
insn_printf("ldd r%d,%%4", t2_reg);
|
||||
insn_printf("ldi r%d,8", const_reg);
|
||||
insn_printf("add r%d,r%d", t2_reg, const_reg);
|
||||
insn_printf("1:");
|
||||
insn_printf("ld __tmp_reg__,X+");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("dec r%d", t2_reg);
|
||||
insn_printf("brne 1b");
|
||||
insn_printf("movw r%d,r%d", z_reg, t1_reg); // Restore Z to point at l
|
||||
|
||||
// Unpack the input. %A2 and %B2 are free temporary registers after this.
|
||||
insn_printf("movw r%d,%%A2", x_reg);
|
||||
unpack_input();
|
||||
|
||||
// Top of the loop.
|
||||
insn_printf("clr %%A2"); // %A2 = li_in = 0
|
||||
insn_printf("ldd %%B2,%%4"); // %B2 = li_out = mb
|
||||
insn_printf("clr r%d", temp_reg); // i = 0
|
||||
insn_printf("2:");
|
||||
|
||||
// Adjust x and y for this round using the key schedule word s (in l[0]).
|
||||
// x = (rightRotate8_64(x) + y) ^ s;
|
||||
rightRotate8(&xreg);
|
||||
add64(&xreg, &yreg);
|
||||
eor64Schedule(&xreg);
|
||||
// y = leftRotate3_64(y) ^ x;
|
||||
leftRotate3(&yreg);
|
||||
eor64(&yreg, &xreg);
|
||||
// At this point, Z has been incremented to point at l[1] which
|
||||
// is the start of the actual l[0] from the original formulation.
|
||||
|
||||
// If this is the last round, then we are done. There is no
|
||||
// point calculating another key schedule element.
|
||||
insn_printf("mov __tmp_reg__,r%d", temp_reg);
|
||||
insn_printf("inc __tmp_reg__");
|
||||
insn_printf("ldd r%d,%%5", const_reg);
|
||||
insn_printf("cp __tmp_reg__,r%d", const_reg);
|
||||
insn_printf("brne 3f");
|
||||
insn_printf("rjmp 4f");
|
||||
insn_printf("3:");
|
||||
|
||||
// Save x and y on the stack - we need the registers to
|
||||
// help us compute the next key schedule element.
|
||||
push64(&xreg);
|
||||
push64(&yreg);
|
||||
|
||||
// Compute the key schedule word s for the next round.
|
||||
insn_printf("sbiw r%d,8", z_reg); // Point Z back at l[0]
|
||||
// l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i;
|
||||
insn_printf("ld r%d,Z", REGn(&yreg, 0)); // y = s
|
||||
insn_printf("ldd r%d,Z+1", REGn(&yreg, 1));
|
||||
insn_printf("ldd r%d,Z+2", REGn(&yreg, 2));
|
||||
insn_printf("ldd r%d,Z+3", REGn(&yreg, 3));
|
||||
insn_printf("ldd r%d,Z+4", REGn(&yreg, 4));
|
||||
insn_printf("ldd r%d,Z+5", REGn(&yreg, 5));
|
||||
insn_printf("ldd r%d,Z+6", REGn(&yreg, 6));
|
||||
insn_printf("ldd r%d,Z+7", REGn(&yreg, 7));
|
||||
insn_printf("add r%d,%%A2", z_reg); // Z = &(l[li_in]) - 8
|
||||
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
|
||||
leftRotate8(&xreg); // x = rightRotate8(l[li_in])
|
||||
insn_printf("ldd r%d,Z+8", REGn(&xreg, 0));
|
||||
insn_printf("ldd r%d,Z+9", REGn(&xreg, 1));
|
||||
insn_printf("ldd r%d,Z+10", REGn(&xreg, 2));
|
||||
insn_printf("ldd r%d,Z+11", REGn(&xreg, 3));
|
||||
insn_printf("ldd r%d,Z+12", REGn(&xreg, 4));
|
||||
insn_printf("ldd r%d,Z+13", REGn(&xreg, 5));
|
||||
insn_printf("ldd r%d,Z+14", REGn(&xreg, 6));
|
||||
insn_printf("ldd r%d,Z+15", REGn(&xreg, 7));
|
||||
rightRotate8(&xreg);
|
||||
add64(&xreg, &yreg); // x += y
|
||||
insn_printf("eor r%d,r%d", REGn(&xreg, 0), temp_reg); // x ^= i
|
||||
insn_printf("sub r%d,%%A2", z_reg); // Z = &(l[li_out]) - 8
|
||||
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
|
||||
insn_printf("add r%d,%%B2", z_reg);
|
||||
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
|
||||
insn_printf("std Z+8,r%d", REGn(&xreg, 0)); // l[li_out] = x
|
||||
insn_printf("std Z+9,r%d", REGn(&xreg, 1));
|
||||
insn_printf("std Z+10,r%d", REGn(&xreg, 2));
|
||||
insn_printf("std Z+11,r%d", REGn(&xreg, 3));
|
||||
insn_printf("std Z+12,r%d", REGn(&xreg, 4));
|
||||
insn_printf("std Z+13,r%d", REGn(&xreg, 5));
|
||||
insn_printf("std Z+14,r%d", REGn(&xreg, 6));
|
||||
insn_printf("std Z+15,r%d", REGn(&xreg, 7));
|
||||
insn_printf("sub r%d,%%B2", z_reg); // Restore Z to base of l array
|
||||
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
|
||||
// s = leftRotate3_64(s) ^ l[li_out];
|
||||
leftRotate3(&yreg);
|
||||
eor64(&yreg, &xreg);
|
||||
insn_printf("st Z,r%d", REGn(&yreg, 0));
|
||||
insn_printf("std Z+1,r%d", REGn(&yreg, 1));
|
||||
insn_printf("std Z+2,r%d", REGn(&yreg, 2));
|
||||
insn_printf("std Z+3,r%d", REGn(&yreg, 3));
|
||||
insn_printf("std Z+4,r%d", REGn(&yreg, 4));
|
||||
insn_printf("std Z+5,r%d", REGn(&yreg, 5));
|
||||
insn_printf("std Z+6,r%d", REGn(&yreg, 6));
|
||||
insn_printf("std Z+7,r%d", REGn(&yreg, 7));
|
||||
|
||||
// Advance li_in and li_out, wrapping around at the end of l.
|
||||
insn_printf("ldi r%d,8", const_reg);
|
||||
insn_printf("add %%A2,r%d", const_reg);
|
||||
insn_printf("add %%B2,r%d", const_reg);
|
||||
insn_printf("ldi r%d,0x1F", const_reg);
|
||||
insn_printf("and %%A2,r%d", const_reg);
|
||||
insn_printf("and %%B2,r%d", const_reg);
|
||||
|
||||
// Restore the original x and y.
|
||||
pop64(&yreg);
|
||||
pop64(&xreg);
|
||||
|
||||
// Bottom of the loop.
|
||||
insn_printf("inc r%d", temp_reg); // i++
|
||||
insn_printf("rjmp 2b");
|
||||
insn_printf("4:");
|
||||
|
||||
// Pack the results into the output buffer.
|
||||
pack_output();
|
||||
|
||||
// Declare the registers that we need.
|
||||
indent_printf(": : \"x\"(k), \"z\"(l), \"r\"(input), \"Q\"(output), \"Q\"(mb), \"Q\"(r)\n");
|
||||
temp_regs();
|
||||
indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg);
|
||||
indent -= 4;
|
||||
indent_printf(");\n");
|
||||
printf("}\n\n");
|
||||
}
|
||||
|
||||
static void small_dec(void)
|
||||
{
|
||||
Reg64 xreg = {t1_reg, 0};
|
||||
Reg64 yreg = {t2_reg, 0};
|
||||
|
||||
printf("void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input)\n");
|
||||
printf("{\n");
|
||||
indent_printf("// Automatically generated by the genspeck tool.\n");
|
||||
indent_printf("uint64_t l[5];\n");
|
||||
indent_printf("uint8_t r = rounds;\n");
|
||||
indent_printf("uint8_t li_in = ((r + 3) & 0x03) * 8;\n");
|
||||
indent_printf("uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F;\n");
|
||||
indent_printf("__asm__ __volatile__ (\n");
|
||||
indent += 4;
|
||||
|
||||
// Copy the this->l array into the local l array. Then copy
|
||||
// the "s" value from l[li_out] to l[4].
|
||||
insn_printf("ldd r%d,%%4", temp_reg); // r25 = li_out
|
||||
insn_printf("ldi r%d,32", const_reg); // Copy 32 bytes from this->l.
|
||||
insn_printf("1:");
|
||||
insn_printf("ld __tmp_reg__,X+");
|
||||
insn_printf("st Z+,__tmp_reg__");
|
||||
insn_printf("dec r%d", const_reg);
|
||||
insn_printf("brne 1b");
|
||||
insn_printf("movw r%d,r%d", x_reg, z_reg); // X = Z + 32
|
||||
insn_printf("sbiw r%d,32", z_reg); // Z = &(l[li_out])
|
||||
insn_printf("add r%d,r%d", z_reg, temp_reg);
|
||||
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
|
||||
insn_printf("ld __tmp_reg__,Z"); // Copy l[li_out] to l[4]
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+1");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+2");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+3");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+4");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+5");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+6");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("ldd __tmp_reg__,Z+7");
|
||||
insn_printf("st X+,__tmp_reg__");
|
||||
insn_printf("sub r%d,r%d", z_reg, temp_reg); // Z = &(l[0])
|
||||
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
|
||||
|
||||
// Unpack the input. %A2 and %B2 are free temporary registers after this.
|
||||
insn_printf("movw r%d,%%A2", x_reg);
|
||||
unpack_input();
|
||||
|
||||
// Top of the loop.
|
||||
insn_printf("ldd %%A2,%%6"); // %A2 = li_in
|
||||
insn_printf("mov %%B2,r%d", temp_reg); // %B2 = li_out
|
||||
insn_printf("ldd r%d,%%5", temp_reg); // i = rounds - 1
|
||||
insn_printf("dec r%d", temp_reg);
|
||||
insn_printf("movw r%d,r%d", x_reg, z_reg); // X = Z + 40 = &(l[5])
|
||||
insn_printf("adiw r%d,40", x_reg); // i.e. point to end of l[4]
|
||||
insn_printf("2:");
|
||||
|
||||
// Adjust x and y for this round using the key schedule word s (in l[4]).
|
||||
// y = rightRotate3_64(x ^ y);
|
||||
eor64(&yreg, &xreg);
|
||||
rightRotate3(&yreg);
|
||||
// x = leftRotate8_64((x ^ s) - y);
|
||||
eor64ScheduleReverseX(&xreg);
|
||||
leftRotate8(&xreg);
|
||||
sub64(&xreg, &yreg);
|
||||
|
||||
// If this is the last round, then we are done. There is no
|
||||
// point calculating another key schedule element.
|
||||
insn_printf("or r%d,r%d", temp_reg, temp_reg); // if (i == 0)
|
||||
insn_printf("brne 3f");
|
||||
insn_printf("rjmp 4f");
|
||||
insn_printf("3:");
|
||||
insn_printf("dec r%d", temp_reg); // --i
|
||||
|
||||
// Save x and y on the stack - we need the registers to
|
||||
// help us compute the next key schedule element.
|
||||
push64(&xreg);
|
||||
push64(&yreg);
|
||||
|
||||
// Move li_in and li_out backwards, wrapping around at the start of l.
|
||||
insn_printf("ldi r%d,24", const_reg);
|
||||
insn_printf("add %%A2,r%d", const_reg);
|
||||
insn_printf("add %%B2,r%d", const_reg);
|
||||
insn_printf("ldi r%d,0x1F", const_reg);
|
||||
insn_printf("and %%A2,r%d", const_reg);
|
||||
insn_printf("and %%B2,r%d", const_reg);
|
||||
|
||||
// Compute the key schedule word s for the next round.
|
||||
// s = rightRotate3_64(s ^ l[li_out]);
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 0)); // y = s = l[4]
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 1));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 2));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 3));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 4));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 5));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 6));
|
||||
insn_printf("ld r%d,X+", REGn(&yreg, 7));
|
||||
insn_printf("add r%d,%%B2", z_reg); // Z = &(l[li_out])
|
||||
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
|
||||
insn_printf("ld r%d,Z", REGn(&xreg, 0)); // x = l[li_out]
|
||||
insn_printf("ldd r%d,Z+1", REGn(&xreg, 1));
|
||||
insn_printf("ldd r%d,Z+2", REGn(&xreg, 2));
|
||||
insn_printf("ldd r%d,Z+3", REGn(&xreg, 3));
|
||||
insn_printf("ldd r%d,Z+4", REGn(&xreg, 4));
|
||||
insn_printf("ldd r%d,Z+5", REGn(&xreg, 5));
|
||||
insn_printf("ldd r%d,Z+6", REGn(&xreg, 6));
|
||||
insn_printf("ldd r%d,Z+7", REGn(&xreg, 7));
|
||||
insn_printf("sub r%d,%%B2", z_reg); // Z = &(l[0])
|
||||
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
|
||||
eor64(&yreg, &xreg);
|
||||
rightRotate3(&yreg);
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 7)); // store s back into l[4]
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 6));
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 5));
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 4));
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 3));
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 2));
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 1));
|
||||
insn_printf("st -X,r%d", REGn(&yreg, 0));
|
||||
insn_printf("adiw r%d,8", x_reg); // X = &(l[5])
|
||||
// l[li_in] = leftRotate8_64((l[li_out] ^ i) - s);
|
||||
insn_printf("eor r%d,r%d", t1_reg, temp_reg); // x ^= i
|
||||
sub64(&xreg, &yreg); // x -= s
|
||||
leftRotate8(&xreg); // x = leftRotate8(x)
|
||||
insn_printf("add r%d,%%A2", z_reg); // Z = &(l[li_in])
|
||||
insn_printf("adc r%d,__zero_reg__", z_reg + 1);
|
||||
insn_printf("st Z,r%d", REGn(&xreg, 0)); // l[li_in] = x
|
||||
insn_printf("std Z+1,r%d", REGn(&xreg, 1));
|
||||
insn_printf("std Z+2,r%d", REGn(&xreg, 2));
|
||||
insn_printf("std Z+3,r%d", REGn(&xreg, 3));
|
||||
insn_printf("std Z+4,r%d", REGn(&xreg, 4));
|
||||
insn_printf("std Z+5,r%d", REGn(&xreg, 5));
|
||||
insn_printf("std Z+6,r%d", REGn(&xreg, 6));
|
||||
insn_printf("std Z+7,r%d", REGn(&xreg, 7));
|
||||
insn_printf("sub r%d,%%A2", z_reg); // Z = &(l[0])
|
||||
insn_printf("sbc r%d,__zero_reg__", z_reg + 1);
|
||||
|
||||
// Restore the original x and y.
|
||||
pop64(&yreg);
|
||||
pop64(&xreg);
|
||||
|
||||
// Bottom of the loop.
|
||||
insn_printf("rjmp 2b");
|
||||
insn_printf("4:");
|
||||
|
||||
// Pack the results into the output buffer.
|
||||
pack_output();
|
||||
|
||||
// Declare the registers that we need.
|
||||
indent_printf(": : \"x\"(this->l), \"z\"(l), \"r\"(input), \"Q\"(output), \"Q\"(li_out), \"Q\"(r), \"Q\"(li_in)\n");
|
||||
temp_regs();
|
||||
indent_printf(", \"r%d\", \"r%d\"\n", const_reg, temp_reg);
|
||||
indent -= 4;
|
||||
indent_printf(");\n");
|
||||
printf("}\n\n");
|
||||
}
|
||||
|
||||
// Program entry point.  Runs each generator routine once, in a fixed
// order, and always exits with status 0.  The command-line arguments
// are not consulted.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // Generators whose names carry the "full_" prefix.
    full_setkey();
    full_enc();
    full_dec();

    // The "tiny_" encryption generator.
    tiny_enc();

    // SpeckSmall::decryptBlock().
    small_dec();

    return 0;
}
|
@ -105,6 +105,7 @@ size_t Speck::keySize() const
|
||||
bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
{
|
||||
#if USE_AVR_INLINE_ASM
|
||||
// Automatically generated by the genspeck tool.
|
||||
uint64_t l[4];
|
||||
uint8_t m, mb;
|
||||
if (len == 32) {
|
||||
@ -120,44 +121,39 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
return false;
|
||||
}
|
||||
rounds = 30 + m;
|
||||
|
||||
// Copy the first (m - 1) * 8 bytes of the key into the "l" array
|
||||
// in reverse order to convert big endian into little-endian.
|
||||
uint8_t r = rounds - 1;
|
||||
__asm__ __volatile__ (
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"sbiw r30,8\n"
|
||||
"movw r10,r30\n"
|
||||
"movw r30,%A2\n"
|
||||
"ldd r8,%3\n"
|
||||
"1:\n"
|
||||
"ld __tmp_reg__,-Z\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"dec %2\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"dec r8\n"
|
||||
"brne 1b\n"
|
||||
: : "x"(l), "z"(key + len - 8), "r"(mb)
|
||||
);
|
||||
|
||||
// Copy the final 8 bytes of the key into k[0] in reverse order.
|
||||
__asm__ __volatile__ (
|
||||
"1:\n"
|
||||
"ld __tmp_reg__,-Z\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"dec %2\n"
|
||||
"brne 1b\n"
|
||||
: : "x"(k), "z"(key + len), "r"(8)
|
||||
);
|
||||
|
||||
// Expand the key to the full key schedule.
|
||||
uint8_t li_in = 0;
|
||||
uint8_t li_out = m - 1;
|
||||
for (uint8_t i = 0; i < (rounds - 1); ++i) {
|
||||
__asm__ __volatile__ (
|
||||
// l[li_out] = (k[i] + rightRotate8_64(l[li_in])) ^ i;
|
||||
"ld r15,X+\n" // x = rightRotate8_64(l[li_in])
|
||||
"ld r8,X+\n"
|
||||
"ld r9,X+\n"
|
||||
"ld r10,X+\n"
|
||||
"ld r11,X+\n"
|
||||
"ld r12,X+\n"
|
||||
"ld r13,X+\n"
|
||||
"ld r14,X+\n"
|
||||
|
||||
"ld r16,Z+\n" // y = k[i]
|
||||
"movw r26,%A2\n"
|
||||
"movw r30,r10\n"
|
||||
"clr %A2\n"
|
||||
"ldd %B2,%3\n"
|
||||
"clr r25\n"
|
||||
"ld r16,Z+\n"
|
||||
"ld r17,Z+\n"
|
||||
"ld r18,Z+\n"
|
||||
"ld r19,Z+\n"
|
||||
@ -165,8 +161,21 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"ld r21,Z+\n"
|
||||
"ld r22,Z+\n"
|
||||
"ld r23,Z+\n"
|
||||
|
||||
"add r8,r16\n" // x += y
|
||||
"2:\n"
|
||||
"add r26,%A2\n"
|
||||
"adc r27,__zero_reg__\n"
|
||||
"ld r15,X+\n"
|
||||
"ld r8,X+\n"
|
||||
"ld r9,X+\n"
|
||||
"ld r10,X+\n"
|
||||
"ld r11,X+\n"
|
||||
"ld r12,X+\n"
|
||||
"ld r13,X+\n"
|
||||
"ld r14,X+\n"
|
||||
"sub r26,%A2\n"
|
||||
"sbc r27,__zero_reg__\n"
|
||||
"sbiw r26,8\n"
|
||||
"add r8,r16\n"
|
||||
"adc r9,r17\n"
|
||||
"adc r10,r18\n"
|
||||
"adc r11,r19\n"
|
||||
@ -174,11 +183,9 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"adc r13,r21\n"
|
||||
"adc r14,r22\n"
|
||||
"adc r15,r23\n"
|
||||
|
||||
"eor r8,%3\n" // x ^= i
|
||||
|
||||
// k[i + 1] = leftRotate3_64(k[i]) ^ l[li_out];
|
||||
"movw r26,%A2\n" // l[li_out] = x
|
||||
"eor r8,r25\n"
|
||||
"add r26,%B2\n"
|
||||
"adc r27,__zero_reg__\n"
|
||||
"st X+,r8\n"
|
||||
"st X+,r9\n"
|
||||
"st X+,r10\n"
|
||||
@ -187,8 +194,10 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"st X+,r13\n"
|
||||
"st X+,r14\n"
|
||||
"st X+,r15\n"
|
||||
|
||||
"lsl r16\n" // y = leftRotate1_64(y)
|
||||
"sub r26,%B2\n"
|
||||
"sbc r27,__zero_reg__\n"
|
||||
"sbiw r26,8\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
@ -196,9 +205,8 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16,__zero_reg__\n"
|
||||
|
||||
"lsl r16\n" // y = leftRotate1_64(y)
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
@ -206,9 +214,8 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16,__zero_reg__\n"
|
||||
|
||||
"lsl r16\n" // y = leftRotate1_64(y)
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
@ -216,9 +223,8 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16,__zero_reg__\n"
|
||||
|
||||
"eor r16,r8\n" // y ^= x
|
||||
"adc r16, __zero_reg__\n"
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
@ -226,8 +232,7 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
|
||||
"st Z+,r16\n" // k[i + 1] = y
|
||||
"st Z+,r16\n"
|
||||
"st Z+,r17\n"
|
||||
"st Z+,r18\n"
|
||||
"st Z+,r19\n"
|
||||
@ -235,19 +240,29 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
"st Z+,r21\n"
|
||||
"st Z+,r22\n"
|
||||
"st Z+,r23\n"
|
||||
|
||||
: : "z"(&(k[i])), "x"(&(l[li_in])),
|
||||
"r"(&(l[li_out])),
|
||||
"r"(i)
|
||||
"ldi r24,8\n"
|
||||
"add %A2,r24\n"
|
||||
"add %B2,r24\n"
|
||||
"ldi r24,0x1F\n"
|
||||
"and %A2,r24\n"
|
||||
"and %B2,r24\n"
|
||||
"ldd r8,%4\n"
|
||||
"inc r25\n"
|
||||
"cp r25,r8\n"
|
||||
"breq 3f\n"
|
||||
"rjmp 2b\n"
|
||||
"3:\n"
|
||||
"ldi r24,32\n"
|
||||
"4:\n"
|
||||
"st X+,__zero_reg__\n"
|
||||
"dec r24\n"
|
||||
"brne 4b\n"
|
||||
: : "z"(k), "x"(key + len), "r"(l), "Q"(mb), "Q"(r)
|
||||
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
|
||||
"r24", "r25"
|
||||
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
|
||||
, "r24", "r25"
|
||||
);
|
||||
if ((++li_in) >= m)
|
||||
li_in = 0;
|
||||
if ((++li_out) >= m)
|
||||
li_out = 0;
|
||||
}
|
||||
return true;
|
||||
#else
|
||||
uint64_t l[4];
|
||||
uint8_t m;
|
||||
@ -280,138 +295,118 @@ bool Speck::setKey(const uint8_t *key, size_t len)
|
||||
if ((++li_out) >= m)
|
||||
li_out = 0;
|
||||
}
|
||||
#endif
|
||||
clean(l);
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
|
||||
{
|
||||
#if USE_AVR_INLINE_ASM
|
||||
uint32_t xlow, xhigh, ylow, yhigh;
|
||||
|
||||
// Unpack the input into the x and y variables, converting
|
||||
// from big-endian into little-endian in the process.
|
||||
__asm__ __volatile__ (
|
||||
"ld %D1,Z\n"
|
||||
"ldd %C1,Z+1\n"
|
||||
"ldd %B1,Z+2\n"
|
||||
"ldd %A1,Z+3\n"
|
||||
"ldd %D0,Z+4\n"
|
||||
"ldd %C0,Z+5\n"
|
||||
"ldd %B0,Z+6\n"
|
||||
"ldd %A0,Z+7\n"
|
||||
"ldd %D3,Z+8\n"
|
||||
"ldd %C3,Z+9\n"
|
||||
"ldd %B3,Z+10\n"
|
||||
"ldd %A3,Z+11\n"
|
||||
"ldd %D2,Z+12\n"
|
||||
"ldd %C2,Z+13\n"
|
||||
"ldd %B2,Z+14\n"
|
||||
"ldd %A2,Z+15\n"
|
||||
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
|
||||
: "z"(input)
|
||||
);
|
||||
|
||||
// Perform all encryption rounds. Z points to the key schedule.
|
||||
// Automatically generated by the genspeck tool.
|
||||
__asm__ __volatile__ (
|
||||
"ld r15,X+\n"
|
||||
"ld r14,X+\n"
|
||||
"ld r13,X+\n"
|
||||
"ld r12,X+\n"
|
||||
"ld r11,X+\n"
|
||||
"ld r10,X+\n"
|
||||
"ld r9,X+\n"
|
||||
"ld r8,X+\n"
|
||||
"ld r23,X+\n"
|
||||
"ld r22,X+\n"
|
||||
"ld r21,X+\n"
|
||||
"ld r20,X+\n"
|
||||
"ld r19,X+\n"
|
||||
"ld r18,X+\n"
|
||||
"ld r17,X+\n"
|
||||
"ld r16,X\n"
|
||||
"1:\n"
|
||||
// x = (rightRotate8_64(x) + y) ^ *s++;
|
||||
"add %B0,%A2\n" // x = rightRotate8_64(x), x += y
|
||||
"adc %C0,%B2\n" // Note: right rotate is implicit.
|
||||
"adc %D0,%C2\n"
|
||||
"adc %A1,%D2\n"
|
||||
"adc %B1,%A3\n"
|
||||
"adc %C1,%B3\n"
|
||||
"adc %D1,%C3\n"
|
||||
"adc %A0,%D3\n"
|
||||
|
||||
"ld __tmp_reg__,Z+\n" // x ^= *s++
|
||||
"eor __tmp_reg__,%B0\n" // Also fully apply the right rotate.
|
||||
"ld %B0,Z+\n"
|
||||
"eor %B0,%C0\n"
|
||||
"ld %C0,Z+\n"
|
||||
"eor %C0,%D0\n"
|
||||
"ld %D0,Z+\n"
|
||||
"eor %D0,%A1\n"
|
||||
"ld %A1,Z+\n"
|
||||
"eor %A1,%B1\n"
|
||||
"ld %B1,Z+\n"
|
||||
"eor %B1,%C1\n"
|
||||
"ld %C1,Z+\n"
|
||||
"eor %C1,%D1\n"
|
||||
"ld %D1,Z+\n"
|
||||
"eor %D1,%A0\n"
|
||||
"mov %A0,__tmp_reg__\n"
|
||||
|
||||
// y = leftRotate3_64(y) ^ x;
|
||||
"lsl %A2\n" // y = leftRotate1_64(y)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"lsl %A2\n" // y = leftRotate1_64(y)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"lsl %A2\n" // y = leftRotate1_64(y)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"eor %A2,%A0\n" // y ^= x
|
||||
"eor %B2,%B0\n"
|
||||
"eor %C2,%C0\n"
|
||||
"eor %D2,%D0\n"
|
||||
"eor %A3,%A1\n"
|
||||
"eor %B3,%B1\n"
|
||||
"eor %C3,%C1\n"
|
||||
"eor %D3,%D1\n"
|
||||
|
||||
// Loop
|
||||
"dec %5\n" // --round
|
||||
"add r9,r16\n"
|
||||
"adc r10,r17\n"
|
||||
"adc r11,r18\n"
|
||||
"adc r12,r19\n"
|
||||
"adc r13,r20\n"
|
||||
"adc r14,r21\n"
|
||||
"adc r15,r22\n"
|
||||
"adc r8,r23\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"eor __tmp_reg__,r9\n"
|
||||
"ld r9,Z+\n"
|
||||
"eor r9,r10\n"
|
||||
"ld r10,Z+\n"
|
||||
"eor r10,r11\n"
|
||||
"ld r11,Z+\n"
|
||||
"eor r11,r12\n"
|
||||
"ld r12,Z+\n"
|
||||
"eor r12,r13\n"
|
||||
"ld r13,Z+\n"
|
||||
"eor r13,r14\n"
|
||||
"ld r14,Z+\n"
|
||||
"eor r14,r15\n"
|
||||
"ld r15,Z+\n"
|
||||
"eor r15,r8\n"
|
||||
"mov r8,__tmp_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
"eor r20,r12\n"
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
"dec %2\n"
|
||||
"breq 2f\n"
|
||||
"rjmp 1b\n"
|
||||
"2:\n"
|
||||
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
|
||||
: "z"(k), "r"(rounds)
|
||||
);
|
||||
|
||||
// Pack the results into the output and convert back to big-endian.
|
||||
__asm__ __volatile__ (
|
||||
"st Z,%D1\n"
|
||||
"std Z+1,%C1\n"
|
||||
"std Z+2,%B1\n"
|
||||
"std Z+3,%A1\n"
|
||||
"std Z+4,%D0\n"
|
||||
"std Z+5,%C0\n"
|
||||
"std Z+6,%B0\n"
|
||||
"std Z+7,%A0\n"
|
||||
"std Z+8,%D3\n"
|
||||
"std Z+9,%C3\n"
|
||||
"std Z+10,%B3\n"
|
||||
"std Z+11,%A3\n"
|
||||
"std Z+12,%D2\n"
|
||||
"std Z+13,%C2\n"
|
||||
"std Z+14,%B2\n"
|
||||
"std Z+15,%A2\n"
|
||||
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
|
||||
"ldd r26,%A3\n"
|
||||
"ldd r27,%B3\n"
|
||||
"st X+,r15\n"
|
||||
"st X+,r14\n"
|
||||
"st X+,r13\n"
|
||||
"st X+,r12\n"
|
||||
"st X+,r11\n"
|
||||
"st X+,r10\n"
|
||||
"st X+,r9\n"
|
||||
"st X+,r8\n"
|
||||
"st X+,r23\n"
|
||||
"st X+,r22\n"
|
||||
"st X+,r21\n"
|
||||
"st X+,r20\n"
|
||||
"st X+,r19\n"
|
||||
"st X+,r18\n"
|
||||
"st X+,r17\n"
|
||||
"st X,r16\n"
|
||||
: : "x"(input), "z"(k), "r"(rounds), "Q"(output)
|
||||
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
|
||||
);
|
||||
#else
|
||||
uint64_t x, y;
|
||||
@ -430,133 +425,113 @@ void Speck::encryptBlock(uint8_t *output, const uint8_t *input)
|
||||
void Speck::decryptBlock(uint8_t *output, const uint8_t *input)
|
||||
{
|
||||
#if USE_AVR_INLINE_ASM
|
||||
uint32_t xlow, xhigh, ylow, yhigh;
|
||||
|
||||
// Unpack the input into the x and y variables, converting
|
||||
// from big-endian into little-endian in the process.
|
||||
__asm__ __volatile__ (
|
||||
"ld %D1,Z\n"
|
||||
"ldd %C1,Z+1\n"
|
||||
"ldd %B1,Z+2\n"
|
||||
"ldd %A1,Z+3\n"
|
||||
"ldd %D0,Z+4\n"
|
||||
"ldd %C0,Z+5\n"
|
||||
"ldd %B0,Z+6\n"
|
||||
"ldd %A0,Z+7\n"
|
||||
"ldd %D3,Z+8\n"
|
||||
"ldd %C3,Z+9\n"
|
||||
"ldd %B3,Z+10\n"
|
||||
"ldd %A3,Z+11\n"
|
||||
"ldd %D2,Z+12\n"
|
||||
"ldd %C2,Z+13\n"
|
||||
"ldd %B2,Z+14\n"
|
||||
"ldd %A2,Z+15\n"
|
||||
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
|
||||
: "z"(input)
|
||||
);
|
||||
|
||||
// Perform all decryption rounds. Z points to the end of key schedule.
|
||||
// Automatically generated by the genspeck tool.
|
||||
__asm__ __volatile__ (
|
||||
"ld r15,X+\n"
|
||||
"ld r14,X+\n"
|
||||
"ld r13,X+\n"
|
||||
"ld r12,X+\n"
|
||||
"ld r11,X+\n"
|
||||
"ld r10,X+\n"
|
||||
"ld r9,X+\n"
|
||||
"ld r8,X+\n"
|
||||
"ld r23,X+\n"
|
||||
"ld r22,X+\n"
|
||||
"ld r21,X+\n"
|
||||
"ld r20,X+\n"
|
||||
"ld r19,X+\n"
|
||||
"ld r18,X+\n"
|
||||
"ld r17,X+\n"
|
||||
"ld r16,X\n"
|
||||
"1:\n"
|
||||
// y = rightRotate3_64(x ^ y);
|
||||
"eor %A2,%A0\n" // y ^= x
|
||||
"eor %B2,%B0\n"
|
||||
"eor %C2,%C0\n"
|
||||
"eor %D2,%D0\n"
|
||||
"eor %A3,%A1\n"
|
||||
"eor %B3,%B1\n"
|
||||
"eor %C3,%C1\n"
|
||||
"eor %D3,%D1\n"
|
||||
|
||||
"bst %A2,0\n" // y = rightRotate1_64(y)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"bst %A2,0\n" // y = rightRotate1_64(y)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"bst %A2,0\n" // y = rightRotate1_64(y)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
// x = leftRotate8_64((x ^ *s--) - y);
|
||||
"ld __tmp_reg__,-Z\n" // x ^= *s--
|
||||
"eor __tmp_reg__,%D1\n" // Note: also implicitly left-rotates regs
|
||||
"ld %D1,-Z\n"
|
||||
"eor %D1,%C1\n"
|
||||
"ld %C1,-Z\n"
|
||||
"eor %C1,%B1\n"
|
||||
"ld %B1,-Z\n"
|
||||
"eor %B1,%A1\n"
|
||||
"ld %A1,-Z\n"
|
||||
"eor %A1,%D0\n"
|
||||
"ld %D0,-Z\n"
|
||||
"eor %D0,%C0\n"
|
||||
"ld %C0,-Z\n"
|
||||
"eor %C0,%B0\n"
|
||||
"ld %B0,-Z\n"
|
||||
"eor %B0,%A0\n"
|
||||
"mov %A0,__tmp_reg__\n"
|
||||
|
||||
"sub %B0,%A2\n" // x -= y
|
||||
"sbc %C0,%B2\n" // Note: regs are already left-rotated
|
||||
"sbc %D0,%C2\n"
|
||||
"sbc %A1,%D2\n"
|
||||
"sbc %B1,%A3\n"
|
||||
"sbc %C1,%B3\n"
|
||||
"sbc %D1,%C3\n"
|
||||
"sbc %A0,%D3\n"
|
||||
|
||||
// Loop
|
||||
"dec %5\n" // --round
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
"eor r20,r12\n"
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"ld __tmp_reg__,-Z\n"
|
||||
"eor __tmp_reg__,r15\n"
|
||||
"ld r15,-Z\n"
|
||||
"eor r15,r14\n"
|
||||
"ld r14,-Z\n"
|
||||
"eor r14,r13\n"
|
||||
"ld r13,-Z\n"
|
||||
"eor r13,r12\n"
|
||||
"ld r12,-Z\n"
|
||||
"eor r12,r11\n"
|
||||
"ld r11,-Z\n"
|
||||
"eor r11,r10\n"
|
||||
"ld r10,-Z\n"
|
||||
"eor r10,r9\n"
|
||||
"ld r9,-Z\n"
|
||||
"eor r9,r8\n"
|
||||
"mov r8,__tmp_reg__\n"
|
||||
"sub r9,r16\n"
|
||||
"sbc r10,r17\n"
|
||||
"sbc r11,r18\n"
|
||||
"sbc r12,r19\n"
|
||||
"sbc r13,r20\n"
|
||||
"sbc r14,r21\n"
|
||||
"sbc r15,r22\n"
|
||||
"sbc r8,r23\n"
|
||||
"dec %2\n"
|
||||
"breq 2f\n"
|
||||
"rjmp 1b\n"
|
||||
"2:\n"
|
||||
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh)
|
||||
: "z"(k + rounds), "r"(rounds)
|
||||
);
|
||||
|
||||
// Pack the results into the output and convert back to big-endian.
|
||||
__asm__ __volatile__ (
|
||||
"st Z,%D1\n"
|
||||
"std Z+1,%C1\n"
|
||||
"std Z+2,%B1\n"
|
||||
"std Z+3,%A1\n"
|
||||
"std Z+4,%D0\n"
|
||||
"std Z+5,%C0\n"
|
||||
"std Z+6,%B0\n"
|
||||
"std Z+7,%A0\n"
|
||||
"std Z+8,%D3\n"
|
||||
"std Z+9,%C3\n"
|
||||
"std Z+10,%B3\n"
|
||||
"std Z+11,%A3\n"
|
||||
"std Z+12,%D2\n"
|
||||
"std Z+13,%C2\n"
|
||||
"std Z+14,%B2\n"
|
||||
"std Z+15,%A2\n"
|
||||
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
|
||||
"ldd r26,%A3\n"
|
||||
"ldd r27,%B3\n"
|
||||
"st X+,r15\n"
|
||||
"st X+,r14\n"
|
||||
"st X+,r13\n"
|
||||
"st X+,r12\n"
|
||||
"st X+,r11\n"
|
||||
"st X+,r10\n"
|
||||
"st X+,r9\n"
|
||||
"st X+,r8\n"
|
||||
"st X+,r23\n"
|
||||
"st X+,r22\n"
|
||||
"st X+,r21\n"
|
||||
"st X+,r20\n"
|
||||
"st X+,r19\n"
|
||||
"st X+,r18\n"
|
||||
"st X+,r17\n"
|
||||
"st X,r16\n"
|
||||
: : "x"(input), "z"(k + rounds), "r"(rounds), "Q"(output)
|
||||
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
|
||||
);
|
||||
#else
|
||||
uint64_t x, y;
|
||||
|
@ -261,349 +261,283 @@ bool SpeckSmall::setKey(const uint8_t *key, size_t len)
|
||||
void SpeckSmall::decryptBlock(uint8_t *output, const uint8_t *input)
|
||||
{
|
||||
#if USE_AVR_INLINE_ASM
|
||||
uint64_t l[4];
|
||||
uint32_t xlow, xhigh, ylow, yhigh;
|
||||
uint32_t slow, shigh;
|
||||
uint8_t li_in = (rounds + 3) & 0x03;
|
||||
uint8_t li_out = (((rounds - 31) + li_in) & 0x03) * 8;
|
||||
li_in *= 8;
|
||||
|
||||
// Prepare to expand the key schedule.
|
||||
// Automatically generated by the genspeck tool.
|
||||
uint64_t l[5];
|
||||
uint8_t r = rounds;
|
||||
uint8_t li_in = ((r + 3) & 0x03) * 8;
|
||||
uint8_t li_out = ((((r - 31) & 0x03) * 8) + li_in) & 0x1F;
|
||||
__asm__ __volatile__ (
|
||||
"add r30,%4\n" // Z = &(this->l[li_out])
|
||||
"adc r31,__zero_reg__\n"
|
||||
"ld __tmp_reg__,Z\n" // s = this->l[li_out]
|
||||
"std %A0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+1\n"
|
||||
"std %B0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+2\n"
|
||||
"std %C0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+3\n"
|
||||
"std %D0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+4\n"
|
||||
"std %A1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+5\n"
|
||||
"std %B1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+6\n"
|
||||
"std %C1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+7\n"
|
||||
"std %D1,__tmp_reg__\n"
|
||||
"sub r30,%4\n" // Point Z back to the start of this->l.
|
||||
"sbc r31,__zero_reg__\n"
|
||||
|
||||
"ldi r25,32\n" // Copy the entire this->l array into l.
|
||||
"ldd r25,%4\n"
|
||||
"ldi r24,32\n"
|
||||
"1:\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"dec r25\n"
|
||||
"ld __tmp_reg__,X+\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"dec r24\n"
|
||||
"brne 1b\n"
|
||||
: "=Q"(slow), "=Q"(shigh)
|
||||
: "z"(this->l), "x"(l), "r"(li_out)
|
||||
: "r25"
|
||||
);
|
||||
|
||||
// Unpack the input into the x and y variables, converting
|
||||
// from big-endian into little-endian in the process.
|
||||
__asm__ __volatile__ (
|
||||
"ld %D1,Z\n"
|
||||
"ldd %C1,Z+1\n"
|
||||
"ldd %B1,Z+2\n"
|
||||
"ldd %A1,Z+3\n"
|
||||
"ldd %D0,Z+4\n"
|
||||
"ldd %C0,Z+5\n"
|
||||
"ldd %B0,Z+6\n"
|
||||
"ldd %A0,Z+7\n"
|
||||
"ldd %D3,Z+8\n"
|
||||
"ldd %C3,Z+9\n"
|
||||
"ldd %B3,Z+10\n"
|
||||
"ldd %A3,Z+11\n"
|
||||
"ldd %D2,Z+12\n"
|
||||
"ldd %C2,Z+13\n"
|
||||
"ldd %B2,Z+14\n"
|
||||
"ldd %A2,Z+15\n"
|
||||
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
|
||||
: "z"(input)
|
||||
);
|
||||
|
||||
// Perform all decryption rounds while expanding the key schedule in-place.
|
||||
__asm__ __volatile__ (
|
||||
"mov r23,%9\n" // i = rounds - 1
|
||||
"dec r23\n"
|
||||
"1:\n"
|
||||
|
||||
// Adjust x and y for this round using the key schedule word s.
|
||||
|
||||
// y = rightRotate3_64(x ^ y);
|
||||
"eor %A2,%A0\n" // y ^= x
|
||||
"eor %B2,%B0\n"
|
||||
"eor %C2,%C0\n"
|
||||
"eor %D2,%D0\n"
|
||||
"eor %A3,%A1\n"
|
||||
"eor %B3,%B1\n"
|
||||
"eor %C3,%C1\n"
|
||||
"eor %D3,%D1\n"
|
||||
|
||||
"bst %A2,0\n" // y = rightRotate1_64(y)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"bst %A2,0\n" // y = rightRotate1_64(y)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"bst %A2,0\n" // y = rightRotate1_64(y)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
// x = leftRotate8_64((x ^ s) - y);
|
||||
"ldd __tmp_reg__,%A4\n" // x ^= s
|
||||
"eor %A0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%B4\n"
|
||||
"eor %B0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%C4\n"
|
||||
"eor %C0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%D4\n"
|
||||
"eor %D0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%A5\n"
|
||||
"eor %A1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%B5\n"
|
||||
"eor %B1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%C5\n"
|
||||
"eor %C1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%D5\n"
|
||||
"eor %D1,__tmp_reg__\n"
|
||||
|
||||
"sub %A0,%A2\n" // x -= y
|
||||
"sbc %B0,%B2\n"
|
||||
"sbc %C0,%C2\n"
|
||||
"sbc %D0,%D2\n"
|
||||
"sbc %A1,%A3\n"
|
||||
"sbc %B1,%B3\n"
|
||||
"sbc %C1,%C3\n"
|
||||
"sbc %D1,%D3\n"
|
||||
|
||||
"mov __tmp_reg__,%D1\n" // x = leftRotate8_64(x)
|
||||
"mov %D1,%C1\n"
|
||||
"mov %C1,%B1\n"
|
||||
"mov %B1,%A1\n"
|
||||
"mov %A1,%D0\n"
|
||||
"mov %D0,%C0\n"
|
||||
"mov %C0,%B0\n"
|
||||
"mov %B0,%A0\n"
|
||||
"mov %A0,__tmp_reg__\n"
|
||||
|
||||
// On the last round we don't need to compute s so we
|
||||
// can exit early here if i == 0.
|
||||
"or r23,r23\n" // if (i == 0)
|
||||
"brne 2f\n"
|
||||
"rjmp 3f\n"
|
||||
"movw r26,r30\n"
|
||||
"sbiw r30,32\n"
|
||||
"add r30,r25\n"
|
||||
"adc r31,__zero_reg__\n"
|
||||
"ld __tmp_reg__,Z\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+1\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+2\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+3\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+4\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+5\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+6\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,Z+7\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"sub r30,r25\n"
|
||||
"sbc r31,__zero_reg__\n"
|
||||
"movw r26,%A2\n"
|
||||
"ld r15,X+\n"
|
||||
"ld r14,X+\n"
|
||||
"ld r13,X+\n"
|
||||
"ld r12,X+\n"
|
||||
"ld r11,X+\n"
|
||||
"ld r10,X+\n"
|
||||
"ld r9,X+\n"
|
||||
"ld r8,X+\n"
|
||||
"ld r23,X+\n"
|
||||
"ld r22,X+\n"
|
||||
"ld r21,X+\n"
|
||||
"ld r20,X+\n"
|
||||
"ld r19,X+\n"
|
||||
"ld r18,X+\n"
|
||||
"ld r17,X+\n"
|
||||
"ld r16,X\n"
|
||||
"ldd %A2,%6\n"
|
||||
"mov %B2,r25\n"
|
||||
"ldd r25,%5\n"
|
||||
"dec r25\n"
|
||||
"movw r26,r30\n"
|
||||
"adiw r26,40\n"
|
||||
"2:\n"
|
||||
"dec r23\n" // --i
|
||||
|
||||
// Save x and y on the stack so we can reuse registers for t and s.
|
||||
"push %A0\n"
|
||||
"push %B0\n"
|
||||
"push %C0\n"
|
||||
"push %D0\n"
|
||||
"push %A1\n"
|
||||
"push %B1\n"
|
||||
"push %C1\n"
|
||||
"push %D1\n"
|
||||
"push %A2\n"
|
||||
"push %B2\n"
|
||||
"push %C2\n"
|
||||
"push %D2\n"
|
||||
"push %A3\n"
|
||||
"push %B3\n"
|
||||
"push %C3\n"
|
||||
"push %D3\n"
|
||||
|
||||
// Compute the key schedule word s for the next round.
|
||||
|
||||
// li_out = (li_out + 3) & 0x03;
|
||||
"ldd r24,%7\n"
|
||||
"ldi r25,24\n"
|
||||
"add r24,r25\n"
|
||||
"andi r24,0x1f\n"
|
||||
"std %7,r24\n"
|
||||
|
||||
// s = rightRotate3_64(s ^ l[li_out]);
|
||||
"add %A8,r24\n" // Z = &(l[li_out])
|
||||
"adc %B8,__zero_reg__\n"
|
||||
|
||||
"ld %A0,Z\n" // t = l[li_out]
|
||||
"ldd %B0,Z+1\n"
|
||||
"ldd %C0,Z+2\n"
|
||||
"ldd %D0,Z+3\n"
|
||||
"ldd %A1,Z+4\n"
|
||||
"ldd %B1,Z+5\n"
|
||||
"ldd %C1,Z+6\n"
|
||||
"ldd %D1,Z+7\n"
|
||||
|
||||
"ldd %A2,%A4\n" // load s
|
||||
"ldd %B2,%B4\n"
|
||||
"ldd %C2,%C4\n"
|
||||
"ldd %D2,%D4\n"
|
||||
"ldd %A3,%A5\n"
|
||||
"ldd %B3,%B5\n"
|
||||
"ldd %C3,%C5\n"
|
||||
"ldd %D3,%D5\n"
|
||||
|
||||
"eor %A2,%A0\n" // s ^= t
|
||||
"eor %B2,%B0\n"
|
||||
"eor %C2,%C0\n"
|
||||
"eor %D2,%D0\n"
|
||||
"eor %A3,%A1\n"
|
||||
"eor %B3,%B1\n"
|
||||
"eor %C3,%C1\n"
|
||||
"eor %D3,%D1\n"
|
||||
|
||||
"bst %A2,0\n" // s = rightRotate1_64(s)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"bst %A2,0\n" // s = rightRotate1_64(s)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"bst %A2,0\n" // s = rightRotate1_64(s)
|
||||
"ror %D3\n"
|
||||
"ror %C3\n"
|
||||
"ror %B3\n"
|
||||
"ror %A3\n"
|
||||
"ror %D2\n"
|
||||
"ror %C2\n"
|
||||
"ror %B2\n"
|
||||
"ror %A2\n"
|
||||
"bld %D3,7\n"
|
||||
|
||||
"sub %A8,r24\n" // Z -= li_out
|
||||
"sbc %B8,__zero_reg__\n"
|
||||
|
||||
// li_in = (li_in + 3) & 0x03;
|
||||
"ldd r24,%6\n"
|
||||
"add r24,r25\n"
|
||||
"andi r24,0x1f\n"
|
||||
"std %6,r24\n"
|
||||
|
||||
// l[li_in] = leftRotate8_64((l[li_out] ^ i) - s);
|
||||
"add %A8,r24\n" // Z = &(l[li_in])
|
||||
"adc %B8,__zero_reg__\n"
|
||||
|
||||
"eor %A0,r23\n" // t ^= i
|
||||
|
||||
"sub %A0,%A2\n" // t -= s
|
||||
"sbc %B0,%B2\n"
|
||||
"sbc %C0,%C2\n"
|
||||
"sbc %D0,%D2\n"
|
||||
"sbc %A1,%A3\n"
|
||||
"sbc %B1,%B3\n"
|
||||
"sbc %C1,%C3\n"
|
||||
"sbc %D1,%D3\n"
|
||||
|
||||
"st Z,%D1\n" // l[li_in] = leftRotate8_64(t)
|
||||
"std Z+1,%A0\n"
|
||||
"std Z+2,%B0\n"
|
||||
"std Z+3,%C0\n"
|
||||
"std Z+4,%D0\n"
|
||||
"std Z+5,%A1\n"
|
||||
"std Z+6,%B1\n"
|
||||
"std Z+7,%C1\n"
|
||||
|
||||
"sub %A8,r24\n" // Z -= li_in
|
||||
"sbc %B8,__zero_reg__\n"
|
||||
|
||||
"std %A4,%A2\n" // store s
|
||||
"std %B4,%B2\n"
|
||||
"std %C4,%C2\n"
|
||||
"std %D4,%D2\n"
|
||||
"std %A5,%A3\n"
|
||||
"std %B5,%B3\n"
|
||||
"std %C5,%C3\n"
|
||||
"std %D5,%D3\n"
|
||||
|
||||
// Pop registers from the stack to recover the x and y values.
|
||||
"pop %D3\n"
|
||||
"pop %C3\n"
|
||||
"pop %B3\n"
|
||||
"pop %A3\n"
|
||||
"pop %D2\n"
|
||||
"pop %C2\n"
|
||||
"pop %B2\n"
|
||||
"pop %A2\n"
|
||||
"pop %D1\n"
|
||||
"pop %C1\n"
|
||||
"pop %B1\n"
|
||||
"pop %A1\n"
|
||||
"pop %D0\n"
|
||||
"pop %C0\n"
|
||||
"pop %B0\n"
|
||||
"pop %A0\n"
|
||||
|
||||
// Bottom of the loop.
|
||||
"rjmp 1b\n"
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
"eor r20,r12\n"
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"ld __tmp_reg__,-X\n"
|
||||
"eor __tmp_reg__,r15\n"
|
||||
"ld r15,-X\n"
|
||||
"eor r15,r14\n"
|
||||
"ld r14,-X\n"
|
||||
"eor r14,r13\n"
|
||||
"ld r13,-X\n"
|
||||
"eor r13,r12\n"
|
||||
"ld r12,-X\n"
|
||||
"eor r12,r11\n"
|
||||
"ld r11,-X\n"
|
||||
"eor r11,r10\n"
|
||||
"ld r10,-X\n"
|
||||
"eor r10,r9\n"
|
||||
"ld r9,-X\n"
|
||||
"eor r9,r8\n"
|
||||
"mov r8,__tmp_reg__\n"
|
||||
"sub r9,r16\n"
|
||||
"sbc r10,r17\n"
|
||||
"sbc r11,r18\n"
|
||||
"sbc r12,r19\n"
|
||||
"sbc r13,r20\n"
|
||||
"sbc r14,r21\n"
|
||||
"sbc r15,r22\n"
|
||||
"sbc r8,r23\n"
|
||||
"or r25,r25\n"
|
||||
"brne 3f\n"
|
||||
"rjmp 4f\n"
|
||||
"3:\n"
|
||||
|
||||
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh),
|
||||
"+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out)
|
||||
: "z"(l), "r"(rounds)
|
||||
: "r23", "r24", "r25"
|
||||
);
|
||||
|
||||
// Pack the results into the output and convert back to big-endian.
|
||||
__asm__ __volatile__ (
|
||||
"st Z,%D1\n"
|
||||
"std Z+1,%C1\n"
|
||||
"std Z+2,%B1\n"
|
||||
"std Z+3,%A1\n"
|
||||
"std Z+4,%D0\n"
|
||||
"std Z+5,%C0\n"
|
||||
"std Z+6,%B0\n"
|
||||
"std Z+7,%A0\n"
|
||||
"std Z+8,%D3\n"
|
||||
"std Z+9,%C3\n"
|
||||
"std Z+10,%B3\n"
|
||||
"std Z+11,%A3\n"
|
||||
"std Z+12,%D2\n"
|
||||
"std Z+13,%C2\n"
|
||||
"std Z+14,%B2\n"
|
||||
"std Z+15,%A2\n"
|
||||
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
|
||||
"dec r25\n"
|
||||
"push r8\n"
|
||||
"push r9\n"
|
||||
"push r10\n"
|
||||
"push r11\n"
|
||||
"push r12\n"
|
||||
"push r13\n"
|
||||
"push r14\n"
|
||||
"push r15\n"
|
||||
"push r16\n"
|
||||
"push r17\n"
|
||||
"push r18\n"
|
||||
"push r19\n"
|
||||
"push r20\n"
|
||||
"push r21\n"
|
||||
"push r22\n"
|
||||
"push r23\n"
|
||||
"ldi r24,24\n"
|
||||
"add %A2,r24\n"
|
||||
"add %B2,r24\n"
|
||||
"ldi r24,0x1F\n"
|
||||
"and %A2,r24\n"
|
||||
"and %B2,r24\n"
|
||||
"ld r16,X+\n"
|
||||
"ld r17,X+\n"
|
||||
"ld r18,X+\n"
|
||||
"ld r19,X+\n"
|
||||
"ld r20,X+\n"
|
||||
"ld r21,X+\n"
|
||||
"ld r22,X+\n"
|
||||
"ld r23,X+\n"
|
||||
"add r30,%B2\n"
|
||||
"adc r31,__zero_reg__\n"
|
||||
"ld r8,Z\n"
|
||||
"ldd r9,Z+1\n"
|
||||
"ldd r10,Z+2\n"
|
||||
"ldd r11,Z+3\n"
|
||||
"ldd r12,Z+4\n"
|
||||
"ldd r13,Z+5\n"
|
||||
"ldd r14,Z+6\n"
|
||||
"ldd r15,Z+7\n"
|
||||
"sub r30,%B2\n"
|
||||
"sbc r31,__zero_reg__\n"
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
"eor r20,r12\n"
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"bst r16,0\n"
|
||||
"ror r23\n"
|
||||
"ror r22\n"
|
||||
"ror r21\n"
|
||||
"ror r20\n"
|
||||
"ror r19\n"
|
||||
"ror r18\n"
|
||||
"ror r17\n"
|
||||
"ror r16\n"
|
||||
"bld r23,7\n"
|
||||
"st -X,r23\n"
|
||||
"st -X,r22\n"
|
||||
"st -X,r21\n"
|
||||
"st -X,r20\n"
|
||||
"st -X,r19\n"
|
||||
"st -X,r18\n"
|
||||
"st -X,r17\n"
|
||||
"st -X,r16\n"
|
||||
"adiw r26,8\n"
|
||||
"eor r8,r25\n"
|
||||
"sub r8,r16\n"
|
||||
"sbc r9,r17\n"
|
||||
"sbc r10,r18\n"
|
||||
"sbc r11,r19\n"
|
||||
"sbc r12,r20\n"
|
||||
"sbc r13,r21\n"
|
||||
"sbc r14,r22\n"
|
||||
"sbc r15,r23\n"
|
||||
"add r30,%A2\n"
|
||||
"adc r31,__zero_reg__\n"
|
||||
"st Z,r15\n"
|
||||
"std Z+1,r8\n"
|
||||
"std Z+2,r9\n"
|
||||
"std Z+3,r10\n"
|
||||
"std Z+4,r11\n"
|
||||
"std Z+5,r12\n"
|
||||
"std Z+6,r13\n"
|
||||
"std Z+7,r14\n"
|
||||
"sub r30,%A2\n"
|
||||
"sbc r31,__zero_reg__\n"
|
||||
"pop r23\n"
|
||||
"pop r22\n"
|
||||
"pop r21\n"
|
||||
"pop r20\n"
|
||||
"pop r19\n"
|
||||
"pop r18\n"
|
||||
"pop r17\n"
|
||||
"pop r16\n"
|
||||
"pop r15\n"
|
||||
"pop r14\n"
|
||||
"pop r13\n"
|
||||
"pop r12\n"
|
||||
"pop r11\n"
|
||||
"pop r10\n"
|
||||
"pop r9\n"
|
||||
"pop r8\n"
|
||||
"rjmp 2b\n"
|
||||
"4:\n"
|
||||
"ldd r26,%A3\n"
|
||||
"ldd r27,%B3\n"
|
||||
"st X+,r15\n"
|
||||
"st X+,r14\n"
|
||||
"st X+,r13\n"
|
||||
"st X+,r12\n"
|
||||
"st X+,r11\n"
|
||||
"st X+,r10\n"
|
||||
"st X+,r9\n"
|
||||
"st X+,r8\n"
|
||||
"st X+,r23\n"
|
||||
"st X+,r22\n"
|
||||
"st X+,r21\n"
|
||||
"st X+,r20\n"
|
||||
"st X+,r19\n"
|
||||
"st X+,r18\n"
|
||||
"st X+,r17\n"
|
||||
"st X,r16\n"
|
||||
: : "x"(this->l), "z"(l), "r"(input), "Q"(output), "Q"(li_out), "Q"(r), "Q"(li_in)
|
||||
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
|
||||
, "r24", "r25"
|
||||
);
|
||||
#else
|
||||
uint64_t l[4];
|
||||
|
@ -156,336 +156,257 @@ bool SpeckTiny::setKey(const uint8_t *key, size_t len)
|
||||
void SpeckTiny::encryptBlock(uint8_t *output, const uint8_t *input)
|
||||
{
|
||||
#if USE_AVR_INLINE_ASM
|
||||
uint64_t l[4];
|
||||
uint32_t xlow, xhigh, ylow, yhigh;
|
||||
uint32_t slow, shigh;
|
||||
uint8_t li_in = 0;
|
||||
uint8_t li_out = (rounds - 31) * 8;
|
||||
|
||||
// Copy the "k" array into "s" and the "l" array.
|
||||
// Automatically generated by the genspeck tool.
|
||||
uint64_t l[5];
|
||||
uint8_t r = rounds;
|
||||
uint8_t mb = (r - 31) * 8;
|
||||
__asm__ __volatile__ (
|
||||
"ldd r25,%4\n" // r25 = li_out
|
||||
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %A0,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %B0,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %C0,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %D0,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %A1,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %B1,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %C1,__tmp_reg__\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"std %D1,__tmp_reg__\n"
|
||||
|
||||
"1:\n" // l[0..] = k[1..]
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"st X+,__tmp_reg__\n"
|
||||
"dec r25\n"
|
||||
"brne 1b\n"
|
||||
: "=Q"(slow), "=Q"(shigh)
|
||||
: "z"(k), "x"(l), "Q"(li_out)
|
||||
: "r25"
|
||||
);
|
||||
|
||||
// Unpack the input into the x and y variables, converting
|
||||
// from big-endian into little-endian in the process.
|
||||
__asm__ __volatile__ (
|
||||
"ld %D1,Z\n"
|
||||
"ldd %C1,Z+1\n"
|
||||
"ldd %B1,Z+2\n"
|
||||
"ldd %A1,Z+3\n"
|
||||
"ldd %D0,Z+4\n"
|
||||
"ldd %C0,Z+5\n"
|
||||
"ldd %B0,Z+6\n"
|
||||
"ldd %A0,Z+7\n"
|
||||
"ldd %D3,Z+8\n"
|
||||
"ldd %C3,Z+9\n"
|
||||
"ldd %B3,Z+10\n"
|
||||
"ldd %A3,Z+11\n"
|
||||
"ldd %D2,Z+12\n"
|
||||
"ldd %C2,Z+13\n"
|
||||
"ldd %B2,Z+14\n"
|
||||
"ldd %A2,Z+15\n"
|
||||
: "=r"(xlow), "=r"(xhigh), "=r"(ylow), "=r"(yhigh)
|
||||
: "z"(input)
|
||||
);
|
||||
|
||||
// Perform all encryption rounds while expanding the key schedule in-place.
|
||||
__asm__ __volatile__ (
|
||||
"mov r23,__zero_reg__\n" // i = 0
|
||||
"movw r8,r30\n"
|
||||
"ldd r16,%4\n"
|
||||
"ldi r24,8\n"
|
||||
"add r16,r24\n"
|
||||
"1:\n"
|
||||
|
||||
// Adjust x and y for this round using the key schedule word s.
|
||||
|
||||
// x = (rightRotate8_64(x) + y) ^ s;
|
||||
"mov __tmp_reg__,%A0\n" // x = rightRotate8_64(x)
|
||||
"mov %A0,%B0\n"
|
||||
"mov %B0,%C0\n"
|
||||
"mov %C0,%D0\n"
|
||||
"mov %D0,%A1\n"
|
||||
"mov %A1,%B1\n"
|
||||
"mov %B1,%C1\n"
|
||||
"mov %C1,%D1\n"
|
||||
"mov %D1,__tmp_reg__\n"
|
||||
|
||||
"add %A0,%A2\n" // x += y
|
||||
"adc %B0,%B2\n"
|
||||
"adc %C0,%C2\n"
|
||||
"adc %D0,%D2\n"
|
||||
"adc %A1,%A3\n"
|
||||
"adc %B1,%B3\n"
|
||||
"adc %C1,%C3\n"
|
||||
"adc %D1,%D3\n"
|
||||
|
||||
"ldd __tmp_reg__,%A4\n" // x ^= s
|
||||
"eor %A0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%B4\n"
|
||||
"eor %B0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%C4\n"
|
||||
"eor %C0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%D4\n"
|
||||
"eor %D0,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%A5\n"
|
||||
"eor %A1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%B5\n"
|
||||
"eor %B1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%C5\n"
|
||||
"eor %C1,__tmp_reg__\n"
|
||||
"ldd __tmp_reg__,%D5\n"
|
||||
"eor %D1,__tmp_reg__\n"
|
||||
|
||||
// y = leftRotate3_64(y) ^ x;
|
||||
"lsl %A2\n" // y = leftRotate1_64(y)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"lsl %A2\n" // y = leftRotate1_64(y)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
|
||||
"adc %A2,__zero_reg__\n"
|
||||
"lsl %A2\n" // y = leftRotate1_64(y)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"eor %A2,%A0\n" // y ^= x
|
||||
"eor %B2,%B0\n"
|
||||
"eor %C2,%C0\n"
|
||||
"eor %D2,%D0\n"
|
||||
"eor %A3,%A1\n"
|
||||
"eor %B3,%B1\n"
|
||||
"eor %C3,%C1\n"
|
||||
"eor %D3,%D1\n"
|
||||
|
||||
// On the last round we don't need to compute s so we
|
||||
// can exit early here if (i + 1) == rounds.
|
||||
"mov __tmp_reg__,r23\n" // temp = i + 1
|
||||
"inc __tmp_reg__\n"
|
||||
"cp __tmp_reg__,%9\n" // if (temp == rounds) ...
|
||||
"brne 2f\n"
|
||||
"rjmp 3f\n"
|
||||
"ld __tmp_reg__,X+\n"
|
||||
"st Z+,__tmp_reg__\n"
|
||||
"dec r16\n"
|
||||
"brne 1b\n"
|
||||
"movw r30,r8\n"
|
||||
"movw r26,%A2\n"
|
||||
"ld r15,X+\n"
|
||||
"ld r14,X+\n"
|
||||
"ld r13,X+\n"
|
||||
"ld r12,X+\n"
|
||||
"ld r11,X+\n"
|
||||
"ld r10,X+\n"
|
||||
"ld r9,X+\n"
|
||||
"ld r8,X+\n"
|
||||
"ld r23,X+\n"
|
||||
"ld r22,X+\n"
|
||||
"ld r21,X+\n"
|
||||
"ld r20,X+\n"
|
||||
"ld r19,X+\n"
|
||||
"ld r18,X+\n"
|
||||
"ld r17,X+\n"
|
||||
"ld r16,X\n"
|
||||
"clr %A2\n"
|
||||
"ldd %B2,%4\n"
|
||||
"clr r25\n"
|
||||
"2:\n"
|
||||
|
||||
// Save x and y on the stack so we can reuse registers for t and s.
|
||||
"push %A0\n"
|
||||
"push %B0\n"
|
||||
"push %C0\n"
|
||||
"push %D0\n"
|
||||
"push %A1\n"
|
||||
"push %B1\n"
|
||||
"push %C1\n"
|
||||
"push %D1\n"
|
||||
"push %A2\n"
|
||||
"push %B2\n"
|
||||
"push %C2\n"
|
||||
"push %D2\n"
|
||||
"push %A3\n"
|
||||
"push %B3\n"
|
||||
"push %C3\n"
|
||||
"push %D3\n"
|
||||
|
||||
// Compute the key schedule word s for the next round.
|
||||
|
||||
// l[li_out] = (s + rightRotate8_64(l[li_in])) ^ i;
|
||||
"ldd r24,%6\n" // Z = &(l[li_in])
|
||||
"add %A8,r24\n"
|
||||
"adc %B8,__zero_reg__\n"
|
||||
|
||||
"ld %D1,Z+\n" // t = rightRotate8_64(l[li_in])
|
||||
"ld %A0,Z+\n"
|
||||
"ld %B0,Z+\n"
|
||||
"ld %C0,Z+\n"
|
||||
"ld %D0,Z+\n"
|
||||
"ld %A1,Z+\n"
|
||||
"ld %B1,Z+\n"
|
||||
"ld %C1,Z+\n"
|
||||
|
||||
"ldd %A2,%A4\n" // load s
|
||||
"ldd %B2,%B4\n"
|
||||
"ldd %C2,%C4\n"
|
||||
"ldd %D2,%D4\n"
|
||||
"ldd %A3,%A5\n"
|
||||
"ldd %B3,%B5\n"
|
||||
"ldd %C3,%C5\n"
|
||||
"ldd %D3,%D5\n"
|
||||
|
||||
"add %A0,%A2\n" // t += s
|
||||
"adc %B0,%B2\n"
|
||||
"adc %C0,%C2\n"
|
||||
"adc %D0,%D2\n"
|
||||
"adc %A1,%A3\n"
|
||||
"adc %B1,%B3\n"
|
||||
"adc %C1,%C3\n"
|
||||
"adc %D1,%D3\n"
|
||||
|
||||
"eor %A0,r23\n" // t ^= i
|
||||
|
||||
// Z = Z - li_in + li_out
|
||||
"ldi r25,8\n" // li_in = li_in + 1
|
||||
"add r24,r25\n"
|
||||
"sub %A8,r24\n" // return Z to its initial value
|
||||
"sbc %B8,__zero_reg__\n"
|
||||
"andi r24,0x1f\n" // li_in = li_in % 4
|
||||
"std %6,r24\n"
|
||||
"ldd r24,%7\n" // Z = &(l[li_out])
|
||||
"add %A8,r24\n"
|
||||
"adc %B8,__zero_reg__\n"
|
||||
|
||||
"st Z+,%A0\n" // l[li_out] = t
|
||||
"st Z+,%B0\n"
|
||||
"st Z+,%C0\n"
|
||||
"st Z+,%D0\n"
|
||||
"st Z+,%A1\n"
|
||||
"st Z+,%B1\n"
|
||||
"st Z+,%C1\n"
|
||||
"st Z+,%D1\n"
|
||||
|
||||
"add r24,r25\n" // li_out = li_out + 1
|
||||
"sub %A8,r24\n" // return Z to its initial value
|
||||
"sbc %B8,__zero_reg__\n"
|
||||
"andi r24,0x1f\n" // li_out = li_out % 4
|
||||
"std %7,r24\n"
|
||||
|
||||
// s = leftRotate3_64(s) ^ l[li_out];
|
||||
"lsl %A2\n" // s = leftRotate1_64(s)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"lsl %A2\n" // s = leftRotate1_64(s)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"lsl %A2\n" // s = leftRotate1_64(s)
|
||||
"rol %B2\n"
|
||||
"rol %C2\n"
|
||||
"rol %D2\n"
|
||||
"rol %A3\n"
|
||||
"rol %B3\n"
|
||||
"rol %C3\n"
|
||||
"rol %D3\n"
|
||||
"adc %A2,__zero_reg__\n"
|
||||
|
||||
"eor %A2,%A0\n" // s ^= l[li_out]
|
||||
"eor %B2,%B0\n"
|
||||
"eor %C2,%C0\n"
|
||||
"eor %D2,%D0\n"
|
||||
"eor %A3,%A1\n"
|
||||
"eor %B3,%B1\n"
|
||||
"eor %C3,%C1\n"
|
||||
"eor %D3,%D1\n"
|
||||
|
||||
"std %A4,%A2\n" // store s
|
||||
"std %B4,%B2\n"
|
||||
"std %C4,%C2\n"
|
||||
"std %D4,%D2\n"
|
||||
"std %A5,%A3\n"
|
||||
"std %B5,%B3\n"
|
||||
"std %C5,%C3\n"
|
||||
"std %D5,%D3\n"
|
||||
|
||||
// Pop registers from the stack to recover the x and y values.
|
||||
"pop %D3\n"
|
||||
"pop %C3\n"
|
||||
"pop %B3\n"
|
||||
"pop %A3\n"
|
||||
"pop %D2\n"
|
||||
"pop %C2\n"
|
||||
"pop %B2\n"
|
||||
"pop %A2\n"
|
||||
"pop %D1\n"
|
||||
"pop %C1\n"
|
||||
"pop %B1\n"
|
||||
"pop %A1\n"
|
||||
"pop %D0\n"
|
||||
"pop %C0\n"
|
||||
"pop %B0\n"
|
||||
"pop %A0\n"
|
||||
|
||||
// Bottom of the loop.
|
||||
"inc r23\n"
|
||||
"rjmp 1b\n"
|
||||
"add r9,r16\n"
|
||||
"adc r10,r17\n"
|
||||
"adc r11,r18\n"
|
||||
"adc r12,r19\n"
|
||||
"adc r13,r20\n"
|
||||
"adc r14,r21\n"
|
||||
"adc r15,r22\n"
|
||||
"adc r8,r23\n"
|
||||
"ld __tmp_reg__,Z+\n"
|
||||
"eor __tmp_reg__,r9\n"
|
||||
"ld r9,Z+\n"
|
||||
"eor r9,r10\n"
|
||||
"ld r10,Z+\n"
|
||||
"eor r10,r11\n"
|
||||
"ld r11,Z+\n"
|
||||
"eor r11,r12\n"
|
||||
"ld r12,Z+\n"
|
||||
"eor r12,r13\n"
|
||||
"ld r13,Z+\n"
|
||||
"eor r13,r14\n"
|
||||
"ld r14,Z+\n"
|
||||
"eor r14,r15\n"
|
||||
"ld r15,Z+\n"
|
||||
"eor r15,r8\n"
|
||||
"mov r8,__tmp_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
"eor r20,r12\n"
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
"mov __tmp_reg__,r25\n"
|
||||
"inc __tmp_reg__\n"
|
||||
"ldd r24,%5\n"
|
||||
"cp __tmp_reg__,r24\n"
|
||||
"brne 3f\n"
|
||||
"rjmp 4f\n"
|
||||
"3:\n"
|
||||
|
||||
: "+r"(xlow), "+r"(xhigh), "+r"(ylow), "+r"(yhigh),
|
||||
"+Q"(slow), "+Q"(shigh), "+Q"(li_in), "+Q"(li_out)
|
||||
: "z"(l), "r"(rounds)
|
||||
: "r23", "r24", "r25"
|
||||
);
|
||||
|
||||
// Pack the results into the output and convert back to big-endian.
|
||||
__asm__ __volatile__ (
|
||||
"st Z,%D1\n"
|
||||
"std Z+1,%C1\n"
|
||||
"std Z+2,%B1\n"
|
||||
"std Z+3,%A1\n"
|
||||
"std Z+4,%D0\n"
|
||||
"std Z+5,%C0\n"
|
||||
"std Z+6,%B0\n"
|
||||
"std Z+7,%A0\n"
|
||||
"std Z+8,%D3\n"
|
||||
"std Z+9,%C3\n"
|
||||
"std Z+10,%B3\n"
|
||||
"std Z+11,%A3\n"
|
||||
"std Z+12,%D2\n"
|
||||
"std Z+13,%C2\n"
|
||||
"std Z+14,%B2\n"
|
||||
"std Z+15,%A2\n"
|
||||
: : "r"(xlow), "r"(xhigh), "r"(ylow), "r"(yhigh), "z"(output)
|
||||
"push r8\n"
|
||||
"push r9\n"
|
||||
"push r10\n"
|
||||
"push r11\n"
|
||||
"push r12\n"
|
||||
"push r13\n"
|
||||
"push r14\n"
|
||||
"push r15\n"
|
||||
"push r16\n"
|
||||
"push r17\n"
|
||||
"push r18\n"
|
||||
"push r19\n"
|
||||
"push r20\n"
|
||||
"push r21\n"
|
||||
"push r22\n"
|
||||
"push r23\n"
|
||||
"sbiw r30,8\n"
|
||||
"ld r16,Z\n"
|
||||
"ldd r17,Z+1\n"
|
||||
"ldd r18,Z+2\n"
|
||||
"ldd r19,Z+3\n"
|
||||
"ldd r20,Z+4\n"
|
||||
"ldd r21,Z+5\n"
|
||||
"ldd r22,Z+6\n"
|
||||
"ldd r23,Z+7\n"
|
||||
"add r30,%A2\n"
|
||||
"adc r31,__zero_reg__\n"
|
||||
"ldd r15,Z+8\n"
|
||||
"ldd r8,Z+9\n"
|
||||
"ldd r9,Z+10\n"
|
||||
"ldd r10,Z+11\n"
|
||||
"ldd r11,Z+12\n"
|
||||
"ldd r12,Z+13\n"
|
||||
"ldd r13,Z+14\n"
|
||||
"ldd r14,Z+15\n"
|
||||
"add r8,r16\n"
|
||||
"adc r9,r17\n"
|
||||
"adc r10,r18\n"
|
||||
"adc r11,r19\n"
|
||||
"adc r12,r20\n"
|
||||
"adc r13,r21\n"
|
||||
"adc r14,r22\n"
|
||||
"adc r15,r23\n"
|
||||
"eor r8,r25\n"
|
||||
"sub r30,%A2\n"
|
||||
"sbc r31,__zero_reg__\n"
|
||||
"add r30,%B2\n"
|
||||
"adc r31,__zero_reg__\n"
|
||||
"std Z+8,r8\n"
|
||||
"std Z+9,r9\n"
|
||||
"std Z+10,r10\n"
|
||||
"std Z+11,r11\n"
|
||||
"std Z+12,r12\n"
|
||||
"std Z+13,r13\n"
|
||||
"std Z+14,r14\n"
|
||||
"std Z+15,r15\n"
|
||||
"sub r30,%B2\n"
|
||||
"sbc r31,__zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"lsl r16\n"
|
||||
"rol r17\n"
|
||||
"rol r18\n"
|
||||
"rol r19\n"
|
||||
"rol r20\n"
|
||||
"rol r21\n"
|
||||
"rol r22\n"
|
||||
"rol r23\n"
|
||||
"adc r16, __zero_reg__\n"
|
||||
"eor r16,r8\n"
|
||||
"eor r17,r9\n"
|
||||
"eor r18,r10\n"
|
||||
"eor r19,r11\n"
|
||||
"eor r20,r12\n"
|
||||
"eor r21,r13\n"
|
||||
"eor r22,r14\n"
|
||||
"eor r23,r15\n"
|
||||
"st Z,r16\n"
|
||||
"std Z+1,r17\n"
|
||||
"std Z+2,r18\n"
|
||||
"std Z+3,r19\n"
|
||||
"std Z+4,r20\n"
|
||||
"std Z+5,r21\n"
|
||||
"std Z+6,r22\n"
|
||||
"std Z+7,r23\n"
|
||||
"ldi r24,8\n"
|
||||
"add %A2,r24\n"
|
||||
"add %B2,r24\n"
|
||||
"ldi r24,0x1F\n"
|
||||
"and %A2,r24\n"
|
||||
"and %B2,r24\n"
|
||||
"pop r23\n"
|
||||
"pop r22\n"
|
||||
"pop r21\n"
|
||||
"pop r20\n"
|
||||
"pop r19\n"
|
||||
"pop r18\n"
|
||||
"pop r17\n"
|
||||
"pop r16\n"
|
||||
"pop r15\n"
|
||||
"pop r14\n"
|
||||
"pop r13\n"
|
||||
"pop r12\n"
|
||||
"pop r11\n"
|
||||
"pop r10\n"
|
||||
"pop r9\n"
|
||||
"pop r8\n"
|
||||
"inc r25\n"
|
||||
"rjmp 2b\n"
|
||||
"4:\n"
|
||||
"ldd r26,%A3\n"
|
||||
"ldd r27,%B3\n"
|
||||
"st X+,r15\n"
|
||||
"st X+,r14\n"
|
||||
"st X+,r13\n"
|
||||
"st X+,r12\n"
|
||||
"st X+,r11\n"
|
||||
"st X+,r10\n"
|
||||
"st X+,r9\n"
|
||||
"st X+,r8\n"
|
||||
"st X+,r23\n"
|
||||
"st X+,r22\n"
|
||||
"st X+,r21\n"
|
||||
"st X+,r20\n"
|
||||
"st X+,r19\n"
|
||||
"st X+,r18\n"
|
||||
"st X+,r17\n"
|
||||
"st X,r16\n"
|
||||
: : "x"(k), "z"(l), "r"(input), "Q"(output), "Q"(mb), "Q"(r)
|
||||
: "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "memory"
|
||||
, "r24", "r25"
|
||||
);
|
||||
#else
|
||||
uint64_t l[4];
|
||||
|
Loading…
x
Reference in New Issue
Block a user