diff --git a/doc/crypto.dox b/doc/crypto.dox
index b445d728..0b7ee49c 100644
--- a/doc/crypto.dox
+++ b/doc/crypto.dox
@@ -97,8 +97,8 @@ Ardunino Mega 2560 running at 16 MHz are similar:
SHA1 | 21.90us | 1423.28us | | 95 |
SHA256 | 43.85us | 2841.04us | | 107 |
SHA512 | 122.82us | 15953.42us | | 211 |
-SHA3_256 | 121.69us | 16486.33us | | 405 |
-SHA3_512 | 229.12us | 16502.34us | | 405 |
+SHA3_256 | 61.78us | 8328.70us | | 405 |
+SHA3_512 | 115.94us | 8344.80us | | 405 |
BLAKE2s | 18.54us | 1200.06us | | 171 |
BLAKE2b | 50.70us | 6515.87us | | 339 |
|
diff --git a/gen/genkeccak.c b/gen/genkeccak.c
new file mode 100644
index 00000000..5bf573a1
--- /dev/null
+++ b/gen/genkeccak.c
@@ -0,0 +1,612 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+// Special-purpose compiler that generates the AVR version of KeccakCore.
+
+#include <stdio.h>
+#include <stdarg.h>
+
+// 1 to inline rotates, 0 to call to helper functions.
+// 0 gives a smaller code size at a slight performance cost.
+static int inline_rotates = 1;
+
+static int indent = 4;
+
+static int t_reg = 8; // Temporary 64-bit value (any reg).
+
+static int x_reg = 26;
+static int y_reg = 28;
+static int z_reg = 30;
+
+static int const_reg = 21; // For temporary constants (must be a high reg).
+
+static int loop1_reg = 20; // For keeping track of loop counters (high regs).
+static int loop2_reg = 19;
+static int loop3_reg = 18;
+
+static int save1_reg = 17; // Save registers (any reg).
+static int save2_reg = 16;
+
+// Indent the code and print a string.
+void indent_printf(const char *format, ...)
+{
+ va_list va;
+ int posn;
+ va_start(va, format);
+ for (posn = 0; posn < indent; ++posn)
+ putc(' ', stdout);
+ vfprintf(stdout, format, va);
+ va_end(va);
+}
+
+// Print an assembler instruction within quotes.
+void insn_printf(const char *format, ...)
+{
+ va_list va;
+ int posn;
+ va_start(va, format);
+ for (posn = 0; posn < indent; ++posn)
+ putc(' ', stdout);
+ putc('"', stdout);
+ vfprintf(stdout, format, va);
+ putc('\\', stdout);
+ putc('n', stdout);
+ putc('"', stdout);
+ putc('\n', stdout);
+ va_end(va);
+}
+
+void leftRotate1(int reg)
+{
+ insn_printf("lsl r%d", reg);
+ insn_printf("rol r%d", reg + 1);
+ insn_printf("rol r%d", reg + 2);
+ insn_printf("rol r%d", reg + 3);
+ insn_printf("rol r%d", reg + 4);
+ insn_printf("rol r%d", reg + 5);
+ insn_printf("rol r%d", reg + 6);
+ insn_printf("rol r%d", reg + 7);
+ insn_printf("adc r%d, __zero_reg__", reg);
+}
+
+void rightRotate1(int reg)
+{
+ insn_printf("bst r%d,0", reg);
+ insn_printf("ror r%d", reg + 7);
+ insn_printf("ror r%d", reg + 6);
+ insn_printf("ror r%d", reg + 5);
+ insn_printf("ror r%d", reg + 4);
+ insn_printf("ror r%d", reg + 3);
+ insn_printf("ror r%d", reg + 2);
+ insn_printf("ror r%d", reg + 1);
+ insn_printf("ror r%d", reg);
+ insn_printf("bld r%d,7", reg + 7);
+}
+
+// Emit instructions that advance a 16-bit pointer register pair (X/Y/Z)
+// by "delta" bytes.  Small steps use adiw/sbiw (immediate limit is 63);
+// larger steps go through the high scratch register / subi-sbci forms.
+void adjust_pointer_reg(int reg, int delta)
+{
+    if (delta >= 64) {
+        insn_printf("ldi r%d,%d", const_reg, delta & 0xFF);
+        insn_printf("add r%d,r%d", reg, const_reg);
+        if ((delta >> 8) != 0) {
+            insn_printf("ldi r%d,%d", const_reg, (delta >> 8) & 0xFF);
+            insn_printf("adc r%d,r%d", reg + 1, const_reg);
+        } else {
+            insn_printf("adc r%d,__zero_reg__", reg + 1);
+        }
+    } else if (delta > 0) {
+        insn_printf("adiw r%d,%d", reg, delta);
+    } else if (delta <= -64) {
+        delta = -delta;
+        insn_printf("subi r%d,%d", reg, delta & 0xFF);
+        if ((delta >> 8) != 0) {
+            insn_printf("sbci r%d,%d", reg + 1, (delta >> 8) & 0xFF);
+        } else {
+            insn_printf("sbc r%d,__zero_reg__", reg + 1);
+        }
+    } else if (delta < 0) {
+        insn_printf("sbiw r%d,%d", reg, -delta);
+    }
+}
+
+void load64_from_z(int reg, int offset)
+{
+ if (offset == 0)
+ insn_printf("ld r%d,Z", reg);
+ else
+ insn_printf("ldd r%d,Z+%d", reg, offset);
+ insn_printf("ldd r%d,Z+%d", reg + 1, offset + 1);
+ insn_printf("ldd r%d,Z+%d", reg + 2, offset + 2);
+ insn_printf("ldd r%d,Z+%d", reg + 3, offset + 3);
+ insn_printf("ldd r%d,Z+%d", reg + 4, offset + 4);
+ insn_printf("ldd r%d,Z+%d", reg + 5, offset + 5);
+ insn_printf("ldd r%d,Z+%d", reg + 6, offset + 6);
+ insn_printf("ldd r%d,Z+%d", reg + 7, offset + 7);
+}
+
+void load64_from_y(int reg, int offset)
+{
+ if (offset == 0)
+ insn_printf("ld r%d,Y", reg);
+ else
+ insn_printf("ldd r%d,Y+%d", reg, offset);
+ insn_printf("ldd r%d,Y+%d", reg + 1, offset + 1);
+ insn_printf("ldd r%d,Y+%d", reg + 2, offset + 2);
+ insn_printf("ldd r%d,Y+%d", reg + 3, offset + 3);
+ insn_printf("ldd r%d,Y+%d", reg + 4, offset + 4);
+ insn_printf("ldd r%d,Y+%d", reg + 5, offset + 5);
+ insn_printf("ldd r%d,Y+%d", reg + 6, offset + 6);
+ insn_printf("ldd r%d,Y+%d", reg + 7, offset + 7);
+}
+
+void load64_from_z_combine(const char *op, int reg, int offset)
+{
+ int posn;
+ for (posn = 0; posn < 8; ++posn, ++offset, ++reg) {
+ if (offset == 0)
+ insn_printf("ld __tmp_reg__,Z");
+ else
+ insn_printf("ldd __tmp_reg__,Z+%d", offset);
+ insn_printf("%s r%d,__tmp_reg__", op, reg);
+ }
+}
+
+void load64_from_y_combine(const char *op, int reg, int offset)
+{
+ int posn;
+ for (posn = 0; posn < 8; ++posn, ++offset, ++reg) {
+ if (offset == 0)
+ insn_printf("ld __tmp_reg__,Y");
+ else
+ insn_printf("ldd __tmp_reg__,Y+%d", offset);
+ insn_printf("%s r%d,__tmp_reg__", op, reg);
+ }
+}
+
+void combine64_with_z(const char *op, int reg)
+{
+ int posn;
+ for (posn = 0; posn < 8; ++posn, ++reg) {
+ if (posn == 0)
+ insn_printf("ld __tmp_reg__,Z");
+ else
+ insn_printf("ldd __tmp_reg__,Z+%d", posn);
+ insn_printf("%s __tmp_reg__,r%d", op, reg);
+ if (posn == 0)
+ insn_printf("st Z,__tmp_reg__");
+ else
+ insn_printf("std Z+%d,__tmp_reg__", posn);
+ }
+}
+
+void load64_from_x(int reg)
+{
+ insn_printf("ld r%d,X+", reg);
+ insn_printf("ld r%d,X+", reg + 1);
+ insn_printf("ld r%d,X+", reg + 2);
+ insn_printf("ld r%d,X+", reg + 3);
+ insn_printf("ld r%d,X+", reg + 4);
+ insn_printf("ld r%d,X+", reg + 5);
+ insn_printf("ld r%d,X+", reg + 6);
+ insn_printf("ld r%d,X+", reg + 7);
+}
+
+void store64_to_x(int reg)
+{
+ insn_printf("st X+,r%d", reg);
+ insn_printf("st X+,r%d", reg + 1);
+ insn_printf("st X+,r%d", reg + 2);
+ insn_printf("st X+,r%d", reg + 3);
+ insn_printf("st X+,r%d", reg + 4);
+ insn_printf("st X+,r%d", reg + 5);
+ insn_printf("st X+,r%d", reg + 6);
+ insn_printf("st X+,r%d", reg + 7);
+}
+
+void theta(void)
+{
+ int index;
+
+ printf("\n");
+ indent_printf("// Step mapping theta. Compute C.\n");
+ insn_printf("ldi r%d,5", loop1_reg);
+ insn_printf("100:");
+
+ // Load state.A[0][index] into t_reg.
+ load64_from_z(t_reg, 0);
+
+ // XOR with state.A[1][index] .. state.A[4][index]
+ insn_printf("ldi r%d,4", loop2_reg);
+ insn_printf("101:");
+ adjust_pointer_reg(z_reg, 40);
+ load64_from_z_combine("eor", t_reg, 0);
+ insn_printf("dec r%d", loop2_reg);
+ insn_printf("brne 101b");
+
+ // Store into state.B[0][index].
+ store64_to_x(t_reg);
+
+ // End of the outer loop.
+ adjust_pointer_reg(z_reg, -(160 - 8));
+ insn_printf("dec r%d", loop1_reg);
+ insn_printf("brne 100b");
+ adjust_pointer_reg(z_reg, -40);
+
+ // Generate the D values into the second row of B. To make this
+ // easier, we know that the original X value is also in Y so we
+ // can use offsets relative to Y for the first row of B.
+ printf("\n");
+ indent_printf("// Step mapping theta. Compute D.\n");
+ for (index = 0; index < 5; ++index) {
+ load64_from_y(t_reg, ((index + 1) % 5) * 8);
+ leftRotate1(t_reg);
+ load64_from_y_combine("eor", t_reg, ((index + 4) % 5) * 8);
+ store64_to_x(t_reg);
+ }
+ adjust_pointer_reg(x_reg, -40);
+
+ // XOR every D[index] with every A[x][index] element.
+ printf("\n");
+ indent_printf("// Step mapping theta. XOR D with A.\n");
+ insn_printf("ldi r%d,5", loop1_reg);
+ insn_printf("102:");
+ load64_from_x(t_reg);
+ insn_printf("ldi r%d,5", loop2_reg);
+ insn_printf("103:");
+ combine64_with_z("eor", t_reg);
+ adjust_pointer_reg(z_reg, 40);
+ insn_printf("dec r%d", loop2_reg);
+ insn_printf("brne 103b");
+ adjust_pointer_reg(z_reg, -(200 - 8));
+ insn_printf("dec r%d", loop1_reg);
+ insn_printf("brne 102b");
+ adjust_pointer_reg(x_reg, -80);
+ adjust_pointer_reg(z_reg, -40);
+}
+
+void rho_pi(void)
+{
+ typedef struct {
+ int x, y, rot;
+ } map;
+ static map const Bmap[5][5] = { // indexed by y, x
+ {
+ {0, 0, 0}, // B[0][0]
+ {0, 3, 28}, // B[1][0]
+ {0, 1, 1}, // B[2][0]
+ {0, 4, 27}, // B[3][0]
+ {0, 2, 62} // B[4][0]
+ },
+ {
+ {1, 1, 44}, // B[0][1]
+ {1, 4, 20}, // B[1][1]
+ {1, 2, 6}, // B[2][1]
+ {1, 0, 36}, // B[3][1]
+ {1, 3, 55} // B[4][1]
+ },
+ {
+ {2, 2, 43}, // B[0][2]
+ {2, 0, 3}, // B[1][2]
+ {2, 3, 25}, // B[2][2]
+ {2, 1, 10}, // B[3][2]
+ {2, 4, 39} // B[4][2]
+ },
+ {
+ {3, 3, 21}, // B[0][3]
+ {3, 1, 45}, // B[1][3]
+ {3, 4, 8}, // B[2][3]
+ {3, 2, 15}, // B[3][3]
+ {3, 0, 41} // B[4][3]
+ },
+ {
+ {4, 4, 14}, // B[0][4]
+ {4, 2, 61}, // B[1][4]
+ {4, 0, 18}, // B[2][4]
+ {4, 3, 56}, // B[3][4]
+ {4, 1, 2} // B[4][4]
+ }
+ };
+ int Boffset = 0;
+ int Aoffset = 0;
+ int offset;
+ int Bx, By, Ax, Ay, rot, adjust;
+ printf("\n");
+ indent_printf("// Step mappings rho and pi combined into one step.\n");
+ for (By = 0; By < 5; ++By) {
+ for (Bx = 0; Bx < 5; ++Bx) {
+ // What do we need to load?
+ Ax = Bmap[By][Bx].x;
+ Ay = Bmap[By][Bx].y;
+ rot = Bmap[By][Bx].rot;
+
+ // Heading for this step.
+ printf("\n");
+ if (rot != 0) {
+ indent_printf("// state.B[%d][%d] = leftRotate%d_64(state.A[%d][%d])\n",
+ Bx, By, rot, Ax, Ay);
+ } else {
+ indent_printf("// state.B[%d][%d] = state.A[%d][%d]\n",
+ Bx, By, Ax, Ay);
+ }
+
+ // Adjust rX and rZ to point at the new locations.
+ offset = (Bx * 5 + By) * 8;
+ adjust_pointer_reg(x_reg, offset - Boffset);
+ Boffset = offset;
+ offset = Ax * 5 * 8;
+ adjust_pointer_reg(z_reg, offset - Aoffset);
+ Aoffset = offset;
+ offset = Ay * 8;
+
+ // Load the A/rZ value into the temp regs.
+ load64_from_z(t_reg, offset);
+
+ // Rotate.
+ adjust = (8 - rot / 8) % 8;
+ switch (rot % 8) {
+ case 0:
+ break;
+ case 1:
+ if (inline_rotates) {
+ leftRotate1(t_reg);
+ } else {
+ insn_printf("call 11f");
+ }
+ break;
+ case 2:
+ if (inline_rotates) {
+ leftRotate1(t_reg);
+ leftRotate1(t_reg);
+ } else {
+ insn_printf("call 12f");
+ }
+ break;
+ case 3:
+ if (inline_rotates) {
+ leftRotate1(t_reg);
+ leftRotate1(t_reg);
+ leftRotate1(t_reg);
+ } else {
+ insn_printf("call 13f");
+ }
+ break;
+ case 4:
+ if (inline_rotates) {
+ leftRotate1(t_reg);
+ leftRotate1(t_reg);
+ leftRotate1(t_reg);
+ leftRotate1(t_reg);
+ } else {
+ insn_printf("call 14f");
+ }
+ break;
+ case 5:
+ if (inline_rotates) {
+ rightRotate1(t_reg);
+ rightRotate1(t_reg);
+ rightRotate1(t_reg);
+ } else {
+ insn_printf("call 23f");
+ }
+ adjust = (adjust + 7) % 8;
+ break;
+ case 6:
+ if (inline_rotates) {
+ rightRotate1(t_reg);
+ rightRotate1(t_reg);
+ } else {
+ insn_printf("call 22f");
+ }
+ adjust = (adjust + 7) % 8;
+ break;
+ case 7:
+ if (inline_rotates) {
+ rightRotate1(t_reg);
+ } else {
+ insn_printf("call 21f");
+ }
+ adjust = (adjust + 7) % 8;
+ break;
+ default:
+ break;
+ }
+
+ // Perform byte rotations and store into B/rX.
+ insn_printf("st X+,r%d", t_reg + adjust);
+ insn_printf("st X+,r%d", t_reg + (adjust + 1) % 8);
+ insn_printf("st X+,r%d", t_reg + (adjust + 2) % 8);
+ insn_printf("st X+,r%d", t_reg + (adjust + 3) % 8);
+ insn_printf("st X+,r%d", t_reg + (adjust + 4) % 8);
+ insn_printf("st X+,r%d", t_reg + (adjust + 5) % 8);
+ insn_printf("st X+,r%d", t_reg + (adjust + 6) % 8);
+ insn_printf("st X+,r%d", t_reg + (adjust + 7) % 8);
+ Boffset += 8; // rX has advanced by 8 while doing this.
+ }
+ }
+
+ // Return rX and rZ to their starting values for the next step mapping.
+ adjust_pointer_reg(x_reg, -Boffset);
+ adjust_pointer_reg(z_reg, -Aoffset);
+}
+
+void load8_from_y(int reg, int offset)
+{
+ if (offset == 0)
+ insn_printf("ld r%d,Y", reg);
+ else
+ insn_printf("ldd r%d,Y+%d", reg, offset);
+}
+
+void store8_to_z(int reg, int offset)
+{
+ if (offset == 0)
+ insn_printf("st Z,r%d", reg);
+ else
+ insn_printf("std Z+%d,r%d", offset, reg);
+}
+
+void chi(void)
+{
+ int index;
+
+ // Step mapping chi. A is pointed to by Z and B is pointed to by X/Y.
+ // state.A[index2][index] =
+ // state.B[index2][index] ^
+ // ((~state.B[index2][(index + 1) % 5]) &
+ // state.B[index2][(index + 2) % 5]);
+ // We compute this using an interleaving method. We load five bytes
+ // from the 5 words in a row of B and then compute the 5 output bytes
+ // from that and store. Then we move onto the next 5 bytes of each row.
+ int in1 = t_reg;
+ int in2 = t_reg + 1;
+ int in3 = t_reg + 2;
+ int in4 = t_reg + 3;
+ int in5 = t_reg + 4;
+ int out1 = t_reg + 5;
+ int out2 = t_reg + 6;
+ int out3 = t_reg + 7;
+ int out4 = save1_reg;
+ int out5 = save2_reg;
+ printf("\n");
+ indent_printf("// Step mapping chi.\n");
+ insn_printf("ldi r%d,5", loop1_reg);
+ insn_printf("50:");
+ for (index = 0; index < 8; ++index) {
+ load8_from_y(in1, index);
+ load8_from_y(in2, index + 8);
+ load8_from_y(in3, index + 16);
+ load8_from_y(in4, index + 24);
+ load8_from_y(in5, index + 32);
+ insn_printf("mov r%d,r%d", out1, in2);
+ insn_printf("com r%d", out1);
+ insn_printf("and r%d,r%d", out1, in3);
+ insn_printf("eor r%d,r%d", out1, in1);
+ insn_printf("mov r%d,r%d", out2, in3);
+ insn_printf("com r%d", out2);
+ insn_printf("and r%d,r%d", out2, in4);
+ insn_printf("eor r%d,r%d", out2, in2);
+ insn_printf("mov r%d,r%d", out3, in4);
+ insn_printf("com r%d", out3);
+ insn_printf("and r%d,r%d", out3, in5);
+ insn_printf("eor r%d,r%d", out3, in3);
+ insn_printf("mov r%d,r%d", out4, in5);
+ insn_printf("com r%d", out4);
+ insn_printf("and r%d,r%d", out4, in1);
+ insn_printf("eor r%d,r%d", out4, in4);
+ insn_printf("mov r%d,r%d", out5, in1);
+ insn_printf("com r%d", out5);
+ insn_printf("and r%d,r%d", out5, in2);
+ insn_printf("eor r%d,r%d", out5, in5);
+ store8_to_z(out1, index);
+ store8_to_z(out2, index + 8);
+ store8_to_z(out3, index + 16);
+ store8_to_z(out4, index + 24);
+ store8_to_z(out5, index + 32);
+ }
+ adjust_pointer_reg(z_reg, 5 * 8);
+ adjust_pointer_reg(y_reg, 5 * 8);
+ insn_printf("dec r%d", loop1_reg);
+ insn_printf("breq 51f");
+ insn_printf("rjmp 50b");
+ insn_printf("51:");
+
+ // Restore Y and Z. We don't need this yet because chi() is the
+ // last thing we do so there's no point resetting the registers.
+ //adjust_pointer_reg(y_reg, -200);
+ //adjust_pointer_reg(z_reg, -200);
+}
+
+void outer_loop(void)
+{
+ // Save Y and then copy X into Y. This way we can use displacements
+ // relative to the Y register which we cannot do with X.
+ insn_printf("push r%d", y_reg + 1);
+ insn_printf("push r%d", y_reg);
+ insn_printf("mov r%d,r%d", y_reg, x_reg);
+ insn_printf("mov r%d,r%d", y_reg + 1, x_reg + 1);
+
+ // Output the step mappings.
+ theta();
+ rho_pi();
+ chi();
+ // iota();
+
+ // Restore Y.
+ insn_printf("pop r%d", y_reg);
+ insn_printf("pop r%d", y_reg + 1);
+
+ // Dump the rotation utilities.
+ if (!inline_rotates) {
+ insn_printf("rjmp 3f");
+ printf("\n");
+ indent_printf("// Left rotate by 4 bits\n");
+ insn_printf("14:");
+ leftRotate1(t_reg);
+ indent_printf("// Left rotate by 3 bits\n");
+ insn_printf("13:");
+ leftRotate1(t_reg);
+ indent_printf("// Left rotate by 2 bits\n");
+ insn_printf("12:");
+ leftRotate1(t_reg);
+ indent_printf("// Left rotate by 1 bit\n");
+ insn_printf("11:");
+ leftRotate1(t_reg);
+ insn_printf("ret");
+ printf("\n");
+ indent_printf("// Right rotate by 3 bits\n");
+ insn_printf("23:");
+ rightRotate1(t_reg);
+ indent_printf("// Right rotate by 2 bits\n");
+ insn_printf("22:");
+ rightRotate1(t_reg);
+ indent_printf("// Right rotate by 1 bit\n");
+ insn_printf("21:");
+ rightRotate1(t_reg);
+ insn_printf("ret");
+ }
+
+ // End of assembly.
+ printf("\n");
+ indent_printf("// Done\n");
+ if (!inline_rotates) {
+ insn_printf("3:");
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ indent_printf("__asm__ __volatile__ (\n");
+ indent += 4;
+ outer_loop();
+ indent_printf(": : \"x\"(state.B), \"z\"(state.A)\n");
+ indent_printf(": \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
+ "\"r%d\", \"r%d\", \"r%d\", \"r%d\",\n",
+ t_reg, t_reg + 1, t_reg + 2, t_reg + 3,
+ t_reg + 4, t_reg + 5, t_reg + 6, t_reg + 7);
+ indent_printf(" \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
+ "\"r%d\", \"r%d\", \"memory\"\n",
+ save2_reg, save1_reg, loop3_reg, loop2_reg,
+ loop1_reg, const_reg);
+ indent -= 4;
+ indent_printf(");\n");
+ return 0;
+}
diff --git a/libraries/Crypto/KeccakCore.cpp b/libraries/Crypto/KeccakCore.cpp
index eeaa56ad..86b22bbf 100644
--- a/libraries/Crypto/KeccakCore.cpp
+++ b/libraries/Crypto/KeccakCore.cpp
@@ -287,6 +287,1551 @@ void KeccakCore::setHMACKey(const void *key, size_t len, uint8_t pad, size_t has
*/
void KeccakCore::keccakp()
{
+#if defined(__AVR__)
+ // This assembly code was generated by the "genkeccak.c" program.
+ // Do not modify this code directly. Instead modify "genkeccak.c"
+ // and then re-generate the code here.
+ for (uint8_t round = 0; round < 24; ++round) {
+ __asm__ __volatile__ (
+ "push r29\n"
+ "push r28\n"
+ "mov r28,r26\n"
+ "mov r29,r27\n"
+
+ // Step mapping theta. Compute C.
+ "ldi r20,5\n"
+ "100:\n"
+ "ld r8,Z\n"
+ "ldd r9,Z+1\n"
+ "ldd r10,Z+2\n"
+ "ldd r11,Z+3\n"
+ "ldd r12,Z+4\n"
+ "ldd r13,Z+5\n"
+ "ldd r14,Z+6\n"
+ "ldd r15,Z+7\n"
+ "ldi r19,4\n"
+ "101:\n"
+ "adiw r30,40\n"
+ "ld __tmp_reg__,Z\n"
+ "eor r8,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+1\n"
+ "eor r9,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+2\n"
+ "eor r10,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+3\n"
+ "eor r11,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+4\n"
+ "eor r12,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+5\n"
+ "eor r13,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+6\n"
+ "eor r14,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+7\n"
+ "eor r15,__tmp_reg__\n"
+ "dec r19\n"
+ "brne 101b\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "subi r30,152\n"
+ "sbc r31,__zero_reg__\n"
+ "dec r20\n"
+ "brne 100b\n"
+ "sbiw r30,40\n"
+
+ // Step mapping theta. Compute D.
+ "ldd r8,Y+8\n"
+ "ldd r9,Y+9\n"
+ "ldd r10,Y+10\n"
+ "ldd r11,Y+11\n"
+ "ldd r12,Y+12\n"
+ "ldd r13,Y+13\n"
+ "ldd r14,Y+14\n"
+ "ldd r15,Y+15\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "ldd __tmp_reg__,Y+32\n"
+ "eor r8,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+33\n"
+ "eor r9,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+34\n"
+ "eor r10,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+35\n"
+ "eor r11,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+36\n"
+ "eor r12,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+37\n"
+ "eor r13,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+38\n"
+ "eor r14,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+39\n"
+ "eor r15,__tmp_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "ldd r8,Y+16\n"
+ "ldd r9,Y+17\n"
+ "ldd r10,Y+18\n"
+ "ldd r11,Y+19\n"
+ "ldd r12,Y+20\n"
+ "ldd r13,Y+21\n"
+ "ldd r14,Y+22\n"
+ "ldd r15,Y+23\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "ld __tmp_reg__,Y\n"
+ "eor r8,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+1\n"
+ "eor r9,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+2\n"
+ "eor r10,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+3\n"
+ "eor r11,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+4\n"
+ "eor r12,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+5\n"
+ "eor r13,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+6\n"
+ "eor r14,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+7\n"
+ "eor r15,__tmp_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "ldd r8,Y+24\n"
+ "ldd r9,Y+25\n"
+ "ldd r10,Y+26\n"
+ "ldd r11,Y+27\n"
+ "ldd r12,Y+28\n"
+ "ldd r13,Y+29\n"
+ "ldd r14,Y+30\n"
+ "ldd r15,Y+31\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "ldd __tmp_reg__,Y+8\n"
+ "eor r8,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+9\n"
+ "eor r9,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+10\n"
+ "eor r10,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+11\n"
+ "eor r11,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+12\n"
+ "eor r12,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+13\n"
+ "eor r13,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+14\n"
+ "eor r14,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+15\n"
+ "eor r15,__tmp_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "ldd r8,Y+32\n"
+ "ldd r9,Y+33\n"
+ "ldd r10,Y+34\n"
+ "ldd r11,Y+35\n"
+ "ldd r12,Y+36\n"
+ "ldd r13,Y+37\n"
+ "ldd r14,Y+38\n"
+ "ldd r15,Y+39\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "ldd __tmp_reg__,Y+16\n"
+ "eor r8,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+17\n"
+ "eor r9,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+18\n"
+ "eor r10,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+19\n"
+ "eor r11,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+20\n"
+ "eor r12,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+21\n"
+ "eor r13,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+22\n"
+ "eor r14,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+23\n"
+ "eor r15,__tmp_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "ld r8,Y\n"
+ "ldd r9,Y+1\n"
+ "ldd r10,Y+2\n"
+ "ldd r11,Y+3\n"
+ "ldd r12,Y+4\n"
+ "ldd r13,Y+5\n"
+ "ldd r14,Y+6\n"
+ "ldd r15,Y+7\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "ldd __tmp_reg__,Y+24\n"
+ "eor r8,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+25\n"
+ "eor r9,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+26\n"
+ "eor r10,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+27\n"
+ "eor r11,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+28\n"
+ "eor r12,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+29\n"
+ "eor r13,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+30\n"
+ "eor r14,__tmp_reg__\n"
+ "ldd __tmp_reg__,Y+31\n"
+ "eor r15,__tmp_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "sbiw r26,40\n"
+
+ // Step mapping theta. XOR D with A.
+ "ldi r20,5\n"
+ "102:\n"
+ "ld r8,X+\n"
+ "ld r9,X+\n"
+ "ld r10,X+\n"
+ "ld r11,X+\n"
+ "ld r12,X+\n"
+ "ld r13,X+\n"
+ "ld r14,X+\n"
+ "ld r15,X+\n"
+ "ldi r19,5\n"
+ "103:\n"
+ "ld __tmp_reg__,Z\n"
+ "eor __tmp_reg__,r8\n"
+ "st Z,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+1\n"
+ "eor __tmp_reg__,r9\n"
+ "std Z+1,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+2\n"
+ "eor __tmp_reg__,r10\n"
+ "std Z+2,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+3\n"
+ "eor __tmp_reg__,r11\n"
+ "std Z+3,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+4\n"
+ "eor __tmp_reg__,r12\n"
+ "std Z+4,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+5\n"
+ "eor __tmp_reg__,r13\n"
+ "std Z+5,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+6\n"
+ "eor __tmp_reg__,r14\n"
+ "std Z+6,__tmp_reg__\n"
+ "ldd __tmp_reg__,Z+7\n"
+ "eor __tmp_reg__,r15\n"
+ "std Z+7,__tmp_reg__\n"
+ "adiw r30,40\n"
+ "dec r19\n"
+ "brne 103b\n"
+ "subi r30,192\n"
+ "sbc r31,__zero_reg__\n"
+ "dec r20\n"
+ "brne 102b\n"
+ "subi r26,80\n"
+ "sbc r27,__zero_reg__\n"
+ "sbiw r30,40\n"
+
+ // Step mappings rho and pi combined into one step.
+
+ // state.B[0][0] = state.A[0][0]
+ "ld r8,Z\n"
+ "ldd r9,Z+1\n"
+ "ldd r10,Z+2\n"
+ "ldd r11,Z+3\n"
+ "ldd r12,Z+4\n"
+ "ldd r13,Z+5\n"
+ "ldd r14,Z+6\n"
+ "ldd r15,Z+7\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+
+ // state.B[1][0] = leftRotate28_64(state.A[0][3])
+ "adiw r26,32\n"
+ "ldd r8,Z+24\n"
+ "ldd r9,Z+25\n"
+ "ldd r10,Z+26\n"
+ "ldd r11,Z+27\n"
+ "ldd r12,Z+28\n"
+ "ldd r13,Z+29\n"
+ "ldd r14,Z+30\n"
+ "ldd r15,Z+31\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+
+ // state.B[2][0] = leftRotate1_64(state.A[0][1])
+ "adiw r26,32\n"
+ "ldd r8,Z+8\n"
+ "ldd r9,Z+9\n"
+ "ldd r10,Z+10\n"
+ "ldd r11,Z+11\n"
+ "ldd r12,Z+12\n"
+ "ldd r13,Z+13\n"
+ "ldd r14,Z+14\n"
+ "ldd r15,Z+15\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+
+ // state.B[3][0] = leftRotate27_64(state.A[0][4])
+ "adiw r26,32\n"
+ "ldd r8,Z+32\n"
+ "ldd r9,Z+33\n"
+ "ldd r10,Z+34\n"
+ "ldd r11,Z+35\n"
+ "ldd r12,Z+36\n"
+ "ldd r13,Z+37\n"
+ "ldd r14,Z+38\n"
+ "ldd r15,Z+39\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+
+ // state.B[4][0] = leftRotate62_64(state.A[0][2])
+ "adiw r26,32\n"
+ "ldd r8,Z+16\n"
+ "ldd r9,Z+17\n"
+ "ldd r10,Z+18\n"
+ "ldd r11,Z+19\n"
+ "ldd r12,Z+20\n"
+ "ldd r13,Z+21\n"
+ "ldd r14,Z+22\n"
+ "ldd r15,Z+23\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+
+ // state.B[0][1] = leftRotate44_64(state.A[1][1])
+ "subi r26,160\n"
+ "sbc r27,__zero_reg__\n"
+ "adiw r30,40\n"
+ "ldd r8,Z+8\n"
+ "ldd r9,Z+9\n"
+ "ldd r10,Z+10\n"
+ "ldd r11,Z+11\n"
+ "ldd r12,Z+12\n"
+ "ldd r13,Z+13\n"
+ "ldd r14,Z+14\n"
+ "ldd r15,Z+15\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+
+ // state.B[1][1] = leftRotate20_64(state.A[1][4])
+ "adiw r26,32\n"
+ "ldd r8,Z+32\n"
+ "ldd r9,Z+33\n"
+ "ldd r10,Z+34\n"
+ "ldd r11,Z+35\n"
+ "ldd r12,Z+36\n"
+ "ldd r13,Z+37\n"
+ "ldd r14,Z+38\n"
+ "ldd r15,Z+39\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+
+ // state.B[2][1] = leftRotate6_64(state.A[1][2])
+ "adiw r26,32\n"
+ "ldd r8,Z+16\n"
+ "ldd r9,Z+17\n"
+ "ldd r10,Z+18\n"
+ "ldd r11,Z+19\n"
+ "ldd r12,Z+20\n"
+ "ldd r13,Z+21\n"
+ "ldd r14,Z+22\n"
+ "ldd r15,Z+23\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+
+ // state.B[3][1] = leftRotate36_64(state.A[1][0])
+ "adiw r26,32\n"
+ "ld r8,Z\n"
+ "ldd r9,Z+1\n"
+ "ldd r10,Z+2\n"
+ "ldd r11,Z+3\n"
+ "ldd r12,Z+4\n"
+ "ldd r13,Z+5\n"
+ "ldd r14,Z+6\n"
+ "ldd r15,Z+7\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+
+ // state.B[4][1] = leftRotate55_64(state.A[1][3])
+ "adiw r26,32\n"
+ "ldd r8,Z+24\n"
+ "ldd r9,Z+25\n"
+ "ldd r10,Z+26\n"
+ "ldd r11,Z+27\n"
+ "ldd r12,Z+28\n"
+ "ldd r13,Z+29\n"
+ "ldd r14,Z+30\n"
+ "ldd r15,Z+31\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+
+ // state.B[0][2] = leftRotate43_64(state.A[2][2])
+ "subi r26,160\n"
+ "sbc r27,__zero_reg__\n"
+ "adiw r30,40\n"
+ "ldd r8,Z+16\n"
+ "ldd r9,Z+17\n"
+ "ldd r10,Z+18\n"
+ "ldd r11,Z+19\n"
+ "ldd r12,Z+20\n"
+ "ldd r13,Z+21\n"
+ "ldd r14,Z+22\n"
+ "ldd r15,Z+23\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+
+ // state.B[1][2] = leftRotate3_64(state.A[2][0])
+ "adiw r26,32\n"
+ "ld r8,Z\n"
+ "ldd r9,Z+1\n"
+ "ldd r10,Z+2\n"
+ "ldd r11,Z+3\n"
+ "ldd r12,Z+4\n"
+ "ldd r13,Z+5\n"
+ "ldd r14,Z+6\n"
+ "ldd r15,Z+7\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+
+ // state.B[2][2] = leftRotate25_64(state.A[2][3])
+ "adiw r26,32\n"
+ "ldd r8,Z+24\n"
+ "ldd r9,Z+25\n"
+ "ldd r10,Z+26\n"
+ "ldd r11,Z+27\n"
+ "ldd r12,Z+28\n"
+ "ldd r13,Z+29\n"
+ "ldd r14,Z+30\n"
+ "ldd r15,Z+31\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+
+ // state.B[3][2] = leftRotate10_64(state.A[2][1])
+ "adiw r26,32\n"
+ "ldd r8,Z+8\n"
+ "ldd r9,Z+9\n"
+ "ldd r10,Z+10\n"
+ "ldd r11,Z+11\n"
+ "ldd r12,Z+12\n"
+ "ldd r13,Z+13\n"
+ "ldd r14,Z+14\n"
+ "ldd r15,Z+15\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+
+ // state.B[4][2] = leftRotate39_64(state.A[2][4])
+ "adiw r26,32\n"
+ "ldd r8,Z+32\n"
+ "ldd r9,Z+33\n"
+ "ldd r10,Z+34\n"
+ "ldd r11,Z+35\n"
+ "ldd r12,Z+36\n"
+ "ldd r13,Z+37\n"
+ "ldd r14,Z+38\n"
+ "ldd r15,Z+39\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+
+ // state.B[0][3] = leftRotate21_64(state.A[3][3])
+ "subi r26,160\n"
+ "sbc r27,__zero_reg__\n"
+ "adiw r30,40\n"
+ "ldd r8,Z+24\n"
+ "ldd r9,Z+25\n"
+ "ldd r10,Z+26\n"
+ "ldd r11,Z+27\n"
+ "ldd r12,Z+28\n"
+ "ldd r13,Z+29\n"
+ "ldd r14,Z+30\n"
+ "ldd r15,Z+31\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+
+ // state.B[1][3] = leftRotate45_64(state.A[3][1])
+ "adiw r26,32\n"
+ "ldd r8,Z+8\n"
+ "ldd r9,Z+9\n"
+ "ldd r10,Z+10\n"
+ "ldd r11,Z+11\n"
+ "ldd r12,Z+12\n"
+ "ldd r13,Z+13\n"
+ "ldd r14,Z+14\n"
+ "ldd r15,Z+15\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+
+ // state.B[2][3] = leftRotate8_64(state.A[3][4])
+ "adiw r26,32\n"
+ "ldd r8,Z+32\n"
+ "ldd r9,Z+33\n"
+ "ldd r10,Z+34\n"
+ "ldd r11,Z+35\n"
+ "ldd r12,Z+36\n"
+ "ldd r13,Z+37\n"
+ "ldd r14,Z+38\n"
+ "ldd r15,Z+39\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+
+ // state.B[3][3] = leftRotate15_64(state.A[3][2])
+ "adiw r26,32\n"
+ "ldd r8,Z+16\n"
+ "ldd r9,Z+17\n"
+ "ldd r10,Z+18\n"
+ "ldd r11,Z+19\n"
+ "ldd r12,Z+20\n"
+ "ldd r13,Z+21\n"
+ "ldd r14,Z+22\n"
+ "ldd r15,Z+23\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+
+ // state.B[4][3] = leftRotate41_64(state.A[3][0])
+ "adiw r26,32\n"
+ "ld r8,Z\n"
+ "ldd r9,Z+1\n"
+ "ldd r10,Z+2\n"
+ "ldd r11,Z+3\n"
+ "ldd r12,Z+4\n"
+ "ldd r13,Z+5\n"
+ "ldd r14,Z+6\n"
+ "ldd r15,Z+7\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+
+ // state.B[0][4] = leftRotate14_64(state.A[4][4])
+ "subi r26,160\n"
+ "sbc r27,__zero_reg__\n"
+ "adiw r30,40\n"
+ "ldd r8,Z+32\n"
+ "ldd r9,Z+33\n"
+ "ldd r10,Z+34\n"
+ "ldd r11,Z+35\n"
+ "ldd r12,Z+36\n"
+ "ldd r13,Z+37\n"
+ "ldd r14,Z+38\n"
+ "ldd r15,Z+39\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+
+ // state.B[1][4] = leftRotate61_64(state.A[4][2])
+ "adiw r26,32\n"
+ "ldd r8,Z+16\n"
+ "ldd r9,Z+17\n"
+ "ldd r10,Z+18\n"
+ "ldd r11,Z+19\n"
+ "ldd r12,Z+20\n"
+ "ldd r13,Z+21\n"
+ "ldd r14,Z+22\n"
+ "ldd r15,Z+23\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "bst r8,0\n"
+ "ror r15\n"
+ "ror r14\n"
+ "ror r13\n"
+ "ror r12\n"
+ "ror r11\n"
+ "ror r10\n"
+ "ror r9\n"
+ "ror r8\n"
+ "bld r15,7\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+
+ // state.B[2][4] = leftRotate18_64(state.A[4][0])
+ "adiw r26,32\n"
+ "ld r8,Z\n"
+ "ldd r9,Z+1\n"
+ "ldd r10,Z+2\n"
+ "ldd r11,Z+3\n"
+ "ldd r12,Z+4\n"
+ "ldd r13,Z+5\n"
+ "ldd r14,Z+6\n"
+ "ldd r15,Z+7\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+
+ // state.B[3][4] = leftRotate56_64(state.A[4][3])
+ "adiw r26,32\n"
+ "ldd r8,Z+24\n"
+ "ldd r9,Z+25\n"
+ "ldd r10,Z+26\n"
+ "ldd r11,Z+27\n"
+ "ldd r12,Z+28\n"
+ "ldd r13,Z+29\n"
+ "ldd r14,Z+30\n"
+ "ldd r15,Z+31\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "st X+,r8\n"
+
+ // state.B[4][4] = leftRotate2_64(state.A[4][1])
+ "adiw r26,32\n"
+ "ldd r8,Z+8\n"
+ "ldd r9,Z+9\n"
+ "ldd r10,Z+10\n"
+ "ldd r11,Z+11\n"
+ "ldd r12,Z+12\n"
+ "ldd r13,Z+13\n"
+ "ldd r14,Z+14\n"
+ "ldd r15,Z+15\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "lsl r8\n"
+ "rol r9\n"
+ "rol r10\n"
+ "rol r11\n"
+ "rol r12\n"
+ "rol r13\n"
+ "rol r14\n"
+ "rol r15\n"
+ "adc r8, __zero_reg__\n"
+ "st X+,r8\n"
+ "st X+,r9\n"
+ "st X+,r10\n"
+ "st X+,r11\n"
+ "st X+,r12\n"
+ "st X+,r13\n"
+ "st X+,r14\n"
+ "st X+,r15\n"
+ "subi r26,200\n"
+ "sbc r27,__zero_reg__\n"
+ "subi r30,160\n"
+ "sbc r31,__zero_reg__\n"
+
+ // Step mapping chi.
+ "ldi r20,5\n"
+ "50:\n"
+ "ld r8,Y\n"
+ "ldd r9,Y+8\n"
+ "ldd r10,Y+16\n"
+ "ldd r11,Y+24\n"
+ "ldd r12,Y+32\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "st Z,r13\n"
+ "std Z+8,r14\n"
+ "std Z+16,r15\n"
+ "std Z+24,r17\n"
+ "std Z+32,r16\n"
+ "ldd r8,Y+1\n"
+ "ldd r9,Y+9\n"
+ "ldd r10,Y+17\n"
+ "ldd r11,Y+25\n"
+ "ldd r12,Y+33\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+1,r13\n"
+ "std Z+9,r14\n"
+ "std Z+17,r15\n"
+ "std Z+25,r17\n"
+ "std Z+33,r16\n"
+ "ldd r8,Y+2\n"
+ "ldd r9,Y+10\n"
+ "ldd r10,Y+18\n"
+ "ldd r11,Y+26\n"
+ "ldd r12,Y+34\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+2,r13\n"
+ "std Z+10,r14\n"
+ "std Z+18,r15\n"
+ "std Z+26,r17\n"
+ "std Z+34,r16\n"
+ "ldd r8,Y+3\n"
+ "ldd r9,Y+11\n"
+ "ldd r10,Y+19\n"
+ "ldd r11,Y+27\n"
+ "ldd r12,Y+35\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+3,r13\n"
+ "std Z+11,r14\n"
+ "std Z+19,r15\n"
+ "std Z+27,r17\n"
+ "std Z+35,r16\n"
+ "ldd r8,Y+4\n"
+ "ldd r9,Y+12\n"
+ "ldd r10,Y+20\n"
+ "ldd r11,Y+28\n"
+ "ldd r12,Y+36\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+4,r13\n"
+ "std Z+12,r14\n"
+ "std Z+20,r15\n"
+ "std Z+28,r17\n"
+ "std Z+36,r16\n"
+ "ldd r8,Y+5\n"
+ "ldd r9,Y+13\n"
+ "ldd r10,Y+21\n"
+ "ldd r11,Y+29\n"
+ "ldd r12,Y+37\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+5,r13\n"
+ "std Z+13,r14\n"
+ "std Z+21,r15\n"
+ "std Z+29,r17\n"
+ "std Z+37,r16\n"
+ "ldd r8,Y+6\n"
+ "ldd r9,Y+14\n"
+ "ldd r10,Y+22\n"
+ "ldd r11,Y+30\n"
+ "ldd r12,Y+38\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+6,r13\n"
+ "std Z+14,r14\n"
+ "std Z+22,r15\n"
+ "std Z+30,r17\n"
+ "std Z+38,r16\n"
+ "ldd r8,Y+7\n"
+ "ldd r9,Y+15\n"
+ "ldd r10,Y+23\n"
+ "ldd r11,Y+31\n"
+ "ldd r12,Y+39\n"
+ "mov r13,r9\n"
+ "com r13\n"
+ "and r13,r10\n"
+ "eor r13,r8\n"
+ "mov r14,r10\n"
+ "com r14\n"
+ "and r14,r11\n"
+ "eor r14,r9\n"
+ "mov r15,r11\n"
+ "com r15\n"
+ "and r15,r12\n"
+ "eor r15,r10\n"
+ "mov r17,r12\n"
+ "com r17\n"
+ "and r17,r8\n"
+ "eor r17,r11\n"
+ "mov r16,r8\n"
+ "com r16\n"
+ "and r16,r9\n"
+ "eor r16,r12\n"
+ "std Z+7,r13\n"
+ "std Z+15,r14\n"
+ "std Z+23,r15\n"
+ "std Z+31,r17\n"
+ "std Z+39,r16\n"
+ "adiw r30,40\n"
+ "adiw r28,40\n"
+ "dec r20\n"
+ "breq 51f\n"
+ "rjmp 50b\n"
+ "51:\n"
+ "pop r28\n"
+ "pop r29\n"
+
+ // Done
+ : : "x"(state.B), "z"(state.A)
+ : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "r16", "r17", "r18", "r19", "r20", "r21", "memory"
+ );
+#else
static const uint8_t addMod5Table[9] PROGMEM = {
0, 1, 2, 3, 4, 0, 1, 2, 3
};
@@ -346,6 +1891,7 @@ void KeccakCore::keccakp()
state.B[index2][addMod5(index, 2)]);
}
}
+#endif
// Step mapping iota. XOR A[0][0] with the round constant.
static uint64_t const RC[24] PROGMEM = {
diff --git a/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino b/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino
index 71d17c4b..51864638 100644
--- a/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino
+++ b/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino
@@ -261,7 +261,6 @@ void testHMAC(Hash *hash, size_t keyLen)
Serial.println("Failed");
}
-/*
void perfFinalize(Hash *hash)
{
unsigned long start;
@@ -285,7 +284,6 @@ void perfFinalize(Hash *hash)
Serial.print((1000.0 * 1000000.0) / elapsed);
Serial.println(" ops per second");
}
-*/
void setup()
{
@@ -314,7 +312,7 @@ void setup()
Serial.println("Performance Tests:");
perfHash(&sha3_256);
- //perfFinalize(&sha3_256);
+ perfFinalize(&sha3_256);
}
void loop()
diff --git a/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino b/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino
index 3e77ff0d..474e0c36 100644
--- a/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino
+++ b/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino
@@ -263,7 +263,6 @@ void testHMAC(Hash *hash, size_t keyLen)
Serial.println("Failed");
}
-/*
void perfFinalize(Hash *hash)
{
unsigned long start;
@@ -285,7 +284,6 @@ void perfFinalize(Hash *hash)
Serial.print((1000.0 * 1000000.0) / elapsed);
Serial.println(" ops per second");
}
-*/
void setup()
{
@@ -314,7 +312,7 @@ void setup()
Serial.println("Performance Tests:");
perfHash(&sha3_512);
- //perfFinalize(&sha3_512);
+ perfFinalize(&sha3_512);
}
void loop()