diff --git a/doc/crypto.dox b/doc/crypto.dox
index b445d728..0b7ee49c 100644
--- a/doc/crypto.dox
+++ b/doc/crypto.dox
@@ -97,8 +97,8 @@ Ardunino Mega 2560 running at 16 MHz are similar:
 SHA121.90us1423.28us 95
 SHA25643.85us2841.04us 107
 SHA512122.82us15953.42us 211
-SHA3_256121.69us16486.33us 405
-SHA3_512229.12us16502.34us 405
+SHA3_25661.78us8328.70us 405
+SHA3_512115.94us8344.80us 405
 BLAKE2s18.54us1200.06us 171
 BLAKE2b50.70us6515.87us 339
diff --git a/gen/genkeccak.c b/gen/genkeccak.c
new file mode 100644
index 00000000..5bf573a1
--- /dev/null
+++ b/gen/genkeccak.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+// Special-purpose compiler that generates the AVR version of KeccakCore.
+
+#include <stdio.h>
+#include <stdarg.h>
+
+// 1 to inline rotates, 0 to call to helper functions.
+// 0 gives a smaller code size at a slight performance cost.
+static int inline_rotates = 1;
+
+static int indent = 4;      // Current indentation of the output, in spaces.
+
+static int t_reg = 8;       // Temporary 64-bit value (any reg).
+
+static int x_reg = 26;      // Low register of the X pointer pair.
+static int y_reg = 28;      // Low register of the Y pointer pair.
+static int z_reg = 30;      // Low register of the Z pointer pair.
+
+static int const_reg = 21;  // For temporary constants (must be a high reg).
+
+static int loop1_reg = 20;  // For keeping track of loop counters (high regs).
+static int loop2_reg = 19;
+static int loop3_reg = 18;
+
+static int save1_reg = 17;  // Save registers (any reg).
+static int save2_reg = 16;
+
+// Indent the code and print a string.
+void indent_printf(const char *format, ...)
+{
+    va_list va;
+    int posn;
+    va_start(va, format);
+    for (posn = 0; posn < indent; ++posn)
+        putc(' ', stdout);
+    vfprintf(stdout, format, va);
+    va_end(va);
+}
+
+// Print an assembler instruction within quotes.
+// Output form: indentation, then "formatted text\n" in quotes, then a newline.
+void insn_printf(const char *format, ...)
+{
+    va_list va;
+    int posn;
+    va_start(va, format);
+    for (posn = 0; posn < indent; ++posn)
+        putc(' ', stdout);
+    putc('"', stdout);
+    vfprintf(stdout, format, va);
+    putc('\\', stdout);
+    putc('n', stdout);
+    putc('"', stdout);
+    putc('\n', stdout);
+    va_end(va);
+}
+
+// Emit a left rotate by 1 bit of the 64-bit value in r[reg]..r[reg+7], where
+// r[reg] is the least significant byte. The bit shifted out of the top byte
+// is left in the carry flag and the final "adc" folds it back into bit 0.
+void leftRotate1(int reg)
+{
+    insn_printf("lsl r%d", reg);
+    insn_printf("rol r%d", reg + 1);
+    insn_printf("rol r%d", reg + 2);
+    insn_printf("rol r%d", reg + 3);
+    insn_printf("rol r%d", reg + 4);
+    insn_printf("rol r%d", reg + 5);
+    insn_printf("rol r%d", reg + 6);
+    insn_printf("rol r%d", reg + 7);
+    insn_printf("adc r%d, __zero_reg__", reg);
+}
+
+// Emit a right rotate by 1 bit of the 64-bit value in r[reg]..r[reg+7].
+// Bit 0 of the low byte is saved in the T flag up front and copied into
+// bit 7 of the top byte after the "ror" chain.
+void rightRotate1(int reg)
+{
+    insn_printf("bst r%d,0", reg);
+    insn_printf("ror r%d", reg + 7);
+    insn_printf("ror r%d", reg + 6);
+    insn_printf("ror r%d", reg + 5);
+    insn_printf("ror r%d", reg + 4);
+    insn_printf("ror r%d", reg + 3);
+    insn_printf("ror r%d", reg + 2);
+    insn_printf("ror r%d", reg + 1);
+    insn_printf("ror r%d", reg);
+    insn_printf("bld r%d,7", reg + 7);
+}
+
+// Emit code that adds the constant "delta" to the 16-bit pointer held in
+// r[reg]:r[reg+1]. Magnitudes below 64 use adiw/sbiw; larger ones use a
+// byte-wise add/subtract with carry; a delta of zero emits nothing.
+// The large positive case goes through const_reg because AVR has no
+// add-immediate instruction.
+void adjust_pointer_reg(int reg, int delta)
+{
+    if (delta >= 64) {
+        insn_printf("ldi r%d,%d", const_reg, delta & 0xFF);
+        insn_printf("add r%d,r%d", reg, const_reg);
+        if ((delta >> 8) != 0) {
+            insn_printf("ldi r%d,%d", const_reg, (delta >> 8) & 0xFF);
+            insn_printf("adc r%d,r%d", reg + 1, const_reg);
+        } else {
+            insn_printf("adc r%d,__zero_reg__", reg + 1);
+        }
+    } else if (delta > 0) {
+        insn_printf("adiw r%d,%d", reg, delta);
+    } else if (delta <= -64) {
+        delta = -delta;
+        insn_printf("subi r%d,%d", reg, delta & 0xFF);
+        if ((delta >> 8) != 0) {
+            insn_printf("sbci r%d,%d", reg + 1, (delta >> 8) & 0xFF);
+        } else {
+            insn_printf("sbc r%d,__zero_reg__", reg + 1);
+        }
+    } else if (delta < 0) {
+        insn_printf("sbiw r%d,%d", reg, -delta);
+    }
+}
+
+// Emit code to load the 64-bit value at Z+offset into r[reg]..r[reg+7].
+// "offset + 7" must stay within the ldd displacement range of 0..63.
+void load64_from_z(int reg, int offset)
+{
+    if (offset == 0)
+        insn_printf("ld r%d,Z", reg);
+    else
+        insn_printf("ldd r%d,Z+%d", reg, offset);
+    insn_printf("ldd r%d,Z+%d", reg + 1, offset + 1);
+    insn_printf("ldd r%d,Z+%d", reg + 2, offset + 2);
+    insn_printf("ldd r%d,Z+%d", reg + 3, offset + 3);
+    insn_printf("ldd r%d,Z+%d", reg + 4, offset + 4);
+    insn_printf("ldd r%d,Z+%d", reg + 5, offset + 5);
+    insn_printf("ldd r%d,Z+%d", reg + 6, offset + 6);
+    insn_printf("ldd r%d,Z+%d", reg + 7, offset + 7);
+}
+
+// Same as load64_from_z(), but addressing relative to Y.
+void load64_from_y(int reg, int offset)
+{
+    if (offset == 0)
+        insn_printf("ld r%d,Y", reg);
+    else
+        insn_printf("ldd r%d,Y+%d", reg, offset);
+    insn_printf("ldd r%d,Y+%d", reg + 1, offset + 1);
+    insn_printf("ldd r%d,Y+%d", reg + 2, offset + 2);
+    insn_printf("ldd r%d,Y+%d", reg + 3, offset + 3);
+    insn_printf("ldd r%d,Y+%d", reg + 4, offset + 4);
+    insn_printf("ldd r%d,Y+%d", reg + 5, offset + 5);
+    insn_printf("ldd r%d,Y+%d", reg + 6, offset + 6);
+    insn_printf("ldd r%d,Y+%d", reg + 7, offset + 7);
+}
+
+// Emit code that applies the two-operand instruction "op" (e.g. "eor") to
+// r[reg]..r[reg+7] with the 64-bit value at Z+offset as the second operand.
+void load64_from_z_combine(const char *op, int reg, int offset)
+{
+    int posn;
+    for (posn = 0; posn < 8; ++posn, ++offset, ++reg) {
+        if (offset == 0)
+            insn_printf("ld __tmp_reg__,Z");
+        else
+            insn_printf("ldd __tmp_reg__,Z+%d", offset);
+        insn_printf("%s r%d,__tmp_reg__", op, reg);
+    }
+}
+
+// Same as load64_from_z_combine(), but addressing relative to Y.
+void load64_from_y_combine(const char *op, int reg, int offset)
+{
+    int posn;
+    for (posn = 0; posn < 8; ++posn, ++offset, ++reg) {
+        if (offset == 0)
+            insn_printf("ld __tmp_reg__,Y");
+        else
+            insn_printf("ldd __tmp_reg__,Y+%d", offset);
+        insn_printf("%s r%d,__tmp_reg__", op, reg);
+    }
+}
+
+// Emit code that applies "op" to the 64-bit value at Z with r[reg]..r[reg+7]
+// as the second operand, writing the result back to memory.
+void combine64_with_z(const char *op, int reg)
+{
+    int posn;
+    for (posn = 0; posn < 8; ++posn, ++reg) {
+        if (posn == 0)
+            insn_printf("ld __tmp_reg__,Z");
+        else
+            insn_printf("ldd __tmp_reg__,Z+%d", posn);
+        insn_printf("%s __tmp_reg__,r%d", op, reg);
+        if (posn == 0)
+            insn_printf("st Z,__tmp_reg__");
+        else
+            insn_printf("std Z+%d,__tmp_reg__", posn);
+    }
+}
+
+// Emit code to load 64 bits from X into r[reg]..r[reg+7]; X advances by 8.
+void load64_from_x(int reg)
+{
+    insn_printf("ld r%d,X+", reg);
+    insn_printf("ld r%d,X+", reg + 1);
+    insn_printf("ld r%d,X+", reg + 2);
+    insn_printf("ld r%d,X+", reg + 3);
+    insn_printf("ld r%d,X+", reg + 4);
+    insn_printf("ld r%d,X+", reg + 5);
+    insn_printf("ld r%d,X+", reg + 6);
+    insn_printf("ld r%d,X+", reg + 7);
+}
+
+// Emit code to store r[reg]..r[reg+7] to X; X advances by 8.
+void store64_to_x(int reg)
+{
+    insn_printf("st X+,r%d", reg);
+    insn_printf("st X+,r%d", reg + 1);
+    insn_printf("st X+,r%d", reg + 2);
+    insn_printf("st X+,r%d", reg + 3);
+    insn_printf("st X+,r%d", reg + 4);
+    insn_printf("st X+,r%d", reg + 5);
+    insn_printf("st X+,r%d", reg + 6);
+    insn_printf("st X+,r%d", reg + 7);
+}
+
+// Emit the theta step mapping. The emitted code expects Z to point at
+// state.A and X (and Y, as a copy of X) to point at state.B, and it leaves
+// X and Z at those same addresses when it finishes.
+void theta(void)
+{
+    int index;
+
+    printf("\n");
+    indent_printf("// Step mapping theta. Compute C.\n");
+    insn_printf("ldi r%d,5", loop1_reg);
+    insn_printf("100:");
+
+    // Load state.A[0][index] into t_reg.
+    load64_from_z(t_reg, 0);
+
+    // XOR with state.A[1][index] .. state.A[4][index]
+    insn_printf("ldi r%d,4", loop2_reg);
+    insn_printf("101:");
+    adjust_pointer_reg(z_reg, 40);
+    load64_from_z_combine("eor", t_reg, 0);
+    insn_printf("dec r%d", loop2_reg);
+    insn_printf("brne 101b");
+
+    // Store into state.B[0][index].
+    store64_to_x(t_reg);
+
+    // End of the outer loop.
+    adjust_pointer_reg(z_reg, -(160 - 8));
+    insn_printf("dec r%d", loop1_reg);
+    insn_printf("brne 100b");
+    adjust_pointer_reg(z_reg, -40);
+
+    // Generate the D values into the second row of B. To make this
+    // easier, we know that the original X value is also in Y so we
+    // can use offsets relative to Y for the first row of B.
+    printf("\n");
+    indent_printf("// Step mapping theta. Compute D.\n");
+    for (index = 0; index < 5; ++index) {
+        load64_from_y(t_reg, ((index + 1) % 5) * 8);
+        leftRotate1(t_reg);
+        load64_from_y_combine("eor", t_reg, ((index + 4) % 5) * 8);
+        store64_to_x(t_reg);
+    }
+    adjust_pointer_reg(x_reg, -40);
+
+    // XOR every D[index] with every A[x][index] element.
+    printf("\n");
+    indent_printf("// Step mapping theta. XOR D with A.\n");
+    insn_printf("ldi r%d,5", loop1_reg);
+    insn_printf("102:");
+    load64_from_x(t_reg);
+    insn_printf("ldi r%d,5", loop2_reg);
+    insn_printf("103:");
+    combine64_with_z("eor", t_reg);
+    adjust_pointer_reg(z_reg, 40);
+    insn_printf("dec r%d", loop2_reg);
+    insn_printf("brne 103b");
+    adjust_pointer_reg(z_reg, -(200 - 8));
+    insn_printf("dec r%d", loop1_reg);
+    insn_printf("brne 102b");
+    adjust_pointer_reg(x_reg, -80);
+    adjust_pointer_reg(z_reg, -40);
+}
+
+// Emit the rho and pi step mappings as a single pass: each lane of A is
+// loaded, rotated left by the amount in Bmap and stored to its slot in B.
+// X and Z are returned to their starting values at the end.
+void rho_pi(void)
+{
+    typedef struct {
+        int x, y, rot;
+    } map;
+    static map const Bmap[5][5] = { // indexed by y, x
+        {
+            {0, 0, 0}, // B[0][0]
+            {0, 3, 28}, // B[1][0]
+            {0, 1, 1}, // B[2][0]
+            {0, 4, 27}, // B[3][0]
+            {0, 2, 62} // B[4][0]
+        },
+        {
+            {1, 1, 44}, // B[0][1]
+            {1, 4, 20}, // B[1][1]
+            {1, 2, 6}, // B[2][1]
+            {1, 0, 36}, // B[3][1]
+            {1, 3, 55} // B[4][1]
+        },
+        {
+            {2, 2, 43}, // B[0][2]
+            {2, 0, 3}, // B[1][2]
+            {2, 3, 25}, // B[2][2]
+            {2, 1, 10}, // B[3][2]
+            {2, 4, 39} // B[4][2]
+        },
+        {
+            {3, 3, 21}, // B[0][3]
+            {3, 1, 45}, // B[1][3]
+            {3, 4, 8}, // B[2][3]
+            {3, 2, 15}, // B[3][3]
+            {3, 0, 41} // B[4][3]
+        },
+        {
+            {4, 4, 14}, // B[0][4]
+            {4, 2, 61}, // B[1][4]
+            {4, 0, 18}, // B[2][4]
+            {4, 3, 56}, // B[3][4]
+            {4, 1, 2} // B[4][4]
+        }
+    };
+    int Boffset = 0;
+    int Aoffset = 0;
+    int offset;
+    int Bx, By, Ax, Ay, rot, adjust;
+    printf("\n");
+    indent_printf("// Step mappings rho and pi combined into one step.\n");
+    for (By = 0; By < 5; ++By) {
+        for (Bx = 0; Bx < 5; ++Bx) {
+            // What do we need to load?
+            Ax = Bmap[By][Bx].x;
+            Ay = Bmap[By][Bx].y;
+            rot = Bmap[By][Bx].rot;
+
+            // Heading for this step.
+            printf("\n");
+            if (rot != 0) {
+                indent_printf("// state.B[%d][%d] = leftRotate%d_64(state.A[%d][%d])\n",
+                              Bx, By, rot, Ax, Ay);
+            } else {
+                indent_printf("// state.B[%d][%d] = state.A[%d][%d]\n",
+                              Bx, By, Ax, Ay);
+            }
+
+            // Adjust rX and rZ to point at the new locations.
+            offset = (Bx * 5 + By) * 8;
+            adjust_pointer_reg(x_reg, offset - Boffset);
+            Boffset = offset;
+            offset = Ax * 5 * 8;
+            adjust_pointer_reg(z_reg, offset - Aoffset);
+            Aoffset = offset;
+            offset = Ay * 8;
+
+            // Load the A/rZ value into the temp regs.
+            load64_from_z(t_reg, offset);
+
+            // Rotate.
+            // "adjust" is the index of the register that is stored as byte 0
+            // of the result, which performs the whole-byte part of the rotation.
+            // Bit counts of 5..7 are emitted as a right rotate by 8 - n bits
+            // combined with one extra byte of left rotation.
+            adjust = (8 - rot / 8) % 8;
+            switch (rot % 8) {
+            case 0:
+                break;
+            case 1:
+                if (inline_rotates) {
+                    leftRotate1(t_reg);
+                } else {
+                    insn_printf("call 11f");
+                }
+                break;
+            case 2:
+                if (inline_rotates) {
+                    leftRotate1(t_reg);
+                    leftRotate1(t_reg);
+                } else {
+                    insn_printf("call 12f");
+                }
+                break;
+            case 3:
+                if (inline_rotates) {
+                    leftRotate1(t_reg);
+                    leftRotate1(t_reg);
+                    leftRotate1(t_reg);
+                } else {
+                    insn_printf("call 13f");
+                }
+                break;
+            case 4:
+                if (inline_rotates) {
+                    leftRotate1(t_reg);
+                    leftRotate1(t_reg);
+                    leftRotate1(t_reg);
+                    leftRotate1(t_reg);
+                } else {
+                    insn_printf("call 14f");
+                }
+                break;
+            case 5:
+                if (inline_rotates) {
+                    rightRotate1(t_reg);
+                    rightRotate1(t_reg);
+                    rightRotate1(t_reg);
+                } else {
+                    insn_printf("call 23f");
+                }
+                adjust = (adjust + 7) % 8;
+                break;
+            case 6:
+                if (inline_rotates) {
+                    rightRotate1(t_reg);
+                    rightRotate1(t_reg);
+                } else {
+                    insn_printf("call 22f");
+                }
+                adjust = (adjust + 7) % 8;
+                break;
+            case 7:
+                if (inline_rotates) {
+                    rightRotate1(t_reg);
+                } else {
+                    insn_printf("call 21f");
+                }
+                adjust = (adjust + 7) % 8;
+                break;
+            default:
+                break;
+            }
+
+            // Perform byte rotations and store into B/rX.
+            insn_printf("st X+,r%d", t_reg + adjust);
+            insn_printf("st X+,r%d", t_reg + (adjust + 1) % 8);
+            insn_printf("st X+,r%d", t_reg + (adjust + 2) % 8);
+            insn_printf("st X+,r%d", t_reg + (adjust + 3) % 8);
+            insn_printf("st X+,r%d", t_reg + (adjust + 4) % 8);
+            insn_printf("st X+,r%d", t_reg + (adjust + 5) % 8);
+            insn_printf("st X+,r%d", t_reg + (adjust + 6) % 8);
+            insn_printf("st X+,r%d", t_reg + (adjust + 7) % 8);
+            Boffset += 8; // rX has advanced by 8 while doing this.
+        }
+    }
+
+    // Return rX and rZ to their starting values for the next step mapping.
+    adjust_pointer_reg(x_reg, -Boffset);
+    adjust_pointer_reg(z_reg, -Aoffset);
+}
+
+// Emit code to load the byte at Y+offset into r[reg].
+void load8_from_y(int reg, int offset)
+{
+    if (offset == 0)
+        insn_printf("ld r%d,Y", reg);
+    else
+        insn_printf("ldd r%d,Y+%d", reg, offset);
+}
+
+// Emit code to store r[reg] to the byte at Z+offset.
+void store8_to_z(int reg, int offset)
+{
+    if (offset == 0)
+        insn_printf("st Z,r%d", reg);
+    else
+        insn_printf("std Z+%d,r%d", offset, reg);
+}
+
+// Emit the chi step mapping, reading B through Y and writing A through Z.
+// The emitted code leaves Y and Z 200 bytes past their starting values.
+void chi(void)
+{
+    int index;
+
+    // Step mapping chi. A is pointed to by Z and B is pointed to by X/Y.
+    // state.A[index2][index] =
+    // state.B[index2][index] ^
+    // ((~state.B[index2][(index + 1) % 5]) &
+    // state.B[index2][(index + 2) % 5]);
+    // We compute this using an interleaving method. We load five bytes
+    // from the 5 words in a row of B and then compute the 5 output bytes
+    // from that and store. Then we move onto the next 5 bytes of each row.
+    int in1 = t_reg;
+    int in2 = t_reg + 1;
+    int in3 = t_reg + 2;
+    int in4 = t_reg + 3;
+    int in5 = t_reg + 4;
+    int out1 = t_reg + 5;
+    int out2 = t_reg + 6;
+    int out3 = t_reg + 7;
+    int out4 = save1_reg;
+    int out5 = save2_reg;
+    printf("\n");
+    indent_printf("// Step mapping chi.\n");
+    insn_printf("ldi r%d,5", loop1_reg);
+    insn_printf("50:");
+    for (index = 0; index < 8; ++index) {
+        load8_from_y(in1, index);
+        load8_from_y(in2, index + 8);
+        load8_from_y(in3, index + 16);
+        load8_from_y(in4, index + 24);
+        load8_from_y(in5, index + 32);
+        insn_printf("mov r%d,r%d", out1, in2);
+        insn_printf("com r%d", out1);
+        insn_printf("and r%d,r%d", out1, in3);
+        insn_printf("eor r%d,r%d", out1, in1);
+        insn_printf("mov r%d,r%d", out2, in3);
+        insn_printf("com r%d", out2);
+        insn_printf("and r%d,r%d", out2, in4);
+        insn_printf("eor r%d,r%d", out2, in2);
+        insn_printf("mov r%d,r%d", out3, in4);
+        insn_printf("com r%d", out3);
+        insn_printf("and r%d,r%d", out3, in5);
+        insn_printf("eor r%d,r%d", out3, in3);
+        insn_printf("mov r%d,r%d", out4, in5);
+        insn_printf("com r%d", out4);
+        insn_printf("and r%d,r%d", out4, in1);
+        insn_printf("eor r%d,r%d", out4, in4);
+        insn_printf("mov r%d,r%d", out5, in1);
+        insn_printf("com r%d", out5);
+        insn_printf("and r%d,r%d", out5, in2);
+        insn_printf("eor r%d,r%d", out5, in5);
+        store8_to_z(out1, index);
+        store8_to_z(out2, index + 8);
+        store8_to_z(out3, index + 16);
+        store8_to_z(out4, index + 24);
+        store8_to_z(out5, index + 32);
+    }
+    adjust_pointer_reg(z_reg, 5 * 8);
+    adjust_pointer_reg(y_reg, 5 * 8);
+    insn_printf("dec r%d", loop1_reg);
+    insn_printf("breq 51f");
+    insn_printf("rjmp 50b");
+    insn_printf("51:");
+
+    // Restore Y and Z. We don't need this yet because chi() is the
+    // last thing we do so there's no point resetting the registers.
+    //adjust_pointer_reg(y_reg, -200);
+    //adjust_pointer_reg(z_reg, -200);
+}
+
+// Emit the body of the assembly statement: theta, rho/pi and chi, followed
+// by the shared rotation subroutines when rotates are not inlined.
+void outer_loop(void)
+{
+    // Save Y and then copy X into Y. This way we can use displacements
+    // relative to the Y register which we cannot do with X.
+    insn_printf("push r%d", y_reg + 1);
+    insn_printf("push r%d", y_reg);
+    insn_printf("mov r%d,r%d", y_reg, x_reg);
+    insn_printf("mov r%d,r%d", y_reg + 1, x_reg + 1);
+
+    // Output the step mappings.
+    theta();
+    rho_pi();
+    chi();
+    // iota();
+
+    // Restore Y.
+    insn_printf("pop r%d", y_reg);
+    insn_printf("pop r%d", y_reg + 1);
+
+    // Dump the rotation utilities.
+    if (!inline_rotates) {
+        insn_printf("rjmp 3f");
+        printf("\n");
+        indent_printf("// Left rotate by 4 bits\n");
+        insn_printf("14:");
+        leftRotate1(t_reg);
+        indent_printf("// Left rotate by 3 bits\n");
+        insn_printf("13:");
+        leftRotate1(t_reg);
+        indent_printf("// Left rotate by 2 bits\n");
+        insn_printf("12:");
+        leftRotate1(t_reg);
+        indent_printf("// Left rotate by 1 bit\n");
+        insn_printf("11:");
+        leftRotate1(t_reg);
+        insn_printf("ret");
+        printf("\n");
+        indent_printf("// Right rotate by 3 bits\n");
+        insn_printf("23:");
+        rightRotate1(t_reg);
+        indent_printf("// Right rotate by 2 bits\n");
+        insn_printf("22:");
+        rightRotate1(t_reg);
+        indent_printf("// Right rotate by 1 bit\n");
+        insn_printf("21:");
+        rightRotate1(t_reg);
+        insn_printf("ret");
+    }
+
+    // End of assembly.
+    printf("\n");
+    indent_printf("// Done\n");
+    if (!inline_rotates) {
+        insn_printf("3:");
+    }
+}
+
+// Write the complete __asm__ statement, with its operand and clobber lists,
+// to standard output.
+// NOTE(review): the emitted code modifies X and leaves Z advanced (chi() does
+// not restore it), yet both are declared as input-only operands. GCC expects
+// input-only operands to be unchanged on exit - confirm before relying on it.
+int main(int argc, char *argv[])
+{
+    indent_printf("__asm__ __volatile__ (\n");
+    indent += 4;
+    outer_loop();
+    indent_printf(": : \"x\"(state.B), \"z\"(state.A)\n");
+    indent_printf(": \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
+                  "\"r%d\", \"r%d\", \"r%d\", \"r%d\",\n",
+                  t_reg, t_reg + 1, t_reg + 2, t_reg + 3,
+                  t_reg + 4, t_reg + 5, t_reg + 6, t_reg + 7);
+    indent_printf(" \"r%d\", \"r%d\", \"r%d\", \"r%d\", "
+                  "\"r%d\", \"r%d\", \"memory\"\n",
+                  save2_reg, save1_reg, loop3_reg, loop2_reg,
+                  loop1_reg, const_reg);
+    indent -= 4;
+    indent_printf(");\n");
+    return 0;
+}
diff --git a/libraries/Crypto/KeccakCore.cpp b/libraries/Crypto/KeccakCore.cpp
index eeaa56ad..86b22bbf 100644
--- a/libraries/Crypto/KeccakCore.cpp
+++ b/libraries/Crypto/KeccakCore.cpp
@@ -287,6 +287,1551 @@ void KeccakCore::setHMACKey(const void *key, size_t len, uint8_t pad, size_t has
  */
 void KeccakCore::keccakp()
 {
+#if defined(__AVR__)
+    // This assembly code was generated by the "genkeccak.c" program.
+    // Do not modify this code directly. Instead modify "genkeccak.c"
+    // and then re-generate the code here.
+    for (uint8_t round = 0; round < 24; ++round) {
+        __asm__ __volatile__ (
+            "push r29\n"
+            "push r28\n"
+            "mov r28,r26\n"
+            "mov r29,r27\n"
+
+            // Step mapping theta. Compute C.
+ "ldi r20,5\n" + "100:\n" + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "ldi r19,4\n" + "101:\n" + "adiw r30,40\n" + "ld __tmp_reg__,Z\n" + "eor r8,__tmp_reg__\n" + "ldd __tmp_reg__,Z+1\n" + "eor r9,__tmp_reg__\n" + "ldd __tmp_reg__,Z+2\n" + "eor r10,__tmp_reg__\n" + "ldd __tmp_reg__,Z+3\n" + "eor r11,__tmp_reg__\n" + "ldd __tmp_reg__,Z+4\n" + "eor r12,__tmp_reg__\n" + "ldd __tmp_reg__,Z+5\n" + "eor r13,__tmp_reg__\n" + "ldd __tmp_reg__,Z+6\n" + "eor r14,__tmp_reg__\n" + "ldd __tmp_reg__,Z+7\n" + "eor r15,__tmp_reg__\n" + "dec r19\n" + "brne 101b\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "subi r30,152\n" + "sbc r31,__zero_reg__\n" + "dec r20\n" + "brne 100b\n" + "sbiw r30,40\n" + + // Step mapping theta. Compute D. + "ldd r8,Y+8\n" + "ldd r9,Y+9\n" + "ldd r10,Y+10\n" + "ldd r11,Y+11\n" + "ldd r12,Y+12\n" + "ldd r13,Y+13\n" + "ldd r14,Y+14\n" + "ldd r15,Y+15\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "ldd __tmp_reg__,Y+32\n" + "eor r8,__tmp_reg__\n" + "ldd __tmp_reg__,Y+33\n" + "eor r9,__tmp_reg__\n" + "ldd __tmp_reg__,Y+34\n" + "eor r10,__tmp_reg__\n" + "ldd __tmp_reg__,Y+35\n" + "eor r11,__tmp_reg__\n" + "ldd __tmp_reg__,Y+36\n" + "eor r12,__tmp_reg__\n" + "ldd __tmp_reg__,Y+37\n" + "eor r13,__tmp_reg__\n" + "ldd __tmp_reg__,Y+38\n" + "eor r14,__tmp_reg__\n" + "ldd __tmp_reg__,Y+39\n" + "eor r15,__tmp_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "ldd r8,Y+16\n" + "ldd r9,Y+17\n" + "ldd r10,Y+18\n" + "ldd r11,Y+19\n" + "ldd r12,Y+20\n" + "ldd r13,Y+21\n" + "ldd r14,Y+22\n" + "ldd r15,Y+23\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol 
r15\n" + "adc r8, __zero_reg__\n" + "ld __tmp_reg__,Y\n" + "eor r8,__tmp_reg__\n" + "ldd __tmp_reg__,Y+1\n" + "eor r9,__tmp_reg__\n" + "ldd __tmp_reg__,Y+2\n" + "eor r10,__tmp_reg__\n" + "ldd __tmp_reg__,Y+3\n" + "eor r11,__tmp_reg__\n" + "ldd __tmp_reg__,Y+4\n" + "eor r12,__tmp_reg__\n" + "ldd __tmp_reg__,Y+5\n" + "eor r13,__tmp_reg__\n" + "ldd __tmp_reg__,Y+6\n" + "eor r14,__tmp_reg__\n" + "ldd __tmp_reg__,Y+7\n" + "eor r15,__tmp_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "ldd r8,Y+24\n" + "ldd r9,Y+25\n" + "ldd r10,Y+26\n" + "ldd r11,Y+27\n" + "ldd r12,Y+28\n" + "ldd r13,Y+29\n" + "ldd r14,Y+30\n" + "ldd r15,Y+31\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "ldd __tmp_reg__,Y+8\n" + "eor r8,__tmp_reg__\n" + "ldd __tmp_reg__,Y+9\n" + "eor r9,__tmp_reg__\n" + "ldd __tmp_reg__,Y+10\n" + "eor r10,__tmp_reg__\n" + "ldd __tmp_reg__,Y+11\n" + "eor r11,__tmp_reg__\n" + "ldd __tmp_reg__,Y+12\n" + "eor r12,__tmp_reg__\n" + "ldd __tmp_reg__,Y+13\n" + "eor r13,__tmp_reg__\n" + "ldd __tmp_reg__,Y+14\n" + "eor r14,__tmp_reg__\n" + "ldd __tmp_reg__,Y+15\n" + "eor r15,__tmp_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "ldd r8,Y+32\n" + "ldd r9,Y+33\n" + "ldd r10,Y+34\n" + "ldd r11,Y+35\n" + "ldd r12,Y+36\n" + "ldd r13,Y+37\n" + "ldd r14,Y+38\n" + "ldd r15,Y+39\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "ldd __tmp_reg__,Y+16\n" + "eor r8,__tmp_reg__\n" + "ldd __tmp_reg__,Y+17\n" + "eor r9,__tmp_reg__\n" + "ldd __tmp_reg__,Y+18\n" + "eor r10,__tmp_reg__\n" + "ldd __tmp_reg__,Y+19\n" + "eor r11,__tmp_reg__\n" + "ldd __tmp_reg__,Y+20\n" + "eor r12,__tmp_reg__\n" + "ldd __tmp_reg__,Y+21\n" + "eor 
r13,__tmp_reg__\n" + "ldd __tmp_reg__,Y+22\n" + "eor r14,__tmp_reg__\n" + "ldd __tmp_reg__,Y+23\n" + "eor r15,__tmp_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "ld r8,Y\n" + "ldd r9,Y+1\n" + "ldd r10,Y+2\n" + "ldd r11,Y+3\n" + "ldd r12,Y+4\n" + "ldd r13,Y+5\n" + "ldd r14,Y+6\n" + "ldd r15,Y+7\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "ldd __tmp_reg__,Y+24\n" + "eor r8,__tmp_reg__\n" + "ldd __tmp_reg__,Y+25\n" + "eor r9,__tmp_reg__\n" + "ldd __tmp_reg__,Y+26\n" + "eor r10,__tmp_reg__\n" + "ldd __tmp_reg__,Y+27\n" + "eor r11,__tmp_reg__\n" + "ldd __tmp_reg__,Y+28\n" + "eor r12,__tmp_reg__\n" + "ldd __tmp_reg__,Y+29\n" + "eor r13,__tmp_reg__\n" + "ldd __tmp_reg__,Y+30\n" + "eor r14,__tmp_reg__\n" + "ldd __tmp_reg__,Y+31\n" + "eor r15,__tmp_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "sbiw r26,40\n" + + // Step mapping theta. XOR D with A. 
+ "ldi r20,5\n" + "102:\n" + "ld r8,X+\n" + "ld r9,X+\n" + "ld r10,X+\n" + "ld r11,X+\n" + "ld r12,X+\n" + "ld r13,X+\n" + "ld r14,X+\n" + "ld r15,X+\n" + "ldi r19,5\n" + "103:\n" + "ld __tmp_reg__,Z\n" + "eor __tmp_reg__,r8\n" + "st Z,__tmp_reg__\n" + "ldd __tmp_reg__,Z+1\n" + "eor __tmp_reg__,r9\n" + "std Z+1,__tmp_reg__\n" + "ldd __tmp_reg__,Z+2\n" + "eor __tmp_reg__,r10\n" + "std Z+2,__tmp_reg__\n" + "ldd __tmp_reg__,Z+3\n" + "eor __tmp_reg__,r11\n" + "std Z+3,__tmp_reg__\n" + "ldd __tmp_reg__,Z+4\n" + "eor __tmp_reg__,r12\n" + "std Z+4,__tmp_reg__\n" + "ldd __tmp_reg__,Z+5\n" + "eor __tmp_reg__,r13\n" + "std Z+5,__tmp_reg__\n" + "ldd __tmp_reg__,Z+6\n" + "eor __tmp_reg__,r14\n" + "std Z+6,__tmp_reg__\n" + "ldd __tmp_reg__,Z+7\n" + "eor __tmp_reg__,r15\n" + "std Z+7,__tmp_reg__\n" + "adiw r30,40\n" + "dec r19\n" + "brne 103b\n" + "subi r30,192\n" + "sbc r31,__zero_reg__\n" + "dec r20\n" + "brne 102b\n" + "subi r26,80\n" + "sbc r27,__zero_reg__\n" + "sbiw r30,40\n" + + // Step mappings rho and pi combined into one step. 
+ + // state.B[0][0] = state.A[0][0] + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + + // state.B[1][0] = leftRotate28_64(state.A[0][3]) + "adiw r26,32\n" + "ldd r8,Z+24\n" + "ldd r9,Z+25\n" + "ldd r10,Z+26\n" + "ldd r11,Z+27\n" + "ldd r12,Z+28\n" + "ldd r13,Z+29\n" + "ldd r14,Z+30\n" + "ldd r15,Z+31\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + + // state.B[2][0] = leftRotate1_64(state.A[0][1]) + "adiw r26,32\n" + "ldd r8,Z+8\n" + "ldd r9,Z+9\n" + "ldd r10,Z+10\n" + "ldd r11,Z+11\n" + "ldd r12,Z+12\n" + "ldd r13,Z+13\n" + "ldd r14,Z+14\n" + "ldd r15,Z+15\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + + // state.B[3][0] = leftRotate27_64(state.A[0][4]) + "adiw r26,32\n" + "ldd r8,Z+32\n" + "ldd r9,Z+33\n" + "ldd r10,Z+34\n" + "ldd r11,Z+35\n" + "ldd r12,Z+36\n" + "ldd r13,Z+37\n" + "ldd r14,Z+38\n" + "ldd r15,Z+39\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + 
"rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + + // state.B[4][0] = leftRotate62_64(state.A[0][2]) + "adiw r26,32\n" + "ldd r8,Z+16\n" + "ldd r9,Z+17\n" + "ldd r10,Z+18\n" + "ldd r11,Z+19\n" + "ldd r12,Z+20\n" + "ldd r13,Z+21\n" + "ldd r14,Z+22\n" + "ldd r15,Z+23\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + + // state.B[0][1] = leftRotate44_64(state.A[1][1]) + "subi r26,160\n" + "sbc r27,__zero_reg__\n" + "adiw r30,40\n" + "ldd r8,Z+8\n" + "ldd r9,Z+9\n" + "ldd r10,Z+10\n" + "ldd r11,Z+11\n" + "ldd r12,Z+12\n" + "ldd r13,Z+13\n" + "ldd r14,Z+14\n" + "ldd r15,Z+15\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st 
X+,r10\n" + + // state.B[1][1] = leftRotate20_64(state.A[1][4]) + "adiw r26,32\n" + "ldd r8,Z+32\n" + "ldd r9,Z+33\n" + "ldd r10,Z+34\n" + "ldd r11,Z+35\n" + "ldd r12,Z+36\n" + "ldd r13,Z+37\n" + "ldd r14,Z+38\n" + "ldd r15,Z+39\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + + // state.B[2][1] = leftRotate6_64(state.A[1][2]) + "adiw r26,32\n" + "ldd r8,Z+16\n" + "ldd r9,Z+17\n" + "ldd r10,Z+18\n" + "ldd r11,Z+19\n" + "ldd r12,Z+20\n" + "ldd r13,Z+21\n" + "ldd r14,Z+22\n" + "ldd r15,Z+23\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + + // state.B[3][1] = leftRotate36_64(state.A[1][0]) + "adiw r26,32\n" + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc 
r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + + // state.B[4][1] = leftRotate55_64(state.A[1][3]) + "adiw r26,32\n" + "ldd r8,Z+24\n" + "ldd r9,Z+25\n" + "ldd r10,Z+26\n" + "ldd r11,Z+27\n" + "ldd r12,Z+28\n" + "ldd r13,Z+29\n" + "ldd r14,Z+30\n" + "ldd r15,Z+31\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + + // state.B[0][2] = leftRotate43_64(state.A[2][2]) + "subi r26,160\n" + "sbc r27,__zero_reg__\n" + "adiw r30,40\n" + "ldd r8,Z+16\n" + "ldd r9,Z+17\n" + "ldd r10,Z+18\n" + "ldd r11,Z+19\n" + "ldd r12,Z+20\n" + "ldd r13,Z+21\n" + "ldd r14,Z+22\n" + "ldd r15,Z+23\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + + // state.B[1][2] = leftRotate3_64(state.A[2][0]) + "adiw r26,32\n" + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol 
r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + + // state.B[2][2] = leftRotate25_64(state.A[2][3]) + "adiw r26,32\n" + "ldd r8,Z+24\n" + "ldd r9,Z+25\n" + "ldd r10,Z+26\n" + "ldd r11,Z+27\n" + "ldd r12,Z+28\n" + "ldd r13,Z+29\n" + "ldd r14,Z+30\n" + "ldd r15,Z+31\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + + // state.B[3][2] = leftRotate10_64(state.A[2][1]) + "adiw r26,32\n" + "ldd r8,Z+8\n" + "ldd r9,Z+9\n" + "ldd r10,Z+10\n" + "ldd r11,Z+11\n" + "ldd r12,Z+12\n" + "ldd r13,Z+13\n" + "ldd r14,Z+14\n" + "ldd r15,Z+15\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + + // state.B[4][2] = leftRotate39_64(state.A[2][4]) + "adiw r26,32\n" + "ldd r8,Z+32\n" + "ldd r9,Z+33\n" + "ldd r10,Z+34\n" + "ldd r11,Z+35\n" + "ldd r12,Z+36\n" + "ldd r13,Z+37\n" + "ldd r14,Z+38\n" + "ldd r15,Z+39\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st 
X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + + // state.B[0][3] = leftRotate21_64(state.A[3][3]) + "subi r26,160\n" + "sbc r27,__zero_reg__\n" + "adiw r30,40\n" + "ldd r8,Z+24\n" + "ldd r9,Z+25\n" + "ldd r10,Z+26\n" + "ldd r11,Z+27\n" + "ldd r12,Z+28\n" + "ldd r13,Z+29\n" + "ldd r14,Z+30\n" + "ldd r15,Z+31\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + + // state.B[1][3] = leftRotate45_64(state.A[3][1]) + "adiw r26,32\n" + "ldd r8,Z+8\n" + "ldd r9,Z+9\n" + "ldd r10,Z+10\n" + "ldd r11,Z+11\n" + "ldd r12,Z+12\n" + "ldd r13,Z+13\n" + "ldd r14,Z+14\n" + "ldd r15,Z+15\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + + // state.B[2][3] = leftRotate8_64(state.A[3][4]) + "adiw r26,32\n" + "ldd r8,Z+32\n" + "ldd r9,Z+33\n" + "ldd r10,Z+34\n" + "ldd r11,Z+35\n" + "ldd r12,Z+36\n" + "ldd r13,Z+37\n" + "ldd r14,Z+38\n" + "ldd r15,Z+39\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + + // 
state.B[3][3] = leftRotate15_64(state.A[3][2]) + "adiw r26,32\n" + "ldd r8,Z+16\n" + "ldd r9,Z+17\n" + "ldd r10,Z+18\n" + "ldd r11,Z+19\n" + "ldd r12,Z+20\n" + "ldd r13,Z+21\n" + "ldd r14,Z+22\n" + "ldd r15,Z+23\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + + // state.B[4][3] = leftRotate41_64(state.A[3][0]) + "adiw r26,32\n" + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + + // state.B[0][4] = leftRotate14_64(state.A[4][4]) + "subi r26,160\n" + "sbc r27,__zero_reg__\n" + "adiw r30,40\n" + "ldd r8,Z+32\n" + "ldd r9,Z+33\n" + "ldd r10,Z+34\n" + "ldd r11,Z+35\n" + "ldd r12,Z+36\n" + "ldd r13,Z+37\n" + "ldd r14,Z+38\n" + "ldd r15,Z+39\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + + // state.B[1][4] = leftRotate61_64(state.A[4][2]) + "adiw r26,32\n" + "ldd r8,Z+16\n" + "ldd r9,Z+17\n" + "ldd r10,Z+18\n" + "ldd r11,Z+19\n" + "ldd r12,Z+20\n" + "ldd r13,Z+21\n" + "ldd r14,Z+22\n" + "ldd r15,Z+23\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + 
"bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "bst r8,0\n" + "ror r15\n" + "ror r14\n" + "ror r13\n" + "ror r12\n" + "ror r11\n" + "ror r10\n" + "ror r9\n" + "ror r8\n" + "bld r15,7\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + + // state.B[2][4] = leftRotate18_64(state.A[4][0]) + "adiw r26,32\n" + "ld r8,Z\n" + "ldd r9,Z+1\n" + "ldd r10,Z+2\n" + "ldd r11,Z+3\n" + "ldd r12,Z+4\n" + "ldd r13,Z+5\n" + "ldd r14,Z+6\n" + "ldd r15,Z+7\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + + // state.B[3][4] = leftRotate56_64(state.A[4][3]) + "adiw r26,32\n" + "ldd r8,Z+24\n" + "ldd r9,Z+25\n" + "ldd r10,Z+26\n" + "ldd r11,Z+27\n" + "ldd r12,Z+28\n" + "ldd r13,Z+29\n" + "ldd r14,Z+30\n" + "ldd r15,Z+31\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "st X+,r8\n" + + // state.B[4][4] = leftRotate2_64(state.A[4][1]) + "adiw r26,32\n" + "ldd r8,Z+8\n" + "ldd r9,Z+9\n" + "ldd r10,Z+10\n" + "ldd r11,Z+11\n" + "ldd r12,Z+12\n" + "ldd r13,Z+13\n" + "ldd r14,Z+14\n" + "ldd r15,Z+15\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "lsl r8\n" + "rol r9\n" + "rol r10\n" + "rol r11\n" + "rol r12\n" + "rol r13\n" + "rol r14\n" + "rol r15\n" + "adc r8, __zero_reg__\n" + "st X+,r8\n" + "st X+,r9\n" + "st X+,r10\n" + "st X+,r11\n" + "st X+,r12\n" + "st X+,r13\n" + "st X+,r14\n" + "st X+,r15\n" + "subi 
r26,200\n" + "sbc r27,__zero_reg__\n" + "subi r30,160\n" + "sbc r31,__zero_reg__\n" + + // Step mapping chi. + "ldi r20,5\n" + "50:\n" + "ld r8,Y\n" + "ldd r9,Y+8\n" + "ldd r10,Y+16\n" + "ldd r11,Y+24\n" + "ldd r12,Y+32\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "st Z,r13\n" + "std Z+8,r14\n" + "std Z+16,r15\n" + "std Z+24,r17\n" + "std Z+32,r16\n" + "ldd r8,Y+1\n" + "ldd r9,Y+9\n" + "ldd r10,Y+17\n" + "ldd r11,Y+25\n" + "ldd r12,Y+33\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+1,r13\n" + "std Z+9,r14\n" + "std Z+17,r15\n" + "std Z+25,r17\n" + "std Z+33,r16\n" + "ldd r8,Y+2\n" + "ldd r9,Y+10\n" + "ldd r10,Y+18\n" + "ldd r11,Y+26\n" + "ldd r12,Y+34\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+2,r13\n" + "std Z+10,r14\n" + "std Z+18,r15\n" + "std Z+26,r17\n" + "std Z+34,r16\n" + "ldd r8,Y+3\n" + "ldd r9,Y+11\n" + "ldd r10,Y+19\n" + "ldd r11,Y+27\n" + "ldd r12,Y+35\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + 
"com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+3,r13\n" + "std Z+11,r14\n" + "std Z+19,r15\n" + "std Z+27,r17\n" + "std Z+35,r16\n" + "ldd r8,Y+4\n" + "ldd r9,Y+12\n" + "ldd r10,Y+20\n" + "ldd r11,Y+28\n" + "ldd r12,Y+36\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+4,r13\n" + "std Z+12,r14\n" + "std Z+20,r15\n" + "std Z+28,r17\n" + "std Z+36,r16\n" + "ldd r8,Y+5\n" + "ldd r9,Y+13\n" + "ldd r10,Y+21\n" + "ldd r11,Y+29\n" + "ldd r12,Y+37\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+5,r13\n" + "std Z+13,r14\n" + "std Z+21,r15\n" + "std Z+29,r17\n" + "std Z+37,r16\n" + "ldd r8,Y+6\n" + "ldd r9,Y+14\n" + "ldd r10,Y+22\n" + "ldd r11,Y+30\n" + "ldd r12,Y+38\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+6,r13\n" + "std Z+14,r14\n" + "std Z+22,r15\n" + "std Z+30,r17\n" + "std Z+38,r16\n" + "ldd r8,Y+7\n" + "ldd r9,Y+15\n" + "ldd r10,Y+23\n" + "ldd r11,Y+31\n" + "ldd r12,Y+39\n" + "mov r13,r9\n" + "com r13\n" + "and r13,r10\n" + "eor r13,r8\n" + "mov r14,r10\n" + "com r14\n" + "and r14,r11\n" + "eor r14,r9\n" + "mov 
r15,r11\n" + "com r15\n" + "and r15,r12\n" + "eor r15,r10\n" + "mov r17,r12\n" + "com r17\n" + "and r17,r8\n" + "eor r17,r11\n" + "mov r16,r8\n" + "com r16\n" + "and r16,r9\n" + "eor r16,r12\n" + "std Z+7,r13\n" + "std Z+15,r14\n" + "std Z+23,r15\n" + "std Z+31,r17\n" + "std Z+39,r16\n" + "adiw r30,40\n" + "adiw r28,40\n" + "dec r20\n" + "breq 51f\n" + "rjmp 50b\n" + "51:\n" + "pop r28\n" + "pop r29\n" + + // Done + : : "x"(state.B), "z"(state.A) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", "r20", "r21", "memory" + ); +#else static const uint8_t addMod5Table[9] PROGMEM = { 0, 1, 2, 3, 4, 0, 1, 2, 3 }; @@ -346,6 +1891,7 @@ void KeccakCore::keccakp() state.B[index2][addMod5(index, 2)]); } } +#endif // Step mapping iota. XOR A[0][0] with the round constant. static uint64_t const RC[24] PROGMEM = { diff --git a/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino b/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino index 71d17c4b..51864638 100644 --- a/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino +++ b/libraries/Crypto/examples/TestSHA3_256/TestSHA3_256.ino @@ -261,7 +261,6 @@ void testHMAC(Hash *hash, size_t keyLen) Serial.println("Failed"); } -/* void perfFinalize(Hash *hash) { unsigned long start; @@ -285,7 +284,6 @@ void perfFinalize(Hash *hash) Serial.print((1000.0 * 1000000.0) / elapsed); Serial.println(" ops per second"); } -*/ void setup() { @@ -314,7 +312,7 @@ void setup() Serial.println("Performance Tests:"); perfHash(&sha3_256); - //perfFinalize(&sha3_256); + perfFinalize(&sha3_256); } void loop() diff --git a/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino b/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino index 3e77ff0d..474e0c36 100644 --- a/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino +++ b/libraries/Crypto/examples/TestSHA3_512/TestSHA3_512.ino @@ -263,7 +263,6 @@ void testHMAC(Hash *hash, size_t keyLen) Serial.println("Failed"); } -/* void perfFinalize(Hash *hash) { 
unsigned long start; @@ -285,7 +284,6 @@ void perfFinalize(Hash *hash) Serial.print((1000.0 * 1000000.0) / elapsed); Serial.println(" ops per second"); } -*/ void setup() { @@ -314,7 +312,7 @@ void setup() Serial.println("Performance Tests:"); perfHash(&sha3_512); - //perfFinalize(&sha3_512); + perfFinalize(&sha3_512); } void loop()