1
0
mirror of https://github.com/taigrr/arduinolibs synced 2025-01-18 04:33:12 -08:00
arduinolibs/libraries/Crypto/KeccakCore.cpp

1896 lines
47 KiB
C++

/*
* Copyright (C) 2015 Southern Storm Software, Pty Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "KeccakCore.h"
#include "Crypto.h"
#include "utility/EndianUtil.h"
#include "utility/RotateUtil.h"
#include "utility/ProgMemUtil.h"
#include <string.h>
/**
* \class KeccakCore KeccakCore.h <KeccakCore.h>
* \brief Keccak core sponge function.
*
* KeccakCore provides the core sponge function for different capacities.
* It is used to implement Hash algorithms such as SHA3.
*
* References: http://en.wikipedia.org/wiki/SHA-3
*
* \sa SHA3
*/
#if !defined(CRYPTO_LITTLE_ENDIAN)
// All of the Arduino platforms we care about are little-endian.
#error "KeccakCore is not supported on big-endian platforms yet - todo"
#endif
/**
* \brief Constructs a new Keccak sponge function.
*
* The capacity() will initially be set to 1536, which normally won't be
* of much use to the caller. The constructor should be followed by a
* call to setCapacity() to select the capacity of interest.
*/
KeccakCore::KeccakCore()
    : _blockSize(8)
{
    // A block size of 8 bytes makes capacity() report 1600 - 8 * 8 = 1536.
    // The remaining members are brought to the same pristine values that
    // reset() establishes: a zeroed A array and no buffered input or output.
    reset();
}
/**
* \brief Destroys this Keccak sponge function after clearing all
* sensitive information.
*/
KeccakCore::~KeccakCore()
{
    // clear() wipes the state structure, so no absorbed data lingers
    // in memory once the object is gone.
    clear();
}
/**
* \brief Returns the capacity of the sponge function in bits.
*
* \sa setCapacity(), blockSize()
*/
size_t KeccakCore::capacity() const
{
    // The block size is held in bytes; convert it to bits and subtract
    // it from the 1600-bit width of the state.
    const size_t rateBits = ((size_t)_blockSize) * 8;
    return 1600 - rateBits;
}
/**
* \brief Sets the capacity of the Keccak sponge function in bits.
*
* \param capacity The capacity of the Keccak sponge function in bits which
* should be a multiple of 64 and between 64 and 1536.
*
* \note It is possible to configure a sponge function with this method that
* doesn't strictly conform with the capacity and hash size constraints
* defined in the relevant standards. It is the responsibility of callers
* to only use standard parameter combinations.
*
* \sa capacity(), blockSize()
*/
void KeccakCore::setCapacity(size_t capacity)
{
    // Whatever is left of the 1600-bit state after the capacity has been
    // set aside becomes the block size, which is stored in bytes.
    // No range or alignment check is performed on "capacity" here.
    const size_t rateBits = 1600 - capacity;
    _blockSize = rateBits / 8;

    // Changing the geometry invalidates anything absorbed so far.
    reset();
}
/**
* \fn size_t KeccakCore::blockSize() const
* \brief Returns the input block size for the sponge function in bytes.
*
* The block size is (1600 - capacity()) / 8.
*
* \sa capacity()
*/
/**
* \brief Resets the Keccak sponge function ready for a new session.
*
* \sa update(), extract()
*/
void KeccakCore::reset()
{
    // Forget any partially absorbed input and partially extracted output.
    state.outputSize = 0;
    state.inputSize = 0;

    // Return the A array to all-zeroes.
    memset(state.A, 0, sizeof(state.A));
}
/**
* \brief Updates the Keccak sponge function with more input data.
*
* \param data The extra input data to incorporate.
* \param size The size of the new data to incorporate.
*
* This function will invoke the sponge function whenever a full blockSize()
* bytes of input data have been accumulated. Call pad() after the last
* block to finalize the input before calling extract().
*
* \sa pad(), extract(), reset()
*/
void KeccakCore::update(const void *data, size_t size)
{
    // Absorbing new input restarts output generation from offset zero.
    state.outputSize = 0;

    // Consume the input in pieces that never exceed the space still
    // free in the current block.
    const uint8_t *src = static_cast<const uint8_t *>(data);
    while (size > 0) {
        uint8_t chunk = _blockSize - state.inputSize;
        if (chunk > size)
            chunk = size;

        // XOR the piece into the state at the current absorb offset.
        uint8_t *dest = reinterpret_cast<uint8_t *>(state.A) + state.inputSize;
        for (uint8_t left = chunk; left > 0; --left)
            *dest++ ^= *src++;

        state.inputSize += chunk;
        size -= chunk;

        // A completely filled block triggers the permutation.
        if (state.inputSize == _blockSize) {
            keccakp();
            state.inputSize = 0;
        }
    }
}
/**
* \brief Pads the last block of input data to blockSize().
*
* \param tag The tag byte to add to the padding to identify SHA3 (0x06),
* SHAKE (0x1F), or the plain pre-standardized version of Keccak (0x01).
*
* The sponge function will be invoked to process the completed padding block.
*
* \sa update(), extract()
*/
void KeccakCore::pad(uint8_t tag)
{
    // FIPS 202 padding for SHA3-NNN appends the bits "01", then a "1",
    // then zero bits, then a closing "1".  SHAKE starts with "1111" rather
    // than "01".  SHA-3 numbers bits from the least significant end, so
    // for byte-aligned data "01" corresponds to 0x02 and not to 0x40.
    const uint8_t used = state.inputSize;
    uint64_t *lanes = &(state.A[0][0]);

    // Place the tag byte directly after the last absorbed input byte.
    const uint64_t tagBits = ((uint64_t)tag) << ((used % 8) * 8);
    lanes[used / 8] ^= tagBits;

    // Set the top bit of the 64-bit word that holds the block's last byte.
    lanes[(_blockSize - 1) / 8] ^= 0x8000000000000000ULL;

    // Process the padded block and start both counters from zero.
    keccakp();
    state.outputSize = 0;
    state.inputSize = 0;
}
/**
* \brief Extracts data from the Keccak sponge function.
*
* \param data The data buffer to fill with extracted data.
* \param size The number of bytes of extracted data that are required.
*
* If more than blockSize() bytes are required, the sponge function will
* be invoked to generate additional data.
*
* \sa update(), reset(), extractHash()
*/
void KeccakCore::extract(void *data, size_t size)
{
    // Once output is being produced, pending input bookkeeping is dropped.
    state.inputSize = 0;

    uint8_t *out = static_cast<uint8_t *>(data);
    while (size > 0) {
        // The current block is used up: permute to obtain a fresh one.
        if (state.outputSize >= _blockSize) {
            keccakp();
            state.outputSize = 0;
        }

        // Hand out whatever is left in this block, capped at the request.
        uint8_t avail = _blockSize - state.outputSize;
        if (avail > size)
            avail = size;

        const uint8_t *from =
            reinterpret_cast<const uint8_t *>(state.A) + state.outputSize;
        memcpy(out, from, avail);

        out += avail;
        size -= avail;
        state.outputSize += avail;
    }
}
/**
* \brief Clears all sensitive data from this object.
*/
void KeccakCore::clear()
{
    // Hand the entire state structure to the library's clean() helper.
    clean(state);
}
/**
* \brief Sets a HMAC key for a Keccak-based hash algorithm.
*
* \param key Points to the HMAC key for the hashing process.
* \param len Length of the HMAC \a key in bytes.
* \param pad Inner (0x36) or outer (0x5C) padding value to XOR with
* the formatted HMAC key.
* \param hashSize The size of the output from the hash algorithm.
*
* This function is intended to help classes implement Hash::resetHMAC() and
* Hash::finalizeHMAC() by directly formatting the HMAC key into the
* internal block buffer and resetting the hash.
*/
void KeccakCore::setHMACKey(const void *key, size_t len, uint8_t pad, size_t hashSize)
{
    uint8_t *block = (uint8_t *)state.A;
    size_t keyedBytes = blockSize();

    reset();
    if (len > keyedBytes) {
        // The key does not fit into one block, so hash it down first.
        // pad() leaves the first output block in state.A; only its
        // first "hashSize" bytes are kept as the effective key.
        update(key, len);
        this->pad(0x06);

        // The remainder of the block is zero key material XORed with the
        // pad value, i.e. the pad value itself; everything beyond the
        // block goes back to zero.
        memset(block + hashSize, pad, keyedBytes - hashSize);
        memset(block + keyedBytes, 0, sizeof(state.A) - keyedBytes);
        keyedBytes = hashSize;
    } else {
        // reset() has just zeroed state.A, so the key can be copied
        // straight into the block.
        memcpy(block, key, len);
    }

    // Apply the pad value to the bytes that still need it.
    for (size_t offset = 0; offset < keyedBytes; ++offset)
        block[offset] ^= pad;

    // Absorb the formatted key block.
    keccakp();
}
/**
* \brief Transform the state with the KECCAK-p sponge function with b = 1600.
*/
void KeccakCore::keccakp()
{
    // Runs 24 rounds over state.A in place, using state.B as scratch space.
    // Every round performs the theta, rho/pi and chi step mappings (in
    // assembly on AVR, in portable C++ elsewhere) followed by the iota step
    // that both variants share at the bottom of this function.
    //
    // Note that the "for" loop is opened separately inside each branch of
    // the preprocessor conditional and is closed once, by the brace that
    // follows the iota step.
#if defined(__AVR__)
    // This assembly code was generated by the "genkeccak.c" program.
    // Do not modify this code directly. Instead modify "genkeccak.c"
    // and then re-generate the code here.
    //
    // Register usage:
    //  * X (r26:r27) enters holding the address of state.B.  It is moved
    //    around during the round but is back at its entry value on exit.
    //  * Z (r30:r31) enters holding the address of state.A.  The chi loop
    //    advances it by 5 * 40 bytes and never rewinds it, so it is
    //    declared as a read-write operand through the local "lanes".
    //    An input-only operand must not be altered by the asm statement.
    //    The same operand change belongs in "genkeccak.c".
    //  * Y (r28:r29) is saved on the stack, loaded with a copy of X, and
    //    restored at label 51.
    //
    // Idioms used throughout (r8 holds the least significant byte):
    //  * "lsl r8 / rol r9..r15 / adc r8,__zero_reg__" rotates the 64-bit
    //    value in r8..r15 left by one bit.
    //  * "bst r8,0 / ror r15..r8 / bld r15,7" rotates it right by one bit.
    //  * Rotations by whole bytes are performed by storing the registers
    //    in a rotated order.
    for (uint8_t round = 0; round < 24; ++round) {
        void *lanes = state.A;
        __asm__ __volatile__ (
            "push r29\n"
            "push r28\n"
            "mov r28,r26\n"
            "mov r29,r27\n"
            // Step mapping theta. Compute C.
            "ldi r20,5\n"
            "100:\n"
            "ld r8,Z\n"
            "ldd r9,Z+1\n"
            "ldd r10,Z+2\n"
            "ldd r11,Z+3\n"
            "ldd r12,Z+4\n"
            "ldd r13,Z+5\n"
            "ldd r14,Z+6\n"
            "ldd r15,Z+7\n"
            "ldi r19,4\n"
            "101:\n"
            "adiw r30,40\n"
            "ld __tmp_reg__,Z\n"
            "eor r8,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+1\n"
            "eor r9,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+2\n"
            "eor r10,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+3\n"
            "eor r11,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+4\n"
            "eor r12,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+5\n"
            "eor r13,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+6\n"
            "eor r14,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+7\n"
            "eor r15,__tmp_reg__\n"
            "dec r19\n"
            "brne 101b\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "subi r30,152\n"
            "sbc r31,__zero_reg__\n"
            "dec r20\n"
            "brne 100b\n"
            "sbiw r30,40\n"
            // Step mapping theta. Compute D.
            "ldd r8,Y+8\n"
            "ldd r9,Y+9\n"
            "ldd r10,Y+10\n"
            "ldd r11,Y+11\n"
            "ldd r12,Y+12\n"
            "ldd r13,Y+13\n"
            "ldd r14,Y+14\n"
            "ldd r15,Y+15\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "ldd __tmp_reg__,Y+32\n"
            "eor r8,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+33\n"
            "eor r9,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+34\n"
            "eor r10,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+35\n"
            "eor r11,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+36\n"
            "eor r12,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+37\n"
            "eor r13,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+38\n"
            "eor r14,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+39\n"
            "eor r15,__tmp_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "ldd r8,Y+16\n"
            "ldd r9,Y+17\n"
            "ldd r10,Y+18\n"
            "ldd r11,Y+19\n"
            "ldd r12,Y+20\n"
            "ldd r13,Y+21\n"
            "ldd r14,Y+22\n"
            "ldd r15,Y+23\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "ld __tmp_reg__,Y\n"
            "eor r8,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+1\n"
            "eor r9,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+2\n"
            "eor r10,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+3\n"
            "eor r11,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+4\n"
            "eor r12,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+5\n"
            "eor r13,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+6\n"
            "eor r14,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+7\n"
            "eor r15,__tmp_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "ldd r8,Y+24\n"
            "ldd r9,Y+25\n"
            "ldd r10,Y+26\n"
            "ldd r11,Y+27\n"
            "ldd r12,Y+28\n"
            "ldd r13,Y+29\n"
            "ldd r14,Y+30\n"
            "ldd r15,Y+31\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "ldd __tmp_reg__,Y+8\n"
            "eor r8,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+9\n"
            "eor r9,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+10\n"
            "eor r10,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+11\n"
            "eor r11,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+12\n"
            "eor r12,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+13\n"
            "eor r13,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+14\n"
            "eor r14,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+15\n"
            "eor r15,__tmp_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "ldd r8,Y+32\n"
            "ldd r9,Y+33\n"
            "ldd r10,Y+34\n"
            "ldd r11,Y+35\n"
            "ldd r12,Y+36\n"
            "ldd r13,Y+37\n"
            "ldd r14,Y+38\n"
            "ldd r15,Y+39\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "ldd __tmp_reg__,Y+16\n"
            "eor r8,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+17\n"
            "eor r9,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+18\n"
            "eor r10,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+19\n"
            "eor r11,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+20\n"
            "eor r12,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+21\n"
            "eor r13,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+22\n"
            "eor r14,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+23\n"
            "eor r15,__tmp_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "ld r8,Y\n"
            "ldd r9,Y+1\n"
            "ldd r10,Y+2\n"
            "ldd r11,Y+3\n"
            "ldd r12,Y+4\n"
            "ldd r13,Y+5\n"
            "ldd r14,Y+6\n"
            "ldd r15,Y+7\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "ldd __tmp_reg__,Y+24\n"
            "eor r8,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+25\n"
            "eor r9,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+26\n"
            "eor r10,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+27\n"
            "eor r11,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+28\n"
            "eor r12,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+29\n"
            "eor r13,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+30\n"
            "eor r14,__tmp_reg__\n"
            "ldd __tmp_reg__,Y+31\n"
            "eor r15,__tmp_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "sbiw r26,40\n"
            // Step mapping theta. XOR D with A.
            "ldi r20,5\n"
            "102:\n"
            "ld r8,X+\n"
            "ld r9,X+\n"
            "ld r10,X+\n"
            "ld r11,X+\n"
            "ld r12,X+\n"
            "ld r13,X+\n"
            "ld r14,X+\n"
            "ld r15,X+\n"
            "ldi r19,5\n"
            "103:\n"
            "ld __tmp_reg__,Z\n"
            "eor __tmp_reg__,r8\n"
            "st Z,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+1\n"
            "eor __tmp_reg__,r9\n"
            "std Z+1,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+2\n"
            "eor __tmp_reg__,r10\n"
            "std Z+2,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+3\n"
            "eor __tmp_reg__,r11\n"
            "std Z+3,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+4\n"
            "eor __tmp_reg__,r12\n"
            "std Z+4,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+5\n"
            "eor __tmp_reg__,r13\n"
            "std Z+5,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+6\n"
            "eor __tmp_reg__,r14\n"
            "std Z+6,__tmp_reg__\n"
            "ldd __tmp_reg__,Z+7\n"
            "eor __tmp_reg__,r15\n"
            "std Z+7,__tmp_reg__\n"
            "adiw r30,40\n"
            "dec r19\n"
            "brne 103b\n"
            "subi r30,192\n"
            "sbc r31,__zero_reg__\n"
            "dec r20\n"
            "brne 102b\n"
            "subi r26,80\n"
            "sbc r27,__zero_reg__\n"
            "sbiw r30,40\n"
            // Step mappings rho and pi combined into one step.
            // state.B[0][0] = state.A[0][0]
            "ld r8,Z\n"
            "ldd r9,Z+1\n"
            "ldd r10,Z+2\n"
            "ldd r11,Z+3\n"
            "ldd r12,Z+4\n"
            "ldd r13,Z+5\n"
            "ldd r14,Z+6\n"
            "ldd r15,Z+7\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            // state.B[1][0] = leftRotate28_64(state.A[0][3])
            "adiw r26,32\n"
            "ldd r8,Z+24\n"
            "ldd r9,Z+25\n"
            "ldd r10,Z+26\n"
            "ldd r11,Z+27\n"
            "ldd r12,Z+28\n"
            "ldd r13,Z+29\n"
            "ldd r14,Z+30\n"
            "ldd r15,Z+31\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            // state.B[2][0] = leftRotate1_64(state.A[0][1])
            "adiw r26,32\n"
            "ldd r8,Z+8\n"
            "ldd r9,Z+9\n"
            "ldd r10,Z+10\n"
            "ldd r11,Z+11\n"
            "ldd r12,Z+12\n"
            "ldd r13,Z+13\n"
            "ldd r14,Z+14\n"
            "ldd r15,Z+15\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            // state.B[3][0] = leftRotate27_64(state.A[0][4])
            "adiw r26,32\n"
            "ldd r8,Z+32\n"
            "ldd r9,Z+33\n"
            "ldd r10,Z+34\n"
            "ldd r11,Z+35\n"
            "ldd r12,Z+36\n"
            "ldd r13,Z+37\n"
            "ldd r14,Z+38\n"
            "ldd r15,Z+39\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            // state.B[4][0] = leftRotate62_64(state.A[0][2])
            "adiw r26,32\n"
            "ldd r8,Z+16\n"
            "ldd r9,Z+17\n"
            "ldd r10,Z+18\n"
            "ldd r11,Z+19\n"
            "ldd r12,Z+20\n"
            "ldd r13,Z+21\n"
            "ldd r14,Z+22\n"
            "ldd r15,Z+23\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            // state.B[0][1] = leftRotate44_64(state.A[1][1])
            "subi r26,160\n"
            "sbc r27,__zero_reg__\n"
            "adiw r30,40\n"
            "ldd r8,Z+8\n"
            "ldd r9,Z+9\n"
            "ldd r10,Z+10\n"
            "ldd r11,Z+11\n"
            "ldd r12,Z+12\n"
            "ldd r13,Z+13\n"
            "ldd r14,Z+14\n"
            "ldd r15,Z+15\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            // state.B[1][1] = leftRotate20_64(state.A[1][4])
            "adiw r26,32\n"
            "ldd r8,Z+32\n"
            "ldd r9,Z+33\n"
            "ldd r10,Z+34\n"
            "ldd r11,Z+35\n"
            "ldd r12,Z+36\n"
            "ldd r13,Z+37\n"
            "ldd r14,Z+38\n"
            "ldd r15,Z+39\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            // state.B[2][1] = leftRotate6_64(state.A[1][2])
            "adiw r26,32\n"
            "ldd r8,Z+16\n"
            "ldd r9,Z+17\n"
            "ldd r10,Z+18\n"
            "ldd r11,Z+19\n"
            "ldd r12,Z+20\n"
            "ldd r13,Z+21\n"
            "ldd r14,Z+22\n"
            "ldd r15,Z+23\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            // state.B[3][1] = leftRotate36_64(state.A[1][0])
            "adiw r26,32\n"
            "ld r8,Z\n"
            "ldd r9,Z+1\n"
            "ldd r10,Z+2\n"
            "ldd r11,Z+3\n"
            "ldd r12,Z+4\n"
            "ldd r13,Z+5\n"
            "ldd r14,Z+6\n"
            "ldd r15,Z+7\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            // state.B[4][1] = leftRotate55_64(state.A[1][3])
            "adiw r26,32\n"
            "ldd r8,Z+24\n"
            "ldd r9,Z+25\n"
            "ldd r10,Z+26\n"
            "ldd r11,Z+27\n"
            "ldd r12,Z+28\n"
            "ldd r13,Z+29\n"
            "ldd r14,Z+30\n"
            "ldd r15,Z+31\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            // state.B[0][2] = leftRotate43_64(state.A[2][2])
            "subi r26,160\n"
            "sbc r27,__zero_reg__\n"
            "adiw r30,40\n"
            "ldd r8,Z+16\n"
            "ldd r9,Z+17\n"
            "ldd r10,Z+18\n"
            "ldd r11,Z+19\n"
            "ldd r12,Z+20\n"
            "ldd r13,Z+21\n"
            "ldd r14,Z+22\n"
            "ldd r15,Z+23\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            // state.B[1][2] = leftRotate3_64(state.A[2][0])
            "adiw r26,32\n"
            "ld r8,Z\n"
            "ldd r9,Z+1\n"
            "ldd r10,Z+2\n"
            "ldd r11,Z+3\n"
            "ldd r12,Z+4\n"
            "ldd r13,Z+5\n"
            "ldd r14,Z+6\n"
            "ldd r15,Z+7\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            // state.B[2][2] = leftRotate25_64(state.A[2][3])
            "adiw r26,32\n"
            "ldd r8,Z+24\n"
            "ldd r9,Z+25\n"
            "ldd r10,Z+26\n"
            "ldd r11,Z+27\n"
            "ldd r12,Z+28\n"
            "ldd r13,Z+29\n"
            "ldd r14,Z+30\n"
            "ldd r15,Z+31\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            // state.B[3][2] = leftRotate10_64(state.A[2][1])
            "adiw r26,32\n"
            "ldd r8,Z+8\n"
            "ldd r9,Z+9\n"
            "ldd r10,Z+10\n"
            "ldd r11,Z+11\n"
            "ldd r12,Z+12\n"
            "ldd r13,Z+13\n"
            "ldd r14,Z+14\n"
            "ldd r15,Z+15\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            // state.B[4][2] = leftRotate39_64(state.A[2][4])
            "adiw r26,32\n"
            "ldd r8,Z+32\n"
            "ldd r9,Z+33\n"
            "ldd r10,Z+34\n"
            "ldd r11,Z+35\n"
            "ldd r12,Z+36\n"
            "ldd r13,Z+37\n"
            "ldd r14,Z+38\n"
            "ldd r15,Z+39\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            // state.B[0][3] = leftRotate21_64(state.A[3][3])
            "subi r26,160\n"
            "sbc r27,__zero_reg__\n"
            "adiw r30,40\n"
            "ldd r8,Z+24\n"
            "ldd r9,Z+25\n"
            "ldd r10,Z+26\n"
            "ldd r11,Z+27\n"
            "ldd r12,Z+28\n"
            "ldd r13,Z+29\n"
            "ldd r14,Z+30\n"
            "ldd r15,Z+31\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            // state.B[1][3] = leftRotate45_64(state.A[3][1])
            "adiw r26,32\n"
            "ldd r8,Z+8\n"
            "ldd r9,Z+9\n"
            "ldd r10,Z+10\n"
            "ldd r11,Z+11\n"
            "ldd r12,Z+12\n"
            "ldd r13,Z+13\n"
            "ldd r14,Z+14\n"
            "ldd r15,Z+15\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            // state.B[2][3] = leftRotate8_64(state.A[3][4])
            "adiw r26,32\n"
            "ldd r8,Z+32\n"
            "ldd r9,Z+33\n"
            "ldd r10,Z+34\n"
            "ldd r11,Z+35\n"
            "ldd r12,Z+36\n"
            "ldd r13,Z+37\n"
            "ldd r14,Z+38\n"
            "ldd r15,Z+39\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            // state.B[3][3] = leftRotate15_64(state.A[3][2])
            "adiw r26,32\n"
            "ldd r8,Z+16\n"
            "ldd r9,Z+17\n"
            "ldd r10,Z+18\n"
            "ldd r11,Z+19\n"
            "ldd r12,Z+20\n"
            "ldd r13,Z+21\n"
            "ldd r14,Z+22\n"
            "ldd r15,Z+23\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            // state.B[4][3] = leftRotate41_64(state.A[3][0])
            "adiw r26,32\n"
            "ld r8,Z\n"
            "ldd r9,Z+1\n"
            "ldd r10,Z+2\n"
            "ldd r11,Z+3\n"
            "ldd r12,Z+4\n"
            "ldd r13,Z+5\n"
            "ldd r14,Z+6\n"
            "ldd r15,Z+7\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            // state.B[0][4] = leftRotate14_64(state.A[4][4])
            "subi r26,160\n"
            "sbc r27,__zero_reg__\n"
            "adiw r30,40\n"
            "ldd r8,Z+32\n"
            "ldd r9,Z+33\n"
            "ldd r10,Z+34\n"
            "ldd r11,Z+35\n"
            "ldd r12,Z+36\n"
            "ldd r13,Z+37\n"
            "ldd r14,Z+38\n"
            "ldd r15,Z+39\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            // state.B[1][4] = leftRotate61_64(state.A[4][2])
            "adiw r26,32\n"
            "ldd r8,Z+16\n"
            "ldd r9,Z+17\n"
            "ldd r10,Z+18\n"
            "ldd r11,Z+19\n"
            "ldd r12,Z+20\n"
            "ldd r13,Z+21\n"
            "ldd r14,Z+22\n"
            "ldd r15,Z+23\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "bst r8,0\n"
            "ror r15\n"
            "ror r14\n"
            "ror r13\n"
            "ror r12\n"
            "ror r11\n"
            "ror r10\n"
            "ror r9\n"
            "ror r8\n"
            "bld r15,7\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            // state.B[2][4] = leftRotate18_64(state.A[4][0])
            "adiw r26,32\n"
            "ld r8,Z\n"
            "ldd r9,Z+1\n"
            "ldd r10,Z+2\n"
            "ldd r11,Z+3\n"
            "ldd r12,Z+4\n"
            "ldd r13,Z+5\n"
            "ldd r14,Z+6\n"
            "ldd r15,Z+7\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            // state.B[3][4] = leftRotate56_64(state.A[4][3])
            "adiw r26,32\n"
            "ldd r8,Z+24\n"
            "ldd r9,Z+25\n"
            "ldd r10,Z+26\n"
            "ldd r11,Z+27\n"
            "ldd r12,Z+28\n"
            "ldd r13,Z+29\n"
            "ldd r14,Z+30\n"
            "ldd r15,Z+31\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "st X+,r8\n"
            // state.B[4][4] = leftRotate2_64(state.A[4][1])
            "adiw r26,32\n"
            "ldd r8,Z+8\n"
            "ldd r9,Z+9\n"
            "ldd r10,Z+10\n"
            "ldd r11,Z+11\n"
            "ldd r12,Z+12\n"
            "ldd r13,Z+13\n"
            "ldd r14,Z+14\n"
            "ldd r15,Z+15\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "lsl r8\n"
            "rol r9\n"
            "rol r10\n"
            "rol r11\n"
            "rol r12\n"
            "rol r13\n"
            "rol r14\n"
            "rol r15\n"
            "adc r8, __zero_reg__\n"
            "st X+,r8\n"
            "st X+,r9\n"
            "st X+,r10\n"
            "st X+,r11\n"
            "st X+,r12\n"
            "st X+,r13\n"
            "st X+,r14\n"
            "st X+,r15\n"
            "subi r26,200\n"
            "sbc r27,__zero_reg__\n"
            "subi r30,160\n"
            "sbc r31,__zero_reg__\n"
            // Step mapping chi.
            // Each pass of loop 50 handles one 40-byte row, one byte
            // position of its five lanes at a time: Y reads from B and
            // Z writes to A.
            "ldi r20,5\n"
            "50:\n"
            "ld r8,Y\n"
            "ldd r9,Y+8\n"
            "ldd r10,Y+16\n"
            "ldd r11,Y+24\n"
            "ldd r12,Y+32\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "st Z,r13\n"
            "std Z+8,r14\n"
            "std Z+16,r15\n"
            "std Z+24,r17\n"
            "std Z+32,r16\n"
            "ldd r8,Y+1\n"
            "ldd r9,Y+9\n"
            "ldd r10,Y+17\n"
            "ldd r11,Y+25\n"
            "ldd r12,Y+33\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+1,r13\n"
            "std Z+9,r14\n"
            "std Z+17,r15\n"
            "std Z+25,r17\n"
            "std Z+33,r16\n"
            "ldd r8,Y+2\n"
            "ldd r9,Y+10\n"
            "ldd r10,Y+18\n"
            "ldd r11,Y+26\n"
            "ldd r12,Y+34\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+2,r13\n"
            "std Z+10,r14\n"
            "std Z+18,r15\n"
            "std Z+26,r17\n"
            "std Z+34,r16\n"
            "ldd r8,Y+3\n"
            "ldd r9,Y+11\n"
            "ldd r10,Y+19\n"
            "ldd r11,Y+27\n"
            "ldd r12,Y+35\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+3,r13\n"
            "std Z+11,r14\n"
            "std Z+19,r15\n"
            "std Z+27,r17\n"
            "std Z+35,r16\n"
            "ldd r8,Y+4\n"
            "ldd r9,Y+12\n"
            "ldd r10,Y+20\n"
            "ldd r11,Y+28\n"
            "ldd r12,Y+36\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+4,r13\n"
            "std Z+12,r14\n"
            "std Z+20,r15\n"
            "std Z+28,r17\n"
            "std Z+36,r16\n"
            "ldd r8,Y+5\n"
            "ldd r9,Y+13\n"
            "ldd r10,Y+21\n"
            "ldd r11,Y+29\n"
            "ldd r12,Y+37\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+5,r13\n"
            "std Z+13,r14\n"
            "std Z+21,r15\n"
            "std Z+29,r17\n"
            "std Z+37,r16\n"
            "ldd r8,Y+6\n"
            "ldd r9,Y+14\n"
            "ldd r10,Y+22\n"
            "ldd r11,Y+30\n"
            "ldd r12,Y+38\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+6,r13\n"
            "std Z+14,r14\n"
            "std Z+22,r15\n"
            "std Z+30,r17\n"
            "std Z+38,r16\n"
            "ldd r8,Y+7\n"
            "ldd r9,Y+15\n"
            "ldd r10,Y+23\n"
            "ldd r11,Y+31\n"
            "ldd r12,Y+39\n"
            "mov r13,r9\n"
            "com r13\n"
            "and r13,r10\n"
            "eor r13,r8\n"
            "mov r14,r10\n"
            "com r14\n"
            "and r14,r11\n"
            "eor r14,r9\n"
            "mov r15,r11\n"
            "com r15\n"
            "and r15,r12\n"
            "eor r15,r10\n"
            "mov r17,r12\n"
            "com r17\n"
            "and r17,r8\n"
            "eor r17,r11\n"
            "mov r16,r8\n"
            "com r16\n"
            "and r16,r9\n"
            "eor r16,r12\n"
            "std Z+7,r13\n"
            "std Z+15,r14\n"
            "std Z+23,r15\n"
            "std Z+31,r17\n"
            "std Z+39,r16\n"
            "adiw r30,40\n"
            "adiw r28,40\n"
            "dec r20\n"
            "breq 51f\n"
            "rjmp 50b\n"
            "51:\n"
            "pop r28\n"
            "pop r29\n"
            // Done
            // Z is an in/out operand because it leaves the statement
            // changed; X is back at its entry value and stays an input.
            : "+z"(lanes)
            : "x"(state.B)
            : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
              "r16", "r17", "r18", "r19", "r20", "r21", "memory"
        );
#else
    // Lookup table that yields (x + y) % 5 for 0 <= x, y <= 4 without
    // performing a division.
    static const uint8_t addMod5Table[9] PROGMEM = {
        0, 1, 2, 3, 4, 0, 1, 2, 3
    };
#define addMod5(x, y) (pgm_read_byte(&(addMod5Table[(x) + (y)])))
    uint64_t D;
    uint8_t index, index2;
    for (uint8_t round = 0; round < 24; ++round) {
        // Step mapping theta. The specification mentions two temporary
        // arrays of size 5 called C and D. To save a bit of memory,
        // we use the first row of B to store C and compute D on the fly.
        for (index = 0; index < 5; ++index) {
            state.B[0][index] = state.A[0][index] ^ state.A[1][index] ^
                                state.A[2][index] ^ state.A[3][index] ^
                                state.A[4][index];
        }
        for (index = 0; index < 5; ++index) {
            D = state.B[0][addMod5(index, 4)] ^
                leftRotate1_64(state.B[0][addMod5(index, 1)]);
            for (index2 = 0; index2 < 5; ++index2)
                state.A[index2][index] ^= D;
        }
        // Step mapping rho and pi combined into a single step.
        // Rotate all lanes by a specific offset and rearrange.
        state.B[0][0] = state.A[0][0];
        state.B[1][0] = leftRotate28_64(state.A[0][3]);
        state.B[2][0] = leftRotate1_64 (state.A[0][1]);
        state.B[3][0] = leftRotate27_64(state.A[0][4]);
        state.B[4][0] = leftRotate62_64(state.A[0][2]);
        state.B[0][1] = leftRotate44_64(state.A[1][1]);
        state.B[1][1] = leftRotate20_64(state.A[1][4]);
        state.B[2][1] = leftRotate6_64 (state.A[1][2]);
        state.B[3][1] = leftRotate36_64(state.A[1][0]);
        state.B[4][1] = leftRotate55_64(state.A[1][3]);
        state.B[0][2] = leftRotate43_64(state.A[2][2]);
        state.B[1][2] = leftRotate3_64 (state.A[2][0]);
        state.B[2][2] = leftRotate25_64(state.A[2][3]);
        state.B[3][2] = leftRotate10_64(state.A[2][1]);
        state.B[4][2] = leftRotate39_64(state.A[2][4]);
        state.B[0][3] = leftRotate21_64(state.A[3][3]);
        state.B[1][3] = leftRotate45_64(state.A[3][1]);
        state.B[2][3] = leftRotate8_64 (state.A[3][4]);
        state.B[3][3] = leftRotate15_64(state.A[3][2]);
        state.B[4][3] = leftRotate41_64(state.A[3][0]);
        state.B[0][4] = leftRotate14_64(state.A[4][4]);
        state.B[1][4] = leftRotate61_64(state.A[4][2]);
        state.B[2][4] = leftRotate18_64(state.A[4][0]);
        state.B[3][4] = leftRotate56_64(state.A[4][3]);
        state.B[4][4] = leftRotate2_64 (state.A[4][1]);
        // Step mapping chi. Combine each lane with two other lanes in its row.
        for (index = 0; index < 5; ++index) {
            for (index2 = 0; index2 < 5; ++index2) {
                state.A[index2][index] =
                    state.B[index2][index] ^
                    ((~state.B[index2][addMod5(index, 1)]) &
                     state.B[index2][addMod5(index, 2)]);
            }
        }
#endif
        // Step mapping iota. XOR A[0][0] with the round constant.
        static uint64_t const RC[24] PROGMEM = {
            0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
            0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
            0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
            0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
            0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
            0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
            0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
            0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
        };
        state.A[0][0] ^= pgm_read_qword(RC + round);
    }
}