1
0
mirror of https://github.com/taigrr/arduinolibs synced 2025-01-18 04:33:12 -08:00

Speed up GHASH with AVR assembly code

Also split the Galois operations off into a separate GF128 class.
This commit is contained in:
Rhys Weatherley 2016-02-07 13:30:21 +10:00
parent 2decb74161
commit 21ac06136a
9 changed files with 614 additions and 114 deletions

View File

@ -86,12 +86,15 @@ Arduino Mega 2560 running at 16 MHz are similar:
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">41.20us</td><td align="right">41.19us</td><td align="right">902.36us</td><td align="right">221</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">183.25us</td><td align="right">182.80us</td><td align="right">1272.73us</td><td align="right">284</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">189.92us</td><td align="right">189.47us</td><td align="right">1492.60us</td><td align="right">316</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">196.59us</td><td align="right">196.13us</td><td align="right">1767.33us</td><td align="right">348</td></tr>
<tr><td>EAX&lt;AES128&gt;</td><td align="right">71.14us</td><td align="right">71.14us</td><td align="right">1329.44us</td><td align="right">268</td></tr>
<tr><td>EAX&lt;Speck&gt; (128-bit key)</td><td align="right">26.01us</td><td align="right">26.01us</td><td align="right">735.46us</td><td align="right">362</td></tr>
<tr><td>EAX&lt;SpeckLowMemory&gt; (128-bit key)</td><td align="right">75.08us</td><td align="right">75.07us</td><td align="right">1243.66us</td><td align="right">122</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">109.71us</td><td align="right">109.26us</td><td align="right">1265.69us</td><td align="right">284</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">116.38us</td><td align="right">115.92us</td><td align="right">1485.56us</td><td align="right">316</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">123.04us</td><td align="right">122.59us</td><td align="right">1760.28us</td><td align="right">348</td></tr>
<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">87.78us</td><td align="right">87.32us</td><td align="right">714.41us</td><td align="right">378</td></tr>
<tr><td>GCM&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">114.30us</td><td align="right">113.84us</td><td align="right">1270.32us</td><td align="right">138</td></tr>
<tr><td>EAX&lt;AES128&gt;</td><td align="right">71.14us</td><td align="right">71.14us</td><td align="right">1311.97us</td><td align="right">268</td></tr>
<tr><td>EAX&lt;AES256&gt;</td><td align="right">97.80us</td><td align="right">97.80us</td><td align="right">1806.57us</td><td align="right">332</td></tr>
<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">27.27us</td><td align="right">27.26us</td><td align="right">760.74us</td><td align="right">362</td></tr>
<tr><td>EAX&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">80.31us</td><td align="right">80.31us</td><td align="right">1316.60us</td><td align="right">122</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Hash Algorithm</td><td align="right">Hashing (per byte)</td><td align="right">Finalization</td><td> </td><td>State Size (bytes)</td></tr>
<tr><td>SHA256</td><td align="right">43.85us</td><td align="right">2841.04us</td><td align="right"> </td><td align="right">107</td></tr>
@ -105,7 +108,7 @@ Arduino Mega 2560 running at 16 MHz are similar:
<tr><td>SHA256 (HMAC mode)</td><td align="right">43.85us</td><td align="right">8552.61us</td><td align="right">2836.49us</td><td align="right">107</td></tr>
<tr><td>BLAKE2s (HMAC mode)</td><td align="right">20.65us</td><td align="right">4055.56us</td><td align="right">1350.00us</td><td align="right">107</td></tr>
<tr><td>Poly1305</td><td align="right">26.26us</td><td align="right">489.11us</td><td align="right">17.06us</td><td align="right">53</td></tr>
<tr><td>GHASH</td><td align="right">148.14us</td><td align="right">17.09us</td><td align="right">21.87us</td><td align="right">33</td></tr>
<tr><td>GHASH</td><td align="right">74.59us</td><td align="right">15.91us</td><td align="right">14.79us</td><td align="right">33</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Public Key Operation</td><td align="right">Time (per operation)</td><td colspan="3">Comment</td></tr>
<tr><td>Curve25519::eval()</td><td align="right">3119ms</td><td colspan="3">Raw curve evaluation</td></tr>
@ -141,12 +144,15 @@ All figures are for the Arduino Due running at 84 MHz:
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">1.71us</td><td align="right">1.71us</td><td align="right">45.08us</td><td align="right">240</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">10.29us</td><td align="right">10.29us</td><td align="right">223.82us</td><td align="right">312</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">11.50us</td><td align="right">11.51us</td><td align="right">265.62us</td><td align="right">344</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">12.67us</td><td align="right">12.67us</td><td align="right">313.06us</td><td align="right">376</td></tr>
<tr><td>EAX&lt;AES128&gt;</td><td align="right">12.29us</td><td align="right">12.29us</td><td align="right">236.47us</td><td align="right">280</td></tr>
<tr><td>EAX&lt;Speck&gt; (128-bit key)</td><td align="right">2.65us</td><td align="right">2.65us</td><td align="right">79.46us</td><td align="right">384</td></tr>
<tr><td>EAX&lt;SpeckLowMemory&gt; (128-bit key)</td><td align="right">6.29us</td><td align="right">6.29us</td><td align="right">106.60us</td><td align="right">144</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">10.90us</td><td align="right">10.90us</td><td align="right">248.83us</td><td align="right">312</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">12.30us</td><td align="right">12.31us</td><td align="right">296.83us</td><td align="right">344</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">13.66us</td><td align="right">13.67us</td><td align="right">350.25us</td><td align="right">376</td></tr>
<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">5.27us</td><td align="right">5.28us</td><td align="right">75.31us</td><td align="right">408</td></tr>
<tr><td>GCM&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">7.06us</td><td align="right">7.07us</td><td align="right">94.20us</td><td align="right">168</td></tr>
<tr><td>EAX&lt;AES128&gt;</td><td align="right">12.33us</td><td align="right">12.33us</td><td align="right">234.91us</td><td align="right">280</td></tr>
<tr><td>EAX&lt;AES256&gt;</td><td align="right">16.99us</td><td align="right">16.99us</td><td align="right">322.92us</td><td align="right">344</td></tr>
<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">2.80us</td><td align="right">2.80us</td><td align="right">81.63us</td><td align="right">384</td></tr>
<tr><td>EAX&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">6.69us</td><td align="right">6.69us</td><td align="right">110.91us</td><td align="right">144</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Hash Algorithm</td><td align="right">Hashing (per byte)</td><td align="right">Finalization</td><td> </td><td>State Size (bytes)</td></tr>
<tr><td>SHA256</td><td align="right">1.15us</td><td align="right">76.60us</td><td align="right"> </td><td align="right">120</td></tr>
@ -160,7 +166,7 @@ All figures are for the Arduino Due running at 84 MHz:
<tr><td>SHA256 (HMAC mode)</td><td align="right">1.15us</td><td align="right">238.98us</td><td align="right">80.44us</td><td align="right">120</td></tr>
<tr><td>BLAKE2s (HMAC mode)</td><td align="right">0.72us</td><td align="right">157.75us</td><td align="right">57.18us</td><td align="right">120</td></tr>
<tr><td>Poly1305</td><td align="right">0.81us</td><td align="right">19.01us</td><td align="right">2.57us</td><td align="right">60</td></tr>
<tr><td>GHASH</td><td align="right">4.37us</td><td align="right">1.50us</td><td align="right">4.37us</td><td align="right">36</td></tr>
<tr><td>GHASH</td><td align="right">4.47us</td><td align="right">1.52us</td><td align="right">2.60us</td><td align="right">36</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Public Key Operation</td><td align="right">Time (per operation)</td><td colspan="3">Comment</td></tr>
<tr><td>Curve25519::eval()</td><td align="right">103ms</td><td colspan="3">Raw curve evaluation</td></tr>

View File

@ -21,6 +21,7 @@
*/
#include "EAX.h"
#include "GF128.h"
#include "Crypto.h"
#include <string.h>
@ -145,19 +146,6 @@ void EAXCommon::clear()
clean(state);
}
// Doubles a 128-bit big-endian value in place in the GF(2^128) field,
// folding the bit shifted out of the top back in as 0x87 (EAX convention).
static void gfDouble(uint8_t value[16])
{
    // Bit carried from the previously processed (less significant) byte.
    uint8_t carry = 0;

    // Walk from the last byte (least significant) to the first.
    for (uint8_t *cursor = value + 16; cursor != value; ) {
        --cursor;
        const uint8_t shiftedOut = (uint8_t)(*cursor >> 7);
        *cursor = (uint8_t)((uint8_t)(*cursor << 1) | carry);
        carry = shiftedOut;
    }

    // Branch-free reduction: (0 - carry) is 0x00 or 0xFF, selecting 0x87.
    value[15] ^= (uint8_t)(0x87 & (uint8_t)(0 - carry));
}
/**
* \brief Initialises the first OMAC hashing context and creates the B value.
*
@ -175,7 +163,7 @@ void EAXCommon::omacInitFirst(uint8_t omac[16])
// Generate the B value from the encrypted block of zeroes.
// We will need this later when finalising the OMAC hashes.
memcpy(state.b, omac, 16);
gfDouble(state.b);
GF128::dblEAX(state.b);
}
/**
@ -230,17 +218,17 @@ void EAXCommon::omacFinal(uint8_t omac[16])
// Apply padding if necessary.
if (state.authPosn != 16) {
// Need padding: XOR with P = 2 * B.
uint8_t p[16];
uint32_t p[4];
memcpy(p, state.b, 16);
gfDouble(p);
GF128::dblEAX(p);
omac[state.authPosn] ^= 0x80;
for (uint8_t index = 0; index < 16; ++index)
omac[index] ^= p[index];
omac[index] ^= ((const uint8_t *)p)[index];
clean(p);
} else {
// No padding necessary: XOR with B.
for (uint8_t index = 0; index < 16; ++index)
omac[index] ^= state.b[index];
omac[index] ^= ((const uint8_t *)(state.b))[index];
}
// Encrypt the hash to get the final OMAC value.

View File

@ -59,7 +59,7 @@ private:
uint8_t stream[16];
uint8_t tag[16];
uint8_t hash[16];
uint8_t b[16];
uint32_t b[4];
uint8_t encPosn;
uint8_t authPosn;
uint8_t authMode;

480
libraries/Crypto/GF128.cpp Normal file
View File

@ -0,0 +1,480 @@
/*
* Copyright (C) 2016 Southern Storm Software, Pty Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "GF128.h"
#include "utility/EndianUtil.h"
#include <string.h>
/**
* \class GF128 GF128.h <GF128.h>
* \brief Operations in the Galois field GF(2^128).
*
* This class contains helper functions for performing operations in
* the Galois field GF(2^128) which is used as the basis of GCM and GHASH.
* These functions are provided for use by other cryptographic protocols
* that make use of GF(2^128).
*
* Most of the functions in this class use the field, polynomial, and
* byte ordering conventions described in NIST SP 800-38D (GCM). The one
* exception is dblEAX() which uses the conventions of EAX mode instead.
*
* References: <a href="http://csrc.nist.gov/publications/nistpubs/800-38D/SP-800-38D.pdf">NIST SP 800-38D</a>
*
* \sa GCM, GHASH
*/
/**
 * \brief Initialize multiplication in the GF(2^128) field.
 *
 * \param H The hash state to be initialized.
 * \param key Points to the 16 byte authentication key which is assumed
 * to be in big-endian byte order.
 *
 * The key bytes are copied into \a H.  On AVR they are left in big-endian
 * order; on other little-endian hosts each 32-bit word is converted to
 * host order.
 *
 * This function and the companion mul() are intended for use by other
 * classes that need access to the raw GF(2^128) field multiplication of
 * GHASH without the overhead of GHASH itself.
 *
 * \sa mul(), dbl()
 */
void GF128::mulInit(uint32_t H[4], const void *key)
{
    // Every platform starts from the raw 16 key bytes.
    memcpy(H, key, 16);

#if !defined(__AVR__) && defined(CRYPTO_LITTLE_ENDIAN)
    // Convert each word from big endian to host order.  AVR skips this
    // because mul() corrects for the byte order itself.
    for (uint8_t word = 0; word < 4; ++word)
        H[word] = be32toh(H[word]);
#endif
}
/**
 * \brief Perform a multiplication in the GF(2^128) field.
 *
 * \param Y The first value to multiply, and the result.  This array is
 * assumed to be in big-endian order on entry and exit.
 * \param H The second value to multiply, which must have been initialized
 * by the mulInit() function.
 *
 * The multiplication is done bit by bit: for every bit of \a Y, starting
 * at the top, the running copy of \a H is conditionally XOR'ed into the
 * accumulator using a mask rather than a branch, and is then shifted right
 * by one bit with reduction by 0xE1 in the top byte.
 *
 * This function and the companion mulInit() are intended for use by other
 * classes that need access to the raw GF(2^128) field multiplication of
 * GHASH without the overhead of GHASH itself.
 *
 * \sa mulInit(), dbl()
 */
void GF128::mul(uint32_t Y[4], const uint32_t H[4])
{
#if defined(__AVR__)
    uint32_t Z[4] = {0, 0, 0, 0};   // Z = 0
    uint32_t V0 = H[0];             // V = H
    uint32_t V1 = H[1];
    uint32_t V2 = H[2];
    uint32_t V3 = H[3];

    // Multiply Z by V for the set bits in Y, starting at the top.
    // This is a very simple bit by bit version that may not be very
    // fast but it should be resistant to cache timing attacks.
    for (uint8_t posn = 0; posn < 16; ++posn) {
        uint8_t value = ((const uint8_t *)Y)[posn];
        for (uint8_t bit = 0; bit < 8; ++bit) {
            // Operands: %0-%3 are V0-V3 in registers, %4-%7 are Z[0]-Z[3]
            // in memory and %8 is "value" in memory.  The assembly stores
            // to Z[] and "value", so a "memory" clobber is listed to tell
            // the compiler that memory contents change across this
            // statement.
            __asm__ __volatile__ (
                // Extract the high bit of "value" and turn it into a mask.
                "ldd r24,%8\n"
                "lsl r24\n"
                "std %8,r24\n"
                "mov __tmp_reg__,__zero_reg__\n"
                "sbc __tmp_reg__,__zero_reg__\n"

                // XOR V with Z if the bit is 1.
                "mov r24,%D0\n"         // Z0 ^= (V0 & mask)
                "and r24,__tmp_reg__\n"
                "ldd r25,%D4\n"
                "eor r25,r24\n"
                "std %D4,r25\n"
                "mov r24,%C0\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%C4\n"
                "eor r25,r24\n"
                "std %C4,r25\n"
                "mov r24,%B0\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%B4\n"
                "eor r25,r24\n"
                "std %B4,r25\n"
                "mov r24,%A0\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%A4\n"
                "eor r25,r24\n"
                "std %A4,r25\n"
                "mov r24,%D1\n"         // Z1 ^= (V1 & mask)
                "and r24,__tmp_reg__\n"
                "ldd r25,%D5\n"
                "eor r25,r24\n"
                "std %D5,r25\n"
                "mov r24,%C1\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%C5\n"
                "eor r25,r24\n"
                "std %C5,r25\n"
                "mov r24,%B1\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%B5\n"
                "eor r25,r24\n"
                "std %B5,r25\n"
                "mov r24,%A1\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%A5\n"
                "eor r25,r24\n"
                "std %A5,r25\n"
                "mov r24,%D2\n"         // Z2 ^= (V2 & mask)
                "and r24,__tmp_reg__\n"
                "ldd r25,%D6\n"
                "eor r25,r24\n"
                "std %D6,r25\n"
                "mov r24,%C2\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%C6\n"
                "eor r25,r24\n"
                "std %C6,r25\n"
                "mov r24,%B2\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%B6\n"
                "eor r25,r24\n"
                "std %B6,r25\n"
                "mov r24,%A2\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%A6\n"
                "eor r25,r24\n"
                "std %A6,r25\n"
                "mov r24,%D3\n"         // Z3 ^= (V3 & mask)
                "and r24,__tmp_reg__\n"
                "ldd r25,%D7\n"
                "eor r25,r24\n"
                "std %D7,r25\n"
                "mov r24,%C3\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%C7\n"
                "eor r25,r24\n"
                "std %C7,r25\n"
                "mov r24,%B3\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%B7\n"
                "eor r25,r24\n"
                "std %B7,r25\n"
                "mov r24,%A3\n"
                "and r24,__tmp_reg__\n"
                "ldd r25,%A7\n"
                "eor r25,r24\n"
                "std %A7,r25\n"

                // Rotate V right by 1 bit.
                "lsr %A0\n"
                "ror %B0\n"
                "ror %C0\n"
                "ror %D0\n"
                "ror %A1\n"
                "ror %B1\n"
                "ror %C1\n"
                "ror %D1\n"
                "ror %A2\n"
                "ror %B2\n"
                "ror %C2\n"
                "ror %D2\n"
                "ror %A3\n"
                "ror %B3\n"
                "ror %C3\n"
                "ror %D3\n"
                // Turn the bit shifted out into a 0x00/0xFF mask and fold
                // 0xE1 back into the first byte of V.
                "mov r24,__zero_reg__\n"
                "sbc r24,__zero_reg__\n"
                "andi r24,0xE1\n"
                "eor %A0,r24\n"
                : "+r"(V0), "+r"(V1), "+r"(V2), "+r"(V3)
                : "Q"(Z[0]), "Q"(Z[1]), "Q"(Z[2]), "Q"(Z[3]), "Q"(value)
                : "r24", "r25", "memory"
            );
        }
    }

    // We have finished the block so copy the 16 bytes of Z into Y, in
    // memory order, through the X pointer.  The stores modify the caller's
    // buffer, hence the "memory" clobber.
    __asm__ __volatile__ (
        "ldd __tmp_reg__,%A0\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%B0\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%C0\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%D0\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%A1\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%B1\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%C1\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%D1\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%A2\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%B2\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%C2\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%D2\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%A3\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%B3\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%C3\n"
        "st X+,__tmp_reg__\n"
        "ldd __tmp_reg__,%D3\n"
        "st X,__tmp_reg__\n"
        : : "Q"(Z[0]), "Q"(Z[1]), "Q"(Z[2]), "Q"(Z[3]), "x"(Y)
        : "memory"
    );
#else // !__AVR__
    uint32_t Z0 = 0;                // Z = 0
    uint32_t Z1 = 0;
    uint32_t Z2 = 0;
    uint32_t Z3 = 0;
    uint32_t V0 = H[0];             // V = H
    uint32_t V1 = H[1];
    uint32_t V2 = H[2];
    uint32_t V3 = H[3];

    // Multiply Z by V for the set bits in Y, starting at the top.
    // This is a very simple bit by bit version that may not be very
    // fast but it should be resistant to cache timing attacks.
    for (uint8_t posn = 0; posn < 16; ++posn) {
        uint8_t value = ((const uint8_t *)Y)[posn];
        for (uint8_t bit = 0; bit < 8; ++bit, value <<= 1) {
            // Extract the high bit of "value" and turn it into a mask.
            uint32_t mask = (~((uint32_t)(value >> 7))) + 1;

            // XOR V with Z if the bit is 1.
            Z0 ^= (V0 & mask);
            Z1 ^= (V1 & mask);
            Z2 ^= (V2 & mask);
            Z3 ^= (V3 & mask);

            // Rotate V right by 1 bit.
            mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
            V3 = (V3 >> 1) | (V2 << 31);
            V2 = (V2 >> 1) | (V1 << 31);
            V1 = (V1 >> 1) | (V0 << 31);
            V0 = (V0 >> 1) ^ mask;
        }
    }

    // We have finished the block so copy Z into Y and byte-swap.
    Y[0] = htobe32(Z0);
    Y[1] = htobe32(Z1);
    Y[2] = htobe32(Z2);
    Y[3] = htobe32(Z3);
#endif // !__AVR__
}
/**
 * \brief Doubles a value in the GF(2^128) field.
 *
 * \param V The value to double, and the result.  This array is
 * assumed to be in big-endian order on entry and exit.
 *
 * Block cipher modes such as <a href="https://en.wikipedia.org/wiki/Disk_encryption_theory#Xor-encrypt-xor_.28XEX.29">XEX</a>
 * are similar to CTR mode but instead of incrementing the nonce every
 * block, the modes multiply the nonce by 2 in the GF(2^128) field every
 * block.  This function is provided to help with implementing such modes.
 *
 * The 16 bytes are shifted right by one bit, from the first byte to the
 * last, and 0xE1 is XOR'ed into the first byte when a 1 bit is shifted
 * out of the last byte.
 *
 * \sa dblEAX(), mul()
 */
void GF128::dbl(uint32_t V[4])
{
#if defined(__AVR__)
    // The value is modified in place through the Z pointer, so "memory"
    // is listed as a clobber to tell the compiler that the contents of V
    // change across this statement.  Byte 0 is kept in r16 until the end
    // so that the final reduction can be applied before it is stored.
    __asm__ __volatile__ (
        "ld r16,Z\n"
        "ldd r17,Z+1\n"
        "ldd r18,Z+2\n"
        "ldd r19,Z+3\n"
        "lsr r16\n"
        "ror r17\n"
        "ror r18\n"
        "ror r19\n"
        "std Z+1,r17\n"
        "std Z+2,r18\n"
        "std Z+3,r19\n"
        "ldd r17,Z+4\n"
        "ldd r18,Z+5\n"
        "ldd r19,Z+6\n"
        "ldd r20,Z+7\n"
        "ror r17\n"
        "ror r18\n"
        "ror r19\n"
        "ror r20\n"
        "std Z+4,r17\n"
        "std Z+5,r18\n"
        "std Z+6,r19\n"
        "std Z+7,r20\n"
        "ldd r17,Z+8\n"
        "ldd r18,Z+9\n"
        "ldd r19,Z+10\n"
        "ldd r20,Z+11\n"
        "ror r17\n"
        "ror r18\n"
        "ror r19\n"
        "ror r20\n"
        "std Z+8,r17\n"
        "std Z+9,r18\n"
        "std Z+10,r19\n"
        "std Z+11,r20\n"
        "ldd r17,Z+12\n"
        "ldd r18,Z+13\n"
        "ldd r19,Z+14\n"
        "ldd r20,Z+15\n"
        "ror r17\n"
        "ror r18\n"
        "ror r19\n"
        "ror r20\n"
        "std Z+12,r17\n"
        "std Z+13,r18\n"
        "std Z+14,r19\n"
        "std Z+15,r20\n"
        // Turn the carry out of the last byte into a 0x00/0xFF mask and
        // fold 0xE1 into the first byte.
        "mov r17,__zero_reg__\n"
        "sbc r17,__zero_reg__\n"
        "andi r17,0xE1\n"
        "eor r16,r17\n"
        "st Z,r16\n"
        : : "z"(V)
        : "r16", "r17", "r18", "r19", "r20", "memory"
    );
#else
    uint32_t V0 = be32toh(V[0]);
    uint32_t V1 = be32toh(V[1]);
    uint32_t V2 = be32toh(V[2]);
    uint32_t V3 = be32toh(V[3]);

    // Mask is 0xE1000000 when the lowest bit is about to be shifted out,
    // otherwise zero; computed without a branch.
    uint32_t mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
    V3 = (V3 >> 1) | (V2 << 31);
    V2 = (V2 >> 1) | (V1 << 31);
    V1 = (V1 >> 1) | (V0 << 31);
    V0 = (V0 >> 1) ^ mask;

    V[0] = htobe32(V0);
    V[1] = htobe32(V1);
    V[2] = htobe32(V2);
    V[3] = htobe32(V3);
#endif
}
/**
 * \brief Doubles a value in the GF(2^128) field using EAX conventions.
 *
 * \param V The value to double, and the result.  This array is
 * assumed to be in big-endian order on entry and exit.
 *
 * This function differs from dbl() in that it uses the conventions of
 * EAX mode instead of those of NIST SP 800-38D (GCM).  The two operations
 * have equivalent security but the bits are ordered differently with the
 * value shifted left instead of right.
 *
 * The 16 bytes are shifted left by one bit, from the last byte to the
 * first, and 0x87 is XOR'ed into the last byte when a 1 bit is shifted
 * out of the first byte.
 *
 * References: https://en.wikipedia.org/wiki/EAX_mode,
 * http://web.cs.ucdavis.edu/~rogaway/papers/eax.html
 *
 * \sa dbl(), mul()
 */
void GF128::dblEAX(uint32_t V[4])
{
#if defined(__AVR__)
    // The value is modified in place through the Z pointer, so "memory"
    // is listed as a clobber to tell the compiler that the contents of V
    // change across this statement.  Byte 15 is kept in r16 until the end
    // so that the final reduction can be applied before it is stored.
    __asm__ __volatile__ (
        "ldd r16,Z+15\n"
        "ldd r17,Z+14\n"
        "ldd r18,Z+13\n"
        "ldd r19,Z+12\n"
        "lsl r16\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "std Z+14,r17\n"
        "std Z+13,r18\n"
        "std Z+12,r19\n"
        "ldd r17,Z+11\n"
        "ldd r18,Z+10\n"
        "ldd r19,Z+9\n"
        "ldd r20,Z+8\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "std Z+11,r17\n"
        "std Z+10,r18\n"
        "std Z+9,r19\n"
        "std Z+8,r20\n"
        "ldd r17,Z+7\n"
        "ldd r18,Z+6\n"
        "ldd r19,Z+5\n"
        "ldd r20,Z+4\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "std Z+7,r17\n"
        "std Z+6,r18\n"
        "std Z+5,r19\n"
        "std Z+4,r20\n"
        "ldd r17,Z+3\n"
        "ldd r18,Z+2\n"
        "ldd r19,Z+1\n"
        "ld r20,Z\n"
        "rol r17\n"
        "rol r18\n"
        "rol r19\n"
        "rol r20\n"
        "std Z+3,r17\n"
        "std Z+2,r18\n"
        "std Z+1,r19\n"
        "st Z,r20\n"
        // Turn the carry out of the first byte into a 0x00/0xFF mask and
        // fold 0x87 into the last byte.
        "mov r17,__zero_reg__\n"
        "sbc r17,__zero_reg__\n"
        "andi r17,0x87\n"
        "eor r16,r17\n"
        "std Z+15,r16\n"
        : : "z"(V)
        : "r16", "r17", "r18", "r19", "r20", "memory"
    );
#else
    uint32_t V0 = be32toh(V[0]);
    uint32_t V1 = be32toh(V[1]);
    uint32_t V2 = be32toh(V[2]);
    uint32_t V3 = be32toh(V[3]);

    // Mask is 0x87 when the highest bit is about to be shifted out,
    // otherwise zero; computed without a branch.
    uint32_t mask = ((~(V0 >> 31)) + 1) & 0x00000087;
    V0 = (V0 << 1) | (V1 >> 31);
    V1 = (V1 << 1) | (V2 >> 31);
    V2 = (V2 << 1) | (V3 >> 31);
    V3 = (V3 << 1) ^ mask;

    V[0] = htobe32(V0);
    V[1] = htobe32(V1);
    V[2] = htobe32(V2);
    V[3] = htobe32(V3);
#endif
}

41
libraries/Crypto/GF128.h Normal file
View File

@ -0,0 +1,41 @@
/*
* Copyright (C) 2016 Southern Storm Software, Pty Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef CRYPTO_GF128_h
#define CRYPTO_GF128_h
#include <inttypes.h>
// Collection of static helpers for arithmetic in GF(2^128).
// All operations work on 16-byte values passed as four 32-bit words.
class GF128
{
public:
    // Prepare H from a 16-byte key for later calls to mul().
    static void mulInit(uint32_t H[4], const void *key);

    // Multiply Y by H in place; H must come from mulInit().
    static void mul(uint32_t Y[4], const uint32_t H[4]);

    // Double V in place.
    static void dbl(uint32_t V[4]);

    // Double V in place using the EAX mode conventions.
    static void dblEAX(uint32_t V[4]);

private:
    // Construction and destruction are private: the class only groups
    // static functions and is not meant to be instantiated.
    GF128() {}
    ~GF128() {}
};
#endif

View File

@ -21,8 +21,8 @@
*/
#include "GHASH.h"
#include "GF128.h"
#include "Crypto.h"
#include "utility/EndianUtil.h"
#include <string.h>
/**
@ -66,16 +66,7 @@ GHASH::~GHASH()
*/
void GHASH::reset(const void *key)
{
    // Load the key into H.  GF128::mulInit() copies the 16 key bytes and
    // performs whatever byte-order conversion GF128::mul() expects, so no
    // separate copy or conversion is done here (it would only be
    // overwritten by this call).
    GF128::mulInit(state.H, key);

    // Reset the hash.
    memset(state.Y, 0, sizeof(state.Y));
    state.posn = 0;
}
@ -106,7 +97,7 @@ void GHASH::update(const void *data, size_t len)
len -= size;
d += size;
if (state.posn == 16) {
processChunk();
GF128::mul(state.Y, state.H);
state.posn = 0;
}
}
@ -148,7 +139,7 @@ void GHASH::pad()
if (state.posn != 0) {
// Padding involves XOR'ing the rest of state.Y with zeroes,
// which does nothing. Immediately process the next chunk.
processChunk();
GF128::mul(state.Y, state.H);
state.posn = 0;
}
}
@ -160,45 +151,3 @@ void GHASH::clear()
{
clean(state);
}
// Multiplies the current hash value state.Y by state.H in GF(2^128),
// bit by bit, and stores the product back into state.Y.
void GHASH::processChunk()
{
    // Product accumulator, starts at zero.
    uint32_t acc0 = 0;
    uint32_t acc1 = 0;
    uint32_t acc2 = 0;
    uint32_t acc3 = 0;

    // Running copy of H that is shifted once per processed bit.
    uint32_t run0 = state.H[0];
    uint32_t run1 = state.H[1];
    uint32_t run2 = state.H[2];
    uint32_t run3 = state.H[3];

    // Walk the bits of Y from the top of the first byte downwards.
    // Selection is done with masks rather than branches, which should
    // keep the routine resistant to cache timing attacks.
    const uint8_t *input = (const uint8_t *)state.Y;
    for (uint8_t offset = 0; offset < 16; ++offset) {
        uint8_t current = input[offset];
        for (uint8_t count = 0; count < 8; ++count) {
            // All ones when the top bit of "current" is set, else zero.
            const uint32_t select = (uint32_t)0 - (uint32_t)(current >> 7);
            acc0 ^= (run0 & select);
            acc1 ^= (run1 & select);
            acc2 ^= (run2 & select);
            acc3 ^= (run3 & select);

            // Shift the running value right by one bit, folding
            // 0xE1000000 into the top word when a 1 bit drops out.
            const uint32_t reduce =
                ((uint32_t)0 - (run3 & 0x01)) & 0xE1000000;
            run3 = (run3 >> 1) | (run2 << 31);
            run2 = (run2 >> 1) | (run1 << 31);
            run1 = (run1 >> 1) | (run0 << 31);
            run0 = (run0 >> 1) ^ reduce;

            // Bring the next bit of the byte to the top.
            current = (uint8_t)(current << 1);
        }
    }

    // Block finished: store the accumulator back into Y, byte-swapped.
    state.Y[0] = htobe32(acc0);
    state.Y[1] = htobe32(acc1);
    state.Y[2] = htobe32(acc2);
    state.Y[3] = htobe32(acc3);
}

View File

@ -46,8 +46,6 @@ private:
uint32_t Y[4];
uint8_t posn;
} state;
void processChunk();
};
#endif

View File

@ -233,6 +233,7 @@ static TestVector const testVectorEAX10 PROGMEM = {
TestVector testVector;
EAX<AES128> *eax;
EAX<AES256> *eax256;
EAX<Speck> *eaxSpeck;
EAX<SpeckLowMemory> *eaxSpeckLowMemory;
@ -353,7 +354,7 @@ void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test
start = micros();
for (count = 0; count < 1000; ++count) {
cipher->setKey(test->key, 16);
cipher->setKey(test->key, cipher->keySize());
cipher->setIV(test->iv, test->ivsize);
}
elapsed = micros() - start;
@ -378,7 +379,7 @@ void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
Serial.print(test->name);
Serial.print(" Encrypt ... ");
cipher->setKey(test->key, 16);
cipher->setKey(test->key, cipher->keySize());
cipher->setIV(test->iv, test->ivsize);
start = micros();
for (count = 0; count < 500; ++count) {
@ -406,7 +407,7 @@ void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
Serial.print(test->name);
Serial.print(" Decrypt ... ");
cipher->setKey(test->key, 16);
cipher->setKey(test->key, cipher->keySize());
cipher->setIV(test->iv, test->ivsize);
start = micros();
for (count = 0; count < 500; ++count) {
@ -434,7 +435,7 @@ void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector
Serial.print(test->name);
Serial.print(" AddAuthData ... ");
cipher->setKey(test->key, 16);
cipher->setKey(test->key, cipher->keySize());
cipher->setIV(test->iv, test->ivsize);
start = micros();
memset(buffer, 0xBA, 128);
@ -463,7 +464,7 @@ void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *
Serial.print(test->name);
Serial.print(" ComputeTag ... ");
cipher->setKey(test->key, 16);
cipher->setKey(test->key, cipher->keySize());
cipher->setIV(test->iv, test->ivsize);
start = micros();
for (count = 0; count < 1000; ++count) {
@ -495,6 +496,8 @@ void setup()
Serial.println("State Sizes:");
Serial.print("EAX<AES128> ... ");
Serial.println(sizeof(*eax));
Serial.print("EAX<AES256> ... ");
Serial.println(sizeof(*eax256));
Serial.print("EAX<Speck> ... ");
Serial.println(sizeof(*eaxSpeck));
Serial.print("EAX<SpeckLowMemory> ... ");
@ -520,6 +523,10 @@ void setup()
perfCipher(eax, &testVectorEAX1, "AES-128");
Serial.println();
delete eax;
eax256 = new EAX<AES256>();
// Benchmark the AES-256 instance that was just allocated.  Passing "eax"
// here would use an object that has already been deleted and would not
// measure AES-256 at all.
perfCipher(eax256, &testVectorEAX1, "AES-256");
Serial.println();
delete eax256;
eaxSpeck = new EAX<Speck>();
perfCipher(eaxSpeck, &testVectorEAX1, "Speck");
Serial.println();

View File

@ -26,10 +26,19 @@ This example runs tests on the GCM implementation to verify correct behaviour.
#include <Crypto.h>
#include <AES.h>
#include <Speck.h>
#include <SpeckLowMemory.h>
#include <GCM.h>
#include <string.h>
#include <avr/pgmspace.h>
// There isn't enough memory to test both AES and Speck on the Uno,
// so disable Speck testing on AVR platforms unless explicitly enabled.
// When enabled, some of the AES tests are disabled to reclaim memory.
#if defined(__AVR__)
//#define TEST_SPECK 1
#endif
#define MAX_PLAINTEXT_LEN 64
struct TestVector
@ -65,6 +74,7 @@ static TestVector const testVectorGCM1 PROGMEM = {
.tagsize = 16,
.ivsize = 12
};
#ifndef TEST_SPECK
static TestVector const testVectorGCM2 PROGMEM = {
.name = "AES-128 GCM #2",
.key = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -176,6 +186,7 @@ static TestVector const testVectorGCM5 PROGMEM = {
.tagsize = 16,
.ivsize = 8
};
#endif // !TEST_SPECK
static TestVector const testVectorGCM10 PROGMEM = {
.name = "AES-192 GCM #10",
.key = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
@ -249,6 +260,8 @@ TestVector testVector;
GCM<AES128> *gcmaes128 = 0;
GCM<AES192> *gcmaes192 = 0;
GCM<AES256> *gcmaes256 = 0;
GCM<Speck> *gcmspeck = 0;
GCM<SpeckLowMemory> *gcmspecklm = 0;
byte buffer[128];
@ -348,7 +361,7 @@ void testCipher(AuthenticatedCipher *cipher, const struct TestVector *test)
Serial.println("Failed");
}
void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test)
void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
{
unsigned long start;
unsigned long elapsed;
@ -357,7 +370,7 @@ void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test
memcpy_P(&testVector, test, sizeof(TestVector));
test = &testVector;
Serial.print(test->name);
Serial.print(name);
Serial.print(" SetKey ... ");
start = micros();
@ -373,7 +386,7 @@ void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test
Serial.println(" per second");
}
void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *test)
void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
{
unsigned long start;
unsigned long elapsed;
@ -382,7 +395,7 @@ void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
memcpy_P(&testVector, test, sizeof(TestVector));
test = &testVector;
Serial.print(test->name);
Serial.print(name);
Serial.print(" Encrypt ... ");
cipher->setKey(test->key, cipher->keySize());
@ -399,7 +412,7 @@ void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
Serial.println(" bytes per second");
}
void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *test)
void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
{
unsigned long start;
unsigned long elapsed;
@ -408,7 +421,7 @@ void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
memcpy_P(&testVector, test, sizeof(TestVector));
test = &testVector;
Serial.print(test->name);
Serial.print(name);
Serial.print(" Decrypt ... ");
cipher->setKey(test->key, cipher->keySize());
@ -425,7 +438,7 @@ void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
Serial.println(" bytes per second");
}
void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector *test)
void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
{
unsigned long start;
unsigned long elapsed;
@ -434,7 +447,7 @@ void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector
memcpy_P(&testVector, test, sizeof(TestVector));
test = &testVector;
Serial.print(test->name);
Serial.print(name);
Serial.print(" AddAuthData ... ");
cipher->setKey(test->key, cipher->keySize());
@ -452,7 +465,7 @@ void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector
Serial.println(" bytes per second");
}
void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *test)
void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
{
unsigned long start;
unsigned long elapsed;
@ -461,7 +474,7 @@ void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *
memcpy_P(&testVector, test, sizeof(TestVector));
test = &testVector;
Serial.print(test->name);
Serial.print(name);
Serial.print(" ComputeTag ... ");
cipher->setKey(test->key, cipher->keySize());
@ -478,13 +491,13 @@ void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *
Serial.println(" per second");
}
void perfCipher(AuthenticatedCipher *cipher, const struct TestVector *test)
void perfCipher(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
{
perfCipherSetKey(cipher, test);
perfCipherEncrypt(cipher, test);
perfCipherDecrypt(cipher, test);
perfCipherAddAuthData(cipher, test);
perfCipherComputeTag(cipher, test);
perfCipherSetKey(cipher, test, name);
perfCipherEncrypt(cipher, test, name);
perfCipherDecrypt(cipher, test, name);
perfCipherAddAuthData(cipher, test, name);
perfCipherComputeTag(cipher, test, name);
}
void setup()
@ -493,6 +506,7 @@ void setup()
Serial.println();
#ifndef TEST_SPECK
Serial.println("State Sizes:");
Serial.print("GCM<AES128> ... ");
Serial.println(sizeof(*gcmaes128));
@ -500,15 +514,22 @@ void setup()
Serial.println(sizeof(*gcmaes192));
Serial.print("GCM<AES256> ... ");
Serial.println(sizeof(*gcmaes256));
Serial.print("GCM<Speck> ... ");
Serial.println(sizeof(*gcmspeck));
Serial.print("GCM<SpeckLowMemory> ... ");
Serial.println(sizeof(*gcmspecklm));
Serial.println();
#endif
Serial.println("Test Vectors:");
gcmaes128 = new GCM<AES128>();
testCipher(gcmaes128, &testVectorGCM1);
#ifndef TEST_SPECK
testCipher(gcmaes128, &testVectorGCM2);
testCipher(gcmaes128, &testVectorGCM3);
testCipher(gcmaes128, &testVectorGCM4);
testCipher(gcmaes128, &testVectorGCM5);
#endif
delete gcmaes128;
gcmaes192 = new GCM<AES192>();
testCipher(gcmaes192, &testVectorGCM10);
@ -520,15 +541,25 @@ void setup()
Serial.println();
Serial.println("Performance Tests:");
#ifndef TEST_SPECK
gcmaes128 = new GCM<AES128>();
perfCipher(gcmaes128, &testVectorGCM1);
perfCipher(gcmaes128, &testVectorGCM1, testVectorGCM1.name);
delete gcmaes128;
gcmaes192 = new GCM<AES192>();
perfCipher(gcmaes192, &testVectorGCM10);
perfCipher(gcmaes192, &testVectorGCM10, testVectorGCM10.name);
delete gcmaes192;
gcmaes256 = new GCM<AES256>();
perfCipher(gcmaes256, &testVectorGCM16);
perfCipher(gcmaes256, &testVectorGCM16, testVectorGCM16.name);
delete gcmaes256;
#endif
#if defined(TEST_SPECK) || !defined(__AVR__)
gcmspeck = new GCM<Speck>();
perfCipher(gcmspeck, &testVectorGCM16, "GCM-Speck-256");
delete gcmspeck;
gcmspecklm = new GCM<SpeckLowMemory>();
perfCipher(gcmspecklm, &testVectorGCM16, "GCM-SpeckLowMemory-256");
delete gcmspecklm;
#endif
}
void loop()