Speed up GHASH with AVR assembly code

Also split the Galois operations off into a separate GF128 class.
2025-01-18 04:33:12 -08:00 · 2016-02-07 13:30:21 +10:00
parent 2decb74161
commit 21ac06136a
9 changed files with 614 additions and 114 deletions
--- a/doc/crypto.dox
+++ b/doc/crypto.dox
@@ -86,12 +86,15 @@ Arduino Mega 2560 running at 16 MHz are similar:
 <tr><td colspan="5"> </td></tr>
 <tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
 <tr><td>ChaChaPoly</td><td align="right">41.20us</td><td align="right">41.19us</td><td align="right">902.36us</td><td align="right">221</td></tr>
-<tr><td>GCM&lt;AES128&gt;</td><td align="right">183.25us</td><td align="right">182.80us</td><td align="right">1272.73us</td><td align="right">284</td></tr>
-<tr><td>GCM&lt;AES192&gt;</td><td align="right">189.92us</td><td align="right">189.47us</td><td align="right">1492.60us</td><td align="right">316</td></tr>
-<tr><td>GCM&lt;AES256&gt;</td><td align="right">196.59us</td><td align="right">196.13us</td><td align="right">1767.33us</td><td align="right">348</td></tr>
-<tr><td>EAX&lt;AES128&gt;</td><td align="right">71.14us</td><td align="right">71.14us</td><td align="right">1329.44us</td><td align="right">268</td></tr>
-<tr><td>EAX&lt;Speck&gt; (128-bit key)</td><td align="right">26.01us</td><td align="right">26.01us</td><td align="right">735.46us</td><td align="right">362</td></tr>
-<tr><td>EAX&lt;SpeckLowMemory&gt; (128-bit key)</td><td align="right">75.08us</td><td align="right">75.07us</td><td align="right">1243.66us</td><td align="right">122</td></tr>
+<tr><td>GCM&lt;AES128&gt;</td><td align="right">109.71us</td><td align="right">109.26us</td><td align="right">1265.69us</td><td align="right">284</td></tr>
+<tr><td>GCM&lt;AES192&gt;</td><td align="right">116.38us</td><td align="right">115.92us</td><td align="right">1485.56us</td><td align="right">316</td></tr>
+<tr><td>GCM&lt;AES256&gt;</td><td align="right">123.04us</td><td align="right">122.59us</td><td align="right">1760.28us</td><td align="right">348</td></tr>
+<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">87.78us</td><td align="right">87.32us</td><td align="right">714.41us</td><td align="right">378</td></tr>
+<tr><td>GCM&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">114.30us</td><td align="right">113.84us</td><td align="right">1270.32us</td><td align="right">138</td></tr>
+<tr><td>EAX&lt;AES128&gt;</td><td align="right">71.14us</td><td align="right">71.14us</td><td align="right">1311.97us</td><td align="right">268</td></tr>
+<tr><td>EAX&lt;AES256&gt;</td><td align="right">97.80us</td><td align="right">97.80us</td><td align="right">1806.57us</td><td align="right">332</td></tr>
+<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">27.27us</td><td align="right">27.26us</td><td align="right">760.74us</td><td align="right">362</td></tr>
+<tr><td>EAX&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">80.31us</td><td align="right">80.31us</td><td align="right">1316.60us</td><td align="right">122</td></tr>
 <tr><td colspan="5"> </td></tr>
 <tr><td>Hash Algorithm</td><td align="right">Hashing (per byte)</td><td align="right">Finalization</td><td> </td><td>State Size (bytes)</td></tr>
 <tr><td>SHA256</td><td align="right">43.85us</td><td align="right">2841.04us</td><td align="right"> </td><td align="right">107</td></tr>
@@ -105,7 +108,7 @@ Arduino Mega 2560 running at 16 MHz are similar:
 <tr><td>SHA256 (HMAC mode)</td><td align="right">43.85us</td><td align="right">8552.61us</td><td align="right">2836.49us</td><td align="right">107</td></tr>
 <tr><td>BLAKE2s (HMAC mode)</td><td align="right">20.65us</td><td align="right">4055.56us</td><td align="right">1350.00us</td><td align="right">107</td></tr>
 <tr><td>Poly1305</td><td align="right">26.26us</td><td align="right">489.11us</td><td align="right">17.06us</td><td align="right">53</td></tr>
-<tr><td>GHASH</td><td align="right">148.14us</td><td align="right">17.09us</td><td align="right">21.87us</td><td align="right">33</td></tr>
+<tr><td>GHASH</td><td align="right">74.59us</td><td align="right">15.91us</td><td align="right">14.79us</td><td align="right">33</td></tr>
 <tr><td colspan="5"> </td></tr>
 <tr><td>Public Key Operation</td><td align="right">Time (per operation)</td><td colspan="3">Comment</td></tr>
 <tr><td>Curve25519::eval()</td><td align="right">3119ms</td><td colspan="3">Raw curve evaluation</td></tr>
@@ -141,12 +144,15 @@ All figures are for the Arduino Due running at 84 MHz:
 <tr><td colspan="5"> </td></tr>
 <tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
 <tr><td>ChaChaPoly</td><td align="right">1.71us</td><td align="right">1.71us</td><td align="right">45.08us</td><td align="right">240</td></tr>
-<tr><td>GCM&lt;AES128&gt;</td><td align="right">10.29us</td><td align="right">10.29us</td><td align="right">223.82us</td><td align="right">312</td></tr>
-<tr><td>GCM&lt;AES192&gt;</td><td align="right">11.50us</td><td align="right">11.51us</td><td align="right">265.62us</td><td align="right">344</td></tr>
-<tr><td>GCM&lt;AES256&gt;</td><td align="right">12.67us</td><td align="right">12.67us</td><td align="right">313.06us</td><td align="right">376</td></tr>
-<tr><td>EAX&lt;AES128&gt;</td><td align="right">12.29us</td><td align="right">12.29us</td><td align="right">236.47us</td><td align="right">280</td></tr>
-<tr><td>EAX&lt;Speck&gt; (128-bit key)</td><td align="right">2.65us</td><td align="right">2.65us</td><td align="right">79.46us</td><td align="right">384</td></tr>
-<tr><td>EAX&lt;SpeckLowMemory&gt; (128-bit key)</td><td align="right">6.29us</td><td align="right">6.29us</td><td align="right">106.60us</td><td align="right">144</td></tr>
+<tr><td>GCM&lt;AES128&gt;</td><td align="right">10.90us</td><td align="right">10.90us</td><td align="right">248.83us</td><td align="right">312</td></tr>
+<tr><td>GCM&lt;AES192&gt;</td><td align="right">12.30us</td><td align="right">12.31us</td><td align="right">296.83us</td><td align="right">344</td></tr>
+<tr><td>GCM&lt;AES256&gt;</td><td align="right">13.66us</td><td align="right">13.67us</td><td align="right">350.25us</td><td align="right">376</td></tr>
+<tr><td>GCM&lt;Speck&gt; (256-bit key)</td><td align="right">5.27us</td><td align="right">5.28us</td><td align="right">75.31us</td><td align="right">408</td></tr>
+<tr><td>GCM&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">7.06us</td><td align="right">7.07us</td><td align="right">94.20us</td><td align="right">168</td></tr>
+<tr><td>EAX&lt;AES128&gt;</td><td align="right">12.33us</td><td align="right">12.33us</td><td align="right">234.91us</td><td align="right">280</td></tr>
+<tr><td>EAX&lt;AES256&gt;</td><td align="right">16.99us</td><td align="right">16.99us</td><td align="right">322.92us</td><td align="right">344</td></tr>
+<tr><td>EAX&lt;Speck&gt; (256-bit key)</td><td align="right">2.80us</td><td align="right">2.80us</td><td align="right">81.63us</td><td align="right">384</td></tr>
+<tr><td>EAX&lt;SpeckLowMemory&gt; (256-bit key)</td><td align="right">6.69us</td><td align="right">6.69us</td><td align="right">110.91us</td><td align="right">144</td></tr>
 <tr><td colspan="5"> </td></tr>
 <tr><td>Hash Algorithm</td><td align="right">Hashing (per byte)</td><td align="right">Finalization</td><td> </td><td>State Size (bytes)</td></tr>
 <tr><td>SHA256</td><td align="right">1.15us</td><td align="right">76.60us</td><td align="right"> </td><td align="right">120</td></tr>
@@ -160,7 +166,7 @@ All figures are for the Arduino Due running at 84 MHz:
 <tr><td>SHA256 (HMAC mode)</td><td align="right">1.15us</td><td align="right">238.98us</td><td align="right">80.44us</td><td align="right">120</td></tr>
 <tr><td>BLAKE2s (HMAC mode)</td><td align="right">0.72us</td><td align="right">157.75us</td><td align="right">57.18us</td><td align="right">120</td></tr>
 <tr><td>Poly1305</td><td align="right">0.81us</td><td align="right">19.01us</td><td align="right">2.57us</td><td align="right">60</td></tr>
-<tr><td>GHASH</td><td align="right">4.37us</td><td align="right">1.50us</td><td align="right">4.37us</td><td align="right">36</td></tr>
+<tr><td>GHASH</td><td align="right">4.47us</td><td align="right">1.52us</td><td align="right">2.60us</td><td align="right">36</td></tr>
 <tr><td colspan="5"> </td></tr>
 <tr><td>Public Key Operation</td><td align="right">Time (per operation)</td><td colspan="3">Comment</td></tr>
 <tr><td>Curve25519::eval()</td><td align="right">103ms</td><td colspan="3">Raw curve evaluation</td></tr>
--- a/libraries/Crypto/EAX.cpp
+++ b/libraries/Crypto/EAX.cpp
@@ -21,6 +21,7 @@
 */

 #include "EAX.h"
+#include "GF128.h"
 #include "Crypto.h"
 #include <string.h>

@@ -145,19 +146,6 @@ void EAXCommon::clear()
    clean(state);
 }

-// Doubles a 128-bit value in the GF(2^128) field.
-static void gfDouble(uint8_t value[16])
-{
-    uint16_t temp = 0;
-    for (uint8_t index = 16; index > 0; ) {
-        --index;
-        temp |= (((uint16_t)(value[index])) << 1);
-        value[index] = (uint8_t)temp;
-        temp >>= 8;
-    }
-    value[15] ^= (uint8_t)((-temp) & 0x87);
-}
-
 /**
 * \brief Initialises the first OMAC hashing context and creates the B value.
 *
@@ -175,7 +163,7 @@ void EAXCommon::omacInitFirst(uint8_t omac[16])
    // Generate the B value from the encrypted block of zeroes.
    // We will need this later when finalising the OMAC hashes.
    memcpy(state.b, omac, 16);
-    gfDouble(state.b);
+    GF128::dblEAX(state.b);
 }

 /**
@@ -230,17 +218,17 @@ void EAXCommon::omacFinal(uint8_t omac[16])
    // Apply padding if necessary.
    if (state.authPosn != 16) {
        // Need padding: XOR with P = 2 * B.
-        uint8_t p[16];
+        uint32_t p[4];
        memcpy(p, state.b, 16);
-        gfDouble(p);
+        GF128::dblEAX(p);
        omac[state.authPosn] ^= 0x80;
        for (uint8_t index = 0; index < 16; ++index)
-            omac[index] ^= p[index];
+            omac[index] ^= ((const uint8_t *)p)[index];
        clean(p);
    } else {
        // No padding necessary: XOR with B.
        for (uint8_t index = 0; index < 16; ++index)
-            omac[index] ^= state.b[index];
+            omac[index] ^= ((const uint8_t *)(state.b))[index];
    }

    // Encrypt the hash to get the final OMAC value.
--- a/libraries/Crypto/EAX.h
+++ b/libraries/Crypto/EAX.h
@@ -59,7 +59,7 @@ private:
        uint8_t stream[16];
        uint8_t tag[16];
        uint8_t hash[16];
-        uint8_t b[16];
+        uint32_t b[4];
        uint8_t encPosn;
        uint8_t authPosn;
        uint8_t authMode;
--- a/libraries/Crypto/GF128.cpp
+++ b/libraries/Crypto/GF128.cpp
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "GF128.h"
+#include "utility/EndianUtil.h"
+#include <string.h>
+
+/**
+ * \class GF128 GF128.h <GF128.h>
+ * \brief Operations in the Galois field GF(2^128).
+ *
+ * This class contains helper functions for performing operations in
+ * the Galois field GF(2^128) which is used as the basis of GCM and GHASH.
+ * These functions are provided for use by other cryptographic protocols
+ * that make use of GF(2^128).
+ *
+ * Most of the functions in this class use the field, polynomial, and
+ * byte ordering conventions described in NIST SP 800-38D (GCM).  The one
+ * exception is dblEAX() which uses the conventions of EAX mode instead.
+ *
+ * References: <a href="http://csrc.nist.gov/publications/nistpubs/800-38D/SP-800-38D.pdf">NIST SP 800-38D</a>
+ *
+ * \sa GCM, GHASH
+ */
+
+/**
+ * \brief Initialize multiplication in the GF(2^128) field.
+ *
+ * \param H The hash state to be initialized.
+ * \param key Points to the 16 byte authentication key which is assumed
+ * to be in big-endian byte order.
+ *
+ * This function and the companion mul() are intended for use by other
+ * classes that need access to the raw GF(2^128) field multiplication of
+ * GHASH without the overhead of GHASH itself.
+ *
+ * \sa mul(), dbl()
+ */
+void GF128::mulInit(uint32_t H[4], const void *key)
+{
+    // Take a verbatim copy of the 16 key bytes (big-endian order).
+    memcpy(H, key, 16);
+
+    // AVR keeps H in big-endian byte order because mul() corrects for
+    // the byte order itself, and big-endian hosts already have the
+    // words in host order after the copy.  Only little-endian non-AVR
+    // targets need each 32-bit word converted to host order here.
+    //
+    // The resulting layout of H is therefore platform-specific; it is
+    // only meaningful as the second argument to mul().
+#if !defined(__AVR__) && defined(CRYPTO_LITTLE_ENDIAN)
+    for (uint8_t word = 0; word < 4; ++word)
+        H[word] = be32toh(H[word]);
+#endif
+}
+
+/**
+ * \brief Perform a multiplication in the GF(2^128) field.
+ *
+ * \param Y The first value to multiply, and the result.  This array is
+ * assumed to be in big-endian order on entry and exit.
+ * \param H The second value to multiply, which must have been initialized
+ * by the mulInit() function.
+ *
+ * This function and the companion mulInit() are intended for use by other
+ * classes that need access to the raw GF(2^128) field multiplication of
+ * GHASH without the overhead of GHASH itself.
+ *
+ * \sa mulInit(), dbl()
+ */
+void GF128::mul(uint32_t Y[4], const uint32_t H[4])
+{
+#if defined(__AVR__)
+    uint32_t Z[4] = {0, 0, 0, 0};   // Z = 0
+    uint32_t V0 = H[0];             // V = H
+    uint32_t V1 = H[1];
+    uint32_t V2 = H[2];
+    uint32_t V3 = H[3];
+
+    // Multiply Z by V for the set bits in Y, starting at the top.
+    // This is a very simple bit by bit version that may not be very
+    // fast but it should be resistant to cache timing attacks.
+    for (uint8_t posn = 0; posn < 16; ++posn) {
+        uint8_t value = ((const uint8_t *)Y)[posn];
+        for (uint8_t bit = 0; bit < 8; ++bit) {
+            __asm__ __volatile__ (
+                // Shift the top bit of "value" out and turn it into a mask.
+                "ldd r24,%8\n"
+                "lsl r24\n"
+                "std %8,r24\n"
+                "mov __tmp_reg__,__zero_reg__\n"
+                "sbc __tmp_reg__,__zero_reg__\n"
+
+                // XOR V with Z if the bit is 1.
+                "mov r24,%D0\n"         // Z0 ^= (V0 & mask)
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%D4\n"
+                "eor r25,r24\n"
+                "std %D4,r25\n"
+                "mov r24,%C0\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%C4\n"
+                "eor r25,r24\n"
+                "std %C4,r25\n"
+                "mov r24,%B0\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%B4\n"
+                "eor r25,r24\n"
+                "std %B4,r25\n"
+                "mov r24,%A0\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%A4\n"
+                "eor r25,r24\n"
+                "std %A4,r25\n"
+                "mov r24,%D1\n"         // Z1 ^= (V1 & mask)
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%D5\n"
+                "eor r25,r24\n"
+                "std %D5,r25\n"
+                "mov r24,%C1\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%C5\n"
+                "eor r25,r24\n"
+                "std %C5,r25\n"
+                "mov r24,%B1\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%B5\n"
+                "eor r25,r24\n"
+                "std %B5,r25\n"
+                "mov r24,%A1\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%A5\n"
+                "eor r25,r24\n"
+                "std %A5,r25\n"
+                "mov r24,%D2\n"         // Z2 ^= (V2 & mask)
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%D6\n"
+                "eor r25,r24\n"
+                "std %D6,r25\n"
+                "mov r24,%C2\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%C6\n"
+                "eor r25,r24\n"
+                "std %C6,r25\n"
+                "mov r24,%B2\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%B6\n"
+                "eor r25,r24\n"
+                "std %B6,r25\n"
+                "mov r24,%A2\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%A6\n"
+                "eor r25,r24\n"
+                "std %A6,r25\n"
+                "mov r24,%D3\n"         // Z3 ^= (V3 & mask)
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%D7\n"
+                "eor r25,r24\n"
+                "std %D7,r25\n"
+                "mov r24,%C3\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%C7\n"
+                "eor r25,r24\n"
+                "std %C7,r25\n"
+                "mov r24,%B3\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%B7\n"
+                "eor r25,r24\n"
+                "std %B7,r25\n"
+                "mov r24,%A3\n"
+                "and r24,__tmp_reg__\n"
+                "ldd r25,%A7\n"
+                "eor r25,r24\n"
+                "std %A7,r25\n"
+
+                // Shift V right by 1 bit; fold the lost bit back in as 0xE1.
+                "lsr %A0\n"
+                "ror %B0\n"
+                "ror %C0\n"
+                "ror %D0\n"
+                "ror %A1\n"
+                "ror %B1\n"
+                "ror %C1\n"
+                "ror %D1\n"
+                "ror %A2\n"
+                "ror %B2\n"
+                "ror %C2\n"
+                "ror %D2\n"
+                "ror %A3\n"
+                "ror %B3\n"
+                "ror %C3\n"
+                "ror %D3\n"
+                "mov r24,__zero_reg__\n"
+                "sbc r24,__zero_reg__\n"
+                "andi r24,0xE1\n"
+                "eor %A0,r24\n"
+                : "+r"(V0), "+r"(V1), "+r"(V2), "+r"(V3)
+                : "Q"(Z[0]), "Q"(Z[1]), "Q"(Z[2]), "Q"(Z[3]), "Q"(value)
+                : "r24", "r25", "memory"    // Z[] and value are stored to
+            );
+        }
+    }
+
+    // Copy Z into Y; Z is already in big-endian byte order, so no swap.
+    __asm__ __volatile__ (
+        "ldd __tmp_reg__,%A1\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%B1\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%C1\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%D1\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%A2\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%B2\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%C2\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%D2\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%A3\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%B3\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%C3\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%D3\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%A4\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%B4\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%C4\n"
+        "st X+,__tmp_reg__\n"
+        "ldd __tmp_reg__,%D4\n"
+        "st X,__tmp_reg__\n"
+        : "+x"(Y) : "Q"(Z[0]), "Q"(Z[1]), "Q"(Z[2]), "Q"(Z[3]) : "memory"
+    );
+#else // !__AVR__
+    uint32_t Z0 = 0;        // Z = 0
+    uint32_t Z1 = 0;
+    uint32_t Z2 = 0;
+    uint32_t Z3 = 0;
+    uint32_t V0 = H[0];     // V = H
+    uint32_t V1 = H[1];
+    uint32_t V2 = H[2];
+    uint32_t V3 = H[3];
+
+    // Multiply Z by V for the set bits in Y, starting at the top.
+    // This is a very simple bit by bit version that may not be very
+    // fast but it should be resistant to cache timing attacks.
+    for (uint8_t posn = 0; posn < 16; ++posn) {
+        uint8_t value = ((const uint8_t *)Y)[posn];
+        for (uint8_t bit = 0; bit < 8; ++bit, value <<= 1) {
+            // Extract the high bit of "value" and turn it into a mask.
+            uint32_t mask = (~((uint32_t)(value >> 7))) + 1;
+
+            // XOR V with Z if the bit is 1.
+            Z0 ^= (V0 & mask);
+            Z1 ^= (V1 & mask);
+            Z2 ^= (V2 & mask);
+            Z3 ^= (V3 & mask);
+
+            // Rotate V right by 1 bit.
+            mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
+            V3 = (V3 >> 1) | (V2 << 31);
+            V2 = (V2 >> 1) | (V1 << 31);
+            V1 = (V1 >> 1) | (V0 << 31);
+            V0 = (V0 >> 1) ^ mask;
+        }
+    }
+
+    // We have finished the block so copy Z into Y and byte-swap.
+    Y[0] = htobe32(Z0);
+    Y[1] = htobe32(Z1);
+    Y[2] = htobe32(Z2);
+    Y[3] = htobe32(Z3);
+#endif // !__AVR__
+}
+
+/**
+ * \brief Doubles a value in the GF(2^128) field.
+ *
+ * \param V The value to double, and the result.  This array is
+ * assumed to be in big-endian order on entry and exit.
+ *
+ * Block cipher modes such as <a href="https://en.wikipedia.org/wiki/Disk_encryption_theory#Xor-encrypt-xor_.28XEX.29">XEX</a>
+ * are similar to CTR mode but instead of incrementing the nonce every
+ * block, the modes multiply the nonce by 2 in the GF(2^128) field every
+ * block.  This function is provided to help with implementing such modes.
+ *
+ * \sa dblEAX(), mul()
+ */
+void GF128::dbl(uint32_t V[4])
+{
+#if defined(__AVR__)
+    __asm__ __volatile__ (
+        "ld r16,Z\n"            // byte 0 stays in r16 until the very end
+        "ldd r17,Z+1\n"
+        "ldd r18,Z+2\n"
+        "ldd r19,Z+3\n"
+        "lsr r16\n"             // shift right by one bit, byte 0 first
+        "ror r17\n"
+        "ror r18\n"
+        "ror r19\n"
+        "std Z+1,r17\n"
+        "std Z+2,r18\n"
+        "std Z+3,r19\n"
+        "ldd r17,Z+4\n"
+        "ldd r18,Z+5\n"
+        "ldd r19,Z+6\n"
+        "ldd r20,Z+7\n"
+        "ror r17\n"
+        "ror r18\n"
+        "ror r19\n"
+        "ror r20\n"
+        "std Z+4,r17\n"
+        "std Z+5,r18\n"
+        "std Z+6,r19\n"
+        "std Z+7,r20\n"
+        "ldd r17,Z+8\n"
+        "ldd r18,Z+9\n"
+        "ldd r19,Z+10\n"
+        "ldd r20,Z+11\n"
+        "ror r17\n"
+        "ror r18\n"
+        "ror r19\n"
+        "ror r20\n"
+        "std Z+8,r17\n"
+        "std Z+9,r18\n"
+        "std Z+10,r19\n"
+        "std Z+11,r20\n"
+        "ldd r17,Z+12\n"
+        "ldd r18,Z+13\n"
+        "ldd r19,Z+14\n"
+        "ldd r20,Z+15\n"
+        "ror r17\n"
+        "ror r18\n"
+        "ror r19\n"
+        "ror r20\n"
+        "std Z+12,r17\n"
+        "std Z+13,r18\n"
+        "std Z+14,r19\n"
+        "std Z+15,r20\n"
+        "mov r17,__zero_reg__\n"    // carry (bit shifted out) -> 0x00/0xFF
+        "sbc r17,__zero_reg__\n"
+        "andi r17,0xE1\n"           // fold the lost bit back into byte 0
+        "eor r16,r17\n"
+        "st Z,r16\n"
+        : : "z"(V)
+        : "r16", "r17", "r18", "r19", "r20", "memory"   // asm rewrites V[]
+    );
+#else
+    uint32_t V0 = be32toh(V[0]);
+    uint32_t V1 = be32toh(V[1]);
+    uint32_t V2 = be32toh(V[2]);
+    uint32_t V3 = be32toh(V[3]);
+    uint32_t mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
+    V3 = (V3 >> 1) | (V2 << 31);
+    V2 = (V2 >> 1) | (V1 << 31);
+    V1 = (V1 >> 1) | (V0 << 31);
+    V0 = (V0 >> 1) ^ mask;
+    V[0] = htobe32(V0);
+    V[1] = htobe32(V1);
+    V[2] = htobe32(V2);
+    V[3] = htobe32(V3);
+#endif
+}
+
+/**
+ * \brief Doubles a value in the GF(2^128) field using EAX conventions.
+ *
+ * \param V The value to double, and the result.  This array is
+ * assumed to be in big-endian order on entry and exit.
+ *
+ * This function differs from dbl() in that it uses the conventions of EAX mode
+ * instead of those of NIST SP 800-38D (GCM).  The two operations have
+ * equivalent security but the bits are ordered differently with the
+ * value shifted left instead of right.
+ *
+ * References: https://en.wikipedia.org/wiki/EAX_mode,
+ * http://web.cs.ucdavis.edu/~rogaway/papers/eax.html
+ *
+ * \sa dbl(), mul()
+ */
+void GF128::dblEAX(uint32_t V[4])
+{
+#if defined(__AVR__)
+    __asm__ __volatile__ (
+        "ldd r16,Z+15\n"        // byte 15 stays in r16 until the very end
+        "ldd r17,Z+14\n"
+        "ldd r18,Z+13\n"
+        "ldd r19,Z+12\n"
+        "lsl r16\n"             // shift left by one bit, byte 15 first
+        "rol r17\n"
+        "rol r18\n"
+        "rol r19\n"
+        "std Z+14,r17\n"
+        "std Z+13,r18\n"
+        "std Z+12,r19\n"
+        "ldd r17,Z+11\n"
+        "ldd r18,Z+10\n"
+        "ldd r19,Z+9\n"
+        "ldd r20,Z+8\n"
+        "rol r17\n"
+        "rol r18\n"
+        "rol r19\n"
+        "rol r20\n"
+        "std Z+11,r17\n"
+        "std Z+10,r18\n"
+        "std Z+9,r19\n"
+        "std Z+8,r20\n"
+        "ldd r17,Z+7\n"
+        "ldd r18,Z+6\n"
+        "ldd r19,Z+5\n"
+        "ldd r20,Z+4\n"
+        "rol r17\n"
+        "rol r18\n"
+        "rol r19\n"
+        "rol r20\n"
+        "std Z+7,r17\n"
+        "std Z+6,r18\n"
+        "std Z+5,r19\n"
+        "std Z+4,r20\n"
+        "ldd r17,Z+3\n"
+        "ldd r18,Z+2\n"
+        "ldd r19,Z+1\n"
+        "ld r20,Z\n"
+        "rol r17\n"
+        "rol r18\n"
+        "rol r19\n"
+        "rol r20\n"
+        "std Z+3,r17\n"
+        "std Z+2,r18\n"
+        "std Z+1,r19\n"
+        "st Z,r20\n"
+        "mov r17,__zero_reg__\n"    // carry (bit shifted out) -> 0x00/0xFF
+        "sbc r17,__zero_reg__\n"
+        "andi r17,0x87\n"           // fold the lost bit back into byte 15
+        "eor r16,r17\n"
+        "std Z+15,r16\n"
+        : : "z"(V)
+        : "r16", "r17", "r18", "r19", "r20", "memory"   // asm rewrites V[]
+    );
+#else
+    uint32_t V0 = be32toh(V[0]);
+    uint32_t V1 = be32toh(V[1]);
+    uint32_t V2 = be32toh(V[2]);
+    uint32_t V3 = be32toh(V[3]);
+    uint32_t mask = ((~(V0 >> 31)) + 1) & 0x00000087;
+    V0 = (V0 << 1) | (V1 >> 31);
+    V1 = (V1 << 1) | (V2 >> 31);
+    V2 = (V2 << 1) | (V3 >> 31);
+    V3 = (V3 << 1) ^ mask;
+    V[0] = htobe32(V0);
+    V[1] = htobe32(V1);
+    V[2] = htobe32(V2);
+    V[3] = htobe32(V3);
+#endif
+}
--- a/libraries/Crypto/GF128.h
+++ b/libraries/Crypto/GF128.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CRYPTO_GF128_h
+#define CRYPTO_GF128_h
+
+#include <inttypes.h>
+
+class GF128
+{
+public:
+    static void mulInit(uint32_t H[4], const void *key);
+    static void mul(uint32_t Y[4], const uint32_t H[4]);
+    static void dbl(uint32_t V[4]);
+    static void dblEAX(uint32_t V[4]);
+
+private:
+    GF128() {}      // static-only helper class: never instantiated
+    ~GF128() {}
+};
+
+#endif
--- a/libraries/Crypto/GHASH.cpp
+++ b/libraries/Crypto/GHASH.cpp
@@ -21,8 +21,8 @@
 */

 #include "GHASH.h"
+#include "GF128.h"
 #include "Crypto.h"
-#include "utility/EndianUtil.h"
 #include <string.h>

 /**
@@ -66,16 +66,7 @@ GHASH::~GHASH()
 */
 void GHASH::reset(const void *key)
 {
-    // Copy the key into H and convert from big endian to host order.
-    memcpy(state.H, key, 16);
-#if defined(CRYPTO_LITTLE_ENDIAN)
-    state.H[0] = be32toh(state.H[0]);
-    state.H[1] = be32toh(state.H[1]);
-    state.H[2] = be32toh(state.H[2]);
-    state.H[3] = be32toh(state.H[3]);
-#endif
-
-    // Reset the hash.
+    GF128::mulInit(state.H, key);
    memset(state.Y, 0, sizeof(state.Y));
    state.posn = 0;
 }
@@ -106,7 +97,7 @@ void GHASH::update(const void *data, size_t len)
        len -= size;
        d += size;
        if (state.posn == 16) {
-            processChunk();
+            GF128::mul(state.Y, state.H);
            state.posn = 0;
        }
    }
@@ -148,7 +139,7 @@ void GHASH::pad()
    if (state.posn != 0) {
        // Padding involves XOR'ing the rest of state.Y with zeroes,
        // which does nothing.  Immediately process the next chunk.
-        processChunk();
+        GF128::mul(state.Y, state.H);
        state.posn = 0;
    }
 }
@@ -160,45 +151,3 @@ void GHASH::clear()
 {
    clean(state);
 }
-
-void GHASH::processChunk()
-{
-    uint32_t Z0 = 0;            // Z = 0
-    uint32_t Z1 = 0;
-    uint32_t Z2 = 0;
-    uint32_t Z3 = 0;
-    uint32_t V0 = state.H[0];   // V = H
-    uint32_t V1 = state.H[1];
-    uint32_t V2 = state.H[2];
-    uint32_t V3 = state.H[3];
-
-    // Multiply Z by V for the set bits in Y, starting at the top.
-    // This is a very simple bit by bit version that may not be very
-    // fast but it should be resistant to cache timing attacks.
-    for (uint8_t posn = 0; posn < 16; ++posn) {
-        uint8_t value = ((const uint8_t *)state.Y)[posn];
-        for (uint8_t bit = 0; bit < 8; ++bit, value <<= 1) {
-            // Extract the high bit of "value" and turn it into a mask.
-            uint32_t mask = (~((uint32_t)(value >> 7))) + 1;
-
-            // XOR V with Z if the bit is 1.
-            Z0 ^= (V0 & mask);
-            Z1 ^= (V1 & mask);
-            Z2 ^= (V2 & mask);
-            Z3 ^= (V3 & mask);
-
-            // Rotate V right by 1 bit.
-            mask = ((~(V3 & 0x01)) + 1) & 0xE1000000;
-            V3 = (V3 >> 1) | (V2 << 31);
-            V2 = (V2 >> 1) | (V1 << 31);
-            V1 = (V1 >> 1) | (V0 << 31);
-            V0 = (V0 >> 1) ^ mask;
-        }
-    }
-
-    // We have finished the block so copy Z into Y and byte-swap.
-    state.Y[0] = htobe32(Z0);
-    state.Y[1] = htobe32(Z1);
-    state.Y[2] = htobe32(Z2);
-    state.Y[3] = htobe32(Z3);
-}
--- a/libraries/Crypto/GHASH.h
+++ b/libraries/Crypto/GHASH.h
@@ -46,8 +46,6 @@ private:
        uint32_t Y[4];
        uint8_t posn;
    } state;
-
-    void processChunk();
 };

 #endif
--- a/libraries/Crypto/examples/TestEAX/TestEAX.ino
+++ b/libraries/Crypto/examples/TestEAX/TestEAX.ino
@@ -233,6 +233,7 @@ static TestVector const testVectorEAX10 PROGMEM = {
 TestVector testVector;

 EAX<AES128> *eax;
+EAX<AES256> *eax256;
 EAX<Speck> *eaxSpeck;
 EAX<SpeckLowMemory> *eaxSpeckLowMemory;

@@ -353,7 +354,7 @@ void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test

    start = micros();
    for (count = 0; count < 1000; ++count) {
-        cipher->setKey(test->key, 16);
+        cipher->setKey(test->key, cipher->keySize());
        cipher->setIV(test->iv, test->ivsize);
    }
    elapsed = micros() - start;
@@ -378,7 +379,7 @@ void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
    Serial.print(test->name);
    Serial.print(" Encrypt ... ");

-    cipher->setKey(test->key, 16);
+    cipher->setKey(test->key, cipher->keySize());
    cipher->setIV(test->iv, test->ivsize);
    start = micros();
    for (count = 0; count < 500; ++count) {
@@ -406,7 +407,7 @@ void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
    Serial.print(test->name);
    Serial.print(" Decrypt ... ");

-    cipher->setKey(test->key, 16);
+    cipher->setKey(test->key, cipher->keySize());
    cipher->setIV(test->iv, test->ivsize);
    start = micros();
    for (count = 0; count < 500; ++count) {
@@ -434,7 +435,7 @@ void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector
    Serial.print(test->name);
    Serial.print(" AddAuthData ... ");

-    cipher->setKey(test->key, 16);
+    cipher->setKey(test->key, cipher->keySize());
    cipher->setIV(test->iv, test->ivsize);
    start = micros();
    memset(buffer, 0xBA, 128);
@@ -463,7 +464,7 @@ void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *
    Serial.print(test->name);
    Serial.print(" ComputeTag ... ");

-    cipher->setKey(test->key, 16);
+    cipher->setKey(test->key, cipher->keySize());
    cipher->setIV(test->iv, test->ivsize);
    start = micros();
    for (count = 0; count < 1000; ++count) {
@@ -495,6 +496,8 @@ void setup()
    Serial.println("State Sizes:");
    Serial.print("EAX<AES128> ... ");
    Serial.println(sizeof(*eax));
+    Serial.print("EAX<AES256> ... ");
+    Serial.println(sizeof(*eax256));
    Serial.print("EAX<Speck> ... ");
    Serial.println(sizeof(*eaxSpeck));
    Serial.print("EAX<SpeckLowMemory> ... ");
@@ -520,6 +523,10 @@ void setup()
    perfCipher(eax, &testVectorEAX1, "AES-128");
    Serial.println();
    delete eax;
+    eax256 = new EAX<AES256>(); // dedicated AES-256 instance; eax is already deleted
+    perfCipher(eax256, &testVectorEAX1, "AES-256");
+    Serial.println();
+    delete eax256;
    eaxSpeck = new EAX<Speck>();
    perfCipher(eaxSpeck, &testVectorEAX1, "Speck");
    Serial.println();
--- a/libraries/Crypto/examples/TestGCM/TestGCM.ino
+++ b/libraries/Crypto/examples/TestGCM/TestGCM.ino
@@ -26,10 +26,19 @@ This example runs tests on the GCM implementation to verify correct behaviour.

 #include <Crypto.h>
 #include <AES.h>
+#include <Speck.h>
+#include <SpeckLowMemory.h>
 #include <GCM.h>
 #include <string.h>
 #include <avr/pgmspace.h>

+// There isn't enough memory to test both AES and Speck on the Uno,
+// so disable Speck testing on AVR platforms unless explicitly enabled.
+// When enabled, some of the AES tests are disabled to reclaim memory.
+#if defined(__AVR__)
+//#define TEST_SPECK 1
+#endif
+
 #define MAX_PLAINTEXT_LEN 64

 struct TestVector
@@ -65,6 +74,7 @@ static TestVector const testVectorGCM1 PROGMEM = {
    .tagsize     = 16,
    .ivsize      = 12
 };
+#ifndef TEST_SPECK
 static TestVector const testVectorGCM2 PROGMEM = {
    .name        = "AES-128 GCM #2",
    .key         = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -176,6 +186,7 @@ static TestVector const testVectorGCM5 PROGMEM = {
    .tagsize     = 16,
    .ivsize      = 8
 };
+#endif // !TEST_SPECK
 static TestVector const testVectorGCM10 PROGMEM = {
    .name        = "AES-192 GCM #10",
    .key         = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c,
@@ -249,6 +260,8 @@ TestVector testVector;
 GCM<AES128> *gcmaes128 = 0;
 GCM<AES192> *gcmaes192 = 0;
 GCM<AES256> *gcmaes256 = 0;
+GCM<Speck> *gcmspeck = 0;
+GCM<SpeckLowMemory> *gcmspecklm = 0;

 byte buffer[128];

@@ -348,7 +361,7 @@ void testCipher(AuthenticatedCipher *cipher, const struct TestVector *test)
        Serial.println("Failed");
 }

-void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test)
+void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
 {
    unsigned long start;
    unsigned long elapsed;
@@ -357,7 +370,7 @@ void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test
    memcpy_P(&testVector, test, sizeof(TestVector));
    test = &testVector;

-    Serial.print(test->name);
+    Serial.print(name);
    Serial.print(" SetKey ... ");

    start = micros();
@@ -373,7 +386,7 @@ void perfCipherSetKey(AuthenticatedCipher *cipher, const struct TestVector *test
    Serial.println(" per second");
 }

-void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *test)
+void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
 {
    unsigned long start;
    unsigned long elapsed;
@@ -382,7 +395,7 @@ void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
    memcpy_P(&testVector, test, sizeof(TestVector));
    test = &testVector;

-    Serial.print(test->name);
+    Serial.print(name);
    Serial.print(" Encrypt ... ");

    cipher->setKey(test->key, cipher->keySize());
@@ -399,7 +412,7 @@ void perfCipherEncrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
    Serial.println(" bytes per second");
 }

-void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *test)
+void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
 {
    unsigned long start;
    unsigned long elapsed;
@@ -408,7 +421,7 @@ void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
    memcpy_P(&testVector, test, sizeof(TestVector));
    test = &testVector;

-    Serial.print(test->name);
+    Serial.print(name);
    Serial.print(" Decrypt ... ");

    cipher->setKey(test->key, cipher->keySize());
@@ -425,7 +438,7 @@ void perfCipherDecrypt(AuthenticatedCipher *cipher, const struct TestVector *tes
    Serial.println(" bytes per second");
 }

-void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector *test)
+void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
 {
    unsigned long start;
    unsigned long elapsed;
@@ -434,7 +447,7 @@ void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector
    memcpy_P(&testVector, test, sizeof(TestVector));
    test = &testVector;

-    Serial.print(test->name);
+    Serial.print(name);
    Serial.print(" AddAuthData ... ");

    cipher->setKey(test->key, cipher->keySize());
@@ -452,7 +465,7 @@ void perfCipherAddAuthData(AuthenticatedCipher *cipher, const struct TestVector
    Serial.println(" bytes per second");
 }

-void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *test)
+void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
 {
    unsigned long start;
    unsigned long elapsed;
@@ -461,7 +474,7 @@ void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *
    memcpy_P(&testVector, test, sizeof(TestVector));
    test = &testVector;

-    Serial.print(test->name);
+    Serial.print(name);
    Serial.print(" ComputeTag ... ");

    cipher->setKey(test->key, cipher->keySize());
@@ -478,13 +491,13 @@ void perfCipherComputeTag(AuthenticatedCipher *cipher, const struct TestVector *
    Serial.println(" per second");
 }

-void perfCipher(AuthenticatedCipher *cipher, const struct TestVector *test)
+void perfCipher(AuthenticatedCipher *cipher, const struct TestVector *test, const char *name)
 {
-    perfCipherSetKey(cipher, test);
-    perfCipherEncrypt(cipher, test);
-    perfCipherDecrypt(cipher, test);
-    perfCipherAddAuthData(cipher, test);
-    perfCipherComputeTag(cipher, test);
+    perfCipherSetKey(cipher, test, name);
+    perfCipherEncrypt(cipher, test, name);
+    perfCipherDecrypt(cipher, test, name);
+    perfCipherAddAuthData(cipher, test, name);
+    perfCipherComputeTag(cipher, test, name);
 }

 void setup()
@@ -493,6 +506,7 @@ void setup()

    Serial.println();

+#ifndef TEST_SPECK
    Serial.println("State Sizes:");
    Serial.print("GCM<AES128> ... ");
    Serial.println(sizeof(*gcmaes128));
@@ -500,15 +514,22 @@ void setup()
    Serial.println(sizeof(*gcmaes192));
    Serial.print("GCM<AES256> ... ");
    Serial.println(sizeof(*gcmaes256));
+    Serial.print("GCM<Speck> ... ");
+    Serial.println(sizeof(*gcmspeck));
+    Serial.print("GCM<SpeckLowMemory> ... ");
+    Serial.println(sizeof(*gcmspecklm));
    Serial.println();
+#endif

    Serial.println("Test Vectors:");
    gcmaes128 = new GCM<AES128>();
    testCipher(gcmaes128, &testVectorGCM1);
+#ifndef TEST_SPECK
    testCipher(gcmaes128, &testVectorGCM2);
    testCipher(gcmaes128, &testVectorGCM3);
    testCipher(gcmaes128, &testVectorGCM4);
    testCipher(gcmaes128, &testVectorGCM5);
+#endif
    delete gcmaes128;
    gcmaes192 = new GCM<AES192>();
    testCipher(gcmaes192, &testVectorGCM10);
@@ -520,15 +541,25 @@ void setup()
    Serial.println();

    Serial.println("Performance Tests:");
+#ifndef TEST_SPECK
    gcmaes128 = new GCM<AES128>();
-    perfCipher(gcmaes128, &testVectorGCM1);
+    perfCipher(gcmaes128, &testVectorGCM1, testVectorGCM1.name);
    delete gcmaes128;
    gcmaes192 = new GCM<AES192>();
-    perfCipher(gcmaes192, &testVectorGCM10);
+    perfCipher(gcmaes192, &testVectorGCM10, testVectorGCM10.name);
    delete gcmaes192;
    gcmaes256 = new GCM<AES256>();
-    perfCipher(gcmaes256, &testVectorGCM16);
+    perfCipher(gcmaes256, &testVectorGCM16, testVectorGCM16.name);
    delete gcmaes256;
+#endif
+#if defined(TEST_SPECK) || !defined(__AVR__)
+    gcmspeck = new GCM<Speck>();
+    perfCipher(gcmspeck, &testVectorGCM16, "GCM-Speck-256");
+    delete gcmspeck;
+    gcmspecklm = new GCM<SpeckLowMemory>();
+    perfCipher(gcmspecklm, &testVectorGCM16, "GCM-SpeckLowMemory-256");
+    delete gcmspecklm;
+#endif
 }

 void loop()