From 9ff24b0ddff4329e89f9cfc387dcf19970c98f64 Mon Sep 17 00:00:00 2001
From: Rhys Weatherley <rhys.weatherley@gmail.com>
Date: Sat, 26 Mar 2016 19:03:31 +1000
Subject: [PATCH] Implementation of the NIST P-521 curve

---
 doc/crypto.dox                                |   14 +-
 doc/mainpage.dox                              |    2 +-
 host/Crypto/Makefile                          |    3 +
 libraries/Crypto/BigNumberUtil.cpp            |   19 +
 libraries/Crypto/BigNumberUtil.h              |    2 +
 libraries/Crypto/P521.cpp                     | 1642 +++++++++++++++++
 libraries/Crypto/P521.h                       |  112 ++
 .../Crypto/examples/TestP521/TestP521.ino     |  519 ++++++
 .../examples/TestP521Math/TestP521Math.ino    |  612 ++++++
 libraries/Crypto/utility/LimbUtil.h           |    8 +-
 10 files changed, 2928 insertions(+), 5 deletions(-)
 create mode 100644 libraries/Crypto/P521.cpp
 create mode 100644 libraries/Crypto/P521.h
 create mode 100644 libraries/Crypto/examples/TestP521/TestP521.ino
 create mode 100644 libraries/Crypto/examples/TestP521Math/TestP521Math.ino
diff --git a/doc/crypto.dox b/doc/crypto.dox
index 07af41c1..0529741f 100644
--- a/doc/crypto.dox
+++ b/doc/crypto.dox
@@ -33,7 +33,7 @@
 \li Hash algorithms: SHA256, SHA512, SHA3_256, SHA3_512, BLAKE2s, BLAKE2b (regular and HMAC modes)
 \li Extendable output functions (XOF's): SHAKE128, SHAKE256
 \li Message authenticators: Poly1305, GHASH, OMAC
-\li Public key algorithms: Curve25519, Ed25519
+\li Public key algorithms: Curve25519, Ed25519, P521
 \li Random number generation: \link RNGClass RNG\endlink, TransistorNoiseSource, RingOscillatorNoiseSource
 
 All cryptographic algorithms have been optimized for 8-bit Arduino platforms
@@ -129,6 +129,12 @@ Ardunino Mega 2560 running at 16 MHz are similar:
 <tr><td>Ed25519::sign()</td><td align="right">5148ms</td><td colspan="3">Digital signature generation</td></tr>
 <tr><td>Ed25519::verify()</td><td align="right">8196ms</td><td colspan="3">Digital signature verification</td></tr>
 <tr><td>Ed25519::derivePublicKey()</td><td align="right">5102ms</td><td colspan="3">Derive a public key from a private key</td></tr>
+<tr><td>P521::eval()</td><td align="right">46290ms</td><td colspan="3">Raw curve evaluation</td></tr>
+<tr><td>P521::dh1()</td><td align="right">46293ms</td><td colspan="3">First half of Diffie-Hellman key agreement</td></tr>
+<tr><td>P521::dh2()</td><td align="right">46304ms</td><td colspan="3">Second half of Diffie-Hellman key agreement</td></tr>
+<tr><td>P521::sign()</td><td align="right">60514ms</td><td colspan="3">Digital signature generation</td></tr>
+<tr><td>P521::verify()</td><td align="right">109078ms</td><td colspan="3">Digital signature verification</td></tr>
+<tr><td>P521::derivePublicKey()</td><td align="right">46290ms</td><td colspan="3">Derive a public key from a private key</td></tr>
 </table>
 
 Where a cipher supports more than one key size (such as ChaCha), the values
@@ -196,5 +202,11 @@ All figures are for the Arduino Due running at 84 MHz:
 <tr><td>Ed25519::sign()</td><td align="right">195ms</td><td colspan="3">Digital signature generation</td></tr>
 <tr><td>Ed25519::verify()</td><td align="right">306ms</td><td colspan="3">Digital signature verification</td></tr>
 <tr><td>Ed25519::derivePublicKey()</td><td align="right">194ms</td><td colspan="3">Derive a public key from a private key</td></tr>
+<tr><td>P521::eval()</td><td align="right">1503ms</td><td colspan="3">Raw curve evaluation</td></tr>
+<tr><td>P521::dh1()</td><td align="right">1503ms</td><td colspan="3">First half of Diffie-Hellman key agreement</td></tr>
+<tr><td>P521::dh2()</td><td align="right">1503ms</td><td colspan="3">Second half of Diffie-Hellman key agreement</td></tr>
+<tr><td>P521::sign()</td><td align="right">1860ms</td><td colspan="3">Digital signature generation</td></tr>
+<tr><td>P521::verify()</td><td align="right">3423ms</td><td colspan="3">Digital signature verification</td></tr>
+<tr><td>P521::derivePublicKey()</td><td align="right">1503ms</td><td colspan="3">Derive a public key from a private key</td></tr>
 </table>
 */
diff --git a/doc/mainpage.dox b/doc/mainpage.dox
index c8ed795f..3aa6b707 100644
--- a/doc/mainpage.dox
+++ b/doc/mainpage.dox
@@ -98,7 +98,7 @@ realtime clock and the LCD library to implement an alarm clock.
 \li Hash algorithms: SHA256, SHA512, SHA3_256, SHA3_512, BLAKE2s, BLAKE2b (regular and HMAC modes)
 \li Extendable output functions (XOF's): SHAKE128, SHAKE256
 \li Message authenticators: Poly1305, GHASH, OMAC
-\li Public key algorithms: Curve25519, Ed25519
+\li Public key algorithms: Curve25519, Ed25519, P521
 \li Random number generation: \link RNGClass RNG\endlink, TransistorNoiseSource, RingOscillatorNoiseSource
 
 More information can be found on the \ref crypto "Cryptographic Library" page.
diff --git a/host/Crypto/Makefile b/host/Crypto/Makefile
index bbffdf64..f3e28bd3 100644
--- a/host/Crypto/Makefile
+++ b/host/Crypto/Makefile
@@ -47,6 +47,7 @@ SOURCES =   \
 	NoiseSource.cpp \
 	OFB.cpp \
 	OMAC.cpp \
+        P521.cpp \
 	Poly1305.cpp \
         RNG_host.cpp \
 	SHA256.cpp \
@@ -76,6 +77,8 @@ SKETCHES = \
 	TestGCM/TestGCM.ino \
 	TestGHASH/TestGHASH.ino \
 	TestOFB/TestOFB.ino \
+	TestP521/TestP521.ino \
+	TestP521Math/TestP521Math.ino \
 	TestPoly1305/TestPoly1305.ino \
 	TestSHA256/TestSHA256.ino \
 	TestSHA3_256/TestSHA3_256.ino \
diff --git a/libraries/Crypto/BigNumberUtil.cpp b/libraries/Crypto/BigNumberUtil.cpp
index 1c413cb4..d942ab22 100644
--- a/libraries/Crypto/BigNumberUtil.cpp
+++ b/libraries/Crypto/BigNumberUtil.cpp
@@ -634,3 +634,22 @@ void BigNumberUtil::reduceQuick_P(limb_t *result, const limb_t *x,
         --size;
     }
 }
+
+/**
+ * \brief Determine if a big number is zero.
+ *
+ * \param x Points to the number to test.
+ * \param size The number of limbs in \a x.
+ * \return Returns 1 if \a x is zero or 0 otherwise.
+ *
+ * This function attempts to make the determination in constant time.
+ */
+limb_t BigNumberUtil::isZero(const limb_t *x, size_t size)
+{
+    limb_t word = 0;
+    while (size > 0) {
+        word |= *x++;
+        --size;
+    }
+    return (limb_t)(((((dlimb_t)1) << LIMB_BITS) - word) >> LIMB_BITS);
+}
diff --git a/libraries/Crypto/BigNumberUtil.h b/libraries/Crypto/BigNumberUtil.h
index 83f6d63f..2bf2724e 100644
--- a/libraries/Crypto/BigNumberUtil.h
+++ b/libraries/Crypto/BigNumberUtil.h
@@ -86,6 +86,8 @@ public:
     static void reduceQuick_P(limb_t *result, const limb_t *x,
                               const limb_t *y, size_t size);
 
+    static limb_t isZero(const limb_t *x, size_t size);
+
 private:
     // Constructor and destructor are private - cannot instantiate this class.
     BigNumberUtil() {}
diff --git a/libraries/Crypto/P521.cpp b/libraries/Crypto/P521.cpp
new file mode 100644
index 00000000..8ebeb2d0
--- /dev/null
+++ b/libraries/Crypto/P521.cpp
@@ -0,0 +1,1642 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "P521.h"
+#include "Crypto.h"
+#include "RNG.h"
+#include "SHA512.h"
+#include "utility/LimbUtil.h"
+#include <string.h>
+#include <Arduino.h>
+
+/**
+ * \class P521 P521.h <P521.h>
+ * \brief Elliptic curve operations with the NIST P-521 curve.
+ *
+ * This class supports both ECDH key exchange and ECDSA signatures.
+ *
+ * \note The public functions in this class need a substantial amount of
+ * stack space to store intermediate results while the curve function is
+ * being evaluated.  About 2k of free stack space is recommended for safety.
+ *
+ * References: NIST FIPS 186-4,
+ * <a href="http://tools.ietf.org/html/rfc6090">RFC 6090</a>,
+ * <a href="http://tools.ietf.org/html/rfc6979">RFC 6979</a>,
+ * <a href="http://tools.ietf.org/html/rfc6090">RFC 5903</a>
+ *
+ * \sa Curve25519
+ */
+
+// Number of limbs that are needed to represent a 521-bit number.
+#define NUM_LIMBS_521BIT    NUM_LIMBS_BITS(521)
+
+// Number of limbs that are needed to represent a 1042-bit number.
+// To simply things we also require that this be twice the size of
+// NUM_LIMB_521BIT which involves a little wastage at the high end
+// of one extra limb for 8-bit and 32-bit limbs.  There is no
+// wastage for 16-bit limbs.
+#define NUM_LIMBS_1042BIT   (NUM_LIMBS_BITS(521) * 2)
+
+// The overhead of clean() calls in mul(), etc can add up to a lot of
+// processing time.  Only do such cleanups if strict mode has been enabled.
+#if defined(P521_STRICT_CLEAN)
+#define strict_clean(x)     clean(x)
+#else
+#define strict_clean(x)     do { ; } while (0)
+#endif
+
+// Expand the partial 9-bit left over limb at the top of a 521-bit number.
+#if BIGNUMBER_LIMB_8BIT
+#define LIMB_PARTIAL(value) ((uint8_t)(value)), \
+                            ((uint8_t)((value) >> 8))
+#else
+#define LIMB_PARTIAL(value) (value)
+#endif
+
+/** @cond */
+
+// The group order "q" value from RFC 4754 and RFC 5903.  This is the
+// same as the "n" value from Appendix D.1.2.5 of NIST FIPS 186-4.
+static limb_t const P521_q[NUM_LIMBS_521BIT] PROGMEM = {
+    LIMB(0x91386409), LIMB(0xbb6fb71e), LIMB(0x899c47ae), LIMB(0x3bb5c9b8),
+    LIMB(0xf709a5d0), LIMB(0x7fcc0148), LIMB(0xbf2f966b), LIMB(0x51868783),
+    LIMB(0xfffffffa), LIMB(0xffffffff), LIMB(0xffffffff), LIMB(0xffffffff),
+    LIMB(0xffffffff), LIMB(0xffffffff), LIMB(0xffffffff), LIMB(0xffffffff),
+    LIMB_PARTIAL(0x1ff)
+};
+
+// The "b" value from Appendix D.1.2.5 of NIST FIPS 186-4.
+static limb_t const P521_b[NUM_LIMBS_521BIT] PROGMEM = {
+    LIMB(0x6b503f00), LIMB(0xef451fd4), LIMB(0x3d2c34f1), LIMB(0x3573df88),
+    LIMB(0x3bb1bf07), LIMB(0x1652c0bd), LIMB(0xec7e937b), LIMB(0x56193951),
+    LIMB(0x8ef109e1), LIMB(0xb8b48991), LIMB(0x99b315f3), LIMB(0xa2da725b),
+    LIMB(0xb68540ee), LIMB(0x929a21a0), LIMB(0x8e1c9a1f), LIMB(0x953eb961),
+    LIMB_PARTIAL(0x051)
+};
+
+// The "Gx" value from Appendix D.1.2.5 of NIST FIPS 186-4.
+static limb_t const P521_Gx[NUM_LIMBS_521BIT] PROGMEM = {
+    LIMB(0xc2e5bd66), LIMB(0xf97e7e31), LIMB(0x856a429b), LIMB(0x3348b3c1),
+    LIMB(0xa2ffa8de), LIMB(0xfe1dc127), LIMB(0xefe75928), LIMB(0xa14b5e77),
+    LIMB(0x6b4d3dba), LIMB(0xf828af60), LIMB(0x053fb521), LIMB(0x9c648139),
+    LIMB(0x2395b442), LIMB(0x9e3ecb66), LIMB(0x0404e9cd), LIMB(0x858e06b7),
+    LIMB_PARTIAL(0x0c6)
+};
+
+// The "Gy" value from Appendix D.1.2.5 of NIST FIPS 186-4.
+static limb_t const P521_Gy[NUM_LIMBS_521BIT] PROGMEM = {
+    LIMB(0x9fd16650), LIMB(0x88be9476), LIMB(0xa272c240), LIMB(0x353c7086),
+    LIMB(0x3fad0761), LIMB(0xc550b901), LIMB(0x5ef42640), LIMB(0x97ee7299),
+    LIMB(0x273e662c), LIMB(0x17afbd17), LIMB(0x579b4468), LIMB(0x98f54449),
+    LIMB(0x2c7d1bd9), LIMB(0x5c8a5fb4), LIMB(0x9a3bc004), LIMB(0x39296a78),
+    LIMB_PARTIAL(0x118)
+};
+
+/** @endcond */
+
+/**
+ * \brief Evaluates the curve function.
+ *
+ * \param result The result of applying the curve function, which consists
+ * of the x and y values of the result point encoded in big-endian order.
+ * \param f The scalar value to multiply by \a point to create the \a result.
+ * This is assumed to be be a 521-bit number in big-endian order.
+ * \param point The curve point to multiply consisting of the x and y
+ * values encoded in big-endian order.  If \a point is NULL, then the
+ * generator Gx and Gy values for the curve will be used instead.
+ *
+ * \return Returns true if \a f * \a point could be evaluated, or false if
+ * \a point is not a point on the curve.
+ *
+ * This function provides access to the raw curve operation for testing
+ * purposes.  Normally an application would use a higher-level function
+ * like dh1(), dh2(), sign(), or verify().
+ *
+ * \sa dh1(), sign()
+ */
+bool P521::eval(uint8_t result[132], const uint8_t f[66], const uint8_t point[132])
+{
+    limb_t x[NUM_LIMBS_521BIT];
+    limb_t y[NUM_LIMBS_521BIT];
+    bool ok;
+
+    // Unpack the curve point from the parameters and validate it.
+    if (point) {
+        BigNumberUtil::unpackBE(x, NUM_LIMBS_521BIT, point, 66);
+        BigNumberUtil::unpackBE(y, NUM_LIMBS_521BIT, point + 66, 66);
+        ok = validate(x, y);
+    } else {
+        memcpy_P(x, P521_Gx, sizeof(x));
+        memcpy_P(y, P521_Gy, sizeof(y));
+        ok = true;
+    }
+
+    // Evaluate the curve function.
+    evaluate(x, y, f);
+
+    // Pack the answer into the result array.
+    BigNumberUtil::packBE(result, 66, x, NUM_LIMBS_521BIT);
+    BigNumberUtil::packBE(result + 66, 66, y, NUM_LIMBS_521BIT);
+
+    // Clean up.
+    clean(x);
+    clean(y);
+    return ok;
+}
+
+/**
+ * \brief Performs phase 1 of an ECDH key exchange using P-521.
+ *
+ * \param k The key value to send to the other party as part of the exchange.
+ * \param f The generated secret value for this party.  This must not be
+ * transmitted to any party or stored in permanent storage.  It only needs
+ * to be kept in memory until dh2() is called.
+ *
+ * The \a f value is generated with \link RNGClass::rand() RNG.rand()\endlink.
+ * It is the caller's responsibility to ensure that the global random number
+ * pool has sufficient entropy to generate the 66 bytes of \a f safely
+ * before calling this function.
+ *
+ * The following example demonstrates how to perform a full ECDH
+ * key exchange using dh1() and dh2():
+ *
+ * \code
+ * uint8_t f[66];
+ * uint8_t k[132];
+ *
+ * // Generate the secret value "f" and the public value "k".
+ * P521::dh1(k, f);
+ *
+ * // Send "k" to the other party.
+ * ...
+ *
+ * // Read the "k" value that the other party sent to us.
+ * ...
+ *
+ * // Generate the shared secret in "f".
+ * if (!P521::dh2(k, f)) {
+ *     // The received "k" value was invalid - abort the session.
+ *     ...
+ * }
+ *
+ * // The "f" value can now be used to generate session keys for encryption.
+ * ...
+ * \endcode
+ *
+ * Reference: <a href="http://tools.ietf.org/html/rfc6090">RFC 6090</a>
+ *
+ * \sa dh2()
+ */
+void P521::dh1(uint8_t k[132], uint8_t f[66])
+{
+    generatePrivateKey(f);
+    derivePublicKey(k, f);
+}
+
+/**
+ * \brief Performs phase 2 of an ECDH key exchange using P-521.
+ *
+ * \param k The public key value that was received from the other
+ * party as part of the exchange.
+ * \param f On entry, this is the secret value for this party that was
+ * generated by dh1().  On exit, this will be the shared secret.
+ *
+ * \return Returns true if the key exchange was successful, or false if
+ * the \a k value is invalid.
+ *
+ * Reference: <a href="http://tools.ietf.org/html/rfc6090">RFC 6090</a>
+ *
+ * \sa dh1()
+ */
+bool P521::dh2(const uint8_t k[132], uint8_t f[66])
+{
+    // Unpack the (x, y) point from k.
+    limb_t x[NUM_LIMBS_521BIT];
+    limb_t y[NUM_LIMBS_521BIT];
+    BigNumberUtil::unpackBE(x, NUM_LIMBS_521BIT, k, 66);
+    BigNumberUtil::unpackBE(y, NUM_LIMBS_521BIT, k + 66, 66);
+
+    // Validate the curve point.  We keep going to preserve the timing.
+    bool ok = validate(x, y);
+
+    // Evaluate the curve function.
+    evaluate(x, y, f);
+
+    // The secret key is the x component of the final value.
+    BigNumberUtil::packBE(f, 66, x, NUM_LIMBS_521BIT);
+
+    // Clean up.
+    clean(x);
+    clean(y);
+    return ok;
+}
+
+/**
+ * \brief Signs a message using a specific P-521 private key.
+ *
+ * \param signature The signature value.
+ * \param privateKey The private key to use to sign the message.
+ * \param message Points to the message to be signed.
+ * \param len The length of the \a message to be signed.
+ * \param hash The hash algorithm to use to hash the \a message before signing.
+ * If \a hash is NULL, then the \a message is assumed to already be a hash
+ * value from some previous process.
+ *
+ * This function generates deterministic ECDSA signatures according to
+ * RFC 6979.  The \a hash function is used to generate the k value for
+ * the signature.  If \a hash is NULL, then SHA512 is used.
+ * The \a hash object must be capable of HMAC mode.
+ *
+ * The length of the hashed message must be less than or equal to 64
+ * bytes in size.  Longer messages will be truncated to 64 bytes.
+ *
+ * References: <a href="http://tools.ietf.org/html/rfc6090">RFC 6090</a>,
+ * <a href="http://tools.ietf.org/html/rfc6979">RFC 6979</a>
+ *
+ * \sa verify(), generatePrivateKey()
+ */
+void P521::sign(uint8_t signature[132], const uint8_t privateKey[66],
+                const void *message, size_t len, Hash *hash)
+{
+    uint8_t hm[66];
+    uint8_t k[66];
+    limb_t x[NUM_LIMBS_521BIT];
+    limb_t y[NUM_LIMBS_521BIT];
+    limb_t t[NUM_LIMBS_521BIT];
+    uint64_t count = 0;
+
+    // Format the incoming message, hashing it if necessary.
+    if (hash) {
+        // Hash the message.
+        hash->reset();
+        hash->update(message, len);
+        len = hash->hashSize();
+        if (len > 64)
+            len = 64;
+        memset(hm, 0, 66 - len);
+        hash->finalize(hm + 66 - len, len);
+    } else {
+        // The message is the hash.
+        if (len > 64)
+            len = 64;
+        memset(hm, 0, 66 - len);
+        memcpy(hm + 66 - len, message, len);
+    }
+
+    // Keep generating k values until both r and s are non-zero.
+    for (;;) {
+        // Generate the k value deterministically according to RFC 6979.
+        if (hash)
+            generateK(k, hm, privateKey, hash, count);
+        else
+            generateK(k, hm, privateKey, count);
+
+        // Generate r = kG.x mod q.
+        memcpy_P(x, P521_Gx, sizeof(x));
+        memcpy_P(y, P521_Gy, sizeof(y));
+        evaluate(x, y, k);
+        BigNumberUtil::reduceQuick_P(x, x, P521_q, NUM_LIMBS_521BIT);
+        BigNumberUtil::packBE(signature, 66, x, NUM_LIMBS_521BIT);
+
+        // If r is zero, then we need to generate a new k value.
+        // This is utterly improbable, but let's be safe anyway.
+        if (BigNumberUtil::isZero(x, NUM_LIMBS_521BIT)) {
+            ++count;
+            continue;
+        }
+
+        // Generate s = (privateKey * r + hm) / k mod q.
+        BigNumberUtil::unpackBE(y, NUM_LIMBS_521BIT, privateKey, 66);
+        mulQ(y, y, x);
+        BigNumberUtil::unpackBE(x, NUM_LIMBS_521BIT, hm, 66);
+        BigNumberUtil::add(x, x, y, NUM_LIMBS_521BIT);
+        BigNumberUtil::reduceQuick_P(x, x, P521_q, NUM_LIMBS_521BIT);
+        BigNumberUtil::unpackBE(y, NUM_LIMBS_521BIT, k, 66);
+        recipQ(t, y);
+        mulQ(x, x, t);
+        BigNumberUtil::packBE(signature + 66, 66, x, NUM_LIMBS_521BIT);
+
+        // Exit the loop if s is non-zero.
+        if (!BigNumberUtil::isZero(x, NUM_LIMBS_521BIT))
+            break;
+
+        // We need to generate a new k value according to RFC 6979.
+        // This is utterly improbable, but let's be safe anyway.
+        ++count;
+    }
+
+    // Clean up.
+    clean(hm);
+    clean(k);
+    clean(x);
+    clean(y);
+    clean(t);
+}
+
+/**
+ * \brief Verifies a signature using a specific P-521 public key.
+ *
+ * \param signature The signature value to be verified.
+ * \param publicKey The public key to use to verify the signature.
+ * \param message The message whose signature is to be verified.
+ * \param len The length of the \a message to be verified.
+ * \param hash The hash algorithm to use to hash the \a message before
+ * verification.  If \a hash is NULL, then the \a message is assumed to
+ * already be a hash value from some previous process.
+ *
+ * The length of the hashed message must be less than or equal to 64
+ * bytes in size.  Longer messages will be truncated to 64 bytes.
+ *
+ * \return Returns true if the \a signature is valid for \a message;
+ * or false if the \a publicKey or \a signature is not valid.
+ *
+ * \sa sign()
+ */
+bool P521::verify(const uint8_t signature[132],
+                  const uint8_t publicKey[132],
+                  const void *message, size_t len, Hash *hash)
+{
+    limb_t x[NUM_LIMBS_521BIT];
+    limb_t y[NUM_LIMBS_521BIT];
+    limb_t r[NUM_LIMBS_521BIT];
+    limb_t s[NUM_LIMBS_521BIT];
+    limb_t u1[NUM_LIMBS_521BIT];
+    limb_t u2[NUM_LIMBS_521BIT];
+    uint8_t t[66];
+    bool ok = false;
+
+    // Because we are operating on public values, we don't need to
+    // be as strict about constant time.  Bail out early if there
+    // is a problem with the parameters.
+
+    // Unpack the signature.  The values must be between 1 and q - 1.
+    BigNumberUtil::unpackBE(r, NUM_LIMBS_521BIT, signature, 66);
+    BigNumberUtil::unpackBE(s, NUM_LIMBS_521BIT, signature + 66, 66);
+    if (BigNumberUtil::isZero(r, NUM_LIMBS_521BIT) ||
+            BigNumberUtil::isZero(s, NUM_LIMBS_521BIT) ||
+            !BigNumberUtil::sub_P(x, r, P521_q, NUM_LIMBS_521BIT) ||
+            !BigNumberUtil::sub_P(x, s, P521_q, NUM_LIMBS_521BIT)) {
+        goto failed;
+    }
+
+    // Unpack the public key and check that it is a valid curve point.
+    BigNumberUtil::unpackBE(x, NUM_LIMBS_521BIT, publicKey, 66);
+    BigNumberUtil::unpackBE(y, NUM_LIMBS_521BIT, publicKey + 66, 66);
+    if (!validate(x, y)) {
+        goto failed;
+    }
+
+    // Hash the message to generate hm, which we store into u1.
+    if (hash) {
+        // Hash the message.
+        hash->reset();
+        hash->update(message, len);
+        len = hash->hashSize();
+        if (len > 64)
+            len = 64;
+        hash->finalize(u2, len);
+        BigNumberUtil::unpackBE(u1, NUM_LIMBS_521BIT, (uint8_t *)u2, len);
+    } else {
+        // The message is the hash.
+        if (len > 64)
+            len = 64;
+        BigNumberUtil::unpackBE(u1, NUM_LIMBS_521BIT, (uint8_t *)message, len);
+    }
+
+    // Compute u1 = hm * s^-1 mod q and u2 = r * s^-1 mod q.
+    recipQ(u2, s);
+    mulQ(u1, u1, u2);
+    mulQ(u2, r, u2);
+
+    // Compute the curve point R = u2 * publicKey + u1 * G.
+    BigNumberUtil::packBE(t, 66, u2, NUM_LIMBS_521BIT);
+    evaluate(x, y, t);
+    memcpy_P(u2, P521_Gx, sizeof(x));
+    memcpy_P(s, P521_Gy, sizeof(y));
+    BigNumberUtil::packBE(t, 66, u1, NUM_LIMBS_521BIT);
+    evaluate(u2, s, t);
+    addAffine(u2, s, x, y);
+
+    // If R.x = r mod q, then the signature is valid.
+    BigNumberUtil::reduceQuick_P(u1, u2, P521_q, NUM_LIMBS_521BIT);
+    ok = secure_compare(u1, r, NUM_LIMBS_521BIT * sizeof(limb_t));
+
+    // Clean up and exit.
+failed:
+    clean(x);
+    clean(y);
+    clean(r);
+    clean(s);
+    clean(u1);
+    clean(u2);
+    clean(t);
+    return ok;
+}
+
+/**
+ * \brief Generates a private key for P-521 signing operations.
+ *
+ * \param privateKey The resulting private key.
+ *
+ * The private key is generated with \link RNGClass::rand() RNG.rand()\endlink.
+ * It is the caller's responsibility to ensure that the global random number
+ * pool has sufficient entropy to generate the 521 bits of the key safely
+ * before calling this function.
+ *
+ * \sa derivePublicKey(), sign()
+ */
+void P521::generatePrivateKey(uint8_t privateKey[66])
+{
+    // Generate a random 521-bit value for the private key.  The value
+    // must be generated uniformly at random between 1 and q - 1 where q
+    // is the group order (RFC 6090).  We use the recommended algorithm
+    // from Appendix B of RFC 6090: generate a random 521-bit value
+    // and discard it if it is not within the range 1 to q - 1.
+    limb_t x[NUM_LIMBS_521BIT];
+    do {
+        RNG.rand((uint8_t *)x, sizeof(x));
+#if BIGNUMBER_LIMB_8BIT
+        x[NUM_LIMBS_521BIT - 1] &= 0x01;
+#else
+        x[NUM_LIMBS_521BIT - 1] &= 0x1FF;
+#endif
+        BigNumberUtil::packBE(privateKey, 66, x, NUM_LIMBS_521BIT);
+    } while (BigNumberUtil::isZero(x, NUM_LIMBS_521BIT) ||
+             !BigNumberUtil::sub_P(x, x, P521_q, NUM_LIMBS_521BIT));
+    clean(x);
+}
+
+/**
+ * \brief Derives the public key from a private key for P-521
+ * signing operations.
+ *
+ * \param publicKey The public key.
+ * \param privateKey The private key, which is assumed to have been
+ * created by generatePrivateKey().
+ *
+ * \sa generatePrivateKey(), verify()
+ */
+void P521::derivePublicKey(uint8_t publicKey[132], const uint8_t privateKey[66])
+{
+    // Evaluate the curve function starting with the generator.
+    limb_t x[NUM_LIMBS_521BIT];
+    limb_t y[NUM_LIMBS_521BIT];
+    memcpy_P(x, P521_Gx, sizeof(x));
+    memcpy_P(y, P521_Gy, sizeof(y));
+    evaluate(x, y, privateKey);
+
+    // Pack the (x, y) point into the public key.
+    BigNumberUtil::packBE(publicKey, 66, x, NUM_LIMBS_521BIT);
+    BigNumberUtil::packBE(publicKey + 66, 66, y, NUM_LIMBS_521BIT);
+
+    // Clean up.
+    clean(x);
+    clean(y);
+}
+
+/**
+ * \brief Validates a private key value to ensure that it is
+ * between 1 and q - 1.
+ *
+ * \param privateKey The private key value to validate.
+ * \return Returns true if \a privateKey is valid, false if not.
+ *
+ * \sa isValidPublicKey()
+ */
+bool P521::isValidPrivateKey(const uint8_t privateKey[66])
+{
+    // The value "q" as a byte array from most to least significant.
+    static uint8_t const P521_q_bytes[66] PROGMEM = {
+        0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFA, 0x51, 0x86, 0x87, 0x83, 0xBF, 0x2F,
+        0x96, 0x6B, 0x7F, 0xCC, 0x01, 0x48, 0xF7, 0x09,
+        0xA5, 0xD0, 0x3B, 0xB5, 0xC9, 0xB8, 0x89, 0x9C,
+        0x47, 0xAE, 0xBB, 0x6F, 0xB7, 0x1E, 0x91, 0x38,
+        0x64, 0x09
+    };
+    uint8_t zeroTest = 0;
+    uint8_t posn = 66;
+    uint16_t borrow = 0;
+    while (posn > 0) {
+        --posn;
+
+        // Check for zero.
+        zeroTest |= privateKey[posn];
+
+        // Subtract P521_q_bytes from the key.  If there is no borrow,
+        // then the key value was greater than or equal to q.
+        borrow = ((uint16_t)(privateKey[posn])) -
+                 pgm_read_byte(&(P521_q_bytes[posn])) -
+                 ((borrow >> 8) & 0x01);
+    }
+    return zeroTest != 0 && borrow != 0;
+}
+
+/**
+ * \brief Validates a public key to ensure that it is a valid curve point.
+ *
+ * \param publicKey The public key value to validate.
+ * \return Returns true if \a publicKey is valid, false if not.
+ *
+ * \sa isValidPrivateKey()
+ */
+bool P521::isValidPublicKey(const uint8_t publicKey[132])
+{
+    limb_t x[NUM_LIMBS_521BIT];
+    limb_t y[NUM_LIMBS_521BIT];
+    BigNumberUtil::unpackBE(x, NUM_LIMBS_521BIT, publicKey, 66);
+    BigNumberUtil::unpackBE(y, NUM_LIMBS_521BIT, publicKey + 66, 66);
+    bool ok = validate(x, y);
+    clean(x);
+    clean(y);
+    return ok;
+}
+
+/**
+ * \fn bool P521::isValidCurvePoint(const uint8_t point[132])
+ * \brief Validates a point to ensure that it is on the curve.
+ *
+ * \param point The point to validate.
+ * \return Returns true if \a point is valid and on the curve, false if not.
+ *
+ * This is a convenience function that calls isValidPublicKey() as the
+ * two operations are equivalent.
+ */
+
+/**
+ * \brief Evaluates the curve function by multiplying (x, y) by f.
+ *
+ * \param x The X co-ordinate of the curve point.  Replaced with the X
+ * co-ordinate of the result on exit.
+ * \param y The Y co-ordinate of the curve point.  Replaced with the Y
+ * co-ordinate of the result on exit.
+ * \param f The 521-bit scalar to multiply (x, y) by, most significant
+ * bit first.
+ */
+void P521::evaluate(limb_t *x, limb_t *y, const uint8_t f[66])
+{
+    limb_t x1[NUM_LIMBS_521BIT];
+    limb_t y1[NUM_LIMBS_521BIT];
+    limb_t z1[NUM_LIMBS_521BIT];
+    limb_t x2[NUM_LIMBS_521BIT];
+    limb_t y2[NUM_LIMBS_521BIT];
+    limb_t z2[NUM_LIMBS_521BIT];
+
+    // We want the input in Jacobian co-ordinates.  The point (x, y, z)
+    // corresponds to the affine point (x / z^2, y / z^3), so if we set z
+    // to 1 we end up with Jacobian co-ordinates.  Remember that z is 1
+    // and continue on.
+
+    // Set the answer to the point-at-infinity initially (z = 0).
+    memset(x1, 0, sizeof(x1));
+    memset(y1, 0, sizeof(y1));
+    memset(z1, 0, sizeof(z1));
+
+    // Special handling for the highest bit.  We can skip dblPoint()/addPoint()
+    // and simply conditionally move (x, y, z) into (x1, y1, z1).
+    uint8_t select = (f[0] & 0x01);
+    cmove(select, x1, x);
+    cmove(select, y1, y);
+    cmove1(select, z1); // z = 1
+
+    // Iterate over the remaining 520 bits of f from highest to lowest.
+    uint8_t mask = 0x80;
+    uint8_t fposn = 1;
+    for (uint16_t t = 520; t > 0; --t) {
+        // Double the answer.
+        dblPoint(x1, y1, z1, x1, y1, z1);
+
+        // Add (x, y, z) to (x1, y1, z1) for the next 1 bit.
+        // We must always do this to preserve the overall timing.
+        // The z value is always 1 so we can omit that argument.
+        addPoint(x2, y2, z2, x1, y1, z1, x, y/*, z*/);
+
+        // If the bit was 1, then move (x2, y2, z2) into (x1, y1, z1).
+        select = (f[fposn] & mask);
+        cmove(select, x1, x2);
+        cmove(select, y1, y2);
+        cmove(select, z1, z2);
+
+        // Move onto the next bit.
+        mask >>= 1;
+        if (!mask) {
+            ++fposn;
+            mask = 0x80;
+        }
+    }
+
+    // Convert from Jacobian co-ordinates back into affine co-ordinates.
+    // x = x1 * (z1^2)^-1, y = y1 * (z1^3)^-1.
+    recip(x2, z1);
+    square(y2, x2);
+    mul(x, x1, y2);
+    mul(y2, y2, x2);
+    mul(y, y1, y2);
+
+    // Clean up.
+    clean(x1);
+    clean(y1);
+    clean(z1);
+    clean(x2);
+    clean(y2);
+    clean(z2);
+}
+
+/**
+ * \brief Adds two affine points.
+ *
+ * \param x1 The X value for the first point to add, and the result.
+ * \param y1 The Y value for the first point to add, and the result.
+ * \param x2 The X value for the second point to add.
+ * \param y2 The Y value for the second point to add.
+ *
+ * The Z values for the two points are assumed to be 1.
+ */
+void P521::addAffine(limb_t *x1, limb_t *y1, const limb_t *x2, const limb_t *y2)
+{
+    limb_t xout[NUM_LIMBS_521BIT];
+    limb_t yout[NUM_LIMBS_521BIT];
+    limb_t zout[NUM_LIMBS_521BIT];
+    limb_t z1[NUM_LIMBS_521BIT];
+
+    // z1 = 1
+    z1[0] = 1;
+    memset(z1 + 1, 0, (NUM_LIMBS_521BIT - 1) * sizeof(limb_t));
+
+    // Add the two points.
+    addPoint(xout, yout, zout, x1, y1, z1, x2, y2/*, z2*/);
+
+    // Convert from Jacobian co-ordinates back into affine co-ordinates.
+    // x1 = xout * (zout^2)^-1, y1 = yout * (zout^3)^-1.
+    recip(z1, zout);
+    square(zout, z1);
+    mul(x1, xout, zout);
+    mul(zout, zout, z1);
+    mul(y1, yout, zout);
+
+    // Clean up.
+    clean(xout);
+    clean(yout);
+    clean(zout);
+    clean(z1);
+}
+
+/**
+ * \brief Validates that (x, y) is actually a point on the curve.
+ *
+ * \param x The X co-ordinate of the point to test.
+ * \param y The Y co-ordinate of the point to test.
+ * \return Returns true if (x, y) is on the curve, or false if not.
+ *
+ * \sa inRange()
+ */
+bool P521::validate(const limb_t *x, const limb_t *y)
+{
+    bool result;
+
+    // If x or y is greater than or equal to 2^521 - 1, then the
+    // point is definitely not on the curve.  Preserve timing by
+    // delaying the reporting of the result until later.
+    result = inRange(x);
+    result &= inRange(y);
+
+    // We need to check that y^2 = x^3 - 3 * x + b mod 2^521 - 1.
+    limb_t t1[NUM_LIMBS_521BIT];
+    limb_t t2[NUM_LIMBS_521BIT];
+    square(t1, x);
+    mul(t1, t1, x);
+    mulLiteral(t2, x, 3);
+    sub(t1, t1, t2);
+    memcpy_P(t2, P521_b, sizeof(t2));
+    add(t1, t1, t2);
+    square(t2, y);
+    result &= secure_compare(t1, t2, sizeof(t1));
+    clean(t1);
+    clean(t2);
+    return result;
+}
+
+/**
+ * \brief Determines if a value is between 0 and 2^521 - 2.
+ *
+ * \param x The value to test.
+ * \return Returns true if \a x is in range, false if not.
+ *
+ * \sa validate()
+ */
+bool P521::inRange(const limb_t *x)
+{
+    // Do a trial subtraction of 2^521 - 1 from x, which is equivalent
+    // to adding 1 and subtracting 2^521.  We only need the carry.
+    dlimb_t carry = 1;
+    limb_t word = 0;
+    for (uint8_t index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry += *x++;
+        word = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+
+    // Determine the carry out from the low 521 bits.
+#if BIGNUMBER_LIMB_8BIT
+    carry = (carry << 7) + (word >> 1);
+#else
+    carry = (carry << (LIMB_BITS - 9)) + (word >> 9);
+#endif
+
+    // If the carry is zero, then x was in range.  Otherwise it is out
+    // of range.  Check for zero in a way that preserves constant timing.
+    word = (limb_t)(carry | (carry >> LIMB_BITS));
+    word = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - word) >> LIMB_BITS);
+    return (bool)word;
+}
+
+/**
+ * \brief Reduces a number modulo 2^521 - 1.
+ *
+ * \param result The array that will contain the result when the
+ * function exits.  Must be NUM_LIMBS_521BIT limbs in size.
+ * \param x The number to be reduced, which must be NUM_LIMBS_1042BIT
+ * limbs in size and less than square(2^521 - 1).  This array can be
+ * the same as \a result.
+ */
+void P521::reduce(limb_t *result, const limb_t *x)
+{
+#if BIGNUMBER_LIMB_16BIT || BIGNUMBER_LIMB_32BIT
+    // According to NIST FIPS 186-4, we add the high 521 bits to the
+    // low 521 bits and then do a trial subtraction of 2^521 - 1.
+    // We do both in a single step.  Subtracting 2^521 - 1 is equivalent
+    // to adding 1 and subtracting 2^521.
+    uint8_t index;
+    const limb_t *xl = x;
+    const limb_t *xh = x + NUM_LIMBS_521BIT;
+    limb_t *rr = result;
+    dlimb_t carry;
+    limb_t word = x[NUM_LIMBS_521BIT - 1];
+    carry = (word >> 9) + 1;
+    word &= 0x1FF;
+    for (index = 0; index < (NUM_LIMBS_521BIT - 1); ++index) {
+        carry += *xl++;
+        carry += ((dlimb_t)(*xh++)) << (LIMB_BITS - 9);
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    carry += word;
+    carry += ((dlimb_t)(x[NUM_LIMBS_1042BIT - 1])) << (LIMB_BITS - 9);
+    word = (limb_t)carry;
+    *rr = word;
+
+    // If the carry out was 1, then mask it off and we have the answer.
+    // If the carry out was 0, then we need to add 2^521 - 1 back again.
+    // To preserve the timing we perform a conditional subtract of 1 and
+    // then mask off the high bits.
+    carry = ((word >> 9) ^ 0x01) & 0x01;
+    rr = result;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry = ((dlimb_t)(*rr)) - carry;
+        *rr++ = (limb_t)carry;
+        carry = (carry >> LIMB_BITS) & 0x01;
+    }
+    *(--rr) &= 0x1FF;
+#elif BIGNUMBER_LIMB_8BIT
+    // Same as above, but for 8-bit limbs.
+    uint8_t index;
+    const limb_t *xl = x;
+    const limb_t *xh = x + NUM_LIMBS_521BIT;
+    limb_t *rr = result;
+    dlimb_t carry;
+    limb_t word = x[NUM_LIMBS_521BIT - 1];
+    carry = (word >> 1) + 1;
+    word &= 0x01;
+    for (index = 0; index < (NUM_LIMBS_521BIT - 1); ++index) {
+        carry += *xl++;
+        carry += ((dlimb_t)(*xh++)) << 7;
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    carry += word;
+    carry += ((dlimb_t)(x[NUM_LIMBS_1042BIT - 1])) << 1;
+    word = (limb_t)carry;
+    *rr = word;
+    carry = ((word >> 1) ^ 0x01) & 0x01;
+    rr = result;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry = ((dlimb_t)(*rr)) - carry;
+        *rr++ = (limb_t)carry;
+        carry = (carry >> LIMB_BITS) & 0x01;
+    }
+    *(--rr) &= 0x01;
+#else
+    #error "Don't know how to reduce values mod 2^521 - 1"
+#endif
+}
+
+/**
+ * \brief Quickly reduces a number modulo 2^521 - 1.
+ *
+ * \param x The number to be reduced, which must be NUM_LIMBS_521BIT
+ * limbs in size and less than or equal to 2 * (2^521 - 2).
+ *
+ * The answer is also put into \a x and will consist of NUM_LIMBS_521BIT limbs.
+ *
+ * This function is intended for reducing the result of additions where
+ * the caller knows that \a x is within the described range.  A single
+ * trial subtraction is all that is needed to reduce the number.
+ */
+void P521::reduceQuick(limb_t *x)
+{
+    // Perform a trial subtraction of 2^521 - 1 from x.  This is
+    // equivalent to adding 1 and subtracting 2^521 - 1.
+    uint8_t index;
+    limb_t *xx = x;
+    dlimb_t carry = 1;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry += *xx;
+        *xx++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+
+    // If the carry out was 1, then mask it off and we have the answer.
+    // If the carry out was 0, then we need to add 2^521 - 1 back again.
+    // To preserve the timing we perform a conditional subtract of 1 and
+    // then mask off the high bits.
+#if BIGNUMBER_LIMB_16BIT || BIGNUMBER_LIMB_32BIT
+    carry = ((x[NUM_LIMBS_521BIT - 1] >> 9) ^ 0x01) & 0x01;
+    xx = x;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry = ((dlimb_t)(*xx)) - carry;
+        *xx++ = (limb_t)carry;
+        carry = (carry >> LIMB_BITS) & 0x01;
+    }
+    *(--xx) &= 0x1FF;
+#elif BIGNUMBER_LIMB_8BIT
+    carry = ((x[NUM_LIMBS_521BIT - 1] >> 1) ^ 0x01) & 0x01;
+    xx = x;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry = ((dlimb_t)(*xx)) - carry;
+        *xx++ = (limb_t)carry;
+        carry = (carry >> LIMB_BITS) & 0x01;
+    }
+    *(--xx) &= 0x01;
+#endif
+}
+
+/**
+ * \brief Multiplies two 521-bit values to produce a 1042-bit result.
+ *
+ * \param result The result, which must be NUM_LIMBS_1042BIT limbs in size
+ * and must not overlap with \a x or \a y.
+ * \param x The first value to multiply, which must be NUM_LIMBS_521BIT
+ * limbs in size.
+ * \param y The second value to multiply, which must be NUM_LIMBS_521BIT
+ * limbs in size.
+ *
+ * \sa mul()
+ */
+void P521::mulNoReduce(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    uint8_t i, j;
+    dlimb_t carry;
+    limb_t word;
+    const limb_t *yy;
+    limb_t *rr;
+
+    // Multiply the lowest word of x by y.
+    carry = 0;
+    word = x[0];
+    yy = y;
+    rr = result;
+    for (i = 0; i < NUM_LIMBS_521BIT; ++i) {
+        carry += ((dlimb_t)(*yy++)) * word;
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    *rr = (limb_t)carry;
+
+    // Multiply and add the remaining words of x by y.
+    for (i = 1; i < NUM_LIMBS_521BIT; ++i) {
+        word = x[i];
+        carry = 0;
+        yy = y;
+        rr = result + i;
+        for (j = 0; j < NUM_LIMBS_521BIT; ++j) {
+            carry += ((dlimb_t)(*yy++)) * word;
+            carry += *rr;
+            *rr++ = (limb_t)carry;
+            carry >>= LIMB_BITS;
+        }
+        *rr = (limb_t)carry;
+    }
+}
+
+/**
+ * \brief Multiplies two values and then reduces the result modulo 2^521 - 1.
+ *
+ * \param result The result, which must be NUM_LIMBS_521BIT limbs in size
+ * and can be the same array as \a x or \a y.
+ * \param x The first value to multiply, which must be NUM_LIMBS_521BIT limbs
+ * in size and less than 2^521 - 1.
+ * \param y The second value to multiply, which must be NUM_LIMBS_521BIT limbs
+ * in size and less than 2^521 - 1.  This can be the same array as \a x.
+ */
+void P521::mul(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    limb_t temp[NUM_LIMBS_1042BIT];
+    mulNoReduce(temp, x, y);
+    reduce(result, temp);
+    strict_clean(temp);
+}
+
+/**
+ * \fn void P521::square(limb_t *result, const limb_t *x)
+ * \brief Squares a value and then reduces it modulo 2^521 - 1.
+ *
+ * \param result The result, which must be NUM_LIMBS_521BIT limbs in size and
+ * can be the same array as \a x.
+ * \param x The value to square, which must be NUM_LIMBS_521BIT limbs in size
+ * and less than 2^521 - 1.
+ */
+
+/**
+ * \brief Multiply a value by a single-limb literal modulo 2^521 - 1.
+ *
+ * \param result The result, which must be NUM_LIMBS_521BIT limbs in size and
+ * can be the same array as \a x.
+ * \param x The first value to multiply, which must be NUM_LIMBS_521BIT limbs
+ * in size and less than 2^521 - 1.
+ * \param y The second value to multiply, which must be less than 128.
+ */
+void P521::mulLiteral(limb_t *result, const limb_t *x, limb_t y)
+{
+    uint8_t index;
+    dlimb_t carry = 0;
+    const limb_t *xx = x;
+    limb_t *rr = result;
+
+    // Multiply x by the literal and put it into the result array.
+    // We assume that y is small enough that overflow from the
+    // highest limb will not occur during this process.
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry += ((dlimb_t)(*xx++)) * y;
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+
+    // Reduce the value modulo 2^521 - 1.  The high half is only a
+    // single limb, so we can short-cut some of reduce() here.
+#if BIGNUMBER_LIMB_16BIT || BIGNUMBER_LIMB_32BIT
+    limb_t word = result[NUM_LIMBS_521BIT - 1];
+    carry = (word >> 9) + 1;
+    word &= 0x1FF;
+    rr = result;
+    for (index = 0; index < (NUM_LIMBS_521BIT - 1); ++index) {
+        carry += *rr;
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    carry += word;
+    word = (limb_t)carry;
+    *rr = word;
+
+    // If the carry out was 1, then mask it off and we have the answer.
+    // If the carry out was 0, then we need to add 2^521 - 1 back again.
+    // To preserve the timing we perform a conditional subtract of 1 and
+    // then mask off the high bits.
+    carry = ((word >> 9) ^ 0x01) & 0x01;
+    rr = result;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry = ((dlimb_t)(*rr)) - carry;
+        *rr++ = (limb_t)carry;
+        carry = (carry >> LIMB_BITS) & 0x01;
+    }
+    *(--rr) &= 0x1FF;
+#elif BIGNUMBER_LIMB_8BIT
+    // Same as above, but for 8-bit limbs.
+    limb_t word = result[NUM_LIMBS_521BIT - 1];
+    carry = (word >> 1) + 1;
+    word &= 0x01;
+    rr = result;
+    for (index = 0; index < (NUM_LIMBS_521BIT - 1); ++index) {
+        carry += *rr;
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    carry += word;
+    word = (limb_t)carry;
+    *rr = word;
+    carry = ((word >> 1) ^ 0x01) & 0x01;
+    rr = result;
+    for (index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry = ((dlimb_t)(*rr)) - carry;
+        *rr++ = (limb_t)carry;
+        carry = (carry >> LIMB_BITS) & 0x01;
+    }
+    *(--rr) &= 0x01;
+#endif
+}
+
+/**
+ * \brief Adds two values and then reduces the result modulo 2^521 - 1.
+ *
+ * \param result The result, which must be NUM_LIMBS_521BIT limbs in size
+ * and can be the same array as \a x or \a y.
+ * \param x The first value to multiply, which must be NUM_LIMBS_521BIT
+ * limbs in size and less than 2^521 - 1.
+ * \param y The second value to multiply, which must be NUM_LIMBS_521BIT
+ * limbs in size and less than 2^521 - 1.
+ */
+void P521::add(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    dlimb_t carry = 0;
+    limb_t *rr = result;
+    for (uint8_t posn = 0; posn < NUM_LIMBS_521BIT; ++posn) {
+        carry += *x++;
+        carry += *y++;
+        *rr++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    reduceQuick(result);
+}
+
+/**
+ * \brief Subtracts two values and then reduces the result modulo 2^521 - 1.
+ *
+ * \param result The result, which must be NUM_LIMBS_521BIT limbs in size
+ * and can be the same array as \a x or \a y.
+ * \param x The first value to multiply, which must be NUM_LIMBS_521BIT
+ * limbs in size and less than 2^521 - 1.
+ * \param y The second value to multiply, which must be NUM_LIMBS_521BIT
+ * limbs in size and less than 2^521 - 1.
+ */
+void P521::sub(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    dlimb_t borrow;
+    uint8_t posn;
+    limb_t *rr = result;
+
+    // Subtract y from x to generate the intermediate result.
+    borrow = 0;
+    for (posn = 0; posn < NUM_LIMBS_521BIT; ++posn) {
+        borrow = ((dlimb_t)(*x++)) - (*y++) - ((borrow >> LIMB_BITS) & 0x01);
+        *rr++ = (limb_t)borrow;
+    }
+
+    // If we had a borrow, then the result has gone negative and we
+    // have to add 2^521 - 1 to the result to make it positive again.
+    // The top bits of "borrow" will be all 1's if there is a borrow
+    // or it will be all 0's if there was no borrow.  Easiest is to
+    // conditionally subtract 1 and then mask off the high bits.
+    rr = result;
+    borrow = (borrow >> LIMB_BITS) & 1U;
+    borrow = ((dlimb_t)(*rr)) - borrow;
+    *rr++ = (limb_t)borrow;
+    for (posn = 1; posn < NUM_LIMBS_521BIT; ++posn) {
+        borrow = ((dlimb_t)(*rr)) - ((borrow >> LIMB_BITS) & 0x01);
+        *rr++ = (limb_t)borrow;
+    }
+#if BIGNUMBER_LIMB_8BIT
+    *(--rr) &= 0x01;
+#else
+    *(--rr) &= 0x1FF;
+#endif
+}
+
+/**
+ * \brief Doubles a point represented in Jacobian co-ordinates.
+ *
+ * \param xout The X value for the result.
+ * \param yout The Y value for the result.
+ * \param zout The Z value for the result.
+ * \param xin The X value for the point to be doubled.
+ * \param yin The Y value for the point to be doubled.
+ * \param zin The Z value for the point to be doubled.
+ *
+ * The output parameters can be the same as the input parameters
+ * to double in-place.
+ *
+ * Reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+ */
+void P521::dblPoint(limb_t *xout, limb_t *yout, limb_t *zout,
+                    const limb_t *xin, const limb_t *yin,
+                    const limb_t *zin)
+{
+    limb_t alpha[NUM_LIMBS_521BIT];
+    limb_t beta[NUM_LIMBS_521BIT];
+    limb_t gamma[NUM_LIMBS_521BIT];
+    limb_t delta[NUM_LIMBS_521BIT];
+    limb_t tmp[NUM_LIMBS_521BIT];
+
+    // Double the point.  If it is the point at infinity (z = 0),
+    // then zout will still be zero at the end of this process so
+    // we don't need any special handling for that case.
+    square(delta, zin);       // delta = z^2
+    square(gamma, yin);       // gamma = y^2
+    mul(beta, xin, gamma);    // beta = x * gamma
+    sub(tmp, xin, delta);     // alpha = 3 * (x - delta) * (x + delta)
+    mulLiteral(alpha, tmp, 3);
+    add(tmp, xin, delta);
+    mul(alpha, alpha, tmp);
+    square(xout, alpha);      // xout = alpha^2 - 8 * beta
+    mulLiteral(tmp, beta, 8);
+    sub(xout, xout, tmp);
+    add(zout, yin, zin);      // zout = (y + z)^2 - gamma - delta
+    square(zout, zout);
+    sub(zout, zout, gamma);
+    sub(zout, zout, delta);
+    mulLiteral(yout, beta, 4);// yout = alpha * (4 * beta - xout) - 8 * gamma^2
+    sub(yout, yout, xout);
+    mul(yout, alpha, yout);
+    square(gamma, gamma);
+    mulLiteral(gamma, gamma, 8);
+    sub(yout, yout, gamma);
+
+    // Clean up.
+    strict_clean(alpha);
+    strict_clean(beta);
+    strict_clean(gamma);
+    strict_clean(delta);
+    strict_clean(tmp);
+}
+
+/**
+ * \brief Adds two curve points, one represented in Jacobian co-ordinates,
+ * and the other represented in affine co-ordinates.
+ *
+ * \param xout The X value for the result.
+ * \param yout The Y value for the result.
+ * \param zout The Z value for the result.
+ * \param x1 The X value for the first point to add.
+ * \param y1 The Y value for the first point to add.
+ * \param z1 The Z value for the first point to add.
+ * \param x2 The X value for the second point to add.
+ * \param y2 The Y value for the second point to add.
+ *
+ * The output parameters must not overlap with either of the inputs.
+ *
+ * The Z value of the second point is implicitly assumed to be 1.
+ *
+ * Reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+ */
+void P521::addPoint(limb_t *xout, limb_t *yout, limb_t *zout,
+                    const limb_t *x1, const limb_t *y1,
+                    const limb_t *z1, const limb_t *x2,
+                    const limb_t *y2)
+{
+    limb_t z1z1[NUM_LIMBS_521BIT];
+    limb_t u2[NUM_LIMBS_521BIT];
+    limb_t s2[NUM_LIMBS_521BIT];
+    limb_t h[NUM_LIMBS_521BIT];
+    limb_t i[NUM_LIMBS_521BIT];
+    limb_t j[NUM_LIMBS_521BIT];
+    limb_t r[NUM_LIMBS_521BIT];
+    limb_t v[NUM_LIMBS_521BIT];
+
+    // Determine if the first value is the point-at-infinity identity element.
+    // The second z value is always 1 so it cannot be the point-at-infinity.
+    limb_t p1IsIdentity = BigNumberUtil::isZero(z1, NUM_LIMBS_521BIT);
+
+    // Multiply the points, assuming that z2 = 1.
+    square(z1z1, z1);               // z1z1 = z1^2
+    mul(u2, x2, z1z1);              // u2 = x2 * z1z1
+    mul(s2, y2, z1);                // s2 = y2 * z1 * z1z1
+    mul(s2, s2, z1z1);
+    sub(h, u2, x1);                 // h = u2 - x1
+    mulLiteral(i, h, 2);            // i = (2 * h)^2
+    square(i, i);
+    sub(r, s2, y1);                 // r = 2 * (s2 - y1)
+    add(r, r, r);
+    mul(j, h, i);                   // j = h * i
+    mul(v, x1, i);                  // v = x1 * i
+    square(xout, r);                // xout = r^2 - j - 2 * v
+    sub(xout, xout, j);
+    sub(xout, xout, v);
+    sub(xout, xout, v);
+    sub(yout, v, xout);             // yout = r * (v - xout) - 2 * y1 * j
+    mul(yout, r, yout);
+    mul(j, y1, j);
+    sub(yout, yout, j);
+    sub(yout, yout, j);
+    mul(zout, z1, h);               // zout = 2 * z1 * h
+    add(zout, zout, zout);
+
+    // Select the answer to return.  If (x1, y1, z1) was the identity,
+    // then the answer is (x2, y2, z2).  Otherwise it is (xout, yout, zout).
+    // Conditionally move the second argument over the output if necessary.
+    cmove(p1IsIdentity, xout, x2);
+    cmove(p1IsIdentity, yout, y2);
+    cmove1(p1IsIdentity, zout); // z2 = 1
+
+    // Clean up.
+    strict_clean(z1z1);
+    strict_clean(u2);
+    strict_clean(s2);
+    strict_clean(h);
+    strict_clean(i);
+    strict_clean(j);
+    strict_clean(r);
+    strict_clean(v);
+}
+
+/**
+ * \brief Conditionally moves \a y into \a x if a selection value is non-zero.
+ *
+ * \param select Non-zero to move \a y into \a x, zero to leave \a x unchanged.
+ * \param x The destination to move into.
+ * \param y The value to conditionally move.
+ *
+ * The move is performed in a way that it should take the same amount of
+ * time irrespective of the value of \a select.
+ *
+ * \sa cmove1()
+ */
+void P521::cmove(limb_t select, limb_t *x, const limb_t *y)
+{
+    uint8_t posn;
+    limb_t dummy;
+    limb_t sel;
+
+    // Turn "select" into an all-zeroes or all-ones mask.  We don't care
+    // which bit or bits is set in the original "select" value.
+    sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
+    --sel;
+
+    // Move y into x based on "select".
+    for (posn = 0; posn < NUM_LIMBS_521BIT; ++posn) {
+        dummy = sel & (*x ^ *y++);
+        *x++ ^= dummy;
+    }
+}
+
+/**
+ * \brief Conditionally moves 1 into \a x if a selection value is non-zero.
+ *
+ * \param select Non-zero to move 1 into \a x, zero to leave \a x unchanged.
+ * \param x The destination to move into.
+ *
+ * The move is performed in a way that it should take the same amount of
+ * time irrespective of the value of \a select.
+ *
+ * \sa cmove()
+ */
+void P521::cmove1(limb_t select, limb_t *x)
+{
+    uint8_t posn;
+    limb_t dummy;
+    limb_t sel;
+
+    // Turn "select" into an all-zeroes or all-ones mask.  We don't care
+    // which bit or bits is set in the original "select" value.
+    sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
+    --sel;
+
+    // Move 1 into x based on "select".
+    dummy = sel & (*x ^ 1);
+    *x++ ^= dummy;
+    for (posn = 1; posn < NUM_LIMBS_521BIT; ++posn) {
+        dummy = sel & *x;
+        *x++ ^= dummy;
+    }
+}
+
+/**
+ * \brief Computes the reciprocal of a number modulo 2^521 - 1.
+ *
+ * \param result The result as a array of NUM_LIMBS_521BIT limbs in size.
+ * This cannot be the same array as \a x.
+ * \param x The number to compute the reciprocal for, also NUM_LIMBS_521BIT
+ * limbs in size.
+ */
+void P521::recip(limb_t *result, const limb_t *x)
+{
+    limb_t t1[NUM_LIMBS_521BIT];
+
+    // The reciprocal is the same as x ^ (p - 2) where p = 2^521 - 1.
+    // The big-endian hexadecimal expansion of (p - 2) is:
+    // 01FF FFFFFFF FFFFFFFF ... FFFFFFFF FFFFFFFD
+    //
+    // The naive implementation needs to do 2 multiplications per 1 bit and
+    // 1 multiplication per 0 bit.  We can improve upon this by creating a
+    // pattern 1111 and then shifting and multiplying to create 11111111,
+    // and then 1111111111111111, and so on for the top 512-bits.
+
+    // Build a 4-bit pattern 1111 in the result.
+    square(result, x);
+    mul(result, result, x);
+    square(result, result);
+    mul(result, result, x);
+    square(result, result);
+    mul(result, result, x);
+
+    // Shift and multiply by increasing powers of two.  This turns
+    // 1111 into 11111111, and then 1111111111111111, and so on.
+    for (size_t power = 4; power <= 256; power <<= 1) {
+        square(t1, result);
+        for (size_t temp = 1; temp < power; ++temp)
+            square(t1, t1);
+        mul(result, result, t1);
+    }
+
+    // Handle the 9 lowest bits of (p - 2), 111111101, from highest to lowest.
+    for (uint8_t index = 0; index < 7; ++index) {
+        square(result, result);
+        mul(result, result, x);
+    }
+    square(result, result);
+    square(result, result);
+    mul(result, result, x);
+
+    // Clean up.
+    clean(t1);
+}
+
+/**
+ * \brief Reduces a number modulo q.
+ *
+ * \param result The result array, which must be NUM_LIMBS_521BIT limbs in size.
+ * \param r The value to reduce, which must be NUM_LIMBS_1042BIT limbs in size.
+ *
+ * It is allowed for \a result to be the same as \a r.
+ */
+void P521::reduceQ(limb_t *result, const limb_t *r)
+{
+    // Algorithm from: http://en.wikipedia.org/wiki/Barrett_reduction
+    //
+    // We assume that r is less than or equal to (q - 1)^2.
+    //
+    // We want to compute result = r mod q.  Find the smallest k such
+    // that 2^k > q.  In our case, k = 521.  Then set m = floor(4^k / q)
+    // and let r = r - q * floor(m * r / 4^k).  This will be the result
+    // or it will be at most one subtraction of q away from the result.
+    //
+    // Note: m is a 522-bit number, which fits in the same number of limbs
+    // as a 521-bit number assuming that limbs are 8 bits or more in size.
+    static limb_t const numM[NUM_LIMBS_521BIT] PROGMEM = {
+        LIMB(0x6EC79BF7), LIMB(0x449048E1), LIMB(0x7663B851), LIMB(0xC44A3647),
+        LIMB(0x08F65A2F), LIMB(0x8033FEB7), LIMB(0x40D06994), LIMB(0xAE79787C),
+        LIMB(0x00000005), LIMB(0x00000000), LIMB(0x00000000), LIMB(0x00000000),
+        LIMB(0x00000000), LIMB(0x00000000), LIMB(0x00000000), LIMB(0x00000000),
+        LIMB_PARTIAL(0x200)
+    };
+    limb_t temp[NUM_LIMBS_1042BIT + NUM_LIMBS_521BIT];
+    limb_t temp2[NUM_LIMBS_521BIT];
+
+    // Multiply r by m.
+    BigNumberUtil::mul_P(temp, r, NUM_LIMBS_1042BIT, numM, NUM_LIMBS_521BIT);
+
+    // Compute (m * r / 4^521) = (m * r / 2^1042).
+#if BIGNUMBER_LIMB_8BIT || BIGNUMBER_LIMB_16BIT
+    dlimb_t carry = temp[NUM_LIMBS_BITS(1040)] >> 2;
+    for (uint8_t index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry += ((dlimb_t)(temp[NUM_LIMBS_BITS(1040) + index + 1])) << (LIMB_BITS - 2);
+        temp2[index] = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+#elif BIGNUMBER_LIMB_32BIT
+    dlimb_t carry = temp[NUM_LIMBS_BITS(1024)] >> 18;
+    for (uint8_t index = 0; index < NUM_LIMBS_521BIT; ++index) {
+        carry += ((dlimb_t)(temp[NUM_LIMBS_BITS(1024) + index + 1])) << 14;
+        temp2[index] = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+#endif
+
+    // Multiply (m * r) / 2^1042 by q and subtract it from r.
+    // We can ignore the high words of the subtraction result
+    // because they will all turn into zero after the subtraction.
+    BigNumberUtil::mul_P(temp, temp2, NUM_LIMBS_521BIT,
+                         P521_q, NUM_LIMBS_521BIT);
+    BigNumberUtil::sub(result, r, temp, NUM_LIMBS_521BIT);
+
+    // Perform a trial subtraction of q from the result to reduce it.
+    BigNumberUtil::reduceQuick_P(result, result, P521_q, NUM_LIMBS_521BIT);
+
+    // Clean up and exit.
+    clean(temp);
+    clean(temp2);
+}
+
+/**
+ * \brief Multiplies two values and then reduces the result modulo q.
+ *
+ * \param result The result, which must be NUM_LIMBS_521BIT limbs in size
+ * and can be the same array as \a x or \a y.
+ * \param x The first value to multiply, which must be NUM_LIMBS_521BIT limbs
+ * in size and less than q.
+ * \param y The second value to multiply, which must be NUM_LIMBS_521BIT limbs
+ * in size and less than q.  This can be the same array as \a x.
+ */
+void P521::mulQ(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    limb_t temp[NUM_LIMBS_1042BIT];
+    mulNoReduce(temp, x, y);
+    reduceQ(result, temp);
+    strict_clean(temp);
+}
+
+/**
+ * \brief Computes the reciprocal of a number modulo q.
+ *
+ * \param result The result as a array of NUM_LIMBS_521BIT limbs in size.
+ * This cannot be the same array as \a x.
+ * \param x The number to compute the reciprocal for, also NUM_LIMBS_521BIT
+ * limbs in size.
+ */
+void P521::recipQ(limb_t *result, const limb_t *x)
+{
+    // Bottom 265 bits of q - 2.  The top 256 bits are all-1's.
+    static limb_t const P521_q_m2[] PROGMEM = {
+        LIMB(0x91386407), LIMB(0xbb6fb71e), LIMB(0x899c47ae), LIMB(0x3bb5c9b8),
+        LIMB(0xf709a5d0), LIMB(0x7fcc0148), LIMB(0xbf2f966b), LIMB(0x51868783),
+        LIMB_PARTIAL(0x1fa)
+    };
+
+    // Raise x to the power of q - 2, mod q.  We start with the top
+    // 256 bits which are all-1's, using a similar technique to recip().
+    limb_t t1[NUM_LIMBS_521BIT];
+    mulQ(result, x, x);
+    mulQ(result, result, x);
+    mulQ(result, result, result);
+    mulQ(result, result, x);
+    mulQ(result, result, result);
+    mulQ(result, result, x);
+    for (size_t power = 4; power <= 128; power <<= 1) {
+        mulQ(t1, result, result);
+        for (size_t temp = 1; temp < power; ++temp)
+            mulQ(t1, t1, t1);
+        mulQ(result, result, t1);
+    }
+    clean(t1);
+
+    // Deal with the bottom 265 bits from highest to lowest.  Square for
+    // each bit and multiply in x whenever there is a 1 bit.  The timing
+    // is based on the publicly-known constant q - 2, not on the value of x.
+    size_t bit = 265;
+    while (bit > 0) {
+        --bit;
+        mulQ(result, result, result);
+        if (pgm_read_limb(&(P521_q_m2[bit / LIMB_BITS])) &
+                (((limb_t)1) << (bit % LIMB_BITS))) {
+            mulQ(result, result, x);
+        }
+    }
+}
+
+/**
+ * \brief Generates a k value using the algorithm from RFC 6979.
+ *
+ * \param k The value to generate.
+ * \param hm The hashed message formatted ready to be signed.
+ * \param x The private key to sign with.
+ * \param hash The hash algorithm to use.
+ * \param count Iteration counter for generating new values of k when the
+ * previous one is rejected.
+ */
+void P521::generateK(uint8_t k[66], const uint8_t hm[66],
+                     const uint8_t x[66], Hash *hash, uint64_t count)
+{
+    size_t hlen = hash->hashSize();
+    uint8_t V[64];
+    uint8_t K[64];
+    uint8_t marker;
+
+    // If for some reason a hash function was supplied with more than
+    // 512 bits of output, truncate hash values to the first 512 bits.
+    // We cannot support more than this yet.
+    if (hlen > 64)
+        hlen = 64;
+
+    // RFC 6979, Section 3.2, Step a.  Hash the message, reduce modulo q,
+    // and produce an octet string the same length as q, bits2octets(H(m)).
+    // We support hashes up to 512 bits and q is a 521-bit number, so "hm"
+    // is already the bits2octets(H(m)) value that we need.
+
+    // Steps b and c.  Set V to all-ones and K to all-zeroes.
+    memset(V, 0x01, hlen);
+    memset(K, 0x00, hlen);
+
+    // Step d.  K = HMAC_K(V || 0x00 || x || hm).  We make a small
+    // modification here to append the count value if it is non-zero.
+    // We use this to generate a new k if we have to re-enter this
+    // function because the previous one was rejected by sign().
+    // This is slightly different to RFC 6979 which says that the
+    // loop in step h below should be continued.  That code path is
+    // difficult to access, so instead modify K and V in steps d and f.
+    // This alternative construction is compatible with the second
+    // variant described in section 3.6 of RFC 6979.
+    hash->resetHMAC(K, hlen);
+    hash->update(V, hlen);
+    marker = 0x00;
+    hash->update(&marker, 1);
+    hash->update(x, 66);
+    hash->update(hm, 66);
+    if (count)
+        hash->update(&count, sizeof(count));
+    hash->finalizeHMAC(K, hlen, K, hlen);
+
+    // Step e.  V = HMAC_K(V)
+    hash->resetHMAC(K, hlen);
+    hash->update(V, hlen);
+    hash->finalizeHMAC(K, hlen, V, hlen);
+
+    // Step f.  K = HMAC_K(V || 0x01 || x || hm)
+    hash->resetHMAC(K, hlen);
+    hash->update(V, hlen);
+    marker = 0x01;
+    hash->update(&marker, 1);
+    hash->update(x, 66);
+    hash->update(hm, 66);
+    if (count)
+        hash->update(&count, sizeof(count));
+    hash->finalizeHMAC(K, hlen, K, hlen);
+
+    // Step g.  V = HMAC_K(V)
+    hash->resetHMAC(K, hlen);
+    hash->update(V, hlen);
+    hash->finalizeHMAC(K, hlen, V, hlen);
+
+    // Step h.  Generate candidate k values until we find what we want.
+    for (;;) {
+        // Step h.1 and h.2.  Generate a string of 66 bytes in length.
+        //      T = empty
+        //      while (len(T) < 66)
+        //          V = HMAC_K(V)
+        //          T = T || V
+        size_t posn = 0;
+        while (posn < 66) {
+            size_t temp = 66 - posn;
+            if (temp > hlen)
+                temp = hlen;
+            hash->resetHMAC(K, hlen);
+            hash->update(V, hlen);
+            hash->finalizeHMAC(K, hlen, V, hlen);
+            memcpy(k + posn, V, temp);
+            posn += temp;
+        }
+
+        // Step h.3.  k = bits2int(T) and exit the loop if k is not in
+        // the range 1 to q - 1.  Note: We have to extract the 521 most
+        // significant bits of T, which means shifting it right by seven
+        // bits to put it into the correct form.
+        for (posn = 65; posn > 0; --posn)
+            k[posn] = (k[posn - 1] << 1) | (k[posn] >> 7);
+        k[0] >>= 7;
+        if (isValidPrivateKey(k))
+            break;
+
+        // Generate new K and V values and try again.
+        //      K = HMAC_K(V || 0x00)
+        //      V = HMAC_K(V)
+        hash->resetHMAC(K, hlen);
+        hash->update(V, hlen);
+        marker = 0x00;
+        hash->update(&marker, 1);
+        hash->finalizeHMAC(K, hlen, K, hlen);
+        hash->resetHMAC(K, hlen);
+        hash->update(V, hlen);
+        hash->finalizeHMAC(K, hlen, V, hlen);
+    }
+
+    // Clean up.
+    clean(V);
+    clean(K);
+}
+
+/**
+ * \brief Generates a k value using the algorithm from RFC 6979.
+ *
+ * \param k The value to generate.
+ * \param hm The hashed message formatted ready to be signed.
+ * \param x The private key to sign with.
+ * \param count Iteration counter for generating new values of k when the
+ * previous one is rejected.
+ *
+ * This override uses SHA512 to generate k values.  It is used when
+ * sign() was not passed an explicit hash object by the application.
+ */
+void P521::generateK(uint8_t k[66], const uint8_t hm[66],
+                     const uint8_t x[66], uint64_t count)
+{
+    SHA512 hash;
+    generateK(k, hm, x, &hash, count);
+}
diff --git a/libraries/Crypto/P521.h b/libraries/Crypto/P521.h
new file mode 100644
index 00000000..09ac02fd
--- /dev/null
+++ b/libraries/Crypto/P521.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CRYPTO_P521_h
+#define CRYPTO_P521_h
+
+#include "BigNumberUtil.h"
+
+class Hash;
+
+class P521
+{
+public:
+
+    static bool eval(uint8_t result[132], const uint8_t f[66], const uint8_t point[132]);
+
+    static void dh1(uint8_t k[132], uint8_t f[66]);
+    static bool dh2(const uint8_t k[132], uint8_t f[66]);
+
+    static void sign(uint8_t signature[132], const uint8_t privateKey[66],
+                     const void *message, size_t len, Hash *hash = 0);
+    static bool verify(const uint8_t signature[132],
+                       const uint8_t publicKey[132],
+                       const void *message, size_t len, Hash *hash = 0);
+
+    static void generatePrivateKey(uint8_t privateKey[66]);
+    static void derivePublicKey(uint8_t publicKey[132], const uint8_t privateKey[66]);
+
+    static bool isValidPrivateKey(const uint8_t privateKey[66]);
+    static bool isValidPublicKey(const uint8_t publicKey[132]);
+
+    static bool isValidCurvePoint(const uint8_t point[132])
+    {
+        return isValidPublicKey(point);
+    }
+
+#if defined(TEST_P521_FIELD_OPS)
+public:
+#else
+private:
+#endif
+    static void evaluate(limb_t *x, limb_t *y, const uint8_t f[66]);
+
+    static void addAffine(limb_t *x1, limb_t *y1,
+                          const limb_t *x2, const limb_t *y2);
+
+    static bool validate(const limb_t *x, const limb_t *y);
+    static bool inRange(const limb_t *x);
+
+    static void reduce(limb_t *result, const limb_t *x);
+    static void reduceQuick(limb_t *x);
+
+    static void mulNoReduce(limb_t *result, const limb_t *x, const limb_t *y);
+
+    static void mul(limb_t *result, const limb_t *x, const limb_t *y);
+    static void square(limb_t *result, const limb_t *x)
+    {
+        mul(result, x, x);
+    }
+
+    static void mulLiteral(limb_t *result, const limb_t *x, limb_t y);
+
+    static void add(limb_t *result, const limb_t *x, const limb_t *y);
+    static void sub(limb_t *result, const limb_t *x, const limb_t *y);
+
+    static void dblPoint(limb_t *xout, limb_t *yout, limb_t *zout,
+                         const limb_t *xin, const limb_t *yin,
+                         const limb_t *zin);
+    static void addPoint(limb_t *xout, limb_t *yout, limb_t *zout,
+                         const limb_t *x1, const limb_t *y1,
+                         const limb_t *z1, const limb_t *x2,
+                         const limb_t *y2);
+
+    static void cmove(limb_t select, limb_t *x, const limb_t *y);
+    static void cmove1(limb_t select, limb_t *x);
+
+    static void recip(limb_t *result, const limb_t *x);
+
+    static void reduceQ(limb_t *result, const limb_t *r);
+    static void mulQ(limb_t *result, const limb_t *x, const limb_t *y);
+    static void recipQ(limb_t *result, const limb_t *x);
+
+    static void generateK(uint8_t k[66], const uint8_t hm[66],
+                          const uint8_t x[66], Hash *hash, uint64_t count);
+    static void generateK(uint8_t k[66], const uint8_t hm[66],
+                          const uint8_t x[66], uint64_t count);
+
+    // Constructor and destructor are private - cannot instantiate this class.
+    P521() {}
+    ~P521() {}
+};
+
+#endif
diff --git a/libraries/Crypto/examples/TestP521/TestP521.ino b/libraries/Crypto/examples/TestP521/TestP521.ino
new file mode 100644
index 00000000..50832c6a
--- /dev/null
+++ b/libraries/Crypto/examples/TestP521/TestP521.ino
@@ -0,0 +1,519 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+This example runs tests on the P521 algorithm.
+
+Note: This example is too large to run on the Arduino Uno or other
+AVR platforms with 32K or less of flash memory.
+*/
+
+#include <Crypto.h>
+#include <P521.h>
+#include <SHA256.h>
+#include <SHA512.h>
+#include <RNG.h>
+#include <RNG.h>
+#include <string.h>
+#include <avr/pgmspace.h>
+
+void printNumber(const char *name, const uint8_t *x, size_t len)
+{
+    static const char hexchars[] = "0123456789ABCDEF";
+    Serial.print(name);
+    Serial.print(" = ");
+    for (size_t posn = 0; posn < len; ++posn) {
+        Serial.print(hexchars[(x[posn] >> 4) & 0x0F]);
+        Serial.print(hexchars[x[posn] & 0x0F]);
+    }
+    Serial.println();
+}
+
+static int P521_memcmp_P(const void *s1, const void *s2, size_t len)
+{
+    const uint8_t *u1 = (const uint8_t *)s1;
+    const uint8_t *u2 = (const uint8_t *)s2;
+    while (len > 0) {
+        int ch1 = *u1++;
+        int ch2 = pgm_read_byte(u2++);
+        if (ch1 != ch2)
+            return ch1 - ch2;
+        --len;
+    }
+    return 0;
+}
+
+static uint8_t alice_k[132];
+static uint8_t alice_f[66];
+static uint8_t bob_k[132];
+static uint8_t bob_f[66];
+
+// Check the eval() function using the test vectors from RFC 5903.
+void testEval()
+{
+    static uint8_t const alice_private[66] PROGMEM = {
+        0x00, 0x37, 0xAD, 0xE9, 0x31, 0x9A, 0x89, 0xF4,
+        0xDA, 0xBD, 0xB3, 0xEF, 0x41, 0x1A, 0xAC, 0xCC,
+        0xA5, 0x12, 0x3C, 0x61, 0xAC, 0xAB, 0x57, 0xB5,
+        0x39, 0x3D, 0xCE, 0x47, 0x60, 0x81, 0x72, 0xA0,
+        0x95, 0xAA, 0x85, 0xA3, 0x0F, 0xE1, 0xC2, 0x95,
+        0x2C, 0x67, 0x71, 0xD9, 0x37, 0xBA, 0x97, 0x77,
+        0xF5, 0x95, 0x7B, 0x26, 0x39, 0xBA, 0xB0, 0x72,
+        0x46, 0x2F, 0x68, 0xC2, 0x7A, 0x57, 0x38, 0x2D,
+        0x4A, 0x52
+    };
+    static uint8_t const alice_public[132] PROGMEM = {
+        0x00, 0x15, 0x41, 0x7E, 0x84, 0xDB, 0xF2, 0x8C,
+        0x0A, 0xD3, 0xC2, 0x78, 0x71, 0x33, 0x49, 0xDC,
+        0x7D, 0xF1, 0x53, 0xC8, 0x97, 0xA1, 0x89, 0x1B,
+        0xD9, 0x8B, 0xAB, 0x43, 0x57, 0xC9, 0xEC, 0xBE,
+        0xE1, 0xE3, 0xBF, 0x42, 0xE0, 0x0B, 0x8E, 0x38,
+        0x0A, 0xEA, 0xE5, 0x7C, 0x2D, 0x10, 0x75, 0x64,
+        0x94, 0x18, 0x85, 0x94, 0x2A, 0xF5, 0xA7, 0xF4,
+        0x60, 0x17, 0x23, 0xC4, 0x19, 0x5D, 0x17, 0x6C,
+        0xED, 0x3E, 0x01, 0x7C, 0xAE, 0x20, 0xB6, 0x64,
+        0x1D, 0x2E, 0xEB, 0x69, 0x57, 0x86, 0xD8, 0xC9,
+        0x46, 0x14, 0x62, 0x39, 0xD0, 0x99, 0xE1, 0x8E,
+        0x1D, 0x5A, 0x51, 0x4C, 0x73, 0x9D, 0x7C, 0xB4,
+        0xA1, 0x0A, 0xD8, 0xA7, 0x88, 0x01, 0x5A, 0xC4,
+        0x05, 0xD7, 0x79, 0x9D, 0xC7, 0x5E, 0x7B, 0x7D,
+        0x5B, 0x6C, 0xF2, 0x26, 0x1A, 0x6A, 0x7F, 0x15,
+        0x07, 0x43, 0x8B, 0xF0, 0x1B, 0xEB, 0x6C, 0xA3,
+        0x92, 0x6F, 0x95, 0x82
+    };
+    static uint8_t const bob_private[66] PROGMEM = {
+        0x01, 0x45, 0xBA, 0x99, 0xA8, 0x47, 0xAF, 0x43,
+        0x79, 0x3F, 0xDD, 0x0E, 0x87, 0x2E, 0x7C, 0xDF,
+        0xA1, 0x6B, 0xE3, 0x0F, 0xDC, 0x78, 0x0F, 0x97,
+        0xBC, 0xCC, 0x3F, 0x07, 0x83, 0x80, 0x20, 0x1E,
+        0x9C, 0x67, 0x7D, 0x60, 0x0B, 0x34, 0x37, 0x57,
+        0xA3, 0xBD, 0xBF, 0x2A, 0x31, 0x63, 0xE4, 0xC2,
+        0xF8, 0x69, 0xCC, 0xA7, 0x45, 0x8A, 0xA4, 0xA4,
+        0xEF, 0xFC, 0x31, 0x1F, 0x5C, 0xB1, 0x51, 0x68,
+        0x5E, 0xB9
+    };
+    static uint8_t const bob_public[132] PROGMEM = {
+        0x00, 0xD0, 0xB3, 0x97, 0x5A, 0xC4, 0xB7, 0x99,
+        0xF5, 0xBE, 0xA1, 0x6D, 0x5E, 0x13, 0xE9, 0xAF,
+        0x97, 0x1D, 0x5E, 0x9B, 0x98, 0x4C, 0x9F, 0x39,
+        0x72, 0x8B, 0x5E, 0x57, 0x39, 0x73, 0x5A, 0x21,
+        0x9B, 0x97, 0xC3, 0x56, 0x43, 0x6A, 0xDC, 0x6E,
+        0x95, 0xBB, 0x03, 0x52, 0xF6, 0xBE, 0x64, 0xA6,
+        0xC2, 0x91, 0x2D, 0x4E, 0xF2, 0xD0, 0x43, 0x3C,
+        0xED, 0x2B, 0x61, 0x71, 0x64, 0x00, 0x12, 0xD9,
+        0x46, 0x0F, 0x01, 0x5C, 0x68, 0x22, 0x63, 0x83,
+        0x95, 0x6E, 0x3B, 0xD0, 0x66, 0xE7, 0x97, 0xB6,
+        0x23, 0xC2, 0x7C, 0xE0, 0xEA, 0xC2, 0xF5, 0x51,
+        0xA1, 0x0C, 0x2C, 0x72, 0x4D, 0x98, 0x52, 0x07,
+        0x7B, 0x87, 0x22, 0x0B, 0x65, 0x36, 0xC5, 0xC4,
+        0x08, 0xA1, 0xD2, 0xAE, 0xBB, 0x8E, 0x86, 0xD6,
+        0x78, 0xAE, 0x49, 0xCB, 0x57, 0x09, 0x1F, 0x47,
+        0x32, 0x29, 0x65, 0x79, 0xAB, 0x44, 0xFC, 0xD1,
+        0x7F, 0x0F, 0xC5, 0x6A
+    };
+    static uint8_t const shared_secret[66] PROGMEM = {
+        0x01, 0x14, 0x4C, 0x7D, 0x79, 0xAE, 0x69, 0x56,
+        0xBC, 0x8E, 0xDB, 0x8E, 0x7C, 0x78, 0x7C, 0x45,
+        0x21, 0xCB, 0x08, 0x6F, 0xA6, 0x44, 0x07, 0xF9,
+        0x78, 0x94, 0xE5, 0xE6, 0xB2, 0xD7, 0x9B, 0x04,
+        0xD1, 0x42, 0x7E, 0x73, 0xCA, 0x4B, 0xAA, 0x24,
+        0x0A, 0x34, 0x78, 0x68, 0x59, 0x81, 0x0C, 0x06,
+        0xB3, 0xC7, 0x15, 0xA3, 0xA8, 0xCC, 0x31, 0x51,
+        0xF2, 0xBE, 0xE4, 0x17, 0x99, 0x6D, 0x19, 0xF3,
+        0xDD, 0xEA
+    };
+
+    // Evaluate the curve function and check the public keys.
+    uint8_t result[132];
+    Serial.println("Fixed test vectors:");
+    Serial.print("Computing Alice's public key ... ");
+    Serial.flush();
+    memcpy_P(alice_f, alice_private, 66);
+    unsigned long start = micros();
+    P521::eval(result, alice_f, 0);
+    unsigned long elapsed = micros() - start;
+    if (P521_memcmp_P(result, alice_public, 132) == 0) {
+        Serial.print("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result, 132);
+        printNumber("expected", alice_f, 132);
+    }
+    Serial.print(" (elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us)");
+    Serial.print("Computing Bob's public key ... ");
+    Serial.flush();
+    memcpy_P(bob_f, bob_private, 66);
+    start = micros();
+    P521::eval(result, bob_f, 0);
+    elapsed = micros() - start;
+    if (P521_memcmp_P(result, bob_public, 132) == 0) {
+        Serial.print("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result, 132);
+        printNumber("expected", bob_f, 132);
+    }
+    Serial.print(" (elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us)");
+
+    // Compute the shared secret from each side.
+    Serial.print("Computing Alice's shared secret ... ");
+    Serial.flush();
+    memcpy_P(alice_f, alice_private, 66);
+    memcpy_P(bob_k, bob_public, 132);
+    memcpy_P(bob_f, shared_secret, 66);
+    start = micros();
+    P521::eval(result, alice_f, bob_k);
+    elapsed = micros() - start;
+    if (P521_memcmp_P(result, shared_secret, 66) == 0) {
+        Serial.print("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result, 66);
+        printNumber("expected", bob_f, 66);
+    }
+    Serial.print(" (elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us)");
+    Serial.print("Computing Bob's shared secret ... ");
+    Serial.flush();
+    memcpy_P(bob_f, bob_private, 66);
+    memcpy_P(alice_k, alice_public, 132);
+    memcpy_P(alice_f, shared_secret, 66);
+    start = micros();
+    P521::eval(result, bob_f, alice_k);
+    elapsed = micros() - start;
+    if (P521_memcmp_P(result, shared_secret, 66) == 0) {
+        Serial.print("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result, 66);
+        printNumber("expected", alice_f, 66);
+    }
+    Serial.print(" (elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us)");
+}
+
+void testDH()
+{
+    Serial.println("Diffie-Hellman key exchange:");
+    Serial.print("Generate random k/f for Alice ... ");
+    Serial.flush();
+    unsigned long start = micros();
+    P521::dh1(alice_k, alice_f);
+    unsigned long elapsed = micros() - start;
+    Serial.print("elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us");
+
+    Serial.print("Generate random k/f for Bob ... ");
+    Serial.flush();
+    start = micros();
+    P521::dh1(bob_k, bob_f);
+    elapsed = micros() - start;
+    Serial.print("elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us");
+
+    Serial.print("Generate shared secret for Alice ... ");
+    Serial.flush();
+    start = micros();
+    P521::dh2(bob_k, alice_f);
+    elapsed = micros() - start;
+    Serial.print("elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us");
+
+    Serial.print("Generate shared secret for Bob ... ");
+    Serial.flush();
+    start = micros();
+    P521::dh2(alice_k, bob_f);
+    elapsed = micros() - start;
+    Serial.print("elapsed ");
+    Serial.print(elapsed);
+    Serial.println(" us");
+
+    Serial.print("Check that the shared secrets match ... ");
+    if (memcmp(alice_f, bob_f, 66) == 0) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", alice_f, 66);
+        printNumber("expected", bob_f, 66);
+    }
+}
+
+struct TestSignKey
+{
+    uint8_t privateKey[66];
+    uint8_t publicKey[132];
+};
+
+// Test key from RFC 6979, Appendix A.2.7.
+static TestSignKey const testKeyP521 PROGMEM = {
+    {0x00, 0xFA, 0xD0, 0x6D, 0xAA, 0x62, 0xBA, 0x3B,        // x
+     0x25, 0xD2, 0xFB, 0x40, 0x13, 0x3D, 0xA7, 0x57,
+     0x20, 0x5D, 0xE6, 0x7F, 0x5B, 0xB0, 0x01, 0x8F,
+     0xEE, 0x8C, 0x86, 0xE1, 0xB6, 0x8C, 0x7E, 0x75,
+     0xCA, 0xA8, 0x96, 0xEB, 0x32, 0xF1, 0xF4, 0x7C,
+     0x70, 0x85, 0x58, 0x36, 0xA6, 0xD1, 0x6F, 0xCC,
+     0x14, 0x66, 0xF6, 0xD8, 0xFB, 0xEC, 0x67, 0xDB,
+     0x89, 0xEC, 0x0C, 0x08, 0xB0, 0xE9, 0x96, 0xB8,
+     0x35, 0x38},
+    {0x01, 0x89, 0x45, 0x50, 0xD0, 0x78, 0x59, 0x32,        // Ux
+     0xE0, 0x0E, 0xAA, 0x23, 0xB6, 0x94, 0xF2, 0x13,
+     0xF8, 0xC3, 0x12, 0x1F, 0x86, 0xDC, 0x97, 0xA0,
+     0x4E, 0x5A, 0x71, 0x67, 0xDB, 0x4E, 0x5B, 0xCD,
+     0x37, 0x11, 0x23, 0xD4, 0x6E, 0x45, 0xDB, 0x6B,
+     0x5D, 0x53, 0x70, 0xA7, 0xF2, 0x0F, 0xB6, 0x33,
+     0x15, 0x5D, 0x38, 0xFF, 0xA1, 0x6D, 0x2B, 0xD7,
+     0x61, 0xDC, 0xAC, 0x47, 0x4B, 0x9A, 0x2F, 0x50,
+     0x23, 0xA4,
+     0x00, 0x49, 0x31, 0x01, 0xC9, 0x62, 0xCD, 0x4D,        // Uy
+     0x2F, 0xDD, 0xF7, 0x82, 0x28, 0x5E, 0x64, 0x58,
+     0x41, 0x39, 0xC2, 0xF9, 0x1B, 0x47, 0xF8, 0x7F,
+     0xF8, 0x23, 0x54, 0xD6, 0x63, 0x0F, 0x74, 0x6A,
+     0x28, 0xA0, 0xDB, 0x25, 0x74, 0x1B, 0x5B, 0x34,
+     0xA8, 0x28, 0x00, 0x8B, 0x22, 0xAC, 0xC2, 0x3F,
+     0x92, 0x4F, 0xAA, 0xFB, 0xD4, 0xD3, 0x3F, 0x81,
+     0xEA, 0x66, 0x95, 0x6D, 0xFE, 0xAA, 0x2B, 0xFD,
+     0xFC, 0xF5}
+};
+
+struct TestSignVector
+{
+    const char *name;
+    const char *data;
+    uint8_t signature[132];
+};
+
+// Test vectors from RFC 6979, Appendix A.2.7.
+static TestSignVector const testVectorP521_1 PROGMEM = {
+    // P-521 test case with SHA-256 and message "sample".
+    "P-521 #1",
+    "sample",
+    {0x01, 0x51, 0x1B, 0xB4, 0xD6, 0x75, 0x11, 0x4F,        // r
+     0xE2, 0x66, 0xFC, 0x43, 0x72, 0xB8, 0x76, 0x82,
+     0xBA, 0xEC, 0xC0, 0x1D, 0x3C, 0xC6, 0x2C, 0xF2,
+     0x30, 0x3C, 0x92, 0xB3, 0x52, 0x60, 0x12, 0x65,
+     0x9D, 0x16, 0x87, 0x6E, 0x25, 0xC7, 0xC1, 0xE5,
+     0x76, 0x48, 0xF2, 0x3B, 0x73, 0x56, 0x4D, 0x67,
+     0xF6, 0x1C, 0x6F, 0x14, 0xD5, 0x27, 0xD5, 0x49,
+     0x72, 0x81, 0x04, 0x21, 0xE7, 0xD8, 0x75, 0x89,
+     0xE1, 0xA7,
+     0x00, 0x4A, 0x17, 0x11, 0x43, 0xA8, 0x31, 0x63,        // s
+     0xD6, 0xDF, 0x46, 0x0A, 0xAF, 0x61, 0x52, 0x26,
+     0x95, 0xF2, 0x07, 0xA5, 0x8B, 0x95, 0xC0, 0x64,
+     0x4D, 0x87, 0xE5, 0x2A, 0xA1, 0xA3, 0x47, 0x91,
+     0x6E, 0x4F, 0x7A, 0x72, 0x93, 0x0B, 0x1B, 0xC0,
+     0x6D, 0xBE, 0x22, 0xCE, 0x3F, 0x58, 0x26, 0x4A,
+     0xFD, 0x23, 0x70, 0x4C, 0xBB, 0x63, 0xB2, 0x9B,
+     0x93, 0x1F, 0x7D, 0xE6, 0xC9, 0xD9, 0x49, 0xA7,
+     0xEC, 0xFC}
+};
+static TestSignVector const testVectorP521_2 PROGMEM = {
+    // P-521 test case with SHA-512 and message "sample".
+    "P-521 #2",
+    "sample",
+    {0x00, 0xC3, 0x28, 0xFA, 0xFC, 0xBD, 0x79, 0xDD,        // r
+     0x77, 0x85, 0x03, 0x70, 0xC4, 0x63, 0x25, 0xD9,
+     0x87, 0xCB, 0x52, 0x55, 0x69, 0xFB, 0x63, 0xC5,
+     0xD3, 0xBC, 0x53, 0x95, 0x0E, 0x6D, 0x4C, 0x5F,
+     0x17, 0x4E, 0x25, 0xA1, 0xEE, 0x90, 0x17, 0xB5,
+     0xD4, 0x50, 0x60, 0x6A, 0xDD, 0x15, 0x2B, 0x53,
+     0x49, 0x31, 0xD7, 0xD4, 0xE8, 0x45, 0x5C, 0xC9,
+     0x1F, 0x9B, 0x15, 0xBF, 0x05, 0xEC, 0x36, 0xE3,
+     0x77, 0xFA,
+     0x00, 0x61, 0x7C, 0xCE, 0x7C, 0xF5, 0x06, 0x48,        // s
+     0x06, 0xC4, 0x67, 0xF6, 0x78, 0xD3, 0xB4, 0x08,
+     0x0D, 0x6F, 0x1C, 0xC5, 0x0A, 0xF2, 0x6C, 0xA2,
+     0x09, 0x41, 0x73, 0x08, 0x28, 0x1B, 0x68, 0xAF,
+     0x28, 0x26, 0x23, 0xEA, 0xA6, 0x3E, 0x5B, 0x5C,
+     0x07, 0x23, 0xD8, 0xB8, 0xC3, 0x7F, 0xF0, 0x77,
+     0x7B, 0x1A, 0x20, 0xF8, 0xCC, 0xB1, 0xDC, 0xCC,
+     0x43, 0x99, 0x7F, 0x1E, 0xE0, 0xE4, 0x4D, 0xA4,
+     0xA6, 0x7A}
+};
+static TestSignVector const testVectorP521_3 PROGMEM = {
+    // P-521 test case with SHA-256 and message "test".
+    "P-521 #3",
+    "test",
+    {0x00, 0x0E, 0x87, 0x1C, 0x4A, 0x14, 0xF9, 0x93,        // r
+     0xC6, 0xC7, 0x36, 0x95, 0x01, 0x90, 0x0C, 0x4B,
+     0xC1, 0xE9, 0xC7, 0xB0, 0xB4, 0xBA, 0x44, 0xE0,
+     0x48, 0x68, 0xB3, 0x0B, 0x41, 0xD8, 0x07, 0x10,
+     0x42, 0xEB, 0x28, 0xC4, 0xC2, 0x50, 0x41, 0x1D,
+     0x0C, 0xE0, 0x8C, 0xD1, 0x97, 0xE4, 0x18, 0x8E,
+     0xA4, 0x87, 0x6F, 0x27, 0x9F, 0x90, 0xB3, 0xD8,
+     0xD7, 0x4A, 0x3C, 0x76, 0xE6, 0xF1, 0xE4, 0x65,
+     0x6A, 0xA8,
+     0x00, 0xCD, 0x52, 0xDB, 0xAA, 0x33, 0xB0, 0x63,        // s
+     0xC3, 0xA6, 0xCD, 0x80, 0x58, 0xA1, 0xFB, 0x0A,
+     0x46, 0xA4, 0x75, 0x4B, 0x03, 0x4F, 0xCC, 0x64,
+     0x47, 0x66, 0xCA, 0x14, 0xDA, 0x8C, 0xA5, 0xCA,
+     0x9F, 0xDE, 0x00, 0xE8, 0x8C, 0x1A, 0xD6, 0x0C,
+     0xCB, 0xA7, 0x59, 0x02, 0x52, 0x99, 0x07, 0x9D,
+     0x7A, 0x42, 0x7E, 0xC3, 0xCC, 0x5B, 0x61, 0x9B,
+     0xFB, 0xC8, 0x28, 0xE7, 0x76, 0x9B, 0xCD, 0x69,
+     0x4E, 0x86}
+};
+static TestSignVector const testVectorP521_4 PROGMEM = {
+    // P-521 test case with SHA-512 and message "test".
+    "P-521 #4",
+    "test",
+    {0x01, 0x3E, 0x99, 0x02, 0x0A, 0xBF, 0x5C, 0xEE,        // r
+     0x75, 0x25, 0xD1, 0x6B, 0x69, 0xB2, 0x29, 0x65,
+     0x2A, 0xB6, 0xBD, 0xF2, 0xAF, 0xFC, 0xAE, 0xF3,
+     0x87, 0x73, 0xB4, 0xB7, 0xD0, 0x87, 0x25, 0xF1,
+     0x0C, 0xDB, 0x93, 0x48, 0x2F, 0xDC, 0xC5, 0x4E,
+     0xDC, 0xEE, 0x91, 0xEC, 0xA4, 0x16, 0x6B, 0x2A,
+     0x7C, 0x62, 0x65, 0xEF, 0x0C, 0xE2, 0xBD, 0x70,
+     0x51, 0xB7, 0xCE, 0xF9, 0x45, 0xBA, 0xBD, 0x47,
+     0xEE, 0x6D,
+     0x01, 0xFB, 0xD0, 0x01, 0x3C, 0x67, 0x4A, 0xA7,        // s
+     0x9C, 0xB3, 0x98, 0x49, 0x52, 0x79, 0x16, 0xCE,
+     0x30, 0x1C, 0x66, 0xEA, 0x7C, 0xE8, 0xB8, 0x06,
+     0x82, 0x78, 0x6A, 0xD6, 0x0F, 0x98, 0xF7, 0xE7,
+     0x8A, 0x19, 0xCA, 0x69, 0xEF, 0xF5, 0xC5, 0x74,
+     0x00, 0xE3, 0xB3, 0xA0, 0xAD, 0x66, 0xCE, 0x09,
+     0x78, 0x21, 0x4D, 0x13, 0xBA, 0xF4, 0xE9, 0xAC,
+     0x60, 0x75, 0x2F, 0x7B, 0x15, 0x5E, 0x2D, 0xE4,
+     0xDC, 0xE3}
+};
+
+void testSignCommon(const struct TestSignVector *_test, Hash *hash)
+{
+    uint8_t *privateKey = alice_f;
+    uint8_t *publicKey = alice_k;
+    uint8_t *sig = bob_k;
+    static TestSignVector test;
+
+    memcpy_P(&test, _test, sizeof(test));
+
+    Serial.print(test.name);
+    Serial.print(" Sign ... ");
+    Serial.flush();
+
+    memcpy_P(privateKey, testKeyP521.privateKey, 66);
+
+    unsigned long start = micros();
+    P521::sign(sig, privateKey, test.data, strlen(test.data), hash);
+    unsigned long elapsed = micros() - start;
+    Serial.print(elapsed);
+    Serial.print(" us ... ");
+
+    bool ok = !memcmp(sig, test.signature, 132);
+    if (ok) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", sig, 132);
+        printNumber("expected", test.signature, 132);
+    }
+
+    Serial.print(test.name);
+    Serial.print(" Verify ... ");
+    Serial.flush();
+
+    memcpy_P(publicKey, testKeyP521.publicKey, 132);
+
+    start = micros();
+    bool verified = P521::verify
+        (test.signature, publicKey, test.data, strlen(test.data), hash);
+    elapsed = micros() - start;
+    Serial.print(elapsed);
+    Serial.print(" us ... ");
+
+    if (verified)
+        Serial.println("ok");
+    else
+        Serial.println("failed");
+
+    Serial.print(test.name);
+    Serial.print(" Derive Public Key ... ");
+    Serial.flush();
+
+    memcpy_P(privateKey, testKeyP521.privateKey, 66);
+
+    start = micros();
+    P521::derivePublicKey(publicKey, privateKey);
+    elapsed = micros() - start;
+    Serial.print(elapsed);
+    Serial.print(" us ... ");
+
+    ok = !P521_memcmp_P(publicKey, testKeyP521.publicKey, 132);
+    if (ok) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", publicKey, 132);
+        memcpy_P(publicKey, testKeyP521.publicKey, 132);
+        printNumber("expected", publicKey, 132);
+    }
+}
+
+void testSignSHA256(const struct TestSignVector *test)
+{
+    SHA256 hash;
+    testSignCommon(test, &hash);
+}
+
+void testSignSHA512(const struct TestSignVector *test)
+{
+    SHA512 hash;
+    testSignCommon(test, &hash);
+}
+
+void testSign()
+{
+    Serial.println("Digital signatures:");
+    testSignSHA256(&testVectorP521_1);
+    testSignSHA512(&testVectorP521_2);
+    testSignSHA256(&testVectorP521_3);
+    testSignSHA512(&testVectorP521_4);
+}
+
+void setup()
+{
+    Serial.begin(9600);
+
+    // Start the random number generator.  We don't initialise a noise
+    // source here because we don't need one for testing purposes.
+    // Real DH applications should of course use a proper noise source.
+    RNG.begin("TestP521 1.0", 950);
+
+    // Perform the tests.
+    testEval();
+    Serial.println();
+    testDH();
+    Serial.println();
+    testSign();
+    Serial.println();
+}
+
+void loop()
+{
+}
diff --git a/libraries/Crypto/examples/TestP521Math/TestP521Math.ino b/libraries/Crypto/examples/TestP521Math/TestP521Math.ino
new file mode 100644
index 00000000..6997191f
--- /dev/null
+++ b/libraries/Crypto/examples/TestP521Math/TestP521Math.ino
@@ -0,0 +1,612 @@
+/*
+ * Copyright (C) 2016 Southern Storm Software, Pty Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+This example runs tests on the P521 field mathematics independent
+of the full curve operation itself.
+*/
+
+// Enable access to the internals of P521 to test the raw field ops.
+#define TEST_P521_FIELD_OPS 1
+
+#include <Crypto.h>
+#include <P521.h>
+#include <utility/ProgMemUtil.h>
+#include <string.h>
+
+// Copy some definitions from the P521 class for convenience.
+#define NUM_LIMBS   ((66 + sizeof(limb_t) - 1) / sizeof(limb_t))
+#define LIMB_BITS   (8 * sizeof(limb_t))
+#define INVERSE_LIMB (~((limb_t)0))
+
+// For simpleMod() below we need a type that is 4 times the size of limb_t.
+#if BIGNUMBER_LIMB_8BIT
+#define qlimb_t uint32_t
+#elif BIGNUMBER_LIMB_16BIT
+#define qlimb_t uint64_t
+#else
+#define BIGNUMBER_NO_QLIMB 1
+#endif
+
+limb_t arg1[NUM_LIMBS];
+limb_t arg2[NUM_LIMBS];
+limb_t result[NUM_LIMBS];
+limb_t result2[NUM_LIMBS * 2 + 1];
+limb_t temp[NUM_LIMBS];
+
+// Convert a decimal string in program memory into a number.
+void fromString(limb_t *x, uint8_t size, const char *str)
+{
+    uint8_t ch, posn;
+    memset(x, 0, sizeof(limb_t) * size);
+    while ((ch = pgm_read_byte((uint8_t *)str)) != '\0') {
+        if (ch >= '0' && ch <= '9') {
+            // Quick and simple method to multiply by 10 and add the new digit.
+            dlimb_t carry = ch - '0';
+            for (posn = 0; posn < size; ++posn) {
+                carry += ((dlimb_t)x[posn]) * 10U;
+                x[posn] = (limb_t)carry;
+                carry >>= LIMB_BITS;
+            }
+        }
+        ++str;
+    }
+}
+
+// Compare two numbers of NUM_LIMBS in length.  Returns -1, 0, or 1.
+int compare(const limb_t *x, const limb_t *y)
+{
+    for (uint8_t posn = NUM_LIMBS; posn > 0; --posn) {
+        limb_t a = x[posn - 1];
+        limb_t b = y[posn - 1];
+        if (a < b)
+            return -1;
+        else if (a > b)
+            return 1;
+    }
+    return 0;
+}
+
+// Compare two numbers where one is a decimal string.  Returns -1, 0, or 1.
+int compare(const limb_t *x, const char *y)
+{
+    limb_t val[NUM_LIMBS];
+    fromString(val, NUM_LIMBS, y);
+    return compare(x, val);
+}
+
+void printNumber(const char *name, const limb_t *x)
+{
+    static const char hexchars[] = "0123456789ABCDEF";
+    Serial.print(name);
+    Serial.print(" = ");
+    for (uint8_t posn = NUM_LIMBS; posn > 0; --posn) {
+        for (uint8_t bit = LIMB_BITS; bit > 0; ) {
+            bit -= 4;
+            Serial.print(hexchars[(x[posn - 1] >> bit) & 0x0F]);
+        }
+        Serial.print(' ');
+    }
+    Serial.println();
+}
+
+// Standard numbers that are useful in field operation tests.
+char const num_0[] PROGMEM = "0";
+char const num_1[] PROGMEM = "1";
+char const num_2[] PROGMEM = "2";
+char const num_4[] PROGMEM = "4";
+char const num_5[] PROGMEM = "5";
+char const num_128[] PROGMEM = "128";
+char const num_256[] PROGMEM = "256";
+char const num_2_64_m7[] PROGMEM = "18446744073709551609"; // 2^64 - 7
+char const num_2_129_m5[] PROGMEM = "680564733841876926926749214863536422907"; // 2^129 - 5
+char const num_pi[] PROGMEM = "31415926535897932384626433832795028841971693993751058209749445923078164062862"; // 77 digits of pi
+char const num_2_255_m253[] PROGMEM = "57896044618658097711785492504343953926634992332820282019728792003956564819715"; // 2^255 - 253
+char const num_2_255_m20[] PROGMEM = "57896044618658097711785492504343953926634992332820282019728792003956564819948"; // 2^255 - 20
+char const num_2_255_m19[] PROGMEM = "57896044618658097711785492504343953926634992332820282019728792003956564819949"; // 2^255 - 19
+char const num_2_255_m19_x2[] PROGMEM = "115792089237316195423570985008687907853269984665640564039457584007913129639898"; // (2^255 - 19) * 2
+char const num_2_521_m1[] PROGMEM = "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151"; // 2^521 - 1
+char const num_2_521_m2[] PROGMEM = "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057150"; // 2^521 - 2
+char const num_2_521_m20[] PROGMEM = "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057132"; // 2^521 - 20
+char const num_pi_big[] PROGMEM = "3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481117"; // 157 digits of pi
+
+// Table of useful numbers less than 2^521 - 1.
+const char * const numbers[] = {
+    num_0,
+    num_1,
+    num_2,
+    num_4,
+    num_5,
+    num_128,
+    num_256,
+    num_2_64_m7,
+    num_2_129_m5,
+    num_pi,
+    num_2_255_m253,
+    num_2_255_m20,
+    num_2_255_m19_x2,
+    num_pi_big,
+    num_2_521_m2,
+    num_2_521_m20,
+    0
+};
+#define numbers_count   ((sizeof(numbers) / sizeof(numbers[0])) - 1)
+
+#define foreach_number(var) \
+    const char *var = numbers[0]; \
+    for (unsigned index##var = 0; index##var < numbers_count; \
+         ++index##var, var = numbers[index##var])
+
+void printProgMem(const char *str)
+{
+    uint8_t ch;
+    while ((ch = pgm_read_byte((uint8_t *)str)) != '\0') {
+        Serial.print((char)ch);
+        ++str;
+    }
+}
+
+// Simple implementation of modular addition to cross-check the library.
+void simpleAdd(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    uint8_t posn;
+    dlimb_t carry = 0;
+    for (posn = 0; posn < NUM_LIMBS; ++posn) {
+        carry += x[posn];
+        carry += y[posn];
+        result[posn] = (limb_t)carry;
+        carry >>= LIMB_BITS;
+    }
+    if (compare(result, num_2_521_m1) >= 0) {
+        // Subtract 2^521 - 1 to get the final result.
+        // Same as add 1 and then subtract 2^521.
+        carry = 1;
+        for (posn = 0; posn < NUM_LIMBS; ++posn) {
+            carry += result[posn];
+            result[posn] = (limb_t)carry;
+            carry >>= LIMB_BITS;
+        }
+#if BIGNUMBER_LIMB_8BIT
+        result[NUM_LIMBS - 1] &= 0x01;
+#else
+        result[NUM_LIMBS - 1] &= 0x1FF;
+#endif
+    }
+}
+
+// Simple implementation of subtraction to cross-check the library.
+// Note: this does not reduce the result modulo 2^521 - 1 and we
+// assume that x is greater than or equal to y.
+void simpleSub(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    uint8_t posn;
+    dlimb_t borrow = 0;
+    for (posn = 0; posn < NUM_LIMBS; ++posn) {
+        borrow = ((dlimb_t)x[posn]) - y[posn] - borrow;
+        result[posn] = (limb_t)borrow;
+        borrow = (borrow >> LIMB_BITS) != 0;
+    }
+}
+
+// Simple implementation of multiplication to cross-check the library.
+// Note: this does not reduce the result modulo 2^521 - 1.
+// The "result" buffer must contain at least NUM_LIMBS * 2 limbs.
+void simpleMul(limb_t *result, const limb_t *x, const limb_t *y)
+{
+    memset(result, 0, NUM_LIMBS * 2 * sizeof(limb_t));
+    for (uint8_t i = 0; i < NUM_LIMBS; ++i) {
+        for (uint8_t j = 0; j < NUM_LIMBS; ++j) {
+            uint8_t n = i + j;
+            dlimb_t carry =
+                ((dlimb_t)x[i]) * y[j] + result[n];
+            result[n] = (limb_t)carry;
+            carry >>= LIMB_BITS;
+            ++n;
+            while (carry != 0 && n < (NUM_LIMBS * 2)) {
+                carry += result[n];
+                result[n] = (limb_t)carry;
+                carry >>= LIMB_BITS;
+                ++n;
+            }
+        }
+    }
+}
+
+#if defined(BIGNUMBER_NO_QLIMB)
+
+// Quick check to correct the estimate on a quotient word.
+static inline limb_t correctEstimate
+    (limb_t q, limb_t y1, limb_t y2, dlimb_t x01, limb_t x2)
+{
+    // Algorithm D from section 4.3.1 of "The Art Of Computer Programming",
+    // D. Knuth, Volume 2, "Seminumerical Algorithms", Second Edition, 1981.
+    //
+    // We want to check if (y2 * q) > ((x01 - y1 * q) * b + x2) where
+    // b is (1 << LIMB_BITS).  If it is, then q must be reduced by 1.
+    //
+    // One wrinkle that isn't obvious from Knuth's description is that it
+    // is possible for (x01 - y1 * q) >= b, especially in the case where
+    // x0 = y1 and q = b - 1.  This will cause an overflow of the intermediate
+    // double-word result ((x01 - y1 * q) * b).
+    //
+    // In assembly language, we could use the carry flag to detect when
+    // (x01 - y1 * q) * b overflows, but we can't access the carry flag
+    // in C++.  So we have to account for the carry in a different way here.
+
+    // Calculate the remainder using the estimated quotient.
+    dlimb_t r = x01 - ((dlimb_t)y1) * q;
+
+    // If there will be a double-word carry when we calculate (r * b),
+    // then (y2 * q) is obviously going to be less than (r * b), so we
+    // can stop here.  The estimated quotient is correct.
+    if (r & (((dlimb_t)INVERSE_LIMB) << LIMB_BITS))
+        return q;
+
+    // Bail out if (y2 * q) <= (r * b + x2).  The estimate is correct.
+    dlimb_t y2q = ((dlimb_t)y2) * q;
+    if (y2q <= ((r << LIMB_BITS) + x2))
+        return q;
+
+    // Correct for the estimated quotient being off by 1.
+    --q;
+
+    // Now repeat the check to correct for q values that are off by 2.
+    r += y1;    // r' = (x01 - y1 * (q - 1)) = (x01 - y1 * q + y2) = r + y1
+    if (r & (((dlimb_t)INVERSE_LIMB) << LIMB_BITS))
+        return q;
+    // y2q' = (y2 * (q - 1)) = (y2 * q - y2) = y2q - y2
+    if ((y2q - y2) <= ((r << LIMB_BITS) + x2))
+        return q;
+
+    // Perform the final correction for q values that are off by 2.
+    return q - 1;
+}
+
+#endif
+
+// Shift a big number left by a number of bits.
+void shiftLeft(limb_t *x, size_t size, uint8_t shift)
+{
+    dlimb_t carry = 0;
+    while (size > 0) {
+        carry += ((dlimb_t)(*x)) << shift;
+        *x++ = (limb_t)carry;
+        carry >>= LIMB_BITS;
+        --size;
+    }
+}
+
+// Shift a big number right by a number of bits.
+void shiftRight(limb_t *x, size_t size, uint8_t shift)
+{
+    limb_t carry = 0;
+    limb_t word;
+    while (size > 0) {
+        --size;
+        word = x[size];
+        x[size] = (word >> shift) | carry;
+        carry = (word << (LIMB_BITS - shift));
+    }
+}
+
+// Simple implementation of modular division to cross-check the library.
+// Calling this "simple" is a bit of a misnomer.  It is a full implementation
+// of Algorithm D from section 4.3.1 of "The Art Of Computer Programming",
+// D. Knuth, Volume 2, "Seminumerical Algorithms", Second Edition, 1981.
+// This is quite slow on embedded platforms, but it should be correct.
+// Note: "x" is assumed to be (NUM_LIMBS * 2 + 1) limbs in size because
+// we need a limb for the extra leading zero word added by step D1.
+void simpleMod(limb_t *x)
+{
+    limb_t divisor[NUM_LIMBS];
+    uint8_t j, k, shift;
+
+    // Step D1. Normalize.
+    // The divisor (2^521 - 1) and "x" need to be shifted left until
+    // the top-most bit of the divisor is 1.
+#if BIGNUMBER_LIMB_8BIT
+    shift = 7;
+#else
+    shift = LIMB_BITS - 9;
+#endif
+    fromString(divisor, NUM_LIMBS, num_2_521_m1);
+    shiftLeft(divisor, NUM_LIMBS, shift);
+    x[NUM_LIMBS * 2] = 0;
+    shiftLeft(x, NUM_LIMBS * 2 + 1, shift);
+
+    // Step D2/D7. Loop on j
+    for (j = 0; j <= NUM_LIMBS; ++j) {
+        // Step D3. Calculate an estimate of the top-most quotient word.
+        limb_t *u = x + NUM_LIMBS * 2 - 2 - j;
+        limb_t *v = divisor + NUM_LIMBS - 2;
+        limb_t q;
+        dlimb_t uword = ((((dlimb_t)u[2]) << LIMB_BITS) + u[1]);
+        if (u[2] == v[1])
+            q = ~((limb_t)0);
+        else
+            q = (limb_t)(uword / v[1]);
+
+        // Step D3, part 2.  Correct the estimate downwards by 1 or 2.
+        // One subtlety of Knuth's algorithm is that it looks like the test
+        // is working with double-word quantities but it is actually using
+        // double-word plus a carry bit.  So we need to use qlimb_t for this.
+#if !defined(BIGNUMBER_NO_QLIMB)
+        qlimb_t test = ((((qlimb_t)uword) - ((dlimb_t)q) * v[1]) << LIMB_BITS) + u[0];
+        if ((((dlimb_t)q) * v[0]) > test) {
+            --q;
+            test = ((((qlimb_t)uword) - ((dlimb_t)q) * v[1]) << LIMB_BITS) + u[0];
+            if ((((dlimb_t)q) * v[0]) > test)
+                --q;
+        }
+#else
+        // 32-bit platform - we don't have a 128-bit numeric type so we have
+        // to calculate the estimate in another way to preserve the carry bit.
+        q = correctEstimate(q, v[0], v[1], uword, u[0]);
+#endif
+
+        // Step D4. Multiply and subtract.
+        u = x + (NUM_LIMBS - j);
+        v = divisor;
+        dlimb_t carry = 0;
+        dlimb_t borrow = 0;
+        for (k = 0; k < NUM_LIMBS; ++k) {
+            carry += ((dlimb_t)v[k]) * q;
+            borrow = ((dlimb_t)u[k]) - ((limb_t)carry) - borrow;
+            u[k] = (dlimb_t)borrow;
+            carry >>= LIMB_BITS;
+            borrow = ((borrow >> LIMB_BITS) != 0);
+        }
+        borrow = ((dlimb_t)u[k]) - ((limb_t)carry) - borrow;
+        u[k] = (dlimb_t)borrow;
+
+        // Step D5. Test remainder.  Nothing further to do if no borrow.
+        if ((borrow >> LIMB_BITS) == 0)
+            continue;
+
+        // Step D6. Borrow occurred: add back.
+        carry = 0;
+        for (k = 0; k < NUM_LIMBS; ++k) {
+            carry += u[k];
+            carry += v[k];
+            u[k] = (limb_t)carry;
+            carry >>= LIMB_BITS;
+        }
+        u[k] += (limb_t)carry;
+    }
+
+    // Step D8. Unnormalize.
+    // Shift the remainder right to undo the earlier left shift.
+    shiftRight(x, NUM_LIMBS, shift);
+}
+
+void testAdd(const char *x, const char *y)
+{
+    printProgMem(x);
+    Serial.print(" + ");
+    printProgMem(y);
+    Serial.print(": ");
+    Serial.flush();
+
+    fromString(arg1, NUM_LIMBS, x);
+    fromString(arg2, NUM_LIMBS, y);
+    P521::add(result, arg1, arg2);
+
+    simpleAdd(result2, arg1, arg2);
+
+    if (compare(result, result2) == 0) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result);
+        printNumber("expected", result2);
+    }
+}
+
+void testAdd()
+{
+    Serial.println("Addition:");
+    foreach_number (x) {
+        foreach_number (y) {
+            testAdd(x, y);
+        }
+    }
+    Serial.println();
+}
+
+void testSub(const char *x, const char *y)
+{
+    printProgMem(x);
+    Serial.print(" - ");
+    printProgMem(y);
+    Serial.print(": ");
+    Serial.flush();
+
+    fromString(arg1, NUM_LIMBS, x);
+    fromString(arg2, NUM_LIMBS, y);
+    P521::sub(result, arg1, arg2);
+
+    if (compare(arg1, arg2) >= 0) {
+        // First argument is larger than the second.
+        simpleSub(result2, arg1, arg2);
+    } else {
+        // First argument is smaller than the second.
+        // Compute arg1 + (2^521 - 1 - arg2).
+        fromString(temp, NUM_LIMBS, num_2_521_m1);
+        simpleSub(result2, temp, arg2);
+        simpleAdd(result2, arg1, result2);
+    }
+
+    if (compare(result, result2) == 0) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result);
+        printNumber("expected", result2);
+    }
+}
+
+void testSub()
+{
+    Serial.println("Subtraction:");
+    foreach_number (x) {
+        foreach_number (y) {
+            testSub(x, y);
+        }
+    }
+    Serial.println();
+}
+
+void testMul(const char *x, const char *y)
+{
+    printProgMem(x);
+    Serial.print(" * ");
+    printProgMem(y);
+    Serial.print(": ");
+    Serial.flush();
+
+    fromString(arg1, NUM_LIMBS, x);
+    fromString(arg2, NUM_LIMBS, y);
+
+    if (compare(arg1, arg2) != 0)
+        P521::mul(result, arg1, arg2);
+    else
+        P521::square(result, arg1);
+
+    simpleMul(result2, arg1, arg2);
+    simpleMod(result2);
+
+    if (compare(result, result2) == 0) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual  ", result);
+        printNumber("expected", result2);
+    }
+}
+
+void testMul()
+{
+    Serial.println("Multiplication:");
+    foreach_number (x) {
+        foreach_number (y) {
+            testMul(x, y);
+        }
+    }
+    Serial.println();
+}
+
+void testMove(const char *x, const char *y, uint8_t select)
+{
+    printProgMem(x);
+    Serial.print(" <- ");
+    printProgMem(y);
+    Serial.print(": ");
+    Serial.flush();
+
+    fromString(arg1, NUM_LIMBS, x);
+    fromString(arg2, NUM_LIMBS, y);
+
+    memcpy(result, arg1, NUM_LIMBS * sizeof(limb_t));
+    memcpy(result2, arg2, NUM_LIMBS * sizeof(limb_t));
+
+    // Perform the move using the selection bit.
+    P521::cmove(select, result, result2);
+    bool ok = compare(result, arg2) == 0;
+
+    // Reset and test not moving.
+    memcpy(result, arg1, NUM_LIMBS * sizeof(limb_t));
+    memcpy(result2, arg2, NUM_LIMBS * sizeof(limb_t));
+    P521::cmove(0, result, result2);
+    if (ok)
+        ok = compare(result, arg1) == 0;
+
+    if (ok) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+    }
+}
+
+void testMove()
+{
+    Serial.println("Move:");
+    uint8_t bit = 0;
+    foreach_number (x) {
+        foreach_number (y) {
+            testMove(x, y, ((uint8_t)1) << bit);
+            bit = (bit + 1) % 8;
+        }
+    }
+    Serial.println();
+}
+
+void testRecip(const char *x)
+{
+    printProgMem(x);
+    Serial.print("^-1");
+    Serial.print(": ");
+    Serial.flush();
+
+    fromString(arg1, NUM_LIMBS, x);
+    P521::recip(result, arg1);
+
+    bool ok;
+    if (compare(arg1, num_0) == 0) {
+        // 0^-1 = 0
+        ok = (compare(result, num_0) == 0);
+    } else {
+        // Multiply the result with arg1 - we expect 1 as the result.
+        P521::mul(result2, result, arg1);
+        ok = (compare(result2, num_1) == 0);
+    }
+
+    if (ok) {
+        Serial.println("ok");
+    } else {
+        Serial.println("failed");
+        printNumber("actual", result);
+    }
+}
+
+void testRecip()
+{
+    Serial.println("Reciprocal:");
+    foreach_number (x) {
+        testRecip(x);
+    }
+    Serial.println();
+}
+
+void setup()
+{
+    Serial.begin(9600);
+
+    testAdd();
+    testSub();
+    testMul();
+    testMove();
+    testRecip();
+}
+
+void loop()
+{
+}
diff --git a/libraries/Crypto/utility/LimbUtil.h b/libraries/Crypto/utility/LimbUtil.h
index 410b0f6b..bdbaca0a 100644
--- a/libraries/Crypto/utility/LimbUtil.h
+++ b/libraries/Crypto/utility/LimbUtil.h
@@ -26,9 +26,11 @@
 #include "ProgMemUtil.h"
 
 // Number of limbs in a big number value of various sizes.
-#define NUM_LIMBS_128BIT (16 / sizeof(limb_t))
-#define NUM_LIMBS_256BIT (32 / sizeof(limb_t))
-#define NUM_LIMBS_512BIT (64 / sizeof(limb_t))
+#define NUM_LIMBS_BITS(n) \
+    (((n) + sizeof(limb_t) * 8 - 1) / (8 * sizeof(limb_t)))
+#define NUM_LIMBS_128BIT NUM_LIMBS_BITS(128)
+#define NUM_LIMBS_256BIT NUM_LIMBS_BITS(256)
+#define NUM_LIMBS_512BIT NUM_LIMBS_BITS(512)
 
 // The number of bits in a limb.
 #define LIMB_BITS   (8 * sizeof(limb_t))