1
0
mirror of https://github.com/taigrr/arduinolibs synced 2025-01-18 04:33:12 -08:00

Reduce the object state size for Poly1305

This commit is contained in:
Rhys Weatherley 2016-01-16 08:57:46 +10:00
parent b852d222b4
commit 95313613b7
3 changed files with 21 additions and 19 deletions

View File

@ -85,7 +85,7 @@ Arduino Mega 2560 running at 16 MHz are similar:
<tr><td>SpeckLowMemory (256-bit key, ECB mode)</td><td align="right">37.87us</td><td align="right"> </td><td align="right">16.89us</td><td align="right">35</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">41.23us</td><td align="right">41.23us</td><td align="right">902.55us</td><td align="right">255</td></tr>
<tr><td>ChaChaPoly</td><td align="right">41.20us</td><td align="right">41.19us</td><td align="right">902.36us</td><td align="right">221</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">183.25us</td><td align="right">182.80us</td><td align="right">1272.73us</td><td align="right">284</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">189.92us</td><td align="right">189.47us</td><td align="right">1492.60us</td><td align="right">316</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">196.59us</td><td align="right">196.13us</td><td align="right">1767.33us</td><td align="right">348</td></tr>
@ -106,7 +106,7 @@ Arduino Mega 2560 running at 16 MHz are similar:
<tr><td>SHA1 (HMAC mode)</td><td align="right">21.90us</td><td align="right">4296.33us</td><td align="right">1420.24us</td><td align="right">95</td></tr>
<tr><td>SHA256 (HMAC mode)</td><td align="right">43.85us</td><td align="right">8552.61us</td><td align="right">2836.49us</td><td align="right">107</td></tr>
<tr><td>BLAKE2s (HMAC mode)</td><td align="right">20.65us</td><td align="right">4055.56us</td><td align="right">1350.00us</td><td align="right">107</td></tr>
<tr><td>Poly1305</td><td align="right">26.29us</td><td align="right">486.15us</td><td align="right">17.26us</td><td align="right">87</td></tr>
<tr><td>Poly1305</td><td align="right">26.26us</td><td align="right">489.11us</td><td align="right">17.06us</td><td align="right">53</td></tr>
<tr><td>GHASH</td><td align="right">148.14us</td><td align="right">17.09us</td><td align="right">21.87us</td><td align="right">33</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Public Key Operation</td><td align="right">Time (per operation)</td><td colspan="3">Comment</td></tr>
@ -142,7 +142,7 @@ All figures are for the Arduino Due running at 84 MHz:
<tr><td>SpeckLowMemory (256-bit key, ECB mode)</td><td align="right">2.90us</td><td align="right"> </td><td align="right">1.83us</td><td align="right">48</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>AEAD Algorithm</td><td align="right">Encryption (per byte)</td><td align="right">Decryption (per byte)</td><td>Key Setup</td><td>State Size (bytes)</td></tr>
<tr><td>ChaChaPoly</td><td align="right">1.66us</td><td align="right">1.66us</td><td align="right">45.02us</td><td align="right">280</td></tr>
<tr><td>ChaChaPoly</td><td align="right">1.71us</td><td align="right">1.71us</td><td align="right">45.08us</td><td align="right">240</td></tr>
<tr><td>GCM&lt;AES128&gt;</td><td align="right">10.29us</td><td align="right">10.29us</td><td align="right">223.82us</td><td align="right">312</td></tr>
<tr><td>GCM&lt;AES192&gt;</td><td align="right">11.50us</td><td align="right">11.51us</td><td align="right">265.62us</td><td align="right">344</td></tr>
<tr><td>GCM&lt;AES256&gt;</td><td align="right">12.67us</td><td align="right">12.67us</td><td align="right">313.06us</td><td align="right">376</td></tr>
@ -163,7 +163,7 @@ All figures are for the Arduino Due running at 84 MHz:
<tr><td>SHA1 (HMAC mode)</td><td align="right">0.94us</td><td align="right">193.92us</td><td align="right">65.09us</td><td align="right">112</td></tr>
<tr><td>SHA256 (HMAC mode)</td><td align="right">1.15us</td><td align="right">238.98us</td><td align="right">80.44us</td><td align="right">120</td></tr>
<tr><td>BLAKE2s (HMAC mode)</td><td align="right">0.72us</td><td align="right">157.75us</td><td align="right">57.18us</td><td align="right">120</td></tr>
<tr><td>Poly1305</td><td align="right">0.85us</td><td align="right">19.25us</td><td align="right">2.35us</td><td align="right">96</td></tr>
<tr><td>Poly1305</td><td align="right">0.81us</td><td align="right">19.01us</td><td align="right">2.57us</td><td align="right">60</td></tr>
<tr><td>GHASH</td><td align="right">4.37us</td><td align="right">1.50us</td><td align="right">4.37us</td><td align="right">36</td></tr>
<tr><td colspan="5"> </td></tr>
<tr><td>Public Key Operation</td><td align="right">Time (per operation)</td><td colspan="3">Comment</td></tr>

View File

@ -180,6 +180,7 @@ void Poly1305::finalize(const void *nonce, void *token, size_t len)
{
dlimb_t carry;
uint8_t i;
limb_t t[NUM_LIMBS_256BIT + 1];
// Pad and flush the final chunk.
if (state.chunkSize > 0) {
@ -211,7 +212,7 @@ void Poly1305::finalize(const void *nonce, void *token, size_t len)
carry = 5;
for (i = 0; i < NUM_LIMBS_130BIT; ++i) {
carry += state.h[i];
state.t[i] = (limb_t)carry;
t[i] = (limb_t)carry;
carry >>= LIMB_BITS;
}
@ -221,10 +222,10 @@ void Poly1305::finalize(const void *nonce, void *token, size_t len)
// of the result because we are about to drop it in the next step.
// We have to do it this way to avoid giving away any information
// about the value of h in the instruction timing.
limb_t mask = (~((state.t[NUM_LIMBS_128BIT] >> 2) & 1)) + 1;
limb_t mask = (~((t[NUM_LIMBS_128BIT] >> 2) & 1)) + 1;
limb_t nmask = ~mask;
for (i = 0; i < NUM_LIMBS_128BIT; ++i) {
state.h[i] = (state.h[i] & nmask) | (state.t[i] & mask);
state.h[i] = (state.h[i] & nmask) | (t[i] & mask);
}
// Add the encrypted nonce and format the final hash.
@ -271,6 +272,8 @@ void Poly1305::clear()
*/
void Poly1305::processChunk()
{
limb_t t[NUM_LIMBS_256BIT + 1];
// Compute h = ((h + c) * r) mod (2^130 - 5).
// Start with h += c. We assume that h is less than (2^130 - 5) * 6
@ -292,28 +295,28 @@ void Poly1305::processChunk()
limb_t word = state.r[0];
for (i = 0; i < NUM_LIMBS_130BIT; ++i) {
carry += ((dlimb_t)(state.h[i])) * word;
state.t[i] = (limb_t)carry;
t[i] = (limb_t)carry;
carry >>= LIMB_BITS;
}
state.t[NUM_LIMBS_130BIT] = (limb_t)carry;
t[NUM_LIMBS_130BIT] = (limb_t)carry;
for (i = 1; i < NUM_LIMBS_128BIT; ++i) {
word = state.r[i];
carry = 0;
for (j = 0; j < NUM_LIMBS_130BIT; ++j) {
carry += ((dlimb_t)(state.h[j])) * word;
carry += state.t[i + j];
state.t[i + j] = (limb_t)carry;
carry += t[i + j];
t[i + j] = (limb_t)carry;
carry >>= LIMB_BITS;
}
state.t[i + NUM_LIMBS_130BIT] = (limb_t)carry;
t[i + NUM_LIMBS_130BIT] = (limb_t)carry;
}
// Reduce h * r modulo (2^130 - 5) by multiplying the high 130 bits by 5
// and adding them to the low 130 bits. See the explanation in the
// comments for Curve25519::reduce() for a description of how this works.
carry = ((dlimb_t)(state.t[NUM_LIMBS_128BIT] >> 2)) +
(state.t[NUM_LIMBS_128BIT] & ~((limb_t)3));
state.t[NUM_LIMBS_128BIT] &= 0x0003;
carry = ((dlimb_t)(t[NUM_LIMBS_128BIT] >> 2)) +
(t[NUM_LIMBS_128BIT] & ~((limb_t)3));
t[NUM_LIMBS_128BIT] &= 0x0003;
for (i = 0; i < NUM_LIMBS_128BIT; ++i) {
// Shift the next word of t up by (LIMB_BITS - 2) bits and then
// multiply it by 5. Breaking it down, we can add the results
@ -323,14 +326,14 @@ void Poly1305::processChunk()
// fit within a dlimb_t variable. However, we can defer adding
// (word << LIMB_BITS) until after the "carry >>= LIMB_BITS" step
// because it won't affect the low bits of the carry.
word = state.t[i + NUM_LIMBS_130BIT];
word = t[i + NUM_LIMBS_130BIT];
carry += ((dlimb_t)word) << (LIMB_BITS - 2);
carry += state.t[i];
carry += t[i];
state.h[i] = (limb_t)carry;
carry >>= LIMB_BITS;
carry += word;
}
state.h[i] = (limb_t)(carry + state.t[NUM_LIMBS_128BIT]);
state.h[i] = (limb_t)(carry + t[NUM_LIMBS_128BIT]);
// At this point, h is either the answer of reducing modulo (2^130 - 5)
// or it is at most 5 subtractions away from the answer we want.

View File

@ -45,7 +45,6 @@ private:
limb_t h[(16 / sizeof(limb_t)) + 1];
limb_t c[(16 / sizeof(limb_t)) + 1];
limb_t r[(16 / sizeof(limb_t))];
limb_t t[(32 / sizeof(limb_t)) + 1];
uint8_t chunkSize;
} state;