ArduinoLibs
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Curve25519.cpp
1 /*
2  * Copyright (C) 2015 Southern Storm Software, Pty Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "Curve25519.h"
24 #include "Crypto.h"
25 #include "RNG.h"
26 #include "utility/LimbUtil.h"
27 #include <string.h>
28 
44 // Global switch to enable/disable AVR inline assembly optimizations.
45 #if defined(__AVR__)
46 // Disabled for now - there are issues with newer Arduino compilers. FIXME
47 //#define CURVE25519_ASM_AVR 1
48 #endif
49 
50 // The overhead of clean() calls in mul(), reduceQuick(), etc can
51 // add up to a lot of processing time during eval(). Only do such
52 // cleanups if strict mode has been enabled. Other implementations
53 // like curve25519-donna don't do any cleaning at all so the value
54 // of cleaning up the stack is dubious at best anyway.
55 #if defined(CURVE25519_STRICT_CLEAN)
56 #define strict_clean(x) clean(x)
57 #else
58 #define strict_clean(x) do { ; } while (0)
59 #endif
60 
80 bool Curve25519::eval(uint8_t result[32], const uint8_t s[32], const uint8_t x[32])
81 {
82  limb_t x_1[NUM_LIMBS_256BIT];
83  limb_t x_2[NUM_LIMBS_256BIT];
84  limb_t x_3[NUM_LIMBS_256BIT];
85  limb_t z_2[NUM_LIMBS_256BIT];
86  limb_t z_3[NUM_LIMBS_256BIT];
87  limb_t A[NUM_LIMBS_256BIT];
88  limb_t B[NUM_LIMBS_256BIT];
89  limb_t C[NUM_LIMBS_256BIT];
90  limb_t D[NUM_LIMBS_256BIT];
91  limb_t E[NUM_LIMBS_256BIT];
92  limb_t AA[NUM_LIMBS_256BIT];
93  limb_t BB[NUM_LIMBS_256BIT];
94  limb_t DA[NUM_LIMBS_256BIT];
95  limb_t CB[NUM_LIMBS_256BIT];
96  uint8_t mask;
97  uint8_t sposn;
98  uint8_t select;
99  uint8_t swap;
100  bool retval;
101 
102  // Unpack the "x" argument into the limb representation
103  // which also masks off the high bit. NULL means 9.
104  if (x) {
105  // x1 = x
106  BigNumberUtil::unpackLE(x_1, NUM_LIMBS_256BIT, x, 32);
107  x_1[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
108  } else {
109  memset(x_1, 0, sizeof(x_1)); // x_1 = 9
110  x_1[0] = 9;
111  }
112 
113  // Check that "x" is within the range of the modulo field.
114  // We can do this with a reduction - if there was no borrow
115  // then the value of "x" was out of range. Timing is sensitive
116  // here so that we don't reveal anything about the value of "x".
117  // If there was a reduction, then continue executing the rest
118  // of this function with the (now) in-range "x" value and
119  // report the failure at the end.
120  retval = (bool)(reduceQuick(x_1) & 0x01);
121 
122  // Initialize the other temporary variables.
123  memset(x_2, 0, sizeof(x_2)); // x_2 = 1
124  x_2[0] = 1;
125  memset(z_2, 0, sizeof(z_2)); // z_2 = 0
126  memcpy(x_3, x_1, sizeof(x_1)); // x_3 = x
127  memcpy(z_3, x_2, sizeof(x_2)); // z_3 = 1
128 
129  // Iterate over all 255 bits of "s" from the highest to the lowest.
130  // We ignore the high bit of the 256-bit representation of "s".
131  mask = 0x40;
132  sposn = 31;
133  swap = 0;
134  for (uint8_t t = 255; t > 0; --t) {
135  // Conditional swaps on entry to this bit but only if we
136  // didn't swap on the previous bit.
137  select = s[sposn] & mask;
138  swap ^= select;
139  cswap(swap, x_2, x_3);
140  cswap(swap, z_2, z_3);
141 
142  // Evaluate the curve.
143  add(A, x_2, z_2); // A = x_2 + z_2
144  square(AA, A); // AA = A^2
145  sub(B, x_2, z_2); // B = x_2 - z_2
146  square(BB, B); // BB = B^2
147  sub(E, AA, BB); // E = AA - BB
148  add(C, x_3, z_3); // C = x_3 + z_3
149  sub(D, x_3, z_3); // D = x_3 - z_3
150  mul(DA, D, A); // DA = D * A
151  mul(CB, C, B); // CB = C * B
152  add(x_3, DA, CB); // x_3 = (DA + CB)^2
153  square(x_3, x_3);
154  sub(z_3, DA, CB); // z_3 = x_1 * (DA - CB)^2
155  square(z_3, z_3);
156  mul(z_3, z_3, x_1);
157  mul(x_2, AA, BB); // x_2 = AA * BB
158  mulA24(z_2, E); // z_2 = E * (AA + a24 * E)
159  add(z_2, z_2, AA);
160  mul(z_2, z_2, E);
161 
162  // Move onto the next lower bit of "s".
163  mask >>= 1;
164  if (!mask) {
165  --sposn;
166  mask = 0x80;
167  swap = select << 7;
168  } else {
169  swap = select >> 1;
170  }
171  }
172 
173  // Final conditional swaps.
174  cswap(swap, x_2, x_3);
175  cswap(swap, z_2, z_3);
176 
177  // Compute x_2 * (z_2 ^ (p - 2)) where p = 2^255 - 19.
178  recip(z_3, z_2);
179  mul(x_2, x_2, z_3);
180 
181  // Pack the result into the return array.
182  BigNumberUtil::packLE(result, 32, x_2, NUM_LIMBS_256BIT);
183 
184  // Clean up and exit.
185  clean(x_1);
186  clean(x_2);
187  clean(x_3);
188  clean(z_2);
189  clean(z_3);
190  clean(A);
191  clean(B);
192  clean(C);
193  clean(D);
194  clean(E);
195  clean(AA);
196  clean(BB);
197  clean(DA);
198  clean(CB);
199  return retval;
200 }
201 
245 void Curve25519::dh1(uint8_t k[32], uint8_t f[32])
246 {
247  do {
248  // Generate a random "f" value and then adjust the value to make
249  // it valid as an "s" value for eval(). According to the specification
250  // we need to mask off the 3 right-most bits of f[0], mask off the
251  // left-most bit of f[31], and set the second to left-most bit of f[31].
252  RNG.rand(f, 32);
253  f[0] &= 0xF8;
254  f[31] = (f[31] & 0x7F) | 0x40;
255 
256  // Evaluate the curve function: k = Curve25519::eval(f, 9).
257  // We pass NULL to eval() to indicate the value 9. There is no
258  // need to check the return value from eval() because we know
259  // that 9 is a valid field element.
260  eval(k, f, 0);
261 
262  // If "k" is weak for contributory behaviour then reject it,
263  // generate another "f" value, and try again. This case is
264  // highly unlikely but we still perform the check just in case.
265  } while (isWeakPoint(k));
266 }
267 
283 bool Curve25519::dh2(uint8_t k[32], uint8_t f[32])
284 {
285  uint8_t weak;
286 
287  // Evaluate the curve function: k = Curve25519::eval(f, k).
288  // If "k" is weak for contributory behaviour before or after
289  // the curve evaluation, then fail the exchange. For safety
290  // we perform every phase of the weak checks even if we could
291  // bail out earlier so that the execution takes the same
292  // amount of time for weak and non-weak "k" values.
293  weak = isWeakPoint(k); // Is "k" weak before?
294  weak |= ((eval(k, f, k) ^ 0x01) & 0x01); // Is "k" weak during?
295  weak |= isWeakPoint(k); // Is "k" weak after?
296  clean(f, 32);
297  return (bool)((weak ^ 0x01) & 0x01);
298 }
299 
307 uint8_t Curve25519::isWeakPoint(const uint8_t k[32])
308 {
309  // List of weak points from http://cr.yp.to/ecdh.html
310  // That page lists some others but they are variants on these
311  // of the form "point + i * (2^255 - 19)" for i = 0, 1, 2.
312  // Here we mask off the high bit and eval() catches the rest.
313  static const uint8_t points[5][32] PROGMEM = {
314  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
315  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
316  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
317  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
318  {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
319  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
320  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
321  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
322  {0xE0, 0xEB, 0x7A, 0x7C, 0x3B, 0x41, 0xB8, 0xAE,
323  0x16, 0x56, 0xE3, 0xFA, 0xF1, 0x9F, 0xC4, 0x6A,
324  0xDA, 0x09, 0x8D, 0xEB, 0x9C, 0x32, 0xB1, 0xFD,
325  0x86, 0x62, 0x05, 0x16, 0x5F, 0x49, 0xB8, 0x00},
326  {0x5F, 0x9C, 0x95, 0xBC, 0xA3, 0x50, 0x8C, 0x24,
327  0xB1, 0xD0, 0xB1, 0x55, 0x9C, 0x83, 0xEF, 0x5B,
328  0x04, 0x44, 0x5C, 0xC4, 0x58, 0x1C, 0x8E, 0x86,
329  0xD8, 0x22, 0x4E, 0xDD, 0xD0, 0x9F, 0x11, 0x57},
330  {0xEC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
331  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
332  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
333  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}
334  };
335 
336  // Check each of the weak points in turn. We perform the
337  // comparisons carefully so as not to reveal the value of "k"
338  // in the instruction timing. If "k" is indeed weak then
339  // we still check everything so as not to reveal which
340  // weak point it is.
341  uint8_t result = 0;
342  for (uint8_t posn = 0; posn < 5; ++posn) {
343  const uint8_t *point = points[posn];
344  uint8_t check = (pgm_read_byte(point + 31) ^ k[31]) & 0x7F;
345  for (uint8_t index = 31; index > 0; --index)
346  check |= (pgm_read_byte(point + index - 1) ^ k[index - 1]);
347  result |= (uint8_t)((((uint16_t)0x0100) - check) >> 8);
348  }
349 
350  // The "result" variable will be non-zero if there was a match.
351  return result;
352 }
353 
366 void Curve25519::reduce(limb_t *result, limb_t *x, uint8_t size)
367 {
368  /*
369  Note: This explaination is best viewed with a UTF-8 text viewer.
370 
371  To help explain what this function is doing, the following describes
372  how to efficiently compute reductions modulo a base of the form (2ⁿ - b)
373  where b is greater than zero and (b + 1)² <= 2ⁿ.
374 
375  Here we are interested in reducing the result of multiplying two
376  numbers that are less than or equal to (2ⁿ - b - 1). That is,
377  multiplying numbers that have already been reduced.
378 
379  Given some x less than or equal to (2ⁿ - b - 1)², we want to find a
380  y less than (2ⁿ - b) such that:
381 
382  y ≡ x mod (2ⁿ - b)
383 
384  We know that for all integer values of k >= 0:
385 
386  y ≡ x - k * (2ⁿ - b)
387  ≡ x - k * 2ⁿ + k * b
388 
389  In our case we choose k = ⌊x / 2ⁿ⌋ and then let:
390 
391  w = (x mod 2ⁿ) + ⌊x / 2ⁿ⌋ * b
392 
393  The value w will either be the answer y or y can be obtained by
394  repeatedly subtracting (2ⁿ - b) from w until it is less than (2ⁿ - b).
395  At most b subtractions will be required.
396 
397  In our case b is 19 which is more subtractions than we would like to do,
398  but we can handle that by performing the above reduction twice and then
399  performing a single trial subtraction:
400 
401  w = (x mod 2ⁿ) + ⌊x / 2ⁿ⌋ * b
402  y = (w mod 2ⁿ) + ⌊w / 2ⁿ⌋ * b
403  if y >= (2ⁿ - b)
404  y -= (2ⁿ - b)
405 
406  The value y is the answer we want for reducing x modulo (2ⁿ - b).
407  */
408 
409 #if !defined(CURVE25519_ASM_AVR)
410  dlimb_t carry;
411  uint8_t posn;
412 
413  // Calculate (x mod 2^255) + ((x / 2^255) * 19) which will
414  // either produce the answer we want or it will produce a
415  // value of the form "answer + j * (2^255 - 19)".
416  carry = ((dlimb_t)(x[NUM_LIMBS_256BIT - 1] >> (LIMB_BITS - 1))) * 19U;
417  x[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
418  for (posn = 0; posn < size; ++posn) {
419  carry += ((dlimb_t)(x[posn + NUM_LIMBS_256BIT])) * 38U;
420  carry += x[posn];
421  x[posn] = (limb_t)carry;
422  carry >>= LIMB_BITS;
423  }
424  if (size < NUM_LIMBS_256BIT) {
425  // The high order half of the number is short; e.g. for mulA24().
426  // Propagate the carry through the rest of the low order part.
427  for (posn = size; posn < NUM_LIMBS_256BIT; ++posn) {
428  carry += x[posn];
429  x[posn] = (limb_t)carry;
430  carry >>= LIMB_BITS;
431  }
432  }
433 
434  // The "j" value may still be too large due to the final carry-out.
435  // We must repeat the reduction. If we already have the answer,
436  // then this won't do any harm but we must still do the calculation
437  // to preserve the overall timing.
438  carry *= 38U;
439  carry += ((dlimb_t)(x[NUM_LIMBS_256BIT - 1] >> (LIMB_BITS - 1))) * 19U;
440  x[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
441  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
442  carry += x[posn];
443  x[posn] = (limb_t)carry;
444  carry >>= LIMB_BITS;
445  }
446 
447  // At this point "x" will either be the answer or it will be the
448  // answer plus (2^255 - 19). Perform a trial subtraction which
449  // is equivalent to adding 19 and subtracting 2^255. We put the
450  // trial answer into the top-most limbs of the original "x" array.
451  // We add 19 here; the subtraction of 2^255 occurs in the next step.
452  carry = 19U;
453  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
454  carry += x[posn];
455  x[posn + NUM_LIMBS_256BIT] = (limb_t)carry;
456  carry >>= LIMB_BITS;
457  }
458 
459  // If there was a borrow, then the bottom-most limbs of "x" are the
460  // correct answer. If there was no borrow, then the top-most limbs
461  // of "x" are the correct answer. Select the correct answer but do
462  // it in a way that instruction timing will not reveal which value
463  // was selected. Borrow will occur if the high bit of the previous
464  // result is 0: turn the high bit into a selection mask.
465  limb_t mask = (limb_t)(((slimb_t)(x[NUM_LIMBS_512BIT - 1])) >> (LIMB_BITS - 1));
466  limb_t nmask = ~mask;
467  x[NUM_LIMBS_512BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
468  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
469  result[posn] = (x[posn] & nmask) | (x[posn + NUM_LIMBS_256BIT] & mask);
470  }
471 #else
472  __asm__ __volatile__ (
473  // Calculate (x mod 2^255) + ((x / 2^255) * 19) which will
474  // either produce the answer we want or it will produce a
475  // value of the form "answer + j * (2^255 - 19)".
476  "ldd r24,Z+31\n" // Extract the high bit of x[31]
477  "mov r25,r24\n" // and mask it off
478  "andi r25,0x7F\n"
479  "std Z+31,r25\n"
480  "lsl r24\n" // carry = high bit * 19
481  "mov r24,__zero_reg__\n"
482  "sbc r24,__zero_reg__\n"
483  "andi r24,19\n"
484 
485  "mov r25,%1\n" // load "size" into r25
486  "ldi r23,38\n" // r23 = 38
487  "mov r22,__zero_reg__\n" // r22 = 0 (we're about to destroy r1)
488  "1:\n"
489  "ld r16,Z\n" // r16 = x[0]
490  "ldd r17,Z+32\n" // r17 = x[32]
491  "mul r17,r23\n" // r0:r1 = r17 * 38
492  "add r0,r24\n" // r0:r1 += carry
493  "adc r1,r22\n"
494  "add r0,r16\n" // r0:r1 += r16
495  "adc r1,r22\n"
496  "st Z+,r0\n" // *x++ = r0
497  "mov r24,r1\n" // carry = r1
498  "dec r25\n" // if (--r25 != 0) loop
499  "brne 1b\n"
500 
501  // If the size is short, then we need to continue propagating carries.
502  "ldi r25,32\n"
503  "cp %1,r25\n"
504  "breq 3f\n"
505  "sub r25,%1\n"
506  "ld __tmp_reg__,Z\n"
507  "add __tmp_reg__,r24\n"
508  "st Z+,__tmp_reg__\n"
509  "dec r25\n"
510  "2:\n"
511  "ld __tmp_reg__,Z\n" // *x++ += carry
512  "adc __tmp_reg__,r22\n"
513  "st Z+,__tmp_reg__\n"
514  "dec r25\n"
515  "brne 2b\n"
516  "mov r24,r22\n" // put the carry back into r24
517  "adc r24,r22\n"
518  "3:\n"
519  "sbiw r30,32\n" // Point Z back to the start of "x"
520 
521  // The "j" value may still be too large due to the final carry-out.
522  // We must repeat the reduction. If we already have the answer,
523  // then this won't do any harm but we must still do the calculation
524  // to preserve the overall timing.
525  "mul r24,r23\n" // carry *= 38
526  "ldd r24,Z+31\n" // Extract the high bit of x[31]
527  "mov r25,r24\n" // and mask it off
528  "andi r25,0x7F\n"
529  "std Z+31,r25\n"
530  "lsl r24\n" // carry += high bit * 19
531  "mov r24,r22\n"
532  "sbc r24,r22\n"
533  "andi r24,19\n"
534  "add r0,r24\n"
535  "adc r1,r22\n" // 9-bit carry is now in r0:r1
536 
537  // Propagate the carry through the rest of x.
538  "ld r24,Z\n" // x[0]
539  "add r0,r24\n"
540  "adc r1,r22\n"
541  "st Z+,r0\n"
542  "ld r24,Z\n" // x[1]
543  "add r1,r24\n"
544  "st Z+,r1\n"
545  "ldi r25,30\n" // x[2..31]
546  "4:\n"
547  "ld r24,Z\n"
548  "adc r24,r22\n"
549  "st Z+,r24\n"
550  "dec r25\n"
551  "brne 4b\n"
552  "sbiw r30,32\n" // Point Z back to the start of "x"
553 
554  // We destroyed __zero_reg__ (r1) above, so restore its zero value.
555  "mov __zero_reg__,r22\n"
556 
557  // At this point "x" will either be the answer or it will be the
558  // answer plus (2^255 - 19). Perform a trial subtraction which
559  // is equivalent to adding 19 and subtracting 2^255. We put the
560  // trial answer into the top-most limbs of the original "x" array.
561  // We add 19 here; the subtraction of 2^255 occurs in the next step.
562  "ldi r24,8\n" // Loop counter.
563  "ldi r25,19\n" // carry = 19
564  "5:\n"
565  "ld r16,Z+\n" // r16:r19:carry = *xx++ + carry
566  "ld r17,Z+\n"
567  "ld r18,Z+\n"
568  "ld r19,Z+\n"
569  "add r16,r25\n" // r16:r19:carry += carry
570  "adc r17,__zero_reg__\n"
571  "adc r18,__zero_reg__\n"
572  "adc r19,__zero_reg__\n"
573  "mov r25,__zero_reg__\n"
574  "adc r25,r25\n"
575  "std Z+28,r16\n" // *tt++ = r16:r19
576  "std Z+29,r17\n"
577  "std Z+30,r18\n"
578  "std Z+31,r19\n"
579  "dec r24\n"
580  "brne 5b\n"
581 
582  // Subtract 2^255 from x[32..63] which is equivalent to extracting
583  // the top bit and then masking it off. If the top bit is zero
584  // then a borrow has occurred and this isn't the answer we want.
585  "mov r25,r19\n"
586  "andi r19,0x7F\n"
587  "std Z+31,r19\n"
588  "lsl r25\n"
589  "mov r25,__zero_reg__\n"
590  "sbc r25,__zero_reg__\n"
591 
592  // At this point, r25 is 0 if the original x[0..31] is the answer
593  // we want, or 0xFF if x[32..63] is the answer we want. Essentially
594  // we need to do a conditional move of either x[0..31] or x[32..63]
595  // into "result".
596  "sbiw r30,32\n" // Point Z back to x[0].
597  "ldi r24,8\n"
598  "6:\n"
599  "ldd r16,Z+32\n"
600  "ldd r17,Z+33\n"
601  "ldd r18,Z+34\n"
602  "ldd r19,Z+35\n"
603  "ld r20,Z+\n"
604  "ld r21,Z+\n"
605  "ld r22,Z+\n"
606  "ld r23,Z+\n"
607  "eor r16,r20\n"
608  "eor r17,r21\n"
609  "eor r18,r22\n"
610  "eor r19,r23\n"
611  "and r16,r25\n"
612  "and r17,r25\n"
613  "and r18,r25\n"
614  "and r19,r25\n"
615  "eor r20,r16\n"
616  "eor r21,r17\n"
617  "eor r22,r18\n"
618  "eor r23,r19\n"
619  "st X+,r20\n"
620  "st X+,r21\n"
621  "st X+,r22\n"
622  "st X+,r23\n"
623  "dec r24\n"
624  "brne 6b\n"
625 
626  : : "z"(x), "r"((uint8_t)(size * sizeof(limb_t))), "x"(result)
627  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
628  "r24", "r25"
629  );
630 #endif
631 }
632 
646 limb_t Curve25519::reduceQuick(limb_t *x)
647 {
648 #if !defined(CURVE25519_ASM_AVR)
649  limb_t temp[NUM_LIMBS_256BIT];
650  dlimb_t carry;
651  uint8_t posn;
652  limb_t *xx;
653  limb_t *tt;
654 
655  // Perform a trial subtraction of (2^255 - 19) from "x" which is
656  // equivalent to adding 19 and subtracting 2^255. We add 19 here;
657  // the subtraction of 2^255 occurs in the next step.
658  carry = 19U;
659  xx = x;
660  tt = temp;
661  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
662  carry += *xx++;
663  *tt++ = (limb_t)carry;
664  carry >>= LIMB_BITS;
665  }
666 
667  // If there was a borrow, then the original "x" is the correct answer.
668  // If there was no borrow, then "temp" is the correct answer. Select the
669  // correct answer but do it in a way that instruction timing will not
670  // reveal which value was selected. Borrow will occur if the high bit
671  // of "temp" is 0: turn the high bit into a selection mask.
672  limb_t mask = (limb_t)(((slimb_t)(temp[NUM_LIMBS_256BIT - 1])) >> (LIMB_BITS - 1));
673  limb_t nmask = ~mask;
674  temp[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
675  xx = x;
676  tt = temp;
677  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
678  *xx = ((*xx) & nmask) | ((*tt++) & mask);
679  ++xx;
680  }
681 
682  // Clean up "temp".
683  strict_clean(temp);
684 
685  // Return a zero value if we actually subtracted (2^255 - 19) from "x".
686  return nmask;
687 #else // CURVE25519_ASM_AVR
688  limb_t temp[NUM_LIMBS_256BIT];
689  uint8_t result;
690  __asm__ __volatile__ (
691  // Subtract (2^255 - 19) from "x", which is the same as adding 19
692  // and then subtracting 2^255.
693  "ldi r24,8\n" // Loop counter.
694  "ldi r25,19\n" // carry = 19
695  "1:\n"
696  "ld r16,Z+\n" // r16:r19:carry = *xx++ + carry
697  "ld r17,Z+\n"
698  "ld r18,Z+\n"
699  "ld r19,Z+\n"
700  "add r16,r25\n" // r16:r19:carry += carry
701  "adc r17,__zero_reg__\n"
702  "adc r18,__zero_reg__\n"
703  "adc r19,__zero_reg__\n"
704  "mov r25,__zero_reg__\n"
705  "adc r25,r25\n"
706  "st X+,r16\n" // *tt++ = r16:r19
707  "st X+,r17\n"
708  "st X+,r18\n"
709  "st X+,r19\n"
710  "dec r24\n"
711  "brne 1b\n"
712 
713  // Subtract 2^255 from "temp" which is equivalent to extracting
714  // the top bit and then masking it off. If the top bit is zero
715  // then a borrow has occurred and this isn't the answer we want.
716  "mov r25,r19\n"
717  "andi r19,0x7F\n"
718  "st -X,r19\n"
719  "lsl r25\n"
720  "mov r25,__zero_reg__\n"
721  "sbc r25,__zero_reg__\n"
722 
723  // At this point, r25 is 0 if the original "x" is the answer
724  // we want, or 0xFF if "temp" is the answer we want. Essentially
725  // we need to do a conditional move of "temp" into "x".
726  "sbiw r26,31\n" // Point X back to the start of "temp".
727  "sbiw r30,32\n" // Point Z back to the start of "x".
728  "ldi r24,8\n"
729  "2:\n"
730  "ld r16,X+\n"
731  "ld r17,X+\n"
732  "ld r18,X+\n"
733  "ld r19,X+\n"
734  "ld r20,Z\n"
735  "ldd r21,Z+1\n"
736  "ldd r22,Z+2\n"
737  "ldd r23,Z+3\n"
738  "eor r16,r20\n"
739  "eor r17,r21\n"
740  "eor r18,r22\n"
741  "eor r19,r23\n"
742  "and r16,r25\n"
743  "and r17,r25\n"
744  "and r18,r25\n"
745  "and r19,r25\n"
746  "eor r20,r16\n"
747  "eor r21,r17\n"
748  "eor r22,r18\n"
749  "eor r23,r19\n"
750  "st Z+,r20\n"
751  "st Z+,r21\n"
752  "st Z+,r22\n"
753  "st Z+,r23\n"
754  "dec r24\n"
755  "brne 2b\n"
756  "mov %0,r25\n"
757  : "=r"(result)
758  : "x"(temp), "z"(x)
759  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
760  "r24", "r25"
761  );
762  strict_clean(temp);
763  return result;
764 #endif // CURVE25519_ASM_AVR
765 }
766 
779 void Curve25519::mulNoReduce(limb_t *result, const limb_t *x, const limb_t *y)
780 {
781 #if !defined(CURVE25519_ASM_AVR)
782  uint8_t i, j;
783  dlimb_t carry;
784  limb_t word;
785  const limb_t *yy;
786  limb_t *rr;
787 
788  // Multiply the lowest word of x by y.
789  carry = 0;
790  word = x[0];
791  yy = y;
792  rr = result;
793  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
794  carry += ((dlimb_t)(*yy++)) * word;
795  *rr++ = (limb_t)carry;
796  carry >>= LIMB_BITS;
797  }
798  *rr = (limb_t)carry;
799 
800  // Multiply and add the remaining words of x by y.
801  for (i = 1; i < NUM_LIMBS_256BIT; ++i) {
802  word = x[i];
803  carry = 0;
804  yy = y;
805  rr = result + i;
806  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
807  carry += ((dlimb_t)(*yy++)) * word;
808  carry += *rr;
809  *rr++ = (limb_t)carry;
810  carry >>= LIMB_BITS;
811  }
812  *rr = (limb_t)carry;
813  }
814 #else
815  __asm__ __volatile__ (
816  // Save Y and copy the "result" pointer into it.
817  "push r28\n"
818  "push r29\n"
819  "mov r28,%A2\n"
820  "mov r29,%B2\n"
821 
822  // Multiply the first byte of "x" by y[0..31].
823  "ldi r25,8\n" // loop 8 times: 4 bytes of y each time
824  "clr r24\n" // carry = 0
825  "clr r22\n" // r22 = 0 to replace __zero_reg__
826  "ld r23,X+\n" // r23 = *x++
827  "1:\n"
828  "ld r16,Z\n" // r16 = y[0]
829  "mul r16,r23\n" // r8:r9 = y[0] * r23
830  "movw r8,r0\n"
831  "ldd r16,Z+2\n" // r16 = y[2]
832  "mul r16,r23\n" // r10:r11 = y[2] * r23
833  "movw r10,r0\n"
834  "ldd r16,Z+1\n" // r16 = y[1]
835  "mul r16,r23\n" // r9:r10:r11 += y[1] * r23
836  "add r9,r0\n"
837  "adc r10,r1\n"
838  "adc r11,r22\n"
839  "ldd r16,Z+3\n" // r16 = y[3]
840  "mul r16,r23\n" // r11:r1 += y[3] * r23
841  "add r11,r0\n"
842  "adc r1,r22\n"
843  "add r8,r24\n" // r8:r9:r10:r11:r1 += carry
844  "adc r9,r22\n"
845  "adc r10,r22\n"
846  "adc r11,r22\n"
847  "adc r1,r22\n"
848  "mov r24,r1\n" // carry = r1
849  "st Y+,r8\n" // *rr++ = r8:r9:r10:r11
850  "st Y+,r9\n"
851  "st Y+,r10\n"
852  "st Y+,r11\n"
853  "adiw r30,4\n"
854  "dec r25\n"
855  "brne 1b\n"
856  "st Y+,r24\n" // *rr++ = carry
857  "sbiw r28,32\n" // rr -= 32
858  "sbiw r30,32\n" // Point Z back to the start of y
859 
860  // Multiply and add the remaining bytes of "x" by y[0..31].
861  "ldi r21,31\n" // 31 more bytes of x to go.
862  "2:\n"
863  "ldi r25,8\n" // loop 8 times: 4 bytes of y each time
864  "clr r24\n" // carry = 0
865  "ld r23,X+\n" // r23 = *x++
866  "3:\n"
867  "ld r16,Z\n" // r16 = y[0]
868  "mul r16,r23\n" // r8:r9 = y[0] * r23
869  "movw r8,r0\n"
870  "ldd r16,Z+2\n" // r16 = y[2]
871  "mul r16,r23\n" // r10:r11 = y[2] * r23
872  "movw r10,r0\n"
873  "ldd r16,Z+1\n" // r16 = y[1]
874  "mul r16,r23\n" // r9:r10:r11 += y[1] * r23
875  "add r9,r0\n"
876  "adc r10,r1\n"
877  "adc r11,r22\n"
878  "ldd r16,Z+3\n" // r16 = y[3]
879  "mul r16,r23\n" // r11:r1 += y[3] * r23
880  "add r11,r0\n"
881  "adc r1,r22\n"
882  "add r8,r24\n" // r8:r9:r10:r11:r1 += carry
883  "adc r9,r22\n"
884  "adc r10,r22\n"
885  "adc r11,r22\n"
886  "adc r1,r22\n"
887  "ld r16,Y\n" // r8:r9:r10:r11:r1 += rr[0..3]
888  "add r8,r16\n"
889  "ldd r16,Y+1\n"
890  "adc r9,r16\n"
891  "ldd r16,Y+2\n"
892  "adc r10,r16\n"
893  "ldd r16,Y+3\n"
894  "adc r11,r16\n"
895  "adc r1,r22\n"
896  "mov r24,r1\n" // carry = r1
897  "st Y+,r8\n" // *rr++ = r8:r9:r10:r11
898  "st Y+,r9\n"
899  "st Y+,r10\n"
900  "st Y+,r11\n"
901  "adiw r30,4\n"
902  "dec r25\n"
903  "brne 3b\n"
904  "st Y+,r24\n" // *r++ = carry
905  "sbiw r28,32\n" // rr -= 32
906  "sbiw r30,32\n" // Point Z back to the start of y
907  "dec r21\n"
908  "brne 2b\n"
909 
910  // Restore Y and __zero_reg__.
911  "pop r29\n"
912  "pop r28\n"
913  "clr __zero_reg__\n"
914  : : "x"(x), "z"(y), "r"(result)
915  : "r8", "r9", "r10", "r11", "r16", "r20", "r21", "r22",
916  "r23", "r24", "r25"
917  );
918 #endif
919 }
920 
931 void Curve25519::mul(limb_t *result, const limb_t *x, const limb_t *y)
932 {
933  limb_t temp[NUM_LIMBS_512BIT];
934  mulNoReduce(temp, x, y);
935  reduce(result, temp, NUM_LIMBS_256BIT);
936  strict_clean(temp);
937 }
938 
958 void Curve25519::mulA24(limb_t *result, const limb_t *x)
959 {
960 #if !defined(CURVE25519_ASM_AVR)
961  // The constant a24 = 121665 (0x1DB41) as a limb array.
962 #if BIGNUMBER_LIMB_8BIT
963  static limb_t const a24[3] PROGMEM = {0x41, 0xDB, 0x01};
964 #elif BIGNUMBER_LIMB_16BIT
965  static limb_t const a24[2] PROGMEM = {0xDB41, 0x0001};
966 #elif BIGNUMBER_LIMB_32BIT || BIGNUMBER_LIMB_64BIT
967  static limb_t const a24[1] PROGMEM = {0x0001DB41};
968 #else
969  #error "limb_t must be 8, 16, 32, or 64 bits in size"
970 #endif
971  #define NUM_A24_LIMBS (sizeof(a24) / sizeof(limb_t))
972 
973  // Multiply the lowest limb of a24 by x and zero-extend into the result.
974  limb_t temp[NUM_LIMBS_512BIT];
975  uint8_t i, j;
976  dlimb_t carry = 0;
977  limb_t word = pgm_read_limb(&(a24[0]));
978  const limb_t *xx = x;
979  limb_t *tt = temp;
980  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
981  carry += ((dlimb_t)(*xx++)) * word;
982  *tt++ = (limb_t)carry;
983  carry >>= LIMB_BITS;
984  }
985  *tt = (limb_t)carry;
986 
987  // Multiply and add the remaining limbs of a24.
988  for (i = 1; i < NUM_A24_LIMBS; ++i) {
989  word = pgm_read_limb(&(a24[i]));
990  carry = 0;
991  xx = x;
992  tt = temp + i;
993  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
994  carry += ((dlimb_t)(*xx++)) * word;
995  carry += *tt;
996  *tt++ = (limb_t)carry;
997  carry >>= LIMB_BITS;
998  }
999  *tt = (limb_t)carry;
1000  }
1001 #else
1002  limb_t temp[NUM_LIMBS_512BIT];
1003  #define NUM_A24_LIMBS ((3 + sizeof(limb_t) - 1) / sizeof(limb_t))
1004  __asm__ __volatile__ (
1005  // Load the two low bytes of a24 into r16 and r17.
1006  // The third byte is 0x01 which we can deal with implicitly.
1007  "ldi r16,0x41\n"
1008  "ldi r17,0xDB\n"
1009 
1010  // Iterate over the bytes of "x" and multiply each with a24.
1011  "ldi r25,32\n" // 32 bytes in "x"
1012  "clr r22\n" // r22 = 0
1013  "clr r18\n" // r18:r19:r11 = 0 (carry)
1014  "clr r19\n"
1015  "clr r11\n"
1016  "1:\n"
1017  "ld r21,X+\n" // r21 = *x++
1018  "mul r21,r16\n" // r8:r9 = r21 * a24[0]
1019  "movw r8,r0\n"
1020  "mul r21,r17\n" // r9:r1 += r21 * a24[1]
1021  "add r9,r0\n"
1022  "adc r1,r21\n" // r1:r10 += r21 * a24[2] (implicitly 1)
1023  "mov r10,r22\n"
1024  "adc r10,r22\n"
1025  "add r8,r18\n" // r8:r9:r1:r10 += carry
1026  "adc r9,r19\n"
1027  "adc r1,r11\n"
1028  "adc r10,r22\n"
1029  "st Z+,r8\n" // *tt++ = r8
1030  "mov r18,r9\n" // carry = r9:r1:r10
1031  "mov r19,r1\n"
1032  "mov r11,r10\n"
1033  "dec r25\n"
1034  "brne 1b\n"
1035  "st Z,r18\n" // *tt = carry
1036  "std Z+1,r19\n"
1037  "std Z+2,r11\n"
1038 #if BIGNUMBER_LIMB_16BIT || BIGNUMBER_LIMB_32BIT
1039  "std Z+3,r22\n" // Zero pad to a limb boundary
1040 #endif
1041 
1042  // Restore __zero_reg__
1043  "clr __zero_reg__\n"
1044 
1045  : : "x"(x), "z"(temp)
1046  : "r8", "r9", "r10", "r11", "r16", "r17", "r18", "r19",
1047  "r20", "r21", "r22", "r25"
1048  );
1049 #endif
1050 
1051  // Reduce the intermediate result modulo 2^255 - 19.
1052  reduce(result, temp, NUM_A24_LIMBS);
1053  strict_clean(temp);
1054 }
1055 
1067 void Curve25519::mul_P(limb_t *result, const limb_t *x, const limb_t *y)
1068 {
1069  limb_t temp[NUM_LIMBS_512BIT];
1070  uint8_t i, j;
1071  dlimb_t carry;
1072  limb_t word;
1073  const limb_t *xx;
1074  limb_t *tt;
1075 
1076  // Multiply the lowest word of y by x.
1077  carry = 0;
1078  word = pgm_read_limb(&(y[0]));
1079  xx = x;
1080  tt = temp;
1081  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
1082  carry += ((dlimb_t)(*xx++)) * word;
1083  *tt++ = (limb_t)carry;
1084  carry >>= LIMB_BITS;
1085  }
1086  *tt = (limb_t)carry;
1087 
1088  // Multiply and add the remaining words of y by x.
1089  for (i = 1; i < NUM_LIMBS_256BIT; ++i) {
1090  word = pgm_read_limb(&(y[i]));
1091  carry = 0;
1092  xx = x;
1093  tt = temp + i;
1094  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
1095  carry += ((dlimb_t)(*xx++)) * word;
1096  carry += *tt;
1097  *tt++ = (limb_t)carry;
1098  carry >>= LIMB_BITS;
1099  }
1100  *tt = (limb_t)carry;
1101  }
1102 
1103  // Reduce the intermediate result modulo 2^255 - 19.
1104  reduce(result, temp, NUM_LIMBS_256BIT);
1105  strict_clean(temp);
1106 }
1107 
1118 void Curve25519::add(limb_t *result, const limb_t *x, const limb_t *y)
1119 {
1120 #if !defined(CURVE25519_ASM_AVR)
1121  dlimb_t carry = 0;
1122  uint8_t posn;
1123  limb_t *rr = result;
1124 
1125  // Add the two arrays to obtain the intermediate result.
1126  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1127  carry += *x++;
1128  carry += *y++;
1129  *rr++ = (limb_t)carry;
1130  carry >>= LIMB_BITS;
1131  }
1132 #else // CURVE25519_ASM_AVR
1133  __asm__ __volatile__ (
1134  // Save Y and copy the "result" pointer into it.
1135  "push r28\n"
1136  "push r29\n"
1137  "mov r28,%A2\n"
1138  "mov r29,%B2\n"
1139 
1140  // Unroll the loop to operate on 4 bytes at a time (8 iterations).
1141  "ldi r24,8\n" // Loop counter.
1142  "clr r25\n" // carry = 0
1143  "1:\n"
1144  "ld r16,X+\n" // r16:r19 = *x++
1145  "ld r17,X+\n"
1146  "ld r18,X+\n"
1147  "ld r19,X+\n"
1148  "ld r20,Z+\n" // r20:r23 = *y++
1149  "ld r21,Z+\n"
1150  "ld r22,Z+\n"
1151  "ld r23,Z+\n"
1152  "add r16,r25\n" // r16:r19:carry += carry
1153  "adc r17,__zero_reg__\n"
1154  "adc r18,__zero_reg__\n"
1155  "adc r19,__zero_reg__\n"
1156  "mov r25,__zero_reg__\n"
1157  "adc r25,r25\n"
1158  "add r16,r20\n" // r16:r19:carry += r20:r23
1159  "adc r17,r21\n"
1160  "adc r18,r22\n"
1161  "adc r19,r23\n"
1162  "adc r25,__zero_reg__\n"
1163  "st Y+,r16\n" // *rr++ = r16:r23
1164  "st Y+,r17\n"
1165  "st Y+,r18\n"
1166  "st Y+,r19\n"
1167  "dec r24\n"
1168  "brne 1b\n"
1169 
1170  // Restore Y.
1171  "pop r29\n"
1172  "pop r28\n"
1173  : : "x"(x), "z"(y), "r"(result)
1174  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1175  "r24", "r25"
1176  );
1177 #endif // CURVE25519_ASM_AVR
1178 
1179  // Reduce the result using the quick trial subtraction method.
1180  reduceQuick(result);
1181 }
1182 
1193 void Curve25519::sub(limb_t *result, const limb_t *x, const limb_t *y)
1194 {
1195 #if !defined(CURVE25519_ASM_AVR)
1196  dlimb_t borrow;
1197  uint8_t posn;
1198  limb_t *rr = result;
1199 
1200  // Subtract y from x to generate the intermediate result.
1201  borrow = 0;
1202  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1203  borrow = ((dlimb_t)(*x++)) - (*y++) - ((borrow >> LIMB_BITS) & 0x01);
1204  *rr++ = (limb_t)borrow;
1205  }
1206 
1207  // If we had a borrow, then the result has gone negative and we
1208  // have to add 2^255 - 19 to the result to make it positive again.
1209  // The top bits of "borrow" will be all 1's if there is a borrow
1210  // or it will be all 0's if there was no borrow. Easiest is to
1211  // conditionally subtract 19 and then mask off the high bit.
1212  rr = result;
1213  borrow = (borrow >> LIMB_BITS) & 19U;
1214  borrow = ((dlimb_t)(*rr)) - borrow;
1215  *rr++ = (limb_t)borrow;
1216  for (posn = 1; posn < NUM_LIMBS_256BIT; ++posn) {
1217  borrow = ((dlimb_t)(*rr)) - ((borrow >> LIMB_BITS) & 0x01);
1218  *rr++ = (limb_t)borrow;
1219  }
1220  *(--rr) &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
1221 #else // CURVE25519_ASM_AVR
1222  __asm__ __volatile__ (
1223  // Save Y and copy the "result" pointer into it.
1224  "push r28\n"
1225  "push r29\n"
1226  "mov r28,%A2\n"
1227  "mov r29,%B2\n"
1228 
1229  // Unroll the sub loop to operate on 4 bytes at a time (8 iterations).
1230  "ldi r24,8\n" // Loop counter.
1231  "clr r25\n" // borrow = 0
1232  "1:\n"
1233  "ld r16,X+\n" // r16:r19 = *x++
1234  "ld r17,X+\n"
1235  "ld r18,X+\n"
1236  "ld r19,X+\n"
1237  "ld r20,Z+\n" // r20:r23 = *y++
1238  "ld r21,Z+\n"
1239  "ld r22,Z+\n"
1240  "ld r23,Z+\n"
1241  "sub r16,r25\n" // r16:r19:borrow -= borrow
1242  "sbc r17,__zero_reg__\n"
1243  "sbc r18,__zero_reg__\n"
1244  "sbc r19,__zero_reg__\n"
1245  "mov r25,__zero_reg__\n"
1246  "sbc r25,__zero_reg__\n"
1247  "sub r16,r20\n" // r16:r19:borrow -= r20:r23
1248  "sbc r17,r21\n"
1249  "sbc r18,r22\n"
1250  "sbc r19,r23\n"
1251  "sbc r25,__zero_reg__\n"
1252  "st Y+,r16\n" // *rr++ = r16:r23
1253  "st Y+,r17\n"
1254  "st Y+,r18\n"
1255  "st Y+,r19\n"
1256  "andi r25,1\n" // Only need the bottom bit of the borrow
1257  "dec r24\n"
1258  "brne 1b\n"
1259 
1260  // If there was a borrow, then we need to add 2^255 - 19 back.
1261  // We conditionally subtract 19 and then mask off the high bit.
1262  "neg r25\n" // borrow = mask(borrow) & 19
1263  "andi r25,19\n"
1264  "sbiw r28,32\n" // Point Y back to the start of "result"
1265  "ldi r24,8\n"
1266  "2:\n"
1267  "ld r16,Y\n" // r16:r19 = *rr
1268  "ldd r17,Y+1\n"
1269  "ldd r18,Y+2\n"
1270  "ldd r19,Y+3\n"
1271  "sub r16,r25\n"
1272  "sbc r17,__zero_reg__\n" // r16:r19:borrow -= borrow
1273  "sbc r18,__zero_reg__\n"
1274  "sbc r19,__zero_reg__\n"
1275  "mov r25,__zero_reg__\n"
1276  "sbc r25,__zero_reg__\n"
1277  "andi r25,1\n"
1278  "st Y+,r16\n" // *r++ = r16:r19
1279  "st Y+,r17\n"
1280  "st Y+,r18\n"
1281  "st Y+,r19\n"
1282  "dec r24\n"
1283  "brne 2b\n"
1284  "andi r19,0x7F\n" // Mask off the high bit in the last byte
1285  "sbiw r28,1\n"
1286  "st Y,r19\n"
1287 
1288  // Restore Y.
1289  "pop r29\n"
1290  "pop r28\n"
1291  : : "x"(x), "z"(y), "r"(result)
1292  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1293  "r24", "r25"
1294  );
1295 #endif // CURVE25519_ASM_AVR
1296 }
1297 
1310 void Curve25519::cswap(limb_t select, limb_t *x, limb_t *y)
1311 {
1312 #if !defined(CURVE25519_ASM_AVR)
1313  uint8_t posn;
1314  limb_t dummy;
1315  limb_t sel;
1316 
1317  // Turn "select" into an all-zeroes or all-ones mask. We don't care
1318  // which bit or bits is set in the original "select" value.
1319  sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
1320  --sel;
1321 
1322  // Swap the two values based on "select". Algorithm from:
1323  // http://tools.ietf.org/html/rfc7748
1324  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1325  dummy = sel & (x[posn] ^ y[posn]);
1326  x[posn] ^= dummy;
1327  y[posn] ^= dummy;
1328  }
1329 #else // CURVE25519_ASM_AVR
1330  __asm__ __volatile__ (
1331  // Combine all bytes from "select" into one and then turn
1332  // that byte into the "sel" mask in r24.
1333  "clr r24\n"
1334 #if BIGNUMBER_LIMB_8BIT
1335  "sub r24,%2\n"
1336 #elif BIGNUMBER_LIMB_16BIT
1337  "or %A2,%B2\n"
1338  "sub r24,%A2\n"
1339 #elif BIGNUMBER_LIMB_32BIT
1340  "or %A2,%B2\n"
1341  "or %A2,%C2\n"
1342  "or %A2,%D2\n"
1343  "sub r24,%A2\n"
1344 #endif
1345  "mov r24,__zero_reg__\n"
1346  "sbc r24,r24\n"
1347 
1348  // Perform the conditional swap 4 bytes at a time.
1349  "ldi r25,8\n"
1350  "1:\n"
1351  "ld r16,X+\n" // r16:r19 = *x
1352  "ld r17,X+\n"
1353  "ld r18,X+\n"
1354  "ld r19,X\n"
1355  "ld r20,Z\n" // r20:r23 = *y
1356  "ldd r21,Z+1\n"
1357  "ldd r22,Z+2\n"
1358  "ldd r23,Z+3\n"
1359  "mov r12,r16\n" // r12:r15 = (r16:r19 ^ r20:r23) & sel
1360  "mov r13,r17\n"
1361  "mov r14,r18\n"
1362  "mov r15,r19\n"
1363  "eor r12,r20\n"
1364  "eor r13,r21\n"
1365  "eor r14,r22\n"
1366  "eor r15,r23\n"
1367  "and r12,r24\n"
1368  "and r13,r24\n"
1369  "and r14,r24\n"
1370  "and r15,r24\n"
1371  "eor r16,r12\n" // r16:r19 ^= r12:r15
1372  "eor r17,r13\n"
1373  "eor r18,r14\n"
1374  "eor r19,r15\n"
1375  "eor r20,r12\n" // r20:r23 ^= r12:r15
1376  "eor r21,r13\n"
1377  "eor r22,r14\n"
1378  "eor r23,r15\n"
1379  "st X,r19\n" // *x++ = r16:r19
1380  "st -X,r18\n"
1381  "st -X,r17\n"
1382  "st -X,r16\n"
1383  "adiw r26,4\n"
1384  "st Z+,r20\n" // *y++ = r20:r23
1385  "st Z+,r21\n"
1386  "st Z+,r22\n"
1387  "st Z+,r23\n"
1388  "dec r25\n"
1389  "brne 1b\n"
1390 
1391  : : "x"(x), "z"(y), "r"(select)
1392  : "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19",
1393  "r20", "r21", "r22", "r23", "r24", "r25"
1394  );
1395 #endif // CURVE25519_ASM_AVR
1396 }
1397 
1410 void Curve25519::cmove(limb_t select, limb_t *x, const limb_t *y)
1411 {
1412 #if !defined(CURVE25519_ASM_AVR)
1413  uint8_t posn;
1414  limb_t dummy;
1415  limb_t sel;
1416 
1417  // Turn "select" into an all-zeroes or all-ones mask. We don't care
1418  // which bit or bits is set in the original "select" value.
1419  sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
1420  --sel;
1421 
1422  // Move y into x based on "select". Similar to conditional swap above.
1423  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1424  dummy = sel & (x[posn] ^ y[posn]);
1425  x[posn] ^= dummy;
1426  }
1427 #else // CURVE25519_ASM_AVR
1428  __asm__ __volatile__ (
1429  // Combine all bytes from "select" into one and then turn
1430  // that byte into the "sel" mask in r24.
1431  "clr r24\n"
1432 #if BIGNUMBER_LIMB_8BIT
1433  "sub r24,%2\n"
1434 #elif BIGNUMBER_LIMB_16BIT
1435  "or %A2,%B2\n"
1436  "sub r24,%A2\n"
1437 #elif BIGNUMBER_LIMB_32BIT
1438  "or %A2,%B2\n"
1439  "or %A2,%C2\n"
1440  "or %A2,%D2\n"
1441  "sub r24,%A2\n"
1442 #endif
1443  "mov r24,__zero_reg__\n"
1444  "sbc r24,r24\n"
1445 
1446  // Perform the conditional move 4 bytes at a time.
1447  "ldi r25,8\n"
1448  "1:\n"
1449  "ld r16,X+\n" // r16:r19 = *x
1450  "ld r17,X+\n"
1451  "ld r18,X+\n"
1452  "ld r19,X\n"
1453  "ld r20,Z+\n" // r20:r23 = *y++
1454  "ld r21,Z+\n"
1455  "ld r22,Z+\n"
1456  "ld r23,Z+\n"
1457  "eor r20,r16\n" // r20:r23 = (r16:r19 ^ r20:r23) & sel
1458  "eor r21,r17\n"
1459  "eor r22,r18\n"
1460  "eor r23,r19\n"
1461  "and r20,r24\n"
1462  "and r21,r24\n"
1463  "and r22,r24\n"
1464  "and r23,r24\n"
1465  "eor r16,r20\n" // r16:r19 ^= r20:r23
1466  "eor r17,r21\n"
1467  "eor r18,r22\n"
1468  "eor r19,r23\n"
1469  "st X,r19\n" // *x++ = r16:r19
1470  "st -X,r18\n"
1471  "st -X,r17\n"
1472  "st -X,r16\n"
1473  "adiw r26,4\n"
1474  "dec r25\n"
1475  "brne 1b\n"
1476 
1477  : : "x"(x), "z"(y), "r"(select)
1478  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1479  "r24", "r25"
1480  );
1481 #endif // CURVE25519_ASM_AVR
1482 }
1483 
1490 void Curve25519::pow250(limb_t *result, const limb_t *x)
1491 {
1492  limb_t t1[NUM_LIMBS_256BIT];
1493  uint8_t i, j;
1494 
1495  // The big-endian hexadecimal expansion of (2^250 - 1) is:
1496  // 03FFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
1497  //
1498  // The naive implementation needs to do 2 multiplications per 1 bit and
1499  // 1 multiplication per 0 bit. We can improve upon this by creating a
1500  // pattern 0000000001 ... 0000000001. If we square and multiply the
1501  // pattern by itself we can turn the pattern into the partial results
1502  // 0000000011 ... 0000000011, 0000000111 ... 0000000111, etc.
1503  // This averages out to about 1.1 multiplications per 1 bit instead of 2.
1504 
1505  // Build a pattern of 250 bits in length of repeated copies of 0000000001.
1506  #define RECIP_GROUP_SIZE 10
1507  #define RECIP_GROUP_BITS 250 // Must be a multiple of RECIP_GROUP_SIZE.
1508  square(t1, x);
1509  for (j = 0; j < (RECIP_GROUP_SIZE - 1); ++j)
1510  square(t1, t1);
1511  mul(result, t1, x);
1512  for (i = 0; i < ((RECIP_GROUP_BITS / RECIP_GROUP_SIZE) - 2); ++i) {
1513  for (j = 0; j < RECIP_GROUP_SIZE; ++j)
1514  square(t1, t1);
1515  mul(result, result, t1);
1516  }
1517 
1518  // Multiply bit-shifted versions of the 0000000001 pattern into
1519  // the result to "fill in" the gaps in the pattern.
1520  square(t1, result);
1521  mul(result, result, t1);
1522  for (j = 0; j < (RECIP_GROUP_SIZE - 2); ++j) {
1523  square(t1, t1);
1524  mul(result, result, t1);
1525  }
1526 
1527  // Clean up and exit.
1528  clean(t1);
1529 }
1530 
1538 void Curve25519::recip(limb_t *result, const limb_t *x)
1539 {
1540  // The reciprocal is the same as x ^ (p - 2) where p = 2^255 - 19.
1541  // The big-endian hexadecimal expansion of (p - 2) is:
1542  // 7FFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFEB
1543  // Start with the 250 upper bits of the expansion of (p - 2).
1544  pow250(result, x);
1545 
1546  // Deal with the 5 lowest bits of (p - 2), 01011, from highest to lowest.
1547  square(result, result);
1548  square(result, result);
1549  mul(result, result, x);
1550  square(result, result);
1551  square(result, result);
1552  mul(result, result, x);
1553  square(result, result);
1554  mul(result, result, x);
1555 }
1556 
1572 bool Curve25519::sqrt(limb_t *result, const limb_t *x)
1573 {
1574  // sqrt(-1) mod (2^255 - 19).
1575  static limb_t const numSqrtM1[NUM_LIMBS_256BIT] PROGMEM = {
1576  LIMB_PAIR(0x4A0EA0B0, 0xC4EE1B27), LIMB_PAIR(0xAD2FE478, 0x2F431806),
1577  LIMB_PAIR(0x3DFBD7A7, 0x2B4D0099), LIMB_PAIR(0x4FC1DF0B, 0x2B832480)
1578  };
1579  limb_t y[NUM_LIMBS_256BIT];
1580 
1581  // Algorithm from: http://tools.ietf.org/html/rfc7748
1582 
1583  // Compute a candidate root: result = x^((p + 3) / 8) mod p.
1584  // (p + 3) / 8 = (2^252 - 2) which is 251 one bits followed by a zero:
1585  // 0FFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE
1586  pow250(result, x);
1587  square(result, result);
1588  mul(result, result, x);
1589  square(result, result);
1590 
1591  // Did we get the square root immediately?
1592  square(y, result);
1593  if (memcmp(x, y, sizeof(y)) == 0) {
1594  clean(y);
1595  return true;
1596  }
1597 
1598  // Multiply the result by sqrt(-1) and check again.
1599  mul_P(result, result, numSqrtM1);
1600  square(y, result);
1601  if (memcmp(x, y, sizeof(y)) == 0) {
1602  clean(y);
1603  return true;
1604  }
1605 
1606  // The number does not have a square root.
1607  clean(y);
1608  return false;
1609 }
void rand(uint8_t *data, size_t len)
Generates random bytes into a caller-supplied buffer.
Definition: RNG.cpp:508
static bool eval(uint8_t result[32], const uint8_t s[32], const uint8_t x[32])
Evaluates the raw Curve25519 function.
Definition: Curve25519.cpp:80
static void unpackLE(limb_t *limbs, size_t count, const uint8_t *bytes, size_t len)
Unpacks the little-endian byte representation of a big number into a limb array.
static void packLE(uint8_t *bytes, size_t len, const limb_t *limbs, size_t count)
Packs the little-endian byte representation of a big number into a byte array.
static void dh1(uint8_t k[32], uint8_t f[32])
Performs phase 1 of a Diffie-Hellman key exchange using Curve25519.
Definition: Curve25519.cpp:245
static bool dh2(uint8_t k[32], uint8_t f[32])
Performs phase 2 of a Diffie-Hellman key exchange using Curve25519.
Definition: Curve25519.cpp:283