ArduinoLibs
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Curve25519.cpp
1 /*
2  * Copyright (C) 2015 Southern Storm Software, Pty Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "Curve25519.h"
24 #include "Crypto.h"
25 #include "RNG.h"
26 #include "utility/LimbUtil.h"
27 #include <string.h>
28 
// Build-time configuration for this file.
//
// CURVE25519_ASM_AVR is defined (to 1) only when the compiler predefines
// __AVR__.  The arithmetic routines below test it with
// "#if !defined(CURVE25519_ASM_AVR)" to choose between the portable C++
// code and the hand-written AVR inline assembly.
44 // Global switch to enable/disable AVR inline assembly optimizations.
45 #if defined(__AVR__)
46 #define CURVE25519_ASM_AVR 1
47 #endif
48 
// strict_clean(x) expands to clean(x) when CURVE25519_STRICT_CLEAN is
// defined.  Otherwise it expands to an empty do/while statement, so call
// sites written as "strict_clean(temp);" compile either way.
49 // The overhead of clean() calls in mul(), reduceQuick(), etc can
50 // add up to a lot of processing time during eval(). Only do such
51 // cleanups if strict mode has been enabled. Other implementations
52 // like curve25519-donna don't do any cleaning at all so the value
53 // of cleaning up the stack is dubious at best anyway.
54 #if defined(CURVE25519_STRICT_CLEAN)
55 #define strict_clean(x) clean(x)
56 #else
57 #define strict_clean(x) do { ; } while (0)
58 #endif
59 
79 bool Curve25519::eval(uint8_t result[32], const uint8_t s[32], const uint8_t x[32])
80 {
81  limb_t x_1[NUM_LIMBS_256BIT];
82  limb_t x_2[NUM_LIMBS_256BIT];
83  limb_t x_3[NUM_LIMBS_256BIT];
84  limb_t z_2[NUM_LIMBS_256BIT];
85  limb_t z_3[NUM_LIMBS_256BIT];
86  limb_t A[NUM_LIMBS_256BIT];
87  limb_t B[NUM_LIMBS_256BIT];
88  limb_t C[NUM_LIMBS_256BIT];
89  limb_t D[NUM_LIMBS_256BIT];
90  limb_t E[NUM_LIMBS_256BIT];
91  limb_t AA[NUM_LIMBS_256BIT];
92  limb_t BB[NUM_LIMBS_256BIT];
93  limb_t DA[NUM_LIMBS_256BIT];
94  limb_t CB[NUM_LIMBS_256BIT];
95  uint8_t mask;
96  uint8_t sposn;
97  uint8_t select;
98  uint8_t swap;
99  bool retval;
100 
101  // Unpack the "x" argument into the limb representation
102  // which also masks off the high bit. NULL means 9.
103  if (x) {
104  // x1 = x
105  BigNumberUtil::unpackLE(x_1, NUM_LIMBS_256BIT, x, 32);
106  x_1[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
107  } else {
108  memset(x_1, 0, sizeof(x_1)); // x_1 = 9
109  x_1[0] = 9;
110  }
111 
112  // Check that "x" is within the range of the modulo field.
113  // We can do this with a reduction - if there was no borrow
114  // then the value of "x" was out of range. Timing is sensitive
115  // here so that we don't reveal anything about the value of "x".
116  // If there was a reduction, then continue executing the rest
117  // of this function with the (now) in-range "x" value and
118  // report the failure at the end.
119  retval = (bool)(reduceQuick(x_1) & 0x01);
120 
121  // Initialize the other temporary variables.
122  memset(x_2, 0, sizeof(x_2)); // x_2 = 1
123  x_2[0] = 1;
124  memset(z_2, 0, sizeof(z_2)); // z_2 = 0
125  memcpy(x_3, x_1, sizeof(x_1)); // x_3 = x
126  memcpy(z_3, x_2, sizeof(x_2)); // z_3 = 1
127 
128  // Iterate over all 255 bits of "s" from the highest to the lowest.
129  // We ignore the high bit of the 256-bit representation of "s".
130  mask = 0x40;
131  sposn = 31;
132  swap = 0;
133  for (uint8_t t = 255; t > 0; --t) {
134  // Conditional swaps on entry to this bit but only if we
135  // didn't swap on the previous bit.
136  select = s[sposn] & mask;
137  swap ^= select;
138  cswap(swap, x_2, x_3);
139  cswap(swap, z_2, z_3);
140 
141  // Evaluate the curve.
142  add(A, x_2, z_2); // A = x_2 + z_2
143  square(AA, A); // AA = A^2
144  sub(B, x_2, z_2); // B = x_2 - z_2
145  square(BB, B); // BB = B^2
146  sub(E, AA, BB); // E = AA - BB
147  add(C, x_3, z_3); // C = x_3 + z_3
148  sub(D, x_3, z_3); // D = x_3 - z_3
149  mul(DA, D, A); // DA = D * A
150  mul(CB, C, B); // CB = C * B
151  add(x_3, DA, CB); // x_3 = (DA + CB)^2
152  square(x_3, x_3);
153  sub(z_3, DA, CB); // z_3 = x_1 * (DA - CB)^2
154  square(z_3, z_3);
155  mul(z_3, z_3, x_1);
156  mul(x_2, AA, BB); // x_2 = AA * BB
157  mulA24(z_2, E); // z_2 = E * (AA + a24 * E)
158  add(z_2, z_2, AA);
159  mul(z_2, z_2, E);
160 
161  // Move onto the next lower bit of "s".
162  mask >>= 1;
163  if (!mask) {
164  --sposn;
165  mask = 0x80;
166  swap = select << 7;
167  } else {
168  swap = select >> 1;
169  }
170  }
171 
172  // Final conditional swaps.
173  cswap(swap, x_2, x_3);
174  cswap(swap, z_2, z_3);
175 
176  // Compute x_2 * (z_2 ^ (p - 2)) where p = 2^255 - 19.
177  recip(z_3, z_2);
178  mul(x_2, x_2, z_3);
179 
180  // Pack the result into the return array.
181  BigNumberUtil::packLE(result, 32, x_2, NUM_LIMBS_256BIT);
182 
183  // Clean up and exit.
184  clean(x_1);
185  clean(x_2);
186  clean(x_3);
187  clean(z_2);
188  clean(z_3);
189  clean(A);
190  clean(B);
191  clean(C);
192  clean(D);
193  clean(E);
194  clean(AA);
195  clean(BB);
196  clean(DA);
197  clean(CB);
198  return retval;
199 }
200 
244 void Curve25519::dh1(uint8_t k[32], uint8_t f[32])
245 {
246  do {
247  // Generate a random "f" value and then adjust the value to make
248  // it valid as an "s" value for eval(). According to the specification
249  // we need to mask off the 3 right-most bits of f[0], mask off the
250  // left-most bit of f[31], and set the second to left-most bit of f[31].
251  RNG.rand(f, 32);
252  f[0] &= 0xF8;
253  f[31] = (f[31] & 0x7F) | 0x40;
254 
255  // Evaluate the curve function: k = Curve25519::eval(f, 9).
256  // We pass NULL to eval() to indicate the value 9. There is no
257  // need to check the return value from eval() because we know
258  // that 9 is a valid field element.
259  eval(k, f, 0);
260 
261  // If "k" is weak for contributory behaviour then reject it,
262  // generate another "f" value, and try again. This case is
263  // highly unlikely but we still perform the check just in case.
264  } while (isWeakPoint(k));
265 }
266 
282 bool Curve25519::dh2(uint8_t k[32], uint8_t f[32])
283 {
284  uint8_t weak;
285 
286  // Evaluate the curve function: k = Curve25519::eval(f, k).
287  // If "k" is weak for contributory behaviour before or after
288  // the curve evaluation, then fail the exchange. For safety
289  // we perform every phase of the weak checks even if we could
290  // bail out earlier so that the execution takes the same
291  // amount of time for weak and non-weak "k" values.
292  weak = isWeakPoint(k); // Is "k" weak before?
293  weak |= ((eval(k, f, k) ^ 0x01) & 0x01); // Is "k" weak during?
294  weak |= isWeakPoint(k); // Is "k" weak after?
295  clean(f, 32);
296  return (bool)((weak ^ 0x01) & 0x01);
297 }
298 
306 uint8_t Curve25519::isWeakPoint(const uint8_t k[32])
307 {
308  // List of weak points from http://cr.yp.to/ecdh.html
309  // That page lists some others but they are variants on these
310  // of the form "point + i * (2^255 - 19)" for i = 0, 1, 2.
311  // Here we mask off the high bit and eval() catches the rest.
312  static const uint8_t points[5][32] PROGMEM = {
313  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
314  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
315  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
316  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
317  {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
318  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
319  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
320  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
321  {0xE0, 0xEB, 0x7A, 0x7C, 0x3B, 0x41, 0xB8, 0xAE,
322  0x16, 0x56, 0xE3, 0xFA, 0xF1, 0x9F, 0xC4, 0x6A,
323  0xDA, 0x09, 0x8D, 0xEB, 0x9C, 0x32, 0xB1, 0xFD,
324  0x86, 0x62, 0x05, 0x16, 0x5F, 0x49, 0xB8, 0x00},
325  {0x5F, 0x9C, 0x95, 0xBC, 0xA3, 0x50, 0x8C, 0x24,
326  0xB1, 0xD0, 0xB1, 0x55, 0x9C, 0x83, 0xEF, 0x5B,
327  0x04, 0x44, 0x5C, 0xC4, 0x58, 0x1C, 0x8E, 0x86,
328  0xD8, 0x22, 0x4E, 0xDD, 0xD0, 0x9F, 0x11, 0x57},
329  {0xEC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
330  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
331  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
332  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}
333  };
334 
335  // Check each of the weak points in turn. We perform the
336  // comparisons carefully so as not to reveal the value of "k"
337  // in the instruction timing. If "k" is indeed weak then
338  // we still check everything so as not to reveal which
339  // weak point it is.
340  uint8_t result = 0;
341  for (uint8_t posn = 0; posn < 5; ++posn) {
342  const uint8_t *point = points[posn];
343  uint8_t check = (pgm_read_byte(point + 31) ^ k[31]) & 0x7F;
344  for (uint8_t index = 31; index > 0; --index)
345  check |= (pgm_read_byte(point + index - 1) ^ k[index - 1]);
346  result |= (uint8_t)((((uint16_t)0x0100) - check) >> 8);
347  }
348 
349  // The "result" variable will be non-zero if there was a match.
350  return result;
351 }
352 
/**
 * Reduces the multi-limb value in "x" modulo 2^255 - 19 and stores the
 * NUM_LIMBS_256BIT-limb answer in "result".
 *
 * result  Receives the reduced value.
 * x       The value to reduce.  It is modified in place: the low
 *         NUM_LIMBS_256BIT limbs are overwritten with intermediate sums
 *         and the limbs from index NUM_LIMBS_256BIT upwards are reused as
 *         scratch space for the trial subtraction, so the array must
 *         provide NUM_LIMBS_512BIT limbs (the callers in this file pass a
 *         NUM_LIMBS_512BIT array).
 * size    Number of limbs starting at x[NUM_LIMBS_256BIT] that are folded
 *         into the low half.  Callers in this file pass NUM_LIMBS_256BIT
 *         or NUM_A24_LIMBS.  The AVR path converts this to a byte count.
 *
 * The final choice between the two candidate answers is made with a mask
 * (portable path) or with XOR/AND (AVR path) rather than with a branch.
 *
 * NOTE(review): the AVR path uses fixed byte offsets of 31 and 32 from the
 * start of "x", i.e. it presumably assumes NUM_LIMBS_256BIT limbs occupy
 * exactly 32 bytes - confirm against the limb configuration.
 */
365 void Curve25519::reduce(limb_t *result, limb_t *x, uint8_t size)
366 {
367  /*
368  Note: This explanation is best viewed with a UTF-8 text viewer.
369 
370  To help explain what this function is doing, the following describes
371  how to efficiently compute reductions modulo a base of the form (2ⁿ - b)
372  where b is greater than zero and (b + 1)² <= 2ⁿ.
373 
374  Here we are interested in reducing the result of multiplying two
375  numbers that are less than or equal to (2ⁿ - b - 1). That is,
376  multiplying numbers that have already been reduced.
377 
378  Given some x less than or equal to (2ⁿ - b - 1)², we want to find a
379  y less than (2ⁿ - b) such that:
380 
381  y ≡ x mod (2ⁿ - b)
382 
383  We know that for all integer values of k >= 0:
384 
385  y ≡ x - k * (2ⁿ - b)
386  ≡ x - k * 2ⁿ + k * b
387 
388  In our case we choose k = ⌊x / 2ⁿ⌋ and then let:
389 
390  w = (x mod 2ⁿ) + ⌊x / 2ⁿ⌋ * b
391 
392  The value w will either be the answer y or y can be obtained by
393  repeatedly subtracting (2ⁿ - b) from w until it is less than (2ⁿ - b).
394  At most b subtractions will be required.
395 
396  In our case b is 19 which is more subtractions than we would like to do,
397  but we can handle that by performing the above reduction twice and then
398  performing a single trial subtraction:
399 
400  w = (x mod 2ⁿ) + ⌊x / 2ⁿ⌋ * b
401  y = (w mod 2ⁿ) + ⌊w / 2ⁿ⌋ * b
402  if y >= (2ⁿ - b)
403  y -= (2ⁿ - b)
404 
405  The value y is the answer we want for reducing x modulo (2ⁿ - b).
406  */
407 
408 #if !defined(CURVE25519_ASM_AVR)
409  dlimb_t carry;
410  uint8_t posn;
411 
412  // Calculate (x mod 2^255) + ((x / 2^255) * 19) which will
413  // either produce the answer we want or it will produce a
414  // value of the form "answer + j * (2^255 - 19)".
// Bit 255 of the low half is worth 19; each limb of the high half sits
// 256 bits up and is therefore worth 2 * 19 = 38 times its value.
415  carry = ((dlimb_t)(x[NUM_LIMBS_256BIT - 1] >> (LIMB_BITS - 1))) * 19U;
416  x[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
417  for (posn = 0; posn < size; ++posn) {
418  carry += ((dlimb_t)(x[posn + NUM_LIMBS_256BIT])) * 38U;
419  carry += x[posn];
420  x[posn] = (limb_t)carry;
421  carry >>= LIMB_BITS;
422  }
423  if (size < NUM_LIMBS_256BIT) {
424  // The high order half of the number is short; e.g. for mulA24().
425  // Propagate the carry through the rest of the low order part.
426  for (posn = size; posn < NUM_LIMBS_256BIT; ++posn) {
427  carry += x[posn];
428  x[posn] = (limb_t)carry;
429  carry >>= LIMB_BITS;
430  }
431  }
432 
433  // The "j" value may still be too large due to the final carry-out.
434  // We must repeat the reduction. If we already have the answer,
435  // then this won't do any harm but we must still do the calculation
436  // to preserve the overall timing.
437  carry *= 38U;
438  carry += ((dlimb_t)(x[NUM_LIMBS_256BIT - 1] >> (LIMB_BITS - 1))) * 19U;
439  x[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
440  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
441  carry += x[posn];
442  x[posn] = (limb_t)carry;
443  carry >>= LIMB_BITS;
444  }
445 
446  // At this point "x" will either be the answer or it will be the
447  // answer plus (2^255 - 19). Perform a trial subtraction which
448  // is equivalent to adding 19 and subtracting 2^255. We put the
449  // trial answer into the top-most limbs of the original "x" array.
450  // We add 19 here; the subtraction of 2^255 occurs in the next step.
451  carry = 19U;
452  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
453  carry += x[posn];
454  x[posn + NUM_LIMBS_256BIT] = (limb_t)carry;
455  carry >>= LIMB_BITS;
456  }
457 
458  // If there was a borrow, then the bottom-most limbs of "x" are the
459  // correct answer. If there was no borrow, then the top-most limbs
460  // of "x" are the correct answer. Select the correct answer but do
461  // it in a way that instruction timing will not reveal which value
462  // was selected. Borrow will occur if the high bit of the previous
463  // result is 0: turn the high bit into a selection mask.
464  limb_t mask = (limb_t)(((slimb_t)(x[NUM_LIMBS_512BIT - 1])) >> (LIMB_BITS - 1));
465  limb_t nmask = ~mask;
466  x[NUM_LIMBS_512BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
467  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
468  result[posn] = (x[posn] & nmask) | (x[posn + NUM_LIMBS_256BIT] & mask);
469  }
470 #else
// AVR implementation of the same steps.  Operands: Z = x, %1 = size in
// bytes, X = result.  r1 (__zero_reg__) is overwritten by "mul", so r22
// stands in for zero until r1 is restored part-way through.
471  __asm__ __volatile__ (
472  // Calculate (x mod 2^255) + ((x / 2^255) * 19) which will
473  // either produce the answer we want or it will produce a
474  // value of the form "answer + j * (2^255 - 19)".
475  "ldd r24,Z+31\n" // Extract the high bit of x[31]
476  "mov r25,r24\n" // and mask it off
477  "andi r25,0x7F\n"
478  "std Z+31,r25\n"
479  "lsl r24\n" // carry = high bit * 19
480  "mov r24,__zero_reg__\n"
481  "sbc r24,__zero_reg__\n"
482  "andi r24,19\n"
483 
484  "mov r25,%1\n" // load "size" into r25
485  "ldi r23,38\n" // r23 = 38
486  "mov r22,__zero_reg__\n" // r22 = 0 (we're about to destroy r1)
487  "1:\n"
488  "ld r16,Z\n" // r16 = x[0]
489  "ldd r17,Z+32\n" // r17 = x[32]
490  "mul r17,r23\n" // r0:r1 = r17 * 38
491  "add r0,r24\n" // r0:r1 += carry
492  "adc r1,r22\n"
493  "add r0,r16\n" // r0:r1 += r16
494  "adc r1,r22\n"
495  "st Z+,r0\n" // *x++ = r0
496  "mov r24,r1\n" // carry = r1
497  "dec r25\n" // if (--r25 != 0) loop
498  "brne 1b\n"
499 
500  // If the size is short, then we need to continue propagating carries.
501  "ldi r25,32\n"
502  "cp %1,r25\n"
503  "breq 3f\n"
504  "sub r25,%1\n"
505  "ld __tmp_reg__,Z\n"
506  "add __tmp_reg__,r24\n"
507  "st Z+,__tmp_reg__\n"
508  "dec r25\n"
509  "2:\n"
510  "ld __tmp_reg__,Z\n" // *x++ += carry
511  "adc __tmp_reg__,r22\n"
512  "st Z+,__tmp_reg__\n"
513  "dec r25\n"
514  "brne 2b\n"
515  "mov r24,r22\n" // put the carry back into r24
516  "adc r24,r22\n"
517  "3:\n"
518  "sbiw r30,32\n" // Point Z back to the start of "x"
519 
520  // The "j" value may still be too large due to the final carry-out.
521  // We must repeat the reduction. If we already have the answer,
522  // then this won't do any harm but we must still do the calculation
523  // to preserve the overall timing.
524  "mul r24,r23\n" // carry *= 38
525  "ldd r24,Z+31\n" // Extract the high bit of x[31]
526  "mov r25,r24\n" // and mask it off
527  "andi r25,0x7F\n"
528  "std Z+31,r25\n"
529  "lsl r24\n" // carry += high bit * 19
530  "mov r24,r22\n"
531  "sbc r24,r22\n"
532  "andi r24,19\n"
533  "add r0,r24\n"
534  "adc r1,r22\n" // 9-bit carry is now in r0:r1
535 
536  // Propagate the carry through the rest of x.
537  "ld r24,Z\n" // x[0]
538  "add r0,r24\n"
539  "adc r1,r22\n"
540  "st Z+,r0\n"
541  "ld r24,Z\n" // x[1]
542  "add r1,r24\n"
543  "st Z+,r1\n"
544  "ldi r25,30\n" // x[2..31]
545  "4:\n"
546  "ld r24,Z\n"
547  "adc r24,r22\n"
548  "st Z+,r24\n"
549  "dec r25\n"
550  "brne 4b\n"
551  "sbiw r30,32\n" // Point Z back to the start of "x"
552 
553  // We destroyed __zero_reg__ (r1) above, so restore its zero value.
554  "mov __zero_reg__,r22\n"
555 
556  // At this point "x" will either be the answer or it will be the
557  // answer plus (2^255 - 19). Perform a trial subtraction which
558  // is equivalent to adding 19 and subtracting 2^255. We put the
559  // trial answer into the top-most limbs of the original "x" array.
560  // We add 19 here; the subtraction of 2^255 occurs in the next step.
561  "ldi r24,8\n" // Loop counter.
562  "ldi r25,19\n" // carry = 19
563  "5:\n"
564  "ld r16,Z+\n" // r16:r19:carry = *xx++ + carry
565  "ld r17,Z+\n"
566  "ld r18,Z+\n"
567  "ld r19,Z+\n"
568  "add r16,r25\n" // r16:r19:carry += carry
569  "adc r17,__zero_reg__\n"
570  "adc r18,__zero_reg__\n"
571  "adc r19,__zero_reg__\n"
572  "mov r25,__zero_reg__\n"
573  "adc r25,r25\n"
574  "std Z+28,r16\n" // *tt++ = r16:r19
575  "std Z+29,r17\n"
576  "std Z+30,r18\n"
577  "std Z+31,r19\n"
578  "dec r24\n"
579  "brne 5b\n"
580 
581  // Subtract 2^255 from x[32..63] which is equivalent to extracting
582  // the top bit and then masking it off. If the top bit is zero
583  // then a borrow has occurred and this isn't the answer we want.
584  "mov r25,r19\n"
585  "andi r19,0x7F\n"
586  "std Z+31,r19\n"
587  "lsl r25\n"
588  "mov r25,__zero_reg__\n"
589  "sbc r25,__zero_reg__\n"
590 
591  // At this point, r25 is 0 if the original x[0..31] is the answer
592  // we want, or 0xFF if x[32..63] is the answer we want. Essentially
593  // we need to do a conditional move of either x[0..31] or x[32..63]
594  // into "result".
595  "sbiw r30,32\n" // Point Z back to x[0].
596  "ldi r24,8\n"
597  "6:\n"
598  "ldd r16,Z+32\n"
599  "ldd r17,Z+33\n"
600  "ldd r18,Z+34\n"
601  "ldd r19,Z+35\n"
602  "ld r20,Z+\n"
603  "ld r21,Z+\n"
604  "ld r22,Z+\n"
605  "ld r23,Z+\n"
606  "eor r16,r20\n"
607  "eor r17,r21\n"
608  "eor r18,r22\n"
609  "eor r19,r23\n"
610  "and r16,r25\n"
611  "and r17,r25\n"
612  "and r18,r25\n"
613  "and r19,r25\n"
614  "eor r20,r16\n"
615  "eor r21,r17\n"
616  "eor r22,r18\n"
617  "eor r23,r19\n"
618  "st X+,r20\n"
619  "st X+,r21\n"
620  "st X+,r22\n"
621  "st X+,r23\n"
622  "dec r24\n"
623  "brne 6b\n"
624 
625  : : "z"(x), "r"((uint8_t)(size * sizeof(limb_t))), "x"(result)
626  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
627  "r24", "r25"
628  );
629 #endif
630 }
631 
645 limb_t Curve25519::reduceQuick(limb_t *x)
646 {
647 #if !defined(CURVE25519_ASM_AVR)
648  limb_t temp[NUM_LIMBS_256BIT];
649  dlimb_t carry;
650  uint8_t posn;
651  limb_t *xx;
652  limb_t *tt;
653 
654  // Perform a trial subtraction of (2^255 - 19) from "x" which is
655  // equivalent to adding 19 and subtracting 2^255. We add 19 here;
656  // the subtraction of 2^255 occurs in the next step.
657  carry = 19U;
658  xx = x;
659  tt = temp;
660  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
661  carry += *xx++;
662  *tt++ = (limb_t)carry;
663  carry >>= LIMB_BITS;
664  }
665 
666  // If there was a borrow, then the original "x" is the correct answer.
667  // If there was no borrow, then "temp" is the correct answer. Select the
668  // correct answer but do it in a way that instruction timing will not
669  // reveal which value was selected. Borrow will occur if the high bit
670  // of "temp" is 0: turn the high bit into a selection mask.
671  limb_t mask = (limb_t)(((slimb_t)(temp[NUM_LIMBS_256BIT - 1])) >> (LIMB_BITS - 1));
672  limb_t nmask = ~mask;
673  temp[NUM_LIMBS_256BIT - 1] &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
674  xx = x;
675  tt = temp;
676  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
677  *xx = ((*xx) & nmask) | ((*tt++) & mask);
678  ++xx;
679  }
680 
681  // Clean up "temp".
682  strict_clean(temp);
683 
684  // Return a zero value if we actually subtracted (2^255 - 19) from "x".
685  return nmask;
686 #else // CURVE25519_ASM_AVR
687  limb_t temp[NUM_LIMBS_256BIT];
688  uint8_t result;
689  __asm__ __volatile__ (
690  // Subtract (2^255 - 19) from "x", which is the same as adding 19
691  // and then subtracting 2^255.
692  "ldi r24,8\n" // Loop counter.
693  "ldi r25,19\n" // carry = 19
694  "1:\n"
695  "ld r16,Z+\n" // r16:r19:carry = *xx++ + carry
696  "ld r17,Z+\n"
697  "ld r18,Z+\n"
698  "ld r19,Z+\n"
699  "add r16,r25\n" // r16:r19:carry += carry
700  "adc r17,__zero_reg__\n"
701  "adc r18,__zero_reg__\n"
702  "adc r19,__zero_reg__\n"
703  "mov r25,__zero_reg__\n"
704  "adc r25,r25\n"
705  "st X+,r16\n" // *tt++ = r16:r19
706  "st X+,r17\n"
707  "st X+,r18\n"
708  "st X+,r19\n"
709  "dec r24\n"
710  "brne 1b\n"
711 
712  // Subtract 2^255 from "temp" which is equivalent to extracting
713  // the top bit and then masking it off. If the top bit is zero
714  // then a borrow has occurred and this isn't the answer we want.
715  "mov r25,r19\n"
716  "andi r19,0x7F\n"
717  "st -X,r19\n"
718  "lsl r25\n"
719  "mov r25,__zero_reg__\n"
720  "sbc r25,__zero_reg__\n"
721 
722  // At this point, r25 is 0 if the original "x" is the answer
723  // we want, or 0xFF if "temp" is the answer we want. Essentially
724  // we need to do a conditional move of "temp" into "x".
725  "sbiw r26,31\n" // Point X back to the start of "temp".
726  "sbiw r30,32\n" // Point Z back to the start of "x".
727  "ldi r24,8\n"
728  "2:\n"
729  "ld r16,X+\n"
730  "ld r17,X+\n"
731  "ld r18,X+\n"
732  "ld r19,X+\n"
733  "ld r20,Z\n"
734  "ldd r21,Z+1\n"
735  "ldd r22,Z+2\n"
736  "ldd r23,Z+3\n"
737  "eor r16,r20\n"
738  "eor r17,r21\n"
739  "eor r18,r22\n"
740  "eor r19,r23\n"
741  "and r16,r25\n"
742  "and r17,r25\n"
743  "and r18,r25\n"
744  "and r19,r25\n"
745  "eor r20,r16\n"
746  "eor r21,r17\n"
747  "eor r22,r18\n"
748  "eor r23,r19\n"
749  "st Z+,r20\n"
750  "st Z+,r21\n"
751  "st Z+,r22\n"
752  "st Z+,r23\n"
753  "dec r24\n"
754  "brne 2b\n"
755  "mov %0,r25\n"
756  : "=r"(result)
757  : "x"(temp), "z"(x)
758  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
759  "r24", "r25"
760  );
761  strict_clean(temp);
762  return result;
763 #endif // CURVE25519_ASM_AVR
764 }
765 
/**
 * Multiplies two NUM_LIMBS_256BIT-limb values without reducing the product.
 *
 * result  Receives the double-width product.  Limbs at indices 0 up to
 *         (2 * NUM_LIMBS_256BIT - 1) are written.
 * x       First operand, NUM_LIMBS_256BIT limbs.
 * y       Second operand, NUM_LIMBS_256BIT limbs.
 *
 * The product is built one limb (AVR path: one byte) of "x" at a time:
 * the first row is stored directly and every later row is added into
 * "result" one position further up.
 *
 * NOTE(review): "result" is written while later limbs of "x" and all of
 * "y" are still to be read, so it presumably must not overlap either
 * input; mul() in this file passes a separate temporary.
 */
778 void Curve25519::mulNoReduce(limb_t *result, const limb_t *x, const limb_t *y)
779 {
780 #if !defined(CURVE25519_ASM_AVR)
781  uint8_t i, j;
782  dlimb_t carry;
783  limb_t word;
784  const limb_t *yy;
785  limb_t *rr;
786 
787  // Multiply the lowest word of x by y.
788  carry = 0;
789  word = x[0];
790  yy = y;
791  rr = result;
792  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
793  carry += ((dlimb_t)(*yy++)) * word;
794  *rr++ = (limb_t)carry;
795  carry >>= LIMB_BITS;
796  }
797  *rr = (limb_t)carry;
798 
799  // Multiply and add the remaining words of x by y.
800  for (i = 1; i < NUM_LIMBS_256BIT; ++i) {
801  word = x[i];
802  carry = 0;
803  yy = y;
804  rr = result + i;
805  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
806  carry += ((dlimb_t)(*yy++)) * word;
807  carry += *rr;
808  *rr++ = (limb_t)carry;
809  carry >>= LIMB_BITS;
810  }
811  *rr = (limb_t)carry;
812  }
813 #else
// AVR implementation.  Operands: X = x, Z = y, %2 = result (copied into
// the Y pointer, which is saved and restored around the block).  "mul"
// overwrites r1 (__zero_reg__), so r22 is used as zero and r1 is cleared
// again at the end.
814  __asm__ __volatile__ (
815  // Save Y and copy the "result" pointer into it.
816  "push r28\n"
817  "push r29\n"
818  "mov r28,%A2\n"
819  "mov r29,%B2\n"
820 
821  // Multiply the first byte of "x" by y[0..31].
822  "ldi r25,8\n" // loop 8 times: 4 bytes of y each time
823  "clr r24\n" // carry = 0
824  "clr r22\n" // r22 = 0 to replace __zero_reg__
825  "ld r23,X+\n" // r23 = *x++
826  "1:\n"
827  "ld r16,Z\n" // r16 = y[0]
828  "mul r16,r23\n" // r8:r9 = y[0] * r23
829  "movw r8,r0\n"
830  "ldd r16,Z+2\n" // r16 = y[2]
831  "mul r16,r23\n" // r10:r11 = y[2] * r23
832  "movw r10,r0\n"
833  "ldd r16,Z+1\n" // r16 = y[1]
834  "mul r16,r23\n" // r9:r10:r11 += y[1] * r23
835  "add r9,r0\n"
836  "adc r10,r1\n"
837  "adc r11,r22\n"
838  "ldd r16,Z+3\n" // r16 = y[3]
839  "mul r16,r23\n" // r11:r1 += y[3] * r23
840  "add r11,r0\n"
841  "adc r1,r22\n"
842  "add r8,r24\n" // r8:r9:r10:r11:r1 += carry
843  "adc r9,r22\n"
844  "adc r10,r22\n"
845  "adc r11,r22\n"
846  "adc r1,r22\n"
847  "mov r24,r1\n" // carry = r1
848  "st Y+,r8\n" // *rr++ = r8:r9:r10:r11
849  "st Y+,r9\n"
850  "st Y+,r10\n"
851  "st Y+,r11\n"
852  "adiw r30,4\n"
853  "dec r25\n"
854  "brne 1b\n"
855  "st Y+,r24\n" // *rr++ = carry
856  "sbiw r28,32\n" // rr -= 32
857  "sbiw r30,32\n" // Point Z back to the start of y
858 
859  // Multiply and add the remaining bytes of "x" by y[0..31].
860  "ldi r21,31\n" // 31 more bytes of x to go.
861  "2:\n"
862  "ldi r25,8\n" // loop 8 times: 4 bytes of y each time
863  "clr r24\n" // carry = 0
864  "ld r23,X+\n" // r23 = *x++
865  "3:\n"
866  "ld r16,Z\n" // r16 = y[0]
867  "mul r16,r23\n" // r8:r9 = y[0] * r23
868  "movw r8,r0\n"
869  "ldd r16,Z+2\n" // r16 = y[2]
870  "mul r16,r23\n" // r10:r11 = y[2] * r23
871  "movw r10,r0\n"
872  "ldd r16,Z+1\n" // r16 = y[1]
873  "mul r16,r23\n" // r9:r10:r11 += y[1] * r23
874  "add r9,r0\n"
875  "adc r10,r1\n"
876  "adc r11,r22\n"
877  "ldd r16,Z+3\n" // r16 = y[3]
878  "mul r16,r23\n" // r11:r1 += y[3] * r23
879  "add r11,r0\n"
880  "adc r1,r22\n"
881  "add r8,r24\n" // r8:r9:r10:r11:r1 += carry
882  "adc r9,r22\n"
883  "adc r10,r22\n"
884  "adc r11,r22\n"
885  "adc r1,r22\n"
886  "ld r16,Y\n" // r8:r9:r10:r11:r1 += rr[0..3]
887  "add r8,r16\n"
888  "ldd r16,Y+1\n"
889  "adc r9,r16\n"
890  "ldd r16,Y+2\n"
891  "adc r10,r16\n"
892  "ldd r16,Y+3\n"
893  "adc r11,r16\n"
894  "adc r1,r22\n"
895  "mov r24,r1\n" // carry = r1
896  "st Y+,r8\n" // *rr++ = r8:r9:r10:r11
897  "st Y+,r9\n"
898  "st Y+,r10\n"
899  "st Y+,r11\n"
900  "adiw r30,4\n"
901  "dec r25\n"
902  "brne 3b\n"
903  "st Y+,r24\n" // *r++ = carry
904  "sbiw r28,32\n" // rr -= 32
905  "sbiw r30,32\n" // Point Z back to the start of y
906  "dec r21\n"
907  "brne 2b\n"
908 
909  // Restore Y and __zero_reg__.
910  "pop r29\n"
911  "pop r28\n"
912  "clr __zero_reg__\n"
913  : : "x"(x), "z"(y), "r"(result)
914  : "r8", "r9", "r10", "r11", "r16", "r20", "r21", "r22",
915  "r23", "r24", "r25"
916  );
917 #endif
918 }
919 
930 void Curve25519::mul(limb_t *result, const limb_t *x, const limb_t *y)
931 {
932  limb_t temp[NUM_LIMBS_512BIT];
933  mulNoReduce(temp, x, y);
934  reduce(result, temp, NUM_LIMBS_256BIT);
935  strict_clean(temp);
936 }
937 
/**
 * Multiplies "x" by the constant a24 = 121665 (0x1DB41) and reduces the
 * product with reduce().
 *
 * result  Receives the reduced product.
 * x       Operand, NUM_LIMBS_256BIT limbs.
 *
 * Because a24 fits in three bytes, the unreduced product extends only
 * NUM_A24_LIMBS limbs beyond NUM_LIMBS_256BIT; that count is passed to
 * reduce() as the size of the high part.
 *
 * NUM_A24_LIMBS is #define'd inside this function by whichever branch is
 * compiled and, being a preprocessor macro, stays defined for the rest of
 * the file.
 */
957 void Curve25519::mulA24(limb_t *result, const limb_t *x)
958 {
959 #if !defined(CURVE25519_ASM_AVR)
960  // The constant a24 = 121665 (0x1DB41) as a limb array.
961 #if BIGNUMBER_LIMB_8BIT
962  static limb_t const a24[3] PROGMEM = {0x41, 0xDB, 0x01};
963 #elif BIGNUMBER_LIMB_16BIT
964  static limb_t const a24[2] PROGMEM = {0xDB41, 0x0001};
965 #elif BIGNUMBER_LIMB_32BIT
966  static limb_t const a24[1] PROGMEM = {0x0001DB41};
967 #else
968  #error "limb_t must be 8, 16, or 32 bits in size"
969 #endif
970  #define NUM_A24_LIMBS (sizeof(a24) / sizeof(limb_t))
971 
972  // Multiply the lowest limb of a24 by x and zero-extend into the result.
973  limb_t temp[NUM_LIMBS_512BIT];
974  uint8_t i, j;
975  dlimb_t carry = 0;
976  limb_t word = pgm_read_limb(&(a24[0]));
977  const limb_t *xx = x;
978  limb_t *tt = temp;
979  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
980  carry += ((dlimb_t)(*xx++)) * word;
981  *tt++ = (limb_t)carry;
982  carry >>= LIMB_BITS;
983  }
984  *tt = (limb_t)carry;
985 
986  // Multiply and add the remaining limbs of a24.
987  for (i = 1; i < NUM_A24_LIMBS; ++i) {
988  word = pgm_read_limb(&(a24[i]));
989  carry = 0;
990  xx = x;
991  tt = temp + i;
992  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
993  carry += ((dlimb_t)(*xx++)) * word;
994  carry += *tt;
995  *tt++ = (limb_t)carry;
996  carry >>= LIMB_BITS;
997  }
998  *tt = (limb_t)carry;
999  }
1000 #else
1001  limb_t temp[NUM_LIMBS_512BIT];
// Three bytes of a24, rounded up to a whole number of limbs.
1002  #define NUM_A24_LIMBS ((3 + sizeof(limb_t) - 1) / sizeof(limb_t))
// AVR implementation.  Operands: X = x, Z = temp.  "mul" overwrites r1
// (__zero_reg__), so r22 serves as zero and r1 is cleared at the end.
1003  __asm__ __volatile__ (
1004  // Load the two low bytes of a24 into r16 and r17.
1005  // The third byte is 0x01 which we can deal with implicitly.
1006  "ldi r16,0x41\n"
1007  "ldi r17,0xDB\n"
1008 
1009  // Iterate over the bytes of "x" and multiply each with a24.
1010  "ldi r25,32\n" // 32 bytes in "x"
1011  "clr r22\n" // r22 = 0
1012  "clr r18\n" // r18:r19:r11 = 0 (carry)
1013  "clr r19\n"
1014  "clr r11\n"
1015  "1:\n"
1016  "ld r21,X+\n" // r21 = *x++
1017  "mul r21,r16\n" // r8:r9 = r21 * a24[0]
1018  "movw r8,r0\n"
1019  "mul r21,r17\n" // r9:r1 += r21 * a24[1]
1020  "add r9,r0\n"
1021  "adc r1,r21\n" // r1:r10 += r21 * a24[2] (implicitly 1)
1022  "mov r10,r22\n"
1023  "adc r10,r22\n"
1024  "add r8,r18\n" // r8:r9:r1:r10 += carry
1025  "adc r9,r19\n"
1026  "adc r1,r11\n"
1027  "adc r10,r22\n"
1028  "st Z+,r8\n" // *tt++ = r8
1029  "mov r18,r9\n" // carry = r9:r1:r10
1030  "mov r19,r1\n"
1031  "mov r11,r10\n"
1032  "dec r25\n"
1033  "brne 1b\n"
1034  "st Z,r18\n" // *tt = carry
1035  "std Z+1,r19\n"
1036  "std Z+2,r11\n"
1037 #if BIGNUMBER_LIMB_16BIT || BIGNUMBER_LIMB_32BIT
1038  "std Z+3,r22\n" // Zero pad to a limb boundary
1039 #endif
1040 
1041  // Restore __zero_reg__
1042  "clr __zero_reg__\n"
1043 
1044  : : "x"(x), "z"(temp)
1045  : "r8", "r9", "r10", "r11", "r16", "r17", "r18", "r19",
1046  "r20", "r21", "r22", "r25"
1047  );
1048 #endif
1049 
1050  // Reduce the intermediate result modulo 2^255 - 19.
1051  reduce(result, temp, NUM_A24_LIMBS);
1052  strict_clean(temp);
1053 }
1054 
1066 void Curve25519::mul_P(limb_t *result, const limb_t *x, const limb_t *y)
1067 {
1068  limb_t temp[NUM_LIMBS_512BIT];
1069  uint8_t i, j;
1070  dlimb_t carry;
1071  limb_t word;
1072  const limb_t *xx;
1073  limb_t *tt;
1074 
1075  // Multiply the lowest word of y by x.
1076  carry = 0;
1077  word = pgm_read_limb(&(y[0]));
1078  xx = x;
1079  tt = temp;
1080  for (i = 0; i < NUM_LIMBS_256BIT; ++i) {
1081  carry += ((dlimb_t)(*xx++)) * word;
1082  *tt++ = (limb_t)carry;
1083  carry >>= LIMB_BITS;
1084  }
1085  *tt = (limb_t)carry;
1086 
1087  // Multiply and add the remaining words of y by x.
1088  for (i = 1; i < NUM_LIMBS_256BIT; ++i) {
1089  word = pgm_read_limb(&(y[i]));
1090  carry = 0;
1091  xx = x;
1092  tt = temp + i;
1093  for (j = 0; j < NUM_LIMBS_256BIT; ++j) {
1094  carry += ((dlimb_t)(*xx++)) * word;
1095  carry += *tt;
1096  *tt++ = (limb_t)carry;
1097  carry >>= LIMB_BITS;
1098  }
1099  *tt = (limb_t)carry;
1100  }
1101 
1102  // Reduce the intermediate result modulo 2^255 - 19.
1103  reduce(result, temp, NUM_LIMBS_256BIT);
1104  strict_clean(temp);
1105 }
1106 
/**
 * Adds "x" and "y" limb by limb and then applies reduceQuick() to the sum.
 *
 * result  Receives the NUM_LIMBS_256BIT-limb sum.
 * x       First operand, NUM_LIMBS_256BIT limbs.
 * y       Second operand, NUM_LIMBS_256BIT limbs.
 *
 * Any carry out of the top limb is discarded.
 * NOTE(review): this presumably relies on both inputs being below 2^255
 * so that the sum fits in 256 bits - confirm with the callers.
 */
1117 void Curve25519::add(limb_t *result, const limb_t *x, const limb_t *y)
1118 {
1119 #if !defined(CURVE25519_ASM_AVR)
1120  dlimb_t carry = 0;
1121  uint8_t posn;
1122  limb_t *rr = result;
1123 
1124  // Add the two arrays to obtain the intermediate result.
1125  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1126  carry += *x++;
1127  carry += *y++;
1128  *rr++ = (limb_t)carry;
1129  carry >>= LIMB_BITS;
1130  }
1131 #else // CURVE25519_ASM_AVR
// AVR implementation.  Operands: X = x, Z = y, %2 = result (copied into
// the Y pointer, which is saved and restored around the block).
1132  __asm__ __volatile__ (
1133  // Save Y and copy the "result" pointer into it.
1134  "push r28\n"
1135  "push r29\n"
1136  "mov r28,%A2\n"
1137  "mov r29,%B2\n"
1138 
1139  // Unroll the loop to operate on 4 bytes at a time (8 iterations).
1140  "ldi r24,8\n" // Loop counter.
1141  "clr r25\n" // carry = 0
1142  "1:\n"
1143  "ld r16,X+\n" // r16:r19 = *x++
1144  "ld r17,X+\n"
1145  "ld r18,X+\n"
1146  "ld r19,X+\n"
1147  "ld r20,Z+\n" // r20:r23 = *y++
1148  "ld r21,Z+\n"
1149  "ld r22,Z+\n"
1150  "ld r23,Z+\n"
1151  "add r16,r25\n" // r16:r19:carry += carry
1152  "adc r17,__zero_reg__\n"
1153  "adc r18,__zero_reg__\n"
1154  "adc r19,__zero_reg__\n"
1155  "mov r25,__zero_reg__\n"
1156  "adc r25,r25\n"
1157  "add r16,r20\n" // r16:r19:carry += r20:r23
1158  "adc r17,r21\n"
1159  "adc r18,r22\n"
1160  "adc r19,r23\n"
1161  "adc r25,__zero_reg__\n"
1162  "st Y+,r16\n" // *rr++ = r16:r23
1163  "st Y+,r17\n"
1164  "st Y+,r18\n"
1165  "st Y+,r19\n"
1166  "dec r24\n"
1167  "brne 1b\n"
1168 
1169  // Restore Y.
1170  "pop r29\n"
1171  "pop r28\n"
1172  : : "x"(x), "z"(y), "r"(result)
1173  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1174  "r24", "r25"
1175  );
1176 #endif // CURVE25519_ASM_AVR
1177 
1178  // Reduce the result using the quick trial subtraction method.
1179  reduceQuick(result);
1180 }
1181 
1192 void Curve25519::sub(limb_t *result, const limb_t *x, const limb_t *y)
1193 {
1194 #if !defined(CURVE25519_ASM_AVR)
1195  dlimb_t borrow;
1196  uint8_t posn;
1197  limb_t *rr = result;
1198 
1199  // Subtract y from x to generate the intermediate result.
1200  borrow = 0;
1201  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1202  borrow = ((dlimb_t)(*x++)) - (*y++) - ((borrow >> LIMB_BITS) & 0x01);
1203  *rr++ = (limb_t)borrow;
1204  }
1205 
1206  // If we had a borrow, then the result has gone negative and we
1207  // have to add 2^255 - 19 to the result to make it positive again.
1208  // The top bits of "borrow" will be all 1's if there is a borrow
1209  // or it will be all 0's if there was no borrow. Easiest is to
1210  // conditionally subtract 19 and then mask off the high bit.
1211  rr = result;
1212  borrow = (borrow >> LIMB_BITS) & 19U;
1213  borrow = ((dlimb_t)(*rr)) - borrow;
1214  *rr++ = (limb_t)borrow;
1215  for (posn = 1; posn < NUM_LIMBS_256BIT; ++posn) {
1216  borrow = ((dlimb_t)(*rr)) - ((borrow >> LIMB_BITS) & 0x01);
1217  *rr++ = (limb_t)borrow;
1218  }
1219  *(--rr) &= ((((limb_t)1) << (LIMB_BITS - 1)) - 1);
1220 #else // CURVE25519_ASM_AVR
1221  __asm__ __volatile__ (
1222  // Save Y and copy the "result" pointer into it.
1223  "push r28\n"
1224  "push r29\n"
1225  "mov r28,%A2\n"
1226  "mov r29,%B2\n"
1227 
1228  // Unroll the sub loop to operate on 4 bytes at a time (8 iterations).
1229  "ldi r24,8\n" // Loop counter.
1230  "clr r25\n" // borrow = 0
1231  "1:\n"
1232  "ld r16,X+\n" // r16:r19 = *x++
1233  "ld r17,X+\n"
1234  "ld r18,X+\n"
1235  "ld r19,X+\n"
1236  "ld r20,Z+\n" // r20:r23 = *y++
1237  "ld r21,Z+\n"
1238  "ld r22,Z+\n"
1239  "ld r23,Z+\n"
1240  "sub r16,r25\n" // r16:r19:borrow -= borrow
1241  "sbc r17,__zero_reg__\n"
1242  "sbc r18,__zero_reg__\n"
1243  "sbc r19,__zero_reg__\n"
1244  "mov r25,__zero_reg__\n"
1245  "sbc r25,__zero_reg__\n"
1246  "sub r16,r20\n" // r16:r19:borrow -= r20:r23
1247  "sbc r17,r21\n"
1248  "sbc r18,r22\n"
1249  "sbc r19,r23\n"
1250  "sbc r25,__zero_reg__\n"
1251  "st Y+,r16\n" // *rr++ = r16:r23
1252  "st Y+,r17\n"
1253  "st Y+,r18\n"
1254  "st Y+,r19\n"
1255  "andi r25,1\n" // Only need the bottom bit of the borrow
1256  "dec r24\n"
1257  "brne 1b\n"
1258 
1259  // If there was a borrow, then we need to add 2^255 - 19 back.
1260  // We conditionally subtract 19 and then mask off the high bit.
1261  "neg r25\n" // borrow = mask(borrow) & 19
1262  "andi r25,19\n"
1263  "sbiw r28,32\n" // Point Y back to the start of "result"
1264  "ldi r24,8\n"
1265  "2:\n"
1266  "ld r16,Y\n" // r16:r19 = *rr
1267  "ldd r17,Y+1\n"
1268  "ldd r18,Y+2\n"
1269  "ldd r19,Y+3\n"
1270  "sub r16,r25\n"
1271  "sbc r17,__zero_reg__\n" // r16:r19:borrow -= borrow
1272  "sbc r18,__zero_reg__\n"
1273  "sbc r19,__zero_reg__\n"
1274  "mov r25,__zero_reg__\n"
1275  "sbc r25,__zero_reg__\n"
1276  "andi r25,1\n"
1277  "st Y+,r16\n" // *r++ = r16:r19
1278  "st Y+,r17\n"
1279  "st Y+,r18\n"
1280  "st Y+,r19\n"
1281  "dec r24\n"
1282  "brne 2b\n"
1283  "andi r19,0x7F\n" // Mask off the high bit in the last byte
1284  "sbiw r28,1\n"
1285  "st Y,r19\n"
1286 
1287  // Restore Y.
1288  "pop r29\n"
1289  "pop r28\n"
1290  : : "x"(x), "z"(y), "r"(result)
1291  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1292  "r24", "r25"
1293  );
1294 #endif // CURVE25519_ASM_AVR
1295 }
1296 
1309 void Curve25519::cswap(limb_t select, limb_t *x, limb_t *y)
1310 {
1311 #if !defined(CURVE25519_ASM_AVR)
1312  uint8_t posn;
1313  limb_t dummy;
1314  limb_t sel;
1315 
1316  // Turn "select" into an all-zeroes or all-ones mask. We don't care
1317  // which bit or bits is set in the original "select" value.
1318  sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
1319  --sel;
1320 
1321  // Swap the two values based on "select". Algorithm from:
1322  // https://tools.ietf.org/html/draft-irtf-cfrg-curves-02
1323  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1324  dummy = sel & (x[posn] ^ y[posn]);
1325  x[posn] ^= dummy;
1326  y[posn] ^= dummy;
1327  }
1328 #else // CURVE25519_ASM_AVR
1329  __asm__ __volatile__ (
1330  // Combine all bytes from "select" into one and then turn
1331  // that byte into the "sel" mask in r24.
1332  "clr r24\n"
1333 #if BIGNUMBER_LIMB_8BIT
1334  "sub r24,%2\n"
1335 #elif BIGNUMBER_LIMB_16BIT
1336  "or %A2,%B2\n"
1337  "sub r24,%A2\n"
1338 #elif BIGNUMBER_LIMB_32BIT
1339  "or %A2,%B2\n"
1340  "or %A2,%C2\n"
1341  "or %A2,%D2\n"
1342  "sub r24,%A2\n"
1343 #endif
1344  "mov r24,__zero_reg__\n"
1345  "sbc r24,r24\n"
1346 
1347  // Perform the conditional swap 4 bytes at a time.
1348  "ldi r25,8\n"
1349  "1:\n"
1350  "ld r16,X+\n" // r16:r19 = *x
1351  "ld r17,X+\n"
1352  "ld r18,X+\n"
1353  "ld r19,X\n"
1354  "ld r20,Z\n" // r20:r23 = *y
1355  "ldd r21,Z+1\n"
1356  "ldd r22,Z+2\n"
1357  "ldd r23,Z+3\n"
1358  "mov r12,r16\n" // r12:r15 = (r16:r19 ^ r20:r23) & sel
1359  "mov r13,r17\n"
1360  "mov r14,r18\n"
1361  "mov r15,r19\n"
1362  "eor r12,r20\n"
1363  "eor r13,r21\n"
1364  "eor r14,r22\n"
1365  "eor r15,r23\n"
1366  "and r12,r24\n"
1367  "and r13,r24\n"
1368  "and r14,r24\n"
1369  "and r15,r24\n"
1370  "eor r16,r12\n" // r16:r19 ^= r12:r15
1371  "eor r17,r13\n"
1372  "eor r18,r14\n"
1373  "eor r19,r15\n"
1374  "eor r20,r12\n" // r20:r23 ^= r12:r15
1375  "eor r21,r13\n"
1376  "eor r22,r14\n"
1377  "eor r23,r15\n"
1378  "st X,r19\n" // *x++ = r16:r19
1379  "st -X,r18\n"
1380  "st -X,r17\n"
1381  "st -X,r16\n"
1382  "adiw r26,4\n"
1383  "st Z+,r20\n" // *y++ = r20:r23
1384  "st Z+,r21\n"
1385  "st Z+,r22\n"
1386  "st Z+,r23\n"
1387  "dec r25\n"
1388  "brne 1b\n"
1389 
1390  : : "x"(x), "z"(y), "r"(select)
1391  : "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19",
1392  "r20", "r21", "r22", "r23", "r24", "r25"
1393  );
1394 #endif // CURVE25519_ASM_AVR
1395 }
1396 
1409 void Curve25519::cmove(limb_t select, limb_t *x, const limb_t *y)
1410 {
1411 #if !defined(CURVE25519_ASM_AVR)
1412  uint8_t posn;
1413  limb_t dummy;
1414  limb_t sel;
1415 
1416  // Turn "select" into an all-zeroes or all-ones mask. We don't care
1417  // which bit or bits is set in the original "select" value.
1418  sel = (limb_t)(((((dlimb_t)1) << LIMB_BITS) - select) >> LIMB_BITS);
1419  --sel;
1420 
1421  // Move y into x based on "select". Similar to conditional swap above.
1422  for (posn = 0; posn < NUM_LIMBS_256BIT; ++posn) {
1423  dummy = sel & (x[posn] ^ y[posn]);
1424  x[posn] ^= dummy;
1425  }
1426 #else // CURVE25519_ASM_AVR
1427  __asm__ __volatile__ (
1428  // Combine all bytes from "select" into one and then turn
1429  // that byte into the "sel" mask in r24.
1430  "clr r24\n"
1431 #if BIGNUMBER_LIMB_8BIT
1432  "sub r24,%2\n"
1433 #elif BIGNUMBER_LIMB_16BIT
1434  "or %A2,%B2\n"
1435  "sub r24,%A2\n"
1436 #elif BIGNUMBER_LIMB_32BIT
1437  "or %A2,%B2\n"
1438  "or %A2,%C2\n"
1439  "or %A2,%D2\n"
1440  "sub r24,%A2\n"
1441 #endif
1442  "mov r24,__zero_reg__\n"
1443  "sbc r24,r24\n"
1444 
1445  // Perform the conditional move 4 bytes at a time.
1446  "ldi r25,8\n"
1447  "1:\n"
1448  "ld r16,X+\n" // r16:r19 = *x
1449  "ld r17,X+\n"
1450  "ld r18,X+\n"
1451  "ld r19,X\n"
1452  "ld r20,Z+\n" // r20:r23 = *y++
1453  "ld r21,Z+\n"
1454  "ld r22,Z+\n"
1455  "ld r23,Z+\n"
1456  "eor r20,r16\n" // r20:r23 = (r16:r19 ^ r20:r23) & sel
1457  "eor r21,r17\n"
1458  "eor r22,r18\n"
1459  "eor r23,r19\n"
1460  "and r20,r24\n"
1461  "and r21,r24\n"
1462  "and r22,r24\n"
1463  "and r23,r24\n"
1464  "eor r16,r20\n" // r16:r19 ^= r20:r23
1465  "eor r17,r21\n"
1466  "eor r18,r22\n"
1467  "eor r19,r23\n"
1468  "st X,r19\n" // *x++ = r16:r19
1469  "st -X,r18\n"
1470  "st -X,r17\n"
1471  "st -X,r16\n"
1472  "adiw r26,4\n"
1473  "dec r25\n"
1474  "brne 1b\n"
1475 
1476  : : "x"(x), "z"(y), "r"(select)
1477  : "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
1478  "r24", "r25"
1479  );
1480 #endif // CURVE25519_ASM_AVR
1481 }
1482 
1489 void Curve25519::pow250(limb_t *result, const limb_t *x)
1490 {
1491  limb_t t1[NUM_LIMBS_256BIT];
1492  uint8_t i, j;
1493 
1494  // The big-endian hexadecimal expansion of (2^250 - 1) is:
1495  // 03FFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF
1496  //
1497  // The naive implementation needs to do 2 multiplications per 1 bit and
1498  // 1 multiplication per 0 bit. We can improve upon this by creating a
1499  // pattern 0000000001 ... 0000000001. If we square and multiply the
1500  // pattern by itself we can turn the pattern into the partial results
1501  // 0000000011 ... 0000000011, 0000000111 ... 0000000111, etc.
1502  // This averages out to about 1.1 multiplications per 1 bit instead of 2.
1503 
1504  // Build a pattern of 250 bits in length of repeated copies of 0000000001.
1505  #define RECIP_GROUP_SIZE 10
1506  #define RECIP_GROUP_BITS 250 // Must be a multiple of RECIP_GROUP_SIZE.
1507  square(t1, x);
1508  for (j = 0; j < (RECIP_GROUP_SIZE - 1); ++j)
1509  square(t1, t1);
1510  mul(result, t1, x);
1511  for (i = 0; i < ((RECIP_GROUP_BITS / RECIP_GROUP_SIZE) - 2); ++i) {
1512  for (j = 0; j < RECIP_GROUP_SIZE; ++j)
1513  square(t1, t1);
1514  mul(result, result, t1);
1515  }
1516 
1517  // Multiply bit-shifted versions of the 0000000001 pattern into
1518  // the result to "fill in" the gaps in the pattern.
1519  square(t1, result);
1520  mul(result, result, t1);
1521  for (j = 0; j < (RECIP_GROUP_SIZE - 2); ++j) {
1522  square(t1, t1);
1523  mul(result, result, t1);
1524  }
1525 
1526  // Clean up and exit.
1527  clean(t1);
1528 }
1529 
1537 void Curve25519::recip(limb_t *result, const limb_t *x)
1538 {
1539  // The reciprocal is the same as x ^ (p - 2) where p = 2^255 - 19.
1540  // The big-endian hexadecimal expansion of (p - 2) is:
1541  // 7FFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFEB
1542  // Start with the 250 upper bits of the expansion of (p - 2).
1543  pow250(result, x);
1544 
1545  // Deal with the 5 lowest bits of (p - 2), 01011, from highest to lowest.
1546  square(result, result);
1547  square(result, result);
1548  mul(result, result, x);
1549  square(result, result);
1550  square(result, result);
1551  mul(result, result, x);
1552  square(result, result);
1553  mul(result, result, x);
1554 }
1555 
1571 bool Curve25519::sqrt(limb_t *result, const limb_t *x)
1572 {
1573  // sqrt(-1) mod (2^255 - 19).
1574  static limb_t const numSqrtM1[NUM_LIMBS_256BIT] PROGMEM = {
1575  LIMB(0x4A0EA0B0), LIMB(0xC4EE1B27), LIMB(0xAD2FE478), LIMB(0x2F431806),
1576  LIMB(0x3DFBD7A7), LIMB(0x2B4D0099), LIMB(0x4FC1DF0B), LIMB(0x2B832480)
1577  };
1578  limb_t y[NUM_LIMBS_256BIT];
1579 
1580  // Algorithm from:
1581  // https://tools.ietf.org/id/draft-josefsson-eddsa-ed25519-02.txt
1582 
1583  // Compute a candidate root: result = x^((p + 3) / 8) mod p.
1584  // (p + 3) / 8 = (2^252 - 2) which is 251 one bits followed by a zero:
1585  // 0FFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE
1586  pow250(result, x);
1587  square(result, result);
1588  mul(result, result, x);
1589  square(result, result);
1590 
1591  // Did we get the square root immediately?
1592  square(y, result);
1593  if (memcmp(x, y, sizeof(y)) == 0) {
1594  clean(y);
1595  return true;
1596  }
1597 
1598  // Multiply the result by sqrt(-1) and check again.
1599  mul_P(result, result, numSqrtM1);
1600  square(y, result);
1601  if (memcmp(x, y, sizeof(y)) == 0) {
1602  clean(y);
1603  return true;
1604  }
1605 
1606  // The number does not have a square root.
1607  clean(y);
1608  return false;
1609 }
void rand(uint8_t *data, size_t len)
Generates random bytes into a caller-supplied buffer.
Definition: RNG.cpp:508
static bool eval(uint8_t result[32], const uint8_t s[32], const uint8_t x[32])
Evaluates the raw Curve25519 function.
Definition: Curve25519.cpp:79
static void unpackLE(limb_t *limbs, size_t count, const uint8_t *bytes, size_t len)
Unpacks the little-endian byte representation of a big number into a limb array.
static void packLE(uint8_t *bytes, size_t len, const limb_t *limbs, size_t count)
Packs the little-endian byte representation of a big number into a byte array.
static void dh1(uint8_t k[32], uint8_t f[32])
Performs phase 1 of a Diffie-Hellman key exchange using Curve25519.
Definition: Curve25519.cpp:244
static bool dh2(uint8_t k[32], uint8_t f[32])
Performs phase 2 of a Diffie-Hellman key exchange using Curve25519.
Definition: Curve25519.cpp:282