66 static void fsum(limb *output,
const limb *in) {
68 for (i = 0; i < 10; i += 2) {
69 output[0+i] = output[0+i] + in[0+i];
70 output[1+i] = output[1+i] + in[1+i];
76 static void fdifference(limb *output,
const limb *in) {
78 for (i = 0; i < 10; ++i) {
79 output[i] = in[i] - output[i];
84 static void fscalar_product(limb *output,
const limb *in,
const limb scalar) {
86 for (i = 0; i < 10; ++i) {
87 output[i] = in[i] * scalar;
97 static void fproduct(limb *output,
const limb *in2,
const limb *in) {
98 output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
99 output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
100 ((limb) ((s32) in2[1])) * ((s32) in[0]);
101 output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
102 ((limb) ((s32) in2[0])) * ((s32) in[2]) +
103 ((limb) ((s32) in2[2])) * ((s32) in[0]);
104 output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
105 ((limb) ((s32) in2[2])) * ((s32) in[1]) +
106 ((limb) ((s32) in2[0])) * ((s32) in[3]) +
107 ((limb) ((s32) in2[3])) * ((s32) in[0]);
108 output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
109 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
110 ((limb) ((s32) in2[3])) * ((s32) in[1])) +
111 ((limb) ((s32) in2[0])) * ((s32) in[4]) +
112 ((limb) ((s32) in2[4])) * ((s32) in[0]);
113 output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
114 ((limb) ((s32) in2[3])) * ((s32) in[2]) +
115 ((limb) ((s32) in2[1])) * ((s32) in[4]) +
116 ((limb) ((s32) in2[4])) * ((s32) in[1]) +
117 ((limb) ((s32) in2[0])) * ((s32) in[5]) +
118 ((limb) ((s32) in2[5])) * ((s32) in[0]);
119 output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
120 ((limb) ((s32) in2[1])) * ((s32) in[5]) +
121 ((limb) ((s32) in2[5])) * ((s32) in[1])) +
122 ((limb) ((s32) in2[2])) * ((s32) in[4]) +
123 ((limb) ((s32) in2[4])) * ((s32) in[2]) +
124 ((limb) ((s32) in2[0])) * ((s32) in[6]) +
125 ((limb) ((s32) in2[6])) * ((s32) in[0]);
126 output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
127 ((limb) ((s32) in2[4])) * ((s32) in[3]) +
128 ((limb) ((s32) in2[2])) * ((s32) in[5]) +
129 ((limb) ((s32) in2[5])) * ((s32) in[2]) +
130 ((limb) ((s32) in2[1])) * ((s32) in[6]) +
131 ((limb) ((s32) in2[6])) * ((s32) in[1]) +
132 ((limb) ((s32) in2[0])) * ((s32) in[7]) +
133 ((limb) ((s32) in2[7])) * ((s32) in[0]);
134 output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
135 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
136 ((limb) ((s32) in2[5])) * ((s32) in[3]) +
137 ((limb) ((s32) in2[1])) * ((s32) in[7]) +
138 ((limb) ((s32) in2[7])) * ((s32) in[1])) +
139 ((limb) ((s32) in2[2])) * ((s32) in[6]) +
140 ((limb) ((s32) in2[6])) * ((s32) in[2]) +
141 ((limb) ((s32) in2[0])) * ((s32) in[8]) +
142 ((limb) ((s32) in2[8])) * ((s32) in[0]);
143 output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
144 ((limb) ((s32) in2[5])) * ((s32) in[4]) +
145 ((limb) ((s32) in2[3])) * ((s32) in[6]) +
146 ((limb) ((s32) in2[6])) * ((s32) in[3]) +
147 ((limb) ((s32) in2[2])) * ((s32) in[7]) +
148 ((limb) ((s32) in2[7])) * ((s32) in[2]) +
149 ((limb) ((s32) in2[1])) * ((s32) in[8]) +
150 ((limb) ((s32) in2[8])) * ((s32) in[1]) +
151 ((limb) ((s32) in2[0])) * ((s32) in[9]) +
152 ((limb) ((s32) in2[9])) * ((s32) in[0]);
153 output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
154 ((limb) ((s32) in2[3])) * ((s32) in[7]) +
155 ((limb) ((s32) in2[7])) * ((s32) in[3]) +
156 ((limb) ((s32) in2[1])) * ((s32) in[9]) +
157 ((limb) ((s32) in2[9])) * ((s32) in[1])) +
158 ((limb) ((s32) in2[4])) * ((s32) in[6]) +
159 ((limb) ((s32) in2[6])) * ((s32) in[4]) +
160 ((limb) ((s32) in2[2])) * ((s32) in[8]) +
161 ((limb) ((s32) in2[8])) * ((s32) in[2]);
162 output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
163 ((limb) ((s32) in2[6])) * ((s32) in[5]) +
164 ((limb) ((s32) in2[4])) * ((s32) in[7]) +
165 ((limb) ((s32) in2[7])) * ((s32) in[4]) +
166 ((limb) ((s32) in2[3])) * ((s32) in[8]) +
167 ((limb) ((s32) in2[8])) * ((s32) in[3]) +
168 ((limb) ((s32) in2[2])) * ((s32) in[9]) +
169 ((limb) ((s32) in2[9])) * ((s32) in[2]);
170 output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
171 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
172 ((limb) ((s32) in2[7])) * ((s32) in[5]) +
173 ((limb) ((s32) in2[3])) * ((s32) in[9]) +
174 ((limb) ((s32) in2[9])) * ((s32) in[3])) +
175 ((limb) ((s32) in2[4])) * ((s32) in[8]) +
176 ((limb) ((s32) in2[8])) * ((s32) in[4]);
177 output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
178 ((limb) ((s32) in2[7])) * ((s32) in[6]) +
179 ((limb) ((s32) in2[5])) * ((s32) in[8]) +
180 ((limb) ((s32) in2[8])) * ((s32) in[5]) +
181 ((limb) ((s32) in2[4])) * ((s32) in[9]) +
182 ((limb) ((s32) in2[9])) * ((s32) in[4]);
183 output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
184 ((limb) ((s32) in2[5])) * ((s32) in[9]) +
185 ((limb) ((s32) in2[9])) * ((s32) in[5])) +
186 ((limb) ((s32) in2[6])) * ((s32) in[8]) +
187 ((limb) ((s32) in2[8])) * ((s32) in[6]);
188 output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
189 ((limb) ((s32) in2[8])) * ((s32) in[7]) +
190 ((limb) ((s32) in2[6])) * ((s32) in[9]) +
191 ((limb) ((s32) in2[9])) * ((s32) in[6]);
192 output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
193 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
194 ((limb) ((s32) in2[9])) * ((s32) in[7]));
195 output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
196 ((limb) ((s32) in2[9])) * ((s32) in[8]);
197 output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
204 static void freduce_degree(limb *output) {
209 output[8] += output[18] << 4;
210 output[8] += output[18] << 1;
211 output[8] += output[18];
212 output[7] += output[17] << 4;
213 output[7] += output[17] << 1;
214 output[7] += output[17];
215 output[6] += output[16] << 4;
216 output[6] += output[16] << 1;
217 output[6] += output[16];
218 output[5] += output[15] << 4;
219 output[5] += output[15] << 1;
220 output[5] += output[15];
221 output[4] += output[14] << 4;
222 output[4] += output[14] << 1;
223 output[4] += output[14];
224 output[3] += output[13] << 4;
225 output[3] += output[13] << 1;
226 output[3] += output[13];
227 output[2] += output[12] << 4;
228 output[2] += output[12] << 1;
229 output[2] += output[12];
230 output[1] += output[11] << 4;
231 output[1] += output[11] << 1;
232 output[1] += output[11];
233 output[0] += output[10] << 4;
234 output[0] += output[10] << 1;
235 output[0] += output[10];
239 #error "This code only works on a two's complement system"
246 div_by_2_26(
const limb v)
249 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
251 const int32_t sign = ((int32_t) highword) >> 31;
253 const int32_t roundoff = ((uint32_t) sign) >> 6;
255 return (v + roundoff) >> 26;
262 div_by_2_25(
const limb v)
265 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
267 const int32_t sign = ((int32_t) highword) >> 31;
269 const int32_t roundoff = ((uint32_t) sign) >> 7;
271 return (v + roundoff) >> 25;
279 div_s32_by_2_25(
const s32 v)
281 const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
282 return (v + roundoff) >> 25;
289 static void freduce_coefficients(limb *output) {
294 for (i = 0; i < 10; i += 2) {
295 limb over = div_by_2_26(output[i]);
300 output[i] -= over << 26;
309 over = div_by_2_25(output[i+1]);
310 output[i+1] -= over << 25;
314 output[0] += output[10] << 4;
315 output[0] += output[10] << 1;
316 output[0] += output[10];
323 limb over = div_by_2_26(output[0]);
324 output[0] -= over << 26;
339 fmul(limb *output,
const limb *in,
const limb *in2) {
341 fproduct(t, in, in2);
344 freduce_coefficients(t);
346 memcpy(output, t,
sizeof(limb) * 10);
355 static void fsquare_inner(limb *output,
const limb *in) {
356 output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
357 output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
358 output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
359 ((limb) ((s32) in[0])) * ((s32) in[2]));
360 output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
361 ((limb) ((s32) in[0])) * ((s32) in[3]));
362 output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
363 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
364 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
365 output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
366 ((limb) ((s32) in[1])) * ((s32) in[4]) +
367 ((limb) ((s32) in[0])) * ((s32) in[5]));
368 output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
369 ((limb) ((s32) in[2])) * ((s32) in[4]) +
370 ((limb) ((s32) in[0])) * ((s32) in[6]) +
371 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
372 output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
373 ((limb) ((s32) in[2])) * ((s32) in[5]) +
374 ((limb) ((s32) in[1])) * ((s32) in[6]) +
375 ((limb) ((s32) in[0])) * ((s32) in[7]));
376 output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
377 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
378 ((limb) ((s32) in[0])) * ((s32) in[8]) +
379 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
380 ((limb) ((s32) in[3])) * ((s32) in[5])));
381 output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
382 ((limb) ((s32) in[3])) * ((s32) in[6]) +
383 ((limb) ((s32) in[2])) * ((s32) in[7]) +
384 ((limb) ((s32) in[1])) * ((s32) in[8]) +
385 ((limb) ((s32) in[0])) * ((s32) in[9]));
386 output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
387 ((limb) ((s32) in[4])) * ((s32) in[6]) +
388 ((limb) ((s32) in[2])) * ((s32) in[8]) +
389 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
390 ((limb) ((s32) in[1])) * ((s32) in[9])));
391 output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
392 ((limb) ((s32) in[4])) * ((s32) in[7]) +
393 ((limb) ((s32) in[3])) * ((s32) in[8]) +
394 ((limb) ((s32) in[2])) * ((s32) in[9]));
395 output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
396 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
397 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
398 ((limb) ((s32) in[3])) * ((s32) in[9])));
399 output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
400 ((limb) ((s32) in[5])) * ((s32) in[8]) +
401 ((limb) ((s32) in[4])) * ((s32) in[9]));
402 output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
403 ((limb) ((s32) in[6])) * ((s32) in[8]) +
404 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
405 output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
406 ((limb) ((s32) in[6])) * ((s32) in[9]));
407 output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
408 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
409 output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
410 output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
421 fsquare(limb *output,
const limb *in) {
423 fsquare_inner(t, in);
428 freduce_coefficients(t);
430 memcpy(output, t,
sizeof(limb) * 10);
435 fexpand(limb *output,
const u8 *input) {
436 #define F(n,start,shift,mask) \
437 output[n] = ((((limb) input[start + 0]) | \
438 ((limb) input[start + 1]) << 8 | \
439 ((limb) input[start + 2]) << 16 | \
440 ((limb) input[start + 3]) << 24) >> shift) & mask;
441 F(0, 0, 0, 0x3ffffff);
442 F(1, 3, 2, 0x1ffffff);
443 F(2, 6, 3, 0x3ffffff);
444 F(3, 9, 5, 0x1ffffff);
445 F(4, 12, 6, 0x3ffffff);
446 F(5, 16, 0, 0x1ffffff);
447 F(6, 19, 1, 0x3ffffff);
448 F(7, 22, 3, 0x1ffffff);
449 F(8, 25, 4, 0x3ffffff);
450 F(9, 28, 6, 0x1ffffff);
454 #if (-32 >> 1) != -16
455 #error "This code only works when >> does sign-extension on negative numbers"
459 static s32 s32_eq(s32 a, s32 b) {
471 static s32 s32_gte(s32 a, s32 b) {
482 fcontract(u8 *output, limb *input_limbs) {
488 for (i = 0; i < 10; i++) {
489 input[i] = (s32) input_limbs[i];
492 for (j = 0; j < 2; ++j) {
493 for (i = 0; i < 9; ++i) {
497 const s32 mask = input[i] >> 31;
498 const s32 carry = -((input[i] & mask) >> 25);
499 input[i] = input[i] + (carry << 25);
500 input[i+1] = input[i+1] - carry;
502 const s32 mask = input[i] >> 31;
503 const s32 carry = -((input[i] & mask) >> 26);
504 input[i] = input[i] + (carry << 26);
505 input[i+1] = input[i+1] - carry;
512 const s32 mask = input[9] >> 31;
513 const s32 carry = -((input[9] & mask) >> 25);
514 input[9] = input[9] + (carry << 25);
515 input[0] = input[0] - (carry * 19);
536 const s32 mask = input[0] >> 31;
537 const s32 carry = -((input[0] & mask) >> 26);
538 input[0] = input[0] + (carry << 26);
539 input[1] = input[1] - carry;
544 for (j = 0; j < 2; j++) {
545 for (i = 0; i < 9; i++) {
547 const s32 carry = input[i] >> 25;
548 input[i] &= 0x1ffffff;
551 const s32 carry = input[i] >> 26;
552 input[i] &= 0x3ffffff;
558 const s32 carry = input[9] >> 25;
559 input[9] &= 0x1ffffff;
560 input[0] += 19*carry;
574 s32 mask = s32_gte(input[0], 0x3ffffed);
575 for (i = 1; i < 10; i++) {
577 mask &= s32_eq(input[i], 0x1ffffff);
579 mask &= s32_eq(input[i], 0x3ffffff);
585 input[0] -= mask & 0x3ffffed;
587 for (i = 1; i < 10; i++) {
589 input[i] -= mask & 0x1ffffff;
591 input[i] -= mask & 0x3ffffff;
604 output[s+0] |= input[i] & 0xff; \
605 output[s+1] = (input[i] >> 8) & 0xff; \
606 output[s+2] = (input[i] >> 16) & 0xff; \
607 output[s+3] = (input[i] >> 24) & 0xff;
634 static void fmonty(limb *x2, limb *z2,
637 limb *xprime, limb *zprime,
639 limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
640 zzprime[19], zzzprime[19], xxxprime[19];
642 memcpy(origx, x, 10 *
sizeof(limb));
645 fdifference(z, origx);
648 memcpy(origxprime, xprime,
sizeof(limb) * 10);
649 fsum(xprime, zprime);
651 fdifference(zprime, origxprime);
653 fproduct(xxprime, xprime, z);
657 fproduct(zzprime, x, zprime);
659 freduce_degree(xxprime);
660 freduce_coefficients(xxprime);
662 freduce_degree(zzprime);
663 freduce_coefficients(zzprime);
665 memcpy(origxprime, xxprime,
sizeof(limb) * 10);
666 fsum(xxprime, zzprime);
668 fdifference(zzprime, origxprime);
670 fsquare(xxxprime, xxprime);
672 fsquare(zzzprime, zzprime);
674 fproduct(zzprime, zzzprime, qmqp);
676 freduce_degree(zzprime);
677 freduce_coefficients(zzprime);
679 memcpy(x3, xxxprime,
sizeof(limb) * 10);
680 memcpy(z3, zzprime,
sizeof(limb) * 10);
686 fproduct(x2, xx, zz);
689 freduce_coefficients(x2);
693 memset(zzz + 10, 0,
sizeof(limb) * 9);
694 fscalar_product(zzz, zz, 121665);
698 freduce_coefficients(zzz);
702 fproduct(z2, zz, zzz);
705 freduce_coefficients(z2);
719 swap_conditional(limb a[19], limb b[19], limb iswap) {
721 const s32 swap = (s32) -iswap;
723 for (i = 0; i < 10; ++i) {
724 const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
725 a[i] = ((s32)a[i]) ^ x;
726 b[i] = ((s32)b[i]) ^ x;
736 cmult(limb *resultx, limb *resultz,
const u8 *n,
const limb *q) {
737 limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
738 limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
739 limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
740 limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
744 memcpy(nqpqx, q,
sizeof(limb) * 10);
746 for (i = 0; i < 32; ++i) {
748 for (j = 0; j < 8; ++j) {
749 const limb bit =
byte >> 7;
751 swap_conditional(nqx, nqpqx, bit);
752 swap_conditional(nqz, nqpqz, bit);
758 swap_conditional(nqx2, nqpqx2, bit);
759 swap_conditional(nqz2, nqpqz2, bit);
778 memcpy(resultx, nqx,
sizeof(limb) * 10);
779 memcpy(resultz, nqz,
sizeof(limb) * 10);
786 crecip(limb *out,
const limb *z) {
812 fmul(z2_10_0,t0,z2_5_0);
816 for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
817 fmul(z2_20_0,t1,z2_10_0);
821 for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
826 for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
827 fmul(z2_50_0,t0,z2_10_0);
831 for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
832 fmul(z2_100_0,t1,z2_50_0);
834 fsquare(t1,z2_100_0);
836 for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
837 fmul(t1,t0,z2_100_0);
841 for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
852 int curve25519_donna(u8 *mypublic,
const u8 *secret,
const u8 *basepoint);
855 curve25519_donna(u8 *mypublic,
const u8 *secret,
const u8 *basepoint) {
856 limb bp[10], x[10], z[11], zmone[10];
860 for (i = 0; i < 32; ++i) e[i] = secret[i];
865 fexpand(bp, basepoint);
869 fcontract(mypublic, z);
Integer definitions used throughout Tor.