blob: e93065224a8ae07e3ab2ce600ca42f24b65bf445 [file] [log] [blame]
Alexandre Lision7fd5d3d2013-12-04 13:06:40 -05001/*
2 * lbn64.c - Low-level bignum routines, 64-bit version.
3 *
4 * Copyright (c) 1995 Colin Plumb. All rights reserved.
5 * For licensing and other legal details, see the file legal.c.
6 *
7 * NOTE: the magic constants "64" and "128" appear in many places in this
8 * file, including inside identifiers. Because it is not possible to
9 * ask "#ifdef" of a macro expansion, it is not possible to use the
10 * preprocessor to conditionalize these properly. Thus, this file is
11 * intended to be edited with textual search and replace to produce
12 * alternate word size versions. Any reference to the number of bits
13 * in a word must be the string "64", and that string must not appear
14 * otherwise. Any reference to twice this number must appear as "128",
15 * which likewise must not appear otherwise. Is that clear?
16 *
17 * Remember, when doubling the bit size replace the larger number (128)
18 * first, then the smaller (64). When halving the bit size, do the
19 * opposite. Otherwise, things will get wierd. Also, be sure to replace
20 * every instance that appears. (:%s/foo/bar/g in vi)
21 *
22 * These routines work with a pointer to the least-significant end of
23 * an array of WORD64s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
24 * defined in lbn.h (which expand to x on a big-edian machine and y on a
25 * little-endian machine) are used to conditionalize the code to work
26 * either way. If you have no assembly primitives, it doesn't matter.
27 * Note that on a big-endian machine, the least-significant-end pointer
28 * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
29 * On little-endian, they are ptr[0] through ptr[len-1]. This makes
30 * perfect sense if you consider pointers to point *between* bytes rather
31 * than at them.
32 *
33 * Because the array index values are unsigned integers, ptr[-i]
34 * may not work properly, since the index -i is evaluated as an unsigned,
35 * and if pointers are wider, zero-extension will produce a positive
36 * number rahter than the needed negative. The expression used in this
37 * code, *(ptr-i) will, however, work. (The array syntax is equivalent
38 * to *(ptr+-i), which is a pretty subtle difference.)
39 *
40 * Many of these routines will get very unhappy if fed zero-length inputs.
41 * They use assert() to enforce this. An higher layer of code must make
42 * sure that these aren't called with zero-length inputs.
43 *
44 * Any of these routines can be replaced with more efficient versions
45 * elsewhere, by just #defining their names. If one of the names
46 * is #defined, the C code is not compiled in and no declaration is
47 * made. Use the BNINCLUDE file to do that. Typically, you compile
48 * asm subroutines with the same name and just, e.g.
49 * #define lbnMulAdd1_64 lbnMulAdd1_64
50 *
51 * If you want to write asm routines, start with lbnMulAdd1_64().
52 * This is the workhorse of modular exponentiation. lbnMulN1_64() is
53 * also used a fair bit, although not as much and it's defined in terms
54 * of lbnMulAdd1_64 if that has a custom version. lbnMulSub1_64 and
55 * lbnDiv21_64 are used in the usual division and remainder finding.
56 * (Not the Montgomery reduction used in modular exponentiation, though.)
57 * Once you have lbnMulAdd1_64 defined, writing the other two should
58 * be pretty easy. (Just make sure you get the sign of the subtraction
59 * in lbnMulSub1_64 right - it's dest = dest - source * k.)
60 *
61 * The only definitions that absolutely need a double-word (BNWORD128)
62 * type are lbnMulAdd1_64 and lbnMulSub1_64; if those are provided,
63 * the rest follows. lbnDiv21_64, however, is a lot slower unless you
64 * have them, and lbnModQ_64 takes after it. That one is used quite a
65 * bit for prime sieving.
66 */
67
68#ifndef HAVE_CONFIG_H
69#define HAVE_CONFIG_H 0
70#endif
71#if HAVE_CONFIG_H
72#include <bnconfig.h>
73#endif
74
75/*
76 * Some compilers complain about #if FOO if FOO isn't defined,
77 * so do the ANSI-mandated thing explicitly...
78 */
79#ifndef NO_ASSERT_H
80#define NO_ASSERT_H 0
81#endif
82#ifndef NO_STRING_H
83#define NO_STRING_H 0
84#endif
85#ifndef HAVE_STRINGS_H
86#define HAVE_STRINGS_H 0
87#endif
88#ifndef NEED_MEMORY_H
89#define NEED_MEMORY_H 0
90#endif
91
92#if !NO_ASSERT_H
93#include <assert.h>
94#else
95#define assert(x) (void)0
96#endif
97
98#if !NO_STRING_H
99#include <string.h> /* For memcpy */
100#elif HAVE_STRINGS_H
101#include <strings.h>
102#endif
103#if NEED_MEMORY_H
104#include <memory.h>
105#endif
106
107#include "lbn.h"
108#include "lbn64.h"
109#include "lbnmem.h"
110
111#include "kludge.h"
112
113#ifndef BNWORD64
114#error 64-bit bignum library requires a 64-bit data type
115#endif
116
117/* If this is defined, include bnYield() calls */
118#if BNYIELD
119extern int (*bnYield)(void); /* From bn.c */
120#endif
121
122/*
123 * Most of the multiply (and Montgomery reduce) routines use an outer
124 * loop that iterates over one of the operands - a so-called operand
125 * scanning approach. One big advantage of this is that the assembly
126 * support routines are simpler. The loops can be rearranged to have
127 * an outer loop that iterates over the product, a so-called product
128 * scanning approach. This has the advantage of writing less data
129 * and doing fewer adds to memory, so is supposedly faster. Some
130 * code has been written using a product-scanning approach, but
131 * it appears to be slower, so it is turned off by default. Some
132 * experimentation would be appreciated.
133 *
134 * (The code is also annoying to get right and not very well commented,
135 * one of my pet peeves about math libraries. I'm sorry.)
136 */
137#ifndef PRODUCT_SCAN
138#define PRODUCT_SCAN 0
139#endif
140
141/*
142 * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
143 * This is a good example of how the byte offsets and BIGLITTLE() macros work.
144 * Another alternative would have been
145 * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD64)), but I find that
146 * putting operators into conditional macros is confusing.
147 */
148#ifndef lbnCopy_64
149void
150lbnCopy_64(BNWORD64 *dest, BNWORD64 const *src, unsigned len)
151{
152 memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
153 len * sizeof(*src));
154}
155#endif /* !lbnCopy_64 */
156
157/*
158 * Fill n words with zero. This does it manually rather than calling
159 * memset because it can assume alignment to make things faster while
160 * memset can't. Note how big-endian numbers are naturally addressed
161 * using predecrement, while little-endian is postincrement.
162 */
163#ifndef lbnZero_64
164void
165lbnZero_64(BNWORD64 *num, unsigned len)
166{
167 while (len--)
168 BIGLITTLE(*--num,*num++) = 0;
169}
170#endif /* !lbnZero_64 */
171
172/*
173 * Negate an array of words.
174 * Negation is subtraction from zero. Negating low-order words
175 * entails doing nothing until a non-zero word is hit. Once that
176 * is negated, a borrow is generated and never dies until the end
177 * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
178 * Repeat that until the end of the number.
179 *
180 * Doesn't return borrow out because that's pretty useless - it's
181 * always set unless the input is 0, which is easy to notice in
182 * normalized form.
183 */
184#ifndef lbnNeg_64
185void
186lbnNeg_64(BNWORD64 *num, unsigned len)
187{
188 assert(len);
189
190 /* Skip low-order zero words */
191 while (BIGLITTLE(*--num,*num) == 0) {
192 if (!--len)
193 return;
194 LITTLE(num++;)
195 }
196 /* Negate the lowest-order non-zero word */
197 *num = -*num;
198 /* Complement all the higher-order words */
199 while (--len) {
200 BIGLITTLE(--num,++num);
201 *num = ~*num;
202 }
203}
204#endif /* !lbnNeg_64 */
205
206
207/*
208 * lbnAdd1_64: add the single-word "carry" to the given number.
209 * Used for minor increments and propagating the carry after
210 * adding in a shorter bignum.
211 *
212 * Technique: If we have a double-width word, presumably the compiler
213 * can add using its carry in inline code, so we just use a larger
214 * accumulator to compute the carry from the first addition.
215 * If not, it's more complex. After adding the first carry, which may
216 * be > 1, compare the sum and the carry. If the sum wraps (causing a
217 * carry out from the addition), the result will be less than each of the
218 * inputs, since the wrap subtracts a number (2^64) which is larger than
219 * the other input can possibly be. If the sum is >= the carry input,
220 * return success immediately.
221 * In either case, if there is a carry, enter a loop incrementing words
222 * until one does not wrap. Since we are adding 1 each time, the wrap
223 * will be to 0 and we can test for equality.
224 */
225#ifndef lbnAdd1_64 /* If defined, it's provided as an asm subroutine */
226#ifdef BNWORD128
227BNWORD64
228lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
229{
230 BNWORD128 t;
231 assert(len > 0); /* Alternative: if (!len) return carry */
232
233 t = (BNWORD128)BIGLITTLE(*--num,*num) + carry;
234 BIGLITTLE(*num,*num++) = (BNWORD64)t;
235 if ((t >> 64) == 0)
236 return 0;
237 while (--len) {
238 if (++BIGLITTLE(*--num,*num++) != 0)
239 return 0;
240 }
241 return 1;
242}
243#else /* no BNWORD128 */
244BNWORD64
245lbnAdd1_64(BNWORD64 *num, unsigned len, BNWORD64 carry)
246{
247 assert(len > 0); /* Alternative: if (!len) return carry */
248
249 if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
250 return 0;
251 while (--len) {
252 if (++BIGLITTLE(*--num,*num++) != 0)
253 return 0;
254 }
255 return 1;
256}
257#endif
258#endif/* !lbnAdd1_64 */
259
260/*
261 * lbnSub1_64: subtract the single-word "borrow" from the given number.
262 * Used for minor decrements and propagating the borrow after
263 * subtracting a shorter bignum.
264 *
265 * Technique: Similar to the add, above. If there is a double-length type,
266 * use that to generate the first borrow.
267 * If not, after subtracting the first borrow, which may be > 1, compare
268 * the difference and the *negative* of the carry. If the subtract wraps
269 * (causing a borrow out from the subtraction), the result will be at least
270 * as large as -borrow. If the result < -borrow, then no borrow out has
271 * appeared and we may return immediately, except when borrow == 0. To
272 * deal with that case, use the identity that -x = ~x+1, and instead of
273 * comparing < -borrow, compare for <= ~borrow.
274 * Either way, if there is a borrow out, enter a loop decrementing words
275 * until a non-zero word is reached.
276 *
277 * Note the cast of ~borrow to (BNWORD64). If the size of an int is larger
278 * than BNWORD64, C rules say the number is expanded for the arithmetic, so
279 * the inversion will be done on an int and the value won't be quite what
280 * is expected.
281 */
282#ifndef lbnSub1_64 /* If defined, it's provided as an asm subroutine */
283#ifdef BNWORD128
284BNWORD64
285lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
286{
287 BNWORD128 t;
288 assert(len > 0); /* Alternative: if (!len) return borrow */
289
290 t = (BNWORD128)BIGLITTLE(*--num,*num) - borrow;
291 BIGLITTLE(*num,*num++) = (BNWORD64)t;
292 if ((t >> 64) == 0)
293 return 0;
294 while (--len) {
295 if ((BIGLITTLE(*--num,*num++))-- != 0)
296 return 0;
297 }
298 return 1;
299}
300#else /* no BNWORD128 */
301BNWORD64
302lbnSub1_64(BNWORD64 *num, unsigned len, BNWORD64 borrow)
303{
304 assert(len > 0); /* Alternative: if (!len) return borrow */
305
306 if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD64)~borrow)
307 return 0;
308 while (--len) {
309 if ((BIGLITTLE(*--num,*num++))-- != 0)
310 return 0;
311 }
312 return 1;
313}
314#endif
315#endif /* !lbnSub1_64 */
316
317/*
318 * lbnAddN_64: add two bignums of the same length, returning the carry (0 or 1).
319 * One of the building blocks, along with lbnAdd1, of adding two bignums of
320 * differing lengths.
321 *
322 * Technique: Maintain a word of carry. If there is no double-width type,
323 * use the same technique as in lbnAdd1, above, to maintain the carry by
324 * comparing the inputs. Adding the carry sources is used as an OR operator;
325 * at most one of the two comparisons can possibly be true. The first can
326 * only be true if carry == 1 and x, the result, is 0. In that case the
327 * second can't possibly be true.
328 */
329#ifndef lbnAddN_64
330#ifdef BNWORD128
331BNWORD64
332lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
333{
334 BNWORD128 t;
335
336 assert(len > 0);
337
338 t = (BNWORD128)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
339 BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
340 while (--len) {
341 t = (BNWORD128)BIGLITTLE(*--num1,*num1) +
342 (BNWORD128)BIGLITTLE(*--num2,*num2++) + (t >> 64);
343 BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
344 }
345
346 return (BNWORD64)(t>>64);
347}
348#else /* no BNWORD128 */
349BNWORD64
350lbnAddN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
351{
352 BNWORD64 x, carry = 0;
353
354 assert(len > 0); /* Alternative: change loop to test at start */
355
356 do {
357 x = BIGLITTLE(*--num2,*num2++);
358 carry = (x += carry) < carry;
359 carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
360 } while (--len);
361
362 return carry;
363}
364#endif
365#endif /* !lbnAddN_64 */
366
367/*
368 * lbnSubN_64: add two bignums of the same length, returning the carry (0 or 1).
369 * One of the building blocks, along with subn1, of subtracting two bignums of
370 * differing lengths.
371 *
372 * Technique: If no double-width type is availble, maintain a word of borrow.
373 * First, add the borrow to the subtrahend (did you have to learn all those
374 * awful words in elementary school, too?), and if it overflows, set the
375 * borrow again. Then subtract the modified subtrahend from the next word
376 * of input, using the same technique as in subn1, above.
377 * Adding the borrows is used as an OR operator; at most one of the two
378 * comparisons can possibly be true. The first can only be true if
379 * borrow == 1 and x, the result, is 0. In that case the second can't
380 * possibly be true.
381 *
382 * In the double-word case, (BNWORD64)-(t>>64) is subtracted, rather than
383 * adding t>>64, because the shift would need to sign-extend and that's
384 * not guaranteed to happen in ANSI C, even with signed types.
385 */
386#ifndef lbnSubN_64
387#ifdef BNWORD128
388BNWORD64
389lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
390{
391 BNWORD128 t;
392
393 assert(len > 0);
394
395 t = (BNWORD128)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
396 BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
397
398 while (--len) {
399 t = (BNWORD128)BIGLITTLE(*--num1,*num1) -
400 (BNWORD128)BIGLITTLE(*--num2,*num2++) - (BNWORD64)-(t >> 64);
401 BIGLITTLE(*num1,*num1++) = (BNWORD64)t;
402 }
403
404 return -(BNWORD64)(t>>64);
405}
406#else
407BNWORD64
408lbnSubN_64(BNWORD64 *num1, BNWORD64 const *num2, unsigned len)
409{
410 BNWORD64 x, borrow = 0;
411
412 assert(len > 0); /* Alternative: change loop to test at start */
413
414 do {
415 x = BIGLITTLE(*--num2,*num2++);
416 borrow = (x += borrow) < borrow;
417 borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD64)~x;
418 } while (--len);
419
420 return borrow;
421}
422#endif
423#endif /* !lbnSubN_64 */
424
425#ifndef lbnCmp_64
426/*
427 * lbnCmp_64: compare two bignums of equal length, returning the sign of
428 * num1 - num2. (-1, 0 or +1).
429 *
430 * Technique: Change the little-endian pointers to big-endian pointers
431 * and compare from the most-significant end until a difference if found.
432 * When it is, figure out the sign of the difference and return it.
433 */
434int
435lbnCmp_64(BNWORD64 const *num1, BNWORD64 const *num2, unsigned len)
436{
437 BIGLITTLE(num1 -= len, num1 += len);
438 BIGLITTLE(num2 -= len, num2 += len);
439
440 while (len--) {
441 if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
442 if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
443 return -1;
444 else
445 return 1;
446 }
447 }
448 return 0;
449}
450#endif /* !lbnCmp_64 */
451
452/*
453 * mul64_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
454 * computes (ph,pl) = x * y + a + b. mul64_ppmma and mul64_ppmm
455 * are simpler versions. If you want to be lazy, all of these
456 * can be defined in terms of the others, so here we create any
457 * that have not been defined in terms of the ones that have been.
458 */
459
460/* Define ones with fewer a's in terms of ones with more a's */
461#if !defined(mul64_ppmma) && defined(mul64_ppmmaa)
462#define mul64_ppmma(ph,pl,x,y,a) mul64_ppmmaa(ph,pl,x,y,a,0)
463#endif
464
465#if !defined(mul64_ppmm) && defined(mul64_ppmma)
466#define mul64_ppmm(ph,pl,x,y) mul64_ppmma(ph,pl,x,y,0)
467#endif
468
469/*
470 * Use this definition to test the mul64_ppmm-based operations on machines
471 * that do not provide mul64_ppmm. Change the final "0" to a "1" to
472 * enable it.
473 */
474#if !defined(mul64_ppmm) && defined(BNWORD128) && 0 /* Debugging */
475#define mul64_ppmm(ph,pl,x,y) \
476 ({BNWORD128 _ = (BNWORD128)(x)*(y); (pl) = _; (ph) = _>>64;})
477#endif
478
479#if defined(mul64_ppmm) && !defined(mul64_ppmma)
480#define mul64_ppmma(ph,pl,x,y,a) \
481 (mul64_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
482#endif
483
484#if defined(mul64_ppmma) && !defined(mul64_ppmmaa)
485#define mul64_ppmmaa(ph,pl,x,y,a,b) \
486 (mul64_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
487#endif
488
489/*
490 * lbnMulN1_64: Multiply an n-word input by a 1-word input and store the
491 * n+1-word product. This uses either the mul64_ppmm and mul64_ppmma
492 * macros, or C multiplication with the BNWORD128 type. This uses mul64_ppmma
493 * if available, assuming you won't bother defining it unless you can do
494 * better than the normal multiplication.
495 */
496#ifndef lbnMulN1_64
497#ifdef lbnMulAdd1_64 /* If we have this asm primitive, use it. */
498void
499lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
500{
501 lbnZero_64(out, len);
502 BIGLITTLE(*(out-len-1),*(out+len)) = lbnMulAdd1_64(out, in, len, k);
503}
504#elif defined(mul64_ppmm)
505void
506lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
507{
508 BNWORD64 carry, carryin;
509
510 assert(len > 0);
511
512 BIG(--out;--in;);
513 mul64_ppmm(carry, *out, *in, k);
514 LITTLE(out++;in++;)
515
516 while (--len) {
517 BIG(--out;--in;)
518 carryin = carry;
519 mul64_ppmma(carry, *out, *in, k, carryin);
520 LITTLE(out++;in++;)
521 }
522 BIGLITTLE(*--out,*out) = carry;
523}
524#elif defined(BNWORD128)
525void
526lbnMulN1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
527{
528 BNWORD128 p;
529
530 assert(len > 0);
531
532 p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
533 BIGLITTLE(*--out,*out++) = (BNWORD64)p;
534
535 while (--len) {
536 p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + (BNWORD64)(p >> 64);
537 BIGLITTLE(*--out,*out++) = (BNWORD64)p;
538 }
539 BIGLITTLE(*--out,*out) = (BNWORD64)(p >> 64);
540}
541#else
542#error No 64x64 -> 128 multiply available for 64-bit bignum package
543#endif
544#endif /* lbnMulN1_64 */
545
546/*
547 * lbnMulAdd1_64: Multiply an n-word input by a 1-word input and add the
548 * low n words of the product to the destination. *Returns the n+1st word
549 * of the product.* (That turns out to be more convenient than adding
550 * it into the destination and dealing with a possible unit carry out
551 * of *that*.) This uses either the mul64_ppmma and mul64_ppmmaa macros,
552 * or C multiplication with the BNWORD128 type.
553 *
554 * If you're going to write assembly primitives, this is the one to
555 * start with. It is by far the most commonly called function.
556 */
557#ifndef lbnMulAdd1_64
558#if defined(mul64_ppmm)
559BNWORD64
560lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
561{
562 BNWORD64 prod, carry, carryin;
563
564 assert(len > 0);
565
566 BIG(--out;--in;);
567 carryin = *out;
568 mul64_ppmma(carry, *out, *in, k, carryin);
569 LITTLE(out++;in++;)
570
571 while (--len) {
572 BIG(--out;--in;);
573 carryin = carry;
574 mul64_ppmmaa(carry, prod, *in, k, carryin, *out);
575 *out = prod;
576 LITTLE(out++;in++;)
577 }
578
579 return carry;
580}
581#elif defined(BNWORD128)
582BNWORD64
583lbnMulAdd1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
584{
585 BNWORD128 p;
586
587 assert(len > 0);
588
589 p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
590 BIGLITTLE(*out,*out++) = (BNWORD64)p;
591
592 while (--len) {
593 p = (BNWORD128)BIGLITTLE(*--in,*in++) * k +
594 (BNWORD64)(p >> 64) + BIGLITTLE(*--out,*out);
595 BIGLITTLE(*out,*out++) = (BNWORD64)p;
596 }
597
598 return (BNWORD64)(p >> 64);
599}
600#else
601#error No 64x64 -> 128 multiply available for 64-bit bignum package
602#endif
603#endif /* lbnMulAdd1_64 */
604
605/*
606 * lbnMulSub1_64: Multiply an n-word input by a 1-word input and subtract the
607 * n-word product from the destination. Returns the n+1st word of the product.
608 * This uses either the mul64_ppmm and mul64_ppmma macros, or
609 * C multiplication with the BNWORD128 type.
610 *
611 * This is rather uglier than adding, but fortunately it's only used in
612 * division which is not used too heavily.
613 */
614#ifndef lbnMulSub1_64
615#if defined(mul64_ppmm)
616BNWORD64
617lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
618{
619 BNWORD64 prod, carry, carryin;
620
621 assert(len > 0);
622
623 BIG(--in;)
624 mul64_ppmm(carry, prod, *in, k);
625 LITTLE(in++;)
626 carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
627
628 while (--len) {
629 BIG(--in;);
630 carryin = carry;
631 mul64_ppmma(carry, prod, *in, k, carryin);
632 LITTLE(in++;)
633 carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD64)~prod;
634 }
635
636 return carry;
637}
638#elif defined(BNWORD128)
639BNWORD64
640lbnMulSub1_64(BNWORD64 *out, BNWORD64 const *in, unsigned len, BNWORD64 k)
641{
642 BNWORD128 p;
643 BNWORD64 carry, t;
644
645 assert(len > 0);
646
647 p = (BNWORD128)BIGLITTLE(*--in,*in++) * k;
648 t = BIGLITTLE(*--out,*out);
649 carry = (BNWORD64)(p>>64) + ((BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t);
650
651 while (--len) {
652 p = (BNWORD128)BIGLITTLE(*--in,*in++) * k + carry;
653 t = BIGLITTLE(*--out,*out);
654 carry = (BNWORD64)(p>>64) +
655 ( (BIGLITTLE(*out,*out++)=t-(BNWORD64)p) > t );
656 }
657
658 return carry;
659}
660#else
661#error No 64x64 -> 128 multiply available for 64-bit bignum package
662#endif
663#endif /* !lbnMulSub1_64 */
664
665/*
666 * Shift n words left "shift" bits. 0 < shift < 64. Returns the
667 * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
668 */
669#ifndef lbnLshift_64
670BNWORD64
671lbnLshift_64(BNWORD64 *num, unsigned len, unsigned shift)
672{
673 BNWORD64 x, carry;
674
675 assert(shift > 0);
676 assert(shift < 64);
677
678 carry = 0;
679 while (len--) {
680 BIG(--num;)
681 x = *num;
682 *num = (x<<shift) | carry;
683 LITTLE(num++;)
684 carry = x >> (64-shift);
685 }
686 return carry;
687}
688#endif /* !lbnLshift_64 */
689
690/*
691 * An optimized version of the above, for shifts of 1.
692 * Some machines can use add-with-carry tricks for this.
693 */
694#ifndef lbnDouble_64
695BNWORD64
696lbnDouble_64(BNWORD64 *num, unsigned len)
697{
698 BNWORD64 x, carry;
699
700 carry = 0;
701 while (len--) {
702 BIG(--num;)
703 x = *num;
704 *num = (x<<1) | carry;
705 LITTLE(num++;)
706 carry = x >> (64-1);
707 }
708 return carry;
709}
710#endif /* !lbnDouble_64 */
711
712/*
713 * Shift n words right "shift" bits. 0 < shift < 64. Returns the
714 * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
715 */
716#ifndef lbnRshift_64
717BNWORD64
718lbnRshift_64(BNWORD64 *num, unsigned len, unsigned shift)
719{
720 BNWORD64 x, carry = 0;
721
722 assert(shift > 0);
723 assert(shift < 64);
724
725 BIGLITTLE(num -= len, num += len);
726
727 while (len--) {
728 LITTLE(--num;)
729 x = *num;
730 *num = (x>>shift) | carry;
731 BIG(num++;)
732 carry = x << (64-shift);
733 }
734 return carry >> (64-shift);
735}
736#endif /* !lbnRshift_64 */
737
738/*
739 * Multiply two numbers of the given lengths. prod and num2 may overlap,
740 * provided that the low len1 bits of prod are free. (This corresponds
741 * nicely to the place the result is returned from lbnMontReduce_64.)
742 *
743 * TODO: Use Karatsuba multiply. The overlap constraints may have
744 * to get rewhacked.
745 */
746#ifndef lbnMul_64
747void
748lbnMul_64(BNWORD64 *prod, BNWORD64 const *num1, unsigned len1,
749 BNWORD64 const *num2, unsigned len2)
750{
751 /* Special case of zero */
752 if (!len1 || !len2) {
753 lbnZero_64(prod, len1+len2);
754 return;
755 }
756
757 /* Multiply first word */
758 lbnMulN1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
759
760 /*
761 * Add in subsequent words, storing the most significant word,
762 * which is new each time.
763 */
764 while (--len2) {
765 BIGLITTLE(--prod,prod++);
766 BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
767 lbnMulAdd1_64(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
768 }
769}
770#endif /* !lbnMul_64 */
771
772/*
773 * lbnMulX_64 is a square multiply - both inputs are the same length.
774 * It's normally just a macro wrapper around the general multiply,
775 * but might be implementable in assembly more efficiently (such as
776 * when product scanning).
777 */
778#ifndef lbnMulX_64
779#if defined(BNWORD128) && PRODUCT_SCAN
780/*
781 * Test code to see whether product scanning is any faster. It seems
782 * to make the C code slower, so PRODUCT_SCAN is not defined.
783 */
784static void
785lbnMulX_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
786 unsigned len)
787{
788 BNWORD128 x, y;
789 BNWORD64 const *p1, *p2;
790 unsigned carry;
791 unsigned i, j;
792
793 /* Special case of zero */
794 if (!len)
795 return;
796
797 x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
798 BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
799 x >>= 64;
800
801 for (i = 1; i < len; i++) {
802 carry = 0;
803 p1 = num1;
804 p2 = BIGLITTLE(num2-i-1,num2+i+1);
805 for (j = 0; j <= i; j++) {
806 BIG(y = (BNWORD128)*--p1 * *p2++;)
807 LITTLE(y = (BNWORD128)*p1++ * *--p2;)
808 x += y;
809 carry += (x < y);
810 }
811 BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
812 x = (x >> 64) | (BNWORD128)carry << 64;
813 }
814 for (i = 1; i < len; i++) {
815 carry = 0;
816 p1 = BIGLITTLE(num1-i,num1+i);
817 p2 = BIGLITTLE(num2-len,num2+len);
818 for (j = i; j < len; j++) {
819 BIG(y = (BNWORD128)*--p1 * *p2++;)
820 LITTLE(y = (BNWORD128)*p1++ * *--p2;)
821 x += y;
822 carry += (x < y);
823 }
824 BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
825 x = (x >> 64) | (BNWORD128)carry << 64;
826 }
827
828 BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
829}
830#else /* !defined(BNWORD128) || !PRODUCT_SCAN */
831/* Default trivial macro definition */
832#define lbnMulX_64(prod, num1, num2, len) lbnMul_64(prod, num1, len, num2, len)
833#endif /* !defined(BNWORD128) || !PRODUCT_SCAN */
834#endif /* !lbmMulX_64 */
835
836#if !defined(lbnMontMul_64) && defined(BNWORD128) && PRODUCT_SCAN
837/*
838 * Test code for product-scanning multiply. This seems to slow the C
839 * code down rather than speed it up.
840 * This does a multiply and Montgomery reduction together, using the
841 * same loops. The outer loop scans across the product, twice.
842 * The first pass computes the low half of the product and the
843 * Montgomery multipliers. These are stored in the product array,
844 * which contains no data as of yet. x and carry add up the columns
845 * and propagate carries forward.
846 *
847 * The second half multiplies the upper half, adding in the modulus
848 * times the Montgomery multipliers. The results of this multiply
849 * are stored.
850 */
851static void
852lbnMontMul_64(BNWORD64 *prod, BNWORD64 const *num1, BNWORD64 const *num2,
853 BNWORD64 const *mod, unsigned len, BNWORD64 inv)
854{
855 BNWORD128 x, y;
856 BNWORD64 const *p1, *p2, *pm;
857 BNWORD64 *pp;
858 BNWORD64 t;
859 unsigned carry;
860 unsigned i, j;
861
862 /* Special case of zero */
863 if (!len)
864 return;
865
866 /*
867 * This computes directly into the high half of prod, so just
868 * shift the pointer and consider prod only "len" elements long
869 * for the rest of the code.
870 */
871 BIGLITTLE(prod -= len, prod += len);
872
873 /* Pass 1 - compute Montgomery multipliers */
874 /* First iteration can have certain simplifications. */
875 x = (BNWORD128)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
876 BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD64)x;
877 y = (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]);
878 x += y;
879 /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
880 carry = (x < y);
881 assert((BNWORD64)x == 0);
882 x = x >> 64 | (BNWORD128)carry << 64;
883
884 for (i = 1; i < len; i++) {
885 carry = 0;
886 p1 = num1;
887 p2 = BIGLITTLE(num2-i-1,num2+i+1);
888 pp = prod;
889 pm = BIGLITTLE(mod-i-1,mod+i+1);
890 for (j = 0; j < i; j++) {
891 y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
892 x += y;
893 carry += (x < y);
894 y = (BNWORD128)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
895 x += y;
896 carry += (x < y);
897 }
898 y = (BNWORD128)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
899 x += y;
900 carry += (x < y);
901 assert(BIGLITTLE(pp == prod-i, pp == prod+i));
902 BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD64)x;
903 assert(BIGLITTLE(pm == mod-1, pm == mod+1));
904 y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
905 x += y;
906 carry += (x < y);
907 assert((BNWORD64)x == 0);
908 x = x >> 64 | (BNWORD128)carry << 64;
909 }
910
911 /* Pass 2 - compute reduced product and store */
912 for (i = 1; i < len; i++) {
913 carry = 0;
914 p1 = BIGLITTLE(num1-i,num1+i);
915 p2 = BIGLITTLE(num2-len,num2+len);
916 pm = BIGLITTLE(mod-i,mod+i);
917 pp = BIGLITTLE(prod-len,prod+len);
918 for (j = i; j < len; j++) {
919 y = (BNWORD128)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
920 x += y;
921 carry += (x < y);
922 y = (BNWORD128)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
923 x += y;
924 carry += (x < y);
925 }
926 assert(BIGLITTLE(pm == mod-len, pm == mod+len));
927 assert(BIGLITTLE(pp == prod-i, pp == prod+i));
928 BIGLITTLE(pp[0],pp[-1]) = (BNWORD64)x;
929 x = (x >> 64) | (BNWORD128)carry << 64;
930 }
931
932 /* Last round of second half, simplified. */
933 BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD64)x;
934 carry = (x >> 64);
935
936 while (carry)
937 carry -= lbnSubN_64(prod, mod, len);
938 while (lbnCmp_64(prod, mod, len) >= 0)
939 (void)lbnSubN_64(prod, mod, len);
940}
941/* Suppress later definition */
942#define lbnMontMul_64 lbnMontMul_64
943#endif
944
945#if !defined(lbnSquare_64) && defined(BNWORD128) && PRODUCT_SCAN
946/*
947 * Trial code for product-scanning squaring. This seems to slow the C
948 * code down rather than speed it up.
949 */
950void
951lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
952{
953 BNWORD128 x, y, z;
954 BNWORD64 const *p1, *p2;
955 unsigned carry;
956 unsigned i, j;
957
958 /* Special case of zero */
959 if (!len)
960 return;
961
962 /* Word 0 of product */
963 x = (BNWORD128)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
964 BIGLITTLE(*--prod, *prod++) = (BNWORD64)x;
965 x >>= 64;
966
967 /* Words 1 through len-1 */
968 for (i = 1; i < len; i++) {
969 carry = 0;
970 y = 0;
971 p1 = num;
972 p2 = BIGLITTLE(num-i-1,num+i+1);
973 for (j = 0; j < (i+1)/2; j++) {
974 BIG(z = (BNWORD128)*--p1 * *p2++;)
975 LITTLE(z = (BNWORD128)*p1++ * *--p2;)
976 y += z;
977 carry += (y < z);
978 }
979 y += z = y;
980 carry += carry + (y < z);
981 if ((i & 1) == 0) {
982 assert(BIGLITTLE(--p1 == p2, p1 == --p2));
983 BIG(z = (BNWORD128)*p2 * *p2;)
984 LITTLE(z = (BNWORD128)*p1 * *p1;)
985 y += z;
986 carry += (y < z);
987 }
988 x += y;
989 carry += (x < y);
990 BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
991 x = (x >> 64) | (BNWORD128)carry << 64;
992 }
993 /* Words len through 2*len-2 */
994 for (i = 1; i < len; i++) {
995 carry = 0;
996 y = 0;
997 p1 = BIGLITTLE(num-i,num+i);
998 p2 = BIGLITTLE(num-len,num+len);
999 for (j = 0; j < (len-i)/2; j++) {
1000 BIG(z = (BNWORD128)*--p1 * *p2++;)
1001 LITTLE(z = (BNWORD128)*p1++ * *--p2;)
1002 y += z;
1003 carry += (y < z);
1004 }
1005 y += z = y;
1006 carry += carry + (y < z);
1007 if ((len-i) & 1) {
1008 assert(BIGLITTLE(--p1 == p2, p1 == --p2));
1009 BIG(z = (BNWORD128)*p2 * *p2;)
1010 LITTLE(z = (BNWORD128)*p1 * *p1;)
1011 y += z;
1012 carry += (y < z);
1013 }
1014 x += y;
1015 carry += (x < y);
1016 BIGLITTLE(*--prod,*prod++) = (BNWORD64)x;
1017 x = (x >> 64) | (BNWORD128)carry << 64;
1018 }
1019
1020 /* Word 2*len-1 */
1021 BIGLITTLE(*--prod,*prod) = (BNWORD64)x;
1022}
1023/* Suppress later definition */
1024#define lbnSquare_64 lbnSquare_64
1025#endif
1026
1027/*
1028 * Square a number, using optimized squaring to reduce the number of
1029 * primitive multiples that are executed. There may not be any
1030 * overlap of the input and output.
1031 *
1032 * Technique: Consider the partial products in the multiplication
1033 * of "abcde" by itself:
1034 *
1035 * a b c d e
1036 * * a b c d e
1037 * ==================
1038 * ae be ce de ee
1039 * ad bd cd dd de
1040 * ac bc cc cd ce
1041 * ab bb bc bd be
1042 * aa ab ac ad ae
1043 *
1044 * Note that everything above the main diagonal:
1045 * ae be ce de = (abcd) * e
1046 * ad bd cd = (abc) * d
1047 * ac bc = (ab) * c
1048 * ab = (a) * b
1049 *
1050 * is a copy of everything below the main diagonal:
1051 * de
1052 * cd ce
1053 * bc bd be
1054 * ab ac ad ae
1055 *
1056 * Thus, the sum is 2 * (off the diagonal) + diagonal.
1057 *
1058 * This is accumulated beginning with the diagonal (which
1059 * consist of the squares of the digits of the input), which is then
1060 * divided by two, the off-diagonal added, and multiplied by two
1061 * again. The low bit is simply a copy of the low bit of the
1062 * input, so it doesn't need special care.
1063 *
1064 * TODO: Merge the shift by 1 with the squaring loop.
1065 * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
1066 */
1067#ifndef lbnSquare_64
1068void
1069lbnSquare_64(BNWORD64 *prod, BNWORD64 const *num, unsigned len)
1070{
1071 BNWORD64 t;
1072 BNWORD64 *prodx = prod; /* Working copy of the argument */
1073 BNWORD64 const *numx = num; /* Working copy of the argument */
1074 unsigned lenx = len; /* Working copy of the argument */
1075
1076 if (!len)
1077 return;
1078
1079 /* First, store all the squares */
1080 while (lenx--) {
1081#ifdef mul64_ppmm
1082 BNWORD64 ph, pl;
1083 t = BIGLITTLE(*--numx,*numx++);
1084 mul64_ppmm(ph,pl,t,t);
1085 BIGLITTLE(*--prodx,*prodx++) = pl;
1086 BIGLITTLE(*--prodx,*prodx++) = ph;
1087#elif defined(BNWORD128) /* use BNWORD128 */
1088 BNWORD128 p;
1089 t = BIGLITTLE(*--numx,*numx++);
1090 p = (BNWORD128)t * t;
1091 BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)p;
1092 BIGLITTLE(*--prodx,*prodx++) = (BNWORD64)(p>>64);
1093#else /* Use lbnMulN1_64 */
1094 t = BIGLITTLE(numx[-1],*numx);
1095 lbnMulN1_64(prodx, numx, 1, t);
1096 BIGLITTLE(--numx,numx++);
1097 BIGLITTLE(prodx -= 2, prodx += 2);
1098#endif
1099 }
1100 /* Then, shift right 1 bit */
1101 (void)lbnRshift_64(prod, 2*len, 1);
1102
1103 /* Then, add in the off-diagonal sums */
1104 lenx = len;
1105 numx = num;
1106 prodx = prod;
1107 while (--lenx) {
1108 t = BIGLITTLE(*--numx,*numx++);
1109 BIGLITTLE(--prodx,prodx++);
1110 t = lbnMulAdd1_64(prodx, numx, lenx, t);
1111 lbnAdd1_64(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
1112 BIGLITTLE(--prodx,prodx++);
1113 }
1114
1115 /* Shift it back up */
1116 lbnDouble_64(prod, 2*len);
1117
1118 /* And set the low bit appropriately */
1119 BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
1120}
1121#endif /* !lbnSquare_64 */
1122
1123/*
1124 * lbnNorm_64 - given a number, return a modified length such that the
1125 * most significant digit is non-zero. Zero-length input is okay.
1126 */
1127#ifndef lbnNorm_64
1128unsigned
1129lbnNorm_64(BNWORD64 const *num, unsigned len)
1130{
1131 BIGLITTLE(num -= len,num += len);
1132 while (len && BIGLITTLE(*num++,*--num) == 0)
1133 --len;
1134 return len;
1135}
1136#endif /* lbnNorm_64 */
1137
1138/*
1139 * lbnBits_64 - return the number of significant bits in the array.
1140 * It starts by normalizing the array. Zero-length input is okay.
1141 * Then assuming there's anything to it, it fetches the high word,
1142 * generates a bit length by multiplying the word length by 64, and
1143 * subtracts off 64/2, 64/4, 64/8, ... bits if the high bits are clear.
1144 */
1145#ifndef lbnBits_64
1146unsigned
1147lbnBits_64(BNWORD64 const *num, unsigned len)
1148{
1149 BNWORD64 t;
1150 unsigned i;
1151
1152 len = lbnNorm_64(num, len);
1153 if (len) {
1154 t = BIGLITTLE(*(num-len),*(num+(len-1)));
1155 assert(t);
1156 len *= 64;
1157 i = 64/2;
1158 do {
1159 if (t >> i)
1160 t >>= i;
1161 else
1162 len -= i;
1163 } while ((i /= 2) != 0);
1164 }
1165 return len;
1166}
1167#endif /* lbnBits_64 */
1168
1169/*
1170 * If defined, use hand-rolled divide rather than compiler's native.
1171 * If the machine doesn't do it in line, the manual code is probably
1172 * faster, since it can assume normalization and the fact that the
1173 * quotient will fit into 64 bits, which a general 128-bit divide
1174 * in a compiler's run-time library can't do.
1175 */
1176#ifndef BN_SLOW_DIVIDE_128
1177/* Assume that divisors of more than thirty-two bits are slow */
1178#define BN_SLOW_DIVIDE_128 (128 > 0x20)
1179#endif
1180
1181/*
1182 * Return (nh<<64|nl) % d, and place the quotient digit into *q.
1183 * It is guaranteed that nh < d, and that d is normalized (with its high
1184 * bit set). If we have a double-width type, it's easy. If not, ooh,
1185 * yuk!
1186 */
1187#ifndef lbnDiv21_64
1188#if defined(BNWORD128) && !BN_SLOW_DIVIDE_128
1189BNWORD64
1190lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
1191{
1192 BNWORD128 n = (BNWORD128)nh << 64 | nl;
1193
1194 /* Divisor must be normalized */
1195 assert(d >> (64-1) == 1);
1196
1197 *q = n / d;
1198 return n % d;
1199}
1200#else
1201/*
1202 * This is where it gets ugly.
1203 *
1204 * Do the division in two halves, using Algorithm D from section 4.3.1
1205 * of Knuth. Note Theorem B from that section, that the quotient estimate
1206 * is never more than the true quotient, and is never more than two
1207 * too low.
1208 *
1209 * The mapping onto conventional long division is (everything a half word):
1210 * _____________qh___ql_
1211 * dh dl ) nh.h nh.l nl.h nl.l
1212 * - (qh * d)
1213 * -----------
1214 * rrrr rrrr nl.l
1215 * - (ql * d)
1216 * -----------
1217 * rrrr rrrr
1218 *
1219 * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
1220 * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
1221 * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
1222 * low part of the subtractor, qh * dl. This also needs to be subtracted
1223 * from (nh.h nh.l nl.h) to get the final remainder. So we take the
1224 * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
1225 * try to subtract qh * dl from that. Since the remainder is 1/2-word
1226 * long, shifting and adding nl.h results in a single word r.
1227 * It is possible that the remainder we're working with, r, is less than
1228 * the product qh * dl, if we estimated qh too high. The estimation
1229 * technique can produce a qh that is too large (never too small), leading
1230 * to r which is too small. In that case, decrement the digit qh, add
1231 * shifted dh to r (to correct for that error), and subtract dl from the
1232 * product we're comparing r with. That's the "correct" way to do it, but
1233 * just adding dl to r instead of subtracting it from the product is
1234 * equivalent and a lot simpler. You just have to watch out for overflow.
1235 *
1236 * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
1237 * quotient ql.
1238 *
1239 * The various uses of 64/2 for shifts are because of the note about
1240 * automatic editing of this file at the very top of the file.
1241 */
1242#define highhalf(x) ( (x) >> 64/2 )
1243#define lowhalf(x) ( (x) & (((BNWORD64)1 << 64/2)-1) )
1244BNWORD64
1245lbnDiv21_64(BNWORD64 *q, BNWORD64 nh, BNWORD64 nl, BNWORD64 d)
1246{
1247 BNWORD64 dh = highhalf(d), dl = lowhalf(d);
1248 BNWORD64 qh, ql, prod, r;
1249
1250 /* Divisor must be normalized */
1251 assert((d >> (64-1)) == 1);
1252
1253 /* Do first half-word of division */
1254 qh = nh / dh;
1255 r = nh % dh;
1256 prod = qh * dl;
1257
1258 /*
1259 * Add next half-word of numerator to remainder and correct.
1260 * qh may be up to two too large.
1261 */
1262 r = (r << (64/2)) | highhalf(nl);
1263 if (r < prod) {
1264 --qh; r += d;
1265 if (r >= d && r < prod) {
1266 --qh; r += d;
1267 }
1268 }
1269 r -= prod;
1270
1271 /* Do second half-word of division */
1272 ql = r / dh;
1273 r = r % dh;
1274 prod = ql * dl;
1275
1276 r = (r << (64/2)) | lowhalf(nl);
1277 if (r < prod) {
1278 --ql; r += d;
1279 if (r >= d && r < prod) {
1280 --ql; r += d;
1281 }
1282 }
1283 r -= prod;
1284
1285 *q = (qh << (64/2)) | ql;
1286
1287 return r;
1288}
1289#endif
1290#endif /* lbnDiv21_64 */
1291
1292
1293/*
1294 * In the division functions, the dividend and divisor are referred to
1295 * as "n" and "d", which stand for "numerator" and "denominator".
1296 *
1297 * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
1298 * the high (nlen-dlen) words of the dividend, but one extra word is needed
1299 * on top to hold the top word.
1300 */
1301
1302/*
1303 * Divide an n-word number by a 1-word number, storing the remainder
1304 * and n-1 words of the n-word quotient. The high word is returned.
1305 * It IS legal for rem to point to the same address as n, and for
1306 * q to point one word higher.
1307 *
1308 * TODO: If BN_SLOW_DIVIDE_128, add a divnhalf_64 which uses 64-bit
1309 * dividends if the divisor is half that long.
1310 * TODO: Shift the dividend on the fly to avoid the last division and
1311 * instead have a remainder that needs shifting.
1312 * TODO: Use reciprocals rather than dividing.
1313 */
1314#ifndef lbnDiv1_64
1315BNWORD64
1316lbnDiv1_64(BNWORD64 *q, BNWORD64 *rem, BNWORD64 const *n, unsigned len,
1317 BNWORD64 d)
1318{
1319 unsigned shift;
1320 unsigned xlen;
1321 BNWORD64 r;
1322 BNWORD64 qhigh;
1323
1324 assert(len > 0);
1325 assert(d);
1326
1327 if (len == 1) {
1328 r = *n;
1329 *rem = r%d;
1330 return r/d;
1331 }
1332
1333 shift = 0;
1334 r = d;
1335 xlen = 64/2;
1336 do {
1337 if (r >> xlen)
1338 r >>= xlen;
1339 else
1340 shift += xlen;
1341 } while ((xlen /= 2) != 0);
1342 assert((d >> (64-1-shift)) == 1);
1343 d <<= shift;
1344
1345 BIGLITTLE(q -= len-1,q += len-1);
1346 BIGLITTLE(n -= len,n += len);
1347
1348 r = BIGLITTLE(*n++,*--n);
1349 if (r < d) {
1350 qhigh = 0;
1351 } else {
1352 qhigh = r/d;
1353 r %= d;
1354 }
1355
1356 xlen = len;
1357 while (--xlen)
1358 r = lbnDiv21_64(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
1359
1360 /*
1361 * Final correction for shift - shift the quotient up "shift"
1362 * bits, and merge in the extra bits of quotient. Then reduce
1363 * the final remainder mod the real d.
1364 */
1365 if (shift) {
1366 d >>= shift;
1367 qhigh = (qhigh << shift) | lbnLshift_64(q, len-1, shift);
1368 BIGLITTLE(q[-1],*q) |= r/d;
1369 r %= d;
1370 }
1371 *rem = r;
1372
1373 return qhigh;
1374}
1375#endif
1376
1377/*
1378 * This function performs a "quick" modulus of a number with a divisor
1379 * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
1380 * This applies regardless of the word size the library is compiled with.
1381 *
1382 * This function is important to prime generation, for sieving.
1383 */
1384#ifndef lbnModQ_64
1385/* If there's a custom lbnMod21_64, no normalization needed */
1386#ifdef lbnMod21_64
1387unsigned
1388lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1389{
1390 unsigned i, shift;
1391 BNWORD64 r;
1392
1393 assert(len > 0);
1394
1395 BIGLITTLE(n -= len,n += len);
1396
1397 /* Try using a compare to avoid the first divide */
1398 r = BIGLITTLE(*n++,*--n);
1399 if (r >= d)
1400 r %= d;
1401 while (--len)
1402 r = lbnMod21_64(r, BIGLITTLE(*n++,*--n), d);
1403
1404 return r;
1405}
1406#elif defined(BNWORD128) && !BN_SLOW_DIVIDE_128
1407unsigned
1408lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1409{
1410 BNWORD64 r;
1411
1412 if (!--len)
1413 return BIGLITTLE(n[-1],n[0]) % d;
1414
1415 BIGLITTLE(n -= len,n += len);
1416 r = BIGLITTLE(n[-1],n[0]);
1417
1418 do {
1419 r = (BNWORD64)((((BNWORD128)r<<64) | BIGLITTLE(*n++,*--n)) % d);
1420 } while (--len);
1421
1422 return r;
1423}
1424#elif 64 >= 0x20
1425/*
1426 * If the single word size can hold 65535*65536, then this function
1427 * is avilable.
1428 */
1429#ifndef highhalf
1430#define highhalf(x) ( (x) >> 64/2 )
1431#define lowhalf(x) ( (x) & ((1 << 64/2)-1) )
1432#endif
1433unsigned
1434lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1435{
1436 BNWORD64 r, x;
1437
1438 BIGLITTLE(n -= len,n += len);
1439
1440 r = BIGLITTLE(*n++,*--n);
1441 while (--len) {
1442 x = BIGLITTLE(*n++,*--n);
1443 r = (r%d << 64/2) | highhalf(x);
1444 r = (r%d << 64/2) | lowhalf(x);
1445 }
1446
1447 return r%d;
1448}
1449#else
1450/* Default case - use lbnDiv21_64 */
1451unsigned
1452lbnModQ_64(BNWORD64 const *n, unsigned len, unsigned d)
1453{
1454 unsigned i, shift;
1455 BNWORD64 r;
1456 BNWORD64 q;
1457
1458 assert(len > 0);
1459
1460 shift = 0;
1461 r = d;
1462 i = 64;
1463 while (i /= 2) {
1464 if (r >> i)
1465 r >>= i;
1466 else
1467 shift += i;
1468 }
1469 assert(d >> (64-1-shift) == 1);
1470 d <<= shift;
1471
1472 BIGLITTLE(n -= len,n += len);
1473
1474 r = BIGLITTLE(*n++,*--n);
1475 if (r >= d)
1476 r %= d;
1477
1478 while (--len)
1479 r = lbnDiv21_64(&q, r, BIGLITTLE(*n++,*--n), d);
1480
1481 /*
1482 * Final correction for shift - shift the quotient up "shift"
1483 * bits, and merge in the extra bits of quotient. Then reduce
1484 * the final remainder mod the real d.
1485 */
1486 if (shift)
1487 r %= d >> shift;
1488
1489 return r;
1490}
1491#endif
1492#endif /* lbnModQ_64 */
1493
1494/*
1495 * Reduce n mod d and return the quotient. That is, find:
1496 * q = n / d;
1497 * n = n % d;
1498 * d is altered during the execution of this subroutine by normalizing it.
1499 * It must already have its most significant word non-zero; it is shifted
1500 * so its most significant bit is non-zero.
1501 *
1502 * The quotient q is nlen-dlen+1 words long. To make it possible to
1503 * overlap the quptient with the input (you can store it in the high dlen
1504 * words), the high word of the quotient is *not* stored, but is returned.
1505 * (If all you want is the remainder, you don't care about it, anyway.)
1506 *
1507 * This uses algorithm D from Knuth (4.3.1), except that we do binary
1508 * (shift) normalization of the divisor. WARNING: This is hairy!
1509 *
1510 * This function is used for some modular reduction, but it is not used in
1511 * the modular exponentiation loops; they use Montgomery form and the
1512 * corresponding, more efficient, Montgomery reduction. This code
1513 * is needed for the conversion to Montgomery form, however, so it
1514 * has to be here and it might as well be reasonably efficient.
1515 *
1516 * The overall operation is as follows ("top" and "up" refer to the
1517 * most significant end of the number; "bottom" and "down", the least):
1518 *
1519 * - Shift the divisor up until the most significant bit is set.
1520 * - Shift the dividend up the same amount. This will produce the
1521 * correct quotient, and the remainder can be recovered by shifting
1522 * it back down the same number of bits. This may produce an overflow
1523 * word, but the word is always strictly less than the most significant
1524 * divisor word.
1525 * - Estimate the first quotient digit qhat:
1526 * - First take the top two words (one of which is the overflow) of the
1527 * dividend and divide by the top word of the divisor:
1528 * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
1529 * and, since dh is normalized, it is at most two over.
1530 * - Second, correct by comparing the top three words. If
1531 * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
1532 * The second iteration can be simpler because there can't be a third.
1533 * The computation can be simplified by subtracting dh*qhat from
1534 * both sides, suitably shifted. This reduces the left side to
1535 * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
1536 * remainder r from (nh,nm)%dh, so the right is (r,nl).
1537 * This produces qhat that is almost always correct and at
1538 * most (prob ~ 2/2^64) one too high.
1539 * - Subtract qhat times the divisor (suitably shifted) from the dividend.
1540 * If there is a borrow, qhat was wrong, so decrement it
1541 * and add the divisor back in (once).
1542 * - Store the final quotient digit qhat in the quotient array q.
1543 *
1544 * Repeat the quotient digit computation for successive digits of the
1545 * quotient until the whole quotient has been computed. Then shift the
1546 * divisor and the remainder down to correct for the normalization.
1547 *
1548 * TODO: Special case 2-word divisors.
1549 * TODO: Use reciprocals rather than dividing.
1550 */
1551#ifndef divn_64
1552BNWORD64
1553lbnDiv_64(BNWORD64 *q, BNWORD64 *n, unsigned nlen, BNWORD64 *d, unsigned dlen)
1554{
1555 BNWORD64 nh,nm,nl; /* Top three words of the dividend */
1556 BNWORD64 dh,dl; /* Top two words of the divisor */
1557 BNWORD64 qhat; /* Extimate of quotient word */
1558 BNWORD64 r; /* Remainder from quotient estimate division */
1559 BNWORD64 qhigh; /* High word of quotient */
1560 unsigned i; /* Temp */
1561 unsigned shift; /* Bits shifted by normalization */
1562 unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
1563#ifdef mul64_ppmm
1564 BNWORD64 t64;
1565#elif defined(BNWORD128)
1566 BNWORD128 t128;
1567#else /* use lbnMulN1_64 */
1568 BNWORD64 t2[2];
1569#define t2high BIGLITTLE(t2[0],t2[1])
1570#define t2low BIGLITTLE(t2[1],t2[0])
1571#endif
1572
1573 assert(dlen);
1574 assert(nlen >= dlen);
1575
1576 /*
1577 * Special cases for short divisors. The general case uses the
1578 * top top 2 digits of the divisor (d) to estimate a quotient digit,
1579 * so it breaks if there are fewer digits available. Thus, we need
1580 * special cases for a divisor of length 1. A divisor of length
1581 * 2 can have a *lot* of administrivia overhead removed removed,
1582 * so it's probably worth special-casing that case, too.
1583 */
1584 if (dlen == 1)
1585 return lbnDiv1_64(q, BIGLITTLE(n-1,n), n, nlen,
1586 BIGLITTLE(d[-1],d[0]));
1587
1588#if 0
1589 /*
1590 * @@@ This is not yet written... The general loop will do,
1591 * albeit less efficiently
1592 */
1593 if (dlen == 2) {
1594 /*
1595 * divisor two digits long:
1596 * use the 3/2 technique from Knuth, but we know
1597 * it's exact.
1598 */
1599 dh = BIGLITTLE(d[-1],d[0]);
1600 dl = BIGLITTLE(d[-2],d[1]);
1601 shift = 0;
1602 if ((sh & ((BNWORD64)1 << 64-1-shift)) == 0) {
1603 do {
1604 shift++;
1605 } while (dh & (BNWORD64)1<<64-1-shift) == 0);
1606 dh = dh << shift | dl >> (64-shift);
1607 dl <<= shift;
1608
1609
1610 }
1611
1612
1613 for (shift = 0; (dh & (BNWORD64)1 << 64-1-shift)) == 0; shift++)
1614 ;
1615 if (shift) {
1616 }
1617 dh = dh << shift | dl >> (64-shift);
1618 shift = 0;
1619 while (dh
1620 }
1621#endif
1622
1623 dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
1624 assert(dh);
1625
1626 /* Normalize the divisor */
1627 shift = 0;
1628 r = dh;
1629 i = 64/2;
1630 do {
1631 if (r >> i)
1632 r >>= i;
1633 else
1634 shift += i;
1635 } while ((i /= 2) != 0);
1636
1637 nh = 0;
1638 if (shift) {
1639 lbnLshift_64(d, dlen, shift);
1640 dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
1641 nh = lbnLshift_64(n, nlen, shift);
1642 }
1643
1644 /* Assert that dh is now normalized */
1645 assert(dh >> (64-1));
1646
1647 /* Also get the second-most significant word of the divisor */
1648 dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
1649
1650 /*
1651 * Adjust pointers: n to point to least significant end of first
1652 * first subtract, and q to one the most-significant end of the
1653 * quotient array.
1654 */
1655 BIGLITTLE(n -= qlen,n += qlen);
1656 BIGLITTLE(q -= qlen,q += qlen);
1657
1658 /* Fetch the most significant stored word of the dividend */
1659 nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
1660
1661 /*
1662 * Compute the first digit of the quotient, based on the
1663 * first two words of the dividend (the most significant of which
1664 * is the overflow word h).
1665 */
1666 if (nh) {
1667 assert(nh < dh);
1668 r = lbnDiv21_64(&qhat, nh, nm, dh);
1669 } else if (nm >= dh) {
1670 qhat = nm/dh;
1671 r = nm % dh;
1672 } else { /* Quotient is zero */
1673 qhigh = 0;
1674 goto divloop;
1675 }
1676
1677 /* Now get the third most significant word of the dividend */
1678 nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
1679
1680 /*
1681 * Correct qhat, the estimate of quotient digit.
1682 * qhat can only be high, and at most two words high,
1683 * so the loop can be unrolled and abbreviated.
1684 */
1685#ifdef mul64_ppmm
1686 mul64_ppmm(nm, t64, qhat, dl);
1687 if (nm > r || (nm == r && t64 > nl)) {
1688 /* Decrement qhat and adjust comparison parameters */
1689 qhat--;
1690 if ((r += dh) >= dh) {
1691 nm -= (t64 < dl);
1692 t64 -= dl;
1693 if (nm > r || (nm == r && t64 > nl))
1694 qhat--;
1695 }
1696 }
1697#elif defined(BNWORD128)
1698 t128 = (BNWORD128)qhat * dl;
1699 if (t128 > ((BNWORD128)r << 64) + nl) {
1700 /* Decrement qhat and adjust comparison parameters */
1701 qhat--;
1702 if ((r += dh) > dh) {
1703 t128 -= dl;
1704 if (t128 > ((BNWORD128)r << 64) + nl)
1705 qhat--;
1706 }
1707 }
1708#else /* Use lbnMulN1_64 */
1709 lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
1710 if (t2high > r || (t2high == r && t2low > nl)) {
1711 /* Decrement qhat and adjust comparison parameters */
1712 qhat--;
1713 if ((r += dh) >= dh) {
1714 t2high -= (t2low < dl);
1715 t2low -= dl;
1716 if (t2high > r || (t2high == r && t2low > nl))
1717 qhat--;
1718 }
1719 }
1720#endif
1721
1722 /* Do the multiply and subtract */
1723 r = lbnMulSub1_64(n, d, dlen, qhat);
1724 /* If there was a borrow, add back once. */
1725 if (r > nh) { /* Borrow? */
1726 (void)lbnAddN_64(n, d, dlen);
1727 qhat--;
1728 }
1729
1730 /* Remember the first quotient digit. */
1731 qhigh = qhat;
1732
1733 /* Now, the main division loop: */
1734divloop:
1735 while (qlen--) {
1736
1737 /* Advance n */
1738 nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
1739 BIGLITTLE(++n,--n);
1740 nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
1741
1742 if (nh == dh) {
1743 qhat = ~(BNWORD64)0;
1744 /* Optimized computation of r = (nh,nm) - qhat * dh */
1745 r = nh + nm;
1746 if (r < nh)
1747 goto subtract;
1748 } else {
1749 assert(nh < dh);
1750 r = lbnDiv21_64(&qhat, nh, nm, dh);
1751 }
1752
1753 nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
1754#ifdef mul64_ppmm
1755 mul64_ppmm(nm, t64, qhat, dl);
1756 if (nm > r || (nm == r && t64 > nl)) {
1757 /* Decrement qhat and adjust comparison parameters */
1758 qhat--;
1759 if ((r += dh) >= dh) {
1760 nm -= (t64 < dl);
1761 t64 -= dl;
1762 if (nm > r || (nm == r && t64 > nl))
1763 qhat--;
1764 }
1765 }
1766#elif defined(BNWORD128)
1767 t128 = (BNWORD128)qhat * dl;
1768 if (t128 > ((BNWORD128)r<<64) + nl) {
1769 /* Decrement qhat and adjust comparison parameters */
1770 qhat--;
1771 if ((r += dh) >= dh) {
1772 t128 -= dl;
1773 if (t128 > ((BNWORD128)r << 64) + nl)
1774 qhat--;
1775 }
1776 }
1777#else /* Use lbnMulN1_64 */
1778 lbnMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
1779 if (t2high > r || (t2high == r && t2low > nl)) {
1780 /* Decrement qhat and adjust comparison parameters */
1781 qhat--;
1782 if ((r += dh) >= dh) {
1783 t2high -= (t2low < dl);
1784 t2low -= dl;
1785 if (t2high > r || (t2high == r && t2low > nl))
1786 qhat--;
1787 }
1788 }
1789#endif
1790
1791 /*
1792 * As a point of interest, note that it is not worth checking
1793 * for qhat of 0 or 1 and installing special-case code. These
1794 * occur with probability 2^-64, so spending 1 cycle to check
1795 * for them is only worth it if we save more than 2^15 cycles,
1796 * and a multiply-and-subtract for numbers in the 1024-bit
1797 * range just doesn't take that long.
1798 */
1799subtract:
1800 /*
1801 * n points to the least significant end of the substring
1802 * of n to be subtracted from. qhat is either exact or
1803 * one too large. If the subtract gets a borrow, it was
1804 * one too large and the divisor is added back in. It's
1805 * a dlen+1 word add which is guaranteed to produce a
1806 * carry out, so it can be done very simply.
1807 */
1808 r = lbnMulSub1_64(n, d, dlen, qhat);
1809 if (r > nh) { /* Borrow? */
1810 (void)lbnAddN_64(n, d, dlen);
1811 qhat--;
1812 }
1813 /* Store the quotient digit */
1814 BIGLITTLE(*q++,*--q) = qhat;
1815 }
1816 /* Tah dah! */
1817
1818 if (shift) {
1819 lbnRshift_64(d, dlen, shift);
1820 lbnRshift_64(n, dlen, shift);
1821 }
1822
1823 return qhigh;
1824}
1825#endif
1826
1827/*
1828 * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^64.
1829 *
1830 * This just performs Newton's iteration until it gets the
1831 * inverse. The initial estimate is always correct to 3 bits, and
1832 * sometimes 4. The number of valid bits doubles each iteration.
1833 * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
1834 * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
1835 * the iteration through.)
1836 */
1837#ifndef lbnMontInv1_64
1838BNWORD64
1839lbnMontInv1_64(BNWORD64 const x)
1840{
1841 BNWORD64 y = x, z;
1842
1843 assert(x & 1);
1844
1845 while ((z = x*y) != 1)
1846 y *= 2 - z;
1847 return -y;
1848}
1849#endif /* !lbnMontInv1_64 */
1850
1851#if defined(BNWORD128) && PRODUCT_SCAN
1852/*
1853 * Test code for product-scanning Montgomery reduction.
1854 * This seems to slow the C code down rather than speed it up.
1855 *
1856 * The first loop computes the Montgomery multipliers, storing them over
1857 * the low half of the number n.
1858 *
1859 * The second half multiplies the upper half, adding in the modulus
1860 * times the Montgomery multipliers. The results of this multiply
1861 * are stored.
1862 */
1863void
1864lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned mlen, BNWORD64 inv)
1865{
1866 BNWORD128 x, y;
1867 BNWORD64 const *pm;
1868 BNWORD64 *pn;
1869 BNWORD64 t;
1870 unsigned carry;
1871 unsigned i, j;
1872
1873 /* Special case of zero */
1874 if (!mlen)
1875 return;
1876
1877 /* Pass 1 - compute Montgomery multipliers */
1878 /* First iteration can have certain simplifications. */
1879 t = BIGLITTLE(n[-1],n[0]);
1880 x = t;
1881 t *= inv;
1882 BIGLITTLE(n[-1], n[0]) = t;
1883 x += (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
1884 assert((BNWORD64)x == 0);
1885 x = x >> 64;
1886
1887 for (i = 1; i < mlen; i++) {
1888 carry = 0;
1889 pn = n;
1890 pm = BIGLITTLE(mod-i-1,mod+i+1);
1891 for (j = 0; j < i; j++) {
1892 y = (BNWORD128)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
1893 x += y;
1894 carry += (x < y);
1895 }
1896 assert(BIGLITTLE(pn == n-i, pn == n+i));
1897 y = t = BIGLITTLE(pn[-1], pn[0]);
1898 x += y;
1899 carry += (x < y);
1900 BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD64)x;
1901 assert(BIGLITTLE(pm == mod-1, pm == mod+1));
1902 y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
1903 x += y;
1904 carry += (x < y);
1905 assert((BNWORD64)x == 0);
1906 x = x >> 64 | (BNWORD128)carry << 64;
1907 }
1908
1909 BIGLITTLE(n -= mlen, n += mlen);
1910
1911 /* Pass 2 - compute upper words and add to n */
1912 for (i = 1; i < mlen; i++) {
1913 carry = 0;
1914 pm = BIGLITTLE(mod-i,mod+i);
1915 pn = n;
1916 for (j = i; j < mlen; j++) {
1917 y = (BNWORD128)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
1918 x += y;
1919 carry += (x < y);
1920 }
1921 assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
1922 assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
1923 y = t = BIGLITTLE(*(n-i),*(n+i-1));
1924 x += y;
1925 carry += (x < y);
1926 BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD64)x;
1927 x = (x >> 64) | (BNWORD128)carry << 64;
1928 }
1929
1930 /* Last round of second half, simplified. */
1931 t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
1932 x += t;
1933 BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD64)x;
1934 carry = (unsigned)(x >> 64);
1935
1936 while (carry)
1937 carry -= lbnSubN_64(n, mod, mlen);
1938 while (lbnCmp_64(n, mod, mlen) >= 0)
1939 (void)lbnSubN_64(n, mod, mlen);
1940}
1941#define lbnMontReduce_64 lbnMontReduce_64
1942#endif
1943
1944/*
1945 * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
1946 * 2^(64*mlen). Returns the result in the *top* mlen words of the argument n.
1947 * This is ready for another multiplication using lbnMul_64.
1948 *
1949 * Montgomery representation is a very useful way to encode numbers when
1950 * you're doing lots of modular reduction. What you do is pick a multiplier
1951 * R which is relatively prime to the modulus and very easy to divide by.
1952 * Since the modulus is odd, R is closen as a power of 2, so the division
1953 * is a shift. In fact, it's a shift of an integral number of words,
1954 * so the shift can be implicit - just drop the low-order words.
1955 *
1956 * Now, choose R *larger* than the modulus m, 2^(64*mlen). Then convert
1957 * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
1958 * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
1959 * - The Montgomery form of a number depends on the modulus m.
1960 * A fixed modulus m is assumed throughout this discussion.
1961 * - Since R is relaitvely prime to m, multiplication by R is invertible;
1962 * no information about the numbers is lost, they're just scrambled.
1963 * - Adding (and subtracting) numbers in this form works just as usual.
1964 * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
1965 * - Multiplying numbers in this form produces a*b*R*R. The problem
1966 * is to divide out the excess factor of R, modulo m as well as to
1967 * reduce to the given length mlen. It turns out that this can be
1968 * done *faster* than a normal divide, which is where the speedup
1969 * in Montgomery division comes from.
1970 *
1971 * Normal reduction chooses a most-significant quotient digit q and then
1972 * subtracts q*m from the number to be reduced. Choosing q is tricky
1973 * and involved (just look at lbnDiv_64 to see!) and is usually
1974 * imperfect, requiring a check for correction after the subtraction.
1975 *
1976 * Montgomery reduction *adds* a multiple of m to the *low-order* part
1977 * of the number to be reduced. This multiple is chosen to make the
1978 * low-order part of the number come out to zero. This can be done
1979 * with no trickery or error using a precomputed inverse of the modulus.
1980 * In this code, the "part" is one word, but any width can be used.
1981 *
1982 * Repeating this step sufficiently often results in a value which
1983 * is a multiple of R (a power of two, remember) but is still (since
1984 * the additions were to the low-order part and thus did not increase
1985 * the value of the number being reduced very much) still not much
1986 * larger than m*R. Then implicitly divide by R and subtract off
1987 * m until the result is in the correct range.
1988 *
1989 * Since the low-order part being cancelled is less than R, the
1990 * multiple of m added must have a multiplier which is at most R-1.
1991 * Assuming that the input is at most m*R-1, the final number is
1992 * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
1993 * the high-order part, equivalent to subtracting m*R from the
1994 * while number, produces a result which is at most m*R - m - 1,
1995 * which divided by R is at most m-1.
1996 *
1997 * To convert *to* Montgomery form, you need a regular remainder
1998 * routine, although you can just compute R*R (mod m) and do the
1999 * conversion using Montgomery multiplication. To convert *from*
2000 * Montgomery form, just Montgomery reduce the number to
2001 * remove the extra factor of R.
2002 *
2003 * TODO: Change to a full inverse and use Karatsuba's multiplication
2004 * rather than this word-at-a-time.
2005 */
2006#ifndef lbnMontReduce_64
2007void
2008lbnMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
2009 BNWORD64 inv)
2010{
2011 BNWORD64 t;
2012 BNWORD64 c = 0;
2013 unsigned len = mlen;
2014
2015 /* inv must be the negative inverse of mod's least significant word */
2016 assert((BNWORD64)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD64)-1);
2017
2018 assert(len);
2019
2020 do {
2021 t = lbnMulAdd1_64(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
2022 c += lbnAdd1_64(BIGLITTLE(n-mlen,n+mlen), len, t);
2023 BIGLITTLE(--n,++n);
2024 } while (--len);
2025
2026 /*
2027 * All that adding can cause an overflow past the modulus size,
2028 * but it's unusual, and never by much, so a subtraction loop
2029 * is the right way to deal with it.
2030 * This subtraction happens infrequently - I've only ever seen it
2031 * invoked once per reduction, and then just under 22.5% of the time.
2032 */
2033 while (c)
2034 c -= lbnSubN_64(n, mod, mlen);
2035 while (lbnCmp_64(n, mod, mlen) >= 0)
2036 (void)lbnSubN_64(n, mod, mlen);
2037}
2038#endif /* !lbnMontReduce_64 */
2039
2040/*
2041 * A couple of helpers that you might want to implement atomically
2042 * in asm sometime.
2043 */
2044#ifndef lbnMontMul_64
2045/*
2046 * Multiply "num1" by "num2", modulo "mod", all of length "len", and
2047 * place the result in the high half of "prod". "inv" is the inverse
2048 * of the least-significant word of the modulus, modulo 2^64.
2049 * This uses numbers in Montgomery form. Reduce using "len" and "inv".
2050 *
2051 * This is implemented as a macro to win on compilers that don't do
2052 * inlining, since it's so trivial.
2053 */
2054#define lbnMontMul_64(prod, n1, n2, mod, len, inv) \
2055 (lbnMulX_64(prod, n1, n2, len), lbnMontReduce_64(prod, mod, len, inv))
2056#endif /* !lbnMontMul_64 */
2057
2058#ifndef lbnMontSquare_64
2059/*
2060 * Square "n", modulo "mod", both of length "len", and place the result
2061 * in the high half of "prod". "inv" is the inverse of the least-significant
2062 * word of the modulus, modulo 2^64.
2063 * This uses numbers in Montgomery form. Reduce using "len" and "inv".
2064 *
2065 * This is implemented as a macro to win on compilers that don't do
2066 * inlining, since it's so trivial.
2067 */
2068#define lbnMontSquare_64(prod, n, mod, len, inv) \
2069 (lbnSquare_64(prod, n, len), lbnMontReduce_64(prod, mod, len, inv))
2070
2071#endif /* !lbnMontSquare_64 */
2072
2073/*
2074 * Convert a number to Montgomery form - requires mlen + nlen words
2075 * of memory in "n".
2076 */
2077void
2078lbnToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen)
2079{
2080 /* Move n up "mlen" words */
2081 lbnCopy_64(BIGLITTLE(n-mlen,n+mlen), n, nlen);
2082 lbnZero_64(n, mlen);
2083 /* Do the division - dump the quotient in the high-order words */
2084 (void)lbnDiv_64(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
2085}
2086
2087/*
2088 * Convert from Montgomery form. Montgomery reduction is all that is
2089 * needed.
2090 */
2091void
2092lbnFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len)
2093{
2094 /* Zero the high words of n */
2095 lbnZero_64(BIGLITTLE(n-len,n+len), len);
2096 lbnMontReduce_64(n, mod, len, lbnMontInv1_64(mod[BIGLITTLE(-1,0)]));
2097 /* Move n down len words */
2098 lbnCopy_64(n, BIGLITTLE(n-len,n+len), len);
2099}
2100
2101/*
2102 * The windowed exponentiation algorithm, precomputes a table of odd
2103 * powers of n up to 2^k. See the comment in bnExpMod_64 below for
2104 * an explanation of how it actually works works.
2105 *
2106 * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
2107 * multiplies (on average) to perform the exponentiation. To minimize
2108 * the sum, k must vary with e. The optimal window sizes vary with the
2109 * exponent length. Here are some selected values and the boundary cases.
2110 * (An underscore _ has been inserted into some of the numbers to ensure
2111 * that magic strings like 64 do not appear in this table. It should be
2112 * ignored.)
2113 *
2114 * At e = 1 bits, k=1 (0.000000) is best
2115 * At e = 2 bits, k=1 (0.500000) is best
2116 * At e = 4 bits, k=1 (1.500000) is best
2117 * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
2118 * At e = 1_6 bits, k=2 (6.000000) is best
2119 * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
2120 * At e = 3_2 bits, k=3 (10.750000) is best
2121 * At e = 6_4 bits, k=3 (18.750000) is best
2122 * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
2123 * At e = 128 bits, k=4 (3_2.400000) is best
2124 * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
2125 * At e = 256 bits, k=5 (57.500000) is best
2126 * At e = 512 bits, k=5 (100.1_66667) is best
2127 * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
2128 * At e = 1024 bits, k=6 (177.142857) is best
2129 * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
2130 * At e = 2048 bits, k=7 (318.875000) is best
2131 * At e = 4096 bits, k=7 (574.875000) is best
2132 *
2133 * The numbers in parentheses are the expected number of multiplications
2134 * needed to do the computation. The normal russian-peasant modular
2135 * exponentiation technique always uses (e-1)/2. For exponents as
2136 * small as 192 bits (below the range of current factoring algorithms),
2137 * half of the multiplies are eliminated, 45.2 as opposed to the naive
2138 * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
2139 * proper is just over half of multiplying, but the Montgomery
2140 * reduction in each case is also a multiply), that's 143.25
2141 * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
2142 * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
2143 * 24.3% savings. It asymptotically approaches 25%.
2144 *
2145 * Um, actually there's a slightly more accurate way to count, which
2146 * really is the average number of multiplies required, averaged
2147 * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
2148 * It's based on the recurrence that for the last b bits, b <= k, at
2149 * most one multiply is needed (and none at all 1/2^b of the time),
2150 * while when b > k, the odds are 1/2 each way that the bit will be
2151 * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
2152 * 1/2 that the bit will be 1, starting a k-bit window and requiring
2153 * 1 multiply beyond the b-k-bit case. Since the most significant
2154 * bit is always 1, a k-bit window always starts there, and that
2155 * multiply is by 1, so it isn't a multiply at all. Thus, the
2156 * number of multiplies is simply that needed for the last e-k bits.
2157 * This recurrence produces:
2158 *
2159 * At e = 1 bits, k=1 (0.000000) is best
2160 * At e = 2 bits, k=1 (0.500000) is best
2161 * At e = 4 bits, k=1 (1.500000) is best
2162 * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
2163 * At e = 8 bits, k=2 (3.109375) is best
2164 * At e = 1_6 bits, k=2 (5.777771) is best
2165 * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
2166 * At e = 3_2 bits, k=3 (10.437492) is best
2167 * At e = 6_4 bits, k=3 (18.437500) is best
2168 * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
2169 * At e = 128 bits, k=4 (3_2.040000) is best
2170 * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
2171 * At e = 256 bits, k=5 (57.111111) is best
2172 * At e = 512 bits, k=5 (99.777778) is best
2173 * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
2174 * At e = 1024 bits, k=6 (176.734694) is best
2175 * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
2176 * At e = 2048 bits, k=7 (318.453125) is best
2177 * At e = 4096 bits, k=7 (574.453125) is best
2178 *
2179 * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
2180 * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
2181 * (The numbers past that are k=8 at 4609 and k=9 at 11521,
2182 * vs. one more in each case for the approximation.)
2183 *
2184 * Given that exponents for which k>7 are useful are uncommon,
2185 * a fixed size table for k <= 7 is used for simplicity.
2186 *
2187 * The basic number of squarings needed is e-1, although a k-bit
2188 * window (for k > 1) can save, on average, k-2 of those, too.
2189 * That savings currently isn't counted here. It would drive the
2190 * crossover points slightly lower.
2191 * (Actually, this win is also reduced in the DoubleExpMod case,
2192 * meaning we'd have to split the tables. Except for that, the
2193 * multiplies by powers of the two bases are independent, so
2194 * the same logic applies to each as the single case.)
2195 *
2196 * Table entry i is the largest number of bits in an exponent to
2197 * process with a window size of i+1. Entry 6 is the largest
2198 * possible unsigned number, so the window will never be more
2199 * than 7 bits, requiring 2^6 = 0x40 slots.
2200 */
2201#define BNEXPMOD_MAX_WINDOW 7
2202static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
2203 5, 23, 80, 240, 672, 1792, (unsigned)-1
2204/* 7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
2205};
2206
2207/*
2208 * Perform modular exponentiation, as fast as possible! This uses
2209 * Montgomery reduction, optimized squaring, and windowed exponentiation.
2210 * The modulus "mod" MUST be odd!
2211 *
2212 * This returns 0 on success, -1 on out of memory.
2213 *
2214 * The window algorithm:
2215 * The idea is to keep a running product of b1 = n^(high-order bits of exp),
2216 * and then keep appending exponent bits to it. The following patterns
2217 * apply to a 3-bit window (k = 3):
2218 * To append 0: square
2219 * To append 1: square, multiply by n^1
2220 * To append 10: square, multiply by n^1, square
2221 * To append 11: square, square, multiply by n^3
2222 * To append 100: square, multiply by n^1, square, square
2223 * To append 101: square, square, square, multiply by n^5
2224 * To append 110: square, square, multiply by n^3, square
2225 * To append 111: square, square, square, multiply by n^7
2226 *
2227 * Since each pattern involves only one multiply, the longer the pattern
2228 * the better, except that a 0 (no multiplies) can be appended directly.
2229 * We precompute a table of odd powers of n, up to 2^k, and can then
2230 * multiply k bits of exponent at a time. Actually, assuming random
2231 * exponents, there is on average one zero bit between needs to
2232 * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
2233 * 1/8 of the time, there's 2, 1/64 of the time, there's 3, etc.), so
2234 * you have to do one multiply per k+1 bits of exponent.
2235 *
2236 * The loop walks down the exponent, squaring the result buffer as
2237 * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
2238 * filled with the upcoming exponent bits. (What is read after the
2239 * end of the exponent is unimportant, but it is filled with zero here.)
2240 * When the most-significant bit of this buffer becomes set, i.e.
2241 * (buf & tblmask) != 0, we have to decide what pattern to multiply
2242 * by, and when to do it. We decide, remember to do it in future
2243 * after a suitable number of squarings have passed (e.g. a pattern
2244 * of "100" in the buffer requires that we multiply by n^1 immediately;
2245 * a pattern of "110" calls for multiplying by n^3 after one more
2246 * squaring), clear the buffer, and continue.
2247 *
2248 * When we start, there is one more optimization: the result buffer
2249 * is implcitly one, so squaring it or multiplying by it can be
2250 * optimized away. Further, if we start with a pattern like "100"
2251 * in the lookahead window, rather than placing n into the buffer
2252 * and then starting to square it, we have already computed n^2
2253 * to compute the odd-powers table, so we can place that into
2254 * the buffer and save a squaring.
2255 *
2256 * This means that if you have a k-bit window, to compute n^z,
2257 * where z is the high k bits of the exponent, 1/2 of the time
2258 * it requires no squarings. 1/4 of the time, it requires 1
2259 * squaring, ... 1/2^(k-1) of the time, it reqires k-2 squarings.
2260 * And the remaining 1/2^(k-1) of the time, the top k bits are a
2261 * 1 followed by k-1 0 bits, so it again only requires k-2
2262 * squarings, not k-1. The average of these is 1. Add that
2263 * to the one squaring we have to do to compute the table,
2264 * and you'll see that a k-bit window saves k-2 squarings
2265 * as well as reducing the multiplies. (It actually doesn't
2266 * hurt in the case k = 1, either.)
2267 *
2268 * n must have mlen words allocated. Although fewer may be in use
2269 * when n is passed in, all are in use on exit.
2270 */
2271int
2272lbnExpMod_64(BNWORD64 *result, BNWORD64 const *n, unsigned nlen,
2273 BNWORD64 const *e, unsigned elen, BNWORD64 *mod, unsigned mlen)
2274{
2275 BNWORD64 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
2276 /* Table of odd powers of n */
2277 unsigned ebits; /* Exponent bits */
2278 unsigned wbits; /* Window size */
2279 unsigned tblmask; /* Mask of exponentiation window */
2280 BNWORD64 bitpos; /* Mask of current look-ahead bit */
2281 unsigned buf; /* Buffer of exponent bits */
2282 unsigned multpos; /* Where to do pending multiply */
2283 BNWORD64 const *mult; /* What to multiply by */
2284 unsigned i; /* Loop counter */
2285 int isone; /* Flag: accum. is implicitly one */
2286 BNWORD64 *a, *b; /* Working buffers/accumulators */
2287 BNWORD64 *t; /* Pointer into the working buffers */
2288 BNWORD64 inv; /* mod^-1 modulo 2^64 */
2289 int y; /* bnYield() result */
2290
2291 assert(mlen);
2292 assert(nlen <= mlen);
2293
2294 /* First, a couple of trivial cases. */
2295 elen = lbnNorm_64(e, elen);
2296 if (!elen) {
2297 /* x ^ 0 == 1 */
2298 lbnZero_64(result, mlen);
2299 BIGLITTLE(result[-1],result[0]) = 1;
2300 return 0;
2301 }
2302 ebits = lbnBits_64(e, elen);
2303 if (ebits == 1) {
2304 /* x ^ 1 == x */
2305 if (n != result)
2306 lbnCopy_64(result, n, nlen);
2307 if (mlen > nlen)
2308 lbnZero_64(BIGLITTLE(result-nlen,result+nlen),
2309 mlen-nlen);
2310 return 0;
2311 }
2312
2313 /* Okay, now move the exponent pointer to the most-significant word */
2314 e = BIGLITTLE(e-elen, e+elen-1);
2315
2316 /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
2317 wbits = 0;
2318 while (ebits > bnExpModThreshTable[wbits])
2319 wbits++;
2320
2321 /* Allocate working storage: two product buffers and the tables. */
2322 LBNALLOC(a, BNWORD64, 2*mlen);
2323 if (!a)
2324 return -1;
2325 LBNALLOC(b, BNWORD64, 2*mlen);
2326 if (!b) {
2327 LBNFREE(a, 2*mlen);
2328 return -1;
2329 }
2330
2331 /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
2332 tblmask = 1u << wbits;
2333
2334 /* We have the result buffer available, so use it. */
2335 table[0] = result;
2336
2337 /*
2338 * Okay, we now have a minimal-sized table - expand it.
2339 * This is allowed to fail! If so, scale back the table size
2340 * and proceed.
2341 */
2342 for (i = 1; i < tblmask; i++) {
2343 LBNALLOC(t, BNWORD64, mlen);
2344 if (!t) /* Out of memory! Quit the loop. */
2345 break;
2346 table[i] = t;
2347 }
2348
2349 /* If we stopped, with i < tblmask, shrink the tables appropriately */
2350 while (tblmask > i) {
2351 wbits--;
2352 tblmask >>= 1;
2353 }
2354 /* Free up our overallocations */
2355 while (--i > tblmask)
2356 LBNFREE(table[i], mlen);
2357
2358 /* Okay, fill in the table */
2359
2360 /* Compute the necessary modular inverse */
2361 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
2362
2363 /* Convert n to Montgomery form */
2364
2365 /* Move n up "mlen" words into a */
2366 t = BIGLITTLE(a-mlen, a+mlen);
2367 lbnCopy_64(t, n, nlen);
2368 lbnZero_64(a, mlen);
2369 /* Do the division - lose the quotient into the high-order words */
2370 (void)lbnDiv_64(t, a, mlen+nlen, mod, mlen);
2371 /* Copy into first table entry */
2372 lbnCopy_64(table[0], a, mlen);
2373
2374 /* Square a into b */
2375 lbnMontSquare_64(b, a, mod, mlen, inv);
2376
2377 /* Use high half of b to initialize the table */
2378 t = BIGLITTLE(b-mlen, b+mlen);
2379 for (i = 1; i < tblmask; i++) {
2380 lbnMontMul_64(a, t, table[i-1], mod, mlen, inv);
2381 lbnCopy_64(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
2382#if BNYIELD
2383 if (bnYield && (y = bnYield()) < 0)
2384 goto yield;
2385#endif
2386 }
2387
2388 /* We might use b = n^2 later... */
2389
2390 /* Initialze the fetch pointer */
2391 bitpos = (BNWORD64)1 << ((ebits-1) & (64-1)); /* Initialize mask */
2392
2393 /* This should point to the msbit of e */
2394 assert((*e & bitpos) != 0);
2395
2396 /*
2397 * Pre-load the window. Becuase the window size is
2398 * never larger than the exponent size, there is no need to
2399 * detect running off the end of e in here.
2400 *
2401 * The read-ahead is controlled by elen and the bitpos mask.
2402 * Note that this is *ahead* of ebits, which tracks the
2403 * most significant end of the window. The purpose of this
2404 * initialization is to get the two wbits+1 bits apart,
2405 * like they should be.
2406 *
2407 * Note that bitpos and e1len together keep track of the
2408 * lookahead read pointer in the exponent that is used here.
2409 */
2410 buf = 0;
2411 for (i = 0; i <= wbits; i++) {
2412 buf = (buf << 1) | ((*e & bitpos) != 0);
2413 bitpos >>= 1;
2414 if (!bitpos) {
2415 BIGLITTLE(e++,e--);
2416 bitpos = (BNWORD64)1 << (64-1);
2417 elen--;
2418 }
2419 }
2420 assert(buf & tblmask);
2421
2422 /*
2423 * Set the pending multiply positions to a location that will
2424 * never be encountered, thus ensuring that nothing will happen
2425 * until the need for a multiply appears and one is scheduled.
2426 */
2427 multpos = ebits; /* A NULL value */
2428 mult = 0; /* Force a crash if we use these */
2429
2430 /*
2431 * Okay, now begins the real work. The first step is
2432 * slightly magic, so it's done outside the main loop,
2433 * but it's very similar to what's inside.
2434 */
2435 ebits--; /* Start processing the first bit... */
2436 isone = 1;
2437
2438 /*
2439 * This is just like the multiply in the loop, except that
2440 * - We know the msbit of buf is set, and
2441 * - We have the extra value n^2 floating around.
2442 * So, do the usual computation, and if the result is that
2443 * the buffer should be multiplied by n^1 immediately
2444 * (which we'd normally then square), we multiply it
2445 * (which reduces to a copy, which reduces to setting a flag)
2446 * by n^2 and skip the squaring. Thus, we do the
2447 * multiply and the squaring in one step.
2448 */
2449 assert(buf & tblmask);
2450 multpos = ebits - wbits;
2451 while ((buf & 1) == 0) {
2452 buf >>= 1;
2453 multpos++;
2454 }
2455 /* Intermediates can wrap, but final must NOT */
2456 assert(multpos <= ebits);
2457 mult = table[buf>>1];
2458 buf = 0;
2459
2460 /* Special case: use already-computed value sitting in buffer */
2461 if (multpos == ebits)
2462 isone = 0;
2463
2464 /*
2465 * At this point, the buffer (which is the high half of b) holds
2466 * either 1 (implicitly, as the "isone" flag is set), or n^2.
2467 */
2468
2469 /*
2470 * The main loop. The procedure is:
2471 * - Advance the window
2472 * - If the most-significant bit of the window is set,
2473 * schedule a multiply for the appropriate time in the
2474 * future (may be immediately)
2475 * - Perform any pending multiples
2476 * - Check for termination
2477 * - Square the buffer
2478 *
2479 * At any given time, the acumulated product is held in
2480 * the high half of b.
2481 */
2482 for (;;) {
2483 ebits--;
2484
2485 /* Advance the window */
2486 assert(buf < tblmask);
2487 buf <<= 1;
2488 /*
2489 * This reads ahead of the current exponent position
2490 * (controlled by ebits), so we have to be able to read
2491 * past the lsb of the exponents without error.
2492 */
2493 if (elen) {
2494 buf |= ((*e & bitpos) != 0);
2495 bitpos >>= 1;
2496 if (!bitpos) {
2497 BIGLITTLE(e++,e--);
2498 bitpos = (BNWORD64)1 << (64-1);
2499 elen--;
2500 }
2501 }
2502
2503 /* Examine the window for pending multiplies */
2504 if (buf & tblmask) {
2505 multpos = ebits - wbits;
2506 while ((buf & 1) == 0) {
2507 buf >>= 1;
2508 multpos++;
2509 }
2510 /* Intermediates can wrap, but final must NOT */
2511 assert(multpos <= ebits);
2512 mult = table[buf>>1];
2513 buf = 0;
2514 }
2515
2516 /* If we have a pending multiply, do it */
2517 if (ebits == multpos) {
2518 /* Multiply by the table entry remembered previously */
2519 t = BIGLITTLE(b-mlen, b+mlen);
2520 if (isone) {
2521 /* Multiply by 1 is a trivial case */
2522 lbnCopy_64(t, mult, mlen);
2523 isone = 0;
2524 } else {
2525 lbnMontMul_64(a, t, mult, mod, mlen, inv);
2526 /* Swap a and b */
2527 t = a; a = b; b = t;
2528 }
2529 }
2530
2531 /* Are we done? */
2532 if (!ebits)
2533 break;
2534
2535 /* Square the input */
2536 if (!isone) {
2537 t = BIGLITTLE(b-mlen, b+mlen);
2538 lbnMontSquare_64(a, t, mod, mlen, inv);
2539 /* Swap a and b */
2540 t = a; a = b; b = t;
2541 }
2542#if BNYIELD
2543 if (bnYield && (y = bnYield()) < 0)
2544 goto yield;
2545#endif
2546 } /* for (;;) */
2547
2548 assert(!isone);
2549 assert(!buf);
2550
2551 /* DONE! */
2552
2553 /* Convert result out of Montgomery form */
2554 t = BIGLITTLE(b-mlen, b+mlen);
2555 lbnCopy_64(b, t, mlen);
2556 lbnZero_64(t, mlen);
2557 lbnMontReduce_64(b, mod, mlen, inv);
2558 lbnCopy_64(result, t, mlen);
2559 /*
2560 * Clean up - free intermediate storage.
2561 * Do NOT free table[0], which is the result
2562 * buffer.
2563 */
2564 y = 0;
2565#if BNYIELD
2566yield:
2567#endif
2568 while (--tblmask)
2569 LBNFREE(table[tblmask], mlen);
2570 LBNFREE(b, 2*mlen);
2571 LBNFREE(a, 2*mlen);
2572
2573 return y; /* Success */
2574}
2575
2576/*
2577 * Compute and return n1^e1 * n2^e2 mod "mod".
2578 * result may be either input buffer, or something separate.
2579 * It must be "mlen" words long.
2580 *
2581 * There is a current position in the exponents, which is kept in e1bits.
2582 * (The exponents are swapped if necessary so e1 is the longer of the two.)
2583 * At any given time, the value in the accumulator is
2584 * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
2585 * As e1bits is counted down, this is updated, by squaring it and doing
2586 * any necessary multiplies.
2587 * To decide on the necessary multiplies, two windows, each w1bits+1 bits
2588 * wide, are maintained in buf1 and buf2, which read *ahead* of the
2589 * e1bits position (with appropriate handling of the case when e1bits
2590 * drops below w1bits+1). When the most-significant bit of either window
2591 * becomes set, indicating that something needs to be multiplied by
2592 * the accumulator or it will get out of sync, the window is examined
2593 * to see which power of n1 or n2 to multiply by, and when (possibly
2594 * later, if the power is greater than 1) the multiply should take
2595 * place. Then the multiply and its location are remembered and the
2596 * window is cleared.
2597 *
2598 * If we had every power of n1 in the table, the multiply would always
2599 * be w1bits steps in the future. But we only keep the odd powers,
2600 * so instead of waiting w1bits squarings and then multiplying
2601 * by n1^k, we wait w1bits-k squarings and multiply by n1.
2602 *
2603 * Actually, w2bits can be less than w1bits, but the window is the same
2604 * size, to make it easier to keep track of where we're reading. The
2605 * appropriate number of low-order bits of the window are just ignored.
2606 */
2607int
2608lbnDoubleExpMod_64(BNWORD64 *result,
2609 BNWORD64 const *n1, unsigned n1len,
2610 BNWORD64 const *e1, unsigned e1len,
2611 BNWORD64 const *n2, unsigned n2len,
2612 BNWORD64 const *e2, unsigned e2len,
2613 BNWORD64 *mod, unsigned mlen)
2614{
2615 BNWORD64 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
2616 /* Table of odd powers of n1 */
2617 BNWORD64 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
2618 /* Table of odd powers of n2 */
2619 unsigned e1bits, e2bits; /* Exponent bits */
2620 unsigned w1bits, w2bits; /* Window sizes */
2621 unsigned tblmask; /* Mask of exponentiation window */
2622 BNWORD64 bitpos; /* Mask of current look-ahead bit */
2623 unsigned buf1, buf2; /* Buffer of exponent bits */
2624 unsigned mult1pos, mult2pos; /* Where to do pending multiply */
2625 BNWORD64 const *mult1, *mult2; /* What to multiply by */
2626 unsigned i; /* Loop counter */
2627 int isone; /* Flag: accum. is implicitly one */
2628 BNWORD64 *a, *b; /* Working buffers/accumulators */
2629 BNWORD64 *t; /* Pointer into the working buffers */
2630 BNWORD64 inv; /* mod^-1 modulo 2^64 */
2631 int y; /* bnYield() result */
2632
2633 assert(mlen);
2634 assert(n1len <= mlen);
2635 assert(n2len <= mlen);
2636
2637 /* First, a couple of trivial cases. */
2638 e1len = lbnNorm_64(e1, e1len);
2639 e2len = lbnNorm_64(e2, e2len);
2640
2641 /* Ensure that the first exponent is the longer */
2642 e1bits = lbnBits_64(e1, e1len);
2643 e2bits = lbnBits_64(e2, e2len);
2644 if (e1bits < e2bits) {
2645 i = e1len; e1len = e2len; e2len = i;
2646 i = e1bits; e1bits = e2bits; e2bits = i;
2647 t = (BNWORD64 *)n1; n1 = n2; n2 = t;
2648 t = (BNWORD64 *)e1; e1 = e2; e2 = t;
2649 }
2650 assert(e1bits >= e2bits);
2651
2652 /* Handle a trivial case */
2653 if (!e2len)
2654 return lbnExpMod_64(result, n1, n1len, e1, e1len, mod, mlen);
2655 assert(e2bits);
2656
2657 /* The code below fucks up if the exponents aren't at least 2 bits */
2658 if (e1bits == 1) {
2659 assert(e2bits == 1);
2660
2661 LBNALLOC(a, BNWORD64, n1len+n2len);
2662 if (!a)
2663 return -1;
2664
2665 lbnMul_64(a, n1, n1len, n2, n2len);
2666 /* Do a direct modular reduction */
2667 if (n1len + n2len >= mlen)
2668 (void)lbnDiv_64(a+mlen, a, n1len+n2len, mod, mlen);
2669 lbnCopy_64(result, a, mlen);
2670 LBNFREE(a, n1len+n2len);
2671 return 0;
2672 }
2673
2674 /* Okay, now move the exponent pointers to the most-significant word */
2675 e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
2676 e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
2677
2678 /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
2679 w1bits = 0;
2680 while (e1bits > bnExpModThreshTable[w1bits])
2681 w1bits++;
2682 w2bits = 0;
2683 while (e2bits > bnExpModThreshTable[w2bits])
2684 w2bits++;
2685
2686 assert(w1bits >= w2bits);
2687
2688 /* Allocate working storage: two product buffers and the tables. */
2689 LBNALLOC(a, BNWORD64, 2*mlen);
2690 if (!a)
2691 return -1;
2692 LBNALLOC(b, BNWORD64, 2*mlen);
2693 if (!b) {
2694 LBNFREE(a, 2*mlen);
2695 return -1;
2696 }
2697
2698 /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
2699 tblmask = 1u << w1bits;
2700 /* Use buf2 for its size, temporarily */
2701 buf2 = 1u << w2bits;
2702
2703 LBNALLOC(t, BNWORD64, mlen);
2704 if (!t) {
2705 LBNFREE(b, 2*mlen);
2706 LBNFREE(a, 2*mlen);
2707 return -1;
2708 }
2709 table1[0] = t;
2710 table2[0] = result;
2711
2712 /*
2713 * Okay, we now have some minimal-sized tables - expand them.
2714 * This is allowed to fail! If so, scale back the table sizes
2715 * and proceed. We allocate both tables at the same time
2716 * so if it fails partway through, they'll both be a reasonable
2717 * size rather than one huge and one tiny.
2718 * When i passes buf2 (the number of entries in the e2 window,
2719 * which may be less than the number of entries in the e1 window),
2720 * stop allocating e2 space.
2721 */
2722 for (i = 1; i < tblmask; i++) {
2723 LBNALLOC(t, BNWORD64, mlen);
2724 if (!t) /* Out of memory! Quit the loop. */
2725 break;
2726 table1[i] = t;
2727 if (i < buf2) {
2728 LBNALLOC(t, BNWORD64, mlen);
2729 if (!t) {
2730 LBNFREE(table1[i], mlen);
2731 break;
2732 }
2733 table2[i] = t;
2734 }
2735 }
2736
2737 /* If we stopped, with i < tblmask, shrink the tables appropriately */
2738 while (tblmask > i) {
2739 w1bits--;
2740 tblmask >>= 1;
2741 }
2742 /* Free up our overallocations */
2743 while (--i > tblmask) {
2744 if (i < buf2)
2745 LBNFREE(table2[i], mlen);
2746 LBNFREE(table1[i], mlen);
2747 }
2748 /* And shrink the second window too, if needed */
2749 if (w2bits > w1bits) {
2750 w2bits = w1bits;
2751 buf2 = tblmask;
2752 }
2753
2754 /*
2755 * From now on, use the w2bits variable for the difference
2756 * between w1bits and w2bits.
2757 */
2758 w2bits = w1bits-w2bits;
2759
2760 /* Okay, fill in the tables */
2761
2762 /* Compute the necessary modular inverse */
2763 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
2764
2765 /* Convert n1 to Montgomery form */
2766
2767 /* Move n1 up "mlen" words into a */
2768 t = BIGLITTLE(a-mlen, a+mlen);
2769 lbnCopy_64(t, n1, n1len);
2770 lbnZero_64(a, mlen);
2771 /* Do the division - lose the quotient into the high-order words */
2772 (void)lbnDiv_64(t, a, mlen+n1len, mod, mlen);
2773 /* Copy into first table entry */
2774 lbnCopy_64(table1[0], a, mlen);
2775
2776 /* Square a into b */
2777 lbnMontSquare_64(b, a, mod, mlen, inv);
2778
2779 /* Use high half of b to initialize the first table */
2780 t = BIGLITTLE(b-mlen, b+mlen);
2781 for (i = 1; i < tblmask; i++) {
2782 lbnMontMul_64(a, t, table1[i-1], mod, mlen, inv);
2783 lbnCopy_64(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
2784#if BNYIELD
2785 if (bnYield && (y = bnYield()) < 0)
2786 goto yield;
2787#endif
2788 }
2789
2790 /* Convert n2 to Montgomery form */
2791
2792 t = BIGLITTLE(a-mlen, a+mlen);
2793 /* Move n2 up "mlen" words into a */
2794 lbnCopy_64(t, n2, n2len);
2795 lbnZero_64(a, mlen);
2796 /* Do the division - lose the quotient into the high-order words */
2797 (void)lbnDiv_64(t, a, mlen+n2len, mod, mlen);
2798 /* Copy into first table entry */
2799 lbnCopy_64(table2[0], a, mlen);
2800
2801 /* Square it into a */
2802 lbnMontSquare_64(a, table2[0], mod, mlen, inv);
2803 /* Copy to b, low half */
2804 lbnCopy_64(b, t, mlen);
2805
2806 /* Use b to initialize the second table */
2807 for (i = 1; i < buf2; i++) {
2808 lbnMontMul_64(a, b, table2[i-1], mod, mlen, inv);
2809 lbnCopy_64(table2[i], t, mlen);
2810#if BNYIELD
2811 if (bnYield && (y = bnYield()) < 0)
2812 goto yield;
2813#endif
2814 }
2815
2816 /*
2817 * Okay, a recap: at this point, the low part of b holds
2818 * n2^2, the high part holds n1^2, and the tables are
2819 * initialized with the odd powers of n1 and n2 from 1
2820 * through 2*tblmask-1 and 2*buf2-1.
2821 *
2822 * We might use those squares in b later, or we might not.
2823 */
2824
2825 /* Initialze the fetch pointer */
2826 bitpos = (BNWORD64)1 << ((e1bits-1) & (64-1)); /* Initialize mask */
2827
2828 /* This should point to the msbit of e1 */
2829 assert((*e1 & bitpos) != 0);
2830
2831 /*
2832 * Pre-load the windows. Becuase the window size is
2833 * never larger than the exponent size, there is no need to
2834 * detect running off the end of e1 in here.
2835 *
2836 * The read-ahead is controlled by e1len and the bitpos mask.
2837 * Note that this is *ahead* of e1bits, which tracks the
2838 * most significant end of the window. The purpose of this
2839 * initialization is to get the two w1bits+1 bits apart,
2840 * like they should be.
2841 *
2842 * Note that bitpos and e1len together keep track of the
2843 * lookahead read pointer in the exponent that is used here.
2844 * e2len is not decremented, it is only ever compared with
2845 * e1len as *that* is decremented.
2846 */
2847 buf1 = buf2 = 0;
2848 for (i = 0; i <= w1bits; i++) {
2849 buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
2850 if (e1len <= e2len)
2851 buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
2852 bitpos >>= 1;
2853 if (!bitpos) {
2854 BIGLITTLE(e1++,e1--);
2855 if (e1len <= e2len)
2856 BIGLITTLE(e2++,e2--);
2857 bitpos = (BNWORD64)1 << (64-1);
2858 e1len--;
2859 }
2860 }
2861 assert(buf1 & tblmask);
2862
2863 /*
2864 * Set the pending multiply positions to a location that will
2865 * never be encountered, thus ensuring that nothing will happen
2866 * until the need for a multiply appears and one is scheduled.
2867 */
2868 mult1pos = mult2pos = e1bits; /* A NULL value */
2869 mult1 = mult2 = 0; /* Force a crash if we use these */
2870
2871 /*
2872 * Okay, now begins the real work. The first step is
2873 * slightly magic, so it's done outside the main loop,
2874 * but it's very similar to what's inside.
2875 */
2876 isone = 1; /* Buffer is implicitly 1, so replace * by copy */
2877 e1bits--; /* Start processing the first bit... */
2878
2879 /*
2880 * This is just like the multiply in the loop, except that
2881 * - We know the msbit of buf1 is set, and
2882 * - We have the extra value n1^2 floating around.
2883 * So, do the usual computation, and if the result is that
2884 * the buffer should be multiplied by n1^1 immediately
2885 * (which we'd normally then square), we multiply it
2886 * (which reduces to a copy, which reduces to setting a flag)
2887 * by n1^2 and skip the squaring. Thus, we do the
2888 * multiply and the squaring in one step.
2889 */
2890 assert(buf1 & tblmask);
2891 mult1pos = e1bits - w1bits;
2892 while ((buf1 & 1) == 0) {
2893 buf1 >>= 1;
2894 mult1pos++;
2895 }
2896 /* Intermediates can wrap, but final must NOT */
2897 assert(mult1pos <= e1bits);
2898 mult1 = table1[buf1>>1];
2899 buf1 = 0;
2900
2901 /* Special case: use already-computed value sitting in buffer */
2902 if (mult1pos == e1bits)
2903 isone = 0;
2904
2905 /*
2906 * The first multiply by a power of n2. Similar, but
2907 * we might not even want to schedule a multiply if e2 is
2908 * shorter than e1, and the window might be shorter so
2909 * we have to leave the low w2bits bits alone.
2910 */
2911 if (buf2 & tblmask) {
2912 /* Remember low-order bits for later */
2913 i = buf2 & ((1u << w2bits) - 1);
2914 buf2 >>= w2bits;
2915 mult2pos = e1bits - w1bits + w2bits;
2916 while ((buf2 & 1) == 0) {
2917 buf2 >>= 1;
2918 mult2pos++;
2919 }
2920 assert(mult2pos <= e1bits);
2921 mult2 = table2[buf2>>1];
2922 buf2 = i;
2923
2924 if (mult2pos == e1bits) {
2925 t = BIGLITTLE(b-mlen, b+mlen);
2926 if (isone) {
2927 lbnCopy_64(t, b, mlen); /* Copy low to high */
2928 isone = 0;
2929 } else {
2930 lbnMontMul_64(a, t, b, mod, mlen, inv);
2931 t = a; a = b; b = t;
2932 }
2933 }
2934 }
2935
2936 /*
2937 * At this point, the buffer (which is the high half of b)
2938 * holds either 1 (implicitly, as the "isone" flag is set),
2939 * n1^2, n2^2 or n1^2 * n2^2.
2940 */
2941
2942 /*
2943 * The main loop. The procedure is:
2944 * - Advance the windows
2945 * - If the most-significant bit of a window is set,
2946 * schedule a multiply for the appropriate time in the
2947 * future (may be immediately)
2948 * - Perform any pending multiples
2949 * - Check for termination
2950 * - Square the buffers
2951 *
2952 * At any given time, the acumulated product is held in
2953 * the high half of b.
2954 */
2955 for (;;) {
2956 e1bits--;
2957
2958 /* Advance the windows */
2959 assert(buf1 < tblmask);
2960 buf1 <<= 1;
2961 assert(buf2 < tblmask);
2962 buf2 <<= 1;
2963 /*
2964 * This reads ahead of the current exponent position
2965 * (controlled by e1bits), so we have to be able to read
2966 * past the lsb of the exponents without error.
2967 */
2968 if (e1len) {
2969 buf1 |= ((*e1 & bitpos) != 0);
2970 if (e1len <= e2len)
2971 buf2 |= ((*e2 & bitpos) != 0);
2972 bitpos >>= 1;
2973 if (!bitpos) {
2974 BIGLITTLE(e1++,e1--);
2975 if (e1len <= e2len)
2976 BIGLITTLE(e2++,e2--);
2977 bitpos = (BNWORD64)1 << (64-1);
2978 e1len--;
2979 }
2980 }
2981
2982 /* Examine the first window for pending multiplies */
2983 if (buf1 & tblmask) {
2984 mult1pos = e1bits - w1bits;
2985 while ((buf1 & 1) == 0) {
2986 buf1 >>= 1;
2987 mult1pos++;
2988 }
2989 /* Intermediates can wrap, but final must NOT */
2990 assert(mult1pos <= e1bits);
2991 mult1 = table1[buf1>>1];
2992 buf1 = 0;
2993 }
2994
2995 /*
2996 * Examine the second window for pending multiplies.
2997 * Window 2 can be smaller than window 1, but we
2998 * keep the same number of bits in buf2, so we need
2999 * to ignore any low-order bits in the buffer when
3000 * computing what to multiply by, and recompute them
3001 * later.
3002 */
3003 if (buf2 & tblmask) {
3004 /* Remember low-order bits for later */
3005 i = buf2 & ((1u << w2bits) - 1);
3006 buf2 >>= w2bits;
3007 mult2pos = e1bits - w1bits + w2bits;
3008 while ((buf2 & 1) == 0) {
3009 buf2 >>= 1;
3010 mult2pos++;
3011 }
3012 assert(mult2pos <= e1bits);
3013 mult2 = table2[buf2>>1];
3014 buf2 = i;
3015 }
3016
3017
3018 /* If we have a pending multiply for e1, do it */
3019 if (e1bits == mult1pos) {
3020 /* Multiply by the table entry remembered previously */
3021 t = BIGLITTLE(b-mlen, b+mlen);
3022 if (isone) {
3023 /* Multiply by 1 is a trivial case */
3024 lbnCopy_64(t, mult1, mlen);
3025 isone = 0;
3026 } else {
3027 lbnMontMul_64(a, t, mult1, mod, mlen, inv);
3028 /* Swap a and b */
3029 t = a; a = b; b = t;
3030 }
3031 }
3032
3033 /* If we have a pending multiply for e2, do it */
3034 if (e1bits == mult2pos) {
3035 /* Multiply by the table entry remembered previously */
3036 t = BIGLITTLE(b-mlen, b+mlen);
3037 if (isone) {
3038 /* Multiply by 1 is a trivial case */
3039 lbnCopy_64(t, mult2, mlen);
3040 isone = 0;
3041 } else {
3042 lbnMontMul_64(a, t, mult2, mod, mlen, inv);
3043 /* Swap a and b */
3044 t = a; a = b; b = t;
3045 }
3046 }
3047
3048 /* Are we done? */
3049 if (!e1bits)
3050 break;
3051
3052 /* Square the buffer */
3053 if (!isone) {
3054 t = BIGLITTLE(b-mlen, b+mlen);
3055 lbnMontSquare_64(a, t, mod, mlen, inv);
3056 /* Swap a and b */
3057 t = a; a = b; b = t;
3058 }
3059#if BNYIELD
3060 if (bnYield && (y = bnYield()) < 0)
3061 goto yield;
3062#endif
3063 } /* for (;;) */
3064
3065 assert(!isone);
3066 assert(!buf1);
3067 assert(!buf2);
3068
3069 /* DONE! */
3070
3071 /* Convert result out of Montgomery form */
3072 t = BIGLITTLE(b-mlen, b+mlen);
3073 lbnCopy_64(b, t, mlen);
3074 lbnZero_64(t, mlen);
3075 lbnMontReduce_64(b, mod, mlen, inv);
3076 lbnCopy_64(result, t, mlen);
3077
3078 /* Clean up - free intermediate storage */
3079 y = 0;
3080#if BNYIELD
3081yield:
3082#endif
3083 buf2 = tblmask >> w2bits;
3084 while (--tblmask) {
3085 if (tblmask < buf2)
3086 LBNFREE(table2[tblmask], mlen);
3087 LBNFREE(table1[tblmask], mlen);
3088 }
3089 t = table1[0];
3090 LBNFREE(t, mlen);
3091 LBNFREE(b, 2*mlen);
3092 LBNFREE(a, 2*mlen);
3093
3094 return y; /* Success */
3095}
3096
3097/*
3098 * 2^exp (mod mod). This is an optimized version for use in Fermat
3099 * tests. The input value of n is ignored; it is returned with
3100 * "mlen" words valid.
3101 */
3102int
3103lbnTwoExpMod_64(BNWORD64 *n, BNWORD64 const *exp, unsigned elen,
3104 BNWORD64 *mod, unsigned mlen)
3105{
3106 unsigned e; /* Copy of high words of the exponent */
3107 unsigned bits; /* Assorted counter of bits */
3108 BNWORD64 const *bitptr;
3109 BNWORD64 bitword, bitpos;
3110 BNWORD64 *a, *b, *a1;
3111 BNWORD64 inv;
3112 int y; /* Result of bnYield() */
3113
3114 assert(mlen);
3115
3116 bitptr = BIGLITTLE(exp-elen, exp+elen-1);
3117 bitword = *bitptr;
3118 assert(bitword);
3119
3120 /* Clear n for future use. */
3121 lbnZero_64(n, mlen);
3122
3123 bits = lbnBits_64(exp, elen);
3124
3125 /* First, a couple of trivial cases. */
3126 if (bits <= 1) {
3127 /* 2 ^ 0 == 1, 2 ^ 1 == 2 */
3128 BIGLITTLE(n[-1],n[0]) = (BNWORD64)1<<elen;
3129 return 0;
3130 }
3131
3132 /* Set bitpos to the most significant bit */
3133 bitpos = (BNWORD64)1 << ((bits-1) & (64-1));
3134
3135 /* Now, count the bits in the modulus. */
3136 bits = lbnBits_64(mod, mlen);
3137 assert(bits > 1); /* a 1-bit modulus is just stupid... */
3138
3139 /*
3140 * We start with 1<<e, where "e" is as many high bits of the
3141 * exponent as we can manage without going over the modulus.
3142 * This first loop finds "e".
3143 */
3144 e = 1;
3145 while (elen) {
3146 /* Consume the first bit */
3147 bitpos >>= 1;
3148 if (!bitpos) {
3149 if (!--elen)
3150 break;
3151 bitword = BIGLITTLE(*++bitptr,*--bitptr);
3152 bitpos = (BNWORD64)1<<(64-1);
3153 }
3154 e = (e << 1) | ((bitpos & bitword) != 0);
3155 if (e >= bits) { /* Overflow! Back out. */
3156 e >>= 1;
3157 break;
3158 }
3159 }
3160 /*
3161 * The bit in "bitpos" being examined by the bit buffer has NOT
3162 * been consumed yet. This may be past the end of the exponent,
3163 * in which case elen == 1.
3164 */
3165
3166 /* Okay, now, set bit "e" in n. n is already zero. */
3167 inv = (BNWORD64)1 << (e & (64-1));
3168 e /= 64;
3169 BIGLITTLE(n[-e-1],n[e]) = inv;
3170 /*
3171 * The effective length of n in words is now "e+1".
3172 * This is used a little bit later.
3173 */
3174
3175 if (!elen)
3176 return 0; /* That was easy! */
3177
3178 /*
3179 * We have now processed the first few bits. The next step
3180 * is to convert this to Montgomery form for further squaring.
3181 */
3182
3183 /* Allocate working storage: two product buffers */
3184 LBNALLOC(a, BNWORD64, 2*mlen);
3185 if (!a)
3186 return -1;
3187 LBNALLOC(b, BNWORD64, 2*mlen);
3188 if (!b) {
3189 LBNFREE(a, 2*mlen);
3190 return -1;
3191 }
3192
3193 /* Convert n to Montgomery form */
3194 inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
3195 assert(inv & 1); /* Modulus must be odd */
3196 inv = lbnMontInv1_64(inv);
3197 /* Move n (length e+1, remember?) up "mlen" words into b */
3198 /* Note that we lie about a1 for a bit - it's pointing to b */
3199 a1 = BIGLITTLE(b-mlen,b+mlen);
3200 lbnCopy_64(a1, n, e+1);
3201 lbnZero_64(b, mlen);
3202 /* Do the division - dump the quotient into the high-order words */
3203 (void)lbnDiv_64(a1, b, mlen+e+1, mod, mlen);
3204 /*
3205 * Now do the first squaring and modular reduction to put
3206 * the number up in a1 where it belongs.
3207 */
3208 lbnMontSquare_64(a, b, mod, mlen, inv);
3209 /* Fix up a1 to point to where it should go. */
3210 a1 = BIGLITTLE(a-mlen,a+mlen);
3211
3212 /*
3213 * Okay, now, a1 holds the number being accumulated, and
3214 * b is a scratch register. Start working:
3215 */
3216 for (;;) {
3217 /*
3218 * Is the bit set? If so, double a1 as well.
3219 * A modular doubling like this is very cheap.
3220 */
3221 if (bitpos & bitword) {
3222 /*
3223 * Double the number. If there was a carry out OR
3224 * the result is greater than the modulus, subract
3225 * the modulus.
3226 */
3227 if (lbnDouble_64(a1, mlen) ||
3228 lbnCmp_64(a1, mod, mlen) > 0)
3229 (void)lbnSubN_64(a1, mod, mlen);
3230 }
3231
3232 /* Advance to the next exponent bit */
3233 bitpos >>= 1;
3234 if (!bitpos) {
3235 if (!--elen)
3236 break; /* Done! */
3237 bitword = BIGLITTLE(*++bitptr,*--bitptr);
3238 bitpos = (BNWORD64)1<<(64-1);
3239 }
3240
3241 /*
3242 * The elen/bitword/bitpos bit buffer is known to be
3243 * non-empty, i.e. there is at least one more unconsumed bit.
3244 * Thus, it's safe to square the number.
3245 */
3246 lbnMontSquare_64(b, a1, mod, mlen, inv);
3247 /* Rename result (in b) back to a (a1, really). */
3248 a1 = b; b = a; a = a1;
3249 a1 = BIGLITTLE(a-mlen,a+mlen);
3250#if BNYIELD
3251 if (bnYield && (y = bnYield()) < 0)
3252 goto yield;
3253#endif
3254 }
3255
3256 /* DONE! Just a little bit of cleanup... */
3257
3258 /*
3259 * Convert result out of Montgomery form... this is
3260 * just a Montgomery reduction.
3261 */
3262 lbnCopy_64(a, a1, mlen);
3263 lbnZero_64(a1, mlen);
3264 lbnMontReduce_64(a, mod, mlen, inv);
3265 lbnCopy_64(n, a1, mlen);
3266
3267 /* Clean up - free intermediate storage */
3268 y = 0;
3269#if BNYIELD
3270yield:
3271#endif
3272 LBNFREE(b, 2*mlen);
3273 LBNFREE(a, 2*mlen);
3274
3275 return y; /* Success */
3276}
3277
3278
3279/*
3280 * Returns a substring of the big-endian array of bytes representation
3281 * of the bignum array based on two parameters, the least significant
3282 * byte number (0 to start with the least significant byte) and the
3283 * length. I.e. the number returned is a representation of
3284 * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
3285 *
3286 * It is an error if the bignum is not at least buflen + lsbyte bytes
3287 * long.
3288 *
3289 * This code assumes that the compiler has the minimal intelligence
3290 * neded to optimize divides and modulo operations on an unsigned data
3291 * type with a power of two.
3292 */
3293void
3294lbnExtractBigBytes_64(BNWORD64 const *n, unsigned char *buf,
3295 unsigned lsbyte, unsigned buflen)
3296{
3297 BNWORD64 t = 0; /* Needed to shut up uninitialized var warnings */
3298 unsigned shift;
3299
3300 lsbyte += buflen;
3301
3302 shift = (8 * lsbyte) % 64;
3303 lsbyte /= (64/8); /* Convert to word offset */
3304 BIGLITTLE(n -= lsbyte, n += lsbyte);
3305
3306 if (shift)
3307 t = BIGLITTLE(n[-1],n[0]);
3308
3309 while (buflen--) {
3310 if (!shift) {
3311 t = BIGLITTLE(*n++,*--n);
3312 shift = 64;
3313 }
3314 shift -= 8;
3315 *buf++ = (unsigned char)(t>>shift);
3316 }
3317}
3318
3319/*
3320 * Merge a big-endian array of bytes into a bignum array.
3321 * The array had better be big enough. This is
3322 * equivalent to extracting the entire bignum into a
3323 * large byte array, copying the input buffer into the
3324 * middle of it, and converting back to a bignum.
3325 *
3326 * The buf is "len" bytes long, and its *last* byte is at
3327 * position "lsbyte" from the end of the bignum.
3328 *
3329 * Note that this is a pain to get right. Fortunately, it's hardly
3330 * critical for efficiency.
3331 */
3332void
3333lbnInsertBigBytes_64(BNWORD64 *n, unsigned char const *buf,
3334 unsigned lsbyte, unsigned buflen)
3335{
3336 BNWORD64 t = 0; /* Shut up uninitialized varibale warnings */
3337
3338 lsbyte += buflen;
3339
3340 BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
3341
3342 /* Load up leading odd bytes */
3343 if (lsbyte % (64/8)) {
3344 t = BIGLITTLE(*--n,*n++);
3345 t >>= (lsbyte * 8) % 64;
3346 }
3347
3348 /* The main loop - merge into t, storing at each word boundary. */
3349 while (buflen--) {
3350 t = (t << 8) | *buf++;
3351 if ((--lsbyte % (64/8)) == 0)
3352 BIGLITTLE(*n++,*--n) = t;
3353 }
3354
3355 /* Merge odd bytes in t into last word */
3356 lsbyte = (lsbyte * 8) % 64;
3357 if (lsbyte) {
3358 t <<= lsbyte;
3359 t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
3360 BIGLITTLE(n[0],n[-1]) = t;
3361 }
3362
3363 return;
3364}
3365
3366/*
3367 * Returns a substring of the little-endian array of bytes representation
3368 * of the bignum array based on two parameters, the least significant
3369 * byte number (0 to start with the least significant byte) and the
3370 * length. I.e. the number returned is a representation of
3371 * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
3372 *
3373 * It is an error if the bignum is not at least buflen + lsbyte bytes
3374 * long.
3375 *
3376 * This code assumes that the compiler has the minimal intelligence
3377 * neded to optimize divides and modulo operations on an unsigned data
3378 * type with a power of two.
3379 */
3380void
3381lbnExtractLittleBytes_64(BNWORD64 const *n, unsigned char *buf,
3382 unsigned lsbyte, unsigned buflen)
3383{
3384 BNWORD64 t = 0; /* Needed to shut up uninitialized var warnings */
3385
3386 BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
3387
3388 if (lsbyte % (64/8)) {
3389 t = BIGLITTLE(*--n,*n++);
3390 t >>= (lsbyte % (64/8)) * 8 ;
3391 }
3392
3393 while (buflen--) {
3394 if ((lsbyte++ % (64/8)) == 0)
3395 t = BIGLITTLE(*--n,*n++);
3396 *buf++ = (unsigned char)t;
3397 t >>= 8;
3398 }
3399}
3400
3401/*
3402 * Merge a little-endian array of bytes into a bignum array.
3403 * The array had better be big enough. This is
3404 * equivalent to extracting the entire bignum into a
3405 * large byte array, copying the input buffer into the
3406 * middle of it, and converting back to a bignum.
3407 *
3408 * The buf is "len" bytes long, and its first byte is at
3409 * position "lsbyte" from the end of the bignum.
3410 *
3411 * Note that this is a pain to get right. Fortunately, it's hardly
3412 * critical for efficiency.
3413 */
3414void
3415lbnInsertLittleBytes_64(BNWORD64 *n, unsigned char const *buf,
3416 unsigned lsbyte, unsigned buflen)
3417{
3418 BNWORD64 t = 0; /* Shut up uninitialized varibale warnings */
3419
3420 /* Move to most-significant end */
3421 lsbyte += buflen;
3422 buf += buflen;
3423
3424 BIGLITTLE(n -= lsbyte/(64/8), n += lsbyte/(64/8));
3425
3426 /* Load up leading odd bytes */
3427 if (lsbyte % (64/8)) {
3428 t = BIGLITTLE(*--n,*n++);
3429 t >>= (lsbyte * 8) % 64;
3430 }
3431
3432 /* The main loop - merge into t, storing at each word boundary. */
3433 while (buflen--) {
3434 t = (t << 8) | *--buf;
3435 if ((--lsbyte % (64/8)) == 0)
3436 BIGLITTLE(*n++,*--n) = t;
3437 }
3438
3439 /* Merge odd bytes in t into last word */
3440 lsbyte = (lsbyte * 8) % 64;
3441 if (lsbyte) {
3442 t <<= lsbyte;
3443 t |= (((BNWORD64)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
3444 BIGLITTLE(n[0],n[-1]) = t;
3445 }
3446
3447 return;
3448}
3449
3450#ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */
3451/*
3452 * Convert a big-endian array of bytes to a bignum.
3453 * Returns the number of words in the bignum.
3454 * Note the expression "64/8" for the number of bytes per word.
3455 * This is so the word-size adjustment will work.
3456 */
3457unsigned
3458lbnFromBytes_64(BNWORD64 *a, unsigned char const *b, unsigned blen)
3459{
3460 BNWORD64 t;
3461 unsigned alen = (blen + (64/8-1))/(64/8);
3462 BIGLITTLE(a -= alen, a += alen);
3463
3464 while (blen) {
3465 t = 0;
3466 do {
3467 t = t << 8 | *b++;
3468 } while (--blen & (64/8-1));
3469 BIGLITTLE(*a++,*--a) = t;
3470 }
3471 return alen;
3472}
3473#endif
3474
3475/*
3476 * Computes the GCD of a and b. Modifies both arguments; when it returns,
3477 * one of them is the GCD and the other is trash. The return value
3478 * indicates which: 0 for a, and 1 for b. The length of the retult is
3479 * returned in rlen. Both inputs must have one extra word of precision.
3480 * alen must be >= blen.
3481 *
3482 * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
3483 * This is based on taking out common powers of 2, then repeatedly:
3484 * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
3485 * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
3486 * It gets less reduction per step, but the steps are much faster than
3487 * the division case.
3488 */
3489int
3490lbnGcd_64(BNWORD64 *a, unsigned alen, BNWORD64 *b, unsigned blen,
3491 unsigned *rlen)
3492{
3493#if BNYIELD
3494 int y;
3495#endif
3496 assert(alen >= blen);
3497
3498 while (blen != 0) {
3499 (void)lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
3500 alen = lbnNorm_64(a, blen);
3501 if (alen == 0) {
3502 *rlen = blen;
3503 return 1;
3504 }
3505 (void)lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
3506 blen = lbnNorm_64(b, alen);
3507#if BNYIELD
3508 if (bnYield && (y = bnYield()) < 0)
3509 return y;
3510#endif
3511 }
3512 *rlen = alen;
3513 return 0;
3514}
3515
3516/*
3517 * Invert "a" modulo "mod" using the extended Euclidean algorithm.
3518 * Note that this only computes one of the cosequences, and uses the
3519 * theorem that the signs flip every step and the absolute value of
3520 * the cosequence values are always bounded by the modulus to avoid
3521 * having to work with negative numbers.
3522 * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
3523 * a must be one word longer than "mod". It is overwritten with the
3524 * result.
3525 * TODO: Use Richard Schroeppel's *much* faster algorithm.
3526 */
3527int
3528lbnInv_64(BNWORD64 *a, unsigned alen, BNWORD64 const *mod, unsigned mlen)
3529{
3530 BNWORD64 *b; /* Hold a copy of mod during GCD reduction */
3531 BNWORD64 *p; /* Temporary for products added to t0 and t1 */
3532 BNWORD64 *t0, *t1; /* Inverse accumulators */
3533 BNWORD64 cy;
3534 unsigned blen, t0len, t1len, plen;
3535 int y;
3536
3537 alen = lbnNorm_64(a, alen);
3538 if (!alen)
3539 return 1; /* No inverse */
3540
3541 mlen = lbnNorm_64(mod, mlen);
3542
3543 assert (alen <= mlen);
3544
3545 /* Inverse of 1 is 1 */
3546 if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
3547 lbnZero_64(BIGLITTLE(a-alen,a+alen), mlen-alen);
3548 return 0;
3549 }
3550
3551 /* Allocate a pile of space */
3552 LBNALLOC(b, BNWORD64, mlen+1);
3553 if (b) {
3554 /*
3555 * Although products are guaranteed to always be less than the
3556 * modulus, it can involve multiplying two 3-word numbers to
3557 * get a 5-word result, requiring a 6th word to store a 0
3558 * temporarily. Thus, mlen + 1.
3559 */
3560 LBNALLOC(p, BNWORD64, mlen+1);
3561 if (p) {
3562 LBNALLOC(t0, BNWORD64, mlen);
3563 if (t0) {
3564 LBNALLOC(t1, BNWORD64, mlen);
3565 if (t1)
3566 goto allocated;
3567 LBNFREE(t0, mlen);
3568 }
3569 LBNFREE(p, mlen+1);
3570 }
3571 LBNFREE(b, mlen+1);
3572 }
3573 return -1;
3574
3575allocated:
3576
3577 /* Set t0 to 1 */
3578 t0len = 1;
3579 BIGLITTLE(t0[-1],t0[0]) = 1;
3580
3581 /* b = mod */
3582 lbnCopy_64(b, mod, mlen);
3583 /* blen = mlen (implicitly) */
3584
3585 /* t1 = b / a; b = b % a */
3586 cy = lbnDiv_64(t1, b, mlen, a, alen);
3587 *(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
3588 t1len = lbnNorm_64(t1, mlen-alen+1);
3589 blen = lbnNorm_64(b, alen);
3590
3591 /* while (b > 1) */
3592 while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD64)1) {
3593 /* q = a / b; a = a % b; */
3594 if (alen < blen || (alen == blen && lbnCmp_64(a, a, alen) < 0))
3595 assert(0);
3596 cy = lbnDiv_64(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
3597 *(BIGLITTLE(a-alen-1,a+alen)) = cy;
3598 plen = lbnNorm_64(BIGLITTLE(a-blen,a+blen), alen-blen+1);
3599 assert(plen);
3600 alen = lbnNorm_64(a, blen);
3601 if (!alen)
3602 goto failure; /* GCD not 1 */
3603
3604 /* t0 += q * t1; */
3605 assert(plen+t1len <= mlen+1);
3606 lbnMul_64(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
3607 plen = lbnNorm_64(p, plen + t1len);
3608 assert(plen <= mlen);
3609 if (plen > t0len) {
3610 lbnZero_64(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
3611 t0len = plen;
3612 }
3613 cy = lbnAddN_64(t0, p, plen);
3614 if (cy) {
3615 if (t0len > plen) {
3616 cy = lbnAdd1_64(BIGLITTLE(t0-plen,t0+plen),
3617 t0len-plen, cy);
3618 }
3619 if (cy) {
3620 BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
3621 t0len++;
3622 }
3623 }
3624
3625 /* if (a <= 1) return a ? t0 : FAIL; */
3626 if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD64)1) {
3627 if (alen == 0)
3628 goto failure; /* FAIL */
3629 assert(t0len <= mlen);
3630 lbnCopy_64(a, t0, t0len);
3631 lbnZero_64(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
3632 goto success;
3633 }
3634
3635 /* q = b / a; b = b % a; */
3636 if (blen < alen || (blen == alen && lbnCmp_64(b, a, alen) < 0))
3637 assert(0);
3638 cy = lbnDiv_64(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
3639 *(BIGLITTLE(b-blen-1,b+blen)) = cy;
3640 plen = lbnNorm_64(BIGLITTLE(b-alen,b+alen), blen-alen+1);
3641 assert(plen);
3642 blen = lbnNorm_64(b, alen);
3643 if (!blen)
3644 goto failure; /* GCD not 1 */
3645
3646 /* t1 += q * t0; */
3647 assert(plen+t0len <= mlen+1);
3648 lbnMul_64(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
3649 plen = lbnNorm_64(p, plen + t0len);
3650 assert(plen <= mlen);
3651 if (plen > t1len) {
3652 lbnZero_64(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
3653 t1len = plen;
3654 }
3655 cy = lbnAddN_64(t1, p, plen);
3656 if (cy) {
3657 if (t1len > plen) {
3658 cy = lbnAdd1_64(BIGLITTLE(t1-plen,t0+plen),
3659 t1len-plen, cy);
3660 }
3661 if (cy) {
3662 BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
3663 t1len++;
3664 }
3665 }
3666#if BNYIELD
3667 if (bnYield && (y = bnYield() < 0))
3668 goto yield;
3669#endif
3670 }
3671
3672 if (!blen)
3673 goto failure; /* gcd(a, mod) != 1 -- FAIL */
3674
3675 /* return mod-t1 */
3676 lbnCopy_64(a, mod, mlen);
3677 assert(t1len <= mlen);
3678 cy = lbnSubN_64(a, t1, t1len);
3679 if (cy) {
3680 assert(mlen > t1len);
3681 cy = lbnSub1_64(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
3682 assert(!cy);
3683 }
3684
3685success:
3686 LBNFREE(t1, mlen);
3687 LBNFREE(t0, mlen);
3688 LBNFREE(p, mlen+1);
3689 LBNFREE(b, mlen+1);
3690
3691 return 0;
3692
3693failure: /* GCD is not 1 - no inverse exists! */
3694 y = 1;
3695#if BNYIELD
3696yield:
3697#endif
3698 LBNFREE(t1, mlen);
3699 LBNFREE(t0, mlen);
3700 LBNFREE(p, mlen+1);
3701 LBNFREE(b, mlen+1);
3702
3703 return y;
3704}
3705
3706/*
3707 * Precompute powers of "a" mod "mod". Compute them every "bits"
3708 * for "n" steps. This is sufficient to compute powers of g with
3709 * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
3710 *
3711 * This assumes that the caller has already initialized "array" to point
3712 * to "n" buffers of size "mlen".
3713 */
3714int
3715lbnBasePrecompBegin_64(BNWORD64 **array, unsigned n, unsigned bits,
3716 BNWORD64 const *g, unsigned glen, BNWORD64 *mod, unsigned mlen)
3717{
3718 BNWORD64 *a, *b; /* Temporary double-width accumulators */
3719 BNWORD64 *a1; /* Pointer to high half of a*/
3720 BNWORD64 inv; /* Montgomery inverse of LSW of mod */
3721 BNWORD64 *t;
3722 unsigned i;
3723
3724 glen = lbnNorm_64(g, glen);
3725 assert(glen);
3726
3727 assert (mlen == lbnNorm_64(mod, mlen));
3728 assert (glen <= mlen);
3729
3730 /* Allocate two temporary buffers, and the array slots */
3731 LBNALLOC(a, BNWORD64, mlen*2);
3732 if (!a)
3733 return -1;
3734 LBNALLOC(b, BNWORD64, mlen*2);
3735 if (!b) {
3736 LBNFREE(a, 2*mlen);
3737 return -1;
3738 }
3739
3740 /* Okay, all ready */
3741
3742 /* Convert n to Montgomery form */
3743 inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
3744 assert(inv & 1); /* Modulus must be odd */
3745 inv = lbnMontInv1_64(inv);
3746 /* Move g up "mlen" words into a (clearing the low mlen words) */
3747 a1 = BIGLITTLE(a-mlen,a+mlen);
3748 lbnCopy_64(a1, g, glen);
3749 lbnZero_64(a, mlen);
3750
3751 /* Do the division - dump the quotient into the high-order words */
3752 (void)lbnDiv_64(a1, a, mlen+glen, mod, mlen);
3753
3754 /* Copy the first value into the array */
3755 t = *array;
3756 lbnCopy_64(t, a, mlen);
3757 a1 = a; /* This first value is *not* shifted up */
3758
3759 /* Now compute the remaining n-1 array entries */
3760 assert(bits);
3761 assert(n);
3762 while (--n) {
3763 i = bits;
3764 do {
3765 /* Square a1 into b1 */
3766 lbnMontSquare_64(b, a1, mod, mlen, inv);
3767 t = b; b = a; a = t;
3768 a1 = BIGLITTLE(a-mlen, a+mlen);
3769 } while (--i);
3770 t = *++array;
3771 lbnCopy_64(t, a1, mlen);
3772 }
3773
3774 /* Hooray, we're done. */
3775 LBNFREE(b, 2*mlen);
3776 LBNFREE(a, 2*mlen);
3777 return 0;
3778}
3779
3780/*
3781 * result = base^exp (mod mod). "array" is a an array of pointers
3782 * to procomputed powers of base, each 2^bits apart. (I.e. array[i]
3783 * is base^(2^(i*bits))).
3784 *
3785 * The algorithm consists of:
3786 * a = b = (powers of g to be raised to the power 2^bits-1)
3787 * a *= b *= (powers of g to be raised to the power 2^bits-2)
3788 * ...
3789 * a *= b *= (powers of g to be raised to the power 1)
3790 *
3791 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
3792 */
3793int
3794lbnBasePrecompExp_64(BNWORD64 *result, BNWORD64 const * const *array,
3795 unsigned bits, BNWORD64 const *exp, unsigned elen,
3796 BNWORD64 const *mod, unsigned mlen)
3797{
3798 BNWORD64 *a, *b, *c, *t;
3799 BNWORD64 *a1, *b1;
3800 int anull, bnull; /* Null flags: values are implicitly 1 */
3801 unsigned i, j; /* Loop counters */
3802 unsigned mask; /* Exponent bits to examime */
3803 BNWORD64 const *eptr; /* Pointer into exp */
3804 BNWORD64 buf, curbits, nextword; /* Bit-buffer varaibles */
3805 BNWORD64 inv; /* Inverse of LSW of modulus */
3806 unsigned ewords; /* Words of exponent left */
3807 int bufbits; /* Number of valid bits */
3808 int y = 0;
3809
3810 mlen = lbnNorm_64(mod, mlen);
3811 assert (mlen);
3812
3813 elen = lbnNorm_64(exp, elen);
3814 if (!elen) {
3815 lbnZero_64(result, mlen);
3816 BIGLITTLE(result[-1],result[0]) = 1;
3817 return 0;
3818 }
3819 /*
3820 * This could be precomputed, but it's so cheap, and it would require
3821 * making the precomputation structure word-size dependent.
3822 */
3823 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
3824
3825 assert(elen);
3826
3827 /*
3828 * Allocate three temporary buffers. The current numbers generally
3829 * live in the upper halves of these buffers.
3830 */
3831 LBNALLOC(a, BNWORD64, mlen*2);
3832 if (a) {
3833 LBNALLOC(b, BNWORD64, mlen*2);
3834 if (b) {
3835 LBNALLOC(c, BNWORD64, mlen*2);
3836 if (c)
3837 goto allocated;
3838 LBNFREE(b, 2*mlen);
3839 }
3840 LBNFREE(a, 2*mlen);
3841 }
3842 return -1;
3843
3844allocated:
3845
3846 anull = bnull = 1;
3847
3848 mask = (1u<<bits) - 1;
3849 for (i = mask; i; --i) {
3850 /* Set up bit buffer for walking the exponent */
3851 eptr = exp;
3852 buf = BIGLITTLE(*--eptr, *eptr++);
3853 ewords = elen-1;
3854 bufbits = 64;
3855 for (j = 0; ewords || buf; j++) {
3856 /* Shift down current buffer */
3857 curbits = buf;
3858 buf >>= bits;
3859 /* If necessary, add next word */
3860 bufbits -= bits;
3861 if (bufbits < 0 && ewords > 0) {
3862 nextword = BIGLITTLE(*--eptr, *eptr++);
3863 ewords--;
3864 curbits |= nextword << (bufbits+bits);
3865 buf = nextword >> -bufbits;
3866 bufbits += 64;
3867 }
3868 /* If appropriate, multiply b *= array[j] */
3869 if ((curbits & mask) == i) {
3870 BNWORD64 const *d = array[j];
3871
3872 b1 = BIGLITTLE(b-mlen-1,b+mlen);
3873 if (bnull) {
3874 lbnCopy_64(b1, d, mlen);
3875 bnull = 0;
3876 } else {
3877 lbnMontMul_64(c, b1, d, mod, mlen, inv);
3878 t = c; c = b; b = t;
3879 }
3880#if BNYIELD
3881 if (bnYield && (y = bnYield() < 0))
3882 goto yield;
3883#endif
3884 }
3885 }
3886
3887 /* Multiply a *= b */
3888 if (!bnull) {
3889 a1 = BIGLITTLE(a-mlen-1,a+mlen);
3890 b1 = BIGLITTLE(b-mlen-1,b+mlen);
3891 if (anull) {
3892 lbnCopy_64(a1, b1, mlen);
3893 anull = 0;
3894 } else {
3895 lbnMontMul_64(c, a1, b1, mod, mlen, inv);
3896 t = c; c = a; a = t;
3897 }
3898 }
3899 }
3900
3901 assert(!anull); /* If it were, elen would have been 0 */
3902
3903 /* Convert out of Montgomery form and return */
3904 a1 = BIGLITTLE(a-mlen-1,a+mlen);
3905 lbnCopy_64(a, a1, mlen);
3906 lbnZero_64(a1, mlen);
3907 lbnMontReduce_64(a, mod, mlen, inv);
3908 lbnCopy_64(result, a1, mlen);
3909
3910#if BNYIELD
3911yield:
3912#endif
3913 LBNFREE(c, 2*mlen);
3914 LBNFREE(b, 2*mlen);
3915 LBNFREE(a, 2*mlen);
3916
3917 return y;
3918}
3919
3920/*
3921 * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
3922 * arrays of pointers to procomputed powers of the corresponding bases,
3923 * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
3924 *
3925 * Bits must be the same in both. (It could be made adjustable, but it's
3926 * a bit of a pain. Just make them both equal to the larger one.)
3927 *
3928 * The algorithm consists of:
3929 * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
3930 * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
3931 * ...
3932 * a *= b *= (powers of base1 and base2 to be raised to the power 1)
3933 *
3934 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
3935 */
3936int
3937lbnDoubleBasePrecompExp_64(BNWORD64 *result, unsigned bits,
3938 BNWORD64 const * const *array1, BNWORD64 const *exp1, unsigned elen1,
3939 BNWORD64 const * const *array2, BNWORD64 const *exp2,
3940 unsigned elen2, BNWORD64 const *mod, unsigned mlen)
3941{
3942 BNWORD64 *a, *b, *c, *t;
3943 BNWORD64 *a1, *b1;
3944 int anull, bnull; /* Null flags: values are implicitly 1 */
3945 unsigned i, j, k; /* Loop counters */
3946 unsigned mask; /* Exponent bits to examime */
3947 BNWORD64 const *eptr; /* Pointer into exp */
3948 BNWORD64 buf, curbits, nextword; /* Bit-buffer varaibles */
3949 BNWORD64 inv; /* Inverse of LSW of modulus */
3950 unsigned ewords; /* Words of exponent left */
3951 int bufbits; /* Number of valid bits */
3952 int y = 0;
3953 BNWORD64 const * const *array;
3954
3955 mlen = lbnNorm_64(mod, mlen);
3956 assert (mlen);
3957
3958 elen1 = lbnNorm_64(exp1, elen1);
3959 if (!elen1) {
3960 return lbnBasePrecompExp_64(result, array2, bits, exp2, elen2,
3961 mod, mlen);
3962 }
3963 elen2 = lbnNorm_64(exp2, elen2);
3964 if (!elen2) {
3965 return lbnBasePrecompExp_64(result, array1, bits, exp1, elen1,
3966 mod, mlen);
3967 }
3968 /*
3969 * This could be precomputed, but it's so cheap, and it would require
3970 * making the precomputation structure word-size dependent.
3971 */
3972 inv = lbnMontInv1_64(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
3973
3974 assert(elen1);
3975 assert(elen2);
3976
3977 /*
3978 * Allocate three temporary buffers. The current numbers generally
3979 * live in the upper halves of these buffers.
3980 */
3981 LBNALLOC(a, BNWORD64, mlen*2);
3982 if (a) {
3983 LBNALLOC(b, BNWORD64, mlen*2);
3984 if (b) {
3985 LBNALLOC(c, BNWORD64, mlen*2);
3986 if (c)
3987 goto allocated;
3988 LBNFREE(b, 2*mlen);
3989 }
3990 LBNFREE(a, 2*mlen);
3991 }
3992 return -1;
3993
3994allocated:
3995
3996 anull = bnull = 1;
3997
3998 mask = (1u<<bits) - 1;
3999 for (i = mask; i; --i) {
4000 /* Walk each exponent in turn */
4001 for (k = 0; k < 2; k++) {
4002 /* Set up the exponent for walking */
4003 array = k ? array2 : array1;
4004 eptr = k ? exp2 : exp1;
4005 ewords = (k ? elen2 : elen1) - 1;
4006 /* Set up bit buffer for walking the exponent */
4007 buf = BIGLITTLE(*--eptr, *eptr++);
4008 bufbits = 64;
4009 for (j = 0; ewords || buf; j++) {
4010 /* Shift down current buffer */
4011 curbits = buf;
4012 buf >>= bits;
4013 /* If necessary, add next word */
4014 bufbits -= bits;
4015 if (bufbits < 0 && ewords > 0) {
4016 nextword = BIGLITTLE(*--eptr, *eptr++);
4017 ewords--;
4018 curbits |= nextword << (bufbits+bits);
4019 buf = nextword >> -bufbits;
4020 bufbits += 64;
4021 }
4022 /* If appropriate, multiply b *= array[j] */
4023 if ((curbits & mask) == i) {
4024 BNWORD64 const *d = array[j];
4025
4026 b1 = BIGLITTLE(b-mlen-1,b+mlen);
4027 if (bnull) {
4028 lbnCopy_64(b1, d, mlen);
4029 bnull = 0;
4030 } else {
4031 lbnMontMul_64(c, b1, d, mod, mlen, inv);
4032 t = c; c = b; b = t;
4033 }
4034#if BNYIELD
4035 if (bnYield && (y = bnYield() < 0))
4036 goto yield;
4037#endif
4038 }
4039 }
4040 }
4041
4042 /* Multiply a *= b */
4043 if (!bnull) {
4044 a1 = BIGLITTLE(a-mlen-1,a+mlen);
4045 b1 = BIGLITTLE(b-mlen-1,b+mlen);
4046 if (anull) {
4047 lbnCopy_64(a1, b1, mlen);
4048 anull = 0;
4049 } else {
4050 lbnMontMul_64(c, a1, b1, mod, mlen, inv);
4051 t = c; c = a; a = t;
4052 }
4053 }
4054 }
4055
4056 assert(!anull); /* If it were, elen would have been 0 */
4057
4058 /* Convert out of Montgomery form and return */
4059 a1 = BIGLITTLE(a-mlen-1,a+mlen);
4060 lbnCopy_64(a, a1, mlen);
4061 lbnZero_64(a1, mlen);
4062 lbnMontReduce_64(a, mod, mlen, inv);
4063 lbnCopy_64(result, a1, mlen);
4064
4065#if BNYIELD
4066yield:
4067#endif
4068 LBNFREE(c, 2*mlen);
4069 LBNFREE(b, 2*mlen);
4070 LBNFREE(a, 2*mlen);
4071
4072 return y;
4073}