Blame - jni/libzrtp/sources/bnlib/lbn32.c - jami-client-android

blob: 831b25cb8881e5bc0340ad8e570650358c70e6db [file] [log] [blame]

Alexandre Lision	7fd5d3d	2013-12-04 13:06:40 -0500	[diff] [blame]	1	/*
				2	* lbn32.c - Low-level bignum routines, 32-bit version.
				3	*
				4	* Copyright (c) 1995 Colin Plumb. All rights reserved.
				5	* For licensing and other legal details, see the file legal.c.
				6	*
				7	* NOTE: the magic constants "32" and "64" appear in many places in this
				8	* file, including inside identifiers. Because it is not possible to
				9	* ask "#ifdef" of a macro expansion, it is not possible to use the
				10	* preprocessor to conditionalize these properly. Thus, this file is
				11	* intended to be edited with textual search and replace to produce
				12	* alternate word size versions. Any reference to the number of bits
				13	* in a word must be the string "32", and that string must not appear
				14	* otherwise. Any reference to twice this number must appear as "64",
				15	* which likewise must not appear otherwise. Is that clear?
				16	*
				17	* Remember, when doubling the bit size replace the larger number (64)
				18	* first, then the smaller (32). When halving the bit size, do the
				19	* opposite. Otherwise, things will get wierd. Also, be sure to replace
				20	* every instance that appears. (:%s/foo/bar/g in vi)
				21	*
				22	* These routines work with a pointer to the least-significant end of
				23	* an array of WORD32s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
				24	* defined in lbn.h (which expand to x on a big-edian machine and y on a
				25	* little-endian machine) are used to conditionalize the code to work
				26	* either way. If you have no assembly primitives, it doesn't matter.
				27	* Note that on a big-endian machine, the least-significant-end pointer
				28	* is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
				29	* On little-endian, they are ptr[0] through ptr[len-1]. This makes
				30	* perfect sense if you consider pointers to point between bytes rather
				31	* than at them.
				32	*
				33	* Because the array index values are unsigned integers, ptr[-i]
				34	* may not work properly, since the index -i is evaluated as an unsigned,
				35	* and if pointers are wider, zero-extension will produce a positive
				36	* number rahter than the needed negative. The expression used in this
				37	* code, *(ptr-i) will, however, work. (The array syntax is equivalent
				38	* to *(ptr+-i), which is a pretty subtle difference.)
				39	*
				40	* Many of these routines will get very unhappy if fed zero-length inputs.
				41	* They use assert() to enforce this. An higher layer of code must make
				42	* sure that these aren't called with zero-length inputs.
				43	*
				44	* Any of these routines can be replaced with more efficient versions
				45	* elsewhere, by just #defining their names. If one of the names
				46	* is #defined, the C code is not compiled in and no declaration is
				47	* made. Use the BNINCLUDE file to do that. Typically, you compile
				48	* asm subroutines with the same name and just, e.g.
				49	* #define lbnMulAdd1_32 lbnMulAdd1_32
				50	*
				51	* If you want to write asm routines, start with lbnMulAdd1_32().
				52	* This is the workhorse of modular exponentiation. lbnMulN1_32() is
				53	* also used a fair bit, although not as much and it's defined in terms
				54	* of lbnMulAdd1_32 if that has a custom version. lbnMulSub1_32 and
				55	* lbnDiv21_32 are used in the usual division and remainder finding.
				56	* (Not the Montgomery reduction used in modular exponentiation, though.)
				57	* Once you have lbnMulAdd1_32 defined, writing the other two should
				58	* be pretty easy. (Just make sure you get the sign of the subtraction
				59	* in lbnMulSub1_32 right - it's dest = dest - source * k.)
				60	*
				61	* The only definitions that absolutely need a double-word (BNWORD64)
				62	* type are lbnMulAdd1_32 and lbnMulSub1_32; if those are provided,
				63	* the rest follows. lbnDiv21_32, however, is a lot slower unless you
				64	* have them, and lbnModQ_32 takes after it. That one is used quite a
				65	* bit for prime sieving.
				66	*/
				67
				68	#ifndef HAVE_CONFIG_H
				69	#define HAVE_CONFIG_H 0
				70	#endif
				71	#if HAVE_CONFIG_H
Alexandre Lision	ddd731e	2014-01-31 11:50:08 -0500	[diff] [blame]	72	#include "bnconfig.h"
Alexandre Lision	7fd5d3d	2013-12-04 13:06:40 -0500	[diff] [blame]	73	#endif
				74
				75	/*
				76	* Some compilers complain about #if FOO if FOO isn't defined,
				77	* so do the ANSI-mandated thing explicitly...
				78	*/
				79	#ifndef NO_ASSERT_H
				80	#define NO_ASSERT_H 0
				81	#endif
				82	#ifndef NO_STRING_H
				83	#define NO_STRING_H 0
				84	#endif
				85	#ifndef HAVE_STRINGS_H
				86	#define HAVE_STRINGS_H 0
				87	#endif
				88	#ifndef NEED_MEMORY_H
				89	#define NEED_MEMORY_H 0
				90	#endif
				91
				92	#if !NO_ASSERT_H
				93	#include <assert.h>
				94	#else
				95	#define assert(x) (void)0
				96	#endif
				97
				98	#if !NO_STRING_H
				99	#include <string.h> /* For memcpy */
				100	#elif HAVE_STRINGS_H
				101	#include <strings.h>
				102	#endif
				103	#if NEED_MEMORY_H
				104	#include <memory.h>
				105	#endif
				106
				107	#include "lbn.h"
				108	#include "lbn32.h"
				109	#include "lbnmem.h"
				110
				111	#include "kludge.h"
				112
				113	#ifndef BNWORD32
				114	#error 32-bit bignum library requires a 32-bit data type
				115	#endif
				116
				117	/* If this is defined, include bnYield() calls */
				118	#if BNYIELD
				119	extern int (bnYield)(void); / From bn.c */
				120	#endif
				121
				122	/*
				123	* Most of the multiply (and Montgomery reduce) routines use an outer
				124	* loop that iterates over one of the operands - a so-called operand
				125	* scanning approach. One big advantage of this is that the assembly
				126	* support routines are simpler. The loops can be rearranged to have
				127	* an outer loop that iterates over the product, a so-called product
				128	* scanning approach. This has the advantage of writing less data
				129	* and doing fewer adds to memory, so is supposedly faster. Some
				130	* code has been written using a product-scanning approach, but
				131	* it appears to be slower, so it is turned off by default. Some
				132	* experimentation would be appreciated.
				133	*
				134	* (The code is also annoying to get right and not very well commented,
				135	* one of my pet peeves about math libraries. I'm sorry.)
				136	*/
				137	#ifndef PRODUCT_SCAN
				138	#define PRODUCT_SCAN 0
				139	#endif
				140
				141	/*
				142	* Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
				143	* This is a good example of how the byte offsets and BIGLITTLE() macros work.
				144	* Another alternative would have been
				145	* memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD32)), but I find that
				146	* putting operators into conditional macros is confusing.
				147	*/
				148	#ifndef lbnCopy_32
				149	void
				150	lbnCopy_32(BNWORD32 dest, BNWORD32 const src, unsigned len)
				151	{
				152	memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
				153	len * sizeof(*src));
				154	}
				155	#endif /* !lbnCopy_32 */
				156
				157	/*
				158	* Fill n words with zero. This does it manually rather than calling
				159	* memset because it can assume alignment to make things faster while
				160	* memset can't. Note how big-endian numbers are naturally addressed
				161	* using predecrement, while little-endian is postincrement.
				162	*/
				163	#ifndef lbnZero_32
				164	void
				165	lbnZero_32(BNWORD32 *num, unsigned len)
				166	{
				167	while (len--)
				168	BIGLITTLE(--num,num++) = 0;
				169	}
				170	#endif /* !lbnZero_32 */
				171
				172	/*
				173	* Negate an array of words.
				174	* Negation is subtraction from zero. Negating low-order words
				175	* entails doing nothing until a non-zero word is hit. Once that
				176	* is negated, a borrow is generated and never dies until the end
				177	* of the number is hit. Negation with borrow, -x-1, is the same as ~x.
				178	* Repeat that until the end of the number.
				179	*
				180	* Doesn't return borrow out because that's pretty useless - it's
				181	* always set unless the input is 0, which is easy to notice in
				182	* normalized form.
				183	*/
				184	#ifndef lbnNeg_32
				185	void
				186	lbnNeg_32(BNWORD32 *num, unsigned len)
				187	{
				188	assert(len);
				189
				190	/* Skip low-order zero words */
				191	while (BIGLITTLE(--num,num) == 0) {
				192	if (!--len)
				193	return;
				194	LITTLE(num++;)
				195	}
				196	/* Negate the lowest-order non-zero word */
				197	num = -num;
				198	/* Complement all the higher-order words */
				199	while (--len) {
				200	BIGLITTLE(--num,++num);
				201	num = ~num;
				202	}
				203	}
				204	#endif /* !lbnNeg_32 */
				205
				206
				207	/*
				208	* lbnAdd1_32: add the single-word "carry" to the given number.
				209	* Used for minor increments and propagating the carry after
				210	* adding in a shorter bignum.
				211	*
				212	* Technique: If we have a double-width word, presumably the compiler
				213	* can add using its carry in inline code, so we just use a larger
				214	* accumulator to compute the carry from the first addition.
				215	* If not, it's more complex. After adding the first carry, which may
				216	* be > 1, compare the sum and the carry. If the sum wraps (causing a
				217	* carry out from the addition), the result will be less than each of the
				218	* inputs, since the wrap subtracts a number (2^32) which is larger than
				219	* the other input can possibly be. If the sum is >= the carry input,
				220	* return success immediately.
				221	* In either case, if there is a carry, enter a loop incrementing words
				222	* until one does not wrap. Since we are adding 1 each time, the wrap
				223	* will be to 0 and we can test for equality.
				224	*/
				225	#ifndef lbnAdd1_32 /* If defined, it's provided as an asm subroutine */
				226	#ifdef BNWORD64
				227	BNWORD32
				228	lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
				229	{
				230	BNWORD64 t;
				231	assert(len > 0); /* Alternative: if (!len) return carry */
				232
				233	t = (BNWORD64)BIGLITTLE(--num,num) + carry;
				234	BIGLITTLE(num,num++) = (BNWORD32)t;
				235	if ((t >> 32) == 0)
				236	return 0;
				237	while (--len) {
				238	if (++BIGLITTLE(--num,num++) != 0)
				239	return 0;
				240	}
				241	return 1;
				242	}
				243	#else /* no BNWORD64 */
				244	BNWORD32
				245	lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
				246	{
				247	assert(len > 0); /* Alternative: if (!len) return carry */
				248
				249	if ((BIGLITTLE(--num,num++) += carry) >= carry)
				250	return 0;
				251	while (--len) {
				252	if (++BIGLITTLE(--num,num++) != 0)
				253	return 0;
				254	}
				255	return 1;
				256	}
				257	#endif
				258	#endif/* !lbnAdd1_32 */
				259
				260	/*
				261	* lbnSub1_32: subtract the single-word "borrow" from the given number.
				262	* Used for minor decrements and propagating the borrow after
				263	* subtracting a shorter bignum.
				264	*
				265	* Technique: Similar to the add, above. If there is a double-length type,
				266	* use that to generate the first borrow.
				267	* If not, after subtracting the first borrow, which may be > 1, compare
				268	* the difference and the negative of the carry. If the subtract wraps
				269	* (causing a borrow out from the subtraction), the result will be at least
				270	* as large as -borrow. If the result < -borrow, then no borrow out has
				271	* appeared and we may return immediately, except when borrow == 0. To
				272	* deal with that case, use the identity that -x = ~x+1, and instead of
				273	* comparing < -borrow, compare for <= ~borrow.
				274	* Either way, if there is a borrow out, enter a loop decrementing words
				275	* until a non-zero word is reached.
				276	*
				277	* Note the cast of ~borrow to (BNWORD32). If the size of an int is larger
				278	* than BNWORD32, C rules say the number is expanded for the arithmetic, so
				279	* the inversion will be done on an int and the value won't be quite what
				280	* is expected.
				281	*/
				282	#ifndef lbnSub1_32 /* If defined, it's provided as an asm subroutine */
				283	#ifdef BNWORD64
				284	BNWORD32
				285	lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
				286	{
				287	BNWORD64 t;
				288	assert(len > 0); /* Alternative: if (!len) return borrow */
				289
				290	t = (BNWORD64)BIGLITTLE(--num,num) - borrow;
				291	BIGLITTLE(num,num++) = (BNWORD32)t;
				292	if ((t >> 32) == 0)
				293	return 0;
				294	while (--len) {
				295	if ((BIGLITTLE(--num,num++))-- != 0)
				296	return 0;
				297	}
				298	return 1;
				299	}
				300	#else /* no BNWORD64 */
				301	BNWORD32
				302	lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
				303	{
				304	assert(len > 0); /* Alternative: if (!len) return borrow */
				305
				306	if ((BIGLITTLE(--num,num++) -= borrow) <= (BNWORD32)~borrow)
				307	return 0;
				308	while (--len) {
				309	if ((BIGLITTLE(--num,num++))-- != 0)
				310	return 0;
				311	}
				312	return 1;
				313	}
				314	#endif
				315	#endif /* !lbnSub1_32 */
				316
				317	/*
				318	* lbnAddN_32: add two bignums of the same length, returning the carry (0 or 1).
				319	* One of the building blocks, along with lbnAdd1, of adding two bignums of
				320	* differing lengths.
				321	*
				322	* Technique: Maintain a word of carry. If there is no double-width type,
				323	* use the same technique as in lbnAdd1, above, to maintain the carry by
				324	* comparing the inputs. Adding the carry sources is used as an OR operator;
				325	* at most one of the two comparisons can possibly be true. The first can
				326	* only be true if carry == 1 and x, the result, is 0. In that case the
				327	* second can't possibly be true.
				328	*/
				329	#ifndef lbnAddN_32
				330	#ifdef BNWORD64
				331	BNWORD32
				332	lbnAddN_32(BNWORD32 num1, BNWORD32 const num2, unsigned len)
				333	{
				334	BNWORD64 t;
				335
				336	assert(len > 0);
				337
				338	t = (BNWORD64)BIGLITTLE(--num1,num1) + BIGLITTLE(--num2,num2++);
				339	BIGLITTLE(num1,num1++) = (BNWORD32)t;
				340	while (--len) {
				341	t = (BNWORD64)BIGLITTLE(--num1,num1) +
				342	(BNWORD64)BIGLITTLE(--num2,num2++) + (t >> 32);
				343	BIGLITTLE(num1,num1++) = (BNWORD32)t;
				344	}
				345
				346	return (BNWORD32)(t>>32);
				347	}
				348	#else /* no BNWORD64 */
				349	BNWORD32
				350	lbnAddN_32(BNWORD32 num1, BNWORD32 const num2, unsigned len)
				351	{
				352	BNWORD32 x, carry = 0;
				353
				354	assert(len > 0); /* Alternative: change loop to test at start */
				355
				356	do {
				357	x = BIGLITTLE(--num2,num2++);
				358	carry = (x += carry) < carry;
				359	carry += (BIGLITTLE(--num1,num1++) += x) < x;
				360	} while (--len);
				361
				362	return carry;
				363	}
				364	#endif
				365	#endif /* !lbnAddN_32 */
				366
				367	/*
				368	* lbnSubN_32: add two bignums of the same length, returning the carry (0 or 1).
				369	* One of the building blocks, along with subn1, of subtracting two bignums of
				370	* differing lengths.
				371	*
				372	* Technique: If no double-width type is availble, maintain a word of borrow.
				373	* First, add the borrow to the subtrahend (did you have to learn all those
				374	* awful words in elementary school, too?), and if it overflows, set the
				375	* borrow again. Then subtract the modified subtrahend from the next word
				376	* of input, using the same technique as in subn1, above.
				377	* Adding the borrows is used as an OR operator; at most one of the two
				378	* comparisons can possibly be true. The first can only be true if
				379	* borrow == 1 and x, the result, is 0. In that case the second can't
				380	* possibly be true.
				381	*
				382	* In the double-word case, (BNWORD32)-(t>>32) is subtracted, rather than
				383	* adding t>>32, because the shift would need to sign-extend and that's
				384	* not guaranteed to happen in ANSI C, even with signed types.
				385	*/
				386	#ifndef lbnSubN_32
				387	#ifdef BNWORD64
				388	BNWORD32
				389	lbnSubN_32(BNWORD32 num1, BNWORD32 const num2, unsigned len)
				390	{
				391	BNWORD64 t;
				392
				393	assert(len > 0);
				394
				395	t = (BNWORD64)BIGLITTLE(--num1,num1) - BIGLITTLE(--num2,num2++);
				396	BIGLITTLE(num1,num1++) = (BNWORD32)t;
				397
				398	while (--len) {
				399	t = (BNWORD64)BIGLITTLE(--num1,num1) -
				400	(BNWORD64)BIGLITTLE(--num2,num2++) - (BNWORD32)-(t >> 32);
				401	BIGLITTLE(num1,num1++) = (BNWORD32)t;
				402	}
				403
				404	return -(BNWORD32)(t>>32);
				405	}
				406	#else
				407	BNWORD32
				408	lbnSubN_32(BNWORD32 num1, BNWORD32 const num2, unsigned len)
				409	{
				410	BNWORD32 x, borrow = 0;
				411
				412	assert(len > 0); /* Alternative: change loop to test at start */
				413
				414	do {
				415	x = BIGLITTLE(--num2,num2++);
				416	borrow = (x += borrow) < borrow;
				417	borrow += (BIGLITTLE(--num1,num1++) -= x) > (BNWORD32)~x;
				418	} while (--len);
				419
				420	return borrow;
				421	}
				422	#endif
				423	#endif /* !lbnSubN_32 */
				424
				425	#ifndef lbnCmp_32
				426	/*
				427	* lbnCmp_32: compare two bignums of equal length, returning the sign of
				428	* num1 - num2. (-1, 0 or +1).
				429	*
				430	* Technique: Change the little-endian pointers to big-endian pointers
				431	* and compare from the most-significant end until a difference if found.
				432	* When it is, figure out the sign of the difference and return it.
				433	*/
				434	int
				435	lbnCmp_32(BNWORD32 const num1, BNWORD32 const num2, unsigned len)
				436	{
				437	BIGLITTLE(num1 -= len, num1 += len);
				438	BIGLITTLE(num2 -= len, num2 += len);
				439
				440	while (len--) {
				441	if (BIGLITTLE(num1++ != num2++, --num1 != --num2)) {
				442	if (BIGLITTLE(num1[-1] < num2[-1], num1 < num2))
				443	return -1;
				444	else
				445	return 1;
				446	}
				447	}
				448	return 0;
				449	}
				450	#endif /* !lbnCmp_32 */
				451
				452	/*
				453	* mul32_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
				454	* computes (ph,pl) = x * y + a + b. mul32_ppmma and mul32_ppmm
				455	* are simpler versions. If you want to be lazy, all of these
				456	* can be defined in terms of the others, so here we create any
				457	* that have not been defined in terms of the ones that have been.
				458	*/
				459
				460	/* Define ones with fewer a's in terms of ones with more a's */
				461	#if !defined(mul32_ppmma) && defined(mul32_ppmmaa)
				462	#define mul32_ppmma(ph,pl,x,y,a) mul32_ppmmaa(ph,pl,x,y,a,0)
				463	#endif
				464
				465	#if !defined(mul32_ppmm) && defined(mul32_ppmma)
				466	#define mul32_ppmm(ph,pl,x,y) mul32_ppmma(ph,pl,x,y,0)
				467	#endif
				468
				469	/*
				470	* Use this definition to test the mul32_ppmm-based operations on machines
				471	* that do not provide mul32_ppmm. Change the final "0" to a "1" to
				472	* enable it.
				473	*/
				474	#if !defined(mul32_ppmm) && defined(BNWORD64) && 0 /* Debugging */
				475	#define mul32_ppmm(ph,pl,x,y) \
				476	({BNWORD64 _ = (BNWORD64)(x)*(y); (pl) = _; (ph) = _>>32;})
				477	#endif
				478
				479	#if defined(mul32_ppmm) && !defined(mul32_ppmma)
				480	#define mul32_ppmma(ph,pl,x,y,a) \
				481	(mul32_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
				482	#endif
				483
				484	#if defined(mul32_ppmma) && !defined(mul32_ppmmaa)
				485	#define mul32_ppmmaa(ph,pl,x,y,a,b) \
				486	(mul32_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
				487	#endif
				488
				489	/*
				490	* lbnMulN1_32: Multiply an n-word input by a 1-word input and store the
				491	* n+1-word product. This uses either the mul32_ppmm and mul32_ppmma
				492	* macros, or C multiplication with the BNWORD64 type. This uses mul32_ppmma
				493	* if available, assuming you won't bother defining it unless you can do
				494	* better than the normal multiplication.
				495	*/
				496	#ifndef lbnMulN1_32
				497	#ifdef lbnMulAdd1_32 /* If we have this asm primitive, use it. */
				498	void
				499	lbnMulN1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				500	{
				501	lbnZero_32(out, len);
				502	BIGLITTLE((out-len-1),(out+len)) = lbnMulAdd1_32(out, in, len, k);
				503	}
				504	#elif defined(mul32_ppmm)
				505	void
				506	lbnMulN1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				507	{
				508	BNWORD32 carry, carryin;
				509
				510	assert(len > 0);
				511
				512	BIG(--out;--in;);
				513	mul32_ppmm(carry, out, in, k);
				514	LITTLE(out++;in++;)
				515
				516	while (--len) {
				517	BIG(--out;--in;)
				518	carryin = carry;
				519	mul32_ppmma(carry, out, in, k, carryin);
				520	LITTLE(out++;in++;)
				521	}
				522	BIGLITTLE(--out,out) = carry;
				523	}
				524	#elif defined(BNWORD64)
				525	void
				526	lbnMulN1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				527	{
				528	BNWORD64 p;
				529
				530	assert(len > 0);
				531
				532	p = (BNWORD64)BIGLITTLE(--in,in++) * k;
				533	BIGLITTLE(--out,out++) = (BNWORD32)p;
				534
				535	while (--len) {
				536	p = (BNWORD64)BIGLITTLE(--in,in++) * k + (BNWORD32)(p >> 32);
				537	BIGLITTLE(--out,out++) = (BNWORD32)p;
				538	}
				539	BIGLITTLE(--out,out) = (BNWORD32)(p >> 32);
				540	}
				541	#else
				542	#error No 32x32 -> 64 multiply available for 32-bit bignum package
				543	#endif
				544	#endif /* lbnMulN1_32 */
				545
				546	/*
				547	* lbnMulAdd1_32: Multiply an n-word input by a 1-word input and add the
				548	* low n words of the product to the destination. *Returns the n+1st word
				549	* of the product.* (That turns out to be more convenient than adding
				550	* it into the destination and dealing with a possible unit carry out
				551	* of that.) This uses either the mul32_ppmma and mul32_ppmmaa macros,
				552	* or C multiplication with the BNWORD64 type.
				553	*
				554	* If you're going to write assembly primitives, this is the one to
				555	* start with. It is by far the most commonly called function.
				556	*/
				557	#ifndef lbnMulAdd1_32
				558	#if defined(mul32_ppmm)
				559	BNWORD32
				560	lbnMulAdd1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				561	{
				562	BNWORD32 prod, carry, carryin;
				563
				564	assert(len > 0);
				565
				566	BIG(--out;--in;);
				567	carryin = *out;
				568	mul32_ppmma(carry, out, in, k, carryin);
				569	LITTLE(out++;in++;)
				570
				571	while (--len) {
				572	BIG(--out;--in;);
				573	carryin = carry;
				574	mul32_ppmmaa(carry, prod, in, k, carryin, out);
				575	*out = prod;
				576	LITTLE(out++;in++;)
				577	}
				578
				579	return carry;
				580	}
				581	#elif defined(BNWORD64)
				582	BNWORD32
				583	lbnMulAdd1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				584	{
				585	BNWORD64 p;
				586
				587	assert(len > 0);
				588
				589	p = (BNWORD64)BIGLITTLE(--in,in++) * k + BIGLITTLE(--out,out);
				590	BIGLITTLE(out,out++) = (BNWORD32)p;
				591
				592	while (--len) {
				593	p = (BNWORD64)BIGLITTLE(--in,in++) * k +
				594	(BNWORD32)(p >> 32) + BIGLITTLE(--out,out);
				595	BIGLITTLE(out,out++) = (BNWORD32)p;
				596	}
				597
				598	return (BNWORD32)(p >> 32);
				599	}
				600	#else
				601	#error No 32x32 -> 64 multiply available for 32-bit bignum package
				602	#endif
				603	#endif /* lbnMulAdd1_32 */
				604
				605	/*
				606	* lbnMulSub1_32: Multiply an n-word input by a 1-word input and subtract the
				607	* n-word product from the destination. Returns the n+1st word of the product.
				608	* This uses either the mul32_ppmm and mul32_ppmma macros, or
				609	* C multiplication with the BNWORD64 type.
				610	*
				611	* This is rather uglier than adding, but fortunately it's only used in
				612	* division which is not used too heavily.
				613	*/
				614	#ifndef lbnMulSub1_32
				615	#if defined(mul32_ppmm)
				616	BNWORD32
				617	lbnMulSub1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				618	{
				619	BNWORD32 prod, carry, carryin;
				620
				621	assert(len > 0);
				622
				623	BIG(--in;)
				624	mul32_ppmm(carry, prod, *in, k);
				625	LITTLE(in++;)
				626	carry += (BIGLITTLE(--out,out++) -= prod) > (BNWORD32)~prod;
				627
				628	while (--len) {
				629	BIG(--in;);
				630	carryin = carry;
				631	mul32_ppmma(carry, prod, *in, k, carryin);
				632	LITTLE(in++;)
				633	carry += (BIGLITTLE(--out,out++) -= prod) > (BNWORD32)~prod;
				634	}
				635
				636	return carry;
				637	}
				638	#elif defined(BNWORD64)
				639	BNWORD32
				640	lbnMulSub1_32(BNWORD32 out, BNWORD32 const in, unsigned len, BNWORD32 k)
				641	{
				642	BNWORD64 p;
				643	BNWORD32 carry, t;
				644
				645	assert(len > 0);
				646
				647	p = (BNWORD64)BIGLITTLE(--in,in++) * k;
				648	t = BIGLITTLE(--out,out);
				649	carry = (BNWORD32)(p>>32) + ((BIGLITTLE(out,out++)=t-(BNWORD32)p) > t);
				650
				651	while (--len) {
				652	p = (BNWORD64)BIGLITTLE(--in,in++) * k + carry;
				653	t = BIGLITTLE(--out,out);
				654	carry = (BNWORD32)(p>>32) +
				655	( (BIGLITTLE(out,out++)=t-(BNWORD32)p) > t );
				656	}
				657
				658	return carry;
				659	}
				660	#else
				661	#error No 32x32 -> 64 multiply available for 32-bit bignum package
				662	#endif
				663	#endif /* !lbnMulSub1_32 */
				664
				665	/*
				666	* Shift n words left "shift" bits. 0 < shift < 32. Returns the
				667	* carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
				668	*/
				669	#ifndef lbnLshift_32
				670	BNWORD32
				671	lbnLshift_32(BNWORD32 *num, unsigned len, unsigned shift)
				672	{
				673	BNWORD32 x, carry;
				674
				675	assert(shift > 0);
				676	assert(shift < 32);
				677
				678	carry = 0;
				679	while (len--) {
				680	BIG(--num;)
				681	x = *num;
				682	*num = (x<<shift) \| carry;
				683	LITTLE(num++;)
				684	carry = x >> (32-shift);
				685	}
				686	return carry;
				687	}
				688	#endif /* !lbnLshift_32 */
				689
				690	/*
				691	* An optimized version of the above, for shifts of 1.
				692	* Some machines can use add-with-carry tricks for this.
				693	*/
				694	#ifndef lbnDouble_32
				695	BNWORD32
				696	lbnDouble_32(BNWORD32 *num, unsigned len)
				697	{
				698	BNWORD32 x, carry;
				699
				700	carry = 0;
				701	while (len--) {
				702	BIG(--num;)
				703	x = *num;
				704	*num = (x<<1) \| carry;
				705	LITTLE(num++;)
				706	carry = x >> (32-1);
				707	}
				708	return carry;
				709	}
				710	#endif /* !lbnDouble_32 */
				711
				712	/*
				713	* Shift n words right "shift" bits. 0 < shift < 32. Returns the
				714	* carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
				715	*/
				716	#ifndef lbnRshift_32
				717	BNWORD32
				718	lbnRshift_32(BNWORD32 *num, unsigned len, unsigned shift)
				719	{
				720	BNWORD32 x, carry = 0;
				721
				722	assert(shift > 0);
				723	assert(shift < 32);
				724
				725	BIGLITTLE(num -= len, num += len);
				726
				727	while (len--) {
				728	LITTLE(--num;)
				729	x = *num;
				730	*num = (x>>shift) \| carry;
				731	BIG(num++;)
				732	carry = x << (32-shift);
				733	}
				734	return carry >> (32-shift);
				735	}
				736	#endif /* !lbnRshift_32 */
				737
				738	/*
				739	* Multiply two numbers of the given lengths. prod and num2 may overlap,
				740	* provided that the low len1 bits of prod are free. (This corresponds
				741	* nicely to the place the result is returned from lbnMontReduce_32.)
				742	*
				743	* TODO: Use Karatsuba multiply. The overlap constraints may have
				744	* to get rewhacked.
				745	*/
				746	#ifndef lbnMul_32
				747	void
				748	lbnMul_32(BNWORD32 prod, BNWORD32 const num1, unsigned len1,
				749	BNWORD32 const *num2, unsigned len2)
				750	{
				751	/* Special case of zero */
				752	if (!len1 \|\| !len2) {
				753	lbnZero_32(prod, len1+len2);
				754	return;
				755	}
				756
				757	/* Multiply first word */
				758	lbnMulN1_32(prod, num1, len1, BIGLITTLE(--num2,num2++));
				759
				760	/*
				761	* Add in subsequent words, storing the most significant word,
				762	* which is new each time.
				763	*/
				764	while (--len2) {
				765	BIGLITTLE(--prod,prod++);
				766	BIGLITTLE((prod-len1-1),(prod+len1)) =
				767	lbnMulAdd1_32(prod, num1, len1, BIGLITTLE(--num2,num2++));
				768	}
				769	}
				770	#endif /* !lbnMul_32 */
				771
				772	/*
				773	* lbnMulX_32 is a square multiply - both inputs are the same length.
				774	* It's normally just a macro wrapper around the general multiply,
				775	* but might be implementable in assembly more efficiently (such as
				776	* when product scanning).
				777	*/
				778	#ifndef lbnMulX_32
				779	#if defined(BNWORD64) && PRODUCT_SCAN
				780	/*
				781	* Test code to see whether product scanning is any faster. It seems
				782	* to make the C code slower, so PRODUCT_SCAN is not defined.
				783	*/
				784	static void
				785	lbnMulX_32(BNWORD32 prod, BNWORD32 const num1, BNWORD32 const *num2,
				786	unsigned len)
				787	{
				788	BNWORD64 x, y;
				789	BNWORD32 const p1, p2;
				790	unsigned carry;
				791	unsigned i, j;
				792
				793	/* Special case of zero */
				794	if (!len)
				795	return;
				796
				797	x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
				798	BIGLITTLE(--prod, prod++) = (BNWORD32)x;
				799	x >>= 32;
				800
				801	for (i = 1; i < len; i++) {
				802	carry = 0;
				803	p1 = num1;
				804	p2 = BIGLITTLE(num2-i-1,num2+i+1);
				805	for (j = 0; j <= i; j++) {
				806	BIG(y = (BNWORD64)--p1 *p2++;)
				807	LITTLE(y = (BNWORD64)p1++ *--p2;)
				808	x += y;
				809	carry += (x < y);
				810	}
				811	BIGLITTLE(--prod,prod++) = (BNWORD32)x;
				812	x = (x >> 32) \| (BNWORD64)carry << 32;
				813	}
				814	for (i = 1; i < len; i++) {
				815	carry = 0;
				816	p1 = BIGLITTLE(num1-i,num1+i);
				817	p2 = BIGLITTLE(num2-len,num2+len);
				818	for (j = i; j < len; j++) {
				819	BIG(y = (BNWORD64)--p1 *p2++;)
				820	LITTLE(y = (BNWORD64)p1++ *--p2;)
				821	x += y;
				822	carry += (x < y);
				823	}
				824	BIGLITTLE(--prod,prod++) = (BNWORD32)x;
				825	x = (x >> 32) \| (BNWORD64)carry << 32;
				826	}
				827
				828	BIGLITTLE(--prod,prod) = (BNWORD32)x;
				829	}
				830	#else /* !defined(BNWORD64) \|\| !PRODUCT_SCAN */
				831	/* Default trivial macro definition */
				832	#define lbnMulX_32(prod, num1, num2, len) lbnMul_32(prod, num1, len, num2, len)
				833	#endif /* !defined(BNWORD64) \|\| !PRODUCT_SCAN */
				834	#endif /* !lbmMulX_32 */
				835
				836	#if !defined(lbnMontMul_32) && defined(BNWORD64) && PRODUCT_SCAN
				837	/*
				838	* Test code for product-scanning multiply. This seems to slow the C
				839	* code down rather than speed it up.
				840	* This does a multiply and Montgomery reduction together, using the
				841	* same loops. The outer loop scans across the product, twice.
				842	* The first pass computes the low half of the product and the
				843	* Montgomery multipliers. These are stored in the product array,
				844	* which contains no data as of yet. x and carry add up the columns
				845	* and propagate carries forward.
				846	*
				847	* The second half multiplies the upper half, adding in the modulus
				848	* times the Montgomery multipliers. The results of this multiply
				849	* are stored.
				850	*/
				851	static void
				852	lbnMontMul_32(BNWORD32 prod, BNWORD32 const num1, BNWORD32 const *num2,
				853	BNWORD32 const *mod, unsigned len, BNWORD32 inv)
				854	{
				855	BNWORD64 x, y;
				856	BNWORD32 const p1, p2, *pm;
				857	BNWORD32 *pp;
				858	BNWORD32 t;
				859	unsigned carry;
				860	unsigned i, j;
				861
				862	/* Special case of zero */
				863	if (!len)
				864	return;
				865
				866	/*
				867	* This computes directly into the high half of prod, so just
				868	* shift the pointer and consider prod only "len" elements long
				869	* for the rest of the code.
				870	*/
				871	BIGLITTLE(prod -= len, prod += len);
				872
				873	/* Pass 1 - compute Montgomery multipliers */
				874	/* First iteration can have certain simplifications. */
				875	x = (BNWORD64)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
				876	BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD32)x;
				877	y = (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]);
				878	x += y;
				879	/* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
				880	carry = (x < y);
				881	assert((BNWORD32)x == 0);
				882	x = x >> 32 \| (BNWORD64)carry << 32;
				883
				884	for (i = 1; i < len; i++) {
				885	carry = 0;
				886	p1 = num1;
				887	p2 = BIGLITTLE(num2-i-1,num2+i+1);
				888	pp = prod;
				889	pm = BIGLITTLE(mod-i-1,mod+i+1);
				890	for (j = 0; j < i; j++) {
				891	y = (BNWORD64)BIGLITTLE(--p1 p2++, p1++ * *--p2);
				892	x += y;
				893	carry += (x < y);
				894	y = (BNWORD64)BIGLITTLE(--pp pm++, pp++ * *--pm);
				895	x += y;
				896	carry += (x < y);
				897	}
				898	y = (BNWORD64)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
				899	x += y;
				900	carry += (x < y);
				901	assert(BIGLITTLE(pp == prod-i, pp == prod+i));
				902	BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD32)x;
				903	assert(BIGLITTLE(pm == mod-1, pm == mod+1));
				904	y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]);
				905	x += y;
				906	carry += (x < y);
				907	assert((BNWORD32)x == 0);
				908	x = x >> 32 \| (BNWORD64)carry << 32;
				909	}
				910
				911	/* Pass 2 - compute reduced product and store */
				912	for (i = 1; i < len; i++) {
				913	carry = 0;
				914	p1 = BIGLITTLE(num1-i,num1+i);
				915	p2 = BIGLITTLE(num2-len,num2+len);
				916	pm = BIGLITTLE(mod-i,mod+i);
				917	pp = BIGLITTLE(prod-len,prod+len);
				918	for (j = i; j < len; j++) {
				919	y = (BNWORD64)BIGLITTLE(--p1 p2++, p1++ * *--p2);
				920	x += y;
				921	carry += (x < y);
				922	y = (BNWORD64)BIGLITTLE(--pm pp++, pm++ * *--pp);
				923	x += y;
				924	carry += (x < y);
				925	}
				926	assert(BIGLITTLE(pm == mod-len, pm == mod+len));
				927	assert(BIGLITTLE(pp == prod-i, pp == prod+i));
				928	BIGLITTLE(pp[0],pp[-1]) = (BNWORD32)x;
				929	x = (x >> 32) \| (BNWORD64)carry << 32;
				930	}
				931
				932	/* Last round of second half, simplified. */
				933	BIGLITTLE((prod-len),(prod+len-1)) = (BNWORD32)x;
				934	carry = (x >> 32);
				935
				936	while (carry)
				937	carry -= lbnSubN_32(prod, mod, len);
				938	while (lbnCmp_32(prod, mod, len) >= 0)
				939	(void)lbnSubN_32(prod, mod, len);
				940	}
				941	/* Suppress later definition */
				942	#define lbnMontMul_32 lbnMontMul_32
				943	#endif
				944
				945	#if !defined(lbnSquare_32) && defined(BNWORD64) && PRODUCT_SCAN
				946	/*
				947	* Trial code for product-scanning squaring. This seems to slow the C
				948	* code down rather than speed it up.
				949	*/
				950	void
				951	lbnSquare_32(BNWORD32 prod, BNWORD32 const num, unsigned len)
				952	{
				953	BNWORD64 x, y, z;
				954	BNWORD32 const p1, p2;
				955	unsigned carry;
				956	unsigned i, j;
				957
				958	/* Special case of zero */
				959	if (!len)
				960	return;
				961
				962	/* Word 0 of product */
				963	x = (BNWORD64)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
				964	BIGLITTLE(--prod, prod++) = (BNWORD32)x;
				965	x >>= 32;
				966
				967	/* Words 1 through len-1 */
				968	for (i = 1; i < len; i++) {
				969	carry = 0;
				970	y = 0;
				971	p1 = num;
				972	p2 = BIGLITTLE(num-i-1,num+i+1);
				973	for (j = 0; j < (i+1)/2; j++) {
				974	BIG(z = (BNWORD64)--p1 *p2++;)
				975	LITTLE(z = (BNWORD64)p1++ *--p2;)
				976	y += z;
				977	carry += (y < z);
				978	}
				979	y += z = y;
				980	carry += carry + (y < z);
				981	if ((i & 1) == 0) {
				982	assert(BIGLITTLE(--p1 == p2, p1 == --p2));
				983	BIG(z = (BNWORD64)p2 *p2;)
				984	LITTLE(z = (BNWORD64)p1 *p1;)
				985	y += z;
				986	carry += (y < z);
				987	}
				988	x += y;
				989	carry += (x < y);
				990	BIGLITTLE(--prod,prod++) = (BNWORD32)x;
				991	x = (x >> 32) \| (BNWORD64)carry << 32;
				992	}
				993	/* Words len through 2len-2 /
				994	for (i = 1; i < len; i++) {
				995	carry = 0;
				996	y = 0;
				997	p1 = BIGLITTLE(num-i,num+i);
				998	p2 = BIGLITTLE(num-len,num+len);
				999	for (j = 0; j < (len-i)/2; j++) {
				1000	BIG(z = (BNWORD64)--p1 *p2++;)
				1001	LITTLE(z = (BNWORD64)p1++ *--p2;)
				1002	y += z;
				1003	carry += (y < z);
				1004	}
				1005	y += z = y;
				1006	carry += carry + (y < z);
				1007	if ((len-i) & 1) {
				1008	assert(BIGLITTLE(--p1 == p2, p1 == --p2));
				1009	BIG(z = (BNWORD64)p2 *p2;)
				1010	LITTLE(z = (BNWORD64)p1 *p1;)
				1011	y += z;
				1012	carry += (y < z);
				1013	}
				1014	x += y;
				1015	carry += (x < y);
				1016	BIGLITTLE(--prod,prod++) = (BNWORD32)x;
				1017	x = (x >> 32) \| (BNWORD64)carry << 32;
				1018	}
				1019
				1020	/* Word 2len-1 /
				1021	BIGLITTLE(--prod,prod) = (BNWORD32)x;
				1022	}
				1023	/* Suppress later definition */
				1024	#define lbnSquare_32 lbnSquare_32
				1025	#endif
				1026
				1027	/*
				1028	* Square a number, using optimized squaring to reduce the number of
				1029	* primitive multiples that are executed. There may not be any
				1030	* overlap of the input and output.
				1031	*
				1032	* Technique: Consider the partial products in the multiplication
				1033	* of "abcde" by itself:
				1034	*
				1035	* a b c d e
				1036	* * a b c d e
				1037	* ==================
				1038	* ae be ce de ee
				1039	* ad bd cd dd de
				1040	* ac bc cc cd ce
				1041	* ab bb bc bd be
				1042	* aa ab ac ad ae
				1043	*
				1044	* Note that everything above the main diagonal:
				1045	* ae be ce de = (abcd) * e
				1046	* ad bd cd = (abc) * d
				1047	* ac bc = (ab) * c
				1048	* ab = (a) * b
				1049	*
				1050	* is a copy of everything below the main diagonal:
				1051	* de
				1052	* cd ce
				1053	* bc bd be
				1054	* ab ac ad ae
				1055	*
				1056	* Thus, the sum is 2 * (off the diagonal) + diagonal.
				1057	*
				1058	* This is accumulated beginning with the diagonal (which
				1059	* consist of the squares of the digits of the input), which is then
				1060	* divided by two, the off-diagonal added, and multiplied by two
				1061	* again. The low bit is simply a copy of the low bit of the
				1062	* input, so it doesn't need special care.
				1063	*
				1064	* TODO: Merge the shift by 1 with the squaring loop.
				1065	* TODO: Use Karatsuba. (aW+b)^2 = a^2 (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
				1066	*/
				1067	#ifndef lbnSquare_32
				1068	void
				1069	lbnSquare_32(BNWORD32 prod, BNWORD32 const num, unsigned len)
				1070	{
				1071	BNWORD32 t;
				1072	BNWORD32 prodx = prod; / Working copy of the argument */
				1073	BNWORD32 const numx = num; / Working copy of the argument */
				1074	unsigned lenx = len; /* Working copy of the argument */
				1075
				1076	if (!len)
				1077	return;
				1078
				1079	/* First, store all the squares */
				1080	while (lenx--) {
				1081	#ifdef mul32_ppmm
				1082	BNWORD32 ph, pl;
				1083	t = BIGLITTLE(--numx,numx++);
				1084	mul32_ppmm(ph,pl,t,t);
				1085	BIGLITTLE(--prodx,prodx++) = pl;
				1086	BIGLITTLE(--prodx,prodx++) = ph;
				1087	#elif defined(BNWORD64) /* use BNWORD64 */
				1088	BNWORD64 p;
				1089	t = BIGLITTLE(--numx,numx++);
				1090	p = (BNWORD64)t * t;
				1091	BIGLITTLE(--prodx,prodx++) = (BNWORD32)p;
				1092	BIGLITTLE(--prodx,prodx++) = (BNWORD32)(p>>32);
				1093	#else /* Use lbnMulN1_32 */
				1094	t = BIGLITTLE(numx[-1],*numx);
				1095	lbnMulN1_32(prodx, numx, 1, t);
				1096	BIGLITTLE(--numx,numx++);
				1097	BIGLITTLE(prodx -= 2, prodx += 2);
				1098	#endif
				1099	}
				1100	/* Then, shift right 1 bit */
				1101	(void)lbnRshift_32(prod, 2*len, 1);
				1102
				1103	/* Then, add in the off-diagonal sums */
				1104	lenx = len;
				1105	numx = num;
				1106	prodx = prod;
				1107	while (--lenx) {
				1108	t = BIGLITTLE(--numx,numx++);
				1109	BIGLITTLE(--prodx,prodx++);
				1110	t = lbnMulAdd1_32(prodx, numx, lenx, t);
				1111	lbnAdd1_32(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
				1112	BIGLITTLE(--prodx,prodx++);
				1113	}
				1114
				1115	/* Shift it back up */
				1116	lbnDouble_32(prod, 2*len);
				1117
				1118	/* And set the low bit appropriately */
				1119	BIGLITTLE(prod[-1],prod[0]) \|= BIGLITTLE(num[-1],num[0]) & 1;
				1120	}
				1121	#endif /* !lbnSquare_32 */
				1122
				1123	/*
				1124	* lbnNorm_32 - given a number, return a modified length such that the
				1125	* most significant digit is non-zero. Zero-length input is okay.
				1126	*/
				1127	#ifndef lbnNorm_32
				1128	unsigned
				1129	lbnNorm_32(BNWORD32 const *num, unsigned len)
				1130	{
				1131	BIGLITTLE(num -= len,num += len);
				1132	while (len && BIGLITTLE(num++,--num) == 0)
				1133	--len;
				1134	return len;
				1135	}
				1136	#endif /* lbnNorm_32 */
				1137
				1138	/*
				1139	* lbnBits_32 - return the number of significant bits in the array.
				1140	* It starts by normalizing the array. Zero-length input is okay.
				1141	* Then assuming there's anything to it, it fetches the high word,
				1142	* generates a bit length by multiplying the word length by 32, and
				1143	* subtracts off 32/2, 32/4, 32/8, ... bits if the high bits are clear.
				1144	*/
				1145	#ifndef lbnBits_32
				1146	unsigned
				1147	lbnBits_32(BNWORD32 const *num, unsigned len)
				1148	{
				1149	BNWORD32 t;
				1150	unsigned i;
				1151
				1152	len = lbnNorm_32(num, len);
				1153	if (len) {
				1154	t = BIGLITTLE((num-len),(num+(len-1)));
				1155	assert(t);
				1156	len *= 32;
				1157	i = 32/2;
				1158	do {
				1159	if (t >> i)
				1160	t >>= i;
				1161	else
				1162	len -= i;
				1163	} while ((i /= 2) != 0);
				1164	}
				1165	return len;
				1166	}
				1167	#endif /* lbnBits_32 */
				1168
				1169	/*
				1170	* If defined, use hand-rolled divide rather than compiler's native.
				1171	* If the machine doesn't do it in line, the manual code is probably
				1172	* faster, since it can assume normalization and the fact that the
				1173	* quotient will fit into 32 bits, which a general 64-bit divide
				1174	* in a compiler's run-time library can't do.
				1175	*/
				1176	#ifndef BN_SLOW_DIVIDE_64
				1177	/* Assume that divisors of more than thirty-two bits are slow */
				1178	#define BN_SLOW_DIVIDE_64 (64 > 0x20)
				1179	#endif
				1180
				1181	/*
				1182	* Return (nh<<32\|nl) % d, and place the quotient digit into *q.
				1183	* It is guaranteed that nh < d, and that d is normalized (with its high
				1184	* bit set). If we have a double-width type, it's easy. If not, ooh,
				1185	* yuk!
				1186	*/
				1187	#ifndef lbnDiv21_32
				1188	#if defined(BNWORD64) && !BN_SLOW_DIVIDE_64
				1189	BNWORD32
				1190	lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
				1191	{
				1192	BNWORD64 n = (BNWORD64)nh << 32 \| nl;
				1193
				1194	/* Divisor must be normalized */
				1195	assert(d >> (32-1) == 1);
				1196
				1197	*q = n / d;
				1198	return n % d;
				1199	}
				1200	#else
				1201	/*
				1202	* This is where it gets ugly.
				1203	*
				1204	* Do the division in two halves, using Algorithm D from section 4.3.1
				1205	* of Knuth. Note Theorem B from that section, that the quotient estimate
				1206	* is never more than the true quotient, and is never more than two
				1207	* too low.
				1208	*
				1209	* The mapping onto conventional long division is (everything a half word):
				1210	* _____________qh___ql_
				1211	* dh dl ) nh.h nh.l nl.h nl.l
				1212	* - (qh * d)
				1213	* -----------
				1214	* rrrr rrrr nl.l
				1215	* - (ql * d)
				1216	* -----------
				1217	* rrrr rrrr
				1218	*
				1219	* The implicit 3/2-digit dqh and dql subtractors are computed this way:
				1220	* First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
				1221	* the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
				1222	* low part of the subtractor, qh * dl. This also needs to be subtracted
				1223	* from (nh.h nh.l nl.h) to get the final remainder. So we take the
				1224	* remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
				1225	* try to subtract qh * dl from that. Since the remainder is 1/2-word
				1226	* long, shifting and adding nl.h results in a single word r.
				1227	* It is possible that the remainder we're working with, r, is less than
				1228	* the product qh * dl, if we estimated qh too high. The estimation
				1229	* technique can produce a qh that is too large (never too small), leading
				1230	* to r which is too small. In that case, decrement the digit qh, add
				1231	* shifted dh to r (to correct for that error), and subtract dl from the
				1232	* product we're comparing r with. That's the "correct" way to do it, but
				1233	* just adding dl to r instead of subtracting it from the product is
				1234	* equivalent and a lot simpler. You just have to watch out for overflow.
				1235	*
				1236	* The process is repeated with (rrrr rrrr nl.l) for the low digit of the
				1237	* quotient ql.
				1238	*
				1239	* The various uses of 32/2 for shifts are because of the note about
				1240	* automatic editing of this file at the very top of the file.
				1241	*/
				1242	#define highhalf(x) ( (x) >> 32/2 )
				1243	#define lowhalf(x) ( (x) & (((BNWORD32)1 << 32/2)-1) )
				1244	BNWORD32
				1245	lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
				1246	{
				1247	BNWORD32 dh = highhalf(d), dl = lowhalf(d);
				1248	BNWORD32 qh, ql, prod, r;
				1249
				1250	/* Divisor must be normalized */
				1251	assert((d >> (32-1)) == 1);
				1252
				1253	/* Do first half-word of division */
				1254	qh = nh / dh;
				1255	r = nh % dh;
				1256	prod = qh * dl;
				1257
				1258	/*
				1259	* Add next half-word of numerator to remainder and correct.
				1260	* qh may be up to two too large.
				1261	*/
				1262	r = (r << (32/2)) \| highhalf(nl);
				1263	if (r < prod) {
				1264	--qh; r += d;
				1265	if (r >= d && r < prod) {
				1266	--qh; r += d;
				1267	}
				1268	}
				1269	r -= prod;
				1270
				1271	/* Do second half-word of division */
				1272	ql = r / dh;
				1273	r = r % dh;
				1274	prod = ql * dl;
				1275
				1276	r = (r << (32/2)) \| lowhalf(nl);
				1277	if (r < prod) {
				1278	--ql; r += d;
				1279	if (r >= d && r < prod) {
				1280	--ql; r += d;
				1281	}
				1282	}
				1283	r -= prod;
				1284
				1285	*q = (qh << (32/2)) \| ql;
				1286
				1287	return r;
				1288	}
				1289	#endif
				1290	#endif /* lbnDiv21_32 */
				1291
				1292
				1293	/*
				1294	* In the division functions, the dividend and divisor are referred to
				1295	* as "n" and "d", which stand for "numerator" and "denominator".
				1296	*
				1297	* The quotient is (nlen-dlen+1) digits long. It may be overlapped with
				1298	* the high (nlen-dlen) words of the dividend, but one extra word is needed
				1299	* on top to hold the top word.
				1300	*/
				1301
				1302	/*
				1303	* Divide an n-word number by a 1-word number, storing the remainder
				1304	* and n-1 words of the n-word quotient. The high word is returned.
				1305	* It IS legal for rem to point to the same address as n, and for
				1306	* q to point one word higher.
				1307	*
				1308	* TODO: If BN_SLOW_DIVIDE_64, add a divnhalf_32 which uses 32-bit
				1309	* dividends if the divisor is half that long.
				1310	* TODO: Shift the dividend on the fly to avoid the last division and
				1311	* instead have a remainder that needs shifting.
				1312	* TODO: Use reciprocals rather than dividing.
				1313	*/
				1314	#ifndef lbnDiv1_32
				1315	BNWORD32
				1316	lbnDiv1_32(BNWORD32 q, BNWORD32 rem, BNWORD32 const *n, unsigned len,
				1317	BNWORD32 d)
				1318	{
				1319	unsigned shift;
				1320	unsigned xlen;
				1321	BNWORD32 r;
				1322	BNWORD32 qhigh;
				1323
				1324	assert(len > 0);
				1325	assert(d);
				1326
				1327	if (len == 1) {
				1328	r = *n;
				1329	*rem = r%d;
				1330	return r/d;
				1331	}
				1332
				1333	shift = 0;
				1334	r = d;
				1335	xlen = 32/2;
				1336	do {
				1337	if (r >> xlen)
				1338	r >>= xlen;
				1339	else
				1340	shift += xlen;
				1341	} while ((xlen /= 2) != 0);
				1342	assert((d >> (32-1-shift)) == 1);
				1343	d <<= shift;
				1344
				1345	BIGLITTLE(q -= len-1,q += len-1);
				1346	BIGLITTLE(n -= len,n += len);
				1347
				1348	r = BIGLITTLE(n++,--n);
				1349	if (r < d) {
				1350	qhigh = 0;
				1351	} else {
				1352	qhigh = r/d;
				1353	r %= d;
				1354	}
				1355
				1356	xlen = len;
				1357	while (--xlen)
				1358	r = lbnDiv21_32(BIGLITTLE(q++,--q), r, BIGLITTLE(n++,--n), d);
				1359
				1360	/*
				1361	* Final correction for shift - shift the quotient up "shift"
				1362	* bits, and merge in the extra bits of quotient. Then reduce
				1363	* the final remainder mod the real d.
				1364	*/
				1365	if (shift) {
				1366	d >>= shift;
				1367	qhigh = (qhigh << shift) \| lbnLshift_32(q, len-1, shift);
				1368	BIGLITTLE(q[-1],*q) \|= r/d;
				1369	r %= d;
				1370	}
				1371	*rem = r;
				1372
				1373	return qhigh;
				1374	}
				1375	#endif
				1376
				1377	/*
				1378	* This function performs a "quick" modulus of a number with a divisor
				1379	* d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
				1380	* This applies regardless of the word size the library is compiled with.
				1381	*
				1382	* This function is important to prime generation, for sieving.
				1383	*/
				1384	#ifndef lbnModQ_32
				1385	/* If there's a custom lbnMod21_32, no normalization needed */
				1386	#ifdef lbnMod21_32
				1387	unsigned
				1388	lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
				1389	{
				1390	unsigned i, shift;
				1391	BNWORD32 r;
				1392
				1393	assert(len > 0);
				1394
				1395	BIGLITTLE(n -= len,n += len);
				1396
				1397	/* Try using a compare to avoid the first divide */
				1398	r = BIGLITTLE(n++,--n);
				1399	if (r >= d)
				1400	r %= d;
				1401	while (--len)
				1402	r = lbnMod21_32(r, BIGLITTLE(n++,--n), d);
				1403
				1404	return r;
				1405	}
				1406	#elif defined(BNWORD64) && !BN_SLOW_DIVIDE_64
				1407	unsigned
				1408	lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
				1409	{
				1410	BNWORD32 r;
				1411
				1412	if (!--len)
				1413	return BIGLITTLE(n[-1],n[0]) % d;
				1414
				1415	BIGLITTLE(n -= len,n += len);
				1416	r = BIGLITTLE(n[-1],n[0]);
				1417
				1418	do {
				1419	r = (BNWORD32)((((BNWORD64)r<<32) \| BIGLITTLE(n++,--n)) % d);
				1420	} while (--len);
				1421
				1422	return r;
				1423	}
				1424	#elif 32 >= 0x20
				1425	/*
				1426	* If the single word size can hold 65535*65536, then this function
				1427	* is avilable.
				1428	*/
				1429	#ifndef highhalf
				1430	#define highhalf(x) ( (x) >> 32/2 )
				1431	#define lowhalf(x) ( (x) & ((1 << 32/2)-1) )
				1432	#endif
				1433	unsigned
				1434	lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
				1435	{
				1436	BNWORD32 r, x;
				1437
				1438	BIGLITTLE(n -= len,n += len);
				1439
				1440	r = BIGLITTLE(n++,--n);
				1441	while (--len) {
				1442	x = BIGLITTLE(n++,--n);
				1443	r = (r%d << 32/2) \| highhalf(x);
				1444	r = (r%d << 32/2) \| lowhalf(x);
				1445	}
				1446
				1447	return r%d;
				1448	}
				1449	#else
				1450	/* Default case - use lbnDiv21_32 */
				1451	unsigned
				1452	lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
				1453	{
				1454	unsigned i, shift;
				1455	BNWORD32 r;
				1456	BNWORD32 q;
				1457
				1458	assert(len > 0);
				1459
				1460	shift = 0;
				1461	r = d;
				1462	i = 32;
				1463	while (i /= 2) {
				1464	if (r >> i)
				1465	r >>= i;
				1466	else
				1467	shift += i;
				1468	}
				1469	assert(d >> (32-1-shift) == 1);
				1470	d <<= shift;
				1471
				1472	BIGLITTLE(n -= len,n += len);
				1473
				1474	r = BIGLITTLE(n++,--n);
				1475	if (r >= d)
				1476	r %= d;
				1477
				1478	while (--len)
				1479	r = lbnDiv21_32(&q, r, BIGLITTLE(n++,--n), d);
				1480
				1481	/*
				1482	* Final correction for shift - shift the quotient up "shift"
				1483	* bits, and merge in the extra bits of quotient. Then reduce
				1484	* the final remainder mod the real d.
				1485	*/
				1486	if (shift)
				1487	r %= d >> shift;
				1488
				1489	return r;
				1490	}
				1491	#endif
				1492	#endif /* lbnModQ_32 */
				1493
				1494	/*
				1495	* Reduce n mod d and return the quotient. That is, find:
				1496	* q = n / d;
				1497	* n = n % d;
				1498	* d is altered during the execution of this subroutine by normalizing it.
				1499	* It must already have its most significant word non-zero; it is shifted
				1500	* so its most significant bit is non-zero.
				1501	*
				1502	* The quotient q is nlen-dlen+1 words long. To make it possible to
				1503	* overlap the quptient with the input (you can store it in the high dlen
				1504	* words), the high word of the quotient is not stored, but is returned.
				1505	* (If all you want is the remainder, you don't care about it, anyway.)
				1506	*
				1507	* This uses algorithm D from Knuth (4.3.1), except that we do binary
				1508	* (shift) normalization of the divisor. WARNING: This is hairy!
				1509	*
				1510	* This function is used for some modular reduction, but it is not used in
				1511	* the modular exponentiation loops; they use Montgomery form and the
				1512	* corresponding, more efficient, Montgomery reduction. This code
				1513	* is needed for the conversion to Montgomery form, however, so it
				1514	* has to be here and it might as well be reasonably efficient.
				1515	*
				1516	* The overall operation is as follows ("top" and "up" refer to the
				1517	* most significant end of the number; "bottom" and "down", the least):
				1518	*
				1519	* - Shift the divisor up until the most significant bit is set.
				1520	* - Shift the dividend up the same amount. This will produce the
				1521	* correct quotient, and the remainder can be recovered by shifting
				1522	* it back down the same number of bits. This may produce an overflow
				1523	* word, but the word is always strictly less than the most significant
				1524	* divisor word.
				1525	* - Estimate the first quotient digit qhat:
				1526	* - First take the top two words (one of which is the overflow) of the
				1527	* dividend and divide by the top word of the divisor:
				1528	* qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
				1529	* and, since dh is normalized, it is at most two over.
				1530	* - Second, correct by comparing the top three words. If
				1531	* (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
				1532	* The second iteration can be simpler because there can't be a third.
				1533	* The computation can be simplified by subtracting dh*qhat from
				1534	* both sides, suitably shifted. This reduces the left side to
				1535	* dlqhat. On the right, (nh,nm)-dhqhat is simply the
				1536	* remainder r from (nh,nm)%dh, so the right is (r,nl).
				1537	* This produces qhat that is almost always correct and at
				1538	* most (prob ~ 2/2^32) one too high.
				1539	* - Subtract qhat times the divisor (suitably shifted) from the dividend.
				1540	* If there is a borrow, qhat was wrong, so decrement it
				1541	* and add the divisor back in (once).
				1542	* - Store the final quotient digit qhat in the quotient array q.
				1543	*
				1544	* Repeat the quotient digit computation for successive digits of the
				1545	* quotient until the whole quotient has been computed. Then shift the
				1546	* divisor and the remainder down to correct for the normalization.
				1547	*
				1548	* TODO: Special case 2-word divisors.
				1549	* TODO: Use reciprocals rather than dividing.
				1550	*/
				1551	#ifndef divn_32
				1552	BNWORD32
				1553	lbnDiv_32(BNWORD32 q, BNWORD32 n, unsigned nlen, BNWORD32 *d, unsigned dlen)
				1554	{
				1555	BNWORD32 nh,nm,nl; /* Top three words of the dividend */
				1556	BNWORD32 dh,dl; /* Top two words of the divisor */
				1557	BNWORD32 qhat; /* Extimate of quotient word */
				1558	BNWORD32 r; /* Remainder from quotient estimate division */
				1559	BNWORD32 qhigh; /* High word of quotient */
				1560	unsigned i; /* Temp */
				1561	unsigned shift; /* Bits shifted by normalization */
				1562	unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
				1563	#ifdef mul32_ppmm
				1564	BNWORD32 t32;
				1565	#elif defined(BNWORD64)
				1566	BNWORD64 t64;
				1567	#else /* use lbnMulN1_32 */
				1568	BNWORD32 t2[2];
				1569	#define t2high BIGLITTLE(t2[0],t2[1])
				1570	#define t2low BIGLITTLE(t2[1],t2[0])
				1571	#endif
				1572
				1573	assert(dlen);
				1574	assert(nlen >= dlen);
				1575
				1576	/*
				1577	* Special cases for short divisors. The general case uses the
				1578	* top top 2 digits of the divisor (d) to estimate a quotient digit,
				1579	* so it breaks if there are fewer digits available. Thus, we need
				1580	* special cases for a divisor of length 1. A divisor of length
				1581	* 2 can have a lot of administrivia overhead removed removed,
				1582	* so it's probably worth special-casing that case, too.
				1583	*/
				1584	if (dlen == 1)
				1585	return lbnDiv1_32(q, BIGLITTLE(n-1,n), n, nlen,
				1586	BIGLITTLE(d[-1],d[0]));
				1587
				1588	#if 0
				1589	/*
				1590	* @@@ This is not yet written... The general loop will do,
				1591	* albeit less efficiently
				1592	*/
				1593	if (dlen == 2) {
				1594	/*
				1595	* divisor two digits long:
				1596	* use the 3/2 technique from Knuth, but we know
				1597	* it's exact.
				1598	*/
				1599	dh = BIGLITTLE(d[-1],d[0]);
				1600	dl = BIGLITTLE(d[-2],d[1]);
				1601	shift = 0;
				1602	if ((sh & ((BNWORD32)1 << 32-1-shift)) == 0) {
				1603	do {
				1604	shift++;
				1605	} while (dh & (BNWORD32)1<<32-1-shift) == 0);
				1606	dh = dh << shift \| dl >> (32-shift);
				1607	dl <<= shift;
				1608
				1609
				1610	}
				1611
				1612
				1613	for (shift = 0; (dh & (BNWORD32)1 << 32-1-shift)) == 0; shift++)
				1614	;
				1615	if (shift) {
				1616	}
				1617	dh = dh << shift \| dl >> (32-shift);
				1618	shift = 0;
				1619	while (dh
				1620	}
				1621	#endif
				1622
				1623	dh = BIGLITTLE((d-dlen),(d+(dlen-1)));
				1624	assert(dh);
				1625
				1626	/* Normalize the divisor */
				1627	shift = 0;
				1628	r = dh;
				1629	i = 32/2;
				1630	do {
				1631	if (r >> i)
				1632	r >>= i;
				1633	else
				1634	shift += i;
				1635	} while ((i /= 2) != 0);
				1636
				1637	nh = 0;
				1638	if (shift) {
				1639	lbnLshift_32(d, dlen, shift);
				1640	dh = BIGLITTLE((d-dlen),(d+(dlen-1)));
				1641	nh = lbnLshift_32(n, nlen, shift);
				1642	}
				1643
				1644	/* Assert that dh is now normalized */
				1645	assert(dh >> (32-1));
				1646
				1647	/* Also get the second-most significant word of the divisor */
				1648	dl = BIGLITTLE((d-(dlen-1)),(d+(dlen-2)));
				1649
				1650	/*
				1651	* Adjust pointers: n to point to least significant end of first
				1652	* first subtract, and q to one the most-significant end of the
				1653	* quotient array.
				1654	*/
				1655	BIGLITTLE(n -= qlen,n += qlen);
				1656	BIGLITTLE(q -= qlen,q += qlen);
				1657
				1658	/* Fetch the most significant stored word of the dividend */
				1659	nm = BIGLITTLE((n-dlen),(n+(dlen-1)));
				1660
				1661	/*
				1662	* Compute the first digit of the quotient, based on the
				1663	* first two words of the dividend (the most significant of which
				1664	* is the overflow word h).
				1665	*/
				1666	if (nh) {
				1667	assert(nh < dh);
				1668	r = lbnDiv21_32(&qhat, nh, nm, dh);
				1669	} else if (nm >= dh) {
				1670	qhat = nm/dh;
				1671	r = nm % dh;
				1672	} else { /* Quotient is zero */
				1673	qhigh = 0;
				1674	goto divloop;
				1675	}
				1676
				1677	/* Now get the third most significant word of the dividend */
				1678	nl = BIGLITTLE((n-(dlen-1)),(n+(dlen-2)));
				1679
				1680	/*
				1681	* Correct qhat, the estimate of quotient digit.
				1682	* qhat can only be high, and at most two words high,
				1683	* so the loop can be unrolled and abbreviated.
				1684	*/
				1685	#ifdef mul32_ppmm
				1686	mul32_ppmm(nm, t32, qhat, dl);
				1687	if (nm > r \|\| (nm == r && t32 > nl)) {
				1688	/* Decrement qhat and adjust comparison parameters */
				1689	qhat--;
				1690	if ((r += dh) >= dh) {
				1691	nm -= (t32 < dl);
				1692	t32 -= dl;
				1693	if (nm > r \|\| (nm == r && t32 > nl))
				1694	qhat--;
				1695	}
				1696	}
				1697	#elif defined(BNWORD64)
				1698	t64 = (BNWORD64)qhat * dl;
				1699	if (t64 > ((BNWORD64)r << 32) + nl) {
				1700	/* Decrement qhat and adjust comparison parameters */
				1701	qhat--;
				1702	if ((r += dh) > dh) {
				1703	t64 -= dl;
				1704	if (t64 > ((BNWORD64)r << 32) + nl)
				1705	qhat--;
				1706	}
				1707	}
				1708	#else /* Use lbnMulN1_32 */
				1709	lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
				1710	if (t2high > r \|\| (t2high == r && t2low > nl)) {
				1711	/* Decrement qhat and adjust comparison parameters */
				1712	qhat--;
				1713	if ((r += dh) >= dh) {
				1714	t2high -= (t2low < dl);
				1715	t2low -= dl;
				1716	if (t2high > r \|\| (t2high == r && t2low > nl))
				1717	qhat--;
				1718	}
				1719	}
				1720	#endif
				1721
				1722	/* Do the multiply and subtract */
				1723	r = lbnMulSub1_32(n, d, dlen, qhat);
				1724	/* If there was a borrow, add back once. */
				1725	if (r > nh) { /* Borrow? */
				1726	(void)lbnAddN_32(n, d, dlen);
				1727	qhat--;
				1728	}
				1729
				1730	/* Remember the first quotient digit. */
				1731	qhigh = qhat;
				1732
				1733	/* Now, the main division loop: */
				1734	divloop:
				1735	while (qlen--) {
				1736
				1737	/* Advance n */
				1738	nh = BIGLITTLE((n-dlen),(n+(dlen-1)));
				1739	BIGLITTLE(++n,--n);
				1740	nm = BIGLITTLE((n-dlen),(n+(dlen-1)));
				1741
				1742	if (nh == dh) {
				1743	qhat = ~(BNWORD32)0;
				1744	/* Optimized computation of r = (nh,nm) - qhat * dh */
				1745	r = nh + nm;
				1746	if (r < nh)
				1747	goto subtract;
				1748	} else {
				1749	assert(nh < dh);
				1750	r = lbnDiv21_32(&qhat, nh, nm, dh);
				1751	}
				1752
				1753	nl = BIGLITTLE((n-(dlen-1)),(n+(dlen-2)));
				1754	#ifdef mul32_ppmm
				1755	mul32_ppmm(nm, t32, qhat, dl);
				1756	if (nm > r \|\| (nm == r && t32 > nl)) {
				1757	/* Decrement qhat and adjust comparison parameters */
				1758	qhat--;
				1759	if ((r += dh) >= dh) {
				1760	nm -= (t32 < dl);
				1761	t32 -= dl;
				1762	if (nm > r \|\| (nm == r && t32 > nl))
				1763	qhat--;
				1764	}
				1765	}
				1766	#elif defined(BNWORD64)
				1767	t64 = (BNWORD64)qhat * dl;
				1768	if (t64 > ((BNWORD64)r<<32) + nl) {
				1769	/* Decrement qhat and adjust comparison parameters */
				1770	qhat--;
				1771	if ((r += dh) >= dh) {
				1772	t64 -= dl;
				1773	if (t64 > ((BNWORD64)r << 32) + nl)
				1774	qhat--;
				1775	}
				1776	}
				1777	#else /* Use lbnMulN1_32 */
				1778	lbnMulN1_32(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
				1779	if (t2high > r \|\| (t2high == r && t2low > nl)) {
				1780	/* Decrement qhat and adjust comparison parameters */
				1781	qhat--;
				1782	if ((r += dh) >= dh) {
				1783	t2high -= (t2low < dl);
				1784	t2low -= dl;
				1785	if (t2high > r \|\| (t2high == r && t2low > nl))
				1786	qhat--;
				1787	}
				1788	}
				1789	#endif
				1790
				1791	/*
				1792	* As a point of interest, note that it is not worth checking
				1793	* for qhat of 0 or 1 and installing special-case code. These
				1794	* occur with probability 2^-32, so spending 1 cycle to check
				1795	* for them is only worth it if we save more than 2^15 cycles,
				1796	* and a multiply-and-subtract for numbers in the 1024-bit
				1797	* range just doesn't take that long.
				1798	*/
				1799	subtract:
				1800	/*
				1801	* n points to the least significant end of the substring
				1802	* of n to be subtracted from. qhat is either exact or
				1803	* one too large. If the subtract gets a borrow, it was
				1804	* one too large and the divisor is added back in. It's
				1805	* a dlen+1 word add which is guaranteed to produce a
				1806	* carry out, so it can be done very simply.
				1807	*/
				1808	r = lbnMulSub1_32(n, d, dlen, qhat);
				1809	if (r > nh) { /* Borrow? */
				1810	(void)lbnAddN_32(n, d, dlen);
				1811	qhat--;
				1812	}
				1813	/* Store the quotient digit */
				1814	BIGLITTLE(q++,--q) = qhat;
				1815	}
				1816	/* Tah dah! */
				1817
				1818	if (shift) {
				1819	lbnRshift_32(d, dlen, shift);
				1820	lbnRshift_32(n, dlen, shift);
				1821	}
				1822
				1823	return qhigh;
				1824	}
				1825	#endif
				1826
				1827	/*
				1828	* Find the negative multiplicative inverse of x (x must be odd!) modulo 2^32.
				1829	*
				1830	* This just performs Newton's iteration until it gets the
				1831	* inverse. The initial estimate is always correct to 3 bits, and
				1832	* sometimes 4. The number of valid bits doubles each iteration.
				1833	* (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
				1834	* for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
				1835	* the iteration through.)
				1836	*/
				1837	#ifndef lbnMontInv1_32
				1838	BNWORD32
				1839	lbnMontInv1_32(BNWORD32 const x)
				1840	{
				1841	BNWORD32 y = x, z;
				1842
				1843	assert(x & 1);
				1844
				1845	while ((z = x*y) != 1)
				1846	y *= 2 - z;
				1847	return -y;
				1848	}
				1849	#endif /* !lbnMontInv1_32 */
				1850
				1851	#if defined(BNWORD64) && PRODUCT_SCAN
				1852	/*
				1853	* Test code for product-scanning Montgomery reduction.
				1854	* This seems to slow the C code down rather than speed it up.
				1855	*
				1856	* The first loop computes the Montgomery multipliers, storing them over
				1857	* the low half of the number n.
				1858	*
				1859	* The second half multiplies the upper half, adding in the modulus
				1860	* times the Montgomery multipliers. The results of this multiply
				1861	* are stored.
				1862	*/
				1863	void
				1864	lbnMontReduce_32(BNWORD32 n, BNWORD32 const mod, unsigned mlen, BNWORD32 inv)
				1865	{
				1866	BNWORD64 x, y;
				1867	BNWORD32 const *pm;
				1868	BNWORD32 *pn;
				1869	BNWORD32 t;
				1870	unsigned carry;
				1871	unsigned i, j;
				1872
				1873	/* Special case of zero */
				1874	if (!mlen)
				1875	return;
				1876
				1877	/* Pass 1 - compute Montgomery multipliers */
				1878	/* First iteration can have certain simplifications. */
				1879	t = BIGLITTLE(n[-1],n[0]);
				1880	x = t;
				1881	t *= inv;
				1882	BIGLITTLE(n[-1], n[0]) = t;
				1883	x += (BNWORD64)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
				1884	assert((BNWORD32)x == 0);
				1885	x = x >> 32;
				1886
				1887	for (i = 1; i < mlen; i++) {
				1888	carry = 0;
				1889	pn = n;
				1890	pm = BIGLITTLE(mod-i-1,mod+i+1);
				1891	for (j = 0; j < i; j++) {
				1892	y = (BNWORD64)BIGLITTLE(--pn pm++, pn++ * *--pm);
				1893	x += y;
				1894	carry += (x < y);
				1895	}
				1896	assert(BIGLITTLE(pn == n-i, pn == n+i));
				1897	y = t = BIGLITTLE(pn[-1], pn[0]);
				1898	x += y;
				1899	carry += (x < y);
				1900	BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD32)x;
				1901	assert(BIGLITTLE(pm == mod-1, pm == mod+1));
				1902	y = (BNWORD64)t * BIGLITTLE(pm[0],pm[-1]);
				1903	x += y;
				1904	carry += (x < y);
				1905	assert((BNWORD32)x == 0);
				1906	x = x >> 32 \| (BNWORD64)carry << 32;
				1907	}
				1908
				1909	BIGLITTLE(n -= mlen, n += mlen);
				1910
				1911	/* Pass 2 - compute upper words and add to n */
				1912	for (i = 1; i < mlen; i++) {
				1913	carry = 0;
				1914	pm = BIGLITTLE(mod-i,mod+i);
				1915	pn = n;
				1916	for (j = i; j < mlen; j++) {
				1917	y = (BNWORD64)BIGLITTLE(--pm pn++, pm++ * *--pn);
				1918	x += y;
				1919	carry += (x < y);
				1920	}
				1921	assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
				1922	assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
				1923	y = t = BIGLITTLE((n-i),(n+i-1));
				1924	x += y;
				1925	carry += (x < y);
				1926	BIGLITTLE((n-i),(n+i-1)) = (BNWORD32)x;
				1927	x = (x >> 32) \| (BNWORD64)carry << 32;
				1928	}
				1929
				1930	/* Last round of second half, simplified. */
				1931	t = BIGLITTLE((n-mlen),(n+mlen-1));
				1932	x += t;
				1933	BIGLITTLE((n-mlen),(n+mlen-1)) = (BNWORD32)x;
				1934	carry = (unsigned)(x >> 32);
				1935
				1936	while (carry)
				1937	carry -= lbnSubN_32(n, mod, mlen);
				1938	while (lbnCmp_32(n, mod, mlen) >= 0)
				1939	(void)lbnSubN_32(n, mod, mlen);
				1940	}
				1941	#define lbnMontReduce_32 lbnMontReduce_32
				1942	#endif
				1943
				1944	/*
				1945	* Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
				1946	* 2^(32mlen). Returns the result in the top* mlen words of the argument n.
				1947	* This is ready for another multiplication using lbnMul_32.
				1948	*
				1949	* Montgomery representation is a very useful way to encode numbers when
				1950	* you're doing lots of modular reduction. What you do is pick a multiplier
				1951	* R which is relatively prime to the modulus and very easy to divide by.
				1952	* Since the modulus is odd, R is closen as a power of 2, so the division
				1953	* is a shift. In fact, it's a shift of an integral number of words,
				1954	* so the shift can be implicit - just drop the low-order words.
				1955	*
				1956	* Now, choose R larger than the modulus m, 2^(32*mlen). Then convert
				1957	* all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
				1958	* relationship M(a) = aR mod m, M(b) = bR mod m, etc. Note that:
				1959	* - The Montgomery form of a number depends on the modulus m.
				1960	* A fixed modulus m is assumed throughout this discussion.
				1961	* - Since R is relaitvely prime to m, multiplication by R is invertible;
				1962	* no information about the numbers is lost, they're just scrambled.
				1963	* - Adding (and subtracting) numbers in this form works just as usual.
				1964	* M(a+b) = (a+b)R mod m = (aR + b*R) mod m = (M(a) + M(b)) mod m
				1965	* - Multiplying numbers in this form produces abR*R. The problem
				1966	* is to divide out the excess factor of R, modulo m as well as to
				1967	* reduce to the given length mlen. It turns out that this can be
				1968	* done faster than a normal divide, which is where the speedup
				1969	* in Montgomery division comes from.
				1970	*
				1971	* Normal reduction chooses a most-significant quotient digit q and then
				1972	* subtracts q*m from the number to be reduced. Choosing q is tricky
				1973	* and involved (just look at lbnDiv_32 to see!) and is usually
				1974	* imperfect, requiring a check for correction after the subtraction.
				1975	*
				1976	* Montgomery reduction adds a multiple of m to the low-order part
				1977	* of the number to be reduced. This multiple is chosen to make the
				1978	* low-order part of the number come out to zero. This can be done
				1979	* with no trickery or error using a precomputed inverse of the modulus.
				1980	* In this code, the "part" is one word, but any width can be used.
				1981	*
				1982	* Repeating this step sufficiently often results in a value which
				1983	* is a multiple of R (a power of two, remember) but is still (since
				1984	* the additions were to the low-order part and thus did not increase
				1985	* the value of the number being reduced very much) still not much
				1986	* larger than m*R. Then implicitly divide by R and subtract off
				1987	* m until the result is in the correct range.
				1988	*
				1989	* Since the low-order part being cancelled is less than R, the
				1990	* multiple of m added must have a multiplier which is at most R-1.
				1991	* Assuming that the input is at most m*R-1, the final number is
				1992	* at most m(2R-1)-1 = 2mR - m - 1, so subtracting m once from
				1993	* the high-order part, equivalent to subtracting m*R from the
				1994	* while number, produces a result which is at most m*R - m - 1,
				1995	* which divided by R is at most m-1.
				1996	*
				1997	* To convert to Montgomery form, you need a regular remainder
				1998	* routine, although you can just compute R*R (mod m) and do the
				1999	* conversion using Montgomery multiplication. To convert from
				2000	* Montgomery form, just Montgomery reduce the number to
				2001	* remove the extra factor of R.
				2002	*
				2003	* TODO: Change to a full inverse and use Karatsuba's multiplication
				2004	* rather than this word-at-a-time.
				2005	*/
				2006	#ifndef lbnMontReduce_32
				2007	void
				2008	lbnMontReduce_32(BNWORD32 n, BNWORD32 const mod, unsigned const mlen,
				2009	BNWORD32 inv)
				2010	{
				2011	BNWORD32 t;
				2012	BNWORD32 c = 0;
				2013	unsigned len = mlen;
				2014
				2015	/* inv must be the negative inverse of mod's least significant word */
				2016	assert((BNWORD32)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD32)-1);
				2017
				2018	assert(len);
				2019
				2020	do {
				2021	t = lbnMulAdd1_32(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
				2022	c += lbnAdd1_32(BIGLITTLE(n-mlen,n+mlen), len, t);
				2023	BIGLITTLE(--n,++n);
				2024	} while (--len);
				2025
				2026	/*
				2027	* All that adding can cause an overflow past the modulus size,
				2028	* but it's unusual, and never by much, so a subtraction loop
				2029	* is the right way to deal with it.
				2030	* This subtraction happens infrequently - I've only ever seen it
				2031	* invoked once per reduction, and then just under 22.5% of the time.
				2032	*/
				2033	while (c)
				2034	c -= lbnSubN_32(n, mod, mlen);
				2035	while (lbnCmp_32(n, mod, mlen) >= 0)
				2036	(void)lbnSubN_32(n, mod, mlen);
				2037	}
				2038	#endif /* !lbnMontReduce_32 */
				2039
				2040	/*
				2041	* A couple of helpers that you might want to implement atomically
				2042	* in asm sometime.
				2043	*/
				2044	#ifndef lbnMontMul_32
				2045	/*
				2046	* Multiply "num1" by "num2", modulo "mod", all of length "len", and
				2047	* place the result in the high half of "prod". "inv" is the inverse
				2048	* of the least-significant word of the modulus, modulo 2^32.
				2049	* This uses numbers in Montgomery form. Reduce using "len" and "inv".
				2050	*
				2051	* This is implemented as a macro to win on compilers that don't do
				2052	* inlining, since it's so trivial.
				2053	*/
				2054	#define lbnMontMul_32(prod, n1, n2, mod, len, inv) \
				2055	(lbnMulX_32(prod, n1, n2, len), lbnMontReduce_32(prod, mod, len, inv))
				2056	#endif /* !lbnMontMul_32 */
				2057
				2058	#ifndef lbnMontSquare_32
				2059	/*
				2060	* Square "n", modulo "mod", both of length "len", and place the result
				2061	* in the high half of "prod". "inv" is the inverse of the least-significant
				2062	* word of the modulus, modulo 2^32.
				2063	* This uses numbers in Montgomery form. Reduce using "len" and "inv".
				2064	*
				2065	* This is implemented as a macro to win on compilers that don't do
				2066	* inlining, since it's so trivial.
				2067	*/
				2068	#define lbnMontSquare_32(prod, n, mod, len, inv) \
				2069	(lbnSquare_32(prod, n, len), lbnMontReduce_32(prod, mod, len, inv))
				2070
				2071	#endif /* !lbnMontSquare_32 */
				2072
				2073	/*
				2074	* Convert a number to Montgomery form - requires mlen + nlen words
				2075	* of memory in "n".
				2076	*/
				2077	void
				2078	lbnToMont_32(BNWORD32 n, unsigned nlen, BNWORD32 mod, unsigned mlen)
				2079	{
				2080	/* Move n up "mlen" words */
				2081	lbnCopy_32(BIGLITTLE(n-mlen,n+mlen), n, nlen);
				2082	lbnZero_32(n, mlen);
				2083	/* Do the division - dump the quotient in the high-order words */
				2084	(void)lbnDiv_32(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
				2085	}
				2086
				2087	/*
				2088	* Convert from Montgomery form. Montgomery reduction is all that is
				2089	* needed.
				2090	*/
				2091	void
				2092	lbnFromMont_32(BNWORD32 n, BNWORD32 mod, unsigned len)
				2093	{
				2094	/* Zero the high words of n */
				2095	lbnZero_32(BIGLITTLE(n-len,n+len), len);
				2096	lbnMontReduce_32(n, mod, len, lbnMontInv1_32(mod[BIGLITTLE(-1,0)]));
				2097	/* Move n down len words */
				2098	lbnCopy_32(n, BIGLITTLE(n-len,n+len), len);
				2099	}
				2100
				2101	/*
				2102	* The windowed exponentiation algorithm, precomputes a table of odd
				2103	* powers of n up to 2^k. See the comment in bnExpMod_32 below for
				2104	* an explanation of how it actually works works.
				2105	*
				2106	* It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
				2107	* multiplies (on average) to perform the exponentiation. To minimize
				2108	* the sum, k must vary with e. The optimal window sizes vary with the
				2109	* exponent length. Here are some selected values and the boundary cases.
				2110	* (An underscore _ has been inserted into some of the numbers to ensure
				2111	* that magic strings like 32 do not appear in this table. It should be
				2112	* ignored.)
				2113	*
				2114	* At e = 1 bits, k=1 (0.000000) is best
				2115	* At e = 2 bits, k=1 (0.500000) is best
				2116	* At e = 4 bits, k=1 (1.500000) is best
				2117	* At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
				2118	* At e = 1_6 bits, k=2 (6.000000) is best
				2119	* At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
				2120	* At e = 3_2 bits, k=3 (10.750000) is best
				2121	* At e = 6_4 bits, k=3 (18.750000) is best
				2122	* At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
				2123	* At e = 128 bits, k=4 (3_2.400000) is best
				2124	* At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
				2125	* At e = 256 bits, k=5 (57.500000) is best
				2126	* At e = 512 bits, k=5 (100.1_66667) is best
				2127	* At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
				2128	* At e = 1024 bits, k=6 (177.142857) is best
				2129	* At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
				2130	* At e = 2048 bits, k=7 (318.875000) is best
				2131	* At e = 4096 bits, k=7 (574.875000) is best
				2132	*
				2133	* The numbers in parentheses are the expected number of multiplications
				2134	* needed to do the computation. The normal russian-peasant modular
				2135	* exponentiation technique always uses (e-1)/2. For exponents as
				2136	* small as 192 bits (below the range of current factoring algorithms),
				2137	* half of the multiplies are eliminated, 45.2 as opposed to the naive
				2138	* 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
				2139	* proper is just over half of multiplying, but the Montgomery
				2140	* reduction in each case is also a multiply), that's 143.25
				2141	* multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
				2142	* For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
				2143	* 24.3% savings. It asymptotically approaches 25%.
				2144	*
				2145	* Um, actually there's a slightly more accurate way to count, which
				2146	* really is the average number of multiplies required, averaged
				2147	* uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
				2148	* It's based on the recurrence that for the last b bits, b <= k, at
				2149	* most one multiply is needed (and none at all 1/2^b of the time),
				2150	* while when b > k, the odds are 1/2 each way that the bit will be
				2151	* 0 (meaning no multiplies to reduce it to the b-1-bit case) and
				2152	* 1/2 that the bit will be 1, starting a k-bit window and requiring
				2153	* 1 multiply beyond the b-k-bit case. Since the most significant
				2154	* bit is always 1, a k-bit window always starts there, and that
				2155	* multiply is by 1, so it isn't a multiply at all. Thus, the
				2156	* number of multiplies is simply that needed for the last e-k bits.
				2157	* This recurrence produces:
				2158	*
				2159	* At e = 1 bits, k=1 (0.000000) is best
				2160	* At e = 2 bits, k=1 (0.500000) is best
				2161	* At e = 4 bits, k=1 (1.500000) is best
				2162	* At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
				2163	* At e = 8 bits, k=2 (3.109375) is best
				2164	* At e = 1_6 bits, k=2 (5.777771) is best
				2165	* At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
				2166	* At e = 3_2 bits, k=3 (10.437492) is best
				2167	* At e = 6_4 bits, k=3 (18.437500) is best
				2168	* At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
				2169	* At e = 128 bits, k=4 (3_2.040000) is best
				2170	* At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
				2171	* At e = 256 bits, k=5 (57.111111) is best
				2172	* At e = 512 bits, k=5 (99.777778) is best
				2173	* At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
				2174	* At e = 1024 bits, k=6 (176.734694) is best
				2175	* At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
				2176	* At e = 2048 bits, k=7 (318.453125) is best
				2177	* At e = 4096 bits, k=7 (574.453125) is best
				2178	*
				2179	* This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
				2180	* of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
				2181	* (The numbers past that are k=8 at 4609 and k=9 at 11521,
				2182	* vs. one more in each case for the approximation.)
				2183	*
				2184	* Given that exponents for which k>7 are useful are uncommon,
				2185	* a fixed size table for k <= 7 is used for simplicity.
				2186	*
				2187	* The basic number of squarings needed is e-1, although a k-bit
				2188	* window (for k > 1) can save, on average, k-2 of those, too.
				2189	* That savings currently isn't counted here. It would drive the
				2190	* crossover points slightly lower.
				2191	* (Actually, this win is also reduced in the DoubleExpMod case,
				2192	* meaning we'd have to split the tables. Except for that, the
				2193	* multiplies by powers of the two bases are independent, so
				2194	* the same logic applies to each as the single case.)
				2195	*
				2196	* Table entry i is the largest number of bits in an exponent to
				2197	* process with a window size of i+1. Entry 6 is the largest
				2198	* possible unsigned number, so the window will never be more
				2199	* than 7 bits, requiring 2^6 = 0x40 slots.
				2200	*/
				2201	#define BNEXPMOD_MAX_WINDOW 7
				2202	static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
				2203	5, 23, 80, 240, 672, 1792, (unsigned)-1
				2204	/* 7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
				2205	};
				2206
				2207	/*
				2208	* Perform modular exponentiation, as fast as possible! This uses
				2209	* Montgomery reduction, optimized squaring, and windowed exponentiation.
				2210	* The modulus "mod" MUST be odd!
				2211	*
				2212	* This returns 0 on success, -1 on out of memory.
				2213	*
				2214	* The window algorithm:
				2215	* The idea is to keep a running product of b1 = n^(high-order bits of exp),
				2216	* and then keep appending exponent bits to it. The following patterns
				2217	* apply to a 3-bit window (k = 3):
				2218	* To append 0: square
				2219	* To append 1: square, multiply by n^1
				2220	* To append 10: square, multiply by n^1, square
				2221	* To append 11: square, square, multiply by n^3
				2222	* To append 100: square, multiply by n^1, square, square
				2223	* To append 101: square, square, square, multiply by n^5
				2224	* To append 110: square, square, multiply by n^3, square
				2225	* To append 111: square, square, square, multiply by n^7
				2226	*
				2227	* Since each pattern involves only one multiply, the longer the pattern
				2228	* the better, except that a 0 (no multiplies) can be appended directly.
				2229	* We precompute a table of odd powers of n, up to 2^k, and can then
				2230	* multiply k bits of exponent at a time. Actually, assuming random
				2231	* exponents, there is on average one zero bit between needs to
				2232	* multiply (1/2 of the time there's none, 1/4 of the time there's 1,
				2233	* 1/8 of the time, there's 2, 1/32 of the time, there's 3, etc.), so
				2234	* you have to do one multiply per k+1 bits of exponent.
				2235	*
				2236	* The loop walks down the exponent, squaring the result buffer as
				2237	* it goes. There is a wbits+1 bit lookahead buffer, buf, that is
				2238	* filled with the upcoming exponent bits. (What is read after the
				2239	* end of the exponent is unimportant, but it is filled with zero here.)
				2240	* When the most-significant bit of this buffer becomes set, i.e.
				2241	* (buf & tblmask) != 0, we have to decide what pattern to multiply
				2242	* by, and when to do it. We decide, remember to do it in future
				2243	* after a suitable number of squarings have passed (e.g. a pattern
				2244	* of "100" in the buffer requires that we multiply by n^1 immediately;
				2245	* a pattern of "110" calls for multiplying by n^3 after one more
				2246	* squaring), clear the buffer, and continue.
				2247	*
				2248	* When we start, there is one more optimization: the result buffer
				2249	* is implcitly one, so squaring it or multiplying by it can be
				2250	* optimized away. Further, if we start with a pattern like "100"
				2251	* in the lookahead window, rather than placing n into the buffer
				2252	* and then starting to square it, we have already computed n^2
				2253	* to compute the odd-powers table, so we can place that into
				2254	* the buffer and save a squaring.
				2255	*
				2256	* This means that if you have a k-bit window, to compute n^z,
				2257	* where z is the high k bits of the exponent, 1/2 of the time
				2258	* it requires no squarings. 1/4 of the time, it requires 1
				2259	* squaring, ... 1/2^(k-1) of the time, it reqires k-2 squarings.
				2260	* And the remaining 1/2^(k-1) of the time, the top k bits are a
				2261	* 1 followed by k-1 0 bits, so it again only requires k-2
				2262	* squarings, not k-1. The average of these is 1. Add that
				2263	* to the one squaring we have to do to compute the table,
				2264	* and you'll see that a k-bit window saves k-2 squarings
				2265	* as well as reducing the multiplies. (It actually doesn't
				2266	* hurt in the case k = 1, either.)
				2267	*
				2268	* n must have mlen words allocated. Although fewer may be in use
				2269	* when n is passed in, all are in use on exit.
				2270	*/
				2271	int
				2272	lbnExpMod_32(BNWORD32 result, BNWORD32 const n, unsigned nlen,
				2273	BNWORD32 const e, unsigned elen, BNWORD32 mod, unsigned mlen)
				2274	{
				2275	BNWORD32 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
				2276	/* Table of odd powers of n */
				2277	unsigned ebits; /* Exponent bits */
				2278	unsigned wbits; /* Window size */
				2279	unsigned tblmask; /* Mask of exponentiation window */
				2280	BNWORD32 bitpos; /* Mask of current look-ahead bit */
				2281	unsigned buf; /* Buffer of exponent bits */
				2282	unsigned multpos; /* Where to do pending multiply */
				2283	BNWORD32 const mult; / What to multiply by */
				2284	unsigned i; /* Loop counter */
				2285	int isone; /* Flag: accum. is implicitly one */
				2286	BNWORD32 a, b; /* Working buffers/accumulators */
				2287	BNWORD32 t; / Pointer into the working buffers */
				2288	BNWORD32 inv; /* mod^-1 modulo 2^32 */
				2289	int y; /* bnYield() result */
				2290
				2291	assert(mlen);
				2292	assert(nlen <= mlen);
				2293
				2294	/* First, a couple of trivial cases. */
				2295	elen = lbnNorm_32(e, elen);
				2296	if (!elen) {
				2297	/* x ^ 0 == 1 */
				2298	lbnZero_32(result, mlen);
				2299	BIGLITTLE(result[-1],result[0]) = 1;
				2300	return 0;
				2301	}
				2302	ebits = lbnBits_32(e, elen);
				2303	if (ebits == 1) {
				2304	/* x ^ 1 == x */
				2305	if (n != result)
				2306	lbnCopy_32(result, n, nlen);
				2307	if (mlen > nlen)
				2308	lbnZero_32(BIGLITTLE(result-nlen,result+nlen),
				2309	mlen-nlen);
				2310	return 0;
				2311	}
				2312
				2313	/* Okay, now move the exponent pointer to the most-significant word */
				2314	e = BIGLITTLE(e-elen, e+elen-1);
				2315
				2316	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
				2317	wbits = 0;
				2318	while (ebits > bnExpModThreshTable[wbits])
				2319	wbits++;
				2320
				2321	/* Allocate working storage: two product buffers and the tables. */
				2322	LBNALLOC(a, BNWORD32, 2*mlen);
				2323	if (!a)
				2324	return -1;
				2325	LBNALLOC(b, BNWORD32, 2*mlen);
				2326	if (!b) {
				2327	LBNFREE(a, 2*mlen);
				2328	return -1;
				2329	}
				2330
				2331	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
				2332	tblmask = 1u << wbits;
				2333
				2334	/* We have the result buffer available, so use it. */
				2335	table[0] = result;
				2336
				2337	/*
				2338	* Okay, we now have a minimal-sized table - expand it.
				2339	* This is allowed to fail! If so, scale back the table size
				2340	* and proceed.
				2341	*/
				2342	for (i = 1; i < tblmask; i++) {
				2343	LBNALLOC(t, BNWORD32, mlen);
				2344	if (!t) /* Out of memory! Quit the loop. */
				2345	break;
				2346	table[i] = t;
				2347	}
				2348
				2349	/* If we stopped, with i < tblmask, shrink the tables appropriately */
				2350	while (tblmask > i) {
				2351	wbits--;
				2352	tblmask >>= 1;
				2353	}
				2354	/* Free up our overallocations */
				2355	while (--i > tblmask)
				2356	LBNFREE(table[i], mlen);
				2357
				2358	/* Okay, fill in the table */
				2359
				2360	/* Compute the necessary modular inverse */
				2361	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
				2362
				2363	/* Convert n to Montgomery form */
				2364
				2365	/* Move n up "mlen" words into a */
				2366	t = BIGLITTLE(a-mlen, a+mlen);
				2367	lbnCopy_32(t, n, nlen);
				2368	lbnZero_32(a, mlen);
				2369	/* Do the division - lose the quotient into the high-order words */
				2370	(void)lbnDiv_32(t, a, mlen+nlen, mod, mlen);
				2371	/* Copy into first table entry */
				2372	lbnCopy_32(table[0], a, mlen);
				2373
				2374	/* Square a into b */
				2375	lbnMontSquare_32(b, a, mod, mlen, inv);
				2376
				2377	/* Use high half of b to initialize the table */
				2378	t = BIGLITTLE(b-mlen, b+mlen);
				2379	for (i = 1; i < tblmask; i++) {
				2380	lbnMontMul_32(a, t, table[i-1], mod, mlen, inv);
				2381	lbnCopy_32(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
				2382	#if BNYIELD
				2383	if (bnYield && (y = bnYield()) < 0)
				2384	goto yield;
				2385	#endif
				2386	}
				2387
				2388	/* We might use b = n^2 later... */
				2389
				2390	/* Initialze the fetch pointer */
				2391	bitpos = (BNWORD32)1 << ((ebits-1) & (32-1)); /* Initialize mask */
				2392
				2393	/* This should point to the msbit of e */
				2394	assert((*e & bitpos) != 0);
				2395
				2396	/*
				2397	* Pre-load the window. Becuase the window size is
				2398	* never larger than the exponent size, there is no need to
				2399	* detect running off the end of e in here.
				2400	*
				2401	* The read-ahead is controlled by elen and the bitpos mask.
				2402	* Note that this is ahead of ebits, which tracks the
				2403	* most significant end of the window. The purpose of this
				2404	* initialization is to get the two wbits+1 bits apart,
				2405	* like they should be.
				2406	*
				2407	* Note that bitpos and e1len together keep track of the
				2408	* lookahead read pointer in the exponent that is used here.
				2409	*/
				2410	buf = 0;
				2411	for (i = 0; i <= wbits; i++) {
				2412	buf = (buf << 1) \| ((*e & bitpos) != 0);
				2413	bitpos >>= 1;
				2414	if (!bitpos) {
				2415	BIGLITTLE(e++,e--);
				2416	bitpos = (BNWORD32)1 << (32-1);
				2417	elen--;
				2418	}
				2419	}
				2420	assert(buf & tblmask);
				2421
				2422	/*
				2423	* Set the pending multiply positions to a location that will
				2424	* never be encountered, thus ensuring that nothing will happen
				2425	* until the need for a multiply appears and one is scheduled.
				2426	*/
				2427	multpos = ebits; /* A NULL value */
				2428	mult = 0; /* Force a crash if we use these */
				2429
				2430	/*
				2431	* Okay, now begins the real work. The first step is
				2432	* slightly magic, so it's done outside the main loop,
				2433	* but it's very similar to what's inside.
				2434	*/
				2435	ebits--; /* Start processing the first bit... */
				2436	isone = 1;
				2437
				2438	/*
				2439	* This is just like the multiply in the loop, except that
				2440	* - We know the msbit of buf is set, and
				2441	* - We have the extra value n^2 floating around.
				2442	* So, do the usual computation, and if the result is that
				2443	* the buffer should be multiplied by n^1 immediately
				2444	* (which we'd normally then square), we multiply it
				2445	* (which reduces to a copy, which reduces to setting a flag)
				2446	* by n^2 and skip the squaring. Thus, we do the
				2447	* multiply and the squaring in one step.
				2448	*/
				2449	assert(buf & tblmask);
				2450	multpos = ebits - wbits;
				2451	while ((buf & 1) == 0) {
				2452	buf >>= 1;
				2453	multpos++;
				2454	}
				2455	/* Intermediates can wrap, but final must NOT */
				2456	assert(multpos <= ebits);
				2457	mult = table[buf>>1];
				2458	buf = 0;
				2459
				2460	/* Special case: use already-computed value sitting in buffer */
				2461	if (multpos == ebits)
				2462	isone = 0;
				2463
				2464	/*
				2465	* At this point, the buffer (which is the high half of b) holds
				2466	* either 1 (implicitly, as the "isone" flag is set), or n^2.
				2467	*/
				2468
				2469	/*
				2470	* The main loop. The procedure is:
				2471	* - Advance the window
				2472	* - If the most-significant bit of the window is set,
				2473	* schedule a multiply for the appropriate time in the
				2474	* future (may be immediately)
				2475	* - Perform any pending multiples
				2476	* - Check for termination
				2477	* - Square the buffer
				2478	*
				2479	* At any given time, the acumulated product is held in
				2480	* the high half of b.
				2481	*/
				2482	for (;;) {
				2483	ebits--;
				2484
				2485	/* Advance the window */
				2486	assert(buf < tblmask);
				2487	buf <<= 1;
				2488	/*
				2489	* This reads ahead of the current exponent position
				2490	* (controlled by ebits), so we have to be able to read
				2491	* past the lsb of the exponents without error.
				2492	*/
				2493	if (elen) {
				2494	buf \|= ((*e & bitpos) != 0);
				2495	bitpos >>= 1;
				2496	if (!bitpos) {
				2497	BIGLITTLE(e++,e--);
				2498	bitpos = (BNWORD32)1 << (32-1);
				2499	elen--;
				2500	}
				2501	}
				2502
				2503	/* Examine the window for pending multiplies */
				2504	if (buf & tblmask) {
				2505	multpos = ebits - wbits;
				2506	while ((buf & 1) == 0) {
				2507	buf >>= 1;
				2508	multpos++;
				2509	}
				2510	/* Intermediates can wrap, but final must NOT */
				2511	assert(multpos <= ebits);
				2512	mult = table[buf>>1];
				2513	buf = 0;
				2514	}
				2515
				2516	/* If we have a pending multiply, do it */
				2517	if (ebits == multpos) {
				2518	/* Multiply by the table entry remembered previously */
				2519	t = BIGLITTLE(b-mlen, b+mlen);
				2520	if (isone) {
				2521	/* Multiply by 1 is a trivial case */
				2522	lbnCopy_32(t, mult, mlen);
				2523	isone = 0;
				2524	} else {
				2525	lbnMontMul_32(a, t, mult, mod, mlen, inv);
				2526	/* Swap a and b */
				2527	t = a; a = b; b = t;
				2528	}
				2529	}
				2530
				2531	/* Are we done? */
				2532	if (!ebits)
				2533	break;
				2534
				2535	/* Square the input */
				2536	if (!isone) {
				2537	t = BIGLITTLE(b-mlen, b+mlen);
				2538	lbnMontSquare_32(a, t, mod, mlen, inv);
				2539	/* Swap a and b */
				2540	t = a; a = b; b = t;
				2541	}
				2542	#if BNYIELD
				2543	if (bnYield && (y = bnYield()) < 0)
				2544	goto yield;
				2545	#endif
				2546	} /* for (;;) */
				2547
				2548	assert(!isone);
				2549	assert(!buf);
				2550
				2551	/* DONE! */
				2552
				2553	/* Convert result out of Montgomery form */
				2554	t = BIGLITTLE(b-mlen, b+mlen);
				2555	lbnCopy_32(b, t, mlen);
				2556	lbnZero_32(t, mlen);
				2557	lbnMontReduce_32(b, mod, mlen, inv);
				2558	lbnCopy_32(result, t, mlen);
				2559	/*
				2560	* Clean up - free intermediate storage.
				2561	* Do NOT free table[0], which is the result
				2562	* buffer.
				2563	*/
				2564	y = 0;
				2565	#if BNYIELD
				2566	yield:
				2567	#endif
				2568	while (--tblmask)
				2569	LBNFREE(table[tblmask], mlen);
				2570	LBNFREE(b, 2*mlen);
				2571	LBNFREE(a, 2*mlen);
				2572
				2573	return y; /* Success */
				2574	}
				2575
				2576	/*
				2577	* Compute and return n1^e1 * n2^e2 mod "mod".
				2578	* result may be either input buffer, or something separate.
				2579	* It must be "mlen" words long.
				2580	*
				2581	* There is a current position in the exponents, which is kept in e1bits.
				2582	* (The exponents are swapped if necessary so e1 is the longer of the two.)
				2583	* At any given time, the value in the accumulator is
				2584	* n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
				2585	* As e1bits is counted down, this is updated, by squaring it and doing
				2586	* any necessary multiplies.
				2587	* To decide on the necessary multiplies, two windows, each w1bits+1 bits
				2588	* wide, are maintained in buf1 and buf2, which read ahead of the
				2589	* e1bits position (with appropriate handling of the case when e1bits
				2590	* drops below w1bits+1). When the most-significant bit of either window
				2591	* becomes set, indicating that something needs to be multiplied by
				2592	* the accumulator or it will get out of sync, the window is examined
				2593	* to see which power of n1 or n2 to multiply by, and when (possibly
				2594	* later, if the power is greater than 1) the multiply should take
				2595	* place. Then the multiply and its location are remembered and the
				2596	* window is cleared.
				2597	*
				2598	* If we had every power of n1 in the table, the multiply would always
				2599	* be w1bits steps in the future. But we only keep the odd powers,
				2600	* so instead of waiting w1bits squarings and then multiplying
				2601	* by n1^k, we wait w1bits-k squarings and multiply by n1.
				2602	*
				2603	* Actually, w2bits can be less than w1bits, but the window is the same
				2604	* size, to make it easier to keep track of where we're reading. The
				2605	* appropriate number of low-order bits of the window are just ignored.
				2606	*/
				2607	int
				2608	lbnDoubleExpMod_32(BNWORD32 *result,
				2609	BNWORD32 const *n1, unsigned n1len,
				2610	BNWORD32 const *e1, unsigned e1len,
				2611	BNWORD32 const *n2, unsigned n2len,
				2612	BNWORD32 const *e2, unsigned e2len,
				2613	BNWORD32 *mod, unsigned mlen)
				2614	{
				2615	BNWORD32 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
				2616	/* Table of odd powers of n1 */
				2617	BNWORD32 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
				2618	/* Table of odd powers of n2 */
				2619	unsigned e1bits, e2bits; /* Exponent bits */
				2620	unsigned w1bits, w2bits; /* Window sizes */
				2621	unsigned tblmask; /* Mask of exponentiation window */
				2622	BNWORD32 bitpos; /* Mask of current look-ahead bit */
				2623	unsigned buf1, buf2; /* Buffer of exponent bits */
				2624	unsigned mult1pos, mult2pos; /* Where to do pending multiply */
				2625	BNWORD32 const mult1, mult2; /* What to multiply by */
				2626	unsigned i; /* Loop counter */
				2627	int isone; /* Flag: accum. is implicitly one */
				2628	BNWORD32 a, b; /* Working buffers/accumulators */
				2629	BNWORD32 t; / Pointer into the working buffers */
				2630	BNWORD32 inv; /* mod^-1 modulo 2^32 */
				2631	int y; /* bnYield() result */
				2632
				2633	assert(mlen);
				2634	assert(n1len <= mlen);
				2635	assert(n2len <= mlen);
				2636
				2637	/* First, a couple of trivial cases. */
				2638	e1len = lbnNorm_32(e1, e1len);
				2639	e2len = lbnNorm_32(e2, e2len);
				2640
				2641	/* Ensure that the first exponent is the longer */
				2642	e1bits = lbnBits_32(e1, e1len);
				2643	e2bits = lbnBits_32(e2, e2len);
				2644	if (e1bits < e2bits) {
				2645	i = e1len; e1len = e2len; e2len = i;
				2646	i = e1bits; e1bits = e2bits; e2bits = i;
				2647	t = (BNWORD32 *)n1; n1 = n2; n2 = t;
				2648	t = (BNWORD32 *)e1; e1 = e2; e2 = t;
				2649	}
				2650	assert(e1bits >= e2bits);
				2651
				2652	/* Handle a trivial case */
				2653	if (!e2len)
				2654	return lbnExpMod_32(result, n1, n1len, e1, e1len, mod, mlen);
				2655	assert(e2bits);
				2656
				2657	/* The code below fucks up if the exponents aren't at least 2 bits */
				2658	if (e1bits == 1) {
				2659	assert(e2bits == 1);
				2660
				2661	LBNALLOC(a, BNWORD32, n1len+n2len);
				2662	if (!a)
				2663	return -1;
				2664
				2665	lbnMul_32(a, n1, n1len, n2, n2len);
				2666	/* Do a direct modular reduction */
				2667	if (n1len + n2len >= mlen)
				2668	(void)lbnDiv_32(a+mlen, a, n1len+n2len, mod, mlen);
				2669	lbnCopy_32(result, a, mlen);
				2670	LBNFREE(a, n1len+n2len);
				2671	return 0;
				2672	}
				2673
				2674	/* Okay, now move the exponent pointers to the most-significant word */
				2675	e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
				2676	e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
				2677
				2678	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
				2679	w1bits = 0;
				2680	while (e1bits > bnExpModThreshTable[w1bits])
				2681	w1bits++;
				2682	w2bits = 0;
				2683	while (e2bits > bnExpModThreshTable[w2bits])
				2684	w2bits++;
				2685
				2686	assert(w1bits >= w2bits);
				2687
				2688	/* Allocate working storage: two product buffers and the tables. */
				2689	LBNALLOC(a, BNWORD32, 2*mlen);
				2690	if (!a)
				2691	return -1;
				2692	LBNALLOC(b, BNWORD32, 2*mlen);
				2693	if (!b) {
				2694	LBNFREE(a, 2*mlen);
				2695	return -1;
				2696	}
				2697
				2698	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */
				2699	tblmask = 1u << w1bits;
				2700	/* Use buf2 for its size, temporarily */
				2701	buf2 = 1u << w2bits;
				2702
				2703	LBNALLOC(t, BNWORD32, mlen);
				2704	if (!t) {
				2705	LBNFREE(b, 2*mlen);
				2706	LBNFREE(a, 2*mlen);
				2707	return -1;
				2708	}
				2709	table1[0] = t;
				2710	table2[0] = result;
				2711
				2712	/*
				2713	* Okay, we now have some minimal-sized tables - expand them.
				2714	* This is allowed to fail! If so, scale back the table sizes
				2715	* and proceed. We allocate both tables at the same time
				2716	* so if it fails partway through, they'll both be a reasonable
				2717	* size rather than one huge and one tiny.
				2718	* When i passes buf2 (the number of entries in the e2 window,
				2719	* which may be less than the number of entries in the e1 window),
				2720	* stop allocating e2 space.
				2721	*/
				2722	for (i = 1; i < tblmask; i++) {
				2723	LBNALLOC(t, BNWORD32, mlen);
				2724	if (!t) /* Out of memory! Quit the loop. */
				2725	break;
				2726	table1[i] = t;
				2727	if (i < buf2) {
				2728	LBNALLOC(t, BNWORD32, mlen);
				2729	if (!t) {
				2730	LBNFREE(table1[i], mlen);
				2731	break;
				2732	}
				2733	table2[i] = t;
				2734	}
				2735	}
				2736
				2737	/* If we stopped, with i < tblmask, shrink the tables appropriately */
				2738	while (tblmask > i) {
				2739	w1bits--;
				2740	tblmask >>= 1;
				2741	}
				2742	/* Free up our overallocations */
				2743	while (--i > tblmask) {
				2744	if (i < buf2)
				2745	LBNFREE(table2[i], mlen);
				2746	LBNFREE(table1[i], mlen);
				2747	}
				2748	/* And shrink the second window too, if needed */
				2749	if (w2bits > w1bits) {
				2750	w2bits = w1bits;
				2751	buf2 = tblmask;
				2752	}
				2753
				2754	/*
				2755	* From now on, use the w2bits variable for the difference
				2756	* between w1bits and w2bits.
				2757	*/
				2758	w2bits = w1bits-w2bits;
				2759
				2760	/* Okay, fill in the tables */
				2761
				2762	/* Compute the necessary modular inverse */
				2763	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
				2764
				2765	/* Convert n1 to Montgomery form */
				2766
				2767	/* Move n1 up "mlen" words into a */
				2768	t = BIGLITTLE(a-mlen, a+mlen);
				2769	lbnCopy_32(t, n1, n1len);
				2770	lbnZero_32(a, mlen);
				2771	/* Do the division - lose the quotient into the high-order words */
				2772	(void)lbnDiv_32(t, a, mlen+n1len, mod, mlen);
				2773	/* Copy into first table entry */
				2774	lbnCopy_32(table1[0], a, mlen);
				2775
				2776	/* Square a into b */
				2777	lbnMontSquare_32(b, a, mod, mlen, inv);
				2778
				2779	/* Use high half of b to initialize the first table */
				2780	t = BIGLITTLE(b-mlen, b+mlen);
				2781	for (i = 1; i < tblmask; i++) {
				2782	lbnMontMul_32(a, t, table1[i-1], mod, mlen, inv);
				2783	lbnCopy_32(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
				2784	#if BNYIELD
				2785	if (bnYield && (y = bnYield()) < 0)
				2786	goto yield;
				2787	#endif
				2788	}
				2789
				2790	/* Convert n2 to Montgomery form */
				2791
				2792	t = BIGLITTLE(a-mlen, a+mlen);
				2793	/* Move n2 up "mlen" words into a */
				2794	lbnCopy_32(t, n2, n2len);
				2795	lbnZero_32(a, mlen);
				2796	/* Do the division - lose the quotient into the high-order words */
				2797	(void)lbnDiv_32(t, a, mlen+n2len, mod, mlen);
				2798	/* Copy into first table entry */
				2799	lbnCopy_32(table2[0], a, mlen);
				2800
				2801	/* Square it into a */
				2802	lbnMontSquare_32(a, table2[0], mod, mlen, inv);
				2803	/* Copy to b, low half */
				2804	lbnCopy_32(b, t, mlen);
				2805
				2806	/* Use b to initialize the second table */
				2807	for (i = 1; i < buf2; i++) {
				2808	lbnMontMul_32(a, b, table2[i-1], mod, mlen, inv);
				2809	lbnCopy_32(table2[i], t, mlen);
				2810	#if BNYIELD
				2811	if (bnYield && (y = bnYield()) < 0)
				2812	goto yield;
				2813	#endif
				2814	}
				2815
				2816	/*
				2817	* Okay, a recap: at this point, the low part of b holds
				2818	* n2^2, the high part holds n1^2, and the tables are
				2819	* initialized with the odd powers of n1 and n2 from 1
				2820	* through 2tblmask-1 and 2buf2-1.
				2821	*
				2822	* We might use those squares in b later, or we might not.
				2823	*/
				2824
				2825	/* Initialze the fetch pointer */
				2826	bitpos = (BNWORD32)1 << ((e1bits-1) & (32-1)); /* Initialize mask */
				2827
				2828	/* This should point to the msbit of e1 */
				2829	assert((*e1 & bitpos) != 0);
				2830
				2831	/*
				2832	* Pre-load the windows. Becuase the window size is
				2833	* never larger than the exponent size, there is no need to
				2834	* detect running off the end of e1 in here.
				2835	*
				2836	* The read-ahead is controlled by e1len and the bitpos mask.
				2837	* Note that this is ahead of e1bits, which tracks the
				2838	* most significant end of the window. The purpose of this
				2839	* initialization is to get the two w1bits+1 bits apart,
				2840	* like they should be.
				2841	*
				2842	* Note that bitpos and e1len together keep track of the
				2843	* lookahead read pointer in the exponent that is used here.
				2844	* e2len is not decremented, it is only ever compared with
				2845	* e1len as that is decremented.
				2846	*/
				2847	buf1 = buf2 = 0;
				2848	for (i = 0; i <= w1bits; i++) {
				2849	buf1 = (buf1 << 1) \| ((*e1 & bitpos) != 0);
				2850	if (e1len <= e2len)
				2851	buf2 = (buf2 << 1) \| ((*e2 & bitpos) != 0);
				2852	bitpos >>= 1;
				2853	if (!bitpos) {
				2854	BIGLITTLE(e1++,e1--);
				2855	if (e1len <= e2len)
				2856	BIGLITTLE(e2++,e2--);
				2857	bitpos = (BNWORD32)1 << (32-1);
				2858	e1len--;
				2859	}
				2860	}
				2861	assert(buf1 & tblmask);
				2862
				2863	/*
				2864	* Set the pending multiply positions to a location that will
				2865	* never be encountered, thus ensuring that nothing will happen
				2866	* until the need for a multiply appears and one is scheduled.
				2867	*/
				2868	mult1pos = mult2pos = e1bits; /* A NULL value */
				2869	mult1 = mult2 = 0; /* Force a crash if we use these */
				2870
				2871	/*
				2872	* Okay, now begins the real work. The first step is
				2873	* slightly magic, so it's done outside the main loop,
				2874	* but it's very similar to what's inside.
				2875	*/
				2876	isone = 1; /* Buffer is implicitly 1, so replace * by copy */
				2877	e1bits--; /* Start processing the first bit... */
				2878
				2879	/*
				2880	* This is just like the multiply in the loop, except that
				2881	* - We know the msbit of buf1 is set, and
				2882	* - We have the extra value n1^2 floating around.
				2883	* So, do the usual computation, and if the result is that
				2884	* the buffer should be multiplied by n1^1 immediately
				2885	* (which we'd normally then square), we multiply it
				2886	* (which reduces to a copy, which reduces to setting a flag)
				2887	* by n1^2 and skip the squaring. Thus, we do the
				2888	* multiply and the squaring in one step.
				2889	*/
				2890	assert(buf1 & tblmask);
				2891	mult1pos = e1bits - w1bits;
				2892	while ((buf1 & 1) == 0) {
				2893	buf1 >>= 1;
				2894	mult1pos++;
				2895	}
				2896	/* Intermediates can wrap, but final must NOT */
				2897	assert(mult1pos <= e1bits);
				2898	mult1 = table1[buf1>>1];
				2899	buf1 = 0;
				2900
				2901	/* Special case: use already-computed value sitting in buffer */
				2902	if (mult1pos == e1bits)
				2903	isone = 0;
				2904
				2905	/*
				2906	* The first multiply by a power of n2. Similar, but
				2907	* we might not even want to schedule a multiply if e2 is
				2908	* shorter than e1, and the window might be shorter so
				2909	* we have to leave the low w2bits bits alone.
				2910	*/
				2911	if (buf2 & tblmask) {
				2912	/* Remember low-order bits for later */
				2913	i = buf2 & ((1u << w2bits) - 1);
				2914	buf2 >>= w2bits;
				2915	mult2pos = e1bits - w1bits + w2bits;
				2916	while ((buf2 & 1) == 0) {
				2917	buf2 >>= 1;
				2918	mult2pos++;
				2919	}
				2920	assert(mult2pos <= e1bits);
				2921	mult2 = table2[buf2>>1];
				2922	buf2 = i;
				2923
				2924	if (mult2pos == e1bits) {
				2925	t = BIGLITTLE(b-mlen, b+mlen);
				2926	if (isone) {
				2927	lbnCopy_32(t, b, mlen); /* Copy low to high */
				2928	isone = 0;
				2929	} else {
				2930	lbnMontMul_32(a, t, b, mod, mlen, inv);
				2931	t = a; a = b; b = t;
				2932	}
				2933	}
				2934	}
				2935
				2936	/*
				2937	* At this point, the buffer (which is the high half of b)
				2938	* holds either 1 (implicitly, as the "isone" flag is set),
				2939	* n1^2, n2^2 or n1^2 * n2^2.
				2940	*/
				2941
				2942	/*
				2943	* The main loop. The procedure is:
				2944	* - Advance the windows
				2945	* - If the most-significant bit of a window is set,
				2946	* schedule a multiply for the appropriate time in the
				2947	* future (may be immediately)
				2948	* - Perform any pending multiples
				2949	* - Check for termination
				2950	* - Square the buffers
				2951	*
				2952	* At any given time, the acumulated product is held in
				2953	* the high half of b.
				2954	*/
				2955	for (;;) {
				2956	e1bits--;
				2957
				2958	/* Advance the windows */
				2959	assert(buf1 < tblmask);
				2960	buf1 <<= 1;
				2961	assert(buf2 < tblmask);
				2962	buf2 <<= 1;
				2963	/*
				2964	* This reads ahead of the current exponent position
				2965	* (controlled by e1bits), so we have to be able to read
				2966	* past the lsb of the exponents without error.
				2967	*/
				2968	if (e1len) {
				2969	buf1 \|= ((*e1 & bitpos) != 0);
				2970	if (e1len <= e2len)
				2971	buf2 \|= ((*e2 & bitpos) != 0);
				2972	bitpos >>= 1;
				2973	if (!bitpos) {
				2974	BIGLITTLE(e1++,e1--);
				2975	if (e1len <= e2len)
				2976	BIGLITTLE(e2++,e2--);
				2977	bitpos = (BNWORD32)1 << (32-1);
				2978	e1len--;
				2979	}
				2980	}
				2981
				2982	/* Examine the first window for pending multiplies */
				2983	if (buf1 & tblmask) {
				2984	mult1pos = e1bits - w1bits;
				2985	while ((buf1 & 1) == 0) {
				2986	buf1 >>= 1;
				2987	mult1pos++;
				2988	}
				2989	/* Intermediates can wrap, but final must NOT */
				2990	assert(mult1pos <= e1bits);
				2991	mult1 = table1[buf1>>1];
				2992	buf1 = 0;
				2993	}
				2994
				2995	/*
				2996	* Examine the second window for pending multiplies.
				2997	* Window 2 can be smaller than window 1, but we
				2998	* keep the same number of bits in buf2, so we need
				2999	* to ignore any low-order bits in the buffer when
				3000	* computing what to multiply by, and recompute them
				3001	* later.
				3002	*/
				3003	if (buf2 & tblmask) {
				3004	/* Remember low-order bits for later */
				3005	i = buf2 & ((1u << w2bits) - 1);
				3006	buf2 >>= w2bits;
				3007	mult2pos = e1bits - w1bits + w2bits;
				3008	while ((buf2 & 1) == 0) {
				3009	buf2 >>= 1;
				3010	mult2pos++;
				3011	}
				3012	assert(mult2pos <= e1bits);
				3013	mult2 = table2[buf2>>1];
				3014	buf2 = i;
				3015	}
				3016
				3017
				3018	/* If we have a pending multiply for e1, do it */
				3019	if (e1bits == mult1pos) {
				3020	/* Multiply by the table entry remembered previously */
				3021	t = BIGLITTLE(b-mlen, b+mlen);
				3022	if (isone) {
				3023	/* Multiply by 1 is a trivial case */
				3024	lbnCopy_32(t, mult1, mlen);
				3025	isone = 0;
				3026	} else {
				3027	lbnMontMul_32(a, t, mult1, mod, mlen, inv);
				3028	/* Swap a and b */
				3029	t = a; a = b; b = t;
				3030	}
				3031	}
				3032
				3033	/* If we have a pending multiply for e2, do it */
				3034	if (e1bits == mult2pos) {
				3035	/* Multiply by the table entry remembered previously */
				3036	t = BIGLITTLE(b-mlen, b+mlen);
				3037	if (isone) {
				3038	/* Multiply by 1 is a trivial case */
				3039	lbnCopy_32(t, mult2, mlen);
				3040	isone = 0;
				3041	} else {
				3042	lbnMontMul_32(a, t, mult2, mod, mlen, inv);
				3043	/* Swap a and b */
				3044	t = a; a = b; b = t;
				3045	}
				3046	}
				3047
				3048	/* Are we done? */
				3049	if (!e1bits)
				3050	break;
				3051
				3052	/* Square the buffer */
				3053	if (!isone) {
				3054	t = BIGLITTLE(b-mlen, b+mlen);
				3055	lbnMontSquare_32(a, t, mod, mlen, inv);
				3056	/* Swap a and b */
				3057	t = a; a = b; b = t;
				3058	}
				3059	#if BNYIELD
				3060	if (bnYield && (y = bnYield()) < 0)
				3061	goto yield;
				3062	#endif
				3063	} /* for (;;) */
				3064
				3065	assert(!isone);
				3066	assert(!buf1);
				3067	assert(!buf2);
				3068
				3069	/* DONE! */
				3070
				3071	/* Convert result out of Montgomery form */
				3072	t = BIGLITTLE(b-mlen, b+mlen);
				3073	lbnCopy_32(b, t, mlen);
				3074	lbnZero_32(t, mlen);
				3075	lbnMontReduce_32(b, mod, mlen, inv);
				3076	lbnCopy_32(result, t, mlen);
				3077
				3078	/* Clean up - free intermediate storage */
				3079	y = 0;
				3080	#if BNYIELD
				3081	yield:
				3082	#endif
				3083	buf2 = tblmask >> w2bits;
				3084	while (--tblmask) {
				3085	if (tblmask < buf2)
				3086	LBNFREE(table2[tblmask], mlen);
				3087	LBNFREE(table1[tblmask], mlen);
				3088	}
				3089	t = table1[0];
				3090	LBNFREE(t, mlen);
				3091	LBNFREE(b, 2*mlen);
				3092	LBNFREE(a, 2*mlen);
				3093
				3094	return y; /* Success */
				3095	}
				3096
				3097	/*
				3098	* 2^exp (mod mod). This is an optimized version for use in Fermat
				3099	* tests. The input value of n is ignored; it is returned with
				3100	* "mlen" words valid.
				3101	*/
				3102	int
				3103	lbnTwoExpMod_32(BNWORD32 n, BNWORD32 const exp, unsigned elen,
				3104	BNWORD32 *mod, unsigned mlen)
				3105	{
				3106	unsigned e; /* Copy of high words of the exponent */
				3107	unsigned bits; /* Assorted counter of bits */
				3108	BNWORD32 const *bitptr;
				3109	BNWORD32 bitword, bitpos;
				3110	BNWORD32 a, b, *a1;
				3111	BNWORD32 inv;
				3112	int y; /* Result of bnYield() */
				3113
				3114	assert(mlen);
				3115
				3116	bitptr = BIGLITTLE(exp-elen, exp+elen-1);
				3117	bitword = *bitptr;
				3118	assert(bitword);
				3119
				3120	/* Clear n for future use. */
				3121	lbnZero_32(n, mlen);
				3122
				3123	bits = lbnBits_32(exp, elen);
				3124
				3125	/* First, a couple of trivial cases. */
				3126	if (bits <= 1) {
				3127	/* 2 ^ 0 == 1, 2 ^ 1 == 2 */
				3128	BIGLITTLE(n[-1],n[0]) = (BNWORD32)1<<elen;
				3129	return 0;
				3130	}
				3131
				3132	/* Set bitpos to the most significant bit */
				3133	bitpos = (BNWORD32)1 << ((bits-1) & (32-1));
				3134
				3135	/* Now, count the bits in the modulus. */
				3136	bits = lbnBits_32(mod, mlen);
				3137	assert(bits > 1); /* a 1-bit modulus is just stupid... */
				3138
				3139	/*
				3140	* We start with 1<<e, where "e" is as many high bits of the
				3141	* exponent as we can manage without going over the modulus.
				3142	* This first loop finds "e".
				3143	*/
				3144	e = 1;
				3145	while (elen) {
				3146	/* Consume the first bit */
				3147	bitpos >>= 1;
				3148	if (!bitpos) {
				3149	if (!--elen)
				3150	break;
				3151	bitword = BIGLITTLE(++bitptr,--bitptr);
				3152	bitpos = (BNWORD32)1<<(32-1);
				3153	}
				3154	e = (e << 1) \| ((bitpos & bitword) != 0);
				3155	if (e >= bits) { /* Overflow! Back out. */
				3156	e >>= 1;
				3157	break;
				3158	}
				3159	}
				3160	/*
				3161	* The bit in "bitpos" being examined by the bit buffer has NOT
				3162	* been consumed yet. This may be past the end of the exponent,
				3163	* in which case elen == 1.
				3164	*/
				3165
				3166	/* Okay, now, set bit "e" in n. n is already zero. */
				3167	inv = (BNWORD32)1 << (e & (32-1));
				3168	e /= 32;
				3169	BIGLITTLE(n[-e-1],n[e]) = inv;
				3170	/*
				3171	* The effective length of n in words is now "e+1".
				3172	* This is used a little bit later.
				3173	*/
				3174
				3175	if (!elen)
				3176	return 0; /* That was easy! */
				3177
				3178	/*
				3179	* We have now processed the first few bits. The next step
				3180	* is to convert this to Montgomery form for further squaring.
				3181	*/
				3182
				3183	/* Allocate working storage: two product buffers */
				3184	LBNALLOC(a, BNWORD32, 2*mlen);
				3185	if (!a)
				3186	return -1;
				3187	LBNALLOC(b, BNWORD32, 2*mlen);
				3188	if (!b) {
				3189	LBNFREE(a, 2*mlen);
				3190	return -1;
				3191	}
				3192
				3193	/* Convert n to Montgomery form */
				3194	inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
				3195	assert(inv & 1); /* Modulus must be odd */
				3196	inv = lbnMontInv1_32(inv);
				3197	/* Move n (length e+1, remember?) up "mlen" words into b */
				3198	/* Note that we lie about a1 for a bit - it's pointing to b */
				3199	a1 = BIGLITTLE(b-mlen,b+mlen);
				3200	lbnCopy_32(a1, n, e+1);
				3201	lbnZero_32(b, mlen);
				3202	/* Do the division - dump the quotient into the high-order words */
				3203	(void)lbnDiv_32(a1, b, mlen+e+1, mod, mlen);
				3204	/*
				3205	* Now do the first squaring and modular reduction to put
				3206	* the number up in a1 where it belongs.
				3207	*/
				3208	lbnMontSquare_32(a, b, mod, mlen, inv);
				3209	/* Fix up a1 to point to where it should go. */
				3210	a1 = BIGLITTLE(a-mlen,a+mlen);
				3211
				3212	/*
				3213	* Okay, now, a1 holds the number being accumulated, and
				3214	* b is a scratch register. Start working:
				3215	*/
				3216	for (;;) {
				3217	/*
				3218	* Is the bit set? If so, double a1 as well.
				3219	* A modular doubling like this is very cheap.
				3220	*/
				3221	if (bitpos & bitword) {
				3222	/*
				3223	* Double the number. If there was a carry out OR
				3224	* the result is greater than the modulus, subract
				3225	* the modulus.
				3226	*/
				3227	if (lbnDouble_32(a1, mlen) \|\|
				3228	lbnCmp_32(a1, mod, mlen) > 0)
				3229	(void)lbnSubN_32(a1, mod, mlen);
				3230	}
				3231
				3232	/* Advance to the next exponent bit */
				3233	bitpos >>= 1;
				3234	if (!bitpos) {
				3235	if (!--elen)
				3236	break; /* Done! */
				3237	bitword = BIGLITTLE(++bitptr,--bitptr);
				3238	bitpos = (BNWORD32)1<<(32-1);
				3239	}
				3240
				3241	/*
				3242	* The elen/bitword/bitpos bit buffer is known to be
				3243	* non-empty, i.e. there is at least one more unconsumed bit.
				3244	* Thus, it's safe to square the number.
				3245	*/
				3246	lbnMontSquare_32(b, a1, mod, mlen, inv);
				3247	/* Rename result (in b) back to a (a1, really). */
				3248	a1 = b; b = a; a = a1;
				3249	a1 = BIGLITTLE(a-mlen,a+mlen);
				3250	#if BNYIELD
				3251	if (bnYield && (y = bnYield()) < 0)
				3252	goto yield;
				3253	#endif
				3254	}
				3255
				3256	/* DONE! Just a little bit of cleanup... */
				3257
				3258	/*
				3259	* Convert result out of Montgomery form... this is
				3260	* just a Montgomery reduction.
				3261	*/
				3262	lbnCopy_32(a, a1, mlen);
				3263	lbnZero_32(a1, mlen);
				3264	lbnMontReduce_32(a, mod, mlen, inv);
				3265	lbnCopy_32(n, a1, mlen);
				3266
				3267	/* Clean up - free intermediate storage */
				3268	y = 0;
				3269	#if BNYIELD
				3270	yield:
				3271	#endif
				3272	LBNFREE(b, 2*mlen);
				3273	LBNFREE(a, 2*mlen);
				3274
				3275	return y; /* Success */
				3276	}
				3277
				3278
				3279	/*
				3280	* Returns a substring of the big-endian array of bytes representation
				3281	* of the bignum array based on two parameters, the least significant
				3282	* byte number (0 to start with the least significant byte) and the
				3283	* length. I.e. the number returned is a representation of
				3284	* (bn / 2^(8lsbyte)) % 2 ^ (8buflen).
				3285	*
				3286	* It is an error if the bignum is not at least buflen + lsbyte bytes
				3287	* long.
				3288	*
				3289	* This code assumes that the compiler has the minimal intelligence
				3290	* neded to optimize divides and modulo operations on an unsigned data
				3291	* type with a power of two.
				3292	*/
				3293	void
				3294	lbnExtractBigBytes_32(BNWORD32 const n, unsigned char buf,
				3295	unsigned lsbyte, unsigned buflen)
				3296	{
				3297	BNWORD32 t = 0; /* Needed to shut up uninitialized var warnings */
				3298	unsigned shift;
				3299
				3300	lsbyte += buflen;
				3301
				3302	shift = (8 * lsbyte) % 32;
				3303	lsbyte /= (32/8); /* Convert to word offset */
				3304	BIGLITTLE(n -= lsbyte, n += lsbyte);
				3305
				3306	if (shift)
				3307	t = BIGLITTLE(n[-1],n[0]);
				3308
				3309	while (buflen--) {
				3310	if (!shift) {
				3311	t = BIGLITTLE(n++,--n);
				3312	shift = 32;
				3313	}
				3314	shift -= 8;
				3315	*buf++ = (unsigned char)(t>>shift);
				3316	}
				3317	}
				3318
				3319	/*
				3320	* Merge a big-endian array of bytes into a bignum array.
				3321	* The array had better be big enough. This is
				3322	* equivalent to extracting the entire bignum into a
				3323	* large byte array, copying the input buffer into the
				3324	* middle of it, and converting back to a bignum.
				3325	*
				3326	* The buf is "len" bytes long, and its last byte is at
				3327	* position "lsbyte" from the end of the bignum.
				3328	*
				3329	* Note that this is a pain to get right. Fortunately, it's hardly
				3330	* critical for efficiency.
				3331	*/
				3332	void
				3333	lbnInsertBigBytes_32(BNWORD32 n, unsigned char const buf,
				3334	unsigned lsbyte, unsigned buflen)
				3335	{
				3336	BNWORD32 t = 0; /* Shut up uninitialized varibale warnings */
				3337
				3338	lsbyte += buflen;
				3339
				3340	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
				3341
				3342	/* Load up leading odd bytes */
				3343	if (lsbyte % (32/8)) {
				3344	t = BIGLITTLE(--n,n++);
				3345	t >>= (lsbyte * 8) % 32;
				3346	}
				3347
				3348	/* The main loop - merge into t, storing at each word boundary. */
				3349	while (buflen--) {
				3350	t = (t << 8) \| *buf++;
				3351	if ((--lsbyte % (32/8)) == 0)
				3352	BIGLITTLE(n++,--n) = t;
				3353	}
				3354
				3355	/* Merge odd bytes in t into last word */
				3356	lsbyte = (lsbyte * 8) % 32;
				3357	if (lsbyte) {
				3358	t <<= lsbyte;
				3359	t \|= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
				3360	BIGLITTLE(n[0],n[-1]) = t;
				3361	}
				3362
				3363	return;
				3364	}
				3365
				3366	/*
				3367	* Returns a substring of the little-endian array of bytes representation
				3368	* of the bignum array based on two parameters, the least significant
				3369	* byte number (0 to start with the least significant byte) and the
				3370	* length. I.e. the number returned is a representation of
				3371	* (bn / 2^(8lsbyte)) % 2 ^ (8buflen).
				3372	*
				3373	* It is an error if the bignum is not at least buflen + lsbyte bytes
				3374	* long.
				3375	*
				3376	* This code assumes that the compiler has the minimal intelligence
				3377	* neded to optimize divides and modulo operations on an unsigned data
				3378	* type with a power of two.
				3379	*/
				3380	void
				3381	lbnExtractLittleBytes_32(BNWORD32 const n, unsigned char buf,
				3382	unsigned lsbyte, unsigned buflen)
				3383	{
				3384	BNWORD32 t = 0; /* Needed to shut up uninitialized var warnings */
				3385
				3386	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
				3387
				3388	if (lsbyte % (32/8)) {
				3389	t = BIGLITTLE(--n,n++);
				3390	t >>= (lsbyte % (32/8)) * 8 ;
				3391	}
				3392
				3393	while (buflen--) {
				3394	if ((lsbyte++ % (32/8)) == 0)
				3395	t = BIGLITTLE(--n,n++);
				3396	*buf++ = (unsigned char)t;
				3397	t >>= 8;
				3398	}
				3399	}
				3400
				3401	/*
				3402	* Merge a little-endian array of bytes into a bignum array.
				3403	* The array had better be big enough. This is
				3404	* equivalent to extracting the entire bignum into a
				3405	* large byte array, copying the input buffer into the
				3406	* middle of it, and converting back to a bignum.
				3407	*
				3408	* The buf is "len" bytes long, and its first byte is at
				3409	* position "lsbyte" from the end of the bignum.
				3410	*
				3411	* Note that this is a pain to get right. Fortunately, it's hardly
				3412	* critical for efficiency.
				3413	*/
				3414	void
				3415	lbnInsertLittleBytes_32(BNWORD32 n, unsigned char const buf,
				3416	unsigned lsbyte, unsigned buflen)
				3417	{
				3418	BNWORD32 t = 0; /* Shut up uninitialized varibale warnings */
				3419
				3420	/* Move to most-significant end */
				3421	lsbyte += buflen;
				3422	buf += buflen;
				3423
				3424	BIGLITTLE(n -= lsbyte/(32/8), n += lsbyte/(32/8));
				3425
				3426	/* Load up leading odd bytes */
				3427	if (lsbyte % (32/8)) {
				3428	t = BIGLITTLE(--n,n++);
				3429	t >>= (lsbyte * 8) % 32;
				3430	}
				3431
				3432	/* The main loop - merge into t, storing at each word boundary. */
				3433	while (buflen--) {
				3434	t = (t << 8) \| *--buf;
				3435	if ((--lsbyte % (32/8)) == 0)
				3436	BIGLITTLE(n++,--n) = t;
				3437	}
				3438
				3439	/* Merge odd bytes in t into last word */
				3440	lsbyte = (lsbyte * 8) % 32;
				3441	if (lsbyte) {
				3442	t <<= lsbyte;
				3443	t \|= (((BNWORD32)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
				3444	BIGLITTLE(n[0],n[-1]) = t;
				3445	}
				3446
				3447	return;
				3448	}
				3449
				3450	#ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */
				3451	/*
				3452	* Convert a big-endian array of bytes to a bignum.
				3453	* Returns the number of words in the bignum.
				3454	* Note the expression "32/8" for the number of bytes per word.
				3455	* This is so the word-size adjustment will work.
				3456	*/
				3457	unsigned
				3458	lbnFromBytes_32(BNWORD32 a, unsigned char const b, unsigned blen)
				3459	{
				3460	BNWORD32 t;
				3461	unsigned alen = (blen + (32/8-1))/(32/8);
				3462	BIGLITTLE(a -= alen, a += alen);
				3463
				3464	while (blen) {
				3465	t = 0;
				3466	do {
				3467	t = t << 8 \| *b++;
				3468	} while (--blen & (32/8-1));
				3469	BIGLITTLE(a++,--a) = t;
				3470	}
				3471	return alen;
				3472	}
				3473	#endif
				3474
				3475	/*
				3476	* Computes the GCD of a and b. Modifies both arguments; when it returns,
				3477	* one of them is the GCD and the other is trash. The return value
				3478	* indicates which: 0 for a, and 1 for b. The length of the retult is
				3479	* returned in rlen. Both inputs must have one extra word of precision.
				3480	* alen must be >= blen.
				3481	*
				3482	* TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
				3483	* This is based on taking out common powers of 2, then repeatedly:
				3484	* gcd(2u,v) = gcd(u,2v) = gcd(u,v) - isolated powers of 2 can be deleted.
				3485	* gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
				3486	* It gets less reduction per step, but the steps are much faster than
				3487	* the division case.
				3488	*/
				3489	int
				3490	lbnGcd_32(BNWORD32 a, unsigned alen, BNWORD32 b, unsigned blen,
				3491	unsigned *rlen)
				3492	{
				3493	#if BNYIELD
				3494	int y;
				3495	#endif
				3496	assert(alen >= blen);
				3497
				3498	while (blen != 0) {
				3499	(void)lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
				3500	alen = lbnNorm_32(a, blen);
				3501	if (alen == 0) {
				3502	*rlen = blen;
				3503	return 1;
				3504	}
				3505	(void)lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
				3506	blen = lbnNorm_32(b, alen);
				3507	#if BNYIELD
				3508	if (bnYield && (y = bnYield()) < 0)
				3509	return y;
				3510	#endif
				3511	}
				3512	*rlen = alen;
				3513	return 0;
				3514	}
				3515
				3516	/*
				3517	* Invert "a" modulo "mod" using the extended Euclidean algorithm.
				3518	* Note that this only computes one of the cosequences, and uses the
				3519	* theorem that the signs flip every step and the absolute value of
				3520	* the cosequence values are always bounded by the modulus to avoid
				3521	* having to work with negative numbers.
				3522	* gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
				3523	* a must be one word longer than "mod". It is overwritten with the
				3524	* result.
				3525	* TODO: Use Richard Schroeppel's much faster algorithm.
				3526	*/
				3527	int
				3528	lbnInv_32(BNWORD32 a, unsigned alen, BNWORD32 const mod, unsigned mlen)
				3529	{
				3530	BNWORD32 b; / Hold a copy of mod during GCD reduction */
				3531	BNWORD32 p; / Temporary for products added to t0 and t1 */
				3532	BNWORD32 t0, t1; /* Inverse accumulators */
				3533	BNWORD32 cy;
				3534	unsigned blen, t0len, t1len, plen;
				3535	int y;
				3536
				3537	alen = lbnNorm_32(a, alen);
				3538	if (!alen)
				3539	return 1; /* No inverse */
				3540
				3541	mlen = lbnNorm_32(mod, mlen);
				3542
				3543	assert (alen <= mlen);
				3544
				3545	/* Inverse of 1 is 1 */
				3546	if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
				3547	lbnZero_32(BIGLITTLE(a-alen,a+alen), mlen-alen);
				3548	return 0;
				3549	}
				3550
				3551	/* Allocate a pile of space */
				3552	LBNALLOC(b, BNWORD32, mlen+1);
				3553	if (b) {
				3554	/*
				3555	* Although products are guaranteed to always be less than the
				3556	* modulus, it can involve multiplying two 3-word numbers to
				3557	* get a 5-word result, requiring a 6th word to store a 0
				3558	* temporarily. Thus, mlen + 1.
				3559	*/
				3560	LBNALLOC(p, BNWORD32, mlen+1);
				3561	if (p) {
				3562	LBNALLOC(t0, BNWORD32, mlen);
				3563	if (t0) {
				3564	LBNALLOC(t1, BNWORD32, mlen);
				3565	if (t1)
				3566	goto allocated;
				3567	LBNFREE(t0, mlen);
				3568	}
				3569	LBNFREE(p, mlen+1);
				3570	}
				3571	LBNFREE(b, mlen+1);
				3572	}
				3573	return -1;
				3574
				3575	allocated:
				3576
				3577	/* Set t0 to 1 */
				3578	t0len = 1;
				3579	BIGLITTLE(t0[-1],t0[0]) = 1;
				3580
				3581	/* b = mod */
				3582	lbnCopy_32(b, mod, mlen);
				3583	/* blen = mlen (implicitly) */
				3584
				3585	/* t1 = b / a; b = b % a */
				3586	cy = lbnDiv_32(t1, b, mlen, a, alen);
				3587	*(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
				3588	t1len = lbnNorm_32(t1, mlen-alen+1);
				3589	blen = lbnNorm_32(b, alen);
				3590
				3591	/* while (b > 1) */
				3592	while (blen > 1 \|\| BIGLITTLE(b[-1],b[0]) != (BNWORD32)1) {
				3593	/* q = a / b; a = a % b; */
				3594	if (alen < blen \|\| (alen == blen && lbnCmp_32(a, a, alen) < 0))
				3595	assert(0);
				3596	cy = lbnDiv_32(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
				3597	*(BIGLITTLE(a-alen-1,a+alen)) = cy;
				3598	plen = lbnNorm_32(BIGLITTLE(a-blen,a+blen), alen-blen+1);
				3599	assert(plen);
				3600	alen = lbnNorm_32(a, blen);
				3601	if (!alen)
				3602	goto failure; /* GCD not 1 */
				3603
				3604	/* t0 += q * t1; */
				3605	assert(plen+t1len <= mlen+1);
				3606	lbnMul_32(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
				3607	plen = lbnNorm_32(p, plen + t1len);
				3608	assert(plen <= mlen);
				3609	if (plen > t0len) {
				3610	lbnZero_32(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
				3611	t0len = plen;
				3612	}
				3613	cy = lbnAddN_32(t0, p, plen);
				3614	if (cy) {
				3615	if (t0len > plen) {
				3616	cy = lbnAdd1_32(BIGLITTLE(t0-plen,t0+plen),
				3617	t0len-plen, cy);
				3618	}
				3619	if (cy) {
				3620	BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
				3621	t0len++;
				3622	}
				3623	}
				3624
				3625	/* if (a <= 1) return a ? t0 : FAIL; */
				3626	if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD32)1) {
				3627	if (alen == 0)
				3628	goto failure; /* FAIL */
				3629	assert(t0len <= mlen);
				3630	lbnCopy_32(a, t0, t0len);
				3631	lbnZero_32(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
				3632	goto success;
				3633	}
				3634
				3635	/* q = b / a; b = b % a; */
				3636	if (blen < alen \|\| (blen == alen && lbnCmp_32(b, a, alen) < 0))
				3637	assert(0);
				3638	cy = lbnDiv_32(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
				3639	*(BIGLITTLE(b-blen-1,b+blen)) = cy;
				3640	plen = lbnNorm_32(BIGLITTLE(b-alen,b+alen), blen-alen+1);
				3641	assert(plen);
				3642	blen = lbnNorm_32(b, alen);
				3643	if (!blen)
				3644	goto failure; /* GCD not 1 */
				3645
				3646	/* t1 += q * t0; */
				3647	assert(plen+t0len <= mlen+1);
				3648	lbnMul_32(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
				3649	plen = lbnNorm_32(p, plen + t0len);
				3650	assert(plen <= mlen);
				3651	if (plen > t1len) {
				3652	lbnZero_32(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
				3653	t1len = plen;
				3654	}
				3655	cy = lbnAddN_32(t1, p, plen);
				3656	if (cy) {
				3657	if (t1len > plen) {
				3658	cy = lbnAdd1_32(BIGLITTLE(t1-plen,t0+plen),
				3659	t1len-plen, cy);
				3660	}
				3661	if (cy) {
				3662	BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
				3663	t1len++;
				3664	}
				3665	}
				3666	#if BNYIELD
				3667	if (bnYield && (y = bnYield() < 0))
				3668	goto yield;
				3669	#endif
				3670	}
				3671
				3672	if (!blen)
				3673	goto failure; /* gcd(a, mod) != 1 -- FAIL */
				3674
				3675	/* return mod-t1 */
				3676	lbnCopy_32(a, mod, mlen);
				3677	assert(t1len <= mlen);
				3678	cy = lbnSubN_32(a, t1, t1len);
				3679	if (cy) {
				3680	assert(mlen > t1len);
				3681	cy = lbnSub1_32(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
				3682	assert(!cy);
				3683	}
				3684
				3685	success:
				3686	LBNFREE(t1, mlen);
				3687	LBNFREE(t0, mlen);
				3688	LBNFREE(p, mlen+1);
				3689	LBNFREE(b, mlen+1);
				3690
				3691	return 0;
				3692
				3693	failure: /* GCD is not 1 - no inverse exists! */
				3694	y = 1;
				3695	#if BNYIELD
				3696	yield:
				3697	#endif
				3698	LBNFREE(t1, mlen);
				3699	LBNFREE(t0, mlen);
				3700	LBNFREE(p, mlen+1);
				3701	LBNFREE(b, mlen+1);
				3702
				3703	return y;
				3704	}
				3705
				3706	/*
				3707	* Precompute powers of "a" mod "mod". Compute them every "bits"
				3708	* for "n" steps. This is sufficient to compute powers of g with
				3709	* exponents up to nbits bits long, i.e. less than 2^(nbits).
				3710	*
				3711	* This assumes that the caller has already initialized "array" to point
				3712	* to "n" buffers of size "mlen".
				3713	*/
				3714	int
				3715	lbnBasePrecompBegin_32(BNWORD32 **array, unsigned n, unsigned bits,
				3716	BNWORD32 const g, unsigned glen, BNWORD32 mod, unsigned mlen)
				3717	{
				3718	BNWORD32 a, b; /* Temporary double-width accumulators */
				3719	BNWORD32 a1; / Pointer to high half of a*/
				3720	BNWORD32 inv; /* Montgomery inverse of LSW of mod */
				3721	BNWORD32 *t;
				3722	unsigned i;
				3723
				3724	glen = lbnNorm_32(g, glen);
				3725	assert(glen);
				3726
				3727	assert (mlen == lbnNorm_32(mod, mlen));
				3728	assert (glen <= mlen);
				3729
				3730	/* Allocate two temporary buffers, and the array slots */
				3731	LBNALLOC(a, BNWORD32, mlen*2);
				3732	if (!a)
				3733	return -1;
				3734	LBNALLOC(b, BNWORD32, mlen*2);
				3735	if (!b) {
				3736	LBNFREE(a, 2*mlen);
				3737	return -1;
				3738	}
				3739
				3740	/* Okay, all ready */
				3741
				3742	/* Convert n to Montgomery form */
				3743	inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
				3744	assert(inv & 1); /* Modulus must be odd */
				3745	inv = lbnMontInv1_32(inv);
				3746	/* Move g up "mlen" words into a (clearing the low mlen words) */
				3747	a1 = BIGLITTLE(a-mlen,a+mlen);
				3748	lbnCopy_32(a1, g, glen);
				3749	lbnZero_32(a, mlen);
				3750
				3751	/* Do the division - dump the quotient into the high-order words */
				3752	(void)lbnDiv_32(a1, a, mlen+glen, mod, mlen);
				3753
				3754	/* Copy the first value into the array */
				3755	t = *array;
				3756	lbnCopy_32(t, a, mlen);
				3757	a1 = a; /* This first value is not shifted up */
				3758
				3759	/* Now compute the remaining n-1 array entries */
				3760	assert(bits);
				3761	assert(n);
				3762	while (--n) {
				3763	i = bits;
				3764	do {
				3765	/* Square a1 into b1 */
				3766	lbnMontSquare_32(b, a1, mod, mlen, inv);
				3767	t = b; b = a; a = t;
				3768	a1 = BIGLITTLE(a-mlen, a+mlen);
				3769	} while (--i);
				3770	t = *++array;
				3771	lbnCopy_32(t, a1, mlen);
				3772	}
				3773
				3774	/* Hooray, we're done. */
				3775	LBNFREE(b, 2*mlen);
				3776	LBNFREE(a, 2*mlen);
				3777	return 0;
				3778	}
				3779
				3780	/*
				3781	* result = base^exp (mod mod). "array" is a an array of pointers
				3782	* to procomputed powers of base, each 2^bits apart. (I.e. array[i]
				3783	* is base^(2^(i*bits))).
				3784	*
				3785	* The algorithm consists of:
				3786	* a = b = (powers of g to be raised to the power 2^bits-1)
				3787	* a = b = (powers of g to be raised to the power 2^bits-2)
				3788	* ...
				3789	* a = b = (powers of g to be raised to the power 1)
				3790	*
				3791	* All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
				3792	*/
				3793	int
				3794	lbnBasePrecompExp_32(BNWORD32 result, BNWORD32 const const *array,
				3795	unsigned bits, BNWORD32 const *exp, unsigned elen,
				3796	BNWORD32 const *mod, unsigned mlen)
				3797	{
				3798	BNWORD32 a, b, c, t;
				3799	BNWORD32 a1, b1;
				3800	int anull, bnull; /* Null flags: values are implicitly 1 */
				3801	unsigned i, j; /* Loop counters */
				3802	unsigned mask; /* Exponent bits to examime */
				3803	BNWORD32 const eptr; / Pointer into exp */
				3804	BNWORD32 buf, curbits, nextword; /* Bit-buffer varaibles */
				3805	BNWORD32 inv; /* Inverse of LSW of modulus */
				3806	unsigned ewords; /* Words of exponent left */
				3807	int bufbits; /* Number of valid bits */
				3808	int y = 0;
				3809
				3810	mlen = lbnNorm_32(mod, mlen);
				3811	assert (mlen);
				3812
				3813	elen = lbnNorm_32(exp, elen);
				3814	if (!elen) {
				3815	lbnZero_32(result, mlen);
				3816	BIGLITTLE(result[-1],result[0]) = 1;
				3817	return 0;
				3818	}
				3819	/*
				3820	* This could be precomputed, but it's so cheap, and it would require
				3821	* making the precomputation structure word-size dependent.
				3822	*/
				3823	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
				3824
				3825	assert(elen);
				3826
				3827	/*
				3828	* Allocate three temporary buffers. The current numbers generally
				3829	* live in the upper halves of these buffers.
				3830	*/
				3831	LBNALLOC(a, BNWORD32, mlen*2);
				3832	if (a) {
				3833	LBNALLOC(b, BNWORD32, mlen*2);
				3834	if (b) {
				3835	LBNALLOC(c, BNWORD32, mlen*2);
				3836	if (c)
				3837	goto allocated;
				3838	LBNFREE(b, 2*mlen);
				3839	}
				3840	LBNFREE(a, 2*mlen);
				3841	}
				3842	return -1;
				3843
				3844	allocated:
				3845
				3846	anull = bnull = 1;
				3847
				3848	mask = (1u<<bits) - 1;
				3849	for (i = mask; i; --i) {
				3850	/* Set up bit buffer for walking the exponent */
				3851	eptr = exp;
				3852	buf = BIGLITTLE(--eptr, eptr++);
				3853	ewords = elen-1;
				3854	bufbits = 32;
				3855	for (j = 0; ewords \|\| buf; j++) {
				3856	/* Shift down current buffer */
				3857	curbits = buf;
				3858	buf >>= bits;
				3859	/* If necessary, add next word */
				3860	bufbits -= bits;
				3861	if (bufbits < 0 && ewords > 0) {
				3862	nextword = BIGLITTLE(--eptr, eptr++);
				3863	ewords--;
				3864	curbits \|= nextword << (bufbits+bits);
				3865	buf = nextword >> -bufbits;
				3866	bufbits += 32;
				3867	}
				3868	/* If appropriate, multiply b = array[j] /
				3869	if ((curbits & mask) == i) {
				3870	BNWORD32 const *d = array[j];
				3871
				3872	b1 = BIGLITTLE(b-mlen-1,b+mlen);
				3873	if (bnull) {
				3874	lbnCopy_32(b1, d, mlen);
				3875	bnull = 0;
				3876	} else {
				3877	lbnMontMul_32(c, b1, d, mod, mlen, inv);
				3878	t = c; c = b; b = t;
				3879	}
				3880	#if BNYIELD
				3881	if (bnYield && (y = bnYield() < 0))
				3882	goto yield;
				3883	#endif
				3884	}
				3885	}
				3886
				3887	/* Multiply a = b /
				3888	if (!bnull) {
				3889	a1 = BIGLITTLE(a-mlen-1,a+mlen);
				3890	b1 = BIGLITTLE(b-mlen-1,b+mlen);
				3891	if (anull) {
				3892	lbnCopy_32(a1, b1, mlen);
				3893	anull = 0;
				3894	} else {
				3895	lbnMontMul_32(c, a1, b1, mod, mlen, inv);
				3896	t = c; c = a; a = t;
				3897	}
				3898	}
				3899	}
				3900
				3901	assert(!anull); /* If it were, elen would have been 0 */
				3902
				3903	/* Convert out of Montgomery form and return */
				3904	a1 = BIGLITTLE(a-mlen-1,a+mlen);
				3905	lbnCopy_32(a, a1, mlen);
				3906	lbnZero_32(a1, mlen);
				3907	lbnMontReduce_32(a, mod, mlen, inv);
				3908	lbnCopy_32(result, a1, mlen);
				3909
				3910	#if BNYIELD
				3911	yield:
				3912	#endif
				3913	LBNFREE(c, 2*mlen);
				3914	LBNFREE(b, 2*mlen);
				3915	LBNFREE(a, 2*mlen);
				3916
				3917	return y;
				3918	}
				3919
				3920	/*
				3921	* result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
				3922	* arrays of pointers to procomputed powers of the corresponding bases,
				3923	* each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
				3924	*
				3925	* Bits must be the same in both. (It could be made adjustable, but it's
				3926	* a bit of a pain. Just make them both equal to the larger one.)
				3927	*
				3928	* The algorithm consists of:
				3929	* a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
				3930	* a = b = (powers of base1 and base2 to be raised to the power 2^bits-2)
				3931	* ...
				3932	* a = b = (powers of base1 and base2 to be raised to the power 1)
				3933	*
				3934	* All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
				3935	*/
				3936	int
				3937	lbnDoubleBasePrecompExp_32(BNWORD32 *result, unsigned bits,
				3938	BNWORD32 const * const array1, BNWORD32 const exp1, unsigned elen1,
				3939	BNWORD32 const * const array2, BNWORD32 const exp2,
				3940	unsigned elen2, BNWORD32 const *mod, unsigned mlen)
				3941	{
				3942	BNWORD32 a, b, c, t;
				3943	BNWORD32 a1, b1;
				3944	int anull, bnull; /* Null flags: values are implicitly 1 */
				3945	unsigned i, j, k; /* Loop counters */
				3946	unsigned mask; /* Exponent bits to examime */
				3947	BNWORD32 const eptr; / Pointer into exp */
				3948	BNWORD32 buf, curbits, nextword; /* Bit-buffer varaibles */
				3949	BNWORD32 inv; /* Inverse of LSW of modulus */
				3950	unsigned ewords; /* Words of exponent left */
				3951	int bufbits; /* Number of valid bits */
				3952	int y = 0;
				3953	BNWORD32 const * const *array;
				3954
				3955	mlen = lbnNorm_32(mod, mlen);
				3956	assert (mlen);
				3957
				3958	elen1 = lbnNorm_32(exp1, elen1);
				3959	if (!elen1) {
				3960	return lbnBasePrecompExp_32(result, array2, bits, exp2, elen2,
				3961	mod, mlen);
				3962	}
				3963	elen2 = lbnNorm_32(exp2, elen2);
				3964	if (!elen2) {
				3965	return lbnBasePrecompExp_32(result, array1, bits, exp1, elen1,
				3966	mod, mlen);
				3967	}
				3968	/*
				3969	* This could be precomputed, but it's so cheap, and it would require
				3970	* making the precomputation structure word-size dependent.
				3971	*/
				3972	inv = lbnMontInv1_32(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
				3973
				3974	assert(elen1);
				3975	assert(elen2);
				3976
				3977	/*
				3978	* Allocate three temporary buffers. The current numbers generally
				3979	* live in the upper halves of these buffers.
				3980	*/
				3981	LBNALLOC(a, BNWORD32, mlen*2);
				3982	if (a) {
				3983	LBNALLOC(b, BNWORD32, mlen*2);
				3984	if (b) {
				3985	LBNALLOC(c, BNWORD32, mlen*2);
				3986	if (c)
				3987	goto allocated;
				3988	LBNFREE(b, 2*mlen);
				3989	}
				3990	LBNFREE(a, 2*mlen);
				3991	}
				3992	return -1;
				3993
				3994	allocated:
				3995
				3996	anull = bnull = 1;
				3997
				3998	mask = (1u<<bits) - 1;
				3999	for (i = mask; i; --i) {
				4000	/* Walk each exponent in turn */
				4001	for (k = 0; k < 2; k++) {
				4002	/* Set up the exponent for walking */
				4003	array = k ? array2 : array1;
				4004	eptr = k ? exp2 : exp1;
				4005	ewords = (k ? elen2 : elen1) - 1;
				4006	/* Set up bit buffer for walking the exponent */
				4007	buf = BIGLITTLE(--eptr, eptr++);
				4008	bufbits = 32;
				4009	for (j = 0; ewords \|\| buf; j++) {
				4010	/* Shift down current buffer */
				4011	curbits = buf;
				4012	buf >>= bits;
				4013	/* If necessary, add next word */
				4014	bufbits -= bits;
				4015	if (bufbits < 0 && ewords > 0) {
				4016	nextword = BIGLITTLE(--eptr, eptr++);
				4017	ewords--;
				4018	curbits \|= nextword << (bufbits+bits);
				4019	buf = nextword >> -bufbits;
				4020	bufbits += 32;
				4021	}
				4022	/* If appropriate, multiply b = array[j] /
				4023	if ((curbits & mask) == i) {
				4024	BNWORD32 const *d = array[j];
				4025
				4026	b1 = BIGLITTLE(b-mlen-1,b+mlen);
				4027	if (bnull) {
				4028	lbnCopy_32(b1, d, mlen);
				4029	bnull = 0;
				4030	} else {
				4031	lbnMontMul_32(c, b1, d, mod, mlen, inv);
				4032	t = c; c = b; b = t;
				4033	}
				4034	#if BNYIELD
				4035	if (bnYield && (y = bnYield() < 0))
				4036	goto yield;
				4037	#endif
				4038	}
				4039	}
				4040	}
				4041
				4042	/* Multiply a = b /
				4043	if (!bnull) {
				4044	a1 = BIGLITTLE(a-mlen-1,a+mlen);
				4045	b1 = BIGLITTLE(b-mlen-1,b+mlen);
				4046	if (anull) {
				4047	lbnCopy_32(a1, b1, mlen);
				4048	anull = 0;
				4049	} else {
				4050	lbnMontMul_32(c, a1, b1, mod, mlen, inv);
				4051	t = c; c = a; a = t;
				4052	}
				4053	}
				4054	}
				4055
				4056	assert(!anull); /* If it were, elen would have been 0 */
				4057
				4058	/* Convert out of Montgomery form and return */
				4059	a1 = BIGLITTLE(a-mlen-1,a+mlen);
				4060	lbnCopy_32(a, a1, mlen);
				4061	lbnZero_32(a1, mlen);
				4062	lbnMontReduce_32(a, mod, mlen, inv);
				4063	lbnCopy_32(result, a1, mlen);
				4064
				4065	#if BNYIELD
				4066	yield:
				4067	#endif
				4068	LBNFREE(c, 2*mlen);
				4069	LBNFREE(b, 2*mlen);
				4070	LBNFREE(a, 2*mlen);
				4071
				4072	return y;
				4073	}